図の右上のshow
ボタンを押すとRのコードが表示されます。
library(conflicted)
library(tidyverse)
library(patchwork)
set.seed(42) # シード値を固定
sample_size <- 20 # サンプルサイズ
# 正規分布からのサンプリング
df_normal <- data.frame(value = rnorm(sample_size), type = "正規分布")
# 対数正規分布からのサンプリング
df_lognormal <- data.frame(value = rlnorm(sample_size), type = "対数正規分布")
# 正規分布箱ひげ図
p1 <- df_normal |>
ggplot(aes(x = type, y = value)) +
geom_boxplot(outlier.shape = NA) + # 外れ値非表示
geom_dotplot(binaxis = "y", stackdir = "center", dotsize = 0.5, fill = "deeppink") +
coord_cartesian(ylim = c(-2, 2)) +
stat_summary(fun = "mean", geom = "point", color = "black", shape = 4, size = 4) +
theme(
axis.title.x = element_blank(), # 軸のラベルを消す
axis.line = element_blank() # 軸の線を消す
) +
labs(y = "観測値", title = "正規分布からのサンプリング")
# 正規分布理論分布
p2 <- data.frame(x = c(-2, 2)) |>
ggplot(aes(x = x)) +
stat_function(fun=dnorm) +
coord_flip() +
theme(
axis.ticks = element_blank(), # tickの線を消す
axis.text = element_blank(), # tickの数字を消す
axis.title = element_blank(), # 軸のラベルを消す
axis.line = element_blank() # 軸の線を消す
)
# 対数正規分布箱ひげ図
p3 <- df_lognormal |>
ggplot(aes(x = type, y = value)) +
geom_boxplot(outlier.shape = NA) + # 外れ値非表示
geom_dotplot(binaxis = "y", stackdir = "center", dotsize = 0.5, fill = "blue") +
coord_cartesian(ylim = c(0, 7)) +
stat_summary(fun = "mean", geom = "point", color = "black", shape = 4, size = 4) +
theme(
axis.title.x = element_blank(), # 軸のラベルを消す
axis.line = element_blank() # 軸の線を消す
) +
labs(y = "観測値", title = "対数正規分布からのサンプリング")
# 対数正規分布理論分布
p4 <- data.frame(x = c(0, 7)) |>
ggplot(aes(x = x)) +
stat_function(fun=dlnorm) +
coord_flip() +
theme(
axis.ticks = element_blank(), # tickの線を消す
axis.text = element_blank(), # tickの数字を消す
axis.title = element_blank(), # 軸のラベルを消す
axis.line = element_blank() # 軸の線を消す
)
p1 + p2 + p3 + p4 +
plot_layout(ncol = 4, widths = c(2, 1, 2, 1))
library(conflicted)
library(tidyverse)
library(ggbeeswarm)
library(patchwork)
# シード値を固定
set.seed(42)
# サンプルサイズ
n <- (20 + 100 + 500) * 100
# 組み合わせ
eg <- expand_grid(
n = rep(c(rep("20",20), rep("100", 100), rep("500", 500))),
sample = 1:100
)
df <- bind_rows(
bind_cols(data.frame(dist = "正規分布", value = rnorm(n)), eg),
bind_cols(data.frame(dist = "対数正規分布", value = rlnorm(n)), eg)
) |>
summarise(
means = mean(value),
medians = median(value),
max = max(value),
p95 = quantile(value, 0.95),
sd = sd(value),
iqr = IQR(value),
.by = c(dist, n, sample)
) |>
dplyr::select(!sample) |>
pivot_longer(!c(dist, n), names_to = "metric") |>
mutate(dist = factor(dist, levels = c("正規分布", "対数正規分布")))
df_means_medians <- df |>
dplyr::filter(metric %in% c("means","medians")) |>
mutate(n_metric = factor(
paste0(metric, n),
levels = c("means20", "medians20", "means100", "medians100", "means500", "medians500")
))
df_sd_iqr <- df |>
dplyr::filter(metric %in% c("sd","iqr")) |>
mutate(n_metric = factor(
paste0(metric, n),
levels = c("sd20", "iqr20", "sd100", "iqr100", "sd500", "iqr500")
))
df_max_p95 <- df |>
dplyr::filter(metric %in% c("max","p95")) |>
mutate(n_metric = factor(
paste0(metric, n),
levels = c("max20", "p9520", "max100", "p95100", "max500", "p95500")
))
p1 <- df_means_medians |>
ggplot(aes(x = n_metric, y = value, color = n)) +
geom_quasirandom(width=0.4, method = "quasirandom", size = 0.75) +
theme(axis.text.x = element_text(angle = 0, hjust = 1)) +
facet_wrap(vars(dist), scales = "free") +
theme(
legend.position="none",
axis.title = element_blank() # 軸のラベルを消す
) +
scale_x_discrete(
labels = c(
means20 = "N = 20\n平均",
medians20 = "N = 20\n中央値",
means100 = "N = 100\n平均",
medians100 = "N = 100\n中央値",
means500 = "N = 500\n平均",
medians500 = "N = 500\n中央値"
)
)
p2 <- df_sd_iqr |>
ggplot(aes(x = n_metric, y = value, color = n)) +
geom_quasirandom(width=0.4, method = "quasirandom", size = 0.75) +
theme(axis.text.x = element_text(angle = 0, hjust = 1)) +
facet_wrap(vars(dist), scales = "free") +
theme(
legend.position="none",
axis.title = element_blank() # 軸のラベルを消す
) +
scale_x_discrete(
labels = c(
sd20 = "N = 20\n標準偏差",
iqr20 = "N = 20\nIQR",
sd100 = "N = 100\n標準偏差",
iqr100 = "N = 100\nIQR",
sd500 = "N = 500\n標準偏差",
iqr500 = "N = 500\nIQR"
)
)
p3 <- df_max_p95 |>
ggplot(aes(x = n_metric, y = value, color = n)) +
geom_quasirandom(width=0.4, method = "quasirandom", size = 0.75) +
theme(axis.text.x = element_text(angle = 0, hjust = 1)) +
facet_wrap(vars(dist), scales = "free") +
theme(
legend.position="none",
axis.title = element_blank() # 軸のラベルを消す
) +
scale_x_discrete(
labels = c(
max20 = "N = 20\n最大値",
p9520 = "N = 20\n95%点",
max100 = "N = 100\n最大値",
p95100 = "N = 100\n95%点",
max500 = "N = 500\n最大値",
p95500 = "N = 500\n95%点"
)
)
p1 / p2 / p3
library(conflicted)
library(tidyverse)
library(patchwork)
# 乱数のシード設定
set.seed(42)
# 単峰性の分布
single_peak_distribution <- rnorm(1000, mean=0, sd=1)
# 二峰性の分布
bimodal_distribution <- c(rnorm(500, mean=-1, sd=0.2), rnorm(500, mean=1, sd=0.2))
# 単峰性の分布の平均値と標準偏差
single_peak_mean <- mean(single_peak_distribution)
single_peak_std <- sd(single_peak_distribution)
# 二峰性の分布の平均値と標準偏差
bimodal_mean <- mean(bimodal_distribution)
bimodal_std <- sd(bimodal_distribution)
# 単峰性の分布のヒストグラム
p1 <- data.frame(x=single_peak_distribution) |>
ggplot(aes(x=x)) +
geom_histogram(color="gold", fill="gold", binwidth=0.2) +
labs(x="観測値", y="頻度") +
labs(
title = paste(
"単峰性の分布\n(平均=", round(single_peak_mean,2),
", 標準偏差=", round(single_peak_std, 2),
")"
)) +
theme(aspect.ratio = 3/4)
# 二峰性の分布のヒストグラム
p2 <- ggplot(data.frame(x=bimodal_distribution), aes(x=x)) +
geom_histogram(color="blue", fill="blue", binwidth=0.2) +
labs(x="観測値", y="頻度") +
labs(
title = paste(
"二峰性の分布\n(平均=", round(bimodal_mean,2),
", 標準偏差=", round(bimodal_std, 2),
")"
)) +
theme(aspect.ratio = 3/4)
p1 + p2
library(conflicted)
library(tidyverse)
library(ggbeeswarm)
# Wineデータセットのロード
col_names <- c(
"ブドウの品種", "アルコール度数", "リンゴ酸", "ミネラル分",
"ミネラル分のアルカリ度", "マグネシウム", "全フェノール類", "フラバノイド",
"非フラバノイドフェノール類", "プロアントシアニン", "色の強さ", "色相",
"OD280/OD315値", "プロリン"
)
original_data <- read_csv(
"https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data",
col_names = col_names,
col_types = "f"
)
# Zスコア化したデータフレームの作成
zscored_data <- original_data |>
mutate(across(where(is.numeric), \(x) scale(x))) #標準化
# オリジナルのスウォームプロット
p1 <- original_data |>
pivot_longer(!c(ブドウの品種)) |>
mutate(name = factor(name, levels = col_names)) |>
ggplot(aes(x = name, y = value, color = ブドウの品種)) +
geom_quasirandom(width=0.4, method = "quasirandom", size = 0.75) +
theme(
axis.text.x = element_text(angle = 20, hjust = 1),
legend.position = c(0.1, 0.8),
axis.title.x = element_blank()
) +
scale_color_hue(name = "ブドウ品種", labels = c("1" = "品種1", "2" ="品種2", "3" ="品種3")) +
labs(title = "変数ごとに取る値の範囲が異なる", y = "元の値")
# zスコア化したスウォームプロット
p2 <- zscored_data |>
pivot_longer(!c(ブドウの品種)) |>
mutate(name = factor(name, levels = col_names)) |>
ggplot(aes(x = name, y = value, color = ブドウの品種)) +
geom_quasirandom(width=0.4, method = "quasirandom", size = 0.75) +
theme(
axis.text.x = element_text(angle = 20, hjust = 1),
legend.position = "none"
) +
labs(title = "値の範囲が揃っている", x = "", y = "Zスコア")
p1 / p2
library(conflicted)
library(tidyverse)
# サンプルサイズ
n <- (20 + 100 + 500) * 100
# 組み合わせ
eg <- expand_grid(
n = rep(c(rep("20",20), rep("100", 100), rep("500", 500))),
sample = 1:100
)
set.seed(0)
df <- bind_rows(
bind_cols(data.frame(dist = "正規分布からサンプリング", value = rnorm(n)), eg),
bind_cols(data.frame(dist = "対数正規分布からサンプリング", value = rlnorm(n)), eg)
) |>
summarise(
SD = sd(value),# 標準偏差
MeanAD = mad(value, center = mean(value)), # 平均絶対偏差
MedianAD = mad(value, center = median(value)), # 中央絶対偏差
.by = c(dist, n, sample)
) |>
dplyr::select(!sample) |>
pivot_longer(!c(dist, n), names_to = "metric") |>
mutate(dist = factor(dist, levels = c("正規分布からサンプリング", "対数正規分布からサンプリング")))
df |>
mutate(n_metric = factor(
paste0(metric, n),
levels = c("SD20", "SD100", "SD500",
"MeanAD20", "MeanAD100", "MeanAD500",
"MedianAD20", "MedianAD100", "MedianAD500"))) |>
ggplot(aes(x = n_metric, y = value, color = metric)) +
geom_quasirandom(width=0.4, method = "quasirandom", size = 0.75) +
theme(axis.text.x = element_text(angle = 0, hjust = 1)) +
facet_wrap(vars(dist), scales = "free", nrow = 2) +
theme(
legend.position="none",
axis.title = element_blank() # 軸のラベルを消す
) +
scale_x_discrete(
labels = c(
SD20 = "N = 20\n標準偏差",
SD100 = "N = 100\n標準偏差",
SD500 = "N = 500\n標準偏差",
MeanAD20 = "N = 20\n平均絶対偏差",
MeanAD100 = "N = 100\n平均絶対偏差",
MeanAD500 = "N = 500\n平均絶対偏差",
MedianAD20 = "N = 20\n中央絶対偏差",
MedianAD100 = "N = 100\n中央絶対偏差",
MedianAD500 = "N = 500\n中央絶対偏差"
)
)
library(conflicted)
library(tidyverse)
library(e1071)
library(sn) #rsn()
library(VGAM) #rlaplace
library(patchwork)
set.seed(42)
normal_dist <- rnorm(1000, mean = 0, sd = 1)# 正規分布
skewed_dist <- rsn(1000, xi = 0, omega = 2, alpha=4)# 歪正規分布
laplace_dist <- rlaplace(1000, location = 0, scale = 1)# ラプラス分布(歪度=0, 尖度>3)
# ヒストグラムの描画
p1 <- ggplot() +
geom_histogram(aes(x = normal_dist), fill = "#8DA0CB") +
labs(title = "分布1(正規分布)", x = "", y = "")
p2 <- ggplot() +
geom_histogram(aes(x = skewed_dist), fill = "#66C2A5") +
labs(title = "分布2(歪正規分布)", x = "", y = "")
p3 <- ggplot() +
geom_histogram(aes(x = laplace_dist), fill = "#FC8D62") +
labs(title = "分布3(ラプラス分布)", x = "", y = "")
# 統計量の計算
median_skewness <- function(x) {
(mean(x) - median(x)) / sd(x)
}
quartile_skewness <- function(x) {
q3 <- quantile(x, 0.75)
q1 <- quantile(x, 0.25)
as.numeric((q3 + q1 - 2 * median(x)) / (q3 - q1))
}
dists <- list(normal_dist, skewed_dist, laplace_dist)
fills <- c("#8DA0CB", "#66C2A5", "#FC8D62")
p4 <- data.frame(
name = paste0("分布", 1:3),
value = map_dbl(dists, skewness)
) |>
ggplot(aes(x = name, y = value)) +
geom_col(fill = fills) +
labs(title = "歪度", x = "", y = "")
p5 <- data.frame(
name = paste0("分布", 1:3),
value = map_dbl(dists, median_skewness)
) |>
ggplot(aes(x = name, y = value)) +
geom_col(fill = fills) +
labs(title = "中央値歪度", x = "", y = "")
p6 <- data.frame(
name = paste0("分布", 1:3),
value = map_dbl(dists, quartile_skewness)
) |>
ggplot(aes(x = name, y = value)) +
geom_col(fill = fills) +
labs(title = "四分位歪度", x = "", y = "")
p7 <- data.frame(
name = paste0("分布", 1:3),
value = map_dbl(dists, kurtosis)
) |>
ggplot(aes(x = name, y = value)) +
geom_col(fill = fills) +
labs(title = "尖度", x = "", y = "")
# グラフの表示
{p1 | p2 | p3} / { p4 | p5 | p6 | p7 }
library(conflicted)
library(tidyverse)
library(patchwork)
library(entropy)
# シード値を固定
set.seed(42)
# サンプルサイズ
n_samples <- 500
# ビンの設定
bins <- seq(-5, 5, length.out = 40)
# 一様分布
uniform_samples <- runif(n_samples, min = -5, max = 5)
uniform_hist <- hist(uniform_samples, breaks = bins, plot = FALSE)
uniform_entropy <- entropy.empirical(uniform_hist$counts)
# 正規分布
normal_samples <- rnorm(n_samples, mean = 0, sd = 1)
normal_hist <- hist(normal_samples, breaks = bins, plot = FALSE)
normal_entropy <- entropy.empirical(normal_hist$counts)
# 2つの正規分布
gmm_samples <- c(
rnorm(n_samples / 2, mean = -2, sd = 0.25),
rnorm(n_samples / 2, mean = 2, sd = 0.25)
)
gmm_hist <- hist(gmm_samples, breaks = bins, plot = FALSE)
gmm_entropy <- entropy.empirical(gmm_hist$counts)
# ヒストグラムの描画
p1 <- ggplot() +
geom_histogram(aes(x = uniform_samples), bins = length(bins) - 1, fill = "skyblue", color = "black") +
labs(title = paste0("一様分布\nエントロピー: ", round(uniform_entropy, 2))) +
theme(aspect.ratio = 1, axis.title = element_blank())
p2 <- ggplot() +
geom_histogram(aes(x = normal_samples), bins = length(bins) - 1, fill = "salmon", color = "black") +
labs(title = paste0("正規分布\nエントロピー: ", round(normal_entropy, 2))) +
theme(aspect.ratio = 1, axis.title = element_blank())
p3 <- ggplot() +
geom_histogram(aes(x = gmm_samples), bins = length(bins) - 1, fill = "lightgreen", color = "black") +
labs(title = paste0("2つの正規分布の混合\nエントロピー: ", round(gmm_entropy, 2))) +
theme(aspect.ratio = 1, axis.title = element_blank())
# グラフの表示
p1 + p2 + p3
library(conflicted)
library(tidyverse)
library(patchwork)
library(entropy)
# シード値を固定
set.seed(42)
# サンプルサイズ
n_samples <- 50
small_bins <- seq(-5, 5, length.out = 400)# 小さなビン(400分割)
large_bins <- seq(-5, 5, length.out = 5)# 大きなビン(5分割)
# 一様分布
uniform_samples <- runif(n_samples, min = -5, max = 5)
uniform_entropy_small <- entropy.empirical(hist(uniform_samples, breaks = small_bins, plot = FALSE)$counts)
uniform_entropy_large <- entropy.empirical(hist(uniform_samples, breaks = large_bins, plot = FALSE)$counts)
# 正規分布
normal_samples <- rnorm(n_samples, mean = 0, sd = 1)
normal_entropy_small <- entropy.empirical(hist(normal_samples, breaks = small_bins, plot = FALSE)$counts)
normal_entropy_large <- entropy.empirical(hist(normal_samples, breaks = large_bins, plot = FALSE)$counts)
# 2つの正規分布
gmm_samples <- c(rnorm(n_samples / 2, mean = -2, sd = 0.25), rnorm(n_samples / 2, mean = 2, sd = 0.25))
gmm_entropy_small <- entropy.empirical(hist(gmm_samples, breaks = small_bins, plot = FALSE)$counts)
gmm_entropy_large <- entropy.empirical(hist(gmm_samples, breaks = large_bins, plot = FALSE)$counts)
my_plot <- function(samples, bins, entropy, fill_color, title) {
ggplot() +
geom_histogram(aes(x = samples), bins = length(bins) - 1, fill = fill_color) +
labs(title = paste0(title, round(entropy, 2))) +
theme(axis.title = element_blank(), aspect.ratio = 1) +
coord_cartesian(xlim = c(-5, 5))
}
p1 <- my_plot(uniform_samples, small_bins, uniform_entropy_small, "skyblue", "一様分布\nエントロピー: ")
p2 <- my_plot(normal_samples, small_bins, normal_entropy_small, "salmon", "正規分布\nエントロピー: ")
p3 <- my_plot(gmm_samples, small_bins, gmm_entropy_small, "lightgreen", "2つの正規分布の混合\nエントロピー: ")
p4 <- my_plot(uniform_samples, large_bins, uniform_entropy_large, "skyblue", "一様分布\nエントロピー: ")
p5 <- my_plot(normal_samples, large_bins, normal_entropy_large, "salmon", "正規分布\nエントロピー: ")
p6 <- my_plot(gmm_samples, large_bins, gmm_entropy_large, "lightgreen", "2つの正規分布の混合\nエントロピー: ")
# グラフの表示
(p1 + p2 + p3) / (p4 + p5 + p6)
library(conflicted)
library(tidyverse)
library(patchwork)
# シード値を固定
set.seed(42)
# サンプルサイズ
n_samples <- 100
# 正規分布からのサンプリング(平均400万、標準偏差20万)
normal_samples <- rnorm(n_samples, mean = 400, sd = 20)
# 平均400万の対数正規分布からのサンプリング
mean_lognormal <- 400
sigma_lognormal <- 0.5 # 任意の標準偏差
lognormal_samples <- rlnorm(
n_samples,
meanlog = log(mean_lognormal) - 0.5 * sigma_lognormal^2,
sdlog = sigma_lognormal
)
# ローレンツ曲線とジニ係数の計算を行う関数
calculate_lorenz_curve <- function(samples) {
sorted_samples <- sort(samples) # サンプルを昇順に並び替える
cum_frequencies <- cumsum(sorted_samples) / sum(sorted_samples) # 累積相対度数
cum_population <- seq(1, n_samples) / n_samples # 累積人口の割合
gini <- mean(cum_population - cum_frequencies) * 2 # ジニ係数
return(list(cum_population = cum_population, cum_frequencies = cum_frequencies, gini = gini))
}
# 正規分布のローレンツ曲線とジニ係数
normal_lorenz <- calculate_lorenz_curve(normal_samples)
# 対数正規分布のローレンツ曲線とジニ係数
lognormal_lorenz <- calculate_lorenz_curve(lognormal_samples)
# ヒストグラムの描画
p1 <- ggplot() +
geom_histogram(aes(x = normal_samples), bins = 20, fill = "skyblue") +
labs(title = paste0("年収分布その1\nジニ係数: ", round(normal_lorenz$gini, 3)),
x = "年収(万円)", y = "頻度")
p2 <- ggplot() +
geom_histogram(aes(x = lognormal_samples), bins = 20, fill = "salmon") +
labs(title = paste0("年収分布その2\nジニ係数: ", round(lognormal_lorenz$gini, 3)),
x = "年収(万円)", y = "頻度")
# ローレンツ曲線の描画
df <- data.frame(x1 =c(0, 1), y1 = c(0, 0), x2 = c(1, 1), y2 = c(0, 1))
p3 <- ggplot() +
geom_line(aes(x = normal_lorenz$cum_population, y = normal_lorenz$cum_frequencies), color = "blue") +
geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "red") +
geom_segment(aes(x = x1, y = y1, xend = x2, yend = y2), data = df, linetype="dotted", color="forestgreen", linewidth = 1) +
labs(title = paste0("ローレンツ曲線(年収分布その1)\nジニ係数: ", round(normal_lorenz$gini, 3)),
x = "累積人数の割合", y = "累積年収の割合")
p4 <- ggplot() +
geom_line(aes(x = lognormal_lorenz$cum_population, y = lognormal_lorenz$cum_frequencies), color = "blue") +
geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "red") +
geom_segment(aes(x = x1, y = y1, xend = x2, yend = y2), data = df, linetype="dotted", color="forestgreen", linewidth = 1) +
labs(title = paste0("ローレンツ曲線(年収分布その2)\nジニ係数: ", round(lognormal_lorenz$gini, 3)),
x = "累積人数の割合", y = "累積年収の割合")
(p1 + p2) / (p3 + p4)
library(conflicted)
library(tidyverse)
# シードの固定
set.seed(0)
# パラメータ設定
n <- 10
mean_income <- 400 # 平均収入
mean_log <- log(mean_income) # 対数正規分布の平均(ログスケール)
std_dev_log <- 0.5 # 対数正規分布の標準偏差(ログスケール)
# 対数正規分布から収入をサンプリング
unequal_incomes <- rlnorm(n, meanlog = mean_log, sdlog = std_dev_log)
# 完全均一な収入を作成
mean_income <- mean(unequal_incomes)
equal_incomes <- rep(mean_income, n)
# タイル指数(Theil Index)を計算する関数
calc_theil_index <- function(incomes) {
mean_income <- mean(incomes)
return(sum((incomes / mean_income) * log(incomes / mean_income)) / n)
}
# タイル指数を計算
unequal_theil <- calc_theil_index(unequal_incomes) # 不均一な収入のケース
equal_theil <- calc_theil_index(equal_incomes) # 均一な収入のケース
# プロット
df <- data.frame(
Person = rep(LETTERS[1:n], 2),
Income = c(unequal_incomes, equal_incomes),
Distribution = factor(rep(c("Unequal", "Equal"), each = n), levels = c("Unequal", "Equal"))
)
df |>
ggplot(aes(x = Person, y = Income, fill = Distribution)) +
geom_col() +
scale_fill_manual(values = c("skyblue", "salmon")) +
facet_wrap(
vars(Distribution),
labeller = as_labeller(c(
Unequal = paste0("ばらついた収入分布 (タイル指数: ", format(round(unequal_theil, 4), nsmall = 4), ")"),
Equal = paste0("一様な収入分布 (タイル指数: ", format(round(equal_theil, 4), nsmall = 4), ")")))
) +
labs(x = "", y = "収入 [万円]") +
theme(legend.position = "none")
第5章はここまで。