図の右上のshow
ボタンを押すとRのコードが表示されます。
library(conflicted)
library(tidyverse)
# 平方根記号を表示させる方法が分からない
# ポイントの定義
x1 <- 1
y1 <- 3
x2 <- 4
y2 <- 2
# ユークリッド距離の計算
euclidean_distance <- sqrt((x2 - x1)^2 + (y2 - y1)^2)
# マンハッタン距離の計算
manhattan_distance <- abs(x2 - x1) + abs(y2 - y1)
data.frame(x = c(x1, x2), y = c(y1, y2)) |>
ggplot(aes(x = x, y = y)) +
geom_point(color = "gold", size = 4) +
geom_segment(aes(x = x1, y = y1, xend = x2, yend = y2), linetype = "dashed", color = "blue") +
geom_segment(aes(x = x1, y = y1, xend = x2, yend = y1), linetype = "dashed", color = "salmon") +
geom_segment(aes(x = x2, y = y1, xend = x2, yend = y2), linetype = "dashed", color = "salmon") +
annotate("text", x = (x1 + x2) / 2, y = y1, label = abs(x2 - x1), color = "salmon", hjust = 1, vjust = -1) +
annotate("text", x = x2, y = (y1 + y2) / 2, label = abs(y2 - y1), color = "salmon", hjust = -1, vjust = 1) +
annotate("text", x = (x1 + x2) / 2, y = (y1 + y2) / 2, parse = TRUE,
label = paste("sqrt(10) == ", round(euclidean_distance, 2)),
color = "blue", hjust = 1.1, vjust = 1.1) +
labs(x = "x", y = "y") +
coord_cartesian(xlim = c(0, 5), ylim = c(0, 5)) +
theme(aspect.ratio = 1)
参考: 【R】分散共分散行列とユークリッド距離・マハラノビス距離の関係の可視化
library(conflicted)
library(tidyverse)
library(ellipse)
library(rvest) #スクレイピング
# 公開されている元データを読み取る
df <- read_html("http://wiki.stat.ucla.edu/socr/index.php/SOCR_Data_MLB_HeightsWeights") |>
html_table() |>
pluck(2) |>
drop_na() |>
mutate( # 単位をSI単位に変換(Heightをメートルに、Weightをキログラムに)
Height_m = `Height(inches)` * 0.0254,
Weight_kg = `Weight(pounds)` * 0.453592
)
# 共分散行列と平均ベクトルを計算
x_mat <- df[, c("Height_m", "Weight_kg")]
mean_vector <- colMeans(x_mat)
cov_matrix <- cov(x_mat)
# マハラビノス距離を計算
df2 <- df |>
mutate(
Mahalanobis = t(t(x_mat) - mean_vector) %*% solve(cov_matrix) %*% (t(x_mat) - mean_vector) |>
diag() |>
sqrt()
)
my_ellipse <- function(rad) {
car::ellipse(center = mean_vector, shape = cov_matrix, radius = rad, draw = FALSE) |>
as.data.frame() |>
mutate(Height_m = x, Weight_kg = y, .keep = "unused")
}
df2 |>
ggplot(aes(x = Height_m, y = Weight_kg))+
geom_point(aes(color = Mahalanobis)) +
scale_color_viridis_c() +
geom_path(data = my_ellipse(1), color = 'red', linetype = 'dashed') +
geom_path(data = my_ellipse(2), color = 'red', linetype = 'dashed') +
geom_path(data = my_ellipse(3), color = 'red', linetype = 'dashed') +
geom_path(data = my_ellipse(4), color = 'red', linetype = 'dashed') +
labs(title = "メジャーリーガーの身長と体重", x = "身長[m]", y = "体重[kg]", color = "マハラビノス距離") +
theme(aspect.ratio = 1)
library(conflicted)
library(tidyverse)
library(igraph)
library(tidygraph)
library(ggraph)
# グラフ(バーベルグラフ)を生成
g1 <- g2 <- make_full_graph(5)
g <- disjoint_union(g1, g2) |>
add_vertices(1) |>
add_edges(c(5, 11, 11, 6))
# 基準となるノード(ここではノード 1)
source_node <- 1
# 基準ノードからの最短距離を計算
V(g)$label <- distances(g, v=source_node)[1, ]
# ネットワークを描画する際のノードの色を設定
V(g)$color <- if_else(V(g) == source_node, "1", "0")
# tidygraphへ変換
g_tidy <- as_tbl_graph(g, directed = FALSE)
g_tidy |>
ggraph(layout = "igraph", algorithm = "kk") +
geom_edge_link(color = "black", alpha = 0.5) +
geom_node_label(aes(label = label, fill = color), size = 10) +
theme_void() +
theme(aspect.ratio = 1, legend.position = "none")
## Warning: Using the `size` aesthetic in this geom was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` in the `default_aes` field and elsewhere instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
library(conflicted)
library(tidyverse)
library(patchwork)
# シード値を固定
set.seed(42)
# サンプルサイズを設定。
sample_size <- 1000
# 一つ目と二つ目の分布ペアを生成
mean1 <- 0
std1 <- 1
mean2 <- 0.1
std2 <- 1
dist1 <- rnorm(sample_size, mean1, std1)
dist2 <- rnorm(sample_size, mean2, std2)
# Kolmogorov-Smirnov 統計量を計算
ks_test <- ks.test(dist1, dist2)
# データフレームを作成
df <- bind_rows(
data.frame(value = dist1, distribution = "分布1"),
data.frame(value = dist2, distribution = "分布2")
)
# ヒストグラム
p1 <- df |>
ggplot(aes(x = value, fill = distribution)) +
geom_histogram(position="identity", alpha=0.5, bins=50) +
labs(x="値", y="頻度", fill="分布") +
theme(legend.title = element_blank(), legend.position = c(0.9, 0.8))
# CDF
p2 <- df |>
ggplot(aes(x=value, color=distribution)) +
stat_ecdf(geom = "step") +
labs(x="値", y="累積分布", color="分布") +
annotate("text", x = mean1 + 1, y = 0.5,
label = paste("最大差:", round(ks_test$statistic, 3))
) +
labs(title = paste(
"コルモゴロフ・スミルノフ統計量:", round(ks_test$statistic, 3),
", p値:", round(ks_test$p.value, 3))
) +
theme(legend.title = element_blank(), legend.position = c(0.1, 0.8))
p1 / p2
library(conflicted)
library(tidyverse)
# サンプルデータの生成
num_samples <- 10000
set.seed(0)
samples1 <- rnorm(num_samples, -1, 1)
samples2 <- rnorm(num_samples, 1, 1)
# ビンの設定
bins <- seq(-5, 5, length.out = 51)
# ヒストグラムを計算
hist1 <- hist(samples1, breaks=bins, plot=FALSE)
hist2 <- hist(samples2, breaks=bins, plot=FALSE)
# ヒストグラムを密度に変換
hist1$density <- hist1$counts / sum(hist1$counts) / diff(bins)[1]
hist2$density <- hist2$counts / sum(hist2$counts) / diff(bins)[1]
# 全変動距離(TVD)を計算
tvd_sample <- 0.5 * sum(abs(hist1$density - hist2$density)) * diff(bins)[1]
# データフレームを作成
df <- bind_rows(
data.frame(value = samples1, distribution = "分布1"),
data.frame(value = samples2, distribution = "分布2")
)
ggplot() +
geom_histogram(data = df, aes(x=value, y=after_stat(density), fill=distribution), position='identity', alpha=0.3, bins=50) +
labs(x='値', y='確率密度', fill='分布') +
geom_line(data=data.frame(x=hist1$mids, y=hist1$density, distribution = "分布1"), aes(x=x, y=y), color='red') +
geom_line(data=data.frame(x=hist2$mids, y=hist2$density, distribution = "分布2"), aes(x=x, y=y), color='blue') +
labs(title = paste("全変動距離: ", round(tvd_sample, 4))) +
theme(legend.position = c(0.9, 0.8))
library(conflicted)
library(tidyverse)
# シード値を固定
set.seed(42)
# サンプルデータの生成
num_samples <- 50
samples1 <- sort(rnorm(num_samples, 0, 1))
samples2 <- sort(rnorm(num_samples, 2, 1))
bind_rows(
data.frame(value = samples1, distribution = rep("分布1", num_samples)),
data.frame(value = samples2, distribution = rep("分布2", num_samples))
) |>
ggplot(aes(x=value, y=distribution)) +
geom_segment(
data=data.frame(
x1=samples1, x2=samples2,
y1=rep("分布1", num_samples), y2=rep("分布2", num_samples)),
aes(x=x1, xend=x2, y=y1, yend=y2), color='gray') +
geom_point(aes(color=factor(distribution)), size=3) +
scale_color_manual(values=c('red', 'blue'), labels=c("分布1", "分布2")) +
labs(x="値", y="分布", color="分布", title = "一致させるために必要な総移動距離") +
theme(legend.position="none", axis.title = element_blank())
library(conflicted)
library(tidyverse)
# KL情報量を計算する関数
kl_divergence <- function(p, q) {
return(sum(ifelse(p != 0, p * log(p / q), 0)))
}
# サンプルデータの生成
num_samples <- 10000
set.seed(42)
samples1 <- rnorm(num_samples, 0, 0.5)
samples2 <- rnorm(num_samples, 2, 1)
# サンプルを[-1, 3]の範囲に絞り込む
filtered_samples1 <- samples1[samples1 >= -1 & samples1 <= 3]
filtered_samples2 <- samples2[samples2 >= -1 & samples2 <= 3]
# 非ゼロのビンを確保するために、限られた範囲でヒストグラムを生成
limited_bins <- seq(-1, 3, length.out = 50)
hist1 <- hist(filtered_samples1, breaks = limited_bins, plot = FALSE)
hist2 <- hist(filtered_samples2, breaks = limited_bins, plot = FALSE)
# ヒストグラムを正規化
hist1$density <- hist1$counts / sum(hist1$counts * diff(limited_bins))
hist2$density <- hist2$counts / sum(hist2$counts * diff(limited_bins))
# KLダイバージェンスを再計算
kl_1_to_2 <- kl_divergence(hist1$density, hist2$density)
kl_2_to_1 <- kl_divergence(hist2$density, hist1$density)
# JSダイバージェンスを再計算
js_divergence <- 0.5 * (
kl_divergence(hist1$density, 0.5 * (hist1$density + hist2$density)) +
kl_divergence(hist2$density, 0.5 * (hist1$density + hist2$density))
)
# ダイバージェンスを表示
df1 <- data.frame(x = hist1$mids, y = hist1$density)
df2 <- data.frame(x = hist2$mids, y = hist2$density)
ggplot() +
geom_col(data = df1, aes(x = x, y = y), fill = "blue", alpha = 0.5) +
geom_col(data = df2, aes(x = x, y = y), fill = "red", alpha = 0.5) +
labs(title = paste0("KL情報量: ", round(kl_1_to_2, 4), " (1→2), ∞ (2→1), JS情報量: ", round(js_divergence, 4)),
x = "", y = "相対頻度")
library(conflicted)
library(tidyverse)
library(patchwork)
library(entropy)
library(transport)
# 相互情報量
mutual_information <- function(x, y, n_bins=20) {
hist_x <- hist(x, breaks=n_bins, plot=FALSE)$counts
hist_y <- hist(y, breaks=n_bins, plot=FALSE)$counts
mi <- mi.empirical(hist_x, hist_y)
return(mi)
}
# スミルノフ-コロモゴロフ統計量
ks_statistic <- function(x, y) {
return(ks.test(x, y)$statistic)
}
# カイ二乗統計量
chi_square_statistic <- function(x, y, bin_edges) {
hist_x <- hist(x, breaks=bin_edges, plot=FALSE)$counts
hist_y <- hist(y, breaks=bin_edges, plot=FALSE)$counts
return(chisq.test(hist_x, hist_y)$statistic)
}
# KLダイバージェンス
kl_divergence <- function(p, q) {
kl_div <- entropy::KL.empirical(p, q)
return(ifelse(is.finite(kl_div), kl_div, 0))
}
# JSダイバージェンス
js_divergence <- function(p, q) {
m <- 0.5 * (p + q)
return(0.5 * kl_divergence(p, m) + 0.5 * kl_divergence(q, m))
}
# 全変動距離
total_variation_distance <- function(p, q) {
return(0.5 * sum(abs(p - q)))
}
# 指標を計算
compute_metrics <- function(dist_a, dist_b, bin_edges) {
hist_a <- hist(dist_a, breaks=bin_edges, plot=FALSE)$density
hist_b <- hist(dist_b, breaks=bin_edges, plot=FALSE)$density
return(c(
ks_statistic(dist_a, dist_b),
total_variation_distance(hist_a, hist_b),
# transport::wasserstein1d(bin_edges[-length(bin_edges)], bin_edges[-length(bin_edges)], hist_a, hist_b),
transport::wasserstein1d(dist_a, dist_b),
js_divergence(hist_a, hist_b)
))
}
# サンプルデータ生成
sample_size <- 1000
dist1_1 <- rnorm(sample_size, 0, 1)# 分布1
dist1_2 <- rnorm(sample_size, 1, 1)# 分布2
dist2_1 <- rnorm(sample_size, -1, 0.5)# 分布3
dist2_2 <- rnorm(sample_size, 1, 1.5)# 分布4
dist3_1 <- c(# 分布5と分布6の混合分布
rnorm(sample_size / 2, -2, 1),
rnorm(sample_size / 2, 2, 1)
)
dist3_2 <- rnorm(sample_size, 0, 1.5)# 分布7
# ビンの幅をすべての分布で同じにするためにすべてのデータから最小値と最大値を取得
all_data <- c(dist1_1, dist1_2, dist2_1, dist2_2, dist3_1, dist3_2)
min_val <- min(all_data)
max_val <- max(all_data)
# ビンのエッジを計算
bin_edges <- seq(min_val, max_val, length.out = 21)
# 各距離指標を計算(警告が出る)
distances1 <- compute_metrics(dist1_1, dist1_2, bin_edges)
distances2 <- compute_metrics(dist2_1, dist2_2, bin_edges)
distances3 <- compute_metrics(dist3_1, dist3_2, bin_edges)
# 棒グラフのY軸のスケールを調整
max_distance <- 2.5
# 分布ペアのヒストグラムを描画
df1_1 <- data.frame(x = dist1_1, group = rep("dist1_1", length(dist1_1)))
df1_2 <- data.frame(x = dist1_2, group = rep("dist1_2", length(dist1_2)))
df2_1 <- data.frame(x = dist2_1, group = rep("dist2_1", length(dist2_1)))
df2_2 <- data.frame(x = dist2_2, group = rep("dist2_2", length(dist2_2)))
df3_1 <- data.frame(x = dist3_1, group = rep("dist3_1", length(dist3_1)))
df3_2 <- data.frame(x = dist3_2, group = rep("dist3_2", length(dist3_2)))
p1 <- ggplot() +
geom_histogram(data = df1_1, aes(x = x, fill = group), bins = length(bin_edges), alpha = 0.5) +
geom_histogram(data = df1_2, aes(x = x, fill = group), bins = length(bin_edges), alpha = 0.5) +
labs(title = "分布ペア1") +
theme(legend.position="none", axis.title = element_blank())
p2 <- ggplot() +
geom_histogram(data = df2_1, aes(x = x, fill = group), bins = length(bin_edges), alpha = 0.5) +
geom_histogram(data = df2_2, aes(x = x, fill = group), bins = length(bin_edges), alpha = 0.5) +
labs(title = "分布ペア2") +
theme(legend.position="none", axis.title = element_blank())
p3 <- ggplot() +
geom_histogram(data = df3_1, aes(x = x, fill = group), bins = length(bin_edges), alpha = 0.5) +
geom_histogram(data = df3_2, aes(x = x, fill = group), bins = length(bin_edges), alpha = 0.5) +
labs(title = "分布ペア3") +
theme(legend.position="none", axis.title = element_blank())
# 各指標の距離をグループ棒グラフとして描画
metrics <- c("コルモゴロフ・スミルノフ統計量", "全変動距離", "ワッサースタイン距離", "JS情報量")
p4 <- data.frame(
metrics = factor(rep(metrics, 3), levels = metrics),
distances = c(distances1, distances2, distances3),
group = rep(c("分布ペア1", "分布ペア2", "分布ペア3"), each = length(metrics))
) |>
ggplot(aes(x = metrics, y = distances, fill = group)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = '各指標における分布ペアの距離') +
theme(
axis.title = element_blank(),
legend.title = element_blank(),
legend.position = c(0.9, 0.7)
)
(p1 + p2 + p3) / p4
library(conflicted)
library(tidyverse)
library(ggpubr)
# シード値の設定
set.seed(42)
# 書籍の数
n_books <- 100
# Aさんのレビュー
a_scores <- n_books |> rnorm(60, 15) |> round() |> pmin(100) |> pmax(0)
# Bさんのレビュー(Aさんと高い相関)
b_scores <- (a_scores + (n_books |> rnorm(0, 10) |>round())) |> pmin(100) |> pmax(0)
# Cさんのレビュー(Aさんと緩い逆相関)
c_scores <- (100 - a_scores + (n_books |> rnorm(0, 20) |> round())) |> pmin(100) |> pmax(0)
# 相関係数の計算
corr_ab <- cor(a_scores, b_scores)
corr_ac <- cor(a_scores, c_scores)
# 線形回帰の描画
df <- data.frame(A=a_scores, B=b_scores, C=c_scores)
p1 <- df |>
ggplot(aes(x = A, y = B)) +
geom_point(color = "blue") +
geom_smooth(formula = y ~ x, method = "lm", color="blue", fill="blue") +
labs(title = paste("AさんとBさんの類似度 r =", round(corr_ab, 2)),
x = "Aさんによる評価", y = "Bさんによる評価") +
coord_cartesian(xlim = c(0, 100), ylim = c(0, 100)) +
theme(aspect.ratio = 1)
p2 <- df |>
ggplot(aes(x = A, y = C)) +
geom_point(color = "red") +
geom_smooth(formula = y ~ x, method = "lm", color="red", fill="red") +
labs(title = paste("AさんとCさんの類似度 r =", round(corr_ac, 2)),
x = "Aさんによる評価", y = "Cさんによる評価") +
coord_cartesian(xlim = c(0, 100), ylim = c(0, 100)) +
theme(aspect.ratio = 1)
p1 + p2
library(conflicted)
library(tidyverse)
# データを読み込む
data_df <- read_tsv(
"https://raw.githubusercontent.com/tkEzaki/data_visualization/main/6%E7%AB%A0/data/brain.txt"
)
data_df |>
mutate(time_steps = seq(0, (nrow(data_df) - 1) * 2.5, by = 2.5)) |>
pivot_longer(!time_steps) |>
ggplot(aes(x = time_steps, y = value, color = name)) +
geom_point(size = 0.5) +
geom_line() +
labs(x = "時間 [秒]", y = "活動量", color = "脳領域") +
theme(aspect.ratio = 1/6)
library(conflicted)
library(tidyverse)
data_df <- read_tsv(
"https://raw.githubusercontent.com/tkEzaki/data_visualization/main/6%E7%AB%A0/data/brain.txt"
)
# r値の計算
r_value <- cor(data_df$ROI1, data_df$ROI2)
# 散布図と回帰直線、信頼区間を描画
data_df |>
ggplot(aes(x = ROI1, y = ROI2)) +
geom_path(color = "grey40") +
geom_point() +
geom_smooth(formula = y ~ x, method = "lm", color = "red", fill = "red") +
coord_cartesian(xlim = c(-2.5, 2.5), ylim = c(-2.5, 2.5)) +
theme_bw() +
theme(aspect.ratio = 1) +
annotate("text", x = -1.5, y = 2.5, label = paste("r =", round(r_value, 2)), size = 8) +
labs(x = "脳領域1", y = "脳領域2")
library(conflicted)
library(tidyverse)
library(patchwork)
library(MASS)
generate_correlation <- function(target_corr=0.8, size=100, iter = 100000) {
group <- rep(1:iter, each = size)
samples <- split(
mvrnorm(n = size * iter, mu = c(0, 0), Sigma = matrix(c(1, target_corr, target_corr, 1), nrow=2)),
group)
correlation <- map_dbl(samples, \(x) {
mat <- matrix(x, 100)
cor(mat[,1], mat[,2])
})
fisher_z <- 0.5 * log((1 + correlation) / (1 - correlation))
return(data.frame(correlation,fisher_z))
}
results <- generate_correlation()
# ヒストグラムを描画
bins_orig <- seq(0.6, 1.0, length.out = 41)
bins_z <- seq(0.6, 1.5, length.out = 41)
# 正規分布のラインを引く
p1 <- results |>
ggplot(aes(x=correlation)) +
geom_histogram(aes(y=after_stat(density)), bins=length(bins_orig)-1, fill="blue", alpha=0.6) +
stat_function(fun = dnorm,
args = list(mean = mean(results$correlation), sd = sd(results$correlation)),
color="darkblue", linetype="dashed") +
labs(title="サンプリングされた相関係数", x=expression(italic(r))) +
theme(axis.title.y = element_blank())
p2 <- results |>
ggplot(aes(x=fisher_z)) +
geom_histogram(aes(y=after_stat(density)), bins=length(bins_z)-1, fill="red", alpha=0.6) +
stat_function(fun = dnorm,
args = list(mean = mean(results$fisher_z), sd = sd(results$fisher_z)),
color="darkred", linetype="dashed") +
labs(title="フィッシャー変換後の相関係数", x=expression(italic(z))) +
theme(axis.title.y = element_blank())
p1 + p2
library(conflicted)
library(tidyverse)
library(patchwork)
# Seabornのtipsデータセットを使用
df <- read.csv("https://raw.githubusercontent.com/mwaskom/seaborn-data/master/tips.csv")
# 相関係数の計算
corr_total_tip <- cor(df$total_bill, df$tip)
corr_total_size <- cor(df$total_bill, df$size)
corr_tip_size <- cor(df$tip, df$size)
# 散布図と回帰直線を描画する
p1 <- df |>
ggplot(aes(x=size, y=tip)) +
geom_point(alpha=0.4, color="darkgreen") +
geom_smooth(formula = y ~ x, method="lm", color="darkgreen", fill ="darkgreen") +
labs(title=paste("グループの人数とチップの額\n r = ", round(corr_tip_size, 2)),
x = "グループの人数", y = "チップの額 [$]") +
theme(aspect.ratio = 1)
p2 <- df |>
ggplot(aes(x=total_bill, y=tip)) +
geom_point(alpha=0.4, color="darkblue") +
geom_smooth(formula = y ~ x, method="lm", color="darkblue", fill="darkblue") +
labs(title=paste("会計額とチップの額\n r = ", round(corr_total_tip, 2)),
x = "会計額 [$]", y ="チップの額 [$]") +
theme(aspect.ratio = 1)
p3 <- ggplot(df, aes(x=size, y=total_bill)) +
geom_point(alpha=0.4, color="darkred") +
geom_smooth(formula = y ~ x, method="lm", color="darkred", fill="darkred") +
labs(title=paste("グループの人数と会計額\n r = ", round(corr_total_size, 2)),
x = "グループの人数", y ="会計額 [$]") +
theme(aspect.ratio = 1)
p1 + p2 + p3
library(conflicted)
library(tidyverse)
library(palmerpenguins)
library(rsample)
data("penguins")
# 前処理
penguins_clean <- penguins |>
dplyr::select(species, body_mass_g, bill_length_mm, bill_depth_mm, flipper_length_mm) |>
drop_na()
# データ分割
penguins_split <- penguins_clean |>
initial_split(prop = 0.8, strata = body_mass_g)
penguins_train <- training(penguins_split)
penguins_test <- testing(penguins_split)
fit <- lm(body_mass_g ~ bill_length_mm + bill_depth_mm + flipper_length_mm, data = penguins_train)
df_wide <- penguins_test |>
dplyr::select(species, body_mass_g) |>
mutate(pred = predict(fit, newdata = penguins_test)) |>
arrange(species) |>
mutate(id = row_number())
df_long <- df_wide |>
pivot_longer(!c(species, id)) |>
mutate(type = case_when(
name == "pred" ~ "予測値",
name == "body_mass_g" ~ species
))
df_long |>
ggplot(aes(x = id, y = value, color = type)) +
geom_segment(data = df_wide, aes(x = id, y = body_mass_g, xend = id, yend = pred), color = "black", linetype = "dashed") +
geom_point() +
scale_color_manual(values = c("royalblue", "salmon", "forestgreen", "black")) +
labs(title = "個体ごとの真の体重と予測された体重", x = "個体番号", y = "体重 [g]") +
theme(legend.title = element_blank(), legend.position = c(0.9, 0.2))
## 要求されたパッケージ viridisLite をロード中です
# 確率分布を準備
joint_prob <- matrix(c(
1 / 36 - 1 / 72, 1 / 36 + 1 / 72 / 5, 1 / 36 + 1 / 72 / 5,
1 / 36 + 1 / 72 / 5, 1 / 36 + 1 / 72 / 5, 1 / 36 + 1 / 72 / 5,
1 / 36 + 1 / 72 / 5, 1 / 36 - 1 / 72, 1 / 36 + 1 / 72 / 5,
1 / 36 + 1 / 72 / 5, 1 / 36 + 1 / 72 / 5, 1 / 36 + 1 / 72 / 5,
1 / 36 + 1 / 72 / 5, 1 / 36 + 1 / 72 / 5, 1 / 36 - 1 / 72,
1 / 36 + 1 / 72 / 5, 1 / 36 + 1 / 72 / 5, 1 / 36 + 1 / 72 / 5,
1 / 36 + 1 / 72 / 5, 1 / 36 + 1 / 72 / 5, 1 / 36 + 1 / 72 / 5,
1 / 36 - 1 / 72, 1 / 36 + 1 / 72 / 5, 1 / 36 + 1 / 72 / 5,
1 / 36 + 1 / 72 / 5, 1 / 36 + 1 / 72 / 5, 1 / 36 + 1 / 72 / 5,
1 / 36 + 1 / 72 / 5, 1 / 36 - 1 / 72, 1 / 36 + 1 / 72 / 5,
1 / 36 + 1 / 72 / 5, 1 / 36 + 1 / 72 / 5, 1 / 36 + 1 / 72 / 5,
1 / 36 + 1 / 72 / 5, 1 / 36 + 1 / 72 / 5, 1 / 36 - 1 / 72
), nrow = 6, byrow = TRUE)
joint_prob <- joint_prob / sum(joint_prob)
# サンプルをjoint_probに従って1000回サンプリング
set.seed(0)
library(tictoc)
samples <- sample.int(36, size = 1000000, replace = TRUE, prob = c(joint_prob))
# サンプリングされた値を(X, Y)ペアに変換
samples_die1 <- (samples - 1) %/% 6 + 1
samples_die2 <- (samples - 1) %% 6 + 1
# ヒートマップ
p1 <- data.frame(
samples_die1 = factor((samples - 1) %/% 6 + 1),
samples_die2 = factor((samples - 1) %% 6 + 1)
) |>
summarise(n = n(), .by = c(samples_die1, samples_die2)) |>
mutate(prob = n / 1000000) |>
ggplot(aes(x = samples_die1, y = samples_die2, fill = prob, label = format(round(prob, 3)), nsmall = 3)) +
geom_tile() +
geom_text(color = "white", size = 3) +
scale_fill_gradient(low = "darkblue", high = "darkred") +
theme_minimal() +
theme(legend.title = element_blank(),
aspect.ratio = 1) +
labs(x = "サイコロAの出目", y = "サイコロBの出目")
# サイコロAが1が出たときのサイコロBの結果を棒グラフで描画
samples_die2_when_die1_is_1 <- samples_die2[samples_die1 == 1]
p2 <- data.frame(table(samples_die2_when_die1_is_1) / length(samples_die2_when_die1_is_1)) |>
ggplot(aes(y = Freq, x = samples_die2_when_die1_is_1)) +
geom_col(fill = "royalblue", color = "black") +
labs(x = "サイコロBの出目", y = "相対頻度") +
theme(aspect.ratio = 1)
p1 + p2
library(conflicted)
library(tidyverse)
library(patchwork)
# サイコロの実験をシミュレーションする関数
simulate_dice_experiment <- function(n_trials) {
# ボーナスタイムと通常時の確率
bonus_prob <- c(1 / 11, 2 / 11, 2 / 11, 2 / 11, 2 / 11, 2 / 11)
normal_prob <- c(2 / 11, 9 / 55, 9 / 55, 9 / 55, 9 / 55, 9 / 55)
set.seed(5)
die1 <- sample.int(6, size = n_trials, prob = normal_prob, replace = TRUE) # 通常時のサイコロAの出目
set.seed(1)
die2 <- sample.int(6, size = n_trials, prob = normal_prob, replace = TRUE) # 通常時のサイコロBの出目
set.seed(2)
bonus_time_die1 <- sample.int(6, size = n_trials, prob = bonus_prob, replace = TRUE) # ボーナスタイム時のサイコロAの出目
set.seed(3)
bonus_time_die2 <- sample.int(6, size = n_trials, prob = bonus_prob, replace = TRUE) # ボーナスタイム時のサイコロBの出目
die1_lag <- dplyr::lag(die1)
dies <- data.frame(
die1, die2, die1_lag, bonus_time_die1, bonus_time_die2,
samples_die1 = if_else(die1_lag != 1 | is.na(die1_lag), die1, bonus_time_die1),
samples_die2 = if_else(die1_lag != 1 | is.na(die1_lag), die2, bonus_time_die2)
)
prob <- dies |>
summarise(n = n(), .by = c(samples_die1, samples_die2)) |>
mutate(
samples_die1 = factor(samples_die1),
samples_die2 = factor(samples_die2),
prob = n / n_trials, .keep = "unused"
)
return(list(dies = dies, prob = prob))
}
result <- simulate_dice_experiment(100000)
# ヒートマップ
p1 <- result$prob |>
ggplot(aes(x = samples_die1, y = samples_die2, fill = prob, label = format(round(prob, 3)), nsmall = 3)) +
geom_tile() +
geom_text(color = "white", size = 3) +
scale_fill_gradient(low = "darkblue", high = "darkred") +
theme_minimal() +
labs(y = "サイコロAの出目", x = "サイコロBの出目")+
theme(legend.title = element_blank(), aspect.ratio = 1)
# サイコロAが1が出たときのサイコロBの結果を棒グラフで描画する関数
p2 <- result$dies |>
dplyr::filter(die1 == 1) |>
summarise(n = n(), .by = samples_die2) |>
mutate(prob = n / sum(n)) |>
ggplot(aes(x = samples_die2, y = prob)) +
geom_col(fill = "royalblue", color = "black") +
labs(x = "サイコロBの出目", y = "相対頻度")
p1 + p2
bonus_rect <- result$dies |>
slice_head(n = 50) |>
mutate(trial = 1:50) |>
dplyr::filter(die1_lag == 1) |>
dplyr::select(trial) |>
mutate(xmin = trial - 0.5, xmax = trial + 0.5)
df <- data.frame(
trial = 1:50,
die1 = result$dies$samples_die1[1:50],
die2 = result$dies$samples_die2[1:50]
) |>
pivot_longer(!trial)
p3 <- ggplot() +
geom_rect(data = bonus_rect, aes(xmin=xmin, xmax=xmax, ymin=1, ymax=6), fill="royalblue", alpha = 0.5) +
geom_line(data = df, aes(x = trial, y = value, color = name)) +
geom_point(data = df, aes(x = trial, y = value, color = name)) +
labs(x = "試行回数", y = "サイコロの出目", color = "🎲") +
theme(legend.title = element_blank()) +
theme_bw()
p4 <- result$dies |>
dplyr::filter(die1_lag == 1) |>
summarise(n = n(), .by = samples_die1) |>
mutate(prob = n / sum(n)) |>
ggplot(aes(x = samples_die1, y = prob)) +
geom_col(fill = "royalblue", color = "black") +
labs(x = "サイコロAの出目", y = "相対頻度")
p5 <- result$dies |>
dplyr::filter(die1_lag == 1) |>
summarise(n = n(), .by = samples_die2) |>
mutate(prob = n / sum(n)) |>
ggplot(aes(x = samples_die2, y = prob)) +
geom_col(fill = "salmon", color = "black") +
labs(x = "サイコロBの出目", y = "相対頻度")
p3 / (p4 + p5)
第6章はここまで。