## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
hotel <- read_parquet("https://github.com/ghmagazine/Awesomebook_v2/raw/main/data/hotel.parquet")
customer <- read_parquet("https://github.com/ghmagazine/Awesomebook_v2/raw/main/data/customer.parquet")
reservation <- read_parquet("https://github.com/ghmagazine/Awesomebook_v2/raw/main/data/reservation.parquet")
不偏分散、不偏標準偏差ではなく標本分散、標本標準偏差です。
reservation |>
dplyr::filter(status != "canceled") |>
summarise(
reservation_cnt = n(), # カウント
sales = sum(total_price), # 総和
mean_sales = mean(total_price), # 平均値
min_sales = min(total_price), # 最小値
max_sales = max(total_price), # 最大値
var_sales = var(total_price), # 標本分散
std_sales = sd(total_price) # 標本標準偏差
)
## # A tibble: 1 × 7
## reservation_cnt sales mean_sales min_sales max_sales var_sales std_sales
## <int> <dbl> <dbl> <int> <int> <dbl> <dbl>
## 1 1799589 72048928400 40036. 4000 597000 1.22e9 34982.
Rは中央値の線形補間を行わないのでテキストの結果とは異なる
reservation |>
dplyr::filter(status != "canceled") |>
summarise(
median_sales = median(total_price),
p25_sales = quantile(total_price, probs = 0.25),
p75_sales = quantile(total_price, probs = 0.75)
)
## # A tibble: 1 × 3
## median_sales p25_sales p75_sales
## <int> <dbl> <dbl>
## 1 30400 17200 51000
# 最頻値を算出する関数を定義
Mode <- function(x) {
ux <- unique(x)
ux[which.max(tabulate(match(x, ux)))]
}
reservation |>
dplyr::filter(status != "canceled") |>
summarise(
mode_people_num = Mode(people_num),
.by = hotel_id
)
## # A tibble: 4,527 × 2
## hotel_id mode_people_num
## <int> <int>
## 1 2460 1
## 2 962 3
## 3 558 4
## 4 3666 2
## 5 2180 4
## 6 974 2
## 7 2260 3
## 8 314 3
## 9 2211 4
## 10 333 1
## # ℹ 4,517 more rows
reservation |>
dplyr::filter(status != "canceled") |>
summarise(
sales = sum(total_price),
.by = hotel_id
)
## # A tibble: 4,527 × 2
## hotel_id sales
## <int> <int>
## 1 2460 12196800
## 2 962 18081000
## 3 558 12311400
## 4 3666 27967500
## 5 2180 15559500
## 6 974 25750800
## 7 2260 22950600
## 8 314 50295000
## 9 2211 16504400
## 10 333 11569600
## # ℹ 4,517 more rows
reservation |>
dplyr::filter(status != "canceled") |>
summarise(
reservation_cnt = n(),
.by = c(hotel_id, customer_id)
)
## # A tibble: 1,798,305 × 3
## hotel_id customer_id reservation_cnt
## <int> <int> <int>
## 1 2460 53431 1
## 2 962 488390 1
## 3 558 341335 1
## 4 3666 398981 1
## 5 2180 220381 1
## 6 974 1494 1
## 7 2260 24104 1
## 8 314 124883 1
## 9 2211 45282 1
## 10 333 390595 1
## # ℹ 1,798,295 more rows
hotel |>
mutate(unit_price_range = floor(unit_price / 5000) * 5000) |>
summarise(count = n(), .by = unit_price_range) |>
arrange(unit_price_range) #独自に加えました
## # A tibble: 8 × 2
## unit_price_range count
## <dbl> <int>
## 1 0 220
## 2 5000 2760
## 3 10000 1389
## 4 15000 456
## 5 20000 139
## 6 25000 29
## 7 30000 6
## 8 35000 1
calc_unit_price_range <- function(x) {
if(x < 5000) {
return("1: 0~4999")
} else if(x < 10000) {
return("2: 5000~9999")
} else if(x < 20000) {
return("3: 10000~19999")
} else if(x < 30000) {
return("4: 20000~29999")
} else {
return("5: 30000~")
}
}
hotel |>
mutate(unit_price_range = sapply(unit_price, calc_unit_price_range)) |>
count(unit_price_range)
## # A tibble: 5 × 2
## unit_price_range n
## <chr> <int>
## 1 1: 0~4999 220
## 2 2: 5000~9999 2760
## 3 3: 10000~19999 1845
## 4 4: 20000~29999 168
## 5 5: 30000~ 7
hotel |>
mutate(unit_price_range = case_when(
unit_price < 5000 ~ "1: 0~4999",
unit_price < 10000 ~ "2: 5000~9999",
unit_price < 20000 ~ "3: 10000~19999",
unit_price < 30000 ~ "4: 20000~29999",
TRUE ~ "5: 30000~"
)) |>
count(unit_price_range)
## # A tibble: 5 × 2
## unit_price_range n
## <chr> <int>
## 1 1: 0~4999 220
## 2 2: 5000~9999 2760
## 3 3: 10000~19999 1845
## 4 4: 20000~29999 168
## 5 5: 30000~ 7
hotel |>
mutate(
unit_price_range = case_when(
unit_price < 5000 ~ 0,
unit_price < 10000 ~ 5000,
unit_price < 20000 ~ 10000,
unit_price < 30000 ~ 20000,
.default = 30000
)) |>
summarise(count = n(), .by = unit_price_range) |>
arrange(unit_price_range) #独自に加えました
## # A tibble: 5 × 2
## unit_price_range count
## <dbl> <int>
## 1 0 220
## 2 5000 2760
## 3 10000 1845
## 4 20000 168
## 5 30000 7
hotel |>
group_by(unit_price_range = cut(unit_price, breaks = c(-Inf, 5000, 10000, 20000, 30000, Inf), right = FALSE)) |>
summarise(Count = n()) |>
arrange(unit_price_range) #独自に加えました
## # A tibble: 5 × 2
## unit_price_range Count
## <fct> <int>
## 1 [-Inf,5e+03) 220
## 2 [5e+03,1e+04) 2760
## 3 [1e+04,2e+04) 1845
## 4 [2e+04,3e+04) 168
## 5 [3e+04, Inf) 7
reservation |>
dplyr::filter(status != "canceled") |>
mutate(month = format(ymd(checkout_date), "%Y-%m")) |>
summarise(total_sales = sum(total_price), .by = month)
## # A tibble: 60 × 2
## month total_sales
## <chr> <dbl>
## 1 2015-01 730844200
## 2 2015-02 650411000
## 3 2015-03 1447995200
## 4 2015-04 697578500
## 5 2015-05 2185718000
## 6 2015-06 697210900
## 7 2015-07 1465165300
## 8 2015-08 2910478200
## 9 2015-09 708236200
## 10 2015-10 733080000
## # ℹ 50 more rows
1)キャンセル済みではない予約の抽出 2)checkout_dateを月ごとの年月に変換 3)monthの値ごとにtotal_priceの総和を計算
reservation |>
dplyr::filter(status != "canceled") |>
mutate(month = floor_date(checkout_date, "month")) |>
group_by(month) |>
summarise(total_price = sum(total_price))
## # A tibble: 60 × 2
## month total_price
## <dttm> <dbl>
## 1 2015-01-01 00:00:00 730844200
## 2 2015-02-01 00:00:00 650411000
## 3 2015-03-01 00:00:00 1447995200
## 4 2015-04-01 00:00:00 697578500
## 5 2015-05-01 00:00:00 2185718000
## 6 2015-06-01 00:00:00 697210900
## 7 2015-07-01 00:00:00 1465165300
## 8 2015-08-01 00:00:00 2910478200
## 9 2015-09-01 00:00:00 708236200
## 10 2015-10-01 00:00:00 733080000
## # ℹ 50 more rows
customer_idの値ごとに、キャンセル済み、かつcanceled_atがcheckin_dateの7日以内、というデータが1件以上存在するかどうかを集計
reservation |>
summarize(
is_canceled_within_7days_to_checkin = any(
status == "canceled" & as.numeric(checkin_date - canceled_at) <= 7
),
.by = customer_id
)
## # A tibble: 417,368 × 2
## customer_id is_canceled_within_7days_to_checkin
## <int> <lgl>
## 1 53431 FALSE
## 2 488390 FALSE
## 3 341335 FALSE
## 4 398981 FALSE
## 5 220381 FALSE
## 6 1494 FALSE
## 7 24104 FALSE
## 8 124883 FALSE
## 9 45282 FALSE
## 10 390595 FALSE
## # ℹ 417,358 more rows
reservation |>
summarise(
sales = sum(ifelse(status != "canceled", total_price, 0)),
cancel_rate = sum(ifelse(status == "canceled", 1, 0)) / n(),
.by = customer_id
)
## # A tibble: 417,368 × 3
## customer_id sales cancel_rate
## <int> <dbl> <dbl>
## 1 53431 227500 0
## 2 488390 190600 0.125
## 3 341335 146500 0.125
## 4 398981 436300 0
## 5 220381 500500 0.111
## 6 1494 55200 0
## 7 24104 274400 0
## 8 124883 482200 0
## 9 45282 101700 0
## 10 390595 146800 0
## # ℹ 417,358 more rows
以上です。