## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
library(arrow)
# ファイルの読み込み
hotel <- read_parquet("https://github.com/ghmagazine/awesomebook_v2/raw/main/data/hotel.parquet")
customer <- read_parquet("https://github.com/ghmagazine/awesomebook_v2/raw/main/data/customer.parquet")
reservation <- read_parquet("https://github.com/ghmagazine/awesomebook_v2/raw/main/data/reservation.parquet")
campaign <- read_parquet("https://github.com/ghmagazine/awesomebook_v2/raw/main/data/campaign.parquet")
# (1) reservationから必要なデータを抽出
result <- reservation |>
# (1-1)キャンセルを除外
# (1-2)2019年のデータを抽出
dplyr::filter(year(checkout_date) == 2019, status != "canceled") |>
mutate(
# (1-3) checkout_dateから年と月を抽出
checkout_year = year(checkout_date),
checkout_month = month(checkout_date),
# (1-4) campaignと結合するために予約日時を丸めて予約日の列を作成
reserve_date = floor_date(as_date(reserved_at), "day")
) |>
# (2) customerと結合
left_join(
customer |>
# (2-1) 年齢を10刻みでカテゴリ化、60以上は一つのカテゴリにまとめる
mutate(age_cat = cut(
age,
breaks = c(seq(0, 50, by = 10), Inf),
labels = c("0-10", "10-20", "20-30", "30-40", "40-50", "60+"),
right = FALSE
)) |>
select(customer_id, sex, age_cat),
by = "customer_id"
) |>
# (3) campaignと結合
left_join(
campaign,
join_by(between(reserved_at, starts_at, ends_at))
) |>
# (4) ディメンションをキーとしてreservationを集約
summarise(
sales = sum(total_price),
reservation_cnt = n(),
length_of_stay_avg = mean(length_of_stay),
people_num_avg = mean(people_num),
.by = c(checkout_year, checkout_month, hotel_id, sex, age_cat, campaign_name)
) |>
# (5) 集約データにhotelを結合
left_join(
hotel |>
# (5)-1 unit_priceをカテゴリ化
mutate(
unit_price_range = cut(
unit_price,
breaks = c(0, 5000, 10000, 20000, 30000, Inf),
labels = c("0", "5000", "10000", "20000", "30000"),
right = FALSE
)) |>
select(hotel_id, hotel_name, hotel_type, address_prefecture, address_town, unit_price_range),
by = "hotel_id"
)
result
## # A tibble: 311,198 × 15
## checkout_year checkout_month hotel_id sex age_cat campaign_name sales
## <dbl> <dbl> <int> <chr> <fct> <chr> <int>
## 1 2019 1 2776 <NA> 60+ 冬休みキャンペーン 78000
## 2 2019 1 1908 NA 30-40 冬休みキャンペーン 12900
## 3 2019 1 2093 NA 60+ 冬休みキャンペーン 151200
## 4 2019 1 2446 F 20-30 冬休みキャンペーン 16900
## 5 2019 1 4780 F 60+ 冬休みキャンペーン 44400
## 6 2019 1 2419 F 60+ 初売りキャンペーン 25000
## 7 2019 1 4781 M 30-40 初売りキャンペーン 31800
## 8 2019 1 2127 M 60+ 初売りキャンペーン 10200
## 9 2019 1 4902 M 20-30 初売りキャンペーン 31200
## 10 2019 1 4335 F 60+ 初売りキャンペーン 6400
## # ℹ 311,188 more rows
## # ℹ 8 more variables: reservation_cnt <int>, length_of_stay_avg <dbl>,
## # people_num_avg <dbl>, hotel_name <chr>, hotel_type <chr>,
## # address_prefecture <chr>, address_town <chr>, unit_price_range <fct>
出来上がったキューブの使い方
result |>
select(checkout_month, age_cat, sales) |> # 必要な列のみを選択
summarise(sales = mean(sales), .by = c(checkout_month, age_cat)) |> #checkout_month, age_cat毎の平均
pivot_wider(names_from = age_cat, values_from = sales)
## # A tibble: 12 × 6
## checkout_month `60+` `30-40` `20-30` `40-50` `10-20`
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 44948. 41334. 41916. 40495. 39647.
## 2 2 44446. 41175. 41502. 41261. 41504.
## 3 3 48940. 42031. 42166. 41965. 42193.
## 4 4 44778. 41260. 41890. 40497. 42794.
## 5 5 53543. 42626. 42113. 42703. 40791.
## 6 6 43861. 40344. 41628. 40117. 42068.
## 7 7 49061. 42640. 40951. 42215. 41224.
## 8 8 58147. 43628. 43715. 43696. 39595.
## 9 9 44930. 40656. 41096. 39832. 40434.
## 10 10 45071. 40393. 41535. 40932. 41820.
## 11 11 44269. 41269. 40547. 41792. 40540.
## 12 12 49323. 41330. 42283. 41971. 40298.
library(Matrix)
# 顧客・ホテルごとの予約数を集計した縦持ちデータを作成
long_df <- reservation |>
summarise(cnt = n(), .by = c(customer_id, hotel_id)) |>
mutate(
customer_id = as.factor(customer_id),
hotel_id = as.factor(hotel_id)
)
# 行と列のラベルを作成
row_label <- levels(long_df$customer_id)
col_label <- levels(long_df$hotel_id)
# スパースマトリックス変換
sparse_matrix <- sparseMatrix(
i = as.integer(long_df$customer_id),
j = as.integer(long_df$hotel_id),
x = long_df$cnt,
dims = c(length(row_label), length(col_label))
)
以上です。