## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
hotel <- read_parquet("https://github.com/ghmagazine/awesomebook_v2/raw/main/data/hotel.parquet")
customer <- read_parquet("https://github.com/ghmagazine/awesomebook_v2/raw/main/data/customer.parquet")
reservation <- read_parquet("https://github.com/ghmagazine/awesomebook_v2/raw/main/data/reservation.parquet")
不要な列を指定して削除
reservation |>
select(!c(reserved_at, length_of_stay, total_price, people_num, status, canceled_at))
## # A tibble: 2,000,000 × 5
## reservation_id hotel_id customer_id checkin_date checkout_date
## <int> <int> <int> <dttm> <dttm>
## 1 1 2460 53431 2014-12-31 00:00:00 2015-01-03 00:00:00
## 2 2 962 488390 2014-12-31 00:00:00 2015-01-02 00:00:00
## 3 3 558 341335 2014-12-31 00:00:00 2015-01-01 00:00:00
## 4 4 3666 398981 2014-12-31 00:00:00 2015-01-01 00:00:00
## 5 5 2180 220381 2014-12-31 00:00:00 2015-01-01 00:00:00
## 6 6 974 1494 2015-01-01 00:00:00 2015-01-03 00:00:00
## 7 7 2260 24104 2014-12-30 00:00:00 2015-01-01 00:00:00
## 8 8 314 124883 2014-12-31 00:00:00 2015-01-01 00:00:00
## 9 9 2211 45282 2014-12-31 00:00:00 2015-01-01 00:00:00
## 10 10 333 390595 2014-12-30 00:00:00 2015-01-02 00:00:00
## # ℹ 1,999,990 more rows
インデックスで選択
## # A tibble: 2,000,000 × 5
## reservation_id hotel_id customer_id checkin_date checkout_date
## <int> <int> <int> <dttm> <dttm>
## 1 1 2460 53431 2014-12-31 00:00:00 2015-01-03 00:00:00
## 2 2 962 488390 2014-12-31 00:00:00 2015-01-02 00:00:00
## 3 3 558 341335 2014-12-31 00:00:00 2015-01-01 00:00:00
## 4 4 3666 398981 2014-12-31 00:00:00 2015-01-01 00:00:00
## 5 5 2180 220381 2014-12-31 00:00:00 2015-01-01 00:00:00
## 6 6 974 1494 2015-01-01 00:00:00 2015-01-03 00:00:00
## 7 7 2260 24104 2014-12-30 00:00:00 2015-01-01 00:00:00
## 8 8 314 124883 2014-12-31 00:00:00 2015-01-01 00:00:00
## 9 9 2211 45282 2014-12-31 00:00:00 2015-01-01 00:00:00
## 10 10 333 390595 2014-12-30 00:00:00 2015-01-02 00:00:00
## # ℹ 1,999,990 more rows
列名を指定して列を選択
## # A tibble: 2,000,000 × 5
## reservation_id hotel_id customer_id checkin_date checkout_date
## <int> <int> <int> <dttm> <dttm>
## 1 1 2460 53431 2014-12-31 00:00:00 2015-01-03 00:00:00
## 2 2 962 488390 2014-12-31 00:00:00 2015-01-02 00:00:00
## 3 3 558 341335 2014-12-31 00:00:00 2015-01-01 00:00:00
## 4 4 3666 398981 2014-12-31 00:00:00 2015-01-01 00:00:00
## 5 5 2180 220381 2014-12-31 00:00:00 2015-01-01 00:00:00
## 6 6 974 1494 2015-01-01 00:00:00 2015-01-03 00:00:00
## 7 7 2260 24104 2014-12-30 00:00:00 2015-01-01 00:00:00
## 8 8 314 124883 2014-12-31 00:00:00 2015-01-01 00:00:00
## 9 9 2211 45282 2014-12-31 00:00:00 2015-01-01 00:00:00
## 10 10 333 390595 2014-12-30 00:00:00 2015-01-02 00:00:00
## # ℹ 1,999,990 more rows
列名を列挙して抽出
hotel |>
select(
tag_001, tag_002, tag_003, tag_004, tag_005, tag_006, tag_007, tag_008,
tag_009, tag_010, tag_011, tag_012, tag_013, tag_014, tag_015, tag_016,
tag_017, tag_018, tag_019, tag_020, tag_021, tag_022, tag_023, tag_024,
tag_025, tag_026, tag_027, tag_028, tag_029, tag_030
)
## # A tibble: 5,000 × 30
## tag_001 tag_002 tag_003 tag_004 tag_005 tag_006 tag_007 tag_008 tag_009
## <int> <int> <int> <int> <int> <int> <int> <int> <int>
## 1 0 0 0 0 0 0 0 0 0
## 2 0 0 0 NA 0 0 0 0 0
## 3 NA 1 0 0 0 0 0 0 0
## 4 0 0 0 1 0 0 0 0 0
## 5 0 0 0 1 0 NA 0 0 0
## 6 0 0 0 0 1 0 0 0 0
## 7 0 0 0 0 0 0 0 0 0
## 8 0 0 0 0 1 0 0 NA 0
## 9 NA NA 0 0 0 NA 0 0 0
## 10 0 1 1 0 NA 0 0 0 0
## # ℹ 4,990 more rows
## # ℹ 21 more variables: tag_010 <int>, tag_011 <int>, tag_012 <int>,
## # tag_013 <int>, tag_014 <int>, tag_015 <int>, tag_016 <int>, tag_017 <int>,
## # tag_018 <int>, tag_019 <int>, tag_020 <int>, tag_021 <int>, tag_022 <int>,
## # tag_023 <int>, tag_024 <int>, tag_025 <int>, tag_026 <int>, tag_027 <int>,
## # tag_028 <int>, tag_029 <int>, tag_030 <int>
列の開始/終了の部分一致ルールを指定して選択
## # A tibble: 5,000 × 30
## tag_001 tag_002 tag_003 tag_004 tag_005 tag_006 tag_007 tag_008 tag_009
## <int> <int> <int> <int> <int> <int> <int> <int> <int>
## 1 0 0 0 0 0 0 0 0 0
## 2 0 0 0 NA 0 0 0 0 0
## 3 NA 1 0 0 0 0 0 0 0
## 4 0 0 0 1 0 0 0 0 0
## 5 0 0 0 1 0 NA 0 0 0
## 6 0 0 0 0 1 0 0 0 0
## 7 0 0 0 0 0 0 0 0 0
## 8 0 0 0 0 1 0 0 NA 0
## 9 NA NA 0 0 0 NA 0 0 0
## 10 0 1 1 0 NA 0 0 0 0
## # ℹ 4,990 more rows
## # ℹ 21 more variables: tag_010 <int>, tag_011 <int>, tag_012 <int>,
## # tag_013 <int>, tag_014 <int>, tag_015 <int>, tag_016 <int>, tag_017 <int>,
## # tag_018 <int>, tag_019 <int>, tag_020 <int>, tag_021 <int>, tag_022 <int>,
## # tag_023 <int>, tag_024 <int>, tag_025 <int>, tag_026 <int>, tag_027 <int>,
## # tag_028 <int>, tag_029 <int>, tag_030 <int>
## customer_id name age sex
## 0 0 0 50286
## address_prefecture address_city address_town address_zipcode
## 0 0 7320 0
## # A tibble: 500,000 × 2
## sex address_town
## <chr> <chr>
## 1 <NA> 鷺山清洲町
## 2 M 地黄
## 3 M 本庄町袋
## 4 M 豊川町高堂太
## 5 F 津門西口町
## 6 F 佐野
## 7 F 日浦
## 8 F 南部青葉町
## 9 <NA> 上高野石田町
## 10 F 下糟屋
## # ℹ 499,990 more rows
データ型を確認してから列名を列挙して抽出
## Rows: 2,000,000
## Columns: 11
## $ reservation_id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, …
## $ hotel_id <int> 2460, 962, 558, 3666, 2180, 974, 2260, 314, 2211, 333, …
## $ customer_id <int> 53431, 488390, 341335, 398981, 220381, 1494, 24104, 124…
## $ reserved_at <dttm> 2013-12-31 07:00:14, 2013-12-31 08:23:35, 2013-12-31 0…
## $ checkin_date <dttm> 2014-12-31, 2014-12-31, 2014-12-31, 2014-12-31, 2014-1…
## $ checkout_date <dttm> 2015-01-03, 2015-01-02, 2015-01-01, 2015-01-01, 2015-0…
## $ length_of_stay <int> 3, 2, 1, 1, 1, 2, 2, 1, 1, 3, 1, 3, 3, 1, 2, 1, 1, 1, 1…
## $ people_num <int> 2, 3, 2, 4, 3, 3, 3, 4, 1, 2, 4, 4, 4, 1, 2, 4, 3, 4, 2…
## $ total_price <int> 37800, 42000, 20400, 39600, 16500, 55200, 104400, 70000…
## $ status <chr> "reserved", "reserved", "reserved", "reserved", "reserv…
## $ canceled_at <dttm> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 2014-05-12…
reservation |>
select(reservation_id, hotel_id, customer_id, length_of_stay, people_num, total_price)
## # A tibble: 2,000,000 × 6
## reservation_id hotel_id customer_id length_of_stay people_num total_price
## <int> <int> <int> <int> <int> <int>
## 1 1 2460 53431 3 2 37800
## 2 2 962 488390 2 3 42000
## 3 3 558 341335 1 2 20400
## 4 4 3666 398981 1 4 39600
## 5 5 2180 220381 1 3 16500
## 6 6 974 1494 2 3 55200
## 7 7 2260 24104 2 3 104400
## 8 8 314 124883 1 4 70000
## 9 9 2211 45282 1 1 6200
## 10 10 333 390595 3 2 33600
## # ℹ 1,999,990 more rows
select(where())
で数値型の列をすべて選択
## # A tibble: 2,000,000 × 6
## reservation_id hotel_id customer_id length_of_stay people_num total_price
## <int> <int> <int> <int> <int> <int>
## 1 1 2460 53431 3 2 37800
## 2 2 962 488390 2 3 42000
## 3 3 558 341335 1 2 20400
## 4 4 3666 398981 1 4 39600
## 5 5 2180 220381 1 3 16500
## 6 6 974 1494 2 3 55200
## 7 7 2260 24104 2 3 104400
## 8 8 314 124883 1 4 70000
## 9 9 2211 45282 1 1 6200
## 10 10 333 390595 3 2 33600
## # ℹ 1,999,990 more rows
select(where())
で文字列型の列をすべて選択
## # A tibble: 2,000,000 × 1
## status
## <chr>
## 1 reserved
## 2 reserved
## 3 reserved
## 4 reserved
## 5 reserved
## 6 reserved
## 7 reserved
## 8 reserved
## 9 reserved
## 10 reserved
## # ℹ 1,999,990 more rows
select(where())
で日時型の列をすべて選択
## # A tibble: 2,000,000 × 4
## reserved_at checkin_date checkout_date
## <dttm> <dttm> <dttm>
## 1 2013-12-31 07:00:14 2014-12-31 00:00:00 2015-01-03 00:00:00
## 2 2013-12-31 08:23:35 2014-12-31 00:00:00 2015-01-02 00:00:00
## 3 2013-12-31 09:02:05 2014-12-31 00:00:00 2015-01-01 00:00:00
## 4 2013-12-31 23:44:54 2014-12-31 00:00:00 2015-01-01 00:00:00
## 5 2014-01-01 02:47:50 2014-12-31 00:00:00 2015-01-01 00:00:00
## 6 2014-01-01 07:56:58 2015-01-01 00:00:00 2015-01-03 00:00:00
## 7 2014-01-01 13:17:06 2014-12-30 00:00:00 2015-01-01 00:00:00
## 8 2014-01-01 14:22:01 2014-12-31 00:00:00 2015-01-01 00:00:00
## 9 2014-01-01 14:59:04 2014-12-31 00:00:00 2015-01-01 00:00:00
## 10 2014-01-01 20:24:34 2014-12-30 00:00:00 2015-01-02 00:00:00
## # ℹ 1,999,990 more rows
## # ℹ 1 more variable: canceled_at <dttm>
people_numが2以上、かつ4以下の行を抽出
## # A tibble: 1,199,977 × 11
## reservation_id hotel_id customer_id reserved_at checkin_date
## <int> <int> <int> <dttm> <dttm>
## 1 1 2460 53431 2013-12-31 07:00:14 2014-12-31 00:00:00
## 2 2 962 488390 2013-12-31 08:23:35 2014-12-31 00:00:00
## 3 3 558 341335 2013-12-31 09:02:05 2014-12-31 00:00:00
## 4 4 3666 398981 2013-12-31 23:44:54 2014-12-31 00:00:00
## 5 5 2180 220381 2014-01-01 02:47:50 2014-12-31 00:00:00
## 6 6 974 1494 2014-01-01 07:56:58 2015-01-01 00:00:00
## 7 7 2260 24104 2014-01-01 13:17:06 2014-12-30 00:00:00
## 8 8 314 124883 2014-01-01 14:22:01 2014-12-31 00:00:00
## 9 10 333 390595 2014-01-01 20:24:34 2014-12-30 00:00:00
## 10 11 3275 382716 2014-01-02 00:34:44 2015-01-02 00:00:00
## # ℹ 1,199,967 more rows
## # ℹ 6 more variables: checkout_date <dttm>, length_of_stay <int>,
## # people_num <int>, total_price <int>, status <chr>, canceled_at <dttm>
people_numが2から4の範囲の行を抽出
## # A tibble: 1,199,977 × 11
## reservation_id hotel_id customer_id reserved_at checkin_date
## <int> <int> <int> <dttm> <dttm>
## 1 1 2460 53431 2013-12-31 07:00:14 2014-12-31 00:00:00
## 2 2 962 488390 2013-12-31 08:23:35 2014-12-31 00:00:00
## 3 3 558 341335 2013-12-31 09:02:05 2014-12-31 00:00:00
## 4 4 3666 398981 2013-12-31 23:44:54 2014-12-31 00:00:00
## 5 5 2180 220381 2014-01-01 02:47:50 2014-12-31 00:00:00
## 6 6 974 1494 2014-01-01 07:56:58 2015-01-01 00:00:00
## 7 7 2260 24104 2014-01-01 13:17:06 2014-12-30 00:00:00
## 8 8 314 124883 2014-01-01 14:22:01 2014-12-31 00:00:00
## 9 10 333 390595 2014-01-01 20:24:34 2014-12-30 00:00:00
## 10 11 3275 382716 2014-01-02 00:34:44 2015-01-02 00:00:00
## # ℹ 1,199,967 more rows
## # ℹ 6 more variables: checkout_date <dttm>, length_of_stay <int>,
## # people_num <int>, total_price <int>, status <chr>, canceled_at <dttm>
先頭の20,000行を抽出
## # A tibble: 20,000 × 11
## reservation_id hotel_id customer_id reserved_at checkin_date
## <int> <int> <int> <dttm> <dttm>
## 1 1 2460 53431 2013-12-31 07:00:14 2014-12-31 00:00:00
## 2 2 962 488390 2013-12-31 08:23:35 2014-12-31 00:00:00
## 3 3 558 341335 2013-12-31 09:02:05 2014-12-31 00:00:00
## 4 4 3666 398981 2013-12-31 23:44:54 2014-12-31 00:00:00
## 5 5 2180 220381 2014-01-01 02:47:50 2014-12-31 00:00:00
## 6 6 974 1494 2014-01-01 07:56:58 2015-01-01 00:00:00
## 7 7 2260 24104 2014-01-01 13:17:06 2014-12-30 00:00:00
## 8 8 314 124883 2014-01-01 14:22:01 2014-12-31 00:00:00
## 9 9 2211 45282 2014-01-01 14:59:04 2014-12-31 00:00:00
## 10 10 333 390595 2014-01-01 20:24:34 2014-12-30 00:00:00
## # ℹ 19,990 more rows
## # ℹ 6 more variables: checkout_date <dttm>, length_of_stay <int>,
## # people_num <int>, total_price <int>, status <chr>, canceled_at <dttm>
20,000件をランダムサンプリング
## # A tibble: 20,000 × 11
## reservation_id hotel_id customer_id reserved_at checkin_date
## <int> <int> <int> <dttm> <dttm>
## 1 525147 1710 295732 2015-10-28 16:53:45 2016-05-09 00:00:00
## 2 1864595 223 314463 2019-03-08 05:25:48 2019-03-21 00:00:00
## 3 48565 3175 420805 2014-07-15 04:25:33 2015-05-27 00:00:00
## 4 1059854 1268 93822 2017-02-28 17:54:21 2017-03-12 00:00:00
## 5 1628222 1980 95507 2018-08-01 00:07:08 2019-05-24 00:00:00
## 6 227525 4280 428977 2015-01-30 03:07:57 2015-03-30 00:00:00
## 7 1965791 3328 265529 2019-07-23 02:33:56 2019-08-11 00:00:00
## 8 471552 2327 247763 2015-09-09 20:32:43 2015-12-07 00:00:00
## 9 720919 2712 178946 2016-04-25 02:42:59 2016-12-18 00:00:00
## 10 1856335 2257 421505 2019-02-27 16:09:39 2019-08-23 00:00:00
## # ℹ 19,990 more rows
## # ℹ 6 more variables: checkout_date <dttm>, length_of_stay <int>,
## # people_num <int>, total_price <int>, status <chr>, canceled_at <dttm>
minority <- reservation |>
dplyr::filter(status == "canceled")
reservation |>
dplyr::filter(status == "reserved") |>
slice_sample(n = nrow(minority)) |>
bind_rows(minority)
## # A tibble: 400,822 × 11
## reservation_id hotel_id customer_id reserved_at checkin_date
## <int> <int> <int> <dttm> <dttm>
## 1 1550939 3746 225041 2018-05-22 23:38:49 2019-02-20 00:00:00
## 2 68299 3071 151894 2014-08-16 04:56:25 2015-07-16 00:00:00
## 3 869318 4812 255563 2016-09-07 22:33:09 2017-03-10 00:00:00
## 4 859976 279 474745 2016-08-30 09:02:52 2016-12-20 00:00:00
## 5 1308751 3408 323038 2017-10-13 13:22:47 2018-07-16 00:00:00
## 6 520935 1630 493297 2015-10-24 18:52:45 2016-07-20 00:00:00
## 7 715471 1131 446504 2016-04-20 04:54:17 2016-05-31 00:00:00
## 8 102664 2063 289468 2014-09-27 12:15:13 2015-07-27 00:00:00
## 9 1625029 3964 437214 2018-07-29 02:59:45 2018-08-12 00:00:00
## 10 497901 4714 78932 2015-10-03 21:07:35 2016-04-03 00:00:00
## # ℹ 400,812 more rows
## # ℹ 6 more variables: checkout_date <dttm>, length_of_stay <int>,
## # people_num <int>, total_price <int>, status <chr>, canceled_at <dttm>
コラムは省略。
以上です。
topへ