## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
library(arrow)
# ファイルの読み込み
reservation <- read_parquet("https://github.com/ghmagazine/awesomebook_v2/raw/main/data/reservation.parquet")
holiday <- read_parquet("https://github.com/ghmagazine/awesomebook_v2/raw/main/data/holiday.parquet")
日時の処理はlubridateパッケージの出番。tidyverseに含まれています。
tz-naiveなtimestampはRでは使えないみたい。
(clockパッケージでいけそう。)
force_tz()
とwith_tz()
の使い分けに気を付けましょう。
timezoneだけを変換する場合はforce_tzで時刻も変換する場合はwith_tz
tibble(datetime_str = "2023-04-01 10:11:30") |> #文字列
mutate(
# (1)日時型(POSIXct型)へ変換。タイムゾーンはUTC。
timestamp_naive = datetime_str |> ymd_hms(),
# (2)日付のみに変換
date = datetime_str |> date(),
# (3)timeのみ
time = datetime_str |> ymd_hms() |> format("%H:%M:%S") |> hms(),
# (4)TZのみをJSTへ変換
timestamp_jst = datetime_str |> ymd_hms() |> force_tz(tzone = "Asia/Tokyo"),
# (5)TZを変換し時刻も合わせる (PST)
timestamp_pst = datetime_str |> ymd_hms() |> with_tz(tzone = "America/Los_Angeles")
)
## # A tibble: 1 × 6
## datetime_str timestamp_naive date time timestamp_jst
## <chr> <dttm> <date> <Period> <dttm>
## 1 2023-04-01 10:… 2023-04-01 10:11:30 2023-04-01 10H 11M 30S 2023-04-01 10:11:30
## # ℹ 1 more variable: timestamp_pst <dttm>
reserved_atは時刻まで含むデータなので、日付だけで指定すると2019-08-31
は2019-08-31 00:00:00
扱いになるため、2019-08-31
のほとんどのデータが取得できない。
## # A tibble: 28,028 × 11
## reservation_id hotel_id customer_id reserved_at checkin_date
## <int> <int> <int> <dttm> <dttm>
## 1 1953282 4721 56333 2019-07-01 00:00:36 2019-10-11 00:00:00
## 2 1953283 4766 146511 2019-07-01 00:04:32 2019-12-23 00:00:00
## 3 1953284 3448 471762 2019-07-01 00:05:28 2019-11-04 00:00:00
## 4 1953285 4221 288717 2019-07-01 00:05:40 2019-07-12 00:00:00
## 5 1953286 4918 247560 2019-07-01 00:05:51 2019-08-22 00:00:00
## 6 1953287 2296 39178 2019-07-01 00:06:37 2019-08-09 00:00:00
## 7 1953288 2791 309925 2019-07-01 00:06:57 2019-08-30 00:00:00
## 8 1953289 3003 424273 2019-07-01 00:10:57 2019-08-13 00:00:00
## 9 1953290 1324 337280 2019-07-01 00:19:40 2019-10-06 00:00:00
## 10 1953291 788 497384 2019-07-01 00:20:52 2019-10-13 00:00:00
## # ℹ 28,018 more rows
## # ℹ 6 more variables: checkout_date <dttm>, length_of_stay <int>,
## # people_num <int>, total_price <int>, status <chr>, canceled_at <dttm>
年月を抽出するのにformat()
を使うと処理が遅い。
sprintf()
の方が速い。
reservation |>
select(reserved_at) |>
mutate(
# (1) 年を抽出
reserved_at_year = year(reserved_at),
# (2) 月を抽出
reserved_at_month = month(reserved_at),
# (3) 日を抽出
reserved_at_day = day(reserved_at),
# (4) 曜日を抽出 (週の最初を月曜日とした1〜7の数字)
reserved_at_dayofweek = wday(reserved_at),
# (5) 週を抽出 (ISO 8601形式の週番号)
reserved_at_isoweek = isoweek(reserved_at),
# (6) 時を抽出
reserved_at_hour = hour(reserved_at),
# (7) 分を抽出
reserved_at_minute = minute(reserved_at),
# (8) 秒を抽出
reserved_at_second = second(reserved_at),
# (9) 年月を抽出
reserved_at_ym = format(reserved_at, "%Y-%m")
)
## # A tibble: 2,000,000 × 10
## reserved_at reserved_at_year reserved_at_month reserved_at_day
## <dttm> <dbl> <dbl> <int>
## 1 2013-12-31 07:00:14 2013 12 31
## 2 2013-12-31 08:23:35 2013 12 31
## 3 2013-12-31 09:02:05 2013 12 31
## 4 2013-12-31 23:44:54 2013 12 31
## 5 2014-01-01 02:47:50 2014 1 1
## 6 2014-01-01 07:56:58 2014 1 1
## 7 2014-01-01 13:17:06 2014 1 1
## 8 2014-01-01 14:22:01 2014 1 1
## 9 2014-01-01 14:59:04 2014 1 1
## 10 2014-01-01 20:24:34 2014 1 1
## # ℹ 1,999,990 more rows
## # ℹ 6 more variables: reserved_at_dayofweek <dbl>, reserved_at_isoweek <dbl>,
## # reserved_at_hour <int>, reserved_at_minute <int>, reserved_at_second <dbl>,
## # reserved_at_ym <chr>
reservation |>
select(reserved_at) |>
mutate(
# (1) 年を抽出
reserved_at_year = year(reserved_at),
# (2) 月を抽出
reserved_at_month = month(reserved_at),
# (3) 日を抽出
reserved_at_day = day(reserved_at),
# (4) 曜日を抽出 (週の最初を月曜日とした0〜6の数字)
reserved_at_dayofweek = wday(reserved_at, week_start = 1) - 1,
# (5) 週を抽出 (ISO 8601形式の週番号)
reserved_at_isoweek = isoweek(reserved_at),
# (6) 時を抽出
reserved_at_hour = hour(reserved_at),
# (7) 分を抽出
reserved_at_minute = minute(reserved_at),
# (8) 秒を抽出
reserved_at_second = second(reserved_at),
# (9) 年月を抽出
reserved_at_ym = sprintf("%04d-%02d", year(reserved_at), month(reserved_at))
)
## # A tibble: 2,000,000 × 10
## reserved_at reserved_at_year reserved_at_month reserved_at_day
## <dttm> <dbl> <dbl> <int>
## 1 2013-12-31 07:00:14 2013 12 31
## 2 2013-12-31 08:23:35 2013 12 31
## 3 2013-12-31 09:02:05 2013 12 31
## 4 2013-12-31 23:44:54 2013 12 31
## 5 2014-01-01 02:47:50 2014 1 1
## 6 2014-01-01 07:56:58 2014 1 1
## 7 2014-01-01 13:17:06 2014 1 1
## 8 2014-01-01 14:22:01 2014 1 1
## 9 2014-01-01 14:59:04 2014 1 1
## 10 2014-01-01 20:24:34 2014 1 1
## # ℹ 1,999,990 more rows
## # ℹ 6 more variables: reserved_at_dayofweek <dbl>, reserved_at_isoweek <dbl>,
## # reserved_at_hour <int>, reserved_at_minute <int>, reserved_at_second <dbl>,
## # reserved_at_ym <chr>
reservation |>
select(reserved_at) |>
mutate(
# (1)年に切り捨て
reserved_at_year = floor_date(reserved_at, unit = "year"),
# (2)四半期に切り捨て
reserved_at_quarter = floor_date(reserved_at, unit = "quarter"),
# (3)月に切り捨て
reserved_at_month = floor_date(reserved_at, unit = "month"),
# (4)日に切り捨て
reserved_at_day = floor_date(reserved_at, unit = "day"),
# (5)時に切り捨て
reserved_at_hour = floor_date(reserved_at, unit = "hour"),
# (6)分に切り捨て
reserved_at_minute = floor_date(reserved_at, unit = "minute")
)
## # A tibble: 2,000,000 × 7
## reserved_at reserved_at_year reserved_at_quarter
## <dttm> <dttm> <dttm>
## 1 2013-12-31 07:00:14 2013-01-01 00:00:00 2013-10-01 00:00:00
## 2 2013-12-31 08:23:35 2013-01-01 00:00:00 2013-10-01 00:00:00
## 3 2013-12-31 09:02:05 2013-01-01 00:00:00 2013-10-01 00:00:00
## 4 2013-12-31 23:44:54 2013-01-01 00:00:00 2013-10-01 00:00:00
## 5 2014-01-01 02:47:50 2014-01-01 00:00:00 2014-01-01 00:00:00
## 6 2014-01-01 07:56:58 2014-01-01 00:00:00 2014-01-01 00:00:00
## 7 2014-01-01 13:17:06 2014-01-01 00:00:00 2014-01-01 00:00:00
## 8 2014-01-01 14:22:01 2014-01-01 00:00:00 2014-01-01 00:00:00
## 9 2014-01-01 14:59:04 2014-01-01 00:00:00 2014-01-01 00:00:00
## 10 2014-01-01 20:24:34 2014-01-01 00:00:00 2014-01-01 00:00:00
## # ℹ 1,999,990 more rows
## # ℹ 4 more variables: reserved_at_month <dttm>, reserved_at_day <dttm>,
## # reserved_at_hour <dttm>, reserved_at_minute <dttm>
reservation |>
select(reserved_at) |>
mutate(
# (1) 30分を加算
reserved_at_add30min = reserved_at + minutes(30),
# (2) 1時間を加算
reserved_at_add1h = reserved_at + hours(1),
# (3) 1日を加算
reserved_at_add1d = reserved_at + days(1),
# (4) 1ヶ月を加算
reserved_at_add1m = reserved_at + months(1),
# (5) 1年を加算
reserved_at_add1y2 = reserved_at + years(1)
)
## # A tibble: 2,000,000 × 6
## reserved_at reserved_at_add30min reserved_at_add1h
## <dttm> <dttm> <dttm>
## 1 2013-12-31 07:00:14 2013-12-31 07:30:14 2013-12-31 08:00:14
## 2 2013-12-31 08:23:35 2013-12-31 08:53:35 2013-12-31 09:23:35
## 3 2013-12-31 09:02:05 2013-12-31 09:32:05 2013-12-31 10:02:05
## 4 2013-12-31 23:44:54 2014-01-01 00:14:54 2014-01-01 00:44:54
## 5 2014-01-01 02:47:50 2014-01-01 03:17:50 2014-01-01 03:47:50
## 6 2014-01-01 07:56:58 2014-01-01 08:26:58 2014-01-01 08:56:58
## 7 2014-01-01 13:17:06 2014-01-01 13:47:06 2014-01-01 14:17:06
## 8 2014-01-01 14:22:01 2014-01-01 14:52:01 2014-01-01 15:22:01
## 9 2014-01-01 14:59:04 2014-01-01 15:29:04 2014-01-01 15:59:04
## 10 2014-01-01 20:24:34 2014-01-01 20:54:34 2014-01-01 21:24:34
## # ℹ 1,999,990 more rows
## # ℹ 3 more variables: reserved_at_add1d <dttm>, reserved_at_add1m <dttm>,
## # reserved_at_add1y2 <dttm>
reservation |>
select(reserved_at, checkin_date) |>
mutate(
# 年数差を計算
diff_years = interval(reserved_at, checkin_date) |>
time_length("years") |>
as.integer(),
# 月数差を計算
diff_months = interval(reserved_at, checkin_date) |>
time_length("months") |>
as.integer(),
# 日数差を計算
diff_days = as.integer(difftime(checkin_date, reserved_at, units = "days")),
# 時間差を計算
diff_hours = as.integer(difftime(checkin_date, reserved_at, units = "hours")),
# 分差を計算
diff_minutes = as.integer(difftime(checkin_date, reserved_at, units = "mins")),
# 秒差を計算
diff_seconds = as.integer(difftime(checkin_date, reserved_at, units = "secs"))
)
## # A tibble: 2,000,000 × 8
## reserved_at checkin_date diff_years diff_months diff_days
## <dttm> <dttm> <int> <int> <int>
## 1 2013-12-31 07:00:14 2014-12-31 00:00:00 0 11 364
## 2 2013-12-31 08:23:35 2014-12-31 00:00:00 0 11 364
## 3 2013-12-31 09:02:05 2014-12-31 00:00:00 0 11 364
## 4 2013-12-31 23:44:54 2014-12-31 00:00:00 0 11 364
## 5 2014-01-01 02:47:50 2014-12-31 00:00:00 0 11 363
## 6 2014-01-01 07:56:58 2015-01-01 00:00:00 0 11 364
## 7 2014-01-01 13:17:06 2014-12-30 00:00:00 0 11 362
## 8 2014-01-01 14:22:01 2014-12-31 00:00:00 0 11 363
## 9 2014-01-01 14:59:04 2014-12-31 00:00:00 0 11 363
## 10 2014-01-01 20:24:34 2014-12-30 00:00:00 0 11 362
## # ℹ 1,999,990 more rows
## # ℹ 3 more variables: diff_hours <int>, diff_minutes <int>, diff_seconds <int>
# (1) 日付マスタの作成
date_master <-
# (1)-1 連続した日付の生成
data.frame(dt = seq(as.Date("2014-01-01"), as.Date("2019-12-31"), by = "day")) |>
# (1)-2 祝日マスタの結合
left_join(holiday, by = c("dt" = "holiday_date")) |>
mutate(
# (1)-3 休日フラグ(祝日と土日)の作成
is_day_off = if_else(!is.na(holiday_name) | wday(dt) %in% c(6, 7), 1, 0),
# (1)-4 休日前日フラグを作成
is_day_before_day_off = lead(is_day_off, 1)
) |>
select(dt, is_day_off, is_day_before_day_off)
# (2) reservationと結合
reservation |>
left_join(date_master, by = c("checkin_date" = "dt"))
## # A tibble: 2,000,000 × 13
## reservation_id hotel_id customer_id reserved_at checkin_date
## <int> <int> <int> <dttm> <dttm>
## 1 1 2460 53431 2013-12-31 07:00:14 2014-12-31 00:00:00
## 2 2 962 488390 2013-12-31 08:23:35 2014-12-31 00:00:00
## 3 3 558 341335 2013-12-31 09:02:05 2014-12-31 00:00:00
## 4 4 3666 398981 2013-12-31 23:44:54 2014-12-31 00:00:00
## 5 5 2180 220381 2014-01-01 02:47:50 2014-12-31 00:00:00
## 6 6 974 1494 2014-01-01 07:56:58 2015-01-01 00:00:00
## 7 7 2260 24104 2014-01-01 13:17:06 2014-12-30 00:00:00
## 8 8 314 124883 2014-01-01 14:22:01 2014-12-31 00:00:00
## 9 9 2211 45282 2014-01-01 14:59:04 2014-12-31 00:00:00
## 10 10 333 390595 2014-01-01 20:24:34 2014-12-30 00:00:00
## # ℹ 1,999,990 more rows
## # ℹ 8 more variables: checkout_date <dttm>, length_of_stay <int>,
## # people_num <int>, total_price <int>, status <chr>, canceled_at <dttm>,
## # is_day_off <dbl>, is_day_before_day_off <dbl>
以上です。