library(conflicted)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2
library(arrow)

# ファイルの読み込み
reservation <- read_parquet("https://github.com/ghmagazine/awesomebook_v2/raw/main/data/reservation.parquet")
holiday     <- read_parquet("https://github.com/ghmagazine/awesomebook_v2/raw/main/data/holiday.parquet")

日時の処理はlubridateパッケージの出番。tidyverseに含まれています。


12-1 日時型への変換

Q: 文字列の日時を日時型に変換

tz-naiveなtimestampはRでは使えないみたい。

(clockパッケージでいけそう。)

https://x.com/eitsupi/status/1799091701581275203

Awesome

force_tz()with_tz()の使い分けに気を付けましょう。

timezoneだけを変換する場合はforce_tzで時刻も変換する場合はwith_tz

tibble(datetime_str = "2023-04-01 10:11:30") |> #文字列
  mutate(
    # (1)日時型(POSIXct型)へ変換。タイムゾーンはUTC。
    timestamp_naive = datetime_str |> ymd_hms(),
    # (2)日付のみに変換
    date = datetime_str |> date(),
    # (3)timeのみ
    time = datetime_str |> ymd_hms() |> format("%H:%M:%S") |> hms(),
    # (4)TZのみをJSTへ変換
    timestamp_jst = datetime_str |> ymd_hms() |> force_tz(tzone = "Asia/Tokyo"),
    # (5)TZを変換し時刻も合わせる (PST)
    timestamp_pst = datetime_str |> ymd_hms() |> with_tz(tzone = "America/Los_Angeles")
  )
## # A tibble: 1 × 6
##   datetime_str    timestamp_naive     date       time        timestamp_jst      
##   <chr>           <dttm>              <date>     <Period>    <dttm>             
## 1 2023-04-01 10:… 2023-04-01 10:11:30 2023-04-01 10H 11M 30S 2023-04-01 10:11:30
## # ℹ 1 more variable: timestamp_pst <dttm>

12-2 日時型の列を用いた行の抽出

Q: 指定期間に予約された予約履歴の抽出

Not Awesome

reserved_atは時刻まで含むデータなので、日付だけで指定すると2019-08-312019-08-31 00:00:00扱いになるため、2019-08-31のほとんどのデータが取得できない。

reservation |>
  dplyr::filter(between(reserved_at, ymd("2019-07-01"), ymd("2019-08-31")))
## # A tibble: 28,028 × 11
##    reservation_id hotel_id customer_id reserved_at         checkin_date       
##             <int>    <int>       <int> <dttm>              <dttm>             
##  1        1953282     4721       56333 2019-07-01 00:00:36 2019-10-11 00:00:00
##  2        1953283     4766      146511 2019-07-01 00:04:32 2019-12-23 00:00:00
##  3        1953284     3448      471762 2019-07-01 00:05:28 2019-11-04 00:00:00
##  4        1953285     4221      288717 2019-07-01 00:05:40 2019-07-12 00:00:00
##  5        1953286     4918      247560 2019-07-01 00:05:51 2019-08-22 00:00:00
##  6        1953287     2296       39178 2019-07-01 00:06:37 2019-08-09 00:00:00
##  7        1953288     2791      309925 2019-07-01 00:06:57 2019-08-30 00:00:00
##  8        1953289     3003      424273 2019-07-01 00:10:57 2019-08-13 00:00:00
##  9        1953290     1324      337280 2019-07-01 00:19:40 2019-10-06 00:00:00
## 10        1953291      788      497384 2019-07-01 00:20:52 2019-10-13 00:00:00
## # ℹ 28,018 more rows
## # ℹ 6 more variables: checkout_date <dttm>, length_of_stay <int>,
## #   people_num <int>, total_price <int>, status <chr>, canceled_at <dttm>

Awesome

reserved_atを日付のみのデータに変換することで解決。

test3 <- reservation |>
  dplyr::filter(between(date(reserved_at), ymd("2019-07-01"), ymd("2019-08-31")))

12-3 日時要素の抽出

Q: 予約日時の日時要素を抽出

Not Awesome

年月を抽出するのにformat()を使うと処理が遅い。

sprintf() の方が速い。

reservation |>
  select(reserved_at) |>
  mutate(
    # (1) 年を抽出
    reserved_at_year = year(reserved_at),
    # (2) 月を抽出
    reserved_at_month = month(reserved_at),
    # (3) 日を抽出
    reserved_at_day = day(reserved_at),
    # (4) 曜日を抽出 (週の最初を月曜日とした1〜7の数字)
    reserved_at_dayofweek = wday(reserved_at),
    # (5) 週を抽出 (ISO 8601形式の週番号)
    reserved_at_isoweek = isoweek(reserved_at),
    # (6) 時を抽出
    reserved_at_hour = hour(reserved_at),
    # (7) 分を抽出
    reserved_at_minute = minute(reserved_at),
    # (8) 秒を抽出
    reserved_at_second = second(reserved_at),
    # (9) 年月を抽出
    reserved_at_ym = format(reserved_at, "%Y-%m")
  )
## # A tibble: 2,000,000 × 10
##    reserved_at         reserved_at_year reserved_at_month reserved_at_day
##    <dttm>                         <dbl>             <dbl>           <int>
##  1 2013-12-31 07:00:14             2013                12              31
##  2 2013-12-31 08:23:35             2013                12              31
##  3 2013-12-31 09:02:05             2013                12              31
##  4 2013-12-31 23:44:54             2013                12              31
##  5 2014-01-01 02:47:50             2014                 1               1
##  6 2014-01-01 07:56:58             2014                 1               1
##  7 2014-01-01 13:17:06             2014                 1               1
##  8 2014-01-01 14:22:01             2014                 1               1
##  9 2014-01-01 14:59:04             2014                 1               1
## 10 2014-01-01 20:24:34             2014                 1               1
## # ℹ 1,999,990 more rows
## # ℹ 6 more variables: reserved_at_dayofweek <dbl>, reserved_at_isoweek <dbl>,
## #   reserved_at_hour <int>, reserved_at_minute <int>, reserved_at_second <dbl>,
## #   reserved_at_ym <chr>

Awesome

reservation |>
  select(reserved_at) |>
  mutate(
    # (1) 年を抽出
    reserved_at_year = year(reserved_at),
    # (2) 月を抽出
    reserved_at_month = month(reserved_at),
    # (3) 日を抽出
    reserved_at_day = day(reserved_at),
    # (4) 曜日を抽出 (週の最初を月曜日とした0〜6の数字)
    reserved_at_dayofweek = wday(reserved_at, week_start = 1) - 1,
    # (5) 週を抽出 (ISO 8601形式の週番号)
    reserved_at_isoweek = isoweek(reserved_at),
    # (6) 時を抽出
    reserved_at_hour = hour(reserved_at),
    # (7) 分を抽出
    reserved_at_minute = minute(reserved_at),
    # (8) 秒を抽出
    reserved_at_second = second(reserved_at),
    # (9) 年月を抽出
    reserved_at_ym = sprintf("%04d-%02d", year(reserved_at), month(reserved_at))
  )
## # A tibble: 2,000,000 × 10
##    reserved_at         reserved_at_year reserved_at_month reserved_at_day
##    <dttm>                         <dbl>             <dbl>           <int>
##  1 2013-12-31 07:00:14             2013                12              31
##  2 2013-12-31 08:23:35             2013                12              31
##  3 2013-12-31 09:02:05             2013                12              31
##  4 2013-12-31 23:44:54             2013                12              31
##  5 2014-01-01 02:47:50             2014                 1               1
##  6 2014-01-01 07:56:58             2014                 1               1
##  7 2014-01-01 13:17:06             2014                 1               1
##  8 2014-01-01 14:22:01             2014                 1               1
##  9 2014-01-01 14:59:04             2014                 1               1
## 10 2014-01-01 20:24:34             2014                 1               1
## # ℹ 1,999,990 more rows
## # ℹ 6 more variables: reserved_at_dayofweek <dbl>, reserved_at_isoweek <dbl>,
## #   reserved_at_hour <int>, reserved_at_minute <int>, reserved_at_second <dbl>,
## #   reserved_at_ym <chr>

12-4 日時の丸め処理

Q: 予約日時の切り捨て

Awesome

reservation |>
  select(reserved_at) |>
  mutate(
    # (1)年に切り捨て
    reserved_at_year = floor_date(reserved_at, unit = "year"),
    # (2)四半期に切り捨て
    reserved_at_quarter = floor_date(reserved_at, unit = "quarter"),
    # (3)月に切り捨て
    reserved_at_month = floor_date(reserved_at, unit = "month"),
    # (4)日に切り捨て
    reserved_at_day = floor_date(reserved_at, unit = "day"),
    # (5)時に切り捨て
    reserved_at_hour = floor_date(reserved_at, unit = "hour"),
    # (6)分に切り捨て
    reserved_at_minute = floor_date(reserved_at, unit = "minute")
  )
## # A tibble: 2,000,000 × 7
##    reserved_at         reserved_at_year    reserved_at_quarter
##    <dttm>              <dttm>              <dttm>             
##  1 2013-12-31 07:00:14 2013-01-01 00:00:00 2013-10-01 00:00:00
##  2 2013-12-31 08:23:35 2013-01-01 00:00:00 2013-10-01 00:00:00
##  3 2013-12-31 09:02:05 2013-01-01 00:00:00 2013-10-01 00:00:00
##  4 2013-12-31 23:44:54 2013-01-01 00:00:00 2013-10-01 00:00:00
##  5 2014-01-01 02:47:50 2014-01-01 00:00:00 2014-01-01 00:00:00
##  6 2014-01-01 07:56:58 2014-01-01 00:00:00 2014-01-01 00:00:00
##  7 2014-01-01 13:17:06 2014-01-01 00:00:00 2014-01-01 00:00:00
##  8 2014-01-01 14:22:01 2014-01-01 00:00:00 2014-01-01 00:00:00
##  9 2014-01-01 14:59:04 2014-01-01 00:00:00 2014-01-01 00:00:00
## 10 2014-01-01 20:24:34 2014-01-01 00:00:00 2014-01-01 00:00:00
## # ℹ 1,999,990 more rows
## # ℹ 4 more variables: reserved_at_month <dttm>, reserved_at_day <dttm>,
## #   reserved_at_hour <dttm>, reserved_at_minute <dttm>

12-5 日時の加減算

Q: 予約日時に一定時間を加算

Awesome

reservation |>
  select(reserved_at) |>
  mutate(
    # (1) 30分を加算
    reserved_at_add30min = reserved_at + minutes(30),
    # (2) 1時間を加算
    reserved_at_add1h = reserved_at + hours(1),
    # (3) 1日を加算
    reserved_at_add1d = reserved_at + days(1),
    # (4) 1ヶ月を加算
    reserved_at_add1m = reserved_at + months(1),    
    # (5) 1年を加算
    reserved_at_add1y2 = reserved_at + years(1)
  )
## # A tibble: 2,000,000 × 6
##    reserved_at         reserved_at_add30min reserved_at_add1h  
##    <dttm>              <dttm>               <dttm>             
##  1 2013-12-31 07:00:14 2013-12-31 07:30:14  2013-12-31 08:00:14
##  2 2013-12-31 08:23:35 2013-12-31 08:53:35  2013-12-31 09:23:35
##  3 2013-12-31 09:02:05 2013-12-31 09:32:05  2013-12-31 10:02:05
##  4 2013-12-31 23:44:54 2014-01-01 00:14:54  2014-01-01 00:44:54
##  5 2014-01-01 02:47:50 2014-01-01 03:17:50  2014-01-01 03:47:50
##  6 2014-01-01 07:56:58 2014-01-01 08:26:58  2014-01-01 08:56:58
##  7 2014-01-01 13:17:06 2014-01-01 13:47:06  2014-01-01 14:17:06
##  8 2014-01-01 14:22:01 2014-01-01 14:52:01  2014-01-01 15:22:01
##  9 2014-01-01 14:59:04 2014-01-01 15:29:04  2014-01-01 15:59:04
## 10 2014-01-01 20:24:34 2014-01-01 20:54:34  2014-01-01 21:24:34
## # ℹ 1,999,990 more rows
## # ℹ 3 more variables: reserved_at_add1d <dttm>, reserved_at_add1m <dttm>,
## #   reserved_at_add1y2 <dttm>

Q: 予約日時とチェックイン日の時間差を計算

Awesome

reservation |>
  select(reserved_at, checkin_date) |>
  mutate(
    # 年数差を計算
    diff_years = interval(reserved_at, checkin_date) |>
      time_length("years") |>
      as.integer(),
    # 月数差を計算
    diff_months = interval(reserved_at, checkin_date) |>
      time_length("months") |>
      as.integer(),
    # 日数差を計算
    diff_days = as.integer(difftime(checkin_date, reserved_at, units = "days")),
    # 時間差を計算
    diff_hours = as.integer(difftime(checkin_date, reserved_at, units = "hours")),
    # 分差を計算
    diff_minutes = as.integer(difftime(checkin_date, reserved_at, units = "mins")),
    # 秒差を計算
    diff_seconds = as.integer(difftime(checkin_date, reserved_at, units = "secs"))
    )
## # A tibble: 2,000,000 × 8
##    reserved_at         checkin_date        diff_years diff_months diff_days
##    <dttm>              <dttm>                   <int>       <int>     <int>
##  1 2013-12-31 07:00:14 2014-12-31 00:00:00          0          11       364
##  2 2013-12-31 08:23:35 2014-12-31 00:00:00          0          11       364
##  3 2013-12-31 09:02:05 2014-12-31 00:00:00          0          11       364
##  4 2013-12-31 23:44:54 2014-12-31 00:00:00          0          11       364
##  5 2014-01-01 02:47:50 2014-12-31 00:00:00          0          11       363
##  6 2014-01-01 07:56:58 2015-01-01 00:00:00          0          11       364
##  7 2014-01-01 13:17:06 2014-12-30 00:00:00          0          11       362
##  8 2014-01-01 14:22:01 2014-12-31 00:00:00          0          11       363
##  9 2014-01-01 14:59:04 2014-12-31 00:00:00          0          11       363
## 10 2014-01-01 20:24:34 2014-12-30 00:00:00          0          11       362
## # ℹ 1,999,990 more rows
## # ℹ 3 more variables: diff_hours <int>, diff_minutes <int>, diff_seconds <int>

12-6 日時の数値化

Q: チェックイン日の休日フラグの付与

Awesome

# (1) 日付マスタの作成
date_master <-
  # (1)-1 連続した日付の生成 
  data.frame(dt = seq(as.Date("2014-01-01"), as.Date("2019-12-31"), by = "day")) |>
  # (1)-2 祝日マスタの結合
  left_join(holiday, by = c("dt" = "holiday_date")) |>
  mutate(
    # (1)-3 休日フラグ(祝日と土日)の作成
    is_day_off = if_else(!is.na(holiday_name) | wday(dt) %in% c(6, 7), 1, 0),
    # (1)-4 休日前日フラグを作成
    is_day_before_day_off = lead(is_day_off, 1)
    ) |>
  select(dt, is_day_off, is_day_before_day_off)

# (2) reservationと結合
reservation |>
  left_join(date_master, by = c("checkin_date" = "dt"))
## # A tibble: 2,000,000 × 13
##    reservation_id hotel_id customer_id reserved_at         checkin_date       
##             <int>    <int>       <int> <dttm>              <dttm>             
##  1              1     2460       53431 2013-12-31 07:00:14 2014-12-31 00:00:00
##  2              2      962      488390 2013-12-31 08:23:35 2014-12-31 00:00:00
##  3              3      558      341335 2013-12-31 09:02:05 2014-12-31 00:00:00
##  4              4     3666      398981 2013-12-31 23:44:54 2014-12-31 00:00:00
##  5              5     2180      220381 2014-01-01 02:47:50 2014-12-31 00:00:00
##  6              6      974        1494 2014-01-01 07:56:58 2015-01-01 00:00:00
##  7              7     2260       24104 2014-01-01 13:17:06 2014-12-30 00:00:00
##  8              8      314      124883 2014-01-01 14:22:01 2014-12-31 00:00:00
##  9              9     2211       45282 2014-01-01 14:59:04 2014-12-31 00:00:00
## 10             10      333      390595 2014-01-01 20:24:34 2014-12-30 00:00:00
## # ℹ 1,999,990 more rows
## # ℹ 8 more variables: checkout_date <dttm>, length_of_stay <int>,
## #   people_num <int>, total_price <int>, status <chr>, canceled_at <dttm>,
## #   is_day_off <dbl>, is_day_before_day_off <dbl>

以上です。

第13章 文字列

第11章 数値

Top