class: title-slide .title[ # Introduction to R ] .author[ ### Eric Stemmler ] .date[ ### 07. Mai 2023 ] --- <style> .middle { margin: 0; position: absolute; top: 50%; left: 25%; -ms-transform: translate(-25%, -50%); transform: translate(-25%, -50%); } </style> .pull-left[ <img src="https://d1zx6djv3kb1v7.cloudfront.net/wp-content/media/2020/05/R-PROGRAMMING-LANGUAGE-i2tutorials.jpg" width="100%" /> ] .pull-right[ More reasons: https://www.i2tutorials.com/introduction-to-r-programming-language/ ] -- .huge[Why do .pink[anything] **twice**?] .large[.center[BAU: How important is data analysis/ cleaning/ reporting?]] .large[.right[Getting things **done**.]] .left[.green[Time to learn something new] vs.] .center[.red[How often will I have to do it ]vs.] .right[.warmyellow[How much time do I have???]] --- <img src="img/is_it_worth_the_time.png" width="70%" /> --- .center[ <img src="img/qr.svg" width="50%" /> .large[https://rcst.netlify.com/doc/presentations/intro-r/ ] ] --- ```r a <- 1:10 print(a) ``` ``` ## [1] 1 2 3 4 5 6 7 8 9 10 ``` ```r a+2 ``` ``` ## [1] 3 4 5 6 7 8 9 10 11 12 ``` ```r letters[1:10] ``` ``` ## [1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j" ``` ```r rep("Hello Tirana!", 2) ``` ``` ## [1] "Hello Tirana!" "Hello Tirana!" ``` --- ```r your_name <- "Marsed" paste("Hello", your_name, ". Today is: ", Sys.Date()) ``` ``` ## [1] "Hello Marsed . 
Today is: 2023-05-07" ``` -- ```r install.packages(c("data.table", "readxl")) ``` .center[.large[**CRAN**] [https://cran.r-project.org/](https://cran.r-project.org/) 19417 packages ] --- ```r library(data.table) library(readxl) dt <- data.table(x = seq(1:10000), y = letters[1:20], z = rnorm(10000)) dt ``` ``` ## x y z ## 1: 1 a -0.30214945 ## 2: 2 b -0.21876675 ## 3: 3 c -1.09054294 ## 4: 4 d 1.53777209 ## 5: 5 e -0.47530489 ## --- ## 9996: 9996 p 0.28469266 ## 9997: 9997 q 1.59164034 ## 9998: 9998 r 0.01069248 ## 9999: 9999 s -1.06108577 ## 10000: 10000 t 1.20911173 ``` --- .pull-left[ ```r dt[y == 'a', .(new = x+z)] ``` ``` ## new ## 1: 0.6978505 ## 2: 21.3784878 ## 3: 41.2287886 ## 4: 59.6815719 ## 5: 81.2678581 ## --- ## 496: 9903.5709863 ## 497: 9919.8059037 ## 498: 9942.0557327 ## 499: 9960.8002207 ## 500: 9980.6250024 ``` ] -- .pull-right[ ```r dt[, mean(x), by = y] ``` ``` ## y V1 ## 1: a 4991 ## 2: b 4992 ## 3: c 4993 ## 4: d 4994 ## 5: e 4995 ## 6: f 4996 ## 7: g 4997 ## 8: h 4998 ## 9: i 4999 ## 10: j 5000 ## 11: k 5001 ## 12: l 5002 ## 13: m 5003 ## 14: n 5004 ## 15: o 5005 ## 16: p 5006 ## 17: q 5007 ## 18: r 5008 ## 19: s 5009 ## 20: t 5010 ``` ] --- [occupancy.csv](https://rcst.netlify.com/doc/presentations/intro-r//csv/occupancy.csv) ```r dt <- fread("csv/occupancy.csv") ``` --- .small[ ```r summary(dt) ``` ``` ## Date Month Week Hour Hour Slot ## Length:8654 Length:8654 Min. :10.00 Length:8654 Length:8654 ## Class :character Class :character 1st Qu.:12.00 Class :character Class :character ## Mode :character Mode :character Median :15.00 Mode :character Mode :character ## Mean :14.67 ## 3rd Qu.:17.00 ## Max. 
:18.00 ## NA's :6991 ## AM/PM Station Line Bus Code Direction ## Length:8654 Length:8654 Length:8654 Length:8654 Length:8654 ## Class :character Class :character Class :character Class :character Class :character ## Mode :character Mode :character Mode :character Mode :character Mode :character ## ## ## ## ## Car Plate Occupancy Category ## Length:8654 Min. : 0.000 ## Class :character 1st Qu.: 2.000 ## Mode :character Median : 3.000 ## Mean : 2.996 ## 3rd Qu.: 4.000 ## Max. :21.000 ## ``` ] -- ```r colnames(dt) ``` ``` ## [1] "Date" "Month" "Week" "Hour" ## [5] "Hour Slot" "AM/PM" "Station" "Line" ## [9] "Bus Code" "Direction" "Car Plate" "Occupancy Category" ``` --- ```r # What is "Bus Code"? dt[, unique(`Bus Code`)] ``` ``` ## [1] "N" "F" "V" "" "DF" "F " "V " "N " "D" ``` [Cheat Sheet](https://raw.githubusercontent.com/rstudio/cheatsheets/master/datatable.pdf) ```r setnames(dt, c("Bus Code", "Occupancy Category"), c("type", "occ")) ``` --- .pull-left[ ```r dt[, mean(occ), by = Line] ``` ``` ## Line V1 ## 1: L1 2.427844 ## 2: L2 1.554404 ## 3: L3 2.033613 ## 4: L4 3.089457 ## 5: 5a 2.878049 ## 6: 5b 2.086207 ## 7: L6 2.133880 ## 8: L8 3.821862 ## 9: L9 3.198795 ## 10: L11 3.062385 ## 11: L12 3.611440 ## 12: L13 2.034707 ## 13: L15 4.172533 ## 14: L16 2.455685 ## 15: L5b 2.366947 ## 16: L5a 3.210291 ## 17: L5 2.500000 ## 18: L4 3.000000 ## 19: L6 2.000000 ## 20: L5/a 3.181818 ## 21: l5b 1.333333 ## 22: L8a 3.442177 ## 23: L8b 2.612903 ## 24: L8c 2.666667 ## Line V1 ``` ] -- .pull-right[ .large[Dear Lord! Someone messed up my dataset!] .large[Thank God we have .red[Regular Expressions]:] [Ll]?([[:digit:]]{1,2})\\/?([[:alpha:]]?)[[:space:]]\* ] --- .middle[ .huge[[Ll]?([[:digit:]]{1,2})\\/?([[:alpha:]]?)[[:space:]]\*] ] --- .middle[ .huge[.red[[Ll]?]([[:digit:]]{1,2})\\/?([[:alpha:]]?)[[:space:]]\*] .large[ * Letter "L" or "l" (optional) ]] --- .middle[ .huge[[Ll]?(.red[[[:digit:]]{1,2}])\\/?([[:alpha:]]?)[[:space:]]\*] .large[ * ... followed by 1 or 2 digits * ... 
1st group-capture ]] --- .middle[ .huge[[Ll]?([[:digit:]]{1,2}).red[\\/?]([[:alpha:]]?)[[:space:]]\*] .large[ * ... 1 slash character (optional) ]] --- .middle[ .huge[[Ll]?([[:digit:]]{1,2})\\/?(.red[[[:alpha:]]?])[[:space:]]\*] .large[ * ... 1 letter, small or capital (optional) * ... 2nd group-capture ]] --- .middle[ .huge[[Ll]?([[:digit:]]{1,2})\\/?([[:alpha:]]?).red[[[:space:]]\*]] .large[ * ... white space characters, zero or several ]] --- .pull-left[ ```r dt[, mean(occ), by = Line] ``` ``` ## Line V1 ## 1: L1 2.427844 ## 2: L2 1.554404 ## 3: L3 2.033613 ## 4: L4 3.089457 ## 5: 5a 2.878049 ## 6: 5b 2.086207 ## 7: L6 2.133880 ## 8: L8 3.821862 ## 9: L9 3.198795 ## 10: L11 3.062385 ## 11: L12 3.611440 ## 12: L13 2.034707 ## 13: L15 4.172533 ## 14: L16 2.455685 ## 15: L5b 2.366947 ## 16: L5a 3.210291 ## 17: L5 2.500000 ## 18: L4 3.000000 ## 19: L6 2.000000 ## 20: L5/a 3.181818 ## 21: l5b 1.333333 ## 22: L8a 3.442177 ## 23: L8b 2.612903 ## 24: L8c 2.666667 ## Line V1 ``` ] .pull-right[ .large[Oh dear Lord! Someone messed up my dataset!] 
.large[Thank God we have .red[Regular Expressions]:] [Ll]?([[:digit:]]{1,2})\\/?([[:alpha:]]?)[[:space:]]\* .small[ ```r dt[, Line := gsub(pattern = "[Ll]?([[:digit:]]{1,2})\\/?([[:alpha:]]?)[[:space:]]*", replacement = "\\1\\2", x = Line)] dt[, mean(occ), by = Line] ``` ``` ## Line V1 ## 1: 1 2.427844 ## 2: 2 1.554404 ## 3: 3 2.033613 ## 4: 4 3.089172 ## 5: 5a 3.182365 ## 6: 5b 2.320574 ## 7: 6 2.133152 ## 8: 8 3.821862 ## 9: 9 3.198795 ## 10: 11 3.062385 ## 11: 12 3.611440 ## 12: 13 2.034707 ## 13: 15 4.172533 ## 14: 16 2.455685 ## 15: 5 2.500000 ## 16: 8a 3.442177 ## 17: 8b 2.612903 ## 18: 8c 2.666667 ``` ] ] --- .small[ ```r dt[, mean(occ), by = Date] ``` ``` ## Date V1 ## 1: 03/24/22 3.158537 ## 2: 03/07/22 2.654762 ## 3: 03/10/22 3.263158 ## 4: 03/04/22 2.781250 ## 5: 03/03/22 2.459459 ## --- ## 103: 22.12.2022 2.875000 ## 104: 20.12.2022 3.051020 ## 105: 15.12.2022 3.197279 ## 106: 13.12.2022 3.246575 ## 107: 01.12.2022 3.246914 ``` ] -- .small[ ```r dt[, Date := gsub(pattern = "([[:digit:]]{2})\\/([[:digit:]]{2})\\/([[:digit:]]{2})", replacement = "\\2\\.\\1\\.20\\3", x = Date)] dt[, mean(occ), by = Date] ``` ``` ## Date V1 ## 1: 24.03.2022 3.158537 ## 2: 07.03.2022 2.654762 ## 3: 10.03.2022 3.263158 ## 4: 04.03.2022 2.781250 ## 5: 03.03.2022 2.459459 ## --- ## 103: 22.12.2022 2.875000 ## 104: 20.12.2022 3.051020 ## 105: 15.12.2022 3.197279 ## 106: 13.12.2022 3.246575 ## 107: 01.12.2022 3.246914 ``` ] -- .small[ .pull-left[ ```r dt[, Date := as.POSIXct(x = Date, format = "%d.%m.%Y")] dt[, mean(occ), by = year(Date)] ``` ``` ## year V1 ## 1: 2022 2.996418 ``` ] .pull-right[ ```r dt[, mean(occ), by = month(Date)] ``` ``` ## month V1 ## 1: 3 3.044476 ## 2: 4 3.136646 ## 3: 5 3.087034 ## 4: 6 3.170354 ## 5: 7 3.104978 ## 6: 8 2.339009 ## 7: 9 3.188769 ## 8: 10 2.796392 ## 9: 11 2.911765 ## 10: 12 2.951456 ``` ] ] --- .center[ <img src="img/occ-excel-proportions.png" width="100%" /> ] --- ```r *dt[, Month := format(Date, "%Y-%m")] dt[, .N, by = .(Month, occ)][, 
.(occ, p = proportions(N) * 100.0), by = .(Month)] |> ``` .center[Create column `Month` (`character`) holding only the year and month of the `Date` (`POSIXct`) column] --- ```r dt[, Month := format(Date, "%Y-%m")] *dt[, .N, by = .(Month, occ)][, .(occ, p = proportions(N) * 100.0), by = .(Month)] ``` .center[Count (`.N`) the number of rows, group `by` month and occupancy level] -- .center[... calculate the proportions of N, add occupancy level, do this for each group of `Month`] -- ``` ## Month occ p ## 1: 2022-03 1 9.7560976 ## 2: 2022-03 3 33.5724534 ## 3: 2022-03 2 19.7991392 ## 4: 2022-03 4 27.8335725 ## 5: 2022-03 0 0.4304161 ## 6: 2022-03 5 8.6083214 ## 7: 2022-04 3 29.5031056 ## 8: 2022-04 4 29.6066253 ## 9: 2022-04 2 20.0828157 ## 10: 2022-04 5 11.4906832 ## 11: 2022-04 1 9.1097308 ## 12: 2022-04 0 0.2070393 ## 13: 2022-05 2 19.9822380 ## 14: 2022-05 1 9.2362345 ## 15: 2022-05 3 32.6820604 ## 16: 2022-05 4 28.1527531 ## 17: 2022-05 5 9.7690941 ## 18: 2022-05 0 0.1776199 ## 19: 2022-06 2 17.1460177 ## 20: 2022-06 1 9.9557522 ``` --- ```r dt[, Month := format(Date, "%Y-%m")] dt[, .N, by = .(Month, occ)][, .(occ, p = proportions(N) * 100.0), by = .(Month)] |> *dcast(Month~occ, value.var = "p") |> kable(digits = 2L) ``` .center[Pivot (from long to wide)] .center[... 
output as formatted table on this slide] --- ```r dt[, Month := format(Date, "%Y-%m")] dt[, .N, by = .(Month, occ)][, .(occ, p = proportions(N) * 100.0), by = .(Month)] |> dcast(Month~occ, value.var = "p") |> kable(digits = 0L) ``` |Month | 0| 1| 2| 3| 4| 5| 21| |:-------|--:|--:|--:|--:|--:|--:|--:| |2022-03 | 0| 10| 20| 34| 28| 9| NA| |2022-04 | 0| 9| 20| 30| 30| 11| NA| |2022-05 | 0| 9| 20| 33| 28| 10| NA| |2022-06 | 0| 10| 17| 30| 32| 11| NA| |2022-07 | NA| 10| 18| 35| 30| 8| 0| |2022-08 | NA| 31| 25| 29| 9| 5| NA| |2022-09 | 0| 11| 24| 22| 20| 23| NA| |2022-10 | 0| 15| 30| 28| 13| 14| NA| |2022-11 | NA| 10| 34| 27| 14| 15| NA| |2022-12 | 0| 13| 27| 26| 19| 15| NA| --- .middle[ .pull-left[ <img src="img/occ-excel-proportions.png" width="100%" /> ] .pull-right[ |Month | 0| 1| 2| 3| 4| 5| 21| |:-------|---:|----:|----:|----:|----:|----:|---:| |2022-03 | 0.4| 9.8| 19.8| 33.6| 27.8| 8.6| NA| |2022-04 | 0.2| 9.1| 20.1| 29.5| 29.6| 11.5| NA| |2022-05 | 0.2| 9.2| 20.0| 32.7| 28.2| 9.8| NA| |2022-06 | 0.1| 10.0| 17.1| 29.6| 31.9| 11.3| NA| |2022-07 | NA| 9.5| 17.9| 35.0| 29.7| 7.9| 0.1| |2022-08 | NA| 30.7| 25.1| 29.4| 9.4| 5.4| NA| |2022-09 | 0.2| 10.9| 24.1| 22.2| 19.6| 22.9| NA| |2022-10 | 0.1| 14.9| 30.3| 28.2| 12.6| 13.8| NA| |2022-11 | NA| 10.0| 33.7| 26.6| 14.4| 15.2| NA| |2022-12 | 0.2| 13.0| 26.6| 26.4| 19.2| 14.6| NA| ] ] --- ```r plot(dt[, .(occ = mean(occ, na.rm = TRUE)), keyby = Date], type = 'b') ``` ![](index_files/figure-html/unnamed-chunk-30-1.png)<!-- --> --- ```r plot(dt[, .(occ = mean(occ, na.rm = TRUE)), keyby = Date], type = 'b', ylim = c(0, 5), main = 'average occupancy level per day', ylab = "occupancy") ``` ![](index_files/figure-html/unnamed-chunk-31-1.png)<!-- --> --- ```r p <- ggplot(data = dt[, .(occ = mean(occ, na.rm = TRUE)), by = Date], mapping = aes(x = Date, y = occ)) + geom_point() + geom_line() + scale_y_continuous(limits = c(0, 5), breaks = 0:5) print(p) ``` <img src="index_files/figure-html/unnamed-chunk-32-1.png" width="100%" /> 
--- ```r p <- ggplot(data = dt[, .(occ = mean(occ, na.rm = TRUE), N = .N), by = Date], * mapping = aes(x = Date, y = occ, size = N, color = N), color = "black") + *geom_point() + geom_line(size = 0.5, color = "black") + scale_y_continuous(limits = c(0, 5), breaks = 0:5) print(p) ``` <img src="index_files/figure-html/unnamed-chunk-33-1.png" width="100%" /> ---