# load needed libraries
library(DBI) # <- low-level database functions
library(dbplyr) # <- high-level database functions using dplyr verbs
library(tidyverse) # <- the legend itself
library(duckdb) # <- a packge for creating databases

# here are two potential ways to connect to a database
con <- DBI::dbConnect(
  RMariaDB::MariaDB(),
  username = "beau"
)
con <- DBI::dbConnect(
  RPostgres::Postgres(),
  hostname = "databases.mycompany.com",
  username = "beau"
)

# we will use duckdb, so let's create a temporary databse, and connect to it

# Approach #1: Create a database in memory that will automatically delete itself
con <- dbConnect(duckdb())

# Approach #2: Create a database on disk (a file) that persists until you delete it
con <- dbConnect(duckdb(), dbdir = "/Users/christbm/Desktop/duckdb")

# here are two datasets in gpplot2 to experiment with
View(mpg)
View(diamonds)

# write the data into your database
dbWriteTable(con, "mpg", mpg)
dbWriteTable(con, "diamonds", diamonds)

# check that the data is there
dbListTables(con)

# load the data from the database back into R
dbReadTable(con, "diamonds") |>
  as_tibble()

# here is an example SQL query for asking for data from a database
sql <- "
  SELECT carat, cut, clarity, color, price
  FROM diamonds
  WHERE price > 15000
"

# if you have a SQL query, this will run it and get the data
dbGetQuery(con, sql) |>
  as_tibble()

# let's use a higher-level approach using "dbplyr"

# first, link to a table in a database
diamonds_db <- tbl(con, "diamonds")

# or if you know SQL, you could query exact data using this approach
diamonds_db <- tbl(con, sql("SELECT * FROM diamonds"))

# run usual dplyr commands on that linked table
big_diamonds <- diamonds_db |>
  filter(price > 15000) |>
  select(carat:clarity, price)

# check out the SQL command that was automatically created for you
big_diamonds |>
  show_query()

# if you actually need the data, collect it
big_diamonds |>
  collect()

# let's try with the nycflights13 dataset
dbplyr::copy_nycflights13(con)
flights <- tbl(con, "flights")
planes <- tbl(con, "planes")

# notice if you don't do anything, it just selects all data by default
flights |>
  show_query()

# look at the sql that is created by running filter and arrange
flights |> 
  filter(dest == "IAH") |> 
  arrange(dep_delay) |>
  show_query()

# now check out group_by and summarize
flights |> 
  group_by(dest) |> 
  summarize(dep_delay = mean(dep_delay, na.rm = TRUE)) |> 
  show_query()

# try select
planes |> 
  select(tailnum, type, manufacturer, model, year) |> 
  show_query()
planes |> 
  select(tailnum, type, manufacturer, model, year) |> 
  rename(year_built = year) |> 
  show_query()
planes |> 
  select(tailnum, type, manufacturer, model, year) |> 
  relocate(manufacturer, model, .before = type) |> 
  show_query()

# try mutate
flights |> 
  mutate(
    speed = distance / (air_time / 60)
  ) |> 
  show_query()

# group_by
diamonds_db |> 
  group_by(cut) |> 
  summarize(
    n = n(),
    avg_price = mean(price, na.rm = TRUE)
  ) |> 
  show_query()

# what happens to filter?
flights |> 
  filter(dest == "IAH" | dest == "HOU") |> 
  show_query()
flights |> 
  filter(arr_delay > 0 & arr_delay < 20) |> 
  show_query()
flights |> 
  filter(dest %in% c("IAH", "HOU")) |> 
  show_query()

# arrange
flights |> 
  arrange(year, month, day, desc(dep_delay)) |> 
  show_query()

# and if you want to explore deeper, such as joining tables together
flights |> 
  left_join(planes |> rename(year_built = year), by = "tailnum") |> 
  show_query()

# regarding SQL
#   SELECT, select, and SelECt are the same (it's case-insenstive)
#   ' ' is for strings
#   AND/OR is used instead of &/|
#   = is used for comparison, because SQL has no assignment
#   NULL is used instead of NA


beatles <- read_csv("Desktop/beatles.csv", col_types = cols(id = col_skip()))
dbWriteTable(con, "beatles", beatles)
dbListTables(con)
data <- dbReadTable(con, "beatles") |>
  as_tibble()
View(data)
