# Regular Expressions (regexes)

library(tidyverse)
library(babynames)

# Show the example data used by the demos below.
# babynames is the data frame queried in the dplyr pipelines (the columns
# used in this script are name and year).
babynames
# fruit, words and sentences are the character vectors passed to str_view().
fruit
words
sentences

# Regex Operations
#   .      Any one character
#   *      Zero or more of whatever comes before it
#   +      One or more of whatever comes before it
#   ?      Zero or one of whatever comes before it
#   |      The thing before "or" the thing after
#   []     Any one thing inside
#   [^]    Any one thing not inside
#   ^      The start of the text
#   $      The end of the text
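#   \d     Any one digit (typed as "\\d" inside an R string)
#   \s     Any one whitespace character (typed as "\\s")
#   \w     Any one word character: letter, digit or underscore (typed as "\\w")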
#   {n}    Match exactly n times
#   {n,}   Match n or more times
#   {0,m}  Match no more than m times (write the 0; ICU rejects a bare {,m})
#   {n,m}  Match n or more times, but no more than m times
#   ()\\1  Capture a group with (); \\1 then matches the same text again

# "a." needs an "a" followed by any one character, so a lone or trailing "a"
# does not match.
c("a", "ab", "ae", "bd", "ea", "eab") |> str_view("a.")

# An "a", any three characters, then an "e".
fruit |> str_view("a...e")

# Quantifiers applied to "b": optional (?), one or more (+), zero or more (*).
c("a", "ab", "abb") |> str_view("ab?")
c("a", "ab", "abb") |> str_view("ab+")
c("a", "ab", "abb") |> str_view("ab*")

# An "x" with a vowel on each side.
str_view(string = words, pattern = "[aeiou]x[aeiou]")
# A "y" with a non-vowel character on each side.
str_view(string = words, pattern = "[^aeiou]y[^aeiou]")
# Any ASCII letter, then "y", then any character outside the range a-d.
str_view(string = words, pattern = "[a-zA-Z]y[^a-d]")

# Alternation: any of the listed substrings.
fruit |> str_view("apple|melon|nut")
# Alternation again: any doubled vowel.
fruit |> str_view("aa|ee|ii|oo|uu")

# A pattern without metacharacters matches itself literally.
sentences |> str_view("tree")

# Three views of the same match against a vowel class: a logical value per
# element, the matching elements themselves, and their positions.
str_detect(string = c("a", "b", "c"), pattern = "[aeiou]")
str_subset(string = c("a", "b", "c"), pattern = "[aeiou]")
str_which(string = c("a", "b", "c"), pattern = "[aeiou]")

# Keep the rows whose name contains a lowercase "x" (matching is
# case-sensitive, so a capital "X" alone does not count).
filter(babynames, str_detect(name, pattern = "x"))

# For every year, the share of rows whose name contains a lowercase "x"
# (the mean of a logical vector is the proportion of TRUE values), drawn as
# a line over time.
babynames |>
  group_by(year) |>
  summarise(prop_x = mean(str_detect(name, pattern = "x"))) |>
  ggplot(mapping = aes(year, prop_x)) +
  geom_line()

# Number of matches in each string.
c("apple", "banana", "pear") |> str_count("p")

# Matches never overlap: after the first "aba" the search resumes at the
# fourth character, so str_count() reports 2 and str_view() marks two matches.
"abababa" |> str_count("aba")
"abababa" |> str_view("aba")

# Collapse to one row per distinct name, then count vowels and non-vowels.
# The vowel class spells out both cases, while the second pattern gets its
# case-insensitivity from regex(ignore_case = TRUE). Note that [^aeiou]
# counts every character that is not a vowel, not only letters.
babynames |>
  count(name) |>
  mutate(vowels = str_count(name, "[aeiouAEIOU]")) |>
  mutate(consonants = str_count(name, regex("[^aeiou]", ignore_case = TRUE)))

# Literal "$^$": each metacharacter is escaped with a backslash, and the
# backslash itself is doubled inside the R string.
c("$^$", "$^^$", "$$") |> str_view("\\$\\^\\$")

# Text whose first character is "y".
words |> str_view("^y")

# Text whose last character is "x".
words |> str_view("x$")

# Exactly three characters, anchored at both ends. The counted form and the
# spelled-out form are equivalent.
str_view(string = words, pattern = "^.{3}$")
str_view(string = words, pattern = "^...$")

# Seven or more characters. The unanchored second pattern selects the same
# words but its match covers only the first seven characters of each.
str_view(string = words, pattern = "^.{7,}$")
str_view(string = words, pattern = ".......")

# Text that begins with a vowel.
words |> str_view("^[aeiou]")

# Text with no vowels at all, two ways: drop every element that contains a
# vowel, or require every character to be a non-vowel.
words |> str_subset("[aeiou]", negate = TRUE)
words |> str_view("^[^aeiou]+$")

# Match any text that ends with "ed", but not with "eed".
# (^|[^e]) accepts either the start of the text or any character other than
# "e" in front of the final "ed", so the bare string "ed" qualifies too. A
# plain [^e] would demand a preceding character and miss that case.
str_view(words, "(^|[^e])ed$")

# Text ending in "ing" or "ise": a shared "i" followed by either suffix.
str_view(string = words, pattern = "i(ng|se)$")

# Text whose first three characters are all non-vowels.
str_view(string = words, pattern = "^[^aeiou]{3}")

# Three or more vowels in a row, anywhere in the text.
str_view(string = words, pattern = "[aeiou]{3,}")

# Two or more vowel-then-non-vowel pairs directly after one another.
str_view(string = words, pattern = "([aeiou][^aeiou]){2,}")

# Swap the first and last letters of each word: capture the first letter,
# the middle and the last letter, then emit them in last-middle-first order.
# Elements that do not fit the pattern (e.g. a single letter) stay unchanged.
new_words <- str_replace(
  string = words,
  pattern = "^([A-Za-z])(.*)([A-Za-z])$",
  replacement = "\\3\\2\\1"
)

# Swapped results that are themselves entries of words.
intersect(new_words, words)
