## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 7,
  fig.height = 5,
  fig.align = "center",
  message = FALSE,
  warning = FALSE
)

## ----load-packages------------------------------------------------------------
# Core packages
library(datacommons)
library(dplyr)
library(tidyr)
library(stringr)
library(ggplot2)
library(scales)
library(knitr)

# Set a consistent theme for plots
theme_set(theme_minimal())

## ----api-setup, eval=FALSE----------------------------------------------------
# # Set your API key
# dc_set_api_key("YOUR_API_KEY_HERE")
# 
# # Or manually set DATACOMMONS_API_KEY in your .Renviron file

## ----api-check, echo=FALSE----------------------------------------------------
# Check for API key
if (!dc_has_api_key()) {
  message("This vignette requires a Data Commons API key.")
  message(
    "Get your free key at: https://docs.datacommons.org/api/#obtain-an-api-key"
  )
  knitr::knit_exit()
}

## ----national-population------------------------------------------------------
# Get U.S. population data
# We found these identifiers using the Data Commons website:
# - Variable: "Count_Person" (from Statistical Variable Explorer)
# - Place: "country/USA" (from Place Explorer)
us_population <- dc_get_observations(
  variable_dcids = "Count_Person",
  entity_dcids = "country/USA",
  date = "all",
  return_type = "data.frame"
)

# Examine the structure
glimpse(us_population)

# Notice we have multiple sources (facets) for the same data
unique(us_population$facet_name)

## ----clean-population-data----------------------------------------------------
# Strategy 1: Use the official Census PEP (Population Estimates Program)
us_pop_clean <- us_population |>
  filter(facet_name == "USCensusPEP_Annual_Population") |>
  mutate(
    year = as.integer(date),
    population_millions = value / 1e6
  ) |>
  filter(year >= 2000) |>
  select(year, population_millions) |>
  arrange(year)

# Visualize the clean data
ggplot(us_pop_clean, aes(x = year, y = population_millions)) +
  geom_line(color = "steelblue", linewidth = 1.5) +
  geom_point(color = "steelblue", size = 3) +
  scale_y_continuous(
    labels = label_number(suffix = "M"),
    limits = c(280, 350)
  ) +
  scale_x_continuous(breaks = seq(2000, 2025, 5)) +
  labs(
    title = "U.S. Population Growth",
    subtitle = "Using Census Population Estimates Program data",
    x = "Year",
    y = "Population (millions)",
    caption = "Source: U.S. Census Bureau via Data Commons"
  )

# Calculate average annual growth
growth_rate <- us_pop_clean |>
  mutate(
    annual_growth =
      (population_millions / lag(population_millions) - 1) * 100
  ) |>
  summarize(
    avg_growth = mean(annual_growth, na.rm = TRUE),
    pre_2020_growth = mean(annual_growth[year < 2020], na.rm = TRUE),
    post_2020_growth = mean(annual_growth[year >= 2020], na.rm = TRUE)
  )

message(
  "Average annual growth rate: ", round(growth_rate$avg_growth, 2), "%\n",
  "Pre-2020 growth rate: ", round(growth_rate$pre_2020_growth, 2), "%\n",
  "Post-2020 growth rate: ", round(growth_rate$post_2020_growth, 2), "%"
)

# Note for statistics enthusiasts: We're using arithmetic mean for simplicity.
# For compound growth rates, geometric mean would be more precise, which you can
# calculate as geometric_mean_growth = (last_value/first_value)^(1/n_years) - 1.
# But given the small annual changes (~0.7%), the difference is negligible.

## ----aggregate-sources--------------------------------------------------------
# Strategy 2: Take the median across all sources for each year
us_pop_aggregated <- us_population |>
  filter(!is.na(value)) |>
  mutate(year = as.integer(date)) |>
  filter(year >= 2000) |>
  group_by(year) |>
  summarize(
    population_millions = median(value) / 1e6,
    n_sources = n_distinct(facet_name),
    .groups = "drop"
  )

# Show which years have the most sources
us_pop_aggregated |>
  slice_max(n_sources, n = 5) |>
  kable(caption = "Years with most data sources")

## ----state-analysis-----------------------------------------------------------
# Method 1: Use the parent/child relationship in observations
state_data <- dc_get_observations(
  variable_dcids = c("Count_Person", "Median_Age_Person"),
  date = "latest",
  parent_entity = "country/USA",
  entity_type = "State",
  return_type = "data.frame"
)

# The code automatically constructs the following query:
# Find all entities of type State contained in country/USA

glimpse(state_data)

# Process the data - reshape from long to wide
state_summary <- state_data |>
  filter(str_detect(facet_name, "Census")) |> # Use Census data
  select(
    entity = entity_dcid,
    state_name = entity_name,
    variable = variable_dcid,
    value
  ) |>
  group_by(entity, state_name, variable) |>
  # Take first if duplicates
  summarize(value = first(value), .groups = "drop") |>
  pivot_wider(names_from = variable, values_from = value) |>
  filter(
    !is.na(Count_Person),
    !is.na(Median_Age_Person),
    Count_Person > 500000 # Focus on states, not small territories
  )

# Visualize the relationship
ggplot(state_summary, aes(x = Median_Age_Person, y = Count_Person)) +
  geom_point(aes(size = Count_Person), alpha = 0.6, color = "darkblue") +
  geom_text(
    data = state_summary |> filter(Count_Person > 10e6),
    aes(label = state_name),
    vjust = -1, hjust = 0.5, size = 3
  ) +
  scale_y_log10(labels = label_comma()) +
  scale_size(range = c(3, 10), guide = "none") +
  labs(
    title = "State Demographics: Population vs. Median Age",
    x = "Median Age (years)",
    y = "Population (log scale)",
    caption = "Source: U.S. Census Bureau via Data Commons"
  )

# Find extremes
state_summary |>
  filter(
    Median_Age_Person == min(Median_Age_Person) |
      Median_Age_Person == max(Median_Age_Person)
  ) |>
  select(state_name, Median_Age_Person, Count_Person) |>
  mutate(Count_Person = label_comma()(Count_Person)) |>
  kable(caption = "States with extreme median ages")

## ----cross-dataset------------------------------------------------------------
# Get multiple variables for California counties
# Notice how we can mix variables from different sources in one query!
ca_integrated <- dc_get_observations(
  variable_dcids = c(
    "Count_Person", # Census
    "Median_Age_Person", # Census
    "Percent_Person_Obesity", # CDC
    "UnemploymentRate_Person" # BLS
  ),
  date = "latest",
  parent_entity = "geoId/06", # California
  entity_type = "County",
  return_type = "data.frame"
)

# Check which sources we're pulling from
ca_integrated |>
  group_by(variable_name, facet_name) |>
  summarize(n = n(), .groups = "drop") |>
  slice_head(n = 10) |>
  kable(caption = "Data sources by variable")

# Process the integrated data
ca_analysis <- ca_integrated |>
  # Pick one source per variable for consistency
  filter(
    (variable_dcid == "Count_Person" &
       str_detect(facet_name, "CensusACS5Year")) |
      (variable_dcid == "Median_Age_Person" &
         str_detect(facet_name, "CensusACS5Year")) |
      (variable_dcid == "Percent_Person_Obesity" &
         str_detect(facet_name, "CDC")) |
      (variable_dcid == "UnemploymentRate_Person" &
         str_detect(facet_name, "BLS"))
  ) |>
  select(
    entity = entity_dcid,
    county_name = entity_name,
    variable = variable_dcid,
    value
  ) |>
  group_by(entity, county_name, variable) |>
  summarize(value = first(value), .groups = "drop") |>
  pivot_wider(names_from = variable, values_from = value) |>
  drop_na() |>
  mutate(
    county_name = str_remove(county_name, " County$"),
    population_k = Count_Person / 1000
  )

# Explore relationships between variables
ggplot(ca_analysis, aes(x = Median_Age_Person, y = Percent_Person_Obesity)) +
  geom_point(aes(size = Count_Person, color = UnemploymentRate_Person),
    alpha = 0.7
  ) +
  geom_smooth(method = "lm", se = FALSE, color = "red", linetype = "dashed") +
  scale_size(range = c(2, 10), guide = "none") +
  scale_color_viridis_c(name = "Unemployment\nRate (%)") +
  labs(
    title = "California Counties: Age, Obesity, and Unemployment",
    subtitle = "Integrating Census, CDC, and BLS data through Data Commons",
    x = "Median Age (years)",
    y = "Obesity Rate (%)",
    caption = "Sources: Census ACS, CDC PLACES, BLS via Data Commons"
  ) +
  theme(legend.position = "right")

# Show correlations
ca_analysis |>
  select(Median_Age_Person, Percent_Person_Obesity, UnemploymentRate_Person) |>
  cor() |>
  round(2) |>
  kable(caption = "Correlations between demographic and health variables")

## ----time-series--------------------------------------------------------------
# Major city DCIDs (found using datacommons.org/place)
major_cities <- c(
  "geoId/3651000", # New York City
  "geoId/0644000", # Los Angeles
  "geoId/1714000", # Chicago
  "geoId/4835000", # Houston
  "geoId/0455000" # Phoenix
)

# Get historical population data
city_populations <- dc_get_observations(
  variable_dcids = "Count_Person",
  entity_dcids = major_cities,
  date = "all",
  return_type = "data.frame"
)

# Process - use Census PEP data for consistency
city_pop_clean <- city_populations |>
  filter(
    str_detect(facet_name, "USCensusPEP"),
    !is.na(value)
  ) |>
  mutate(
    year = as.integer(date),
    population_millions = value / 1e6,
    city = str_extract(entity_name, "^[^,]+")
  ) |>
  filter(year >= 2000) |>
  group_by(city, year) |>
  summarize(
    population_millions = mean(population_millions),
    .groups = "drop"
  )

# Visualize growth trends
ggplot(
  city_pop_clean,
  aes(x = year, y = population_millions, color = city)
) +
  geom_line(linewidth = 1.2) +
  geom_point(size = 2) +
  scale_color_brewer(palette = "Set1") +
  scale_y_continuous(labels = label_number(suffix = "M")) +
  labs(
    title = "Population Growth in Major U.S. Cities",
    x = "Year",
    y = "Population (millions)",
    color = "City",
    caption = "Source: Census Population Estimates Program via Data Commons"
  ) +
  theme(legend.position = "bottom")

# Calculate growth rates
city_pop_clean |>
  group_by(city) |>
  filter(year == min(year) | year == max(year)) |>
  arrange(city, year) |>
  summarize(
    years = paste(min(year), "-", max(year)),
    start_pop = first(population_millions),
    end_pop = last(population_millions),
    total_growth_pct = round((end_pop / start_pop - 1) * 100, 1),
    .groups = "drop"
  ) |>
  arrange(desc(total_growth_pct)) |>
  kable(
    caption = "City population growth rates",
    col.names = c("City", "Period", "Start (M)", "End (M)", "Growth %")
  )

## ----understanding-facets-----------------------------------------------------
# Example: What sources are available for U.S. population?
us_population |>
  filter(date == "2020") |>
  select(facet_name, value) |>
  distinct() |>
  mutate(value = label_comma()(value)) |>
  arrange(desc(value)) |>
  kable(caption = "Different sources for 2020 U.S. population")

