Document Extraction Verification: US

Data completeness and parsing quality checks

Published

January 22, 2026

Setup

Show code
# Setup: attach packages and anchor the project root / targets store.
library(tidyverse)
library(targets)
library(here)
library(quanteda)
library(kableExtra)
library(digest)
# Consistency fix: load quarto the same way as every other dependency.
# pacman::p_load() would silently install a missing package, unlike library().
library(quarto)

here::i_am("notebooks/verify_body.qmd")
tar_config_set(store = here("_targets"))


# Load data - tar_read_raw accepts character strings
body_data <- tar_read_raw(params$body_target)

# Load labels if available. Assign the tryCatch() result directly rather than
# mutating `has_labels` from inside the error handler with `<<-`, which is
# fragile (it silently creates a global if the variable does not already
# exist in an enclosing environment).
labels_data <- NULL
if (!is.null(params$labels_target) && nzchar(params$labels_target)) {
  labels_data <- tryCatch(
    tar_read_raw(params$labels_target),
    error = function(e) {
      message("Labels target not found - skipping known act validation")
      NULL
    }
  )
}
# NOTE(review): assumes a successfully-read labels target is never NULL.
has_labels <- !is.null(labels_data)

# Fiscal vocabulary
fiscal_terms <- params$fiscal_vocab

# Test results storage
test_results <- list()

Overview Statistics

Show code
# Corpus-level counts: documents, pages, year span, sources, and OCR usage.
overview_stats <- summarize(
  body_data,
  total_documents = n(),
  successful_extractions = sum(n_pages > 0),
  total_pages = sum(n_pages),
  years_covered = n_distinct(year),
  year_range = sprintf("%d-%d", min(year), max(year)),
  sources_used = n_distinct(source),
  document_types = n_distinct(body),
  ocr_documents = sum(ocr_used, na.rm = TRUE)
)

# Render as a long two-column (Metric, Value) table.
overview_stats %>%
  mutate(across(everything(), as.character)) %>%
  pivot_longer(everything(), names_to = "Metric", values_to = "Value") %>%
  mutate(Metric = str_to_title(str_replace_all(Metric, "_", " "))) %>%
  kable(caption = "Extraction Overview") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Extraction Overview
Metric Value
Total Documents 313
Successful Extractions 304
Total Pages 97475
Years Covered 77
Year Range 1946-2022
Sources Used 3
Document Types 3
Ocr Documents 64

Page Distribution by Source and Body

Show code
# Stacked columns of extracted pages per year, colored by document type,
# with one panel per source.
body_data %>%
  filter(n_pages > 0) %>%
  ggplot(aes(x = year, y = n_pages, fill = body)) +
  geom_col() +
  facet_wrap(vars(source), ncol = 1) +
  labs(
    title = "Pages Extracted by Year, Source, and Body",
    x = "Year",
    y = "Number of Pages",
    fill = "Document Type"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")

Show code
# Histogram of per-document page counts, one panel per document type
# (free y scales because the types have very different corpus sizes).
body_data %>%
  filter(n_pages > 0) %>%
  ggplot(aes(x = n_pages, fill = body)) +
  geom_histogram(bins = 50, alpha = 0.7) +
  facet_wrap(vars(body), ncol = 1, scales = "free_y") +
  labs(
    title = "Distribution of Page Counts by Document Type",
    x = "Number of Pages",
    y = "Count"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

Test (i): PDF URL Resolution & Page Count Validation

Show code
# Test (i): did every PDF URL resolve and parse into at least one page?

# Documents whose extraction produced zero pages.
failed_extractions <- body_data %>%
  filter(n_pages == 0) %>%
  select(year, body, source, pdf_url, n_pages)

# Per source/body extraction outcomes.
url_resolution_summary <- body_data %>%
  group_by(source, body) %>%
  summarize(
    total_docs = n(),
    successful = sum(n_pages > 0),
    failed = sum(n_pages == 0),
    success_rate = mean(n_pages > 0),
    ocr_docs = sum(ocr_used, na.rm = TRUE),
    ocr_rate = mean(ocr_used, na.rm = TRUE),
    .groups = "drop"
  )

# Share of documents that yielded at least one page.
overall_success_rate <- mean(body_data$n_pages > 0)

# Scalar condition, so plain if/else rather than case_when().
test_i_status <- if (overall_success_rate >= 0.95) {
  "PASS"
} else if (overall_success_rate >= 0.85) {
  "WARN"
} else {
  "FAIL"
}

test_results$test_i <- list(
  metric = "URL resolution success rate",
  value = sprintf("%.1f%%", overall_success_rate * 100),
  target = "≥95%",
  status = test_i_status
)

Results

Overall Success Rate: 97.1% Status: PASS

Show code
# Either celebrate a clean run or list every failed extraction.
if (nrow(failed_extractions) == 0) {
  cat("\n✅ All PDFs successfully extracted!\n\n")
} else {
  cat("\n### Failed Extractions\n\n")
  failed_extractions %>%
    kable(caption = sprintf("Failed PDF extractions (%d documents)", nrow(failed_extractions))) %>%
    kable_styling(bootstrap_options = c("striped", "hover"))
}

### Failed Extractions
Failed PDF extractions (9 documents)
year body source pdf_url n_pages
1991 Budget of the United States Government fraser.stlouisfed.org https://fraser.stlouisfed.org/files/docs/publications/usbudget/bus_1991.pdf 0
1992 Budget of the United States Government fraser.stlouisfed.org https://fraser.stlouisfed.org/files/docs/publications/usbudget/bus_1992.pdf 0
1993 Budget of the United States Government fraser.stlouisfed.org https://fraser.stlouisfed.org/files/docs/publications/usbudget/bus_1993.pdf 0
1994 Budget of the United States Government fraser.stlouisfed.org https://fraser.stlouisfed.org/files/docs/publications/usbudget/bus_1994.pdf 0
2005 Budget of the United States Government fraser.stlouisfed.org https://fraser.stlouisfed.org/files/docs/publications/usbudget/BUDGET-2005-BUD.pdf 0
2006 Budget of the United States Government fraser.stlouisfed.org https://fraser.stlouisfed.org/files/docs/publications/usbudget/BUDGET-2006-BUD.pdf 0
2007 Budget of the United States Government fraser.stlouisfed.org https://fraser.stlouisfed.org/files/docs/publications/usbudget/BUDGET-2007-BUD.pdf 0
2008 Budget of the United States Government fraser.stlouisfed.org https://fraser.stlouisfed.org/files/docs/publications/usbudget/BUDGET-2008-BUD.pdf 0
2009 Budget of the United States Government fraser.stlouisfed.org https://fraser.stlouisfed.org/files/docs/publications/usbudget/BUDGET-2009-BUD.pdf 0
Show code
# Per source/body success rates, formatted as percentages for display.
url_resolution_summary %>%
  mutate(success_rate = sprintf("%.1f%%", 100 * success_rate)) %>%
  kable(caption = "Success Rate by Source and Body") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Success Rate by Source and Body
source body total_docs successful failed success_rate ocr_docs ocr_rate
fraser.stlouisfed.org Annual Report of the Treasury 35 35 0 100.0% 7 0.2000000
fraser.stlouisfed.org Budget of the United States Government 191 182 9 95.3% 0 0.0000000
fraser.stlouisfed.org Economic Report of the President 48 48 0 100.0% 40 0.8333333
govinfo.gov Economic Report of the President 27 27 0 100.0% 17 0.6296296
home.treasury.gov Annual Report of the Treasury 12 12 0 100.0% 0 0.0000000
Show code
# Stacked counts of OCR vs non-OCR documents per source.
ocr_counts <- body_data %>%
  filter(n_pages > 0) %>%
  count(source, ocr_used)

ggplot(ocr_counts, aes(x = source, y = n, fill = ocr_used)) +
  geom_col(position = "stack") +
  labs(
    title = "OCR Usage by Source",
    x = "Source",
    y = "Number of Documents",
    fill = "OCR Used"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Test (ii): Boundary Document Verification

Show code
# Earliest and latest successfully-extracted document per source/body pair.
extracted <- filter(body_data, n_pages > 0)

boundary_docs <- bind_rows(
  extracted %>%
    group_by(source, body) %>%
    slice_min(year, n = 1, with_ties = FALSE) %>%
    mutate(boundary_type = "Earliest"),
  extracted %>%
    group_by(source, body) %>%
    slice_max(year, n = 1, with_ties = FALSE) %>%
    mutate(boundary_type = "Latest")
) %>%
  ungroup()


# Pull a representative first / middle / last page of text from a document's
# pages. `text_list` may be a plain character vector of pages or a list
# wrapping such a vector (a list-column element), so flatten defensively.
# Returns a list(first, middle, last); entries are NA when unavailable.
extract_sample_pages <- function(text_list) {
  no_sample <- list(first = NA, middle = NA, last = NA)
  if (is.null(text_list) || length(text_list) == 0) {
    return(no_sample)
  }

  # BUG FIX: the original took text_list[[1]] unconditionally, which grabs
  # only the *first page string* when text_list is a plain character vector,
  # so n was always 1 and middle/last were always NA (the rendered report
  # showed "(No text available)" for every 700+ page document).
  pages <- if (is.list(text_list)) {
    unlist(text_list, use.names = FALSE)
  } else {
    text_list
  }
  pages <- as.character(pages)
  n <- length(pages)

  if (n == 0) {
    return(no_sample)
  }

  list(
    first = pages[1],
    middle = if (n > 1) pages[ceiling(n / 2)] else NA,
    last = if (n > 1) pages[n] else NA
  )
}

# Attach first/middle/last page samples to each boundary document.
boundary_docs <- boundary_docs %>%
  mutate(sample_pages = map(text, extract_sample_pages))

# A boundary document is considered valid when it has at least 10 pages.
boundary_valid <- all(boundary_docs$n_pages >= 10, na.rm = TRUE)

# Scalar condition: plain if/else instead of case_when().
test_ii_status <- if (boundary_valid) {
  "PASS"
} else if (any(boundary_docs$n_pages < 10 & boundary_docs$n_pages > 0)) {
  "WARN"
} else {
  "FAIL"
}

test_results$test_ii <- list(
  metric = "Boundary documents valid",
  value = sprintf("%d/%d valid", sum(boundary_docs$n_pages >= 10), nrow(boundary_docs)),
  target = "All ≥10 pages",
  status = test_ii_status
)

Results

Status: PASS

Show code
# Tabulate the boundary documents, sorted for easy scanning.
boundary_docs %>%
  arrange(body, source, boundary_type) %>%
  select(body, source, year, boundary_type, n_pages, ocr_used) %>%
  kable(caption = "Boundary Documents") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Boundary Documents
body source year boundary_type n_pages ocr_used
Annual Report of the Treasury fraser.stlouisfed.org 1946 Earliest 723 FALSE
Annual Report of the Treasury fraser.stlouisfed.org 1980 Latest 654 TRUE
Annual Report of the Treasury home.treasury.gov 2011 Earliest 192 FALSE
Annual Report of the Treasury home.treasury.gov 2022 Latest 118 FALSE
Budget of the United States Government fraser.stlouisfed.org 1946 Earliest 971 FALSE
Budget of the United States Government fraser.stlouisfed.org 2022 Latest 72 FALSE
Economic Report of the President fraser.stlouisfed.org 1947 Earliest 69 TRUE
Economic Report of the President fraser.stlouisfed.org 1988 Latest 384 TRUE
Economic Report of the President govinfo.gov 1996 Earliest 360 FALSE
Economic Report of the President govinfo.gov 2022 Latest 432 TRUE

Sample Pages from Boundary Documents

Show code
# Print a (possibly truncated) text excerpt as a fenced markdown code block.
# `text` is expected to be a single character string or NA; `label` is the
# heading; `max_chars` caps the displayed length.
display_sample_text <- function(text, label, max_chars = 1000) {
  # BUG FIX: is.null()/length checks must precede is.na() -- is.na(NULL)
  # returns logical(0), and a zero-length value inside `||` is an error.
  if (is.null(text) || length(text) == 0 || is.na(text) || nchar(text) == 0) {
    cat(sprintf("\n**%s:** (No text available)\n\n", label))
    return(invisible(NULL))
  }

  truncated <- str_trunc(text, max_chars)
  cat(sprintf("\n**%s:**\n\n", label))
  cat("```text\n")
  cat(truncated)
  cat("\n```\n\n")

  # Tell the reader when the excerpt was cut short.
  if (nchar(text) > max_chars) {
    cat(sprintf("*(Truncated: showing %d of %d characters)*\n\n", max_chars, nchar(text)))
  }
}

# Print a middle-page excerpt for up to three boundary documents.
n_show <- min(3, nrow(boundary_docs))
for (idx in seq_len(n_show)) {
  doc <- boundary_docs[idx, ]
  cat(sprintf("\n### %s - %s %d (%s)\n\n",
              doc$body, doc$boundary_type, doc$year, doc$source))

  display_sample_text(doc$sample_pages[[1]]$middle, "Middle Page", 800)
}

### Annual Report of the Treasury - Earliest 1946 (fraser.stlouisfed.org)


**Middle Page:** (No text available)


### Budget of the United States Government - Earliest 1946 (fraser.stlouisfed.org)


**Middle Page:** (No text available)


### Economic Report of the President - Earliest 1947 (fraser.stlouisfed.org)


**Middle Page:** (No text available)

Test (iii): Known Act Validation

Note — Important Finding: Year Lag in Economic Reports

The Economic Report of the President (ERP) discusses legislation retrospectively - acts passed in year N are typically discussed in the year N+1 or N+2 ERP. For example:

  • Economic Recovery Tax Act of 1981 → Found in 1982-1990 ERPs (not 1981)
  • Omnibus Budget Reconciliation Act of 1990 → Found in 1991-1994 ERPs (not 1990)
  • Tax Reform Act of 1986 → Found in 1987-1990 ERPs (not 1986)

This is expected behavior: ERPs review the previous year’s economic events and policy changes. Therefore, we use an expanded year window (year to year+2) when validating act detection.

Show code
# Test (iii): can known legislative acts be found in the document text?
# Recall is computed two ways: strict (document from the exact enactment
# year) and expanded (year..year+2, to accommodate the retrospective
# coverage of Economic Reports). The expanded figure is the gating metric.
if (has_labels) {
  # Prepare labels data with expected year
  labels_prepared <- labels_data %>%
    mutate(expected_year = year(date))

  # Get unique acts with their years (filter NA years)
  acts_by_year <- labels_prepared %>%
    distinct(act_name, expected_year) %>%
    filter(!is.na(expected_year))

  # Prepare document text: collapse each document's pages into one string
  # for whole-document substring search
  docs_with_text <- body_data %>%
    filter(n_pages > 0) %>%
    mutate(full_text = map_chr(.data$text, function(text_list) {
      if (is.null(text_list) || length(text_list) == 0) return("")
      # the list-column element may or may not wrap the page vector in a list
      pages <- if (is.list(text_list[[1]])) text_list[[1]] else text_list
      if (length(pages) == 0) return("")
      paste(pages, collapse = " ")
    })) %>%
    select(year, body, full_text)

  # --- STRICT MATCHING (exact year only) ---
  # many-to-many: one act-year matches every document from that year
  known_acts_strict <- acts_by_year %>%
    left_join(docs_with_text, by = c("expected_year" = "year"),
              relationship = "many-to-many") %>%
    filter(!is.na(full_text))

  # An act counts as found when ANY same-year document contains its name
  # (case-insensitive literal match, not a regex)
  act_validation_strict <- known_acts_strict %>%
    mutate(act_name_found = str_detect(full_text, fixed(act_name, ignore_case = TRUE))) %>%
    group_by(act_name, expected_year) %>%
    summarize(n_docs = n(), found_in_any = any(act_name_found), .groups = "drop")

  strict_recall <- mean(act_validation_strict$found_in_any, na.rm = TRUE)

  # --- EXPANDED MATCHING (year to year+2) ---
  # This accounts for the retrospective nature of Economic Reports
  # NOTE(review): cross_join() pairs every act with every document before
  # filtering -- O(acts x docs) in memory; acceptable at this corpus size
  known_acts_expanded <- acts_by_year %>%
    cross_join(docs_with_text) %>%
    filter(year >= expected_year & year <= expected_year + 2) %>%
    filter(!is.na(full_text))

  act_validation_expanded <- known_acts_expanded %>%
    mutate(act_name_found = str_detect(full_text, fixed(act_name, ignore_case = TRUE))) %>%
    group_by(act_name, expected_year) %>%
    summarize(
      n_docs = n(),
      found_in_any = any(act_name_found),
      years_checked = paste(sort(unique(year)), collapse = ", "),
      .groups = "drop"
    )

  expanded_recall <- mean(act_validation_expanded$found_in_any, na.rm = TRUE)

  # Use expanded recall as the primary metric
  act_name_recall <- expanded_recall
  act_name_validation <- act_validation_expanded

  # Determine status based on expanded recall
  test_iii_status <- case_when(
    expanded_recall >= 0.85 ~ "PASS",
    expanded_recall >= 0.75 ~ "WARN",
    TRUE ~ "FAIL"
  )

  # Strict recall is reported for reference only and never gates the run
  test_results$test_iii_strict <- list(
    metric = "Act recall (exact year)",
    value = sprintf("%.1f%%", strict_recall * 100),
    target = "Reference only",
    status = "INFO"
  )

  test_results$test_iii_acts <- list(
    metric = "Act recall (year to year+2)",
    value = sprintf("%.1f%%", expanded_recall * 100),
    target = ">=85%",
    status = test_iii_status
  )
} else {
  # No ground-truth labels: mark the test SKIP but still define the recall
  # variables so downstream chunks can reference them without erroring
  test_iii_status <- "SKIP"
  act_name_recall <- NA
  strict_recall <- NA
  expanded_recall <- NA

  test_results$test_iii_acts <- list(
    metric = "Known act validation",
    value = "N/A",
    target = "N/A",
    status = "SKIP"
  )
}

Results

Show code
# Render Test (iii) results: recall comparison, missing acts, found acts,
# and detection rate by decade.
# BUG FIX: only the FINAL expression of an if() branch is auto-printed by
# knitr; the two intermediate kable() tables were previously discarded
# (the rendered report showed their headers but no tables). Wrap them in
# print() so they actually render.
if (has_labels) {
  cat("\n### Recall Comparison\n\n")
  cat("| Matching Method | Acts Found | Recall | Notes |\n")
  cat("|-----------------|------------|--------|-------|\n")
  cat(sprintf("| Exact year only | %d/%d | %.1f%% | Too strict - misses retrospective mentions |\n",
              sum(act_validation_strict$found_in_any),
              nrow(act_validation_strict),
              strict_recall * 100))
  cat(sprintf("| **Year to Year+2** | **%d/%d** | **%.1f%%** | **Primary metric** - accounts for ERP lag |\n\n",
              sum(act_validation_expanded$found_in_any),
              nrow(act_validation_expanded),
              expanded_recall * 100))

  cat(sprintf("**Status:** %s (target: >=85%%)\n\n", test_iii_status))

  # Show acts NOT found (for investigation)
  missing_acts <- act_name_validation %>%
    filter(!found_in_any)

  if (nrow(missing_acts) > 0) {
    cat("\n### Acts Not Found in Expanded Window\n\n")
    cat("These acts were not found even when searching year to year+2:\n\n")
    # print() required: this table is not the branch's final value
    print(
      missing_acts %>%
        select(act_name, expected_year, years_checked, n_docs) %>%
        arrange(expected_year) %>%
        kable(caption = sprintf("Missing acts (%d total)", nrow(missing_acts))) %>%
        kable_styling(bootstrap_options = c("striped", "hover"))
    )

    cat("\n**Possible reasons:**\n\n")
    cat("- Non-standard act names (e.g., 'Public Law 89-800' vs formal name)\n")
    cat("- Acts referred to informally in documents\n")
    cat("- OCR issues in older documents (pre-1950)\n")
    cat("- Labels data quality (wrong expected_year)\n\n")
  }

  # Show found acts summary
  cat("\n### Act Validation Summary (Found Acts)\n\n")
  # print() required: this table is not the branch's final value
  print(
    act_name_validation %>%
      filter(found_in_any) %>%
      mutate(Status = "✓ Found") %>%
      arrange(expected_year) %>%
      head(20) %>%
      select(act_name, expected_year, years_checked, Status) %>%
      kable(caption = "Successfully validated acts (first 20)") %>%
      kable_styling(bootstrap_options = c("striped", "hover"))
  )

  # Show detection rate by decade (final expression -- auto-printed)
  cat("\n### Detection Rate by Decade\n\n")
  act_name_validation %>%
    mutate(decade = floor(expected_year / 10) * 10) %>%
    group_by(decade) %>%
    summarize(
      total_acts = n(),
      acts_found = sum(found_in_any),
      recall_rate = mean(found_in_any),
      .groups = "drop"
    ) %>%
    filter(!is.na(decade)) %>%
    mutate(
      decade_label = sprintf("%ds", decade),
      recall_rate = sprintf("%.0f%%", recall_rate * 100)
    ) %>%
    select(decade_label, total_acts, acts_found, recall_rate) %>%
    kable(caption = "Act Detection Rate by Decade",
          col.names = c("Decade", "Total Acts", "Found", "Recall")) %>%
    kable_styling(bootstrap_options = c("striped", "hover"))
} else {
  cat("\n⚠️ Skipping: No ground truth labels available for this country\n\n")
}

### Recall Comparison

| Matching Method | Acts Found | Recall | Notes |
|-----------------|------------|--------|-------|
| Exact year only | 21/55 | 38.2% | Too strict - misses retrospective mentions |
| **Year to Year+2** | **45/56** | **80.4%** | **Primary metric** - accounts for ERP lag |

**Status:** WARN (target: >=85%)


### Acts Not Found in Expanded Window

These acts were not found even when searching year to year+2:


**Possible reasons:**

- Non-standard act names (e.g., 'Public Law 89-800' vs formal name)
- Acts referred to informally in documents
- OCR issues in older documents (pre-1950)
- Labels data quality (wrong expected_year)


### Act Validation Summary (Found Acts)


### Detection Rate by Decade
Act Detection Rate by Decade
Decade Total Acts Found Recall
1940s 3 1 33%
1950s 11 11 100%
1960s 11 9 82%
1970s 9 8 89%
1980s 13 10 77%
1990s 3 1 33%
2000s 6 5 83%

Test (iv): Temporal & Source Coverage

Show code
# Test (iv): temporal coverage.
# Expected publication pattern:
#  - ERP and Budget: every year in [min_year, max_year], up to the current year
#  - Treasury: 1946-1980 and 2011-present (known gap 1981-2010)
current_year <- lubridate::year(Sys.Date())
coverage_years <- params$min_year:params$max_year

expected_coverage <- bind_rows(
  expand_grid(
    year = coverage_years,
    body = c("Economic Report of the President", "Budget of the United States Government")
  ) %>%
    mutate(expected = year <= current_year),
  expand_grid(
    year = coverage_years,
    body = "Annual Report of the Treasury"
  ) %>%
    mutate(expected = (year <= 1980 | year >= 2011) & year <= current_year)
)

# Observed: successfully extracted documents per year/body.
actual_coverage <- body_data %>%
  filter(n_pages > 0) %>%
  count(year, body, name = "n_docs")

# Compare expectation with reality.
coverage_analysis <- expected_coverage %>%
  left_join(actual_coverage, by = c("year", "body")) %>%
  mutate(
    n_docs = replace_na(n_docs, 0),
    status = case_when(
      !expected ~ "Not expected",
      n_docs > 0 ~ "Present",
      TRUE ~ "Missing"
    )
  )

# Expected-but-absent year/body combinations.
coverage_gaps <- coverage_analysis %>%
  filter(expected & n_docs == 0)

total_expected <- sum(coverage_analysis$expected)
total_present <- sum(coverage_analysis$expected & coverage_analysis$n_docs > 0)
coverage_rate <- total_present / total_expected

# Scalar condition: plain if/else instead of case_when().
test_iv_status <- if (coverage_rate >= 0.95) {
  "PASS"
} else if (coverage_rate >= 0.85) {
  "WARN"
} else {
  "FAIL"
}

test_results$test_iv <- list(
  metric = "Coverage rate",
  value = sprintf("%.1f%%", coverage_rate * 100),
  target = "≥95%",
  status = test_iv_status
)

Results

Coverage Rate: 96.0% (193 of 201 expected documents) Status: PASS

Show code
# Coverage heatmap. Statuses are encoded -1/0/1 via a lookup vector so
# scale_fill_gradient2() can color Missing/Not expected/Present.
status_levels <- c("Missing" = -1, "Not expected" = 0, "Present" = 1)

coverage_analysis %>%
  filter(year >= max(params$min_year, 1946)) %>%
  mutate(status_color = unname(status_levels[status])) %>%
  ggplot(aes(x = year, y = body, fill = status_color)) +
  geom_tile(color = "white", linewidth = 0.5) +
  scale_fill_gradient2(
    low = "red", mid = "grey90", high = "green",
    midpoint = 0,
    breaks = c(-1, 0, 1),
    labels = c("Missing", "Not expected", "Present"),
    name = "Status"
  ) +
  labs(
    title = "Document Coverage by Year and Type",
    x = "Year",
    y = "Document Type"
  ) +
  theme_minimal() +
  theme(
    axis.text.y = element_text(size = 8),
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)
  )

Show code
# Either confirm full coverage or list the missing expected documents.
if (nrow(coverage_gaps) == 0) {
  cat("\n✅ No coverage gaps! All expected documents are present.\n\n")
} else {
  cat("\n### Coverage Gaps\n\n")
  coverage_gaps %>%
    arrange(body, year) %>%
    select(year, body) %>%
    kable(caption = sprintf("Missing expected documents (%d gaps)", nrow(coverage_gaps))) %>%
    kable_styling(bootstrap_options = c("striped", "hover"))
}

### Coverage Gaps
Missing expected documents (8 gaps)
year body
1946 Economic Report of the President
1989 Economic Report of the President
1990 Economic Report of the President
1991 Economic Report of the President
1992 Economic Report of the President
1993 Economic Report of the President
1994 Economic Report of the President
1995 Economic Report of the President

Test (v): Text Quality Indicators

Show code
# Test (v): page-level text quality on a small random sample of documents.
# Compile the fiscal-terms regex once, outside the per-page loop.
fiscal_regex <- regex(paste(fiscal_terms, collapse = "|"), ignore_case = TRUE)

quality_metrics <- body_data %>%
  filter(n_pages > 0) %>%
  # slice_sample() replaces the superseded sample_n(); it also caps the
  # sample at the number of available rows automatically, so the explicit
  # min(5, n()) guard is no longer needed.
  slice_sample(n = 5) %>%  # Reduced sample size for performance
  select(year, body, package_id, source, text) %>%  # Keep identifiers
  mutate(
    page_metrics = map(text, function(text_list) {
      # Handle the nested structure: text is a list-column whose elements
      # may or may not wrap the page vector in another list
      if (is.null(text_list) || length(text_list) == 0) return(NULL)

      pages <- if (is.list(text_list[[1]])) text_list[[1]] else text_list
      if (length(pages) == 0) return(NULL)

      # Limit to first 50 pages per document to avoid memory issues
      pages <- head(pages, 50)

      tibble(
        page_num = seq_along(pages),
        page_text = as.character(pages),
        n_chars = nchar(page_text),
        # Simple whitespace-delimited word count (cheaper than quanteda)
        n_tokens = str_count(page_text, "\\S+"),
        special_char_rate = str_count(page_text, "[^a-zA-Z0-9\\s.,!?;:'-]") / pmax(n_chars, 1),
        whitespace_rate = str_count(page_text, "\\s") / pmax(n_chars, 1),
        non_ascii_rate = str_count(page_text, "[^\x01-\x7F]") / pmax(n_chars, 1),
        has_fiscal_terms = str_detect(page_text, fiscal_regex)
      )
    })
  ) %>%
  filter(!map_lgl(page_metrics, is.null)) %>%
  select(-text) %>%  # Remove text column before unnest
  unnest(page_metrics)

# Flag pages that look like extraction failures: near-empty, garbled, or
# (past the front matter) lacking any fiscal vocabulary.
suspicious_pages <- quality_metrics %>%
  filter(
    n_chars < 100 |
      special_char_rate > 0.1 |
      non_ascii_rate > 0.05 |
      (!has_fiscal_terms & page_num > 5)
  )

# Roll page metrics up to a heuristic per-document quality score in [0, 1].
doc_quality <- quality_metrics %>%
  group_by(year, body, package_id) %>%
  summarize(
    total_pages = n(),
    avg_tokens_per_page = mean(n_tokens, na.rm = TRUE),
    pct_fiscal_pages = mean(has_fiscal_terms, na.rm = TRUE),
    suspicious_pages_count = sum(
      n_chars < 100 | special_char_rate > 0.1 | non_ascii_rate > 0.05,
      na.rm = TRUE
    ),
    quality_score = (pmin(avg_tokens_per_page / 300, 1)) * pct_fiscal_pages *
                    (1 - suspicious_pages_count / total_pages),
    .groups = "drop"
  )

# Corpus-level quality rates.
pct_suspicious <- nrow(suspicious_pages) / nrow(quality_metrics)
pct_fiscal <- mean(quality_metrics$has_fiscal_terms, na.rm = TRUE)

# Scalar condition: plain if/else instead of case_when().
test_v_status <- if (pct_suspicious < 0.05 && pct_fiscal > 0.70) {
  "PASS"
} else if (pct_suspicious < 0.10 || pct_fiscal > 0.50) {
  "WARN"
} else {
  "FAIL"
}

# Both quality sub-metrics deliberately share the combined status above.
test_results$test_v_quality <- list(
  metric = "Text quality - suspicious pages",
  value = sprintf("%.1f%%", pct_suspicious * 100),
  target = "<5%",
  status = test_v_status
)

test_results$test_v_fiscal <- list(
  metric = "Text quality - fiscal pages",
  value = sprintf("%.1f%%", pct_fiscal * 100),
  target = ">70%",
  status = test_v_status
)

Results

Suspicious Pages: 24.6% Fiscal Term Coverage: 73.8% Status: WARN

Show code
# Histogram of per-page token counts across the sampled documents.
ggplot(quality_metrics, aes(x = n_tokens)) +
  geom_histogram(bins = 50, fill = "steelblue", alpha = 0.7) +
  labs(
    title = "Distribution of Tokens per Page",
    x = "Tokens per Page",
    y = "Count"
  ) +
  theme_minimal()

Show code
# Boxplots of per-page special-character rates, one box per source.
ggplot(quality_metrics, aes(x = source, y = special_char_rate)) +
  geom_boxplot(fill = "coral", alpha = 0.7) +
  labs(
    title = "Special Character Rate by Source",
    x = "Source",
    y = "Special Character Rate"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Show code
# Documents scoring below the 0.5 quality threshold, worst first.
low_quality_docs <- doc_quality %>%
  filter(quality_score < 0.5) %>%
  arrange(quality_score)

if (nrow(low_quality_docs) == 0) {
  cat("\n✅ No low-quality documents found!\n\n")
} else {
  cat("\n### Low Quality Documents\n\n")
  low_quality_docs %>%
    head(10) %>%
    select(year, body, total_pages, avg_tokens_per_page, pct_fiscal_pages, quality_score) %>%
    mutate(
      avg_tokens_per_page = round(avg_tokens_per_page, 0),
      pct_fiscal_pages = sprintf("%.1f%%", pct_fiscal_pages * 100),
      quality_score = round(quality_score, 2)
    ) %>%
    kable(caption = "Documents with quality_score < 0.5 (top 10)") %>%
    kable_styling(bootstrap_options = c("striped", "hover"))
}

### Low Quality Documents
Documents with quality_score < 0.5 (top 10)
year body total_pages avg_tokens_per_page pct_fiscal_pages quality_score
2021 Economic Report of the President 50 471 54.0% 0.36
1949 Economic Report of the President 50 313 50.0% 0.44

Test (vi): Anomaly Detection

Show code
# Test (vi): document-level anomaly flags.
doc_anomalies <- body_data %>%
  filter(n_pages > 0) %>%
  mutate(
    too_short = n_pages < 10,
    too_long = n_pages > 1000,
    # Concatenate the first (up to) five pages for title detection.
    first_pages = map_chr(text, function(pages) {
      if (length(pages) == 0) return("")
      paste(head(pages, 5), collapse = " ")
    }),
    has_title_indicators = str_detect(
      first_pages,
      regex("(report|budget|economic|president|treasury|united states)", ignore_case = TRUE)
    ),
    # Z-score of extraction time. BUG FIX: isTRUE() guards against sd()
    # returning NA (all-NA extraction_time), which would make the bare
    # `if (NA > 0)` throw an error.
    extraction_time_z = if (isTRUE(sd(extraction_time, na.rm = TRUE) > 0)) {
      scale(extraction_time)[, 1]
    } else {
      0
    },
    # Flag OCR documents that took >3 SDs longer than average.
    slow_extraction = ocr_used & extraction_time_z > 3
  )

# Detect likely duplicates: documents whose first five pages hash identically.
duplicate_check <- body_data %>%
  filter(n_pages > 0) %>%
  mutate(
    text_hash = map_chr(text, function(pages) {
      if (length(pages) == 0) return("")
      digest::digest(paste(head(pages, 5), collapse = ""))
    })
  ) %>%
  group_by(text_hash) %>%
  filter(n() > 1, text_hash != "") %>%
  ungroup() %>%
  select(year, body, package_id, text_hash)

# Year-over-year page totals per body, flagging drops of more than 50%.
year_trends <- body_data %>%
  filter(n_pages > 0) %>%
  group_by(year, body) %>%
  summarize(total_pages = sum(n_pages), n_docs = n(), .groups = "drop") %>%
  arrange(body, year) %>%
  group_by(body) %>%
  mutate(
    # Relative change vs the previous year within the same body.
    yoy_change = (total_pages - lag(total_pages)) / lag(total_pages),
    sudden_drop = !is.na(yoy_change) & yoy_change < -0.5
  ) %>%
  ungroup()

# Anomaly counts
n_too_short <- sum(doc_anomalies$too_short, na.rm = TRUE)
n_too_long <- sum(doc_anomalies$too_long, na.rm = TRUE)
n_no_title <- sum(!doc_anomalies$has_title_indicators, na.rm = TRUE)
n_duplicates <- nrow(duplicate_check)
n_sudden_drops <- sum(year_trends$sudden_drop, na.rm = TRUE)

# BUG FIX: the reported total previously omitted n_no_title even though the
# anomaly summary table lists it as one of the five anomaly classes.
test_results$test_vi <- list(
  metric = "Anomalies detected",
  value = sprintf(
    "%d issues",
    n_too_short + n_too_long + n_no_title + n_duplicates + n_sudden_drops
  ),
  target = "Manual review",
  status = "INFO"
)

Results

Status: INFO (anomalies flagged for manual review)

Show code
# Tabulate anomaly counts, one row per anomaly class.
anomaly_summary <- tibble(
  Anomaly = c(
    "Too short (< 10 pages)",
    "Too long (> 1000 pages)",
    "Missing title indicators",
    "Duplicate documents",
    "Sudden year drops (>50%)"
  ),
  Count = c(n_too_short, n_too_long, n_no_title, n_duplicates, n_sudden_drops)
)

anomaly_summary %>%
  kable(caption = "Anomaly Summary") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Anomaly Summary
Anomaly Count
Too short (< 10 pages) 64
Too long (> 1000 pages) 17
Missing title indicators 0
Duplicate documents 2
Sudden year drops (>50%) 7
Show code
# Table of individual documents flagged by any page-count/title anomaly.
if (n_too_short > 0 || n_too_long > 0 || n_no_title > 0) {
  cat("\n### Anomalous Documents\n\n")
  doc_anomalies %>%
    filter(too_short | too_long | !has_title_indicators) %>%
    arrange(desc(too_short), desc(too_long)) %>%
    select(year, body, package_id, n_pages, too_short, too_long, has_title_indicators) %>%
    head(10) %>%
    kable(caption = "Documents with anomalies (top 10)") %>%
    kable_styling(bootstrap_options = c("striped", "hover"))
}

### Anomalous Documents
Documents with anomalies (top 10)
year body package_id n_pages too_short too_long has_title_indicators
2006 Budget of the United States Government BUDGET-2006 5 TRUE FALSE TRUE
2006 Budget of the United States Government BUDGET-2006 9 TRUE FALSE TRUE
2006 Budget of the United States Government BUDGET-2006 9 TRUE FALSE TRUE
2006 Budget of the United States Government BUDGET-2006 7 TRUE FALSE TRUE
2006 Budget of the United States Government BUDGET-2006 9 TRUE FALSE TRUE
2006 Budget of the United States Government BUDGET-2006 8 TRUE FALSE TRUE
2006 Budget of the United States Government BUDGET-2006 7 TRUE FALSE TRUE
2006 Budget of the United States Government BUDGET-2006 8 TRUE FALSE TRUE
2006 Budget of the United States Government BUDGET-2006 7 TRUE FALSE TRUE
2007 Budget of the United States Government BUDGET-2007 3 TRUE FALSE TRUE
Show code
# Table of documents sharing an identical first-five-page hash.
if (n_duplicates > 0) {
  cat("\n### Duplicate Documents\n\n")
  dup_caption <- sprintf("Potential duplicates (%d documents)", n_duplicates)
  duplicate_check %>%
    kable(caption = dup_caption) %>%
    kable_styling(bootstrap_options = c("striped", "hover"))
}

### Duplicate Documents
Potential duplicates (2 documents)
year body package_id text_hash
2015 Annual Report of the Treasury AR_TREASURY-2015 25c71c0a85cb4a39edfdf7acdf4371b0
2016 Annual Report of the Treasury AR_TREASURY-2016 25c71c0a85cb4a39edfdf7acdf4371b0
Show code
# Total pages per year per body, with sudden drops highlighted in red.
drop_points <- filter(year_trends, sudden_drop)

ggplot(year_trends, aes(x = year, y = total_pages, color = body)) +
  geom_line(linewidth = 1) +
  geom_point(data = drop_points, color = "red", size = 3, shape = 1) +
  labs(
    title = "Total Pages by Year and Body",
    subtitle = "Red circles indicate sudden drops (>50%)",
    x = "Year",
    y = "Total Pages",
    color = "Document Type"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")

Summary Report & Pass/Fail Dashboard

Show code
# Compile the per-test result lists into one summary tibble (one row each).
test_summary <- bind_rows(
  imap(test_results, function(result, name) {
    tibble(
      test = name,
      metric = as.character(result$metric),
      value = as.character(result$value),
      target = as.character(result$target),
      status = as.character(result$status)
    )
  })
)

# Overall status is the worst actionable status (FAIL > WARN > PASS);
# INFO and SKIP never affect the outcome.
status_priority <- c("FAIL" = 3, "WARN" = 2, "PASS" = 1, "INFO" = 0, "SKIP" = 0)
actionable_statuses <- test_summary$status[!test_summary$status %in% c("SKIP", "INFO")]
overall_status <- names(which.max(status_priority[actionable_statuses]))

if (length(overall_status) == 0) overall_status <- "PASS"

Overall Status: WARN

Show code
# Dashboard table with color-coded status badges (lookup vector instead of
# case_when; coalesce() supplies the fallback color for unknown statuses).
status_colors <- c(
  PASS = "green", WARN = "orange", FAIL = "red",
  INFO = "blue", SKIP = "grey"
)

test_summary %>%
  mutate(
    Status = cell_spec(
      status,
      color = coalesce(unname(status_colors[status]), "black"),
      bold = TRUE
    )
  ) %>%
  select(-status) %>%
  kable(escape = FALSE, caption = "Verification Test Results") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
Verification Test Results
test metric value target Status
test_i URL resolution success rate 97.1% ≥95% <span style=" font-weight: bold; color: green !important;" >PASS</span>
test_ii Boundary documents valid 10/10 valid All ≥10 pages <span style=" font-weight: bold; color: green !important;" >PASS</span>
test_iii_strict Act recall (exact year) 38.2% Reference only <span style=" font-weight: bold; color: blue !important;" >INFO</span>
test_iii_acts Act recall (year to year+2) 80.4% >=85% <span style=" font-weight: bold; color: orange !important;" >WARN</span>
test_iv Coverage rate 96.0% ≥95% <span style=" font-weight: bold; color: green !important;" >PASS</span>
test_v_quality Text quality - suspicious pages 24.6% <5% <span style=" font-weight: bold; color: orange !important;" >WARN</span>
test_v_fiscal Text quality - fiscal pages 73.8% >70% <span style=" font-weight: bold; color: orange !important;" >WARN</span>
test_vi Anomalies detected 90 issues Manual review <span style=" font-weight: bold; color: blue !important;" >INFO</span>

Recommendations

Show code
# Emit a recommendation block keyed to the overall verification status.
if (overall_status == "PASS") {
  cat("\n✅ **READY FOR LLM PROCESSING**\n\n")
  cat("All verification tests passed. Proceed with:\n\n")
  cat("- `tar_make(chunks)` to create LLM-ready chunks\n")
  cat("- Days 2-3: Training data preparation\n")
  cat("- LLM-based fiscal shock identification\n\n")
} else if (overall_status == "WARN") {
  cat("\n⚠️ **PROCEED WITH CAUTION**\n\n")
  cat("Some tests raised warnings. Review flagged issues before full run:\n\n")

  warned_metrics <- test_summary$metric[test_summary$status == "WARN"]
  cat(paste("-", warned_metrics), sep = "\n")

  cat("\n\nConsider re-running extractions for problematic documents.\n\n")
} else {
  cat("\n❌ **MANUAL REVIEW REQUIRED**\n\n")
  cat("Critical issues detected. Address the following before proceeding:\n\n")

  failed_metrics <- test_summary$metric[test_summary$status == "FAIL"]
  cat(paste("-", failed_metrics), sep = "\n")

  cat("\n\nReview extraction settings and re-run failed documents.\n\n")
}

⚠️ **PROCEED WITH CAUTION**

Some tests raised warnings. Review flagged issues before full run:

- Act recall (year to year+2)
- Text quality - suspicious pages
- Text quality - fiscal pages


Consider re-running extractions for problematic documents.

Next Steps

Show code
# Header for the verification-driven next-steps list.
cat("\n**Based on this verification:**\n\n")

**Based on this verification:**
Show code
# Count of all rows in body_data (every attempted extraction).
cat(sprintf("- Total documents verified: %d\n", nrow(body_data)))
- Total documents verified: 313
Show code
# Documents with at least one extracted page, as count and percentage.
cat(sprintf("- Successful extractions: %d (%.1f%%)\n",
            sum(body_data$n_pages > 0),
            mean(body_data$n_pages > 0) * 100))
- Successful extractions: 304 (97.1%)
Show code
# Grand total of extracted pages, comma-formatted for readability.
cat(sprintf("- Total pages extracted: %s\n",
            scales::comma(sum(body_data$n_pages))))
- Total pages extracted: 97,475
Show code
# Number of documents whose text came from OCR rather than embedded text.
cat(sprintf("- Documents using OCR: %d\n", sum(body_data$ocr_used, na.rm = TRUE)))
- Documents using OCR: 64
Show code
# Final verdict computed in the summary-dashboard chunk above.
cat(sprintf("- Overall status: **%s**\n\n", overall_status))
- Overall status: **WARN**
Show code
# Header for the recommended-actions list that follows.
cat("**Recommended actions:**\n\n")
**Recommended actions:**
Show code
# Pick the action list that matches the overall status, then print it.
next_actions <- if (overall_status == "PASS") {
  c(
    "1. Proceed to Days 2-3 implementation (training data preparation)\n",
    "2. Run `tar_make(chunks)` to create document chunks\n",
    "3. Begin Model A development (act detection)\n"
  )
} else {
  c(
    "1. Review failed/warned tests above\n",
    "2. Address extraction issues for flagged documents\n",
    "3. Re-run verification before proceeding\n"
  )
}
cat(next_actions, sep = "")
1. Review failed/warned tests above
2. Address extraction issues for flagged documents
3. Re-run verification before proceeding