Document Extraction Verification: US

Data completeness and parsing quality checks

Published

January 22, 2026

Setup

Show code
# Setup: attach packages and anchor the project root / targets store.
library(tidyverse)
library(targets)
library(here)
library(quanteda)
library(kableExtra)
library(digest)
# Consistency fix: load quarto the same way as every other dependency.
# pacman::p_load() would silently install a missing package, unlike library().
library(quarto)

here::i_am("notebooks/verify_body.qmd")
tar_config_set(store = here("_targets"))


# Load data - tar_read_raw accepts character strings
body_data <- tar_read_raw(params$body_target)

# Load labels if available. Assign the tryCatch() result directly rather than
# mutating `has_labels` from inside the error handler with `<<-`, which is
# fragile (it silently creates a global if the variable does not already
# exist in an enclosing environment).
labels_data <- NULL
if (!is.null(params$labels_target) && nzchar(params$labels_target)) {
  labels_data <- tryCatch(
    tar_read_raw(params$labels_target),
    error = function(e) {
      message("Labels target not found - skipping known act validation")
      NULL
    }
  )
}
# NOTE(review): assumes a successfully-read labels target is never NULL.
has_labels <- !is.null(labels_data)

# Fiscal vocabulary
fiscal_terms <- params$fiscal_vocab

# Test results storage
test_results <- list()

Overview Statistics

Show code
# Corpus-level counts: documents, pages, year span, sources, and OCR usage.
overview_stats <- summarize(
  body_data,
  total_documents = n(),
  successful_extractions = sum(n_pages > 0),
  total_pages = sum(n_pages),
  years_covered = n_distinct(year),
  year_range = sprintf("%d-%d", min(year), max(year)),
  sources_used = n_distinct(source),
  document_types = n_distinct(body),
  ocr_documents = sum(ocr_used, na.rm = TRUE)
)

# Render as a long two-column (Metric, Value) table.
overview_stats %>%
  mutate(across(everything(), as.character)) %>%
  pivot_longer(everything(), names_to = "Metric", values_to = "Value") %>%
  mutate(Metric = str_to_title(str_replace_all(Metric, "_", " "))) %>%
  kable(caption = "Extraction Overview") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Extraction Overview
Metric Value
Total Documents 313
Successful Extractions 304
Total Pages 97475
Years Covered 77
Year Range 1946-2022
Sources Used 3
Document Types 3
Ocr Documents 64

Page Distribution by Source and Body

Show code
# Stacked columns of extracted pages per year, colored by document type,
# with one panel per source.
body_data %>%
  filter(n_pages > 0) %>%
  ggplot(aes(x = year, y = n_pages, fill = body)) +
  geom_col() +
  facet_wrap(vars(source), ncol = 1) +
  labs(
    title = "Pages Extracted by Year, Source, and Body",
    x = "Year",
    y = "Number of Pages",
    fill = "Document Type"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")

Show code
# Histogram of per-document page counts, one panel per document type
# (free y scales because the types have very different corpus sizes).
body_data %>%
  filter(n_pages > 0) %>%
  ggplot(aes(x = n_pages, fill = body)) +
  geom_histogram(bins = 50, alpha = 0.7) +
  facet_wrap(vars(body), ncol = 1, scales = "free_y") +
  labs(
    title = "Distribution of Page Counts by Document Type",
    x = "Number of Pages",
    y = "Count"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

Test (i): PDF URL Resolution & Page Count Validation

Show code
# Test (i): did every PDF URL resolve and parse into at least one page?

# Documents whose extraction produced zero pages.
failed_extractions <- body_data %>%
  filter(n_pages == 0) %>%
  select(year, body, source, pdf_url, n_pages)

# Per source/body extraction outcomes.
url_resolution_summary <- body_data %>%
  group_by(source, body) %>%
  summarize(
    total_docs = n(),
    successful = sum(n_pages > 0),
    failed = sum(n_pages == 0),
    success_rate = mean(n_pages > 0),
    ocr_docs = sum(ocr_used, na.rm = TRUE),
    ocr_rate = mean(ocr_used, na.rm = TRUE),
    .groups = "drop"
  )

# Share of documents that yielded at least one page.
overall_success_rate <- mean(body_data$n_pages > 0)

# Scalar condition, so plain if/else rather than case_when().
test_i_status <- if (overall_success_rate >= 0.95) {
  "PASS"
} else if (overall_success_rate >= 0.85) {
  "WARN"
} else {
  "FAIL"
}

test_results$test_i <- list(
  metric = "URL resolution success rate",
  value = sprintf("%.1f%%", overall_success_rate * 100),
  target = "≥95%",
  status = test_i_status
)

Results

Overall Success Rate: 97.1% Status: PASS

Show code
# Either celebrate a clean run or list every failed extraction.
if (nrow(failed_extractions) == 0) {
  cat("\n✅ All PDFs successfully extracted!\n\n")
} else {
  cat("\n### Failed Extractions\n\n")
  failed_extractions %>%
    kable(caption = sprintf("Failed PDF extractions (%d documents)", nrow(failed_extractions))) %>%
    kable_styling(bootstrap_options = c("striped", "hover"))
}

### Failed Extractions
Failed PDF extractions (9 documents)
year body source pdf_url n_pages
1991 Budget of the United States Government fraser.stlouisfed.org https://fraser.stlouisfed.org/files/docs/publications/usbudget/bus_1991.pdf 0
1992 Budget of the United States Government fraser.stlouisfed.org https://fraser.stlouisfed.org/files/docs/publications/usbudget/bus_1992.pdf 0
1993 Budget of the United States Government fraser.stlouisfed.org https://fraser.stlouisfed.org/files/docs/publications/usbudget/bus_1993.pdf 0
1994 Budget of the United States Government fraser.stlouisfed.org https://fraser.stlouisfed.org/files/docs/publications/usbudget/bus_1994.pdf 0
2005 Budget of the United States Government fraser.stlouisfed.org https://fraser.stlouisfed.org/files/docs/publications/usbudget/BUDGET-2005-BUD.pdf 0
2006 Budget of the United States Government fraser.stlouisfed.org https://fraser.stlouisfed.org/files/docs/publications/usbudget/BUDGET-2006-BUD.pdf 0
2007 Budget of the United States Government fraser.stlouisfed.org https://fraser.stlouisfed.org/files/docs/publications/usbudget/BUDGET-2007-BUD.pdf 0
2008 Budget of the United States Government fraser.stlouisfed.org https://fraser.stlouisfed.org/files/docs/publications/usbudget/BUDGET-2008-BUD.pdf 0
2009 Budget of the United States Government fraser.stlouisfed.org https://fraser.stlouisfed.org/files/docs/publications/usbudget/BUDGET-2009-BUD.pdf 0
Show code
# Per source/body success rates, formatted as percentages for display.
url_resolution_summary %>%
  mutate(success_rate = sprintf("%.1f%%", 100 * success_rate)) %>%
  kable(caption = "Success Rate by Source and Body") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Success Rate by Source and Body
source body total_docs successful failed success_rate ocr_docs ocr_rate
fraser.stlouisfed.org Annual Report of the Treasury 35 35 0 100.0% 7 0.2000000
fraser.stlouisfed.org Budget of the United States Government 191 182 9 95.3% 0 0.0000000
fraser.stlouisfed.org Economic Report of the President 48 48 0 100.0% 40 0.8333333
govinfo.gov Economic Report of the President 27 27 0 100.0% 17 0.6296296
home.treasury.gov Annual Report of the Treasury 12 12 0 100.0% 0 0.0000000
Show code
# Stacked counts of OCR vs non-OCR documents per source.
ocr_counts <- body_data %>%
  filter(n_pages > 0) %>%
  count(source, ocr_used)

ggplot(ocr_counts, aes(x = source, y = n, fill = ocr_used)) +
  geom_col(position = "stack") +
  labs(
    title = "OCR Usage by Source",
    x = "Source",
    y = "Number of Documents",
    fill = "OCR Used"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Test (ii): Boundary Document Verification

Show code
# Earliest and latest successfully-extracted document per source/body pair.
extracted <- filter(body_data, n_pages > 0)

boundary_docs <- bind_rows(
  extracted %>%
    group_by(source, body) %>%
    slice_min(year, n = 1, with_ties = FALSE) %>%
    mutate(boundary_type = "Earliest"),
  extracted %>%
    group_by(source, body) %>%
    slice_max(year, n = 1, with_ties = FALSE) %>%
    mutate(boundary_type = "Latest")
) %>%
  ungroup()


# Pull a representative first / middle / last page of text from a document's
# pages. `text_list` may be a plain character vector of pages or a list
# wrapping such a vector (a list-column element), so flatten defensively.
# Returns a list(first, middle, last); entries are NA when unavailable.
extract_sample_pages <- function(text_list) {
  no_sample <- list(first = NA, middle = NA, last = NA)
  if (is.null(text_list) || length(text_list) == 0) {
    return(no_sample)
  }

  # BUG FIX: the original took text_list[[1]] unconditionally, which grabs
  # only the *first page string* when text_list is a plain character vector,
  # so n was always 1 and middle/last were always NA (the rendered report
  # showed "(No text available)" for every 700+ page document).
  pages <- if (is.list(text_list)) {
    unlist(text_list, use.names = FALSE)
  } else {
    text_list
  }
  pages <- as.character(pages)
  n <- length(pages)

  if (n == 0) {
    return(no_sample)
  }

  list(
    first = pages[1],
    middle = if (n > 1) pages[ceiling(n / 2)] else NA,
    last = if (n > 1) pages[n] else NA
  )
}

# Attach first/middle/last page samples to each boundary document.
boundary_docs <- boundary_docs %>%
  mutate(sample_pages = map(text, extract_sample_pages))

# A boundary document is considered valid when it has at least 10 pages.
boundary_valid <- all(boundary_docs$n_pages >= 10, na.rm = TRUE)

# Scalar condition: plain if/else instead of case_when().
test_ii_status <- if (boundary_valid) {
  "PASS"
} else if (any(boundary_docs$n_pages < 10 & boundary_docs$n_pages > 0)) {
  "WARN"
} else {
  "FAIL"
}

test_results$test_ii <- list(
  metric = "Boundary documents valid",
  value = sprintf("%d/%d valid", sum(boundary_docs$n_pages >= 10), nrow(boundary_docs)),
  target = "All ≥10 pages",
  status = test_ii_status
)

Results

Status: PASS

Show code
# Tabulate the boundary documents, sorted for easy scanning.
boundary_docs %>%
  arrange(body, source, boundary_type) %>%
  select(body, source, year, boundary_type, n_pages, ocr_used) %>%
  kable(caption = "Boundary Documents") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Boundary Documents
body source year boundary_type n_pages ocr_used
Annual Report of the Treasury fraser.stlouisfed.org 1946 Earliest 723 FALSE
Annual Report of the Treasury fraser.stlouisfed.org 1980 Latest 654 TRUE
Annual Report of the Treasury home.treasury.gov 2011 Earliest 192 FALSE
Annual Report of the Treasury home.treasury.gov 2022 Latest 118 FALSE
Budget of the United States Government fraser.stlouisfed.org 1946 Earliest 971 FALSE
Budget of the United States Government fraser.stlouisfed.org 2022 Latest 72 FALSE
Economic Report of the President fraser.stlouisfed.org 1947 Earliest 69 TRUE
Economic Report of the President fraser.stlouisfed.org 1988 Latest 384 TRUE
Economic Report of the President govinfo.gov 1996 Earliest 360 FALSE
Economic Report of the President govinfo.gov 2022 Latest 432 TRUE

Sample Pages from Boundary Documents

Show code
# Print a (possibly truncated) text excerpt as a fenced markdown code block.
# `text` is expected to be a single character string or NA; `label` is the
# heading; `max_chars` caps the displayed length.
display_sample_text <- function(text, label, max_chars = 1000) {
  # BUG FIX: is.null()/length checks must precede is.na() -- is.na(NULL)
  # returns logical(0), and a zero-length value inside `||` is an error.
  if (is.null(text) || length(text) == 0 || is.na(text) || nchar(text) == 0) {
    cat(sprintf("\n**%s:** (No text available)\n\n", label))
    return(invisible(NULL))
  }

  truncated <- str_trunc(text, max_chars)
  cat(sprintf("\n**%s:**\n\n", label))
  cat("```text\n")
  cat(truncated)
  cat("\n```\n\n")

  # Tell the reader when the excerpt was cut short.
  if (nchar(text) > max_chars) {
    cat(sprintf("*(Truncated: showing %d of %d characters)*\n\n", max_chars, nchar(text)))
  }
}

# Print a middle-page excerpt for up to three boundary documents.
n_show <- min(3, nrow(boundary_docs))
for (idx in seq_len(n_show)) {
  doc <- boundary_docs[idx, ]
  cat(sprintf("\n### %s - %s %d (%s)\n\n",
              doc$body, doc$boundary_type, doc$year, doc$source))

  display_sample_text(doc$sample_pages[[1]]$middle, "Middle Page", 800)
}

### Annual Report of the Treasury - Earliest 1946 (fraser.stlouisfed.org)


**Middle Page:** (No text available)


### Budget of the United States Government - Earliest 1946 (fraser.stlouisfed.org)


**Middle Page:** (No text available)


### Economic Report of the President - Earliest 1947 (fraser.stlouisfed.org)


**Middle Page:** (No text available)

Test (iii): Known Act Validation

Note — Important Finding: Year Lag in Economic Reports

The Economic Report of the President (ERP) discusses legislation retrospectively - acts passed in year N are typically discussed in the year N+1 or N+2 ERP. For example:

  • Economic Recovery Tax Act of 1981 → Found in 1982-1990 ERPs (not 1981)
  • Omnibus Budget Reconciliation Act of 1990 → Found in 1991-1994 ERPs (not 1990)
  • Tax Reform Act of 1986 → Found in 1987-1990 ERPs (not 1986)

This is expected behavior: ERPs review the previous year’s economic events and policy changes. Therefore, we use an expanded year window (year to year+2) when validating act detection.

Show code
# Test (iii): can known legislative acts be found in the document text?
# Recall is computed two ways: strict (document from the exact enactment
# year) and expanded (year..year+2, to accommodate the retrospective
# coverage of Economic Reports). The expanded figure is the gating metric.
if (has_labels) {
  # Prepare labels data with expected year
  labels_prepared <- labels_data %>%
    mutate(expected_year = year(date))

  # Get unique acts with their years (filter NA years)
  acts_by_year <- labels_prepared %>%
    distinct(act_name, expected_year) %>%
    filter(!is.na(expected_year))

  # Prepare document text: collapse each document's pages into one string
  # for whole-document substring search
  docs_with_text <- body_data %>%
    filter(n_pages > 0) %>%
    mutate(full_text = map_chr(.data$text, function(text_list) {
      if (is.null(text_list) || length(text_list) == 0) return("")
      # the list-column element may or may not wrap the page vector in a list
      pages <- if (is.list(text_list[[1]])) text_list[[1]] else text_list
      if (length(pages) == 0) return("")
      paste(pages, collapse = " ")
    })) %>%
    select(year, body, full_text)

  # --- STRICT MATCHING (exact year only) ---
  # many-to-many: one act-year matches every document from that year
  known_acts_strict <- acts_by_year %>%
    left_join(docs_with_text, by = c("expected_year" = "year"),
              relationship = "many-to-many") %>%
    filter(!is.na(full_text))

  # An act counts as found when ANY same-year document contains its name
  # (case-insensitive literal match, not a regex)
  act_validation_strict <- known_acts_strict %>%
    mutate(act_name_found = str_detect(full_text, fixed(act_name, ignore_case = TRUE))) %>%
    group_by(act_name, expected_year) %>%
    summarize(n_docs = n(), found_in_any = any(act_name_found), .groups = "drop")

  strict_recall <- mean(act_validation_strict$found_in_any, na.rm = TRUE)

  # --- EXPANDED MATCHING (year to year+2) ---
  # This accounts for the retrospective nature of Economic Reports
  # NOTE(review): cross_join() pairs every act with every document before
  # filtering -- O(acts x docs) in memory; acceptable at this corpus size
  known_acts_expanded <- acts_by_year %>%
    cross_join(docs_with_text) %>%
    filter(year >= expected_year & year <= expected_year + 2) %>%
    filter(!is.na(full_text))

  act_validation_expanded <- known_acts_expanded %>%
    mutate(act_name_found = str_detect(full_text, fixed(act_name, ignore_case = TRUE))) %>%
    group_by(act_name, expected_year) %>%
    summarize(
      n_docs = n(),
      found_in_any = any(act_name_found),
      years_checked = paste(sort(unique(year)), collapse = ", "),
      .groups = "drop"
    )

  expanded_recall <- mean(act_validation_expanded$found_in_any, na.rm = TRUE)

  # Use expanded recall as the primary metric
  act_name_recall <- expanded_recall
  act_name_validation <- act_validation_expanded

  # Determine status based on expanded recall
  test_iii_status <- case_when(
    expanded_recall >= 0.85 ~ "PASS",
    expanded_recall >= 0.75 ~ "WARN",
    TRUE ~ "FAIL"
  )

  # Strict recall is reported for reference only and never gates the run
  test_results$test_iii_strict <- list(
    metric = "Act recall (exact year)",
    value = sprintf("%.1f%%", strict_recall * 100),
    target = "Reference only",
    status = "INFO"
  )

  test_results$test_iii_acts <- list(
    metric = "Act recall (year to year+2)",
    value = sprintf("%.1f%%", expanded_recall * 100),
    target = ">=85%",
    status = test_iii_status
  )
} else {
  # No ground-truth labels: mark the test SKIP but still define the recall
  # variables so downstream chunks can reference them without erroring
  test_iii_status <- "SKIP"
  act_name_recall <- NA
  strict_recall <- NA
  expanded_recall <- NA

  test_results$test_iii_acts <- list(
    metric = "Known act validation",
    value = "N/A",
    target = "N/A",
    status = "SKIP"
  )
}

Results

Show code
# Render Test (iii) results: recall comparison, missing acts, found acts,
# and detection rate by decade.
# BUG FIX: only the FINAL expression of an if() branch is auto-printed by
# knitr; the two intermediate kable() tables were previously discarded
# (the rendered report showed their headers but no tables). Wrap them in
# print() so they actually render.
if (has_labels) {
  cat("\n### Recall Comparison\n\n")
  cat("| Matching Method | Acts Found | Recall | Notes |\n")
  cat("|-----------------|------------|--------|-------|\n")
  cat(sprintf("| Exact year only | %d/%d | %.1f%% | Too strict - misses retrospective mentions |\n",
              sum(act_validation_strict$found_in_any),
              nrow(act_validation_strict),
              strict_recall * 100))
  cat(sprintf("| **Year to Year+2** | **%d/%d** | **%.1f%%** | **Primary metric** - accounts for ERP lag |\n\n",
              sum(act_validation_expanded$found_in_any),
              nrow(act_validation_expanded),
              expanded_recall * 100))

  cat(sprintf("**Status:** %s (target: >=85%%)\n\n", test_iii_status))

  # Show acts NOT found (for investigation)
  missing_acts <- act_name_validation %>%
    filter(!found_in_any)

  if (nrow(missing_acts) > 0) {
    cat("\n### Acts Not Found in Expanded Window\n\n")
    cat("These acts were not found even when searching year to year+2:\n\n")
    # print() required: this table is not the branch's final value
    print(
      missing_acts %>%
        select(act_name, expected_year, years_checked, n_docs) %>%
        arrange(expected_year) %>%
        kable(caption = sprintf("Missing acts (%d total)", nrow(missing_acts))) %>%
        kable_styling(bootstrap_options = c("striped", "hover"))
    )

    cat("\n**Possible reasons:**\n\n")
    cat("- Non-standard act names (e.g., 'Public Law 89-800' vs formal name)\n")
    cat("- Acts referred to informally in documents\n")
    cat("- OCR issues in older documents (pre-1950)\n")
    cat("- Labels data quality (wrong expected_year)\n\n")
  }

  # Show found acts summary
  cat("\n### Act Validation Summary (Found Acts)\n\n")
  # print() required: this table is not the branch's final value
  print(
    act_name_validation %>%
      filter(found_in_any) %>%
      mutate(Status = "✓ Found") %>%
      arrange(expected_year) %>%
      head(20) %>%
      select(act_name, expected_year, years_checked, Status) %>%
      kable(caption = "Successfully validated acts (first 20)") %>%
      kable_styling(bootstrap_options = c("striped", "hover"))
  )

  # Show detection rate by decade (final expression -- auto-printed)
  cat("\n### Detection Rate by Decade\n\n")
  act_name_validation %>%
    mutate(decade = floor(expected_year / 10) * 10) %>%
    group_by(decade) %>%
    summarize(
      total_acts = n(),
      acts_found = sum(found_in_any),
      recall_rate = mean(found_in_any),
      .groups = "drop"
    ) %>%
    filter(!is.na(decade)) %>%
    mutate(
      decade_label = sprintf("%ds", decade),
      recall_rate = sprintf("%.0f%%", recall_rate * 100)
    ) %>%
    select(decade_label, total_acts, acts_found, recall_rate) %>%
    kable(caption = "Act Detection Rate by Decade",
          col.names = c("Decade", "Total Acts", "Found", "Recall")) %>%
    kable_styling(bootstrap_options = c("striped", "hover"))
} else {
  cat("\n⚠️ Skipping: No ground truth labels available for this country\n\n")
}

### Recall Comparison

| Matching Method | Acts Found | Recall | Notes |
|-----------------|------------|--------|-------|
| Exact year only | 21/55 | 38.2% | Too strict - misses retrospective mentions |
| **Year to Year+2** | **45/56** | **80.4%** | **Primary metric** - accounts for ERP lag |

**Status:** WARN (target: >=85%)


### Acts Not Found in Expanded Window

These acts were not found even when searching year to year+2:


**Possible reasons:**

- Non-standard act names (e.g., 'Public Law 89-800' vs formal name)
- Acts referred to informally in documents
- OCR issues in older documents (pre-1950)
- Labels data quality (wrong expected_year)


### Act Validation Summary (Found Acts)


### Detection Rate by Decade
Act Detection Rate by Decade
Decade Total Acts Found Recall
1940s 3 1 33%
1950s 11 11 100%
1960s 11 9 82%
1970s 9 8 89%
1980s 13 10 77%
1990s 3 1 33%
2000s 6 5 83%

Test (iv): Temporal & Source Coverage

Show code
# Test (iv): temporal coverage.
# Expected publication pattern:
#  - ERP and Budget: every year in [min_year, max_year], up to the current year
#  - Treasury: 1946-1980 and 2011-present (known gap 1981-2010)
current_year <- lubridate::year(Sys.Date())
coverage_years <- params$min_year:params$max_year

expected_coverage <- bind_rows(
  expand_grid(
    year = coverage_years,
    body = c("Economic Report of the President", "Budget of the United States Government")
  ) %>%
    mutate(expected = year <= current_year),
  expand_grid(
    year = coverage_years,
    body = "Annual Report of the Treasury"
  ) %>%
    mutate(expected = (year <= 1980 | year >= 2011) & year <= current_year)
)

# Observed: successfully extracted documents per year/body.
actual_coverage <- body_data %>%
  filter(n_pages > 0) %>%
  count(year, body, name = "n_docs")

# Compare expectation with reality.
coverage_analysis <- expected_coverage %>%
  left_join(actual_coverage, by = c("year", "body")) %>%
  mutate(
    n_docs = replace_na(n_docs, 0),
    status = case_when(
      !expected ~ "Not expected",
      n_docs > 0 ~ "Present",
      TRUE ~ "Missing"
    )
  )

# Expected-but-absent year/body combinations.
coverage_gaps <- coverage_analysis %>%
  filter(expected & n_docs == 0)

total_expected <- sum(coverage_analysis$expected)
total_present <- sum(coverage_analysis$expected & coverage_analysis$n_docs > 0)
coverage_rate <- total_present / total_expected

# Scalar condition: plain if/else instead of case_when().
test_iv_status <- if (coverage_rate >= 0.95) {
  "PASS"
} else if (coverage_rate >= 0.85) {
  "WARN"
} else {
  "FAIL"
}

test_results$test_iv <- list(
  metric = "Coverage rate",
  value = sprintf("%.1f%%", coverage_rate * 100),
  target = "≥95%",
  status = test_iv_status
)

Results

Coverage Rate: 96.0% (193 of 201 expected documents) Status: PASS

Show code
# Coverage heatmap. Statuses are encoded -1/0/1 via a lookup vector so
# scale_fill_gradient2() can color Missing/Not expected/Present.
status_levels <- c("Missing" = -1, "Not expected" = 0, "Present" = 1)

coverage_analysis %>%
  filter(year >= max(params$min_year, 1946)) %>%
  mutate(status_color = unname(status_levels[status])) %>%
  ggplot(aes(x = year, y = body, fill = status_color)) +
  geom_tile(color = "white", linewidth = 0.5) +
  scale_fill_gradient2(
    low = "red", mid = "grey90", high = "green",
    midpoint = 0,
    breaks = c(-1, 0, 1),
    labels = c("Missing", "Not expected", "Present"),
    name = "Status"
  ) +
  labs(
    title = "Document Coverage by Year and Type",
    x = "Year",
    y = "Document Type"
  ) +
  theme_minimal() +
  theme(
    axis.text.y = element_text(size = 8),
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)
  )

Show code
# Either confirm full coverage or list the missing expected documents.
if (nrow(coverage_gaps) == 0) {
  cat("\n✅ No coverage gaps! All expected documents are present.\n\n")
} else {
  cat("\n### Coverage Gaps\n\n")
  coverage_gaps %>%
    arrange(body, year) %>%
    select(year, body) %>%
    kable(caption = sprintf("Missing expected documents (%d gaps)", nrow(coverage_gaps))) %>%
    kable_styling(bootstrap_options = c("striped", "hover"))
}

### Coverage Gaps
Missing expected documents (8 gaps)
year body
1946 Economic Report of the President
1989 Economic Report of the President
1990 Economic Report of the President
1991 Economic Report of the President
1992 Economic Report of the President
1993 Economic Report of the President
1994 Economic Report of the President
1995 Economic Report of the President

Test (v): Text Quality Indicators

Show code
# Test (v): page-level text quality on a small random sample of documents.
# Compile the fiscal-terms regex once, outside the per-page loop.
fiscal_regex <- regex(paste(fiscal_terms, collapse = "|"), ignore_case = TRUE)

quality_metrics <- body_data %>%
  filter(n_pages > 0) %>%
  # slice_sample() replaces the superseded sample_n(); it also caps the
  # sample at the number of available rows automatically, so the explicit
  # min(5, n()) guard is no longer needed.
  slice_sample(n = 5) %>%  # Reduced sample size for performance
  select(year, body, package_id, source, text) %>%  # Keep identifiers
  mutate(
    page_metrics = map(text, function(text_list) {
      # Handle the nested structure: text is a list-column whose elements
      # may or may not wrap the page vector in another list
      if (is.null(text_list) || length(text_list) == 0) return(NULL)

      pages <- if (is.list(text_list[[1]])) text_list[[1]] else text_list
      if (length(pages) == 0) return(NULL)

      # Limit to first 50 pages per document to avoid memory issues
      pages <- head(pages, 50)

      tibble(
        page_num = seq_along(pages),
        page_text = as.character(pages),
        n_chars = nchar(page_text),
        # Simple whitespace-delimited word count (cheaper than quanteda)
        n_tokens = str_count(page_text, "\\S+"),
        special_char_rate = str_count(page_text, "[^a-zA-Z0-9\\s.,!?;:'-]") / pmax(n_chars, 1),
        whitespace_rate = str_count(page_text, "\\s") / pmax(n_chars, 1),
        non_ascii_rate = str_count(page_text, "[^\x01-\x7F]") / pmax(n_chars, 1),
        has_fiscal_terms = str_detect(page_text, fiscal_regex)
      )
    })
  ) %>%
  filter(!map_lgl(page_metrics, is.null)) %>%
  select(-text) %>%  # Remove text column before unnest
  unnest(page_metrics)

# Flag pages that look like extraction failures: near-empty, garbled, or
# (past the front matter) lacking any fiscal vocabulary.
suspicious_pages <- quality_metrics %>%
  filter(
    n_chars < 100 |
      special_char_rate > 0.1 |
      non_ascii_rate > 0.05 |
      (!has_fiscal_terms & page_num > 5)
  )

# Roll page metrics up to a heuristic per-document quality score in [0, 1].
doc_quality <- quality_metrics %>%
  group_by(year, body, package_id) %>%
  summarize(
    total_pages = n(),
    avg_tokens_per_page = mean(n_tokens, na.rm = TRUE),
    pct_fiscal_pages = mean(has_fiscal_terms, na.rm = TRUE),
    suspicious_pages_count = sum(
      n_chars < 100 | special_char_rate > 0.1 | non_ascii_rate > 0.05,
      na.rm = TRUE
    ),
    quality_score = (pmin(avg_tokens_per_page / 300, 1)) * pct_fiscal_pages *
                    (1 - suspicious_pages_count / total_pages),
    .groups = "drop"
  )

# Corpus-level quality rates.
pct_suspicious <- nrow(suspicious_pages) / nrow(quality_metrics)
pct_fiscal <- mean(quality_metrics$has_fiscal_terms, na.rm = TRUE)

# Scalar condition: plain if/else instead of case_when().
test_v_status <- if (pct_suspicious < 0.05 && pct_fiscal > 0.70) {
  "PASS"
} else if (pct_suspicious < 0.10 || pct_fiscal > 0.50) {
  "WARN"
} else {
  "FAIL"
}

# Both quality sub-metrics deliberately share the combined status above.
test_results$test_v_quality <- list(
  metric = "Text quality - suspicious pages",
  value = sprintf("%.1f%%", pct_suspicious * 100),
  target = "<5%",
  status = test_v_status
)

test_results$test_v_fiscal <- list(
  metric = "Text quality - fiscal pages",
  value = sprintf("%.1f%%", pct_fiscal * 100),
  target = ">70%",
  status = test_v_status
)

Results

Suspicious Pages: 24.6% Fiscal Term Coverage: 73.8% Status: WARN

Show code
# Histogram of per-page token counts across the sampled documents.
ggplot(quality_metrics, aes(x = n_tokens)) +
  geom_histogram(bins = 50, fill = "steelblue", alpha = 0.7) +
  labs(
    title = "Distribution of Tokens per Page",
    x = "Tokens per Page",
    y = "Count"
  ) +
  theme_minimal()

Show code
# Boxplots of per-page special-character rates, one box per source.
ggplot(quality_metrics, aes(x = source, y = special_char_rate)) +
  geom_boxplot(fill = "coral", alpha = 0.7) +
  labs(
    title = "Special Character Rate by Source",
    x = "Source",
    y = "Special Character Rate"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Show code
# Documents scoring below the 0.5 quality threshold, worst first.
low_quality_docs <- doc_quality %>%
  filter(quality_score < 0.5) %>%
  arrange(quality_score)

if (nrow(low_quality_docs) == 0) {
  cat("\n✅ No low-quality documents found!\n\n")
} else {
  cat("\n### Low Quality Documents\n\n")
  low_quality_docs %>%
    head(10) %>%
    select(year, body, total_pages, avg_tokens_per_page, pct_fiscal_pages, quality_score) %>%
    mutate(
      avg_tokens_per_page = round(avg_tokens_per_page, 0),
      pct_fiscal_pages = sprintf("%.1f%%", pct_fiscal_pages * 100),
      quality_score = round(quality_score, 2)
    ) %>%
    kable(caption = "Documents with quality_score < 0.5 (top 10)") %>%
    kable_styling(bootstrap_options = c("striped", "hover"))
}

### Low Quality Documents
Documents with quality_score < 0.5 (top 10)
year body total_pages avg_tokens_per_page pct_fiscal_pages quality_score
2021 Economic Report of the President 50 471 54.0% 0.36
1949 Economic Report of the President 50 313 50.0% 0.44

Test (vi): Anomaly Detection

Show code
# Test (vi): document-level anomaly flags.
doc_anomalies <- body_data %>%
  filter(n_pages > 0) %>%
  mutate(
    too_short = n_pages < 10,
    too_long = n_pages > 1000,
    # Concatenate the first (up to) five pages for title detection.
    first_pages = map_chr(text, function(pages) {
      if (length(pages) == 0) return("")
      paste(head(pages, 5), collapse = " ")
    }),
    has_title_indicators = str_detect(
      first_pages,
      regex("(report|budget|economic|president|treasury|united states)", ignore_case = TRUE)
    ),
    # Z-score of extraction time. BUG FIX: isTRUE() guards against sd()
    # returning NA (all-NA extraction_time), which would make the bare
    # `if (NA > 0)` throw an error.
    extraction_time_z = if (isTRUE(sd(extraction_time, na.rm = TRUE) > 0)) {
      scale(extraction_time)[, 1]
    } else {
      0
    },
    # Flag OCR documents that took >3 SDs longer than average.
    slow_extraction = ocr_used & extraction_time_z > 3
  )

# Detect likely duplicates: documents whose first five pages hash identically.
duplicate_check <- body_data %>%
  filter(n_pages > 0) %>%
  mutate(
    text_hash = map_chr(text, function(pages) {
      if (length(pages) == 0) return("")
      digest::digest(paste(head(pages, 5), collapse = ""))
    })
  ) %>%
  group_by(text_hash) %>%
  filter(n() > 1, text_hash != "") %>%
  ungroup() %>%
  select(year, body, package_id, text_hash)

# Year-over-year page totals per body, flagging drops of more than 50%.
year_trends <- body_data %>%
  filter(n_pages > 0) %>%
  group_by(year, body) %>%
  summarize(total_pages = sum(n_pages), n_docs = n(), .groups = "drop") %>%
  arrange(body, year) %>%
  group_by(body) %>%
  mutate(
    # Relative change vs the previous year within the same body.
    yoy_change = (total_pages - lag(total_pages)) / lag(total_pages),
    sudden_drop = !is.na(yoy_change) & yoy_change < -0.5
  ) %>%
  ungroup()

# Anomaly counts
n_too_short <- sum(doc_anomalies$too_short, na.rm = TRUE)
n_too_long <- sum(doc_anomalies$too_long, na.rm = TRUE)
n_no_title <- sum(!doc_anomalies$has_title_indicators, na.rm = TRUE)
n_duplicates <- nrow(duplicate_check)
n_sudden_drops <- sum(year_trends$sudden_drop, na.rm = TRUE)

# BUG FIX: the reported total previously omitted n_no_title even though the
# anomaly summary table lists it as one of the five anomaly classes.
test_results$test_vi <- list(
  metric = "Anomalies detected",
  value = sprintf(
    "%d issues",
    n_too_short + n_too_long + n_no_title + n_duplicates + n_sudden_drops
  ),
  target = "Manual review",
  status = "INFO"
)

Results

Status: INFO (anomalies flagged for manual review)

Show code
# Tabulate anomaly counts, one row per anomaly class.
anomaly_summary <- tibble(
  Anomaly = c(
    "Too short (< 10 pages)",
    "Too long (> 1000 pages)",
    "Missing title indicators",
    "Duplicate documents",
    "Sudden year drops (>50%)"
  ),
  Count = c(n_too_short, n_too_long, n_no_title, n_duplicates, n_sudden_drops)
)

anomaly_summary %>%
  kable(caption = "Anomaly Summary") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Anomaly Summary
Anomaly Count
Too short (< 10 pages) 64
Too long (> 1000 pages) 17
Missing title indicators 0
Duplicate documents 2
Sudden year drops (>50%) 7
Show code
# Table of individual documents flagged by any page-count/title anomaly.
if (n_too_short > 0 || n_too_long > 0 || n_no_title > 0) {
  cat("\n### Anomalous Documents\n\n")
  doc_anomalies %>%
    filter(too_short | too_long | !has_title_indicators) %>%
    arrange(desc(too_short), desc(too_long)) %>%
    select(year, body, package_id, n_pages, too_short, too_long, has_title_indicators) %>%
    head(10) %>%
    kable(caption = "Documents with anomalies (top 10)") %>%
    kable_styling(bootstrap_options = c("striped", "hover"))
}

### Anomalous Documents
Documents with anomalies (top 10)
year body package_id n_pages too_short too_long has_title_indicators
2006 Budget of the United States Government BUDGET-2006 5 TRUE FALSE TRUE
2006 Budget of the United States Government BUDGET-2006 9 TRUE FALSE TRUE
2006 Budget of the United States Government BUDGET-2006 9 TRUE FALSE TRUE
2006 Budget of the United States Government BUDGET-2006 7 TRUE FALSE TRUE
2006 Budget of the United States Government BUDGET-2006 9 TRUE FALSE TRUE
2006 Budget of the United States Government BUDGET-2006 8 TRUE FALSE TRUE
2006 Budget of the United States Government BUDGET-2006 7 TRUE FALSE TRUE
2006 Budget of the United States Government BUDGET-2006 8 TRUE FALSE TRUE
2006 Budget of the United States Government BUDGET-2006 7 TRUE FALSE TRUE
2007 Budget of the United States Government BUDGET-2007 3 TRUE FALSE TRUE
Show code
# Table of documents sharing an identical first-five-page hash.
if (n_duplicates > 0) {
  cat("\n### Duplicate Documents\n\n")
  dup_caption <- sprintf("Potential duplicates (%d documents)", n_duplicates)
  duplicate_check %>%
    kable(caption = dup_caption) %>%
    kable_styling(bootstrap_options = c("striped", "hover"))
}

### Duplicate Documents
Potential duplicates (2 documents)
year body package_id text_hash
2015 Annual Report of the Treasury AR_TREASURY-2015 25c71c0a85cb4a39edfdf7acdf4371b0
2016 Annual Report of the Treasury AR_TREASURY-2016 25c71c0a85cb4a39edfdf7acdf4371b0
Show code
# Total pages per year per body, with sudden drops highlighted in red.
drop_points <- filter(year_trends, sudden_drop)

ggplot(year_trends, aes(x = year, y = total_pages, color = body)) +
  geom_line(linewidth = 1) +
  geom_point(data = drop_points, color = "red", size = 3, shape = 1) +
  labs(
    title = "Total Pages by Year and Body",
    subtitle = "Red circles indicate sudden drops (>50%)",
    x = "Year",
    y = "Total Pages",
    color = "Document Type"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")

Summary Report & Pass/Fail Dashboard

Show code
# Compile the per-test result lists into one summary tibble (one row each).
test_summary <- bind_rows(
  imap(test_results, function(result, name) {
    tibble(
      test = name,
      metric = as.character(result$metric),
      value = as.character(result$value),
      target = as.character(result$target),
      status = as.character(result$status)
    )
  })
)

# Overall status is the worst actionable status (FAIL > WARN > PASS);
# INFO and SKIP never affect the outcome.
status_priority <- c("FAIL" = 3, "WARN" = 2, "PASS" = 1, "INFO" = 0, "SKIP" = 0)
actionable_statuses <- test_summary$status[!test_summary$status %in% c("SKIP", "INFO")]
overall_status <- names(which.max(status_priority[actionable_statuses]))

if (length(overall_status) == 0) overall_status <- "PASS"

Overall Status: WARN

Show code
# Dashboard table with color-coded status badges (lookup vector instead of
# case_when; coalesce() supplies the fallback color for unknown statuses).
status_colors <- c(
  PASS = "green", WARN = "orange", FAIL = "red",
  INFO = "blue", SKIP = "grey"
)

test_summary %>%
  mutate(
    Status = cell_spec(
      status,
      color = coalesce(unname(status_colors[status]), "black"),
      bold = TRUE
    )
  ) %>%
  select(-status) %>%
  kable(escape = FALSE, caption = "Verification Test Results") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
Verification Test Results
test metric value target Status
test_i URL resolution success rate 97.1% ≥95% <span style=" font-weight: bold; color: green !important;" >PASS</span>
test_ii Boundary documents valid 10/10 valid All ≥10 pages <span style=" font-weight: bold; color: green !important;" >PASS</span>
test_iii_strict Act recall (exact year) 38.2% Reference only <span style=" font-weight: bold; color: blue !important;" >INFO</span>
test_iii_acts Act recall (year to year+2) 80.4% >=85% <span style=" font-weight: bold; color: orange !important;" >WARN</span>
test_iv Coverage rate 96.0% ≥95% <span style=" font-weight: bold; color: green !important;" >PASS</span>
test_v_quality Text quality - suspicious pages 24.6% <5% <span style=" font-weight: bold; color: orange !important;" >WARN</span>
test_v_fiscal Text quality - fiscal pages 73.8% >70% <span style=" font-weight: bold; color: orange !important;" >WARN</span>
test_vi Anomalies detected 90 issues Manual review <span style=" font-weight: bold; color: blue !important;" >INFO</span>

Recommendations

Show code
# Emit a recommendation block keyed to the overall verification status.
if (overall_status == "PASS") {
  cat("\n✅ **READY FOR LLM PROCESSING**\n\n")
  cat("All verification tests passed. Proceed with:\n\n")
  cat("- `tar_make(chunks)` to create LLM-ready chunks\n")
  cat("- Days 2-3: Training data preparation\n")
  cat("- LLM-based fiscal shock identification\n\n")
} else if (overall_status == "WARN") {
  cat("\n⚠️ **PROCEED WITH CAUTION**\n\n")
  cat("Some tests raised warnings. Review flagged issues before full run:\n\n")

  warned_metrics <- test_summary$metric[test_summary$status == "WARN"]
  cat(paste("-", warned_metrics), sep = "\n")

  cat("\n\nConsider re-running extractions for problematic documents.\n\n")
} else {
  cat("\n❌ **MANUAL REVIEW REQUIRED**\n\n")
  cat("Critical issues detected. Address the following before proceeding:\n\n")

  failed_metrics <- test_summary$metric[test_summary$status == "FAIL"]
  cat(paste("-", failed_metrics), sep = "\n")

  cat("\n\nReview extraction settings and re-run failed documents.\n\n")
}

⚠️ **PROCEED WITH CAUTION**

Some tests raised warnings. Review flagged issues before full run:

- Act recall (year to year+2)
- Text quality - suspicious pages
- Text quality - fiscal pages


Consider re-running extractions for problematic documents.

Next Steps

Show code
# Header for the verification-driven next-steps list.
cat("\n**Based on this verification:**\n\n")

**Based on this verification:**
Show code
# Count of all rows in body_data (every attempted extraction).
cat(sprintf("- Total documents verified: %d\n", nrow(body_data)))
- Total documents verified: 313
Show code
# Documents with at least one extracted page, as count and percentage.
cat(sprintf("- Successful extractions: %d (%.1f%%)\n",
            sum(body_data$n_pages > 0),
            mean(body_data$n_pages > 0) * 100))
- Successful extractions: 304 (97.1%)
Show code
# Grand total of extracted pages, comma-formatted for readability.
cat(sprintf("- Total pages extracted: %s\n",
            scales::comma(sum(body_data$n_pages))))
- Total pages extracted: 97,475
Show code
# Number of documents whose text came from OCR rather than embedded text.
cat(sprintf("- Documents using OCR: %d\n", sum(body_data$ocr_used, na.rm = TRUE)))
- Documents using OCR: 64
Show code
# Final verdict computed in the summary-dashboard chunk above.
cat(sprintf("- Overall status: **%s**\n\n", overall_status))
- Overall status: **WARN**
Show code
# Header for the recommended-actions list that follows.
cat("**Recommended actions:**\n\n")
**Recommended actions:**
Show code
# Pick the action list that matches the overall status, then print it.
next_actions <- if (overall_status == "PASS") {
  c(
    "1. Proceed to Days 2-3 implementation (training data preparation)\n",
    "2. Run `tar_make(chunks)` to create document chunks\n",
    "3. Begin Model A development (act detection)\n"
  )
} else {
  c(
    "1. Review failed/warned tests above\n",
    "2. Address extraction issues for flagged documents\n",
    "3. Re-run verification before proceeding\n"
  )
}
cat(next_actions, sep = "")
1. Review failed/warned tests above
2. Address extraction issues for flagged documents
3. Re-run verification before proceeding