# Setup ----
library(tidyverse)
library(here)

# Anchor relative paths at the project root for reproducible file access
here::i_am("notebooks/test_text_extraction.qmd")

# Source the local extraction function
source(here("R/pull_text_local.R"))

# Load ground truth data
us_shocks <- read_csv(here("data/raw/us_shocks.csv"))
us_labels <- read_csv(here("data/raw/us_labels.csv"))

# Overview ----
This notebook tests whether the local PyMuPDF PDF extraction (with OCR for scanned documents) produces text that Claude 3.5 Sonnet can effectively use for:
- Model A: Act detection (identify fiscal legislation passages)
- Model B: Motivation classification (spending-driven, countercyclical, etc.)
- Model C: Information extraction (timing, magnitudes from tables)
Setup
Phase 1: Sample Selection
We select documents with well-known fiscal acts that have clear ground truth labels.
# Key test cases with known acts
# Each row pairs a well-known fiscal act with its ground-truth motivation
# label and the Economic Report of the President (ERP) that discusses it.
test_cases <- tribble(
  ~act_name,                           ~year, ~motivation,       ~expected_url_pattern,
  "Revenue Act of 1964",               1964,  "Long-run",        "ERP.*1965",
  "Tax Reform Act of 1986",            1986,  "Long-run",        "ERP.*1987",
  "Economic Recovery Tax Act of 1981", 1981,  "Long-run",        "ERP.*1982",
  "Tax Reduction Act of 1975",         1975,  "Countercyclical", "ERP.*1976",
  "Revenue Act of 1950",               1950,  "Spending-driven", "ERP.*1951"
)

test_cases
act_name year motivation expected_url_pattern
<chr> <dbl> <chr> <chr>
1 Revenue Act of 1964 1964 Long-run ERP.*1965
2 Tax Reform Act of 1986 1986 Long-run ERP.*1987
3 Economic Recovery Tax Act of 1981 1981 Long-run ERP.*1982
4 Tax Reduction Act of 1975 1975 Countercyclical ERP.*1976
5 Revenue Act of 1950 1950 Spending-driven ERP.*1951
# Get actual URLs for test documents
# Using Fraser St. Louis Fed ERP archive
test_urls <- c(
  "https://fraser.stlouisfed.org/files/docs/publications/ERP/1965/ERP_1965.pdf",
  "https://fraser.stlouisfed.org/files/docs/publications/ERP/1982/ERP_1982.pdf"
)

cat("Test URLs:\n")
cat(test_urls, sep = "\n")
Phase 2: Extract Sample Documents
# Extract text from sample PDFs using local PyMuPDF+OCR.
# This may take 5-15 minutes for scanned documents (OCR needed).
message("Starting local extraction...")

sample_results <- pull_text_local(
  pdf_url    = test_urls,
  output_dir = here("data/extracted"),
  workers    = 4,
  ocr_dpi    = 200
)

message("Extraction complete!")

# Extraction Summary ----
# One row per extracted document with page counts and timing.
# NOTE(review): basename(test_urls) assumes sample_results rows come back in
# the same order as test_urls — confirm against pull_text_local().
sample_results |>
  mutate(doc_name = basename(test_urls)) |>
  select(doc_name, n_pages, ocr_used, extraction_time, extracted_at)
doc_name n_pages ocr_used extraction_time extracted_at
<chr> <int> <lgl> <dbl> <dttm>
1 ERP_1965.pdf 300 TRUE 277. 2026-01-21 19:33:20
2 ERP_1982.pdf 368 TRUE 368. 2026-01-21 19:33:20
Phase 3: Text Quality Inspection
3.1 Basic Text Readability
# Show sample text from first document so extraction quality can be eyeballed
cat("=== Sample text from ERP 1965 (first 2000 chars) ===\n\n")

# Flatten the per-page text of document 1 and preview its opening characters
first_doc_text <- unlist(sample_results$text[[1]])
cat(substr(paste(first_doc_text, collapse = "\n\n"), 1, 2000))
|
Together With
THE ANNUAL REPORT
of the
COUNCIL OF ECONOMIC ADVISERS
Digitized for FRASER
http://fraser.stlouisfed.org,
Federal Reserve Bank of St.
Louis
8H-1464 2M 7-70
B
gut ANK
us|
Q
VY
la
(ers 2
4 See ce
< MAY 18.
LIBRARY
Digitized for FRASER
http://fraser.stlouisfed.org/
Federal Reserve Bank of St. Louis
°
Economic Report
.
of the President
oRNth
Ye’ fz
erage
Transmitted to the Congress
January 1965
TOGETHER WITH
THE ANNUAL REPORT
OF THE
COUNCIL OF ECONOMIC ADVISERS
UNITED STATES GOVERNMENT PRINTING OFFICE
WASHINGTON: 1965
Digitized for FRASER
http://fraser.stlouisfed.org/
Federal Reserve Bank of St. Louis
Digitized for FRASER
http://fraser.stlouisfed.org/
Federal Reserve Bank of St. Louis
CONTENTS
Economic REPORT OF THE PRESIDENT
Page
Procress TowarD Our Economic GOALs...........-.-.000-5
3
Full Employment... 0.0.0.0... 0-0.
o cence eee eee
3
Rapid Growth. ..... 00.00.
e eens
4
Price Stability... 22.60. .eceeeeee
4
Balance of Payments Equilibrium. .....................065
4
Consistency of Our Goals... 2.0.0.0... 20
e eee eee eee
5
Tue ROLE oF Economic POLicy............0.
000 cece eee eee
5
THE UNFINISHED TASKS... 2.00.00
o
eee
eee eee ee
7
Economic PROSPECTS FOR 1965..........
0000 cece eee eee eee
9
Federal Fiscal Policy... 2... ...... 0.00.
c cece eee
9
Progress Toward Full Employment.....................05.
10
CoMBATING RECESSIONS...
......00.00000 ceee eect
eee
10
Monetary POLicy IN 1965......
000.0 c cece eee
ees
11
MalInTAINING WAGE-PRICE STABILITY...
0.2.0...000.00 eeeee
12
INTERNATIONAL ECONOMIC POLICIES. ........0...000
0000 e eee eee
13
Restoring Balance in Our External Payments...............
13
Building a Stronger World Order..................0000005
14
Manpower POLiIciEs FOR A FLEXIBLE ECONOMY..........-..--
15
U.S. Employment Service... 0... ..000. 0.0
15
Manpower Training.............. 0000s cee
cece
15
Private Pension and Welfare Funds..........-......0-.2005
15
MaiInTAINING INCOMES OF THE
DISADVANTAGED...............-
3.2 Check for Known Act Names
# Acts we expect to find in these documents
expected_acts <- c(
  "Revenue Act of 1964",
  "Economic Recovery Tax Act",
  "Tax Equity and Fiscal Responsibility Act"
)

# Search for act names in extracted text
all_text <- paste(unlist(sample_results$text), collapse = " ")

act_detection <- map_dfr(expected_acts, function(act) {
  # str_count() on a single string returns a scalar, so the previous sum()
  # wrapper was redundant; deriving `found` from the count also avoids
  # scanning the (large) corpus twice per act.
  n_mentions <- str_count(all_text, fixed(act))
  tibble(
    act_name = act,
    found    = n_mentions > 0,
    mentions = n_mentions
  )
})

act_detection
act_name found mentions
<chr> <lgl> <int>
1 Revenue Act of 1964 TRUE 4
2 Economic Recovery Tax Act TRUE 17
3 Tax Equity and Fiscal Responsibility Act FALSE 0
# Act detection rate: share of expected acts found verbatim in the corpus
detection_rate <- mean(act_detection$found)

cat(sprintf(
  "Act detection rate: %.0f%% (%d/%d)\n",
  detection_rate * 100,
  sum(act_detection$found),
  nrow(act_detection)
))

# Flag low recall early; extraction noise (OCR errors) is the usual culprit
if (detection_rate < 0.8) {
  warning("Act detection rate below 80% - may need to check extraction quality")
}

# 3.3 Numeric Value Preservation ----
# Check for dollar amounts (critical for Model C)
dollar_pattern  <- "\\$\\s*\\d+\\.?\\d*\\s*(billion|million|B|M)"
year_pattern    <- "\\b(19[4-9]\\d|20[0-2]\\d)\\b"
# Hoisted into a variable so the same literal is not duplicated below
percent_pattern <- "\\d+\\.?\\d*\\s*percent"

# str_count() on a single string already returns a scalar count, so the
# previous sum() wrappers were redundant.
numeric_metrics <- tibble(
  metric  = c("Dollar amounts", "Year mentions", "Percentage values"),
  pattern = c(dollar_pattern, year_pattern, percent_pattern),
  count   = c(
    str_count(all_text, regex(dollar_pattern, ignore_case = TRUE)),
    str_count(all_text, year_pattern),
    str_count(all_text, regex(percent_pattern, ignore_case = TRUE))
  )
)

numeric_metrics
metric pattern count
<chr> <chr> <int>
1 Dollar amounts "\\$\\s*\\d+\\.?\\d*\\s*(billion|million|B|M)" 214
2 Year mentions "\\b(19[4-9]\\d|20[0-2]\\d)\\b" 6201
3 Percentage values "\\d+\\.?\\d*\\s*percent" 527
3.4 Fiscal Policy Terms
# Check for key fiscal policy terminology
fiscal_terms <- c(
  "tax cut", "tax reduction", "fiscal policy",
  "federal budget", "deficit", "expenditure",
  "revenue", "appropriation"
)

# Count case-insensitive occurrences of each term across the whole corpus.
# str_count() on a single string returns a scalar, so no sum() is needed.
term_counts <- map_dfr(fiscal_terms, \(term) {
  tibble(
    term  = term,
    count = str_count(all_text, regex(term, ignore_case = TRUE))
  )
})

term_counts |>
  filter(count > 0) |>
  arrange(desc(count))
term count
<chr> <int>
1 expenditure 246
2 deficit 192
3 revenue 74
4 tax cut 50
5 federal budget 32
6 fiscal policy 31
7 tax reduction 17
8 appropriation 4
Phase 4: LLM Readiness Tests
4.1 Passage Length Analysis
# Check if pages are reasonable length for LLM context
first_doc_pages <- sample_results$text[[1]]
page_lengths <- map_int(first_doc_pages, nchar)

# Named vector keeps each label next to its statistic
length_stats <- c(
  "Min page length"  = min(page_lengths),
  "Max page length"  = max(page_lengths),
  "Mean page length" = mean(page_lengths),
  "Total chars"      = sum(page_lengths)
)

tibble(metric = names(length_stats), value = length_stats) |>
  mutate(value = scales::comma(value))
metric value
<chr> <chr>
1 Min page length 85
2 Max page length 7,831
3 Mean page length 3,158
4 Total chars 947,522
4.2 Token Estimation
# Rough token estimate (1 token ~ 4 chars for English)
total_chars <- sum(map_int(unlist(sample_results$text), nchar))
estimated_tokens <- total_chars / 4
cat(sprintf("Estimated tokens: %s\n", scales::comma(estimated_tokens)))Estimated tokens: 520,314
cat(sprintf("Claude 3.5 Sonnet context: 200K tokens\n"))Claude 3.5 Sonnet context: 200K tokens
cat(sprintf("Fits in context: %s\n",
ifelse(estimated_tokens < 200000, "YES", "NO - need chunking")))Fits in context: NO - need chunking
4.3 Sample Passage for Model A Test
# Find a passage mentioning a fiscal act (sample input for Model A)
first_doc_pages <- sample_results$text[[1]]

# Hoist the pattern out of the loop; same regex applied to every page
act_pattern <- regex("act of \\d{4}", ignore_case = TRUE)

act_passages <- map_dfr(seq_along(first_doc_pages), function(i) {
  page_text <- first_doc_pages[[i]]
  # Guard clause: skip pages without an "Act of YYYY" mention
  if (!str_detect(page_text, act_pattern)) {
    return(NULL)
  }
  tibble(
    page            = i,
    text            = str_trunc(page_text, 500),
    has_act_mention = TRUE
  )
})

if (nrow(act_passages) > 0) {
  cat("=== Sample passage with act mention (for Model A) ===\n\n")
  cat(act_passages$text[1])
} else {
  cat("No passages with 'Act of YYYY' pattern found")
}
or a lack of confidence in the dollar.
Since 1946, therefore, we have
come to recognize that the mandate of the Employment Act implies a
series of objectives closely related to the goal of full employment:
—rapid growth,
— price stability, and
—equilibrium in our balance of payments.
Rapp GRrowTH
True prosperity means more than the full use of the productive
powers available at any given time.
It also means the rapid expansion
of those powers.
In the long run, it is only a growth of over-all ...
Phase 5: Quality Metrics Summary
# Compile all quality metrics
total_pages <- sum(sample_results$n_pages)
total_tables <- 0 # PyMuPDF doesn't extract tables separately

# Column-wise construction; every value is coerced to character so the
# `value` column stays type-stable across numeric and text metrics.
quality_report <- tibble(
  metric = c(
    "Documents extracted", "Pages extracted", "OCR used", "Act name recall",
    "Dollar amounts found", "Year mentions", "Fits in LLM context"
  ),
  value = c(
    as.character(nrow(sample_results)),
    as.character(total_pages),
    as.character(sum(sample_results$ocr_used)),
    sprintf("%.0f%%", detection_rate * 100),
    as.character(numeric_metrics$count[1]),
    as.character(numeric_metrics$count[2]),
    ifelse(estimated_tokens < 200000, "YES", "NO")
  ),
  target = c("2", ">0", "as needed", ">80%", ">0", ">10", "YES"),
  status = c(
    "PASS",
    ifelse(total_pages > 0, "PASS", "FAIL"),
    "INFO",
    ifelse(detection_rate >= 0.8, "PASS", "FAIL"),
    ifelse(numeric_metrics$count[1] > 0, "PASS", "WARN"),
    ifelse(numeric_metrics$count[2] > 10, "PASS", "WARN"),
    ifelse(estimated_tokens < 200000, "PASS", "WARN")
  )
)

quality_report |>
  knitr::kable()
|---|---|---|---|
| Documents extracted | 2 | 2 | PASS |
| Pages extracted | 668 | >0 | PASS |
| OCR used | 2 | as needed | INFO |
| Act name recall | 67% | >80% | FAIL |
| Dollar amounts found | 214 | >0 | PASS |
| Year mentions | 6201 | >10 | PASS |
| Fits in LLM context | NO | YES | WARN |
Conclusions
# Tally report statuses for the final verdict
pass_count <- sum(quality_report$status == "PASS")
warn_count <- sum(quality_report$status == "WARN")
fail_count <- sum(quality_report$status == "FAIL")
info_count <- sum(quality_report$status == "INFO")

# Plain cat() — sprintf() with no format fields was redundant
cat("\n=== QUALITY ASSESSMENT ===\n")
cat(sprintf("PASS: %d | WARN: %d | FAIL: %d | INFO: %d\n\n",
            pass_count, warn_count, fail_count, info_count))

# Any FAIL blocks the full pipeline run
if (fail_count == 0) {
  cat("Extraction quality is SUFFICIENT for LLM processing\n")
  cat(" Proceed with full pipeline: tar_make(us_text)\n")
} else {
  cat("Extraction quality needs IMPROVEMENT before full run\n")
  cat(" Review failed metrics and adjust extraction settings\n")
}
Next Steps
Based on the quality assessment:
- If PASS: Run `tar_make(us_text)` to extract all 350 documents
- If WARN: Review warnings but proceed with caution
- If FAIL: Debug extraction issues before full run
Recommended Actions
# List the metrics that failed so they can be addressed before a full run.
# The original mixed %>% (with the `.` placeholder) and |> in one chain;
# a single native-pipe style is used throughout here.
if (fail_count > 0) {
  cat("Issues to address:\n")
  failed_metrics <- quality_report |>
    filter(status == "FAIL") |>
    pull(metric)
  cat(paste("-", failed_metrics), sep = "\n")
}
- Act name recall
Extraction Performance
# Summary of extraction times.
# Totals are computed once and reused rather than re-summed per line.
n_docs     <- nrow(sample_results)
n_pages    <- sum(sample_results$n_pages)
total_secs <- sum(sample_results$extraction_time, na.rm = TRUE)

cat("=== Extraction Performance ===\n")
cat(sprintf("Total documents: %d\n", n_docs))
cat(sprintf("Total pages: %d\n", n_pages))
cat(sprintf("Total extraction time: %.1f seconds\n", total_secs))
cat(sprintf("Average time per document: %.1f seconds\n",
            mean(sample_results$extraction_time, na.rm = TRUE)))
cat(sprintf("Average time per page: %.2f seconds\n", total_secs / n_pages))