Back to Article
Step 3: NC CoreLogic and Parcel Matching
Download Source

Step 3: NC CoreLogic and Parcel Matching

Author

Russell Blessing

Results and QA/QC Notes

This notebook replicates and extends the parcel–CoreLogic string matching approach from the prior analysis. The previous code generated 3,237,387 string-matched records and noted that there is no single straightforward join between the OneMap parcel data and CoreLogic — matching required county-by-county inspection and manipulation of parcel ID strings (e.g., adding/removing hyphens, padding or trimming leading zeros) before merging.

In [1]:
Show / hide code
library(sf)
Linking to GEOS 3.12.0, GDAL 3.11.0, PROJ 9.2.1; sf_use_s2() is TRUE
Show / hide code
library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
Show / hide code
library(readr)
library(stringr)
library(purrr)
library(gt)
In [2]:
Show / hide code
# Read data
nc1 <- st_read("/proj/mhinolab/users/rbless/data/Obstacles_Output/parcels_study_area.gpkg", quiet = TRUE) |>
  st_drop_geometry() |>
  mutate(
    parcel_index = as.character(parcel_index),
    cntyfips = paste0("37", cntyfips))

cl <- read_csv("/proj/mhinolab/users/rbless/data/Obstacles_Output/cl.csv",
               col_types = cols(.default = col_character()))
In [3]:
Show / hide code
# Helper: strip non-alphanumeric characters
strip_alnum <- function(x) str_replace_all(x, "[^a-zA-Z0-9]", "")

# Core matching function for a single county
match_county <- function(fips, nc1, cl, nc_key, cl_key,
                         nc_transform = NULL, cl_transform = NULL) {
  # Filter datasets to target county and strip non-alphanumeric characters from keys
  nc_fips <- nc1 |> filter(cntyfips == fips) |> mutate(across(all_of(nc_key), strip_alnum))
  cl_fips <- cl  |> filter(`FIPS CODE` == fips) |> mutate(across(all_of(cl_key), strip_alnum))

  # Apply optional transformations to datasets
  if (!is.null(nc_transform)) nc_fips <- nc_transform(nc_fips)
  if (!is.null(cl_transform)) cl_fips <- cl_transform(cl_fips)

  # Store count before merging for match rate calculation
  total_before_merge <- nrow(nc_fips)

  # Join datasets on specified keys, allowing many-to-many relationships
  joined <- left_join(
    nc_fips, cl_fips,
    by = setNames(cl_key, nc_key),
    keep = TRUE,
    relationship = "many-to-many"
  )

  # Count parcels with duplicate matches
  n_duplicates <- joined |>
    filter(duplicated(parcel_index) | duplicated(parcel_index, fromLast = TRUE)) |>
    distinct(parcel_index) |>
    nrow()

  # Keep only first occurrence of each parcel and remove unmatched records
  joined <- joined |>
    distinct(parcel_index, .keep_all = TRUE) |>
    filter(!is.na(.data[[cl_key]]))

  # Calculate match statistics
  n_matched <- nrow(joined)
  pct_matched <- if (total_before_merge > 0) (n_matched / total_before_merge) * 100 else 0

  # Return summary statistics and matched data
  list(
    stats = tibble(
      FIPS = fips,
      total_before_merge = total_before_merge,
      matched = n_matched,
      pct_matched = pct_matched,
      duplicates = n_duplicates
    ),
    data = joined
  )
}
In [4]:
Show / hide code
# County configurations: list of lists with fips, nc_key, cl_key, and optional transforms
county_configs <- list(
  # 28 common FIPS: parno vs APN (PARCEL NUMBER UNFORMATTED)
  list(fips = "37005"), list(fips = "37133"), list(fips = "37093"), list(fips = "37097"),
  list(fips = "37025"), list(fips = "37167"), list(fips = "37117"), list(fips = "37041"),
  list(fips = "37053"), list(fips = "37029"), list(fips = "37139"), list(fips = "37015"),
  list(fips = "37195"), list(fips = "37181"), list(fips = "37191"), list(fips = "37105"),
  list(fips = "37033"), list(fips = "37123"), list(fips = "37031"), list(fips = "37153"),
  list(fips = "37037"), list(fips = "37009"), list(fips = "37135"), list(fips = "37019"),
  list(fips = "37179"), list(fips = "37049"), list(fips = "37017"), list(fips = "37151"),
  # Unique cases
  list(fips = "37177", nc_key = "altparno",  cl_key = "APN (PARCEL NUMBER UNFORMATTED)"),
  list(fips = "37059", nc_key = "altparno",  cl_key = "APN (PARCEL NUMBER UNFORMATTED)"),
  list(fips = "37127", nc_key = "altparno",  cl_key = "APN (PARCEL NUMBER UNFORMATTED)"),
  list(fips = "37073", nc_key = "altparno",  cl_key = "APN (PARCEL NUMBER UNFORMATTED)"),
  list(fips = "37131", nc_key = "altparno",  cl_key = "APN (PARCEL NUMBER UNFORMATTED)"),
  list(fips = "37083", nc_key = "altparno",  cl_key = "ALTERNATE PARCEL ID"),
  list(fips = "37069", nc_key = "altparno",  cl_key = "ALTERNATE PARCEL ID"),
  list(fips = "37091", nc_key = "altparno",  cl_key = "ONLINE FORMATTED PARCEL ID"),
  list(fips = "37143", nc_key = "altparno",  cl_key = "ORIGINAL APN"),
  list(fips = "37145", nc_key = "altparno",  cl_key = "ORIGINAL APN"),
  list(fips = "37155", nc_key = "altparno",  cl_key = "ORIGINAL APN"),
  list(fips = "37187", nc_key = "altparno",  cl_key = "ORIGINAL APN"),
  list(fips = "37027", nc_key = "altparno",  cl_key = "ORIGINAL APN"),
  list(fips = "37157", nc_key = "altparno",  cl_key = "ORIGINAL APN"),
  list(fips = "37197", nc_key = "altparno",  cl_key = "APN (PARCEL NUMBER UNFORMATTED)"),
  list(fips = "37129", nc_key = "altparno",  cl_key = "APN (PARCEL NUMBER UNFORMATTED)"),
  list(fips = "37067", nc_key = "altparno",  cl_key = "APN (PARCEL NUMBER UNFORMATTED)"),
  list(fips = "37057", nc_key = "altparno",  cl_key = "ONLINE FORMATTED PARCEL ID"),
  list(fips = "37003", nc_key = "altparno",  cl_key = "APN (PARCEL NUMBER UNFORMATTED)"),
  list(fips = "37189", nc_key = "parno",     cl_key = "ONLINE FORMATTED PARCEL ID"),
  list(fips = "37047", nc_key = "parno",     cl_key = "ALTERNATE PARCEL ID"),
  list(fips = "37101", nc_key = "parno",     cl_key = "ALTERNATE PARCEL ID"),
  list(fips = "37193", nc_key = "parno",     cl_key = "ALTERNATE PARCEL ID"),
  list(fips = "37183", nc_key = "parno",     cl_key = "ALTERNATE PARCEL ID"),
  list(fips = "37063", nc_key = "altparno",  cl_key = "APN (PARCEL NUMBER UNFORMATTED)"),
  list(fips = "37071", nc_key = "parno",     cl_key = "ALTERNATE PARCEL ID"),
  list(fips = "37109", nc_key = "parno",     cl_key = "ALTERNATE PARCEL ID"),
  list(fips = "37137", nc_key = "parno",     cl_key = "ALTERNATE PARCEL ID"),
  list(fips = "37013", nc_key = "parno",     cl_key = "ALTERNATE PARCEL ID"),
  list(fips = "37001", nc_key = "parno",     cl_key = "ALTERNATE PARCEL ID"),
  list(fips = "37103", nc_key = "parno",     cl_key = "ORIGINAL APN"),
  list(fips = "37159", nc_key = "parno",     cl_key = "ORIGINAL APN"),
  list(fips = "37141", nc_key = "parno",     cl_key = "ONLINE FORMATTED PARCEL ID"),
  list(fips = "37165", nc_key = "parno",     cl_key = "APN (PARCEL NUMBER UNFORMATTED)",
       cl_transform = function(df) mutate(df, `APN (PARCEL NUMBER UNFORMATTED)` = str_pad(`APN (PARCEL NUMBER UNFORMATTED)`, 11, pad = "0"))),
  list(fips = "37035", nc_key = "parno",     cl_key = "APN (PARCEL NUMBER UNFORMATTED)",
       cl_transform = function(df) mutate(df, `APN (PARCEL NUMBER UNFORMATTED)` = str_sub(`APN (PARCEL NUMBER UNFORMATTED)`, 1, -5))),
  list(fips = "37081", nc_key = "parno",     cl_key = "ALTERNATE PARCEL ID",
       nc_transform = function(df) mutate(df, parno = str_sub(parno, 1, -4))),
  list(fips = "37085", nc_key = "parno",     cl_key = "ORIGINAL APN",
       nc_transform = function(df) mutate(df, parno = str_sub(parno, 1, -4))),
  list(fips = "37147", nc_key = "parno",     cl_key = "ALTERNATE PARCEL ID",
       cl_transform = function(df) mutate(df,
         `ALTERNATE PARCEL ID` = str_sub(`ALTERNATE PARCEL ID`, 1, 5),
         `APN (PARCEL NUMBER UNFORMATTED)` = str_pad(`APN (PARCEL NUMBER UNFORMATTED)`, 5, pad = "0"),
         `ALTERNATE PARCEL ID` = paste0(`ALTERNATE PARCEL ID`, `APN (PARCEL NUMBER UNFORMATTED)`))),
  list(fips = "37065", nc_key = "parno",     cl_key = "ONLINE FORMATTED PARCEL ID",
       cl_transform = function(df) mutate(df, `ONLINE FORMATTED PARCEL ID` = str_sub(`ONLINE FORMATTED PARCEL ID`, 1, -3))),
  list(fips = "37011", nc_key = "parno",     cl_key = "ONLINE FORMATTED PARCEL ID",
       cl_transform = function(df) mutate(df, `ONLINE FORMATTED PARCEL ID` = str_sub(`ONLINE FORMATTED PARCEL ID`, 2, -6))),
  # nparno cases (strip leading 5 chars after alnum clean)
  list(fips = "37077", nc_key = "nparno",    cl_key = "ALTERNATE PARCEL ID",
       nc_transform = function(df) mutate(df, nparno = str_sub(nparno, 6))),
  list(fips = "37007", nc_key = "nparno",    cl_key = "ONLINE FORMATTED PARCEL ID",
       nc_transform = function(df) mutate(df, nparno = str_sub(nparno, 6)),
       cl_transform = function(df) mutate(df, `ONLINE FORMATTED PARCEL ID` = str_sub(`ONLINE FORMATTED PARCEL ID`, 1, -3))),
  list(fips = "37023", nc_key = "nparno",    cl_key = "ALTERNATE PARCEL ID",
       nc_transform = function(df) mutate(df, nparno = str_sub(nparno, 6))),
  list(fips = "37125", nc_key = "nparno",    cl_key = "ONLINE FORMATTED PARCEL ID",
       nc_transform = function(df) mutate(df, nparno = str_sub(nparno, 6))),
  list(fips = "37169", nc_key = "nparno",    cl_key = "ONLINE FORMATTED PARCEL ID",
       nc_transform = function(df) mutate(df, nparno = str_sub(nparno, 6))),
  list(fips = "37171", nc_key = "nparno",    cl_key = "ONLINE FORMATTED PARCEL ID",
       nc_transform = function(df) mutate(df, nparno = str_sub(nparno, 6)))
)

# Fill defaults for common cases
county_configs <- map(county_configs, function(cfg) {
  cfg$nc_key        <- cfg$nc_key        %||% "parno"
  cfg$cl_key        <- cfg$cl_key        %||% "APN (PARCEL NUMBER UNFORMATTED)"
  cfg$nc_transform  <- cfg$nc_transform  %||% NULL
  cfg$cl_transform  <- cfg$cl_transform  %||% NULL
  cfg
})
In [5]:
Show / hide code
stats_list <- vector("list", length(county_configs))
out_file <- "/proj/mhinolab/users/rbless/data/Obstacles_Output/parcel_cl_stringmatch.csv"

# Remove output file if it exists so we can append fresh
if (file.exists(out_file)) file.remove(out_file)
[1] TRUE
Show / hide code
for (i in seq_along(county_configs)) {
  cfg <- county_configs[[i]]
  result <- match_county(cfg$fips, nc1, cl, cfg$nc_key, cfg$cl_key,
                         cfg$nc_transform, cfg$cl_transform)
  
  # Append data to CSV incrementally (write header only on first iteration)
  write_csv(result$data, out_file, append = (i > 1))
  
  # Store stats
  stats_list[[i]] <- result$stats
  
  gc()  # release memory after each county
  message("Done: ", cfg$fips, " (", i, "/", length(county_configs), ")")
}
Done: 37005 (1/74)
Done: 37133 (2/74)
Done: 37093 (3/74)
Done: 37097 (4/74)
Done: 37025 (5/74)
Done: 37167 (6/74)
Done: 37117 (7/74)
Done: 37041 (8/74)
Done: 37053 (9/74)
Done: 37029 (10/74)
Done: 37139 (11/74)
Done: 37015 (12/74)
Done: 37195 (13/74)
Done: 37181 (14/74)
Done: 37191 (15/74)
Done: 37105 (16/74)
Done: 37033 (17/74)
Done: 37123 (18/74)
Done: 37031 (19/74)
Done: 37153 (20/74)
Done: 37037 (21/74)
Done: 37009 (22/74)
Done: 37135 (23/74)
Done: 37019 (24/74)
Done: 37179 (25/74)
Done: 37049 (26/74)
Done: 37017 (27/74)
Done: 37151 (28/74)
Done: 37177 (29/74)
Done: 37059 (30/74)
Done: 37127 (31/74)
Done: 37073 (32/74)
Done: 37131 (33/74)
Done: 37083 (34/74)
Done: 37069 (35/74)
Done: 37091 (36/74)
Done: 37143 (37/74)
Done: 37145 (38/74)
Done: 37155 (39/74)
Done: 37187 (40/74)
Done: 37027 (41/74)
Done: 37157 (42/74)
Done: 37197 (43/74)
Done: 37129 (44/74)
Done: 37067 (45/74)
Done: 37057 (46/74)
Done: 37003 (47/74)
Done: 37189 (48/74)
Done: 37047 (49/74)
Done: 37101 (50/74)
Done: 37193 (51/74)
Done: 37183 (52/74)
Done: 37063 (53/74)
Done: 37071 (54/74)
Done: 37109 (55/74)
Done: 37137 (56/74)
Done: 37013 (57/74)
Done: 37001 (58/74)
Done: 37103 (59/74)
Done: 37159 (60/74)
Done: 37141 (61/74)
Done: 37165 (62/74)
Done: 37035 (63/74)
Done: 37081 (64/74)
Done: 37085 (65/74)
Done: 37147 (66/74)
Done: 37065 (67/74)
Done: 37011 (68/74)
Done: 37077 (69/74)
Done: 37007 (70/74)
Done: 37023 (71/74)
Done: 37125 (72/74)
Done: 37169 (73/74)
Done: 37171 (74/74)
Show / hide code
stats_df <- list_rbind(stats_list)
write_csv(stats_df, "/proj/mhinolab/users/rbless/data/Obstacles_Output/parcel_cl_str_stats.csv")
In [6]:
Show / hide code
stats_df <- readr::read_csv(
  "/proj/mhinolab/users/rbless/data/Obstacles_Output/parcel_cl_str_stats.csv",
  col_types = cols(.default = col_character())
) |>
  mutate(
    total_before_merge = as.integer(total_before_merge),
    matched            = as.integer(matched),
    pct_matched        = as.numeric(pct_matched),
    duplicates         = as.integer(duplicates)
  )

total_matched   <- sum(stats_df$matched, na.rm = TRUE)
total_parcels   <- sum(stats_df$total_before_merge, na.rm = TRUE)
total_unmatched <- total_parcels - total_matched
pct_unmatched   <- (total_unmatched / total_parcels) * 100
prior_benchmark <- 3237387L
diff_from_prior <- total_matched - prior_benchmark
zero_parcel     <- stats_df |> filter(total_before_merge == 0)

Summary of Matching Results

This code follows the same county-by-county strategy across all 74 counties in the study area, joining on the best available parcel ID field for each county (parno, altparno, or nparno on the parcel side; APN (PARCEL NUMBER UNFORMATTED), ALTERNATE PARCEL ID, ORIGINAL APN, or ONLINE FORMATTED PARCEL ID on the CoreLogic side).

This run produced 3,248,412 matched records, compared to the prior benchmark of 3,237,387 (difference: 11,025).

5 counties (37071, 37109, 37035, 37011, 37023) had no parcels in the OneMap data and therefore produced no matches. This reflects a data gap in the parcel source rather than a matching failure, and these counties are carried forward in remaining_all for follow-up.

In [7]:
Show / hide code
library(gt)

tibble::tibble(
  Group = c(rep("This Run", 4), rep("Comparison to Prior Analysis", 2)),
  Metric = c(
    "Total Parcels", "Matched", "Unmatched", "% Unmatched",
    "Prior Benchmark", "Difference from Prior"
  ),
  Value = c(
    scales::comma(total_parcels),
    scales::comma(total_matched),
    scales::comma(total_unmatched),
    paste0(round(pct_unmatched, 2), "%"),
    scales::comma(prior_benchmark),
    scales::comma(diff_from_prior)
  )
) |>
  gt::gt(groupname_col = "Group") |>
  gt::tab_header(title = "Match Summary") |>
  gt::cols_label(Metric = "", Value = "")
Match Summary
This Run
Total Parcels 3,820,824
Matched 3,248,412
Unmatched 572,412
% Unmatched 14.98%
Comparison to Prior Analysis
Prior Benchmark 3,237,387
Difference from Prior 11,025