Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions reggie/configs/data/maine.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,35 @@ ordered_columns:
- KCB
- ACF
- DT CHG

# In June 2026 Maine renamed some columns. This maps each new column name found
# in the voter file (key) to the legacy column name the pipeline expects (value).
rename_columns:
REG_MUNI: REG TOWN
VOTER_REC_NUM: VOTER ID
LAST_NAME: LAST NAME
FIRST_NAME: FIRST NAME
MIDDLE_NAME: MIDDLE NAME
SUFFIX: SUFF
RES_STR_NUM: ST NUM
RES_SUF_A: ST NUM A
RES_SUF_B: ST NUM B
RES_STR_NAME: ST NAME1
RES_UNIT_TYPE: ST NAME2
RES_UNIT_NUM: UNIT
RES_MUNI: CITY # same
RES_ST: STATE
RES_ZIP5: ZIP
RES_ZIP4: ZIP4
MAIL_STR_NUM: MAIL ST NUM
MAIL_SUF_A: MAIL ST NUM A
MAIL_SUF_B: MAIL ST NUM B
MAIL_STR_NAME: MAIL ST NAME1
MAIL_UNIT_NUM: MAIL UNIT
MAIL_MUNI: MAIL CITY
MAIL_ST: MAIL STATE
VOTER_STATUS: STATUS
W-P: WP
REG_DATE: DT ACCEPT # DT_EFFECT is no longer provided, and there appears to be no status-change date either

county_codes:
Androscoggin: 01and
Expand Down
13 changes: 10 additions & 3 deletions reggie/ingestion/preprocessor/maine_preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,11 +104,14 @@ def keep_most_recent_record(voters_df, cancelled_df):
cancelled_df = pd.DataFrame()
hist_df = pd.DataFrame()
for file in new_files:
if "voter.txt" in file["name"].lower(): # needs extension
if ".html" in file["name"].lower():
continue
elif "voter.txt" in file["name"].lower() or ("a&i" and "partycampaignusevoterfile") in file["name"].lower():
logging.info(f"voter file found: {file['name']}")
voter_df = self.read_csv_count_error_lines(
file["obj"], sep="|", dtype="str", on_bad_lines="warn"
)
voter_df.rename(columns=self.config["rename_columns"], inplace=True)
voter_df_shape_before = voter_df.shape
voter_df.dropna(subset=["VOTER ID"], inplace=True)
voter_df_shape_after = voter_df.shape
Expand All @@ -121,9 +124,13 @@ def keep_most_recent_record(voters_df, cancelled_df):
logging.info(
f"Dropped {voter_df_shape_before[0] - voter_df_shape_after[0]} rows due to NaN county values"
)

# As of June 2026, Maine no longer provides reason codes, so add an empty
# REASON column when it is absent.
# TODO: list the other columns that are missing from the new file format.
if "REASON" not in voter_df.columns:
voter_df["REASON"] = np.NAN
elif (
"history" in file["name"].lower()
and ".html" not in file["name"].lower()
):
# Maine voter history seems to arrive as one file per election.
# Sometimes the delivery also includes a history-report HTML file, which we skip.
Expand All @@ -133,7 +140,7 @@ def keep_most_recent_record(voters_df, cancelled_df):
)
logging.info(f"concatenating {file['name']}")
hist_df = pd.concat([hist_df, new_hist])
elif "cancelled" in file["name"].lower():
elif "cancelled" or "cxl" in file["name"].lower():
# Note: the cancelled file does not have a county column
logging.info(f"cancelled file found: {file['name']}")

Expand Down
Loading