diff --git a/reggie/configs/data/maine.yaml b/reggie/configs/data/maine.yaml
index f5f87d714..61e2aca6e 100644
--- a/reggie/configs/data/maine.yaml
+++ b/reggie/configs/data/maine.yaml
@@ -253,6 +253,35 @@ ordered_columns:
   - KCB
   - ACF
   - DT CHG
+
+# In June 2026 Maine renamed some columns. Keys are the NEW names found in the file, values are the legacy names the pipeline still uses.
+rename_columns:
+  REG_MUNI: REG TOWN
+  VOTER_REC_NUM: VOTER ID
+  LAST_NAME: LAST NAME
+  FIRST_NAME: FIRST NAME
+  MIDDLE_NAME: MIDDLE NAME
+  SUFFIX: SUFF
+  RES_STR_NUM: ST NUM
+  RES_SUF_A: ST NUM A
+  RES_SUF_B: ST NUM B
+  RES_STR_NAME: ST NAME1
+  RES_UNIT_TYPE: ST NAME2
+  RES_UNIT_NUM: UNIT
+  RES_MUNI: CITY # same
+  RES_ST: STATE
+  RES_ZIP5: ZIP
+  RES_ZIP4: ZIP4
+  MAIL_STR_NUM: MAIL ST NUM
+  MAIL_SUF_A: MAIL ST NUM A
+  MAIL_SUF_B: MAIL ST NUM B
+  MAIL_STR_NAME: MAIL ST NAME1
+  MAIL_UNIT_NUM: MAIL UNIT
+  MAIL_MUNI: MAIL CITY
+  MAIL_ST: MAIL STATE
+  VOTER_STATUS: STATUS
+  W-P: WP
+  REG_DATE: DT ACCEPT # DT_EFFECT no longer exists; there seems to be no status change date either
 
 county_codes:
   Androscoggin: 01and
diff --git a/reggie/ingestion/preprocessor/maine_preprocessor.py b/reggie/ingestion/preprocessor/maine_preprocessor.py
index 80825c39a..3c0201de7 100644
--- a/reggie/ingestion/preprocessor/maine_preprocessor.py
+++ b/reggie/ingestion/preprocessor/maine_preprocessor.py
@@ -104,11 +104,14 @@ def keep_most_recent_record(voters_df, cancelled_df):
         cancelled_df = pd.DataFrame()
         hist_df = pd.DataFrame()
         for file in new_files:
-            if "voter.txt" in file["name"].lower():  # needs extension
+            if ".html" in file["name"].lower():
+                continue
+            elif "voter.txt" in file["name"].lower() or ("a&i" in file["name"].lower() and "partycampaignusevoterfile" in file["name"].lower()):
                 logging.info(f"voter file found: {file['name']}")
                 voter_df = self.read_csv_count_error_lines(
                     file["obj"], sep="|", dtype="str", on_bad_lines="warn"
                 )
+                voter_df.rename(columns=self.config["rename_columns"], inplace=True)
                 voter_df_shape_before = voter_df.shape
                 voter_df.dropna(subset=["VOTER ID"], inplace=True)
                 voter_df_shape_after = voter_df.shape
@@ -121,9 +124,13 @@ def keep_most_recent_record(voters_df, cancelled_df):
                 logging.info(
                     f"Dropped {voter_df_shape_before[0] - voter_df_shape_after[0]} rows due to NaN county values"
                 )
+
+                # As of June 2026, there are no more reason codes in Maine, so add an empty REASON column.
+                # TODO: the other columns that are now missing still need to be listed/handled here.
+                if "REASON" not in voter_df.columns:
+                    voter_df["REASON"] = np.nan
             elif (
                 "history" in file["name"].lower()
-                and ".html" not in file["name"].lower()
             ):
                 # Maine Voter History seems to come one file per election,
                 # Sometimes they have a history report html file that we skip
@@ -133,7 +140,7 @@ def keep_most_recent_record(voters_df, cancelled_df):
                 )
                 logging.info(f"concatenating {file['name']}")
                 hist_df = pd.concat([hist_df, new_hist])
-            elif "cancelled" in file["name"].lower():
+            elif "cancelled" in file["name"].lower() or "cxl" in file["name"].lower():
                 # Note: the cancelled file does not have a county column
                 logging.info(f"cancelled file found: {file['name']}")