Source code for src.master.main

import pandas as pd


def get_new_records(records: pd.DataFrame, previous_update: pd.DataFrame, cols: list) -> pd.DataFrame:
    """
    Identify records in an update that were not present in the previous update.

    Rows are matched on a "combo string": the values of `cols`, converted to
    strings and pasted together with "_" (see `get_combo_string`).

    Example: given `cols` = `['country_territory_area', 'date_start']`, a row
    gets the combo string "United States of America_2020-01-01". Any row in
    `records` whose combo string also occurs in `previous_update` is not
    recognised as a new record.

    Parameters
    ----------
    records : pd.DataFrame
        Newly updated data.
    previous_update : pd.DataFrame
        Previously updated data.
    cols : list
        Columns to be considered when matching records. Must exist in both
        `records` and `previous_update`.

    Returns
    -------
    pd.DataFrame
        The rows of `records` (all original columns, original index) whose
        combo string does not occur in `previous_update`. The result is an
        independent copy; neither input is modified.

    Notes
    -----
    Matching is done on the pasted strings, so values are compared by their
    string form, and distinct value combinations that paste to the same
    string (e.g. ("a_b", "c") and ("a", "b_c")) are treated as equal.
    """
    # Keep the keys as standalone Series instead of attaching a temporary
    # column: this avoids copying both frames and cannot clobber (or drop)
    # a genuine column that happens to be called 'combo_string'.
    record_keys = get_combo_string(records, cols)
    previous_keys = set(get_combo_string(previous_update, cols))

    # A record is new exactly when its key is absent from the previous update.
    is_new = ~record_keys.isin(previous_keys)

    # Use a positional (NumPy) mask so rows are selected by position even
    # when the index holds duplicate labels; copy so callers can modify the
    # result without touching `records`.
    return records.loc[is_new.to_numpy(), :].copy()
def get_combo_string(records: pd.DataFrame, cols: list) -> pd.Series:
    """
    Paste column values together, separated by '_'.

    Each value in `cols` is converted to its string form and the values of a
    row are joined in the order given by `cols`.

    Parameters
    ----------
    records : pd.DataFrame
        Input dataset.
    cols : list
        Columns to be pasted together.

    Returns
    -------
    pd.Series
        One pasted string per row, indexed like `records`. An input with no
        rows yields an empty Series.
    """
    # Convert once for the whole selection rather than column by column.
    as_text = records[cols].astype(str)

    # Join row by row on the underlying array. Unlike a row-wise
    # `DataFrame.apply`/`agg`, this always produces one string per row and
    # therefore a Series, including when `records` has no rows.
    pasted = ['_'.join(row) for row in as_text.to_numpy(dtype=object)]

    # Explicit object dtype keeps the empty case a string-like Series too.
    return pd.Series(pasted, index=records.index, dtype=object)