Source code for src.postprocess.CDC_ITF

import pandas as pd
import logging


def postprocess(data: pd.DataFrame):
    """
    Run the dataset-level cleaning steps for CDC_ITF data.

    Currently the only step is duplicate handling, delegated to
    `remove_id_duplicates`.

    Parameters
    ----------
    data : pd.DataFrame
        Input CDC_ITF data.

    Returns
    -------
    pd.DataFrame
        CDC_ITF data with the transformations applied.
    """
    return remove_id_duplicates(data)
def remove_id_duplicates(data: pd.DataFrame):
    """
    Collapse CDC_ITF records that repeat the same reference values.

    Rows are flagged as duplicates when more than one row shares the same
    values in every column of `ref_cols`. Flagged rows whose `measure_stage`
    is 'Lift' and whose `prop_id` is not null are kept but relabelled:
    `who_code` becomes 12 and `prov_subcategory` / `prov_measure` become
    'duplicate'. Every other row that shares a `prop_id` with a relabelled
    row is dropped. The number of rows lost is reported via
    `logging.warning`.

    Parameters
    ----------
    data : pd.DataFrame
        Input CDC_ITF data. Must contain the `ref_cols` columns as well as
        `dataset`, `measure_stage` and `prop_id`.

    Returns
    -------
    pd.DataFrame
        Data with duplicates removed. The relabelled rows are appended after
        the surviving rows; the original row order and index are not
        guaranteed to be preserved because of the intermediate merge.
    """
    n_records = len(data)

    ref_cols = ['country_territory_area', 'prov_subcategory',
                'prov_measure', 'comments', 'link', 'date_start']

    # Count rows per reference-key combination. Only the `dataset` count is
    # used, so aggregate that single column instead of the whole frame.
    key_counts = data.groupby(ref_cols)['dataset'].count().reset_index()

    # Explicit copy so that adding the flag column never writes into a slice.
    dup_ref = key_counts.loc[key_counts['dataset'] > 1, ref_cols].copy()
    dup_ref['duplicate'] = True

    # Attach the flag to every row; rows without a duplicated key get NaN.
    data = pd.merge(data, dup_ref, how='outer', on=ref_cols)

    # Flagged 'Lift' rows with a usable prop_id are the ones to relabel.
    is_lifted_dup = data['duplicate'].eq(True) & data['measure_stage'].eq('Lift')
    dups = data.loc[is_lifted_dup, :].dropna(subset=['prop_id']).copy()
    dup_ids = dups['prop_id'].unique()

    dups.loc[:, 'who_code'] = 12
    dups.loc[:, 'prov_subcategory'] = 'duplicate'
    dups.loc[:, 'prov_measure'] = 'duplicate'

    # Drop every row sharing a prop_id with a relabelled row (vectorized
    # membership test), then add the relabelled rows back.
    data = data.loc[~data['prop_id'].isin(dup_ids), :]
    data = pd.concat([data, dups])
    data = data.drop('duplicate', axis=1)

    logging.warning('Missing %d CDC Duplicates.', n_records - len(data))

    return data