Source code for src.postprocess.JH_HIT

import re
import pandas as pd


def postprocess(data: pd.DataFrame):
    """
    Apply dataset-level transformations to JH_HIT data.

    Each (who_code, id_stub) pair listed below is passed, in order, to
    `combine_measures`, and the output of one pass is the input of the next.

    Parameters
    ----------
    data : pd.DataFrame
        Input JH_HIT data.

    Returns
    -------
    pd.DataFrame
        JH_HIT data with transformations applied.
    """
    # (who_code, id_stub) pairs, applied sequentially in this order.
    combinations = (
        ('4.1.2', '_school_closure'),
        ('5.7', '_border_air'),
        ('5.8', '_border_sea'),
        ('5.9', '_border_land'),
    )

    for who_code, id_stub in combinations:
        data = combine_measures(data, who_code, id_stub)

    return data
def combine_measures(data: pd.DataFrame, who_code: str, id_stub: str) -> pd.DataFrame:
    """
    Combine groups of records with an arbitrary `who_code`.

    Records whose `who_code` equals the target are grouped by the first run
    of digits found in their `prop_id`, and each group is collapsed into a
    single record. All other records are passed through unchanged.

    Example:

    Groups are defined by records with identical numeric `prop_id` values:

        333_school_secondary, 333_school_nursery, 333_school_primary etc.
            -> 333_school_closure   (id_stub='_school_closure')

    For each combined record:

    * every column takes the value from the first record of the group;
    * `targeted` becomes the ', '-joined `targeted` values of the group, or
      None if the column is absent or holds non-string values;
    * `prop_id` becomes the numeric id followed by `id_stub`;
    * `dataset` is set to 'JH_HIT'.

    Parameters
    ----------
    data : pd.DataFrame
        Input data. Must contain a `who_code` column, and a `prop_id` column
        if any record matches `who_code`. The input is not modified.
    who_code : str
        `who_code` to combine.
    id_stub : str
        Stub name to add to combined ID numbers.

    Returns
    -------
    pd.DataFrame
        Non-matching records (original index labels kept) followed by the
        combined records (indexed from 0, ordered by numeric id sorted as a
        string).

    Raises
    ------
    IndexError
        If a matching record's `prop_id` contains no digits.
    """
    # One mask and its complement: every row lands in exactly one partition,
    # so no record can be dropped. Missing comparisons (nullable dtypes) are
    # treated as "not the target who_code".
    is_target = data['who_code'].eq(who_code).fillna(False).astype(bool)

    # Copy after slicing so the caller's frame is never written to and the
    # column assignment below does not operate on a view.
    other_data = data.loc[~is_target].copy()

    # Nothing to combine: hand back the untouched records.
    if not is_target.any():
        return other_data

    records = data.loc[is_target].copy()

    # Extract the first run of digits from each prop_id
    records['prop_id_numeric'] = [
        re.findall(r'\d+', prop_id)[0] for prop_id in records['prop_id']
    ]

    combined = []
    # Iterating a groupby yields groups in sorted key order.
    for _, id_group in records.groupby('prop_id_numeric'):
        # Representative value per column: the first record in the group.
        row = {col: id_group[col].iloc[0] for col in id_group.columns}

        try:
            row['targeted'] = ', '.join(id_group['targeted'])
        except (KeyError, TypeError):
            # No `targeted` column, or it holds non-string values (e.g. NaN).
            row['targeted'] = None

        row['prop_id'] = row.pop('prop_id_numeric') + id_stub
        row['dataset'] = 'JH_HIT'

        combined.append(row)

    return pd.concat([other_data, pd.DataFrame(combined)])