Source code for src.manually_cleaned.main

import pandas as pd


def adjust_manually_cleaned(manually_cleaned: pd.DataFrame):
    """
    Unify value adjustments in manually cleaned data.

    Currently the only adjustment applied is
    ``update_measure_stage_date``.

    Parameters
    ----------
    manually_cleaned : pd.DataFrame
        Manually cleaned data.

    Returns
    -------
    pd.DataFrame
        The value returned by ``update_measure_stage_date`` for the
        given data.

    """
    # Single entry point for all adjustments; hand the frame straight to
    # the one adjustment step that exists so far.
    return update_measure_stage_date(manually_cleaned)
def update_following_measures(manually_cleaned: pd.DataFrame):
    """
    Update `date_end` values for records that have been assigned a following record.

    For every record whose `following_measure_number` is not null, the
    record with a matching `who_id` is looked up. When one is found, the
    current record's `date_end` is set to the following record's
    `date_start` and its `reason_ended` to the following record's
    `measure_stage`. When several records share that `who_id`, the first
    one (in frame order) is used. Records without a match are left as is.

    The input frame is not modified.

    Parameters
    ----------
    manually_cleaned : pd.DataFrame
        Manually cleaned data. Must contain the columns `who_id`,
        `following_measure_number`, `date_start`, `date_end`,
        `measure_stage` and `reason_ended`.

    Returns
    -------
    pd.DataFrame
        Manually cleaned data with `date_end` values adjusted. Records
        that have a following measure come first, followed by all other
        records.

    """
    # Build the mask from the column itself so it shares the frame's index;
    # a freshly constructed Series would carry a 0..n-1 index and misalign
    # on any frame that has been filtered or re-indexed.
    has_following_measure = manually_cleaned['following_measure_number'].notna()

    not_to_alter = manually_cleaned[~has_following_measure]
    if not has_following_measure.any():
        # Nothing to update (also covers an empty frame).
        return not_to_alter

    # Work on a copy so the caller's frame is left untouched.
    to_alter = manually_cleaned[has_following_measure].copy()

    # Collect the new values positionally; this stays correct even when the
    # frame's index contains duplicate labels.
    date_end = to_alter['date_end'].tolist()
    reason_ended = to_alter['reason_ended'].tolist()

    for pos, following_measure_number in enumerate(to_alter['following_measure_number']):
        following_measure = manually_cleaned.loc[
            manually_cleaned['who_id'] == following_measure_number
        ]
        if following_measure.empty:
            # No record with that who_id: keep the existing values.
            continue
        # Take scalar values from the first match rather than whole columns.
        date_end[pos] = following_measure['date_start'].iloc[0]
        reason_ended[pos] = following_measure['measure_stage'].iloc[0]

    to_alter['date_end'] = date_end
    to_alter['reason_ended'] = reason_ended

    return pd.concat([to_alter, not_to_alter])
def update_measure_stage_date(manually_cleaned: pd.DataFrame):
    """
    Fill in `date_end` and `reason_ended` for "finish" records.

    For every record whose `measure_stage` is "finish" and whose
    `date_end` is null, `reason_ended` is set to "finish" and `date_end`
    is set to that record's `date_start`. Records that already have a
    `date_end` are not touched.

    The frame is modified in place and also returned.

    Parameters
    ----------
    manually_cleaned : pd.DataFrame
        Manually cleaned data.

    Returns
    -------
    pd.DataFrame
        Manually cleaned data with adjustments (the same object that was
        passed in).

    """
    # Select the rows once, before any column is modified.
    missing_end = manually_cleaned['date_end'].isna()
    finished = manually_cleaned['measure_stage'].eq('finish')
    needs_update = missing_end & finished

    manually_cleaned.loc[needs_update, 'reason_ended'] = 'finish'

    # A "finish" record ends on the day it starts.
    start_dates = manually_cleaned.loc[needs_update, 'date_start']
    manually_cleaned.loc[needs_update, 'date_end'] = start_dates

    return manually_cleaned
def columns_to_lower(manually_cleaned: pd.DataFrame, lowercase_columns: list):
    """
    Set all values in a column to lowercase.

    Each listed column is checked first: every non-null value must be a
    ``str``. The column is then replaced in place by its lowercased
    version. Columns listed before a failing column have already been
    converted when the error is raised.

    Parameters
    ----------
    manually_cleaned : pd.DataFrame
        Manually cleaned data. Modified in place.
    lowercase_columns : list
        list of columns to transform to lowercase.

    Returns
    -------
    pd.DataFrame
        Manually cleaned data with conversion applied (the same object
        that was passed in).

    Raises
    ------
    AssertionError
        If a listed column contains a non-null value that is not a string.

    """
    for col in lowercase_columns:
        # Explicit check instead of an `assert` statement: asserts are
        # removed when Python runs with -O, which would silently skip the
        # validation. The exception type is kept for existing callers.
        only_strings = all(
            isinstance(x, str) for x in manually_cleaned[col] if not pd.isna(x)
        )
        if not only_strings:
            raise AssertionError('Column {} does not only contain strings'.format(col))

        manually_cleaned[col] = manually_cleaned[col].str.lower()

    return manually_cleaned