# Source code for src.manually_cleaned.main
import pandas as pd
def adjust_manually_cleaned(manually_cleaned: pd.DataFrame):
    """
    Apply the unified value adjustments to manually cleaned data.

    The only adjustment performed here is the one implemented by
    `update_measure_stage_date`.

    Parameters
    ----------
    manually_cleaned : pd.DataFrame
        Manually cleaned data.

    Returns
    -------
    pd.DataFrame
        Whatever `update_measure_stage_date` returns for the given data.
    """
    adjusted = update_measure_stage_date(manually_cleaned)
    return adjusted
def update_following_measures(manually_cleaned: pd.DataFrame):
    """
    Update `date_end` values for records that have been assigned a following record.

    A record "has a following record" when its `following_measure_number` is
    not null. The following record is looked up by matching that number
    against `who_id`. When a match exists, the record's `date_end` is set to
    the following record's `date_start` and its `reason_ended` is set to the
    following record's `measure_stage`. When no record carries that `who_id`,
    the original `date_end` and `reason_ended` are kept. If several records
    share the same `who_id`, the first one in the frame is used.

    The input frame is not modified.

    Parameters
    ----------
    manually_cleaned : pd.DataFrame
        Manually cleaned data. Must contain the columns
        `following_measure_number`, `who_id`, `date_start`, `date_end`,
        `measure_stage` and `reason_ended`.

    Returns
    -------
    pd.DataFrame
        Manually cleaned data with `date_end` values adjusted. Records that
        have a following record come first, followed by all other records.
    """
    # Build the mask from the column itself so it shares the frame's index;
    # a freshly built Series would only align with a default RangeIndex.
    has_following_measure = manually_cleaned['following_measure_number'].notna()
    not_to_alter = manually_cleaned[~has_following_measure]
    if not has_following_measure.any():
        # Nothing to adjust; avoids concatenating an empty list of records.
        return not_to_alter

    # Copy so the caller's frame is left untouched.
    to_alter = manually_cleaned[has_following_measure].copy()

    # One row per who_id (first occurrence) so each lookup yields a scalar.
    followers = manually_cleaned.drop_duplicates(subset='who_id').set_index('who_id')
    following_numbers = to_alter['following_measure_number']
    is_matched = following_numbers.isin(followers.index)

    # Replace values only where a following record was actually found;
    # unmatched records keep their current values.
    to_alter['date_end'] = to_alter['date_end'].mask(
        is_matched, following_numbers.map(followers['date_start'])
    )
    to_alter['reason_ended'] = to_alter['reason_ended'].mask(
        is_matched, following_numbers.map(followers['measure_stage'])
    )

    return pd.concat([to_alter, not_to_alter])
def update_measure_stage_date(manually_cleaned: pd.DataFrame):
    """
    Fill in `date_end` and `reason_ended` for records whose stage is "finish".

    For every record where `measure_stage` equals "finish" and `date_end` is
    null, `reason_ended` is set to "finish" and `date_end` is set to the
    record's own `date_start`. Records that already have a `date_end` are
    left as they are.

    The frame is modified in place and the same object is returned.

    Parameters
    ----------
    manually_cleaned : pd.DataFrame
        Manually cleaned data.

    Returns
    -------
    pd.DataFrame
        Manually cleaned data with adjustments.
    """
    # Compute the selection once, before either column is written.
    missing_end = manually_cleaned['date_end'].isna()
    finished = manually_cleaned['measure_stage'].eq('finish')
    finish_without_end = missing_end & finished

    manually_cleaned.loc[finish_without_end, "reason_ended"] = 'finish'
    start_dates = manually_cleaned.loc[finish_without_end, "date_start"]
    manually_cleaned.loc[finish_without_end, "date_end"] = start_dates
    return manually_cleaned
def columns_to_lower(manually_cleaned: pd.DataFrame, lowercase_columns: list):
    """
    Set all values in a column to lowercase.

    Each listed column is validated first: every non-null value must be a
    `str`. Null values are ignored by the check and stay null. The frame is
    modified in place and the same object is returned.

    Parameters
    ----------
    manually_cleaned : pd.DataFrame
        Manually cleaned data.
    lowercase_columns : list
        list of columns to transform to lowercase.

    Returns
    -------
    pd.DataFrame
        Manually cleaned data with conversion applied.

    Raises
    ------
    AssertionError
        If a listed column contains a non-null value that is not a string.
    """
    for col in lowercase_columns:
        present = manually_cleaned[col].dropna()
        # Raise explicitly instead of using an `assert` statement, which is
        # stripped under `python -O` and would let bad columns through.
        if not all(isinstance(value, str) for value in present):
            raise AssertionError('Column {} does not only contain strings'.format(col))
        if present.empty:
            # Nothing to lowercase; the `.str` accessor also rejects
            # all-null columns that are stored with a non-object dtype.
            continue
        manually_cleaned[col] = manually_cleaned[col].str.lower()
    return manually_cleaned