Source code for src.preprocess.check
import pandas as pd
import logging
from datetime import datetime
[docs]def check_input(records: pd.DataFrame, column_config: pd.DataFrame, date_config: pd.DataFrame, dataset: str):
"""
Function to unify all input checks.
Parameters
----------
records : pd.DataFrame
Dataframe of provider data.
column_config : pd.DataFrame
Reference for accepted column names.
date_config : pd.DataFrame
Reference for accepted date formats.
dataset : str
Name of provider dataset.
Returns
-------
type
Description of returned object.
"""
check_column_names(records, column_config)
check_date_format(records, date_config, dataset)
[docs]def check_column_names(records: pd.DataFrame, config: pd.DataFrame, log: bool = True):
"""
Function to check that column names agree with config or raise exception.
Parameters
----------
records : pd.DataFrame
Dataframe of provider data.
config : pd.DataFrame
Reference for accepted column names.
log : bool
Whether or not to log results of checks.
Returns
-------
None
"""
dataset = list(config['dataset'].unique())[0]
try:
assert set(records.columns) == set(config['column'])
if log:
logging.info('INPUT_CHECK_SUCCESS=%s input columns OK.' % dataset)
except Exception as e:
present_in_input = set(records.columns).difference(set(config['column']))
present_in_config = set(config['column']).difference(set(records.columns))
message = 'INPUT_CHECK_FAILURE=Unexpected %s columns. Present in input: %s, present in config: %s' % (dataset, present_in_input, present_in_config)
if log:
logging.info(message)
#raise e
[docs]def check_date_format(data: pd.DataFrame,
config: pd.DataFrame,
dataset: str,
log: bool = True):
"""
Check that an input date is in the expected format.
Parameters
----------
data : pd.DataFrame
Dataframe of provider data..
config : pd.DataFrame
Reference for accepted date formats.
dataset : str
Name of provider dataset.
log : bool
Whether or not to log results of checks.
Returns
-------
None
"""
format = config.loc[config['dataset'] == dataset, 'format'].item()
date_column = config.loc[config['dataset'] == dataset, 'date_column'].item()
res = [validate_date_format(x, format) for x in data[date_column] if x is None]
try:
assert len(res) == 0
if log:
logging.info('INPUT_CHECK_SUCCESS=%s %s date format is %s.' % (dataset, date_column, format))
except Exception:
if log:
logging.info('INPUT_CHECK_FAILURE=%s %s %d dates not in the format %s.' % (dataset, date_column, len(res), format))
[docs]def validate_date_format(date, format):
"""
Return None if a date format does not parse.
Parameters
----------
date : type
Input date string.
format : type
Input accpeted format to try.
Returns
-------
type
Returns date on successful parse or None on parsing failure.
"""
try:
return(datetime.strptime(date, format))
except Exception:
return(None)