Source code for pyhealth.utils.utility

# -*- coding: utf-8 -*-
"""A set of utility functions to support outlier detection.
"""
# Author: Yue Zhao <zhaoy@cmu.edu>
# License: BSD 2 clause

from __future__ import division
from __future__ import print_function

import os
import numpy as np
import pandas as pd
from numpy import percentile
import numbers

import sklearn
from sklearn.metrics import precision_score
from sklearn.preprocessing import StandardScaler

from sklearn.utils import column_or_1d
from sklearn.utils import check_array
from sklearn.utils import check_consistent_length

from sklearn.utils import check_random_state
from sklearn.utils.random import sample_without_replacement

# Sentinel default bounds used by ``check_parameter``: the largest signed
# 32-bit integer and its negation (note: not the int32 minimum, which is one
# smaller).
MAX_INT = np.iinfo(np.int32).max
MIN_INT = -MAX_INT

def make_dirs_if_not_exists(save_dir):
    """Create ``save_dir`` (including missing parents) if it does not exist.

    Parameters
    ----------
    save_dir : str or path-like
        The directory to create.

    Raises
    ------
    OSError
        If the directory cannot be created, e.g. ``save_dir`` already exists
        but is not a directory.
    """
    # ``exist_ok=True`` makes the call idempotent and avoids the race between
    # a separate ``os.path.isdir`` check and ``os.makedirs`` when several
    # processes try to create the same directory at once.
    os.makedirs(save_dir, exist_ok=True)
def read_csv_to_df(file_loc, header_lower=True, usecols=None, dtype=None,
                   low_memory=True, encoding=None):
    """Read a CSV file into a DataFrame, optionally lower-casing the header.

    Parameters
    ----------
    file_loc : str, path object or file-like object
        Location of the CSV file; passed as-is to ``pandas.read_csv``.

    header_lower : bool, optional (default=True)
        Whether to convert all column names to lower case.

    usecols : list-like or callable, optional (default=None)
        Forwarded to ``pandas.read_csv`` as ``usecols``.

    dtype : type name or dict of column -> type, optional (default=None)
        Forwarded to ``pandas.read_csv`` as ``dtype``.

    low_memory : bool, optional (default=True)
        Forwarded to ``pandas.read_csv`` as ``low_memory``.

    encoding : str, optional (default=None)
        Forwarded to ``pandas.read_csv`` as ``encoding``.

    Returns
    -------
    df : pandas.DataFrame
        The loaded table.
    """
    # ``dtype=None`` is already the pandas default, so one call covers both
    # the "dtype given" and "dtype omitted" cases.
    df = pd.read_csv(file_loc, usecols=usecols, dtype=dtype,
                     low_memory=low_memory, encoding=encoding)

    if header_lower:
        df.columns = df.columns.str.lower()

    return df
def read_excel_to_df(file_loc, header_lower=True, usecols=None, dtype=None,
                     low_memory=True, encoding=None):
    """Read an Excel file into a DataFrame, optionally lower-casing the header.

    Parameters
    ----------
    file_loc : str, path object or file-like object
        Location of the Excel file; passed as-is to ``pandas.read_excel``.

    header_lower : bool, optional (default=True)
        Whether to convert string column names to lower case.

    usecols : str, list-like or callable, optional (default=None)
        Forwarded to ``pandas.read_excel`` as ``usecols``.

    dtype : type name or dict of column -> type, optional (default=None)
        Forwarded to ``pandas.read_excel`` as ``dtype``.

    low_memory : bool, optional (default=True)
        Accepted only so the signature matches ``read_csv_to_df``; it is not
        forwarded because ``pandas.read_excel`` has no such option.

    encoding : str, optional (default=None)
        Accepted only so the signature matches ``read_csv_to_df``; it is not
        forwarded because ``pandas.read_excel`` has no such option.

    Returns
    -------
    df : pandas.DataFrame
        The loaded table.
    """
    # ``low_memory`` and ``encoding`` are CSV text-parser options. Passing
    # them to ``read_excel`` raises TypeError (unexpected keyword argument)
    # on current pandas, so they are deliberately left out of the call.
    # ``dtype=None`` is the pandas default, so no branching is needed.
    df = pd.read_excel(file_loc, usecols=usecols, dtype=dtype)

    if header_lower:
        # Excel headers may hold non-string labels (numbers, dates); the
        # ``.str`` accessor would turn those into NaN or raise, so only
        # genuine strings are lower-cased.
        df.columns = [
            label.lower() if isinstance(label, str) else label
            for label in df.columns
        ]

    return df
def check_parameter(param, low=MIN_INT, high=MAX_INT, param_name='',
                    include_left=False, include_right=False):
    """Check if an input is within the defined range.

    Parameters
    ----------
    param : int, float
        The input parameter to check.

    low : int, float
        The lower bound of the range.

    high : int, float
        The higher bound of the range.

    param_name : str, optional (default='')
        The name of the parameter, used in error messages.

    include_left : bool, optional (default=False)
        Whether includes the lower bound (lower bound <=).

    include_right : bool, optional (default=False)
        Whether includes the higher bound (<= higher bound).

    Returns
    -------
    within_range : bool
        Always True; an out-of-range value raises instead of returning False.

    Raises
    ------
    TypeError
        If ``param``, ``low`` or ``high`` is not numerical.
    ValueError
        If neither bound is specified, if ``low > high``, or if ``param``
        falls outside the requested range.
    """
    # ``np.float`` (a plain alias of the builtin ``float``) was removed in
    # NumPy 1.24; use ``float`` directly and also accept NumPy float scalars.
    numeric_types = (numbers.Integral, np.integer, float, np.floating)

    # param, low and high should all be numerical
    if not isinstance(param, numeric_types):
        raise TypeError('{param_name} is set to {param} Not numerical'.format(
            param=param, param_name=param_name))

    if not isinstance(low, numeric_types):
        raise TypeError('low is set to {low}. Not numerical'.format(low=low))

    if not isinstance(high, numeric_types):
        raise TypeError('high is set to {high}. Not numerical'.format(
            high=high))

    # at least one of the bounds should be specified; compare by value, since
    # identity checks on integers only hold for the very same object
    if low == MIN_INT and high == MAX_INT:
        raise ValueError('Neither low nor high bounds is defined')

    # if wrong bound values are used
    if low > high:
        raise ValueError('Lower bound > Higher bound')

    # a bound is violated when param is strictly beyond it, or equal to it
    # while that bound is exclusive
    below = param < low if include_left else param <= low
    above = param > high if include_right else param >= high

    if below or above:
        raise ValueError(
            '{param_name} is set to {param}. '
            'Not in the range of {left}{low}, {high}{right}.'.format(
                param=param, low=low, high=high, param_name=param_name,
                left='[' if include_left else '(',
                right=']' if include_right else ')'))

    return True