The most commonly used functions in machine learning data cleaning

Gary

3 years ago

1. Get null percentage of columns

def get_null_percentage(df):
  count = df.isnull().sum()
  percent = (df.isnull().mean() * 100).round(2)
  summary = pd.DataFrame({'count': count, 'percentage': percent})

  return summary[summary["count"] > 0].sort_values(ascending=False, by="percentage")

2. Get zero percentage of columns

def get_zero_percentage(df):
  count = df.eq(0).sum()
  percent = (df.eq(0).mean() * 100).round(2)
  summary = pd.DataFrame({'count': count, 'percentage': percent})

  return summary[summary["count"] > 0].sort_values(ascending=False, by="percentage")

3. Remove Outliers for one column

3.1 using IQR

def remove_outliers_by_iqr(df, column):
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    value = q3 - q1
    lower_value = q1 - 1.5 * value
    higer_value = q3 + 1.5 * value

    return df[(df[column] < higer_value) & (df[column] > lower_value)]

This might end up with an empty DataFrame if all or a significant portion of your data points are considered outliers and removed. e.g.,

We can use z-score to solve it.

3.2 using Z-Score

def remove_outliers_by_zscore(df, column_name, z_threshold=3):
    # Calculate the Z-scores for the specified column
    z_scores = (df[column_name] - df[column_name].mean()) / df[column_name].std()

    # Filter out rows where the Z-score is greater than the threshold
    return df[abs(z_scores) <= z_threshold]

4 Get extreme distribution columns

def check_extreme_distribution(df, threshold=0.95):
  extreme_columns = []
  for column in df.columns:
    mode_value = df[column].mode().iloc[0]  # first value, it is also the most contributed value
    mode_percentage = (df[column] == mode_value).sum() / len(df)

    if mode_percentage >= threshold:
      extreme_columns.append(column)

  return extreme_columns

5 Get same percentile columns

def get_same_percentile_columns(df, value):
    column_stats = df.describe()
    return column_stats.columns[(column_stats.loc['min'] == value) & 
                                            (column_stats.loc['25%'] == value) & 
                                            (column_stats.loc['50%'] == value) & 
                                            (column_stats.loc['75%'] == value)]

We can use it to check the columns that all percentiles are 0:

get_percentile_same_columns(df, 0)