Source code for sanityze.cleanser

import pandas as pd
from sanityze.spotters import * 

class Cleanser:
    """
    The main class for the sanityze package. Its purpose is to clean the
    data frame before it's consumed by the training or prediction pipeline.

    A Cleanser holds an ordered list of spotters (``self.chain``). Each
    string cell of a data frame passed to :meth:`clean` is handed to every
    spotter in turn, and the result is stored in a copy of the data frame.

    Parameters
    ----------
    include_default_spotters : bool, optional
        If True, the default spotters (an ``EmailSpotter`` and a
        ``CreditCardSpotter``) will be added to the Cleanser.
        The default is True.
    hash_spotted : bool, optional
        Forwarded as the second argument to the default spotters. If True,
        the spotters will hash the values within the columns they spot.
        The default is False.
    """

    def __init__(self, include_default_spotters=True, hash_spotted=False):
        if include_default_spotters:
            # NOTE: the UID strings are part of the public behavior (they
            # are what remove_spotter() matches against), so their exact
            # spelling is kept unchanged.
            self.chain = [
                EmailSpotter("DEFAULTEMAILS", hash_spotted),
                CreditCardSpotter("DEFAULCCS", hash_spotted),
            ]
        else:
            self.chain = []

    def add_spotter(self, spotter) -> bool:
        """
        Add a specific spotter to the Cleanser.

        Parameters
        ----------
        spotter : Spotter
            The spotter to add to the Cleanser. It must provide a
            ``getSpotterUID()`` method. Spotters are appended at the end
            of the chain.

        Returns
        -------
        bool
            True if the spotter was added. False if the same spotter, or
            another spotter with the same UID, is already in the chain.

        Raises
        ------
        ValueError
            If ``spotter`` is None.

        Examples
        --------
        >>> c = Cleanser(include_default_spotters=False)
        >>> s1 = EmailSpotter("EMAILS",True)
        >>> c.add_spotter(s1)
        True
        """
        if spotter is None:
            raise ValueError("spotter cannot be None in Cleanser.add_spotter()")
        if spotter in self.chain:
            return False
        # Reject a different object that reuses the UID of a registered
        # spotter, so that UIDs stay unique within the chain.
        new_uid = spotter.getSpotterUID()
        if any(s.getSpotterUID() == new_uid for s in self.chain):
            return False
        self.chain.append(spotter)
        # Report success explicitly, as promised by the return annotation.
        return True

    def remove_spotter(self, spotter_id) -> bool:
        """
        Remove a specific spotter from the Cleanser using the spotter's id.

        Parameters
        ----------
        spotter_id : str
            The id of the spotter to remove.

        Returns
        -------
        bool
            True if the spotter was removed, False if no spotter in the
            chain has that id.

        Raises
        ------
        ValueError
            If ``spotter_id`` is None.

        Examples
        --------
        >>> c = Cleanser(include_default_spotters=False)
        >>> s1 = EmailSpotter("EMAILADDRS",True)
        >>> c.add_spotter(s1)
        True
        >>> c.remove_spotter("EMAILADDRS")
        True
        """
        if spotter_id is None:
            raise ValueError("spotter_id cannot be None in Cleanser.remove_spotter()")
        for s in self.chain:
            if s.getSpotterUID() == spotter_id:
                # Safe to mutate the list here: we return immediately and
                # never continue iterating over the modified chain.
                self.chain.remove(s)
                return True
        return False

    def _log(self, message: str, verbose: bool) -> None:
        """
        Internal utility function to log messages to the console.

        Parameters
        ----------
        message : str
            The message to log.
        verbose : bool
            If True the message is printed; otherwise nothing happens.

        Returns
        -------
        None

        Examples
        --------
        (called by clean())
        """
        if verbose:
            print(f"- {message}")

    def clean(self, df: pd.DataFrame, verbose=False) -> pd.DataFrame:
        """
        Sanitizes the data frame using the spotters added to the Cleanser.

        Every cell whose value is a ``str`` is passed through each spotter
        of the chain in order (the output of one spotter is the input of
        the next). Cells of any other type are left untouched.

        Parameters
        ----------
        df : pd.DataFrame
            The data frame to sanitize. It is not modified.
        verbose : bool, optional
            If True, a message is printed before and after each spotter
            processes a cell. The default is False.

        Returns
        -------
        pd.DataFrame
            A sanitized copy of the data frame.

        Raises
        ------
        ValueError
            If ``df`` is None.
        TypeError
            If ``df`` is not a pandas DataFrame.

        Examples
        --------
        >>> df = pd.DataFrame(data = {'product_name': ['laptop', 'printer foo@gaga.com', 'tablet', 'desk 5555 5555 5555 4444', 'chair'], 'price': [1200, 150, 300, 450, 200]})
        >>> c = Cleanser()
        >>> cleaned = c.clean(df, verbose=False)
        """
        if df is None:
            raise ValueError("df cannot be None in clean")
        if not isinstance(df, pd.DataFrame):
            raise TypeError("df must be a pandas DataFrame in clean")

        # we only operate on a copy of the data frame, leaving
        # the original data frame intact
        df_copy = df.copy()

        # iterate thru the data frame cells by position, so that duplicate
        # or non-default index/column labels do not matter
        row_len, col_len = df.shape
        for i in range(row_len):
            for j in range(col_len):
                cell = df.iat[i, j]
                # only string cells are handed to the spotters
                if not isinstance(cell, str):
                    continue
                for spotter in self.chain:
                    self._log(f"{spotter.getSpotterUID()}: Processing cell {cell} ", verbose)
                    cell = spotter.process(cell)
                    self._log(f"{spotter.getSpotterUID()}: Processed cell {cell} ", verbose)
                # update the cell in the copy of the data frame; non-string
                # cells already hold the right value from df.copy()
                df_copy.iat[i, j] = cell
        return df_copy