# Source code for sanityze.spotters

import re
import hashlib

class Spotter:
    """Base interface that every concrete spotter implements.

    A spotter finds one kind of sensitive information in a piece of text
    and either hashes it or swaps it for a fixed placeholder.

    Attributes
    ----------
    uid : str
        Identifier of the spotter.
    hashSpotted : bool, optional
        ``False`` by default. When true, spotted information is meant to
        be replaced by a hash; otherwise by a placeholder value.

    Methods
    -------
    getSpotterUID()
        Return the spotter uid.
    isHashSpotted()
        Return the value of ``hashSpotted``.
    process(text)
        Hook for subclasses; the base implementation does nothing.

    Notes
    -----
    This class is meant to be subclassed; ``process`` carries no logic
    here.
    """

    # please add the following line in the subclass
    # spotter_uid = "<uid of the spotter>"

    def __init__(self, uid: str, hashSpotted=False):
        # Both constructor arguments are stored unchanged on the instance.
        self.uid, self.hashSpotted = uid, hashSpotted

    def getSpotterUID(self) -> str:
        """Return the uid this spotter was constructed with.

        Returns
        -------
        str
            The value stored in ``self.uid``.

        Examples
        --------
        >>> Spotter("MYSPOTTER").getSpotterUID()
        'MYSPOTTER'
        """
        return self.uid

    def isHashSpotted(self) -> bool:
        """Return the ``hashSpotted`` setting.

        Returns
        -------
        bool
            The value stored in ``self.hashSpotted``.

        Examples
        --------
        >>> Spotter("MYSPOTTER", True).isHashSpotted()
        True
        """
        return self.hashSpotted

    def process(self, text: str) -> str:
        """Placeholder for the spotting logic supplied by subclasses.

        Subclasses are expected to return ``text`` with the spotted
        information replaced by a hash when ``isHashSpotted()`` is true,
        or by a placeholder value otherwise.

        Parameters
        ----------
        text : str
            The text to be spotted and modified.

        Returns
        -------
        None
            The base implementation performs no work and returns
            ``None``, despite the ``str`` annotation shared with the
            subclass overrides.
        """
        # Intentionally empty: concrete spotters override this method.
        return None
class CreditCardSpotter(Spotter):
    """Spotter that detects credit card numbers in text.

    Attributes
    ----------
    uid : str
        uid passed to the constructor. ``getSpotterUID`` ignores it and
        always reports ``"CREDITCARD"``.
    hashSpotted : bool, optional
        ``False`` by default. When true, each spotted card number is
        replaced by its MD5 hex digest; otherwise by ``"CREDITCARD"``.

    Methods
    -------
    getSpotterUID()
        Return the fixed uid ``"CREDITCARD"``.
    isHashSpotted()
        Return the value of ``hashSpotted``.
    process(text)
        Replace every spotted card number in ``text``.

    Examples
    --------
    >>> cc = CreditCardSpotter("CREDITCARDS", False)
    >>> cc.process("4556129404313766")
    'CREDITCARD'
    """

    # Regexes from:
    # http://www.regular-expressions.info/creditcard.html
    # taken from the alphagov fork of scrubadub:
    # https://github.com/alphagov/scrubadub
    # Compiled once at class-creation time instead of on every call.
    _CC_PATTERN = re.compile(
        (
            r"(?:4[0-9]{12}(?:[0-9]{3})?"  # Visa
            r"|(?:5[1-5][0-9]{2}"  # MasterCard
            r"|222[1-9]|22[3-9][0-9]|2[3-6][0-9]{2}|27[01][0-9]|2720)[0-9]{12}"
            r"|3[47][0-9]{13}"  # American Express
            r"|3(?:0[0-5]|[68][0-9])[0-9]{13}"  # Diners Club
            r"|6(?:011|5[0-9]{2})[0-9]{12}"  # Discover
            r"|(?:2131|1800|35\d{3})\d{11})"  # JCB
        ),
        re.VERBOSE,
    )

    def getSpotterUID(self) -> str:
        """Return the fixed uid of the credit card spotter.

        Returns
        -------
        str
            Always ``"CREDITCARD"``, regardless of the uid given to the
            constructor.

        Examples
        --------
        >>> cc = CreditCardSpotter("CREDITCARDS", True)
        >>> cc.getSpotterUID()
        'CREDITCARD'
        """
        return "CREDITCARD"

    def process(self, text: str) -> str:
        """Replace every credit card number found in ``text``.

        When ``isHashSpotted()`` is true, each matched number is replaced
        by the MD5 hex digest of that number alone, so the same card
        always maps to the same token and different cards map to
        different tokens. Otherwise each match is replaced by the value
        of ``getSpotterUID()``.

        Parameters
        ----------
        text : str
            The text to be spotted and modified.

        Returns
        -------
        str
            ``text`` with every matched card number replaced. Text
            without a match is returned unchanged.

        Examples
        --------
        >>> cc = CreditCardSpotter("CREDITCARDS", False)
        >>> cc.process("4556129404313766")
        'CREDITCARD'
        >>> cc.process("paid with 4556129404313766 today")
        'paid with CREDITCARD today'
        """
        if self.isHashSpotted():
            # Hash only the matched digits, not the whole input, so the
            # replacement does not depend on the surrounding text.
            def replacement(match):
                return hashlib.md5(match.group().encode()).hexdigest()
        else:
            replacement = self.getSpotterUID()

        return self._CC_PATTERN.sub(replacement, text)
class EmailSpotter(Spotter):
    """Spotter that detects email addresses in text.

    Attributes
    ----------
    uid : str
        uid passed to the constructor. ``getSpotterUID`` ignores it and
        always reports ``"EMAILADDRS"``.
    hashSpotted : bool, optional
        ``False`` by default. When true, each spotted email is replaced
        by its MD5 hex digest; otherwise by ``"EMAILADDRS"``.

    Methods
    -------
    getSpotterUID()
        Return the fixed uid ``"EMAILADDRS"``.
    isHashSpotted()
        Return the value of ``hashSpotted``.
    process(text)
        Replace every spotted email address in ``text``.
    """

    def getSpotterUID(self) -> str:
        """Return the fixed uid of the email spotter.

        Returns
        -------
        str
            Always ``"EMAILADDRS"``, regardless of the uid given to the
            constructor.

        Examples
        --------
        >>> ee = EmailSpotter("EMAILS", False)
        >>> ee.getSpotterUID()
        'EMAILADDRS'
        """
        return "EMAILADDRS"

    def process(self, text: str) -> str:
        """Replace every email address found in ``text``.

        When ``isHashSpotted()`` is true, each matched address is
        replaced by the MD5 hex digest of that address; otherwise each
        match is replaced by the value of ``getSpotterUID()``.

        Parameters
        ----------
        text : str
            The text to be spotted and modified.

        Returns
        -------
        str
            ``text`` with every matched email address replaced.

        Examples
        --------
        >>> ee = EmailSpotter("EMAILS", False)
        >>> ee.process("abcd1234@gmail.com")
        'EMAILADDRS'
        """
        # Email regex adapted from scrubadub's EmailDetector:
        # https://scrubadub.readthedocs.io/en/stable/_modules/scrubadub/detectors/email.html
        # It is assembled from three named segments; under re.VERBOSE the
        # padding spaces inside the literals are ignored.
        local_part = (
            r"\b[a-z0-9!#$%&'*+\/=?^_`{|}~-]"  # start with this character
            r"(?:"
            r" [\.a-z0-9!#$%&'*+\/=?^_`{|}~-]{0,62}"  # valid next characters (max length 64 chars before @)
            r" [a-z0-9!#$%&'*+\/=?^_`{|}~-]"  # end with this character
            r")?"
        )
        separator = r"(?:@|\sat\s)"  # @ or the word 'at' instead
        domain = (
            r"[a-z0-9]"  # domain starts like this
            r"(?:"
            r" (?=[a-z0-9-]*(\.|\sdot\s))"  # A lookahead to ensure there is a dot in the domain
            r" (?:\.|\sdot\s|[a-z0-9-]){0,251}"  # might have a '.' or the word 'dot' instead
            r" [a-z0-9]"  # domain has max 253 chars, ends with one of these
            r")+\b"
        )
        email_pattern = re.compile(
            local_part + separator + domain,
            re.VERBOSE | re.IGNORECASE,
        )

        if not self.isHashSpotted():
            # Placeholder mode: every match becomes the fixed spotter uid.
            return email_pattern.sub(self.getSpotterUID(), text)

        def hashed(match):
            # Hash mode: digest only the matched address itself.
            return hashlib.md5(match.group().encode()).hexdigest()

        return email_pattern.sub(hashed, text)