Source code for ae.valid

"""
data validation helper functions
================================

this module is pure Python and has no dependencies.

the two slightly bigger helper functions provided by this namespace portion are :func:`correct_email` and
:func:`correct_phone`, which are useful to check if a string contains a valid email address or phone number.

they also allow you to automatically correct an email address or a phone number to a valid format. more sophisticated
helpers for the validation of email addresses, phone numbers and post addresses are available in the
:mod:`ae.validation` namespace portion.
"""
from string import ascii_letters, digits
from typing import List, Optional, Tuple


# version string of this module
__version__ = '0.3.3'


def correct_email(email: str, changed: bool = False, removed: Optional[List[str]] = None) -> Tuple[str, bool]:
    """ validate an email address entered by a user and strip everything that is not allowed in it.

    the address is scanned once, character by character. comments in round brackets are cut out, misplaced double
    quotes and any character that is not permitted at its position are dropped. every dropped piece is logged in
    :paramref:`~correct_email.removed` and marks the address as changed.

    two conversions are done silently (they do not set the changed flag): the domain part is always converted to
    lowercase, and the local part is converted to lowercase if the passed address contains no lowercase letter at all.

    a procedural scanner is used instead of a regular expression, because a regular expression can neither cover all
    the edge cases of RFC 822/RFC 5322 nor correct the address (see also RFC 3696,
    https://en.wikipedia.org/wiki/Email_address and
    https://stackoverflow.com/questions/201323/using-a-regular-expression-to-validate-an-email-address).

    :param email:       email address to check and correct (None or an empty string results in ``("", False)``).
    :param changed:     optional flag if the email address got already changed before calling this function; it is
                        passed through unchanged if this function did not correct anything.
    :param removed:     optional list, provided by the caller, which gets extended with all removed characters, each
                        item in the format "<index>:<removed_character(s)>" (index into the passed email string).
    :return:            tuple of (possibly corrected email address, flag if email got changed/corrected).
    """
    if not email:
        return "", False                    # covers None as well as the empty string

    dropped = [] if removed is None else removed    # same list object as passed by the caller (extended in-place)
    alnum_chars = ascii_letters + digits    # only latin letters and digits (str.isalnum() would also accept umlauts)
    last_idx = len(email) - 1
    has_no_lowercase = not any(ch.islower() for ch in email)

    local_chars = []                        # accepted characters in front of the separating @ character
    domain_chars = []                       # the separating @ character and the lower-cased domain characters
    in_local = True
    in_quotes = False
    in_comment = False
    domain_start = -1                       # index of the first domain character (known after the @ got found)
    comment = ''
    prev_ch = ''                            # the last accepted character
    prev_before_comment = ''

    for idx, char in enumerate(email):
        # the bound is strict: the character at the last index is never reported as next_ch
        next_ch = email[idx + 1] if idx + 1 < last_idx else ''

        if in_comment:                      # collect the comment text until the closing bracket
            comment += char
            if char == ')':
                in_comment = False
                dropped.append(comment)
                prev_ch = prev_before_comment
            continue

        if char == '(' and not in_quotes:
            if in_local:                    # comment at the very beginning or closed directly in front of the @
                opens_comment = idx == 0 or email.find(')@', idx) >= 0
            else:                           # comment at the beginning of the domain or closed by the last character
                opens_comment = idx == domain_start or email.find(')', idx) == last_idx
            if opens_comment:
                comment = str(idx) + ':('
                prev_before_comment = prev_ch
                in_comment = True
                changed = True
                continue

        if char == '"':
            bad_opening = not in_quotes and idx > 0 and prev_ch != '.'
            bad_closing = in_quotes and prev_ch != '\\' and next_ch not in ('.', '@')
            if not in_local or bad_opening or bad_closing:
                dropped.append("{}:{}".format(idx, char))
                changed = True
                continue

        if char == '@' and in_local and not in_quotes:
            in_local = False                # this @ separates local and domain part (it gets stored with the domain)
            domain_start = idx + 1
            keep = True
        elif (
                # latin letter or digit
                char in alnum_chars
                # international characters above U+007F (local part only)
                or ord(char) > 127 and in_local
                # unquoted dot in local part: not the first character, not doubled and not directly in front of the @
                or char == '.' and in_local and not in_quotes and prev_ch != '.' and idx and next_ch != '@'
                # hyphen or non-duplicated dot in the domain part, but not as its first or last character
                or char in ('-', '.') and not in_local and (prev_ch != '.' or char == '-')
                and idx not in (domain_start, last_idx)
                # inside quotes: extra characters; a backslash or double-quote must be preceded by a backslash
                or (char in ' (),:;<>@[]' or char in '\\"' and prev_ch == '\\' or char == '\\' and next_ch == '\\')
                and in_quotes
        ):
            keep = True
        elif char == '"' and in_local:
            in_quotes = not in_quotes       # opening or closing quote of a quoted part
            keep = True
        else:
            # special characters (local part only); dot not at begin/end and not duplicated unless quoted
            keep = in_local and (
                char in "!#$%&'*+-/=?^_`{|}~"
                or char == '.' and (prev_ch and prev_ch != '.' and next_ch != '@' or in_quotes)
            )

        if not keep:
            dropped.append("{}:{}".format(idx, char))
            changed = True
            continue

        if in_local:
            local_chars.append(char)
        else:
            domain_chars.append(char.lower())
        prev_ch = char

    local_part = ''.join(local_chars)
    if has_no_lowercase:
        local_part = local_part.lower()

    return local_part + ''.join(domain_chars), changed
def correct_phone(phone: str, changed: bool = False, removed: Optional[List[str]] = None,
                  keep_1st_hyphen: bool = False
                  ) -> Tuple[str, bool]:
    """ reduce a phone number entered by a user to its digits (dropping spaces and all other invalid characters).

    a plus character found before any digit got accepted is replaced by the prefix "00", unless the characters
    directly following it are already "00".

    :param phone:           phone number to check and correct (None is treated like an empty string).
    :param changed:         optional flag if the phone number got already changed before calling this function; it
                            is passed through unchanged if this function did not correct anything.
    :param removed:         optional list, provided by the caller, which gets extended with all removed characters,
                            each item in the format "<index>:<removed_character(s)>".
    :param keep_1st_hyphen: pass True to keep the first occurring hyphen character (all others get removed).
    :return:                tuple of (possibly corrected phone number, flag if phone got changed/corrected).
    """
    dropped = [] if removed is None else removed    # same list object as passed by the caller (extended in-place)
    kept = []                                       # accepted pieces of the corrected number
    hyphen_kept = False

    for idx, char in enumerate(phone or ""):        # phone could be None
        if char.isdigit():
            kept.append(char)
            continue

        if keep_1st_hyphen and char == '-' and not hyphen_kept:
            hyphen_kept = True
            kept.append(char)
            continue

        # leading plus sign gets converted into the 00 prefix, if the number does not continue with 00 anyway
        if char == '+' and not kept and not phone.startswith('00', idx + 1):
            kept.append('00')
        dropped.append("{}:{}".format(idx, char))
        changed = True

    return ''.join(kept), changed