Source code for ae.valid

"""
data validation helper functions
================================

this module is pure Python and has no dependencies.

the two slightly bigger helper functions provided by this namespace portion are :func:`correct_email` and
:func:`correct_phone`, which are useful to check if a string contains a valid email address or phone number.

they also allow you to automatically correct an email address or a phone number to a valid format. more sophisticated
helpers for the validation of email addresses, phone numbers and post addresses are available in the
:mod:`ae.validation` namespace portion.
"""
from string import ascii_letters, digits
from typing import List, Optional, Tuple


__version__ = '0.3.3'



[docs]
def correct_email(email: str, changed: bool = False, removed: Optional[List[str]] = None) -> Tuple[str, bool]:
    """ check and correct email address from a user input (removing all comments).

    special conversions that are not returned as changed/corrected are: the domain part of an email will be corrected
    to lowercase characters, additionally emails without any lowercase character will be converted into lowercase.

    regular expressions are not working for all edge cases (see the answer to this SO question:
    https://stackoverflow.com/questions/201323/using-a-regular-expression-to-validate-an-email-address) because RFC822
    is very complex (even the reg expression recommended by RFC 5322 is not complete; there is also a more readable form
    given in the informational RFC 3696). additionally a regular expression does not allow corrections. therefore this
    function is using a procedural approach (using recommendations from RFC 822 and
    https://en.wikipedia.org/wiki/Email_address).

    the email address get processed character by character. each character is either kept (appended to the local or
    to the domain part) or dropped. every dropped character and every dropped comment get appended to the
    :paramref:`~correct_email.removed` list and sets the returned changed flag to True.

    :param email:               email address to check and correct. passing None or an empty string results in an
                                empty string as the returned email address.
    :param changed:             optional flag if email address got changed (before calling this function) - will be
                                returned unchanged if email did not get corrected (also if email is empty or None).
    :param removed:             optional list declared by caller to pass back all the removed characters including the
                                index in the format "<index>:<removed_character(s)>". a removed comment gets passed
                                back as one item, starting with the index of its opening parenthesis; this includes
                                a comment that is missing its closing parenthesis.
    :return:                    tuple of (possibly corrected email address, flag if email got changed/corrected).
    """
    if not email:       # email could be None or ""; nothing to correct, so pass the flag of the caller through
        return "", changed

    if removed is None:
        removed = []

    letters_or_digits = ascii_letters + digits
    in_local_part = True            # True until the first unquoted @ character
    in_quoted_part = False          # True while inside of a double-quoted section of the local part
    in_comment = False              # True while skipping the characters of a comment in parentheses
    all_upper_case = True           # gets False on the first lowercase character (checked also within comments)
    local_part = ""
    domain_part = ""
    domain_beg_idx = -1             # index of the first domain character (set when the @ character get found)
    domain_end_idx = len(email) - 1
    comment = ''
    last_ch = ''                    # last kept character (dropped characters and comments are skipped)
    ch_before_comment = ''
    for idx, char in enumerate(email):
        if char.islower():
            all_upper_case = False
        # look ahead one character; empty string only at the very last character of the email address
        next_ch = email[idx + 1] if idx < domain_end_idx else ''
        if in_comment:
            comment += char
            if char == ')':
                in_comment = False
                removed.append(comment)
                last_ch = ch_before_comment
            continue
        # a comment is only recognized at the begin or end of the local part or of the domain part
        if char == '(' and not in_quoted_part \
                and (idx == 0 or email[idx:].find(')@') >= 0 if in_local_part
                     else idx == domain_beg_idx or email[idx:].find(')') == domain_end_idx - idx):
            comment = str(idx) + ':('
            ch_before_comment = last_ch
            in_comment = True
            changed = True
            continue
        # drop misplaced double-quote: in the domain part, opening in the middle of a dot-separated section of the
        # local part or closing (and not escaped by a backslash) without being followed by a dot or the @ character
        if char == '"' \
                and (not in_local_part
                     or last_ch != '.' and idx and not in_quoted_part
                     or next_ch not in ('.', '@') and last_ch != '\\' and in_quoted_part):
            removed.append(str(idx) + ':' + char)
            changed = True
            continue

        if char == '@' and in_local_part and not in_quoted_part:
            in_local_part = False       # the @ character itself get added to the domain part (see end of loop)
            domain_beg_idx = idx + 1
        elif char in letters_or_digits:  # ch.isalnum():
            pass  # uppercase and lowercase latin letters A to Z and a to z (isalnum() includes also umlauts)
        elif ord(char) > 127 and in_local_part:
            pass    # international characters above U+007F
        elif char == '.' and in_local_part and not in_quoted_part and last_ch != '.' and idx and next_ch != '@':
            pass    # if not the first or last unless quoted, and does not appear consecutively unless quoted
        elif char in ('-', '.') and not in_local_part and (last_ch != '.' or char == '-') \
                and idx not in (domain_beg_idx, domain_end_idx):
            pass    # if not duplicated dot and not the first or last character in domain part
        elif (char in ' (),:;<>@[]' or char in '\\"' and last_ch == '\\' or char == '\\' and next_ch == '\\') \
                and in_quoted_part:
            pass    # in quoted part and in addition, a backslash or double-quote must be preceded by a backslash
        elif char == '"' and in_local_part:
            in_quoted_part = not in_quoted_part
        elif (char in "!#$%&'*+-/=?^_`{|}~"
              or char == '.' and (last_ch and last_ch != '.' and next_ch != '@' or in_quoted_part)) \
                and in_local_part:
            pass    # special characters (in local part only and not at beg/end and no dup dot outside of quoted part)
        else:
            removed.append(str(idx) + ':' + char)
            changed = True
            continue

        if in_local_part:
            local_part += char
        else:
            domain_part += char.lower()
        last_ch = char

    if in_comment:
        # comment without closing parenthesis: the rest of the email got dropped, so report it as removed too
        removed.append(comment)

    if all_upper_case:
        local_part = local_part.lower()

    return local_part + domain_part, changed




[docs]
def correct_phone(phone: str, changed: bool = False, removed: Optional[List[str]] = None, keep_1st_hyphen: bool = False
                  ) -> Tuple[str, bool]:
    """ check and correct phone number from a user input (removing all invalid characters including spaces).

    only the ASCII digits 0-9 (and optionally the first hyphen) are kept. a plus character in front of the first
    kept character gets replaced by the international prefix "00", unless the plus character is directly followed
    by "00". every other character gets dropped.

    :param phone:               phone number to check and correct (None is treated like an empty string).
    :param changed:             optional flag if phone got changed (before calling this function) - will be returned
                                unchanged if phone did not get corrected.
    :param removed:             optional list declared by caller to pass back all the removed characters including the
                                index in the format "<index>:<removed_character(s)>". a plus character that got
                                replaced by "00" is also passed back in this list.
    :param keep_1st_hyphen:     pass True to keep at least the first occurring hyphen character.
    :return:                    tuple of (possibly corrected phone number, flag if phone got changed/corrected).
    """
    if removed is None:
        removed = []

    kept = []                   # kept parts of the phone number, joined at the end
    got_hyphen = False
    for idx, char in enumerate(phone or ""):      # allow phone is None
        if char in digits:      # str.isdigit() would also accept non-ASCII digits like superscripts (e.g. '²')
            kept.append(char)
        elif keep_1st_hyphen and char == '-' and not got_hyphen:
            got_hyphen = True
            kept.append(char)
        else:
            # leading plus character gets converted into "00" (if the number does not already start with it)
            if char == '+' and not kept and not phone[idx + 1:].startswith('00'):
                kept.append('00')
            removed.append(str(idx) + ':' + char)
            changed = True

    return "".join(kept), changed