Source code for wrangle_in_py.column_name_standardizer

import pandas as pd
from collections import defaultdict
import re


[docs]
def string_standardizer(messy_string):
    """
    Produce a lowercase, underscore-delimited version of a string.

    Every character that is not a word character (i.e. anything other than
    letters, digits and the underscore itself) is swapped for a single
    underscore, and the result is lowercased. Runs of such characters are
    not collapsed: each one yields its own underscore.

    Parameters
    ----------
    messy_string : str
        The text to standardize.

    Raises
    ------
    TypeError :
        If messy_string is not a string.

    Returns
    -------
    str
        The input with non-word characters replaced by underscores,
        converted to lowercase.

    Examples
    --------
    >>> string_standardizer('Jack Fruit 88')
    'jack_fruit_88'

    >>> string_standardizer('PINEAPPLES')
    'pineapples'

    >>> string_standardizer('Dragon (Fruit)')
    'dragon__fruit_'
    """
    if isinstance(messy_string, str):
        # Substitute first, then lowercase, so the character classification
        # is done on the text exactly as it was supplied.
        return re.sub(r'[^\w]', '_', messy_string).lower()
    raise TypeError("messy_string input should be of type string.")



[docs]
def resulting_duplicates(original_strings, standardized_strings):
    """
    Report which original strings collapsed onto the same standardized string.

    The two lists are paired element by element: the string at position i of
    standardized_strings is treated as the standardized form of the string at
    position i of original_strings.

    Parameters
    ----------
    original_strings : list of str
        The strings as they were before standardization.

    standardized_strings : list of str
        The corresponding strings after standardization.

    Raises
    ------
    TypeError :
        If original_strings or standardized_strings is not a list whose
        elements are all strings.

    ValueError :
        If the two lists differ in length.

    Returns
    -------
    dict :
        Maps each standardized string shared by more than one original to
        the list of originals that produced it, in input order. Empty when
        no standardized string occurs more than once.

    Examples
    --------
    >>> strings_before = ['Jack Fruit 88.', "Jack!Fruit!88!", "PINEAPPLES"]
    >>> strings_after = ["jack_fruit_88_", "jack_fruit_88_", "pineapples"]
    >>> resulting_duplicates(strings_before, strings_after)
    {'jack_fruit_88_': ['Jack Fruit 88.', 'Jack!Fruit!88!']}
    """
    def _is_list_of_str(candidate):
        # True only for an actual list whose every element is a str.
        return isinstance(candidate, list) and all(
            isinstance(item, str) for item in candidate
        )

    # Type checks come before the length check so that a non-list argument
    # is reported as a TypeError rather than failing inside len().
    if not _is_list_of_str(original_strings):
        raise TypeError("original_strings must be a list of strings.")
    if not _is_list_of_str(standardized_strings):
        raise TypeError("standardized_strings must be a list of strings.")
    if len(original_strings) != len(standardized_strings):
        raise ValueError("Both inputs must be of the same length.")

    # Group every original under the standardized name it produced.
    originals_by_name = {}
    for before, after in zip(original_strings, standardized_strings):
        originals_by_name.setdefault(after, []).append(before)

    # Only names reached by two or more originals count as duplicates.
    return {
        name: sources
        for name, sources in originals_by_name.items()
        if len(sources) > 1
    }



[docs]
def column_name_standardizer(df):
    """
    Returns a copy of the inputted dataframe with standardized column names.
    Column names will be converted to lowercase and
    non-alphanumerics (including spaces and punctuation) will be replaced with underscores.

    If the standardization results in duplicate column names, a warning is
    issued and the copy is still returned with the duplicated names in place.
    The input dataframe itself is not modified.

    Parameters
    ----------
    df : pandas DataFrame
        The input pandas DataFrame whose column names need standardization.

    Warnings
    --------
    UserWarning :
        If any of the standardized column names are the same. The message
        lists each duplicated name with the original names that produced it.

    Raises
    ------
    TypeError:
        If the input dataframe is not a pandas DataFrame, or if any of its
        column labels is not a string (raised by string_standardizer).

    Returns
    -------
    pandas.DataFrame :
        A new DataFrame with standardized column names.

    Examples
    --------
    >>> import pandas as pd
    >>> data = {'Jack Fruit 88': [1, 2], 'PINEAPPLES': [3, 4], 'Dragon (Fruit)': [25, 30]}
    >>> df = pd.DataFrame(data)
    >>> column_name_standardizer(df)
       jack_fruit_88  pineapples  dragon__fruit_
    0              1           3              25
    1              2           4              30
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("The input must be a pandas DataFrame.")

    original_columns = df.columns.tolist()
    standardized_columns = [string_standardizer(col) for col in original_columns]

    duplicates = resulting_duplicates(original_columns, standardized_columns)

    if duplicates:
        import warnings

        # stacklevel=2 attributes the warning to the caller's line rather
        # than to this module, so users can see which call caused it.
        warnings.warn(
            f"Duplicate column names found after standardization: {duplicates}",
            UserWarning,
            stacklevel=2,
        )

    # Rename on a copy so the caller's DataFrame keeps its original labels.
    standardized_df = df.copy()
    standardized_df.columns = standardized_columns
    return standardized_df