Source code for wrangle_in_py.remove_duplicates

import pandas as pd

def remove_duplicates(df, subset_columns=None, keep='first'):
    """
    Remove duplicate rows from a DataFrame based on specified columns.

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe to process. It is not modified.
    subset_columns : list, str or None
        Column names to consider for identifying duplicates. A single
        string is treated as one column name; any other iterable of
        column names is accepted as well. If None (default), consider
        all columns.
    keep : {'first', 'last', False}
        Determines which duplicates to keep:

        - 'first': Keep the first occurrence (default).
        - 'last': Keep the last occurrence.
        - False: Drop all duplicates.

    Returns
    -------
    pd.DataFrame
        A DataFrame with duplicates removed.

    Raises
    ------
    ValueError
        If the input for df is not a pandas DataFrame.
        If any column in subset_columns is not a column in the input
        dataframe.
        If the input for keep is not 'first', 'last', or False.

    Notes
    -----
    The number of dropped rows is printed to standard output.

    Example
    -------
    >>> data = {'A': [1, 2, 2, 4], 'B': [5, 6, 6, 8]}
    >>> df = pd.DataFrame(data)
    >>> remove_duplicates(df, subset_columns=['A'])
    1 rows have been dropped.
       A  B
    0  1  5
    1  2  6
    3  4  8
    """
    # Validate input
    if not isinstance(df, pd.DataFrame):
        raise ValueError("Input must be a pandas DataFrame")

    if subset_columns is not None:
        if isinstance(subset_columns, str):
            # A bare string is one column label; iterating it would check
            # each character as if it were a separate column name.
            subset_columns = [subset_columns]
        elif not isinstance(subset_columns, (list, tuple)):
            # Materialise one-shot iterables (e.g. generators) so the
            # membership check below does not exhaust them before pandas
            # gets to read them.
            subset_columns = list(subset_columns)

        if not all(col in df.columns for col in subset_columns):
            raise ValueError("Some columns in subset_columns are not present in the DataFrame")

    if keep not in ['first', 'last', False]:
        raise ValueError("Invalid value for 'keep'. Must be 'first', 'last', or False.")

    # Drop duplicates using pandas
    original_row_count = len(df)
    result = df.drop_duplicates(subset=subset_columns, keep=keep)

    # Report how many rows were removed
    dropped_rows = original_row_count - len(result)
    print(f"{dropped_rows} rows have been dropped.")

    return result