Python Data Manipulation: Cutting and Processing DataFrames with Pandas Functions

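Here is the code, with added documentation and minor readability improvements. It defines two helpers, cut_dataframe and process_dataframe, that build a new DataFrame by applying pandas functions to selected rows: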

import warnings

import pandas as pd

def cut_dataframe(df_, rules):
    """
    Bin selected rows of a DataFrame with ``pd.cut``.

    Every key of ``rules`` is looked up in ``df_`` with ``.loc``; the row
    found there is passed to ``pd.cut`` together with the keyword arguments
    stored under that key, and the result is written to the output under the
    same label.

    Parameters:
        df_ (DataFrame): DataFrame whose rows are binned. It is only read.
        rules (dict): Maps an index label of ``df_`` to a dictionary of
            keyword arguments for ``pd.cut`` (for example ``bins`` and
            ``labels``).

    Returns:
        DataFrame: New DataFrame with the columns of ``df_`` and one row per
        entry in ``rules``, in the iteration order of ``rules``.
    """
    result = pd.DataFrame(columns=df_.columns)
    for label, cut_options in rules.items():
        source_row = df_.loc[label]
        binned = pd.cut(source_row, **cut_options)
        # Store the outcome as a plain list, one value per column.
        result.loc[label] = binned.tolist()
    return result

def process_dataframe(df_, rules):
    """
    Apply a named top-level pandas function to selected rows of a DataFrame.

    For every entry in ``rules`` the function named by ``fn`` is looked up on
    the ``pd`` module and called with the row ``df_.loc[label]`` as its first
    argument, followed by the optional ``args`` and ``kwargs``. The value it
    returns is stored in the output under the same label.

    Parameters:
        df_ (DataFrame): DataFrame to process. It is only read.
        rules (dict): Maps an index label of ``df_`` to a dictionary with
            the following keys:
                - fn: name of the pd function to apply (required)
                - args: positional arguments passed after the row (optional)
                - kwargs: keyword arguments for the pd function (optional)

    Returns:
        DataFrame: New DataFrame with the columns of ``df_`` and one row per
        rule whose ``fn`` names a callable attribute of ``pd``.

    Raises:
        KeyError: If a rule has no ``fn`` entry, or its label is not in the
            index of ``df_``.

    Warns:
        UserWarning: If ``fn`` does not name a callable attribute of ``pd``;
            that rule is skipped and produces no row.
    """
    new_df = pd.DataFrame(columns=df_.columns)
    for idx, rule in rules.items():
        fn_name = rule['fn']
        # Only the lookup decides whether the function is "invalid". Errors
        # raised while the function runs (including AttributeError) propagate
        # instead of being mistaken for a bad name and silently dropped.
        func = getattr(pd, fn_name, None)
        if not callable(func):
            warnings.warn(
                f"Skipping row {idx!r}: pandas has no callable named {fn_name!r}",
                stacklevel=2,
            )
            continue
        args = rule.get('args') or ()
        kwargs = rule.get('kwargs') or {}
        new_df.loc[idx] = func(df_.loc[idx], *args, **kwargs)
    return new_df

# Example usage: one row per indicator, columns labelled 1 to 6.
df = pd.DataFrame(
    {
        1: [1605.605445, 1.641198, 88.0],
        2: [1573.885642, 1.649328, 83.585213],
        3: [1449.610451, 1.53556, 80.738045],
        4: [1443.882435, 1.507821, 79.9],
        5: [1458.104066, 1.408316, 67.6],
        6: [1357.563727, 1.349214, 76.3],
    },
    index=['GDP per capita', 'CO2 per capita', 'Electricity Access'],
)

# Cut example: the rule values are passed straight to pd.cut as keyword arguments.
out_df_cut = cut_dataframe(
    df,
    {
        'GDP per capita': {'bins': 3, 'labels': ['low', 'medium', 'high']},
        'CO2 per capita': {'bins': 3, 'labels': ['high', 'medium', 'low']},
        'Electricity Access': {'bins': [0, 80, 85, 100], 'labels': ['low', 'medium', 'high']},
    },
)

# Process example: the same binning, with the pandas function chosen by name.
out_df_process = process_dataframe(
    df,
    {
        'GDP per capita': {
            'fn': 'cut',
            'kwargs': {'bins': 3, 'labels': ['low', 'medium', 'high']},
        },
        'CO2 per capita': {
            'fn': 'cut',
            'kwargs': {'bins': 3, 'labels': ['high', 'medium', 'low']},
        },
        'Electricity Access': {
            'fn': 'cut',
            'kwargs': {'bins': [0, 80, 85, 100], 'labels': ['low', 'medium', 'high']},
        },
    },
)

Note that I’ve used Markdown formatting to make the code and text more readable. Let me know if you have any further requests!


Last modified on 2024-11-10