Python Data Manipulation: Cutting and Processing DataFrames with Pandas Functions
Here is the code with added documentation and some minor improvements for readability:
import pandas as pd
def cut_dataframe(df_, rules):
    """
    Bin selected rows of a DataFrame with ``pd.cut`` and collect the results.

    Parameters:
        df_ (DataFrame): DataFrame to process. Rows are looked up by index
            label via ``df_.loc``.
        rules (dict): Dictionary of rules. Each key is a row label of ``df_``;
            each value is a dictionary of keyword arguments passed to
            ``pd.cut`` for that row.

    Returns:
        New DataFrame with the same columns as ``df_`` and one row per rule,
        holding the ``pd.cut`` result for that row.

    Raises:
        KeyError: If a rule key is not a row label of ``df_``.
    """
    new_df = pd.DataFrame(columns=df_.columns)
    for idx, kwargs in rules.items():
        # pd.cut returns a categorical Series; it is stored as a plain list,
        # so values are placed into the new row by position.
        new_row = pd.cut(df_.loc[idx], **kwargs).tolist()
        new_df.loc[idx] = new_row
    return new_df
def process_dataframe(df_, rules):
    """
    Apply a top-level pandas function to selected rows of a DataFrame.

    Parameters:
        df_ (DataFrame): DataFrame to process. Rows are looked up by index
            label via ``df_.loc``.
        rules (dict): Dictionary of rules. Each key is a row label of ``df_``;
            each value is a dictionary containing the following keys:
            - fn: name of the function to look up on the ``pandas`` module
            - args: optional positional arguments passed after the row
            - kwargs: optional keyword arguments for the function

    Returns:
        New DataFrame with the same columns as ``df_`` and one row per rule
        that was applied. A rule whose ``fn`` is not an attribute of the
        ``pandas`` module is skipped and reported with a warning.

    Raises:
        KeyError: If a rule has no ``fn`` key, or its row label is missing
            from ``df_`` (only reached when ``fn`` is valid).
    """
    import warnings  # local import so the block does not rely on file-level imports

    new_df = pd.DataFrame(columns=df_.columns)
    for idx, rule in rules.items():
        # Guard only the name lookup: an AttributeError raised while reading
        # the row or running the function is a real error and must surface
        # instead of being mistaken for an unknown function name.
        try:
            func = getattr(pd, rule['fn'])
        except AttributeError:
            warnings.warn(
                f"Skipping rule for {idx!r}: pandas has no attribute {rule['fn']!r}",
                stacklevel=2,
            )
            continue
        # NOTE(review): 'fn' can name any attribute of the pandas module;
        # do not feed rules from untrusted input.
        new_df.loc[idx] = func(
            df_.loc[idx],
            *rule.get('args', ()),
            **rule.get('kwargs', {}),
        )
    return new_df
# Example usage
# Sample frame: one row per indicator, columns labelled 1 through 6.
df = pd.DataFrame(
    {
        1: [1605.605445, 1.641198, 88.0],
        2: [1573.885642, 1.649328, 83.585213],
        3: [1449.610451, 1.53556, 80.738045],
        4: [1443.882435, 1.507821, 79.9],
        5: [1458.104066, 1.408316, 67.6],
        6: [1357.563727, 1.349214, 76.3],
    },
    index=['GDP per capita', 'CO2 per capita', 'Electricity Access'],
)

# Cut example: each row label maps directly to the pd.cut keyword arguments.
out_df_cut = cut_dataframe(
    df,
    {
        'GDP per capita': {'bins': 3, 'labels': ['low', 'medium', 'high']},
        'CO2 per capita': {'bins': 3, 'labels': ['high', 'medium', 'low']},
        'Electricity Access': {
            'bins': [0, 80, 85, 100],
            'labels': ['low', 'medium', 'high'],
        },
    },
)

# Process example: the same binning, with the pandas function chosen by name.
out_df_process = process_dataframe(
    df,
    {
        'GDP per capita': {
            'fn': 'cut',
            'kwargs': {'bins': 3, 'labels': ['low', 'medium', 'high']},
        },
        'CO2 per capita': {
            'fn': 'cut',
            'kwargs': {'bins': 3, 'labels': ['high', 'medium', 'low']},
        },
        'Electricity Access': {
            'fn': 'cut',
            'kwargs': {
                'bins': [0, 80, 85, 100],
                'labels': ['low', 'medium', 'high'],
            },
        },
    },
)
Note that I’ve used Markdown formatting to make the code and text more readable. Let me know if you have any further requests!
Last modified on 2024-11-10