column helpers
source
move_columns
def move_columns(
df:DataFrame, # Input
cols_to_move:str , # Single
pos:int , # Target
)-> DataFrame:
Move one or more columns to a specified position in a DataFrame.
df = pd.DataFrame({'A' : [1 , 2 , 3 ], 'B' : [4 , 5 , 6 ], 'C' : [7 , 8 , 9 ]})
df
sample_string = 'KnotenNr. der Stücklistenposition'
sample_string = sample_string.lower()
sample_string
'knotennr. der stücklistenposition'
[c for c in sample_string][:10 ]
['k', 'n', 'o', 't', 'e', 'n', 'n', 'r', '.', ' ']
[c.isalnum() for c in sample_string][:10 ]
[True, True, True, True, True, True, True, True, False, False]
[c if c.isalnum() else '_' for c in sample_string][:10 ]
['k', 'n', 'o', 't', 'e', 'n', 'n', 'r', '_', '_']
sample_string = "" .join([c if c.isalnum() else '_' for c in sample_string])
sample_string
'knotennr__der_stücklistenposition'
sample_string.split('_' )
['knotennr', '', 'der', 'stücklistenposition']
[o for o in filter (None , sample_string.split('_' ))]
['knotennr', 'der', 'stücklistenposition']
'_' .join(filter (None ,sample_string.split('_' )))
'knotennr_der_stücklistenposition'
source
clean_string
def clean_string(
input_string:str
):
Cleans input_string
clean_string(sample_string)
'knotennr_der_stücklistenposition'
source
clean_col_names
def clean_col_names(
df:DataFrame
)-> DataFrame:
Returns df with clean column names by using clean_string on each column name.
0
101
2024-02-01
Widget A
10
99.99
1
102
2024-02-02
Widget B
20
149.99
df = clean_col_names(df)
df.head(2 )
0
101
2024-02-01
Widget A
10
99.99
1
102
2024-02-02
Widget B
20
149.99
source
show_identical_columns
def show_identical_columns(
df:DataFrame, # The DataFrame to analyze
columns:list , # The list of column names to compare
)-> DataFrame: # A DataFrame matrix showing identity status between columns
Checks if specified columns in df are identical.
etl functions
source
reduce_mem_usage
def reduce_mem_usage(
df:pandas.DataFrame | pandas.Series, # Input DataFrame or Series
verbose:bool = True , # Whether to print memory usage reduction
)-> pandas.DataFrame | pandas.Series: # Reduced DataFrame or Series
Reduces memory usage of a DataFrame or Series by downcasting numerical types.
source
group_resample
def group_resample(
df:DataFrame, # Input DataFrame
id_col:str , # Column name to group/unstack by
value_col:str , # Column name containing values to aggregate
date_col:str , # Column name containing dates
freq:str = 'W' , # Resampling frequency (e.g., 'W', 'ME', 'D')
aggfunc:str = 'sum' , # Aggregation function (e.g., 'sum', 'mean')
)-> DataFrame: # Resampled and stacked DataFrame
Group, resample, and restack a DataFrame by ID and date.
source
polars_resample
def polars_resample(
df, date_col:str = 'ds' , group_cols:str = 'unique_id' , agg_col:str = 'y' , frequency:str = '1mo'
):
Call self as a function.
validate functions
Tests to check whether a function uses out-of-scope (global) variables
def my_sum(b):
    """Return ``a + b``, where ``a`` is read from the global scope.

    The dependence on the global ``a`` is deliberate: this function is the
    example inspected with ``inspect.getclosurevars`` to show how
    out-of-scope variable usage can be detected.
    """
    return a + b
my_sum(32 )
# inspect.getclosurevars(eval("my_sum"))
inspect.getclosurevars(my_sum)
ClosureVars(nonlocals={}, globals={'a': 10}, builtins={}, unbound=set())
def my_callback(result):
    """Print a fixed message; used as a minimal ``post_run_cell`` callback.

    Args:
        result: Argument passed by the event system to the callback; unused.
    """
    print("Cell just finished running!")
get_ipython().events.register('post_run_cell' , my_callback)
Cell just finished running!
get_ipython().events.unregister('post_run_cell' , my_callback)
def check_funcs(result):
    """Print the names of the functions defined in the cell that just ran.

    Meant to be registered as a ``post_run_cell`` callback.

    Args:
        result: Object exposing the executed cell's source text as
            ``result.info.raw_cell``.
    """
    try:
        tree = ast.parse(result.info.raw_cell)
    except SyntaxError:
        # The raw cell is not always plain Python (magics, shell escapes,
        # or a cell that itself failed to compile); there is nothing to
        # report then, and the callback must not raise.
        return
    func_names = [
        node.name
        for node in ast.walk(tree)
        # Include ``async def`` so coroutine functions are reported too.
        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef))
    ]
    if func_names:
        print(f"Functions defined: {func_names}")
get_ipython().events.register('post_run_cell' , check_funcs)
get_ipython().events.unregister('post_run_cell' , check_funcs)
def check_funcs(result):
    """Print the names of the functions defined in the cell that just ran.

    Meant to be registered as a ``post_run_cell`` callback.

    Args:
        result: Object exposing the executed cell's source text as
            ``result.info.raw_cell``.
    """
    try:
        tree = ast.parse(result.info.raw_cell)
    except SyntaxError:
        # The raw cell is not always plain Python (magics, shell escapes,
        # or a cell that itself failed to compile); there is nothing to
        # report then, and the callback must not raise.
        return
    func_names = [
        node.name
        for node in ast.walk(tree)
        # Include ``async def`` so coroutine functions are reported too.
        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef))
    ]
    if func_names:
        print(f"Functions defined: {func_names}")
get_ipython().events.register('post_run_cell' , check_funcs)
get_ipython().events.unregister('post_run_cell' , check_funcs)
import ast
import inspect
def check_global_deps(result):
    """Warn about functions from the just-run cell that read global variables.

    Meant to be registered as a ``post_run_cell`` callback. Every function
    name defined in the cell source is looked up in the interactive user
    namespace; if ``inspect.getclosurevars`` reports global references for
    it, a warning listing those names is printed.

    Args:
        result: Object exposing the executed cell's source text as
            ``result.info.raw_cell``.
    """
    try:
        tree = ast.parse(result.info.raw_cell)
    except SyntaxError:
        # The raw cell is not always plain Python (magics, shell escapes,
        # or a cell that itself failed to compile); skip it silently.
        return
    func_names = [
        node.name
        for node in ast.walk(tree)
        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef))
    ]
    ns = get_ipython().user_ns
    for name in func_names:
        func = ns.get(name)
        # The name may be missing from the namespace (nested function,
        # method) or bound to something that is not a plain function (for
        # example a decorator's wrapper object); getclosurevars would raise
        # TypeError for those, so skip them.
        if not inspect.isfunction(func):
            continue
        cv = inspect.getclosurevars(func)
        if cv.globals:
            print(f"⚠️ '{name}' depends on global variables: {list(cv.globals.keys())}")
get_ipython().events.register('post_run_cell' , check_global_deps)
def my_sum(b):
    """Return ``a + b``, where ``a`` is read from the global scope.

    Deliberately depends on the global ``a`` so that defining it triggers
    the global-dependency warning demonstrated in this section.
    """
    return a + b
my_sum(32 )
⚠️ 'my_sum' depends on global variables: ['a']
get_ipython().events.unregister('post_run_cell' , check_global_deps)