core

Universal helper functions

column helpers


source

move_columns


def move_columns(
    df:DataFrame, # Input
    cols_to_move:str, # Single
    pos:int, # Target
)->DataFrame:

Move one or more columns to a specified position in a DataFrame.

df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]})
df
A B C
0 1 4 7
1 2 5 8
2 3 6 9
move_columns(df, 'C', 0)
C A B
0 7 1 4
1 8 2 5
2 9 3 6
sample_string = 'KnotenNr. der Stücklistenposition'
sample_string = sample_string.lower()
sample_string
'knotennr. der stücklistenposition'
[c for c in sample_string][:10]
['k', 'n', 'o', 't', 'e', 'n', 'n', 'r', '.', ' ']
[c.isalnum() for c in sample_string][:10]
[True, True, True, True, True, True, True, True, False, False]
[c if c.isalnum() else '_' for c in sample_string][:10]
['k', 'n', 'o', 't', 'e', 'n', 'n', 'r', '_', '_']
sample_string = "".join([c if c.isalnum() else '_' for c in sample_string])
sample_string
'knotennr__der_stücklistenposition'
sample_string.split('_')
['knotennr', '', 'der', 'stücklistenposition']
[o for o in filter(None, sample_string.split('_'))]
['knotennr', 'der', 'stücklistenposition']
'_'.join(filter(None,sample_string.split('_')))
'knotennr_der_stücklistenposition'

source

clean_string


def clean_string(
    input_string:str
):

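Cleans input_string: lowercases it, replaces every non-alphanumeric character with an underscore, and collapses repeated underscores into one (dropping leading and trailing underscores).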

clean_string(sample_string)
'knotennr_der_stücklistenposition'

source

clean_col_names


def clean_col_names(
    df:DataFrame
)->DataFrame:

Returns df with clean column names by using clean_string on each column name.

df.head(2)
Cust_ID. Order--Date Prdct.Name! QTY___Ordered Unit$Price
0 101 2024-02-01 Widget A 10 99.99
1 102 2024-02-02 Widget B 20 149.99
df = clean_col_names(df)
df.head(2)
cust_id order_date prdct_name qty_ordered unit_price
0 101 2024-02-01 Widget A 10 99.99
1 102 2024-02-02 Widget B 20 149.99

source

show_identical_columns


def show_identical_columns(
    df:DataFrame, # The DataFrame to analyze
    columns:list, # The list of column names to compare
)->DataFrame: # A DataFrame matrix showing identity status between columns

Checks if specified columns in df are identical.

etl functions


source

reduce_mem_usage


def reduce_mem_usage(
    df:pandas.DataFrame | pandas.Series, # Input DataFrame or Series
    verbose:bool=True, # Whether to print memory usage reduction
)->pandas.DataFrame | pandas.Series: # Reduced DataFrame or Series

Reduces memory usage of a DataFrame or Series by downcasting numerical types.


source

group_resample


def group_resample(
    df:DataFrame, # Input DataFrame
    id_col:str, # Column name to group/unstack by
    value_col:str, # Column name containing values to aggregate
    date_col:str, # Column name containing dates
    freq:str='W', # Resampling frequency (e.g., 'W', 'ME', 'D')
    aggfunc:str='sum', # Aggregation function (e.g., 'sum', 'mean')
)->DataFrame: # Resampled and stacked DataFrame

Group, resample, and restack a DataFrame by ID and date.


source

polars_resample


def polars_resample(
    df, date_col:str='ds', group_cols:str='unique_id', agg_col:str='y', frequency:str='1mo'
):

Resample a Polars DataFrame to `frequency` along `date_col`, aggregating `agg_col` within each `group_cols` group.

validate functions

Experiments to detect whether a function uses out-of-scope (global) variables.

a = 10  # module-level value that my_sum reads on purpose
def my_sum(b):
    """Return ``b`` plus the module-level ``a`` (deliberately reads a global)."""
    result = a + b
    return result

my_sum(32)
42
import inspect
# inspect.getclosurevars(eval("my_sum"))
inspect.getclosurevars(my_sum)
ClosureVars(nonlocals={}, globals={'a': 10}, builtins={}, unbound=set())
def my_callback(result):
    """Print a fixed notice; ``result`` is accepted but not used."""
    message = "Cell just finished running!"
    print(message)

get_ipython().events.register('post_run_cell', my_callback)
Cell just finished running!
get_ipython().events.unregister('post_run_cell', my_callback)
import ast
def check_funcs(result):
    """Print the names of all functions defined in the cell that just ran.

    Meant to be registered as an IPython ``post_run_cell`` callback.
    ``result.info.raw_cell`` is parsed with ``ast``; both ``def`` and
    ``async def`` definitions (at any nesting level) are reported.
    Cells that are not plain Python are skipped silently.
    """
    try:
        tree = ast.parse(result.info.raw_cell)
    except (SyntaxError, ValueError):
        # The raw cell may contain IPython magics (%timeit, !ls, ...) or code
        # that failed to compile; there is nothing to analyse in that case.
        return
    func_names = [
        node.name
        for node in ast.walk(tree)
        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef))
    ]
    if func_names:
        print(f"Functions defined: {func_names}")
get_ipython().events.register('post_run_cell', check_funcs)
get_ipython().events.unregister('post_run_cell', check_funcs)
def check_funcs(result):
    """Print the names of all functions defined in the cell that just ran.

    Meant to be registered as an IPython ``post_run_cell`` callback.
    ``result.info.raw_cell`` is parsed with ``ast``; both ``def`` and
    ``async def`` definitions (at any nesting level) are reported.
    Cells that are not plain Python are skipped silently.
    """
    try:
        tree = ast.parse(result.info.raw_cell)
    except (SyntaxError, ValueError):
        # The raw cell may contain IPython magics (%timeit, !ls, ...) or code
        # that failed to compile; there is nothing to analyse in that case.
        return
    func_names = [
        node.name
        for node in ast.walk(tree)
        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef))
    ]
    if func_names:
        print(f"Functions defined: {func_names}")
get_ipython().events.register('post_run_cell', check_funcs)
get_ipython().events.unregister('post_run_cell', check_funcs)
import ast
import inspect
def check_global_deps(result):
    """Warn when a function defined in the just-run cell reads global variables.

    Meant to be registered as an IPython ``post_run_cell`` callback. The
    function names are collected from ``result.info.raw_cell`` via ``ast``
    (``def`` and ``async def``), then looked up in the interactive user
    namespace and inspected with ``inspect.getclosurevars``. One warning
    line is printed per function whose ``globals`` mapping is non-empty.
    Cells that are not plain Python are skipped silently.
    """
    try:
        tree = ast.parse(result.info.raw_cell)
    except (SyntaxError, ValueError):
        # The raw cell may contain IPython magics (%timeit, !ls, ...) or code
        # that failed to compile; there is nothing to analyse in that case.
        return
    func_names = [
        node.name
        for node in ast.walk(tree)
        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef))
    ]
    if not func_names:
        return

    ns = get_ipython().user_ns
    for name in func_names:
        if name not in ns:
            # Methods and nested functions are found by ast.walk but are not
            # bound at the top level of the user namespace.
            continue
        try:
            cv = inspect.getclosurevars(ns[name])
        except TypeError:
            # The namespace entry sharing this name is not a Python function
            # (e.g. a method name that collides with an unrelated variable).
            continue
        if cv.globals:
            print(f"⚠️  '{name}' depends on global variables: {list(cv.globals.keys())}")
get_ipython().events.register('post_run_cell', check_global_deps)
def my_sum(b):
    """Return ``a + b``, where ``a`` is resolved from global scope."""
    total = a + b
    return total

my_sum(32)
42
⚠️  'my_sum' depends on global variables: ['a']
get_ipython().events.unregister('post_run_cell', check_global_deps)