colassigner
Fits somewhat complex, nested data structures into tables and removes the need to remember the name and construction logic of any column, as long as you can rely on static analysis.
things to think about:
draw a reliance dag based on calls
pivot table: data content based columns
enum type support
changing record entity type
partial inheritance / composite types
Installation:
using pip
pip install colassigner
Quickstart
Assign Columns
import pandas as pd
from colassigner import ColAssigner
class Cols(ColAssigner):
def col1(self, df):
return df.iloc[:, 0] * 2
def col2(self, df):
return "added-another"
df = pd.DataFrame({"a": [1, 2, 3]}).pipe(Cols())
df
| | a | col_1 | col_2 |
---|---|---|---|
0 | 1 | 2 | added-another |
1 | 2 | 4 | added-another |
2 | 3 | 6 | added-another |
df.loc[:, Cols.col2]
0 added-another
1 added-another
2 added-another
Name: col_2, dtype: object
Access Columns
while also documenting datatypes
from colassigner import ColAccessor
class Cols(ColAccessor):
x = int
y = float
df = pd.DataFrame({Cols.x: [1, 2, 3], Cols.y: [0.3, 0.1, 0.9]})
df
| | x | y |
---|---|---|
0 | 1 | 0.3 |
1 | 2 | 0.1 |
2 | 3 | 0.9 |
df.loc[:, Cols.y]
0 0.3
1 0.1
2 0.9
Name: y, dtype: float64
Nested Structures
Nested Accessor
import pandas as pd
from colassigner import ColAccessor
class GrandChildCols(ColAccessor):
x = str
y = str
class ChildCols(ColAccessor):
a = int
b = float
grandchild_a = GrandChildCols
grandchild_b = GrandChildCols
class Cols(ColAccessor):
fing = int
assigned_child = ChildCols
class InheritedChild(ChildCols):
pass
pd.DataFrame(
{
Cols.fing: [2, 3, 4],
Cols.assigned_child.grandchild_a.y: ["a", "b", "c"],
Cols.InheritedChild.b: [0.1, 0.2, 0.3],
}
)
| | fing | assigned_child__grandchild_a__y | inherited_child__b |
---|---|---|---|
0 | 2 | a | 0.1 |
1 | 3 | b | 0.2 |
2 | 4 | c | 0.3 |
Nested Assigner
from colassigner import ColAssigner
class SourceCols(ColAccessor):
x = float
b = bool
class SepChild(ColAssigner):
_col = SourceCols.x
def neg(self, df):
return -df[self._col]
def double(self, df):
return 2 * df[self._col]
class Cols(ColAssigner):
def col_one(self, df):
return 1
class SubCol(ColAssigner):
def fing(self, df):
return df.sum(axis=1)
class SubSubCol(ColAssigner):
_prefix = "pref_"
def sub_x(self, df):
return 0
def sub_y(self, df):
return self._prefix + df[Cols.col_one].astype(str)
class SubSubCol2(SubSubCol):
_prefix = "pref2_"
sep_child = SepChild
class SepChildB(SepChild):
_col = SourceCols.b
df = pd.DataFrame({
SourceCols.x: [1.5, 3.4, 9.1], SourceCols.b: [False, True, True]
}).pipe(Cols())
df.T
| | 0 | 1 | 2 |
---|---|---|---|
x | 1.5 | 3.4 | 9.1 |
b | False | True | True |
col_one | 1 | 1 | 1 |
sub_col__fing | 2.5 | 5.4 | 11.1 |
sub_col__sub_sub_col__sub_x | 0 | 0 | 0 |
sub_col__sub_sub_col__sub_y | pref_1 | pref_1 | pref_1 |
sub_col__sub_sub_col_2__sub_x | 0 | 0 | 0 |
sub_col__sub_sub_col_2__sub_y | pref2_1 | pref2_1 | pref2_1 |
sep_child__neg | -1.5 | -3.4 | -9.1 |
sep_child__double | 3.0 | 6.8 | 18.2 |
sep_child_b__neg | True | False | False |
sep_child_b__double | 0 | 2 | 2 |
df.loc[:, [Cols.sep_child.double, Cols.SubCol.SubSubCol2.sub_x]]
| | sep_child__double | sub_col__sub_sub_col_2__sub_x |
---|---|---|
0 | 3.0 | 0 |
1 | 6.8 | 0 |
2 | 18.2 | 0 |
Designated Child Assigner
These are designed for sharing information among assigners: their methods do not take the dataframe as an argument; instead, both the df and the parent assigner are passed as parameters to their
__init__
import numpy as np
from colassigner import ChildColAssigner
class RawCols(ColAccessor):
cat = str
num = int
class RawCols2(ColAccessor):
b = str
c = str
class IntSides(ChildColAssigner):
# note the type and order of the parameters:
def __init__(self, df, parent_assigner: "GbReindex") -> None:
self.arr = parent_assigner.arr
# note the absence of parameters
def lower(self):
return np.floor(self.arr).astype(int)
def upper(self):
return np.ceil(self.arr).astype(int)
class GbReindex(ChildColAssigner):
main_col = ...
def __init__(self, df, bc: "BaseCols"):
# note that this reindex needs to be done only once
# and can be used in many child assigners
self.arr = bc.base_gb.reindex(df[self.main_col]).values
def values(self):
return self.arr
sides = IntSides
class BaseCols(ColAssigner):
def __init__(self, base_df):
self.base_gb = base_df.groupby(RawCols.cat)[RawCols.num].mean()
class GbB(GbReindex):
main_col = RawCols2.b
class GbC(GbReindex):
main_col = RawCols2.c
def prod(self, df):
return df.loc[
:, [BaseCols.GbB.sides.lower, BaseCols.GbC.values]
].prod(axis=1)
df1 = pd.DataFrame({RawCols.cat: ["x", "y", "y"], RawCols.num: [2, 3, 4]})
assigner = BaseCols(df1)
df2 = pd.DataFrame({"b": ["x", "y", "x"], "c": ["y", "y", "x"]}).pipe(assigner)
df2
| | b | c | gb_b__values | gb_b__sides__lower | gb_b__sides__upper | gb_c__values | gb_c__sides__lower | gb_c__sides__upper | prod |
---|---|---|---|---|---|---|---|---|---|
0 | x | y | 2.0 | 2 | 2 | 3.5 | 3 | 4 | 7.0 |
1 | y | y | 3.5 | 3 | 4 | 3.5 | 3 | 4 | 10.5 |
2 | x | x | 2.0 | 2 | 2 | 2.0 | 2 | 2 | 4.0 |
API
colassigner Package
Helper for assigning and accessing pandas columns
Functions
|
|
|
|
|
returns a list of strings of all columns given by the type |
|
get the true assigned value for the class attribute |
|
generates a dag of the reliances of columns based on the ast of a colassigner |
|
|
|
Classes
|
assigner specifically for nested structures |
|
|
describe and access raw columns |
|
define functions that create columns in a dataframe |
Class Inheritance Diagram
digraph inheritance21f814a7d2 { bgcolor=transparent; rankdir=LR; size="8.0, 12.0"; "ChildColAssigner" [URL="index.html#colassigner.ChildColAssigner",fillcolor=white,fontname="Vera Sans, DejaVu Sans, Liberation Sans, Arial, Helvetica, sans",fontsize=10,height=0.25,shape=box,style="setlinewidth(0.5),filled",target="_top",tooltip="assigner specifically for nested structures"]; "ColAssigner" -> "ChildColAssigner" [arrowsize=0.5,style="setlinewidth(0.5)"]; "Col" [URL="index.html#colassigner.Col",fillcolor=white,fontname="Vera Sans, DejaVu Sans, Liberation Sans, Arial, Helvetica, sans",fontsize=10,height=0.25,shape=box,style="setlinewidth(0.5),filled",target="_top"]; "Generic" -> "Col" [arrowsize=0.5,style="setlinewidth(0.5)"]; "ColAccessor" [URL="index.html#colassigner.ColAccessor",fillcolor=white,fontname="Vera Sans, DejaVu Sans, Liberation Sans, Arial, Helvetica, sans",fontsize=10,height=0.25,shape=box,style="setlinewidth(0.5),filled",target="_top",tooltip="describe and access raw columns"]; "ColAssigner" [URL="index.html#colassigner.ColAssigner",fillcolor=white,fontname="Vera Sans, DejaVu Sans, Liberation Sans, Arial, Helvetica, sans",fontsize=10,height=0.25,shape=box,style="setlinewidth(0.5),filled",target="_top",tooltip="define functions that create columns in a dataframe"]; "ColAccessor" -> "ColAssigner" [arrowsize=0.5,style="setlinewidth(0.5)"]; "Generic" [fillcolor=white,fontname="Vera Sans, DejaVu Sans, Liberation Sans, Arial, Helvetica, sans",fontsize=10,height=0.25,shape=box,style="setlinewidth(0.5),filled",tooltip="Abstract base class for generic types."]; }
Release Notes
v0.0.0
first release of colassigner, yay!!
v0.0.1
first release of colassigner, yay!!
v0.0.2
points of what's new
v0.0.3
add calling graph feature
v0.0.4
add colaccessor
v0.0.5
extend allcols
v0.0.6
unify accessor and assigner
v0.1.0
change base structure to DRY it a bit; add col type retention
v0.2.0
rethink nested structures
do away with .assign(CA()) pattern
replace with .pipe(CA())
put aside graph generation for a while
add proper docs
v0.2.1
multidigit number fix
v0.2.2
v0.3.0
v0.3.1
v0.4.0
cleanup
v0.4.1
minor extension and simplification
v0.4.2
getitem