Module nowcastlib.pipeline.features
module containing functionality related to feature engineering and selection
Expand source code
"""module containing functionality related to feature engineering and selection"""
import logging
import pandas as pd
from nowcastlib.pipeline.structs import config
from . import generate
logger = logging.getLogger(__name__)
def generate_field(
    data_df: pd.core.frame.DataFrame,
    field_config: config.GeneratedField,
):
    """
    Produce a new data series by running the configured generator function

    Parameters
    ----------
    data_df : pandas.core.frame.DataFrame
        the dataframe whose columns (and index) are available as inputs
    field_config : nowcastlib.pipeline.structs.config.GeneratedField
        describes which generator function to use, the names of the
        input fields to pass to it, and any additional keyword arguments

    Returns
    -------
    pandas.core.series.Series
        the resulting dataseries, as returned by the generator function

    Raises
    ------
    NotImplementedError
        if a custom generator function is requested, either through the
        ``CUSTOM`` generator type or by providing a ``func_path``
    """
    # materialise the configured field names as a list
    column_names = list(field_config.input_fields)

    custom_requested = (
        field_config.gen_func == config.GeneratorFunction.CUSTOM
        or field_config.func_path is not None
    )
    if custom_requested:
        # TODO handle custom case
        raise NotImplementedError("Custom generator functions are not yet supported")

    if "index" not in column_names:
        inputs = data_df[column_names].copy()
    else:
        # "index" stands for the dataframe's own index rather than a column,
        # so it is attached as an extra column next to the regular ones
        regular_columns = [name for name in column_names if name != "index"]
        with_index = data_df[regular_columns].assign(index=data_df.index)
        # restore the ordering requested in the configuration
        inputs = with_index[column_names]

    # look up the generator function matching the configured type
    generator = generate.function_map[field_config.gen_func]

    # fall back to no extra keyword arguments when none are configured
    extra_kwargs = field_config.additional_kwargs
    if extra_kwargs is None:
        extra_kwargs = {}

    # each input column is handed over as its own positional argument
    positional = [inputs[name] for name in inputs]
    return generator(*positional, **extra_kwargs)
def generate_fields(options: config.DataSet, data_df: pd.core.frame.DataFrame):
    """
    Return a copy of the input dataframe extended with the generated fields

    Every entry of ``options.generated_fields`` is computed from the
    original ``data_df`` (not from the augmented copy) and stored in the
    copy under the entry's ``target_name``. When no generated fields are
    configured, an unmodified copy of ``data_df`` is returned.
    """
    augmented_df = data_df.copy()
    field_specs = options.generated_fields
    if field_specs is None:
        # nothing to generate
        return augmented_df

    logger.info("Generating additional fields...")
    for spec in field_specs:
        target = spec.target_name
        logger.debug("Generating field %s...", target)
        augmented_df[target] = generate_field(data_df, spec)
    logger.info("Field Generation complete.")
    return augmented_df
Sub-modules
nowcastlib.pipeline.features.generate
-
functions for generating new fields
Functions
def generate_field(data_df: pandas.core.frame.DataFrame, field_config: GeneratedField)
-
generates a new field by applying the relevant generator function
Parameters
data_df
:pandas.core.frame.DataFrame
- the dataframe holding the data we can access
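field_config
:GeneratedField
- the configuration of the field to generate: its generator function, its input fields and any additional keyword arguments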
Returns
pandas.core.series.Series
- the resulting dataseries
Expand source code
def generate_field(
    data_df: pd.core.frame.DataFrame,
    field_config: config.GeneratedField,
):
    """
    Produce a new data series by running the configured generator function

    Parameters
    ----------
    data_df : pandas.core.frame.DataFrame
        the dataframe whose columns (and index) are available as inputs
    field_config : nowcastlib.pipeline.structs.config.GeneratedField
        describes which generator function to use, the names of the
        input fields to pass to it, and any additional keyword arguments

    Returns
    -------
    pandas.core.series.Series
        the resulting dataseries, as returned by the generator function

    Raises
    ------
    NotImplementedError
        if a custom generator function is requested, either through the
        ``CUSTOM`` generator type or by providing a ``func_path``
    """
    # materialise the configured field names as a list
    column_names = list(field_config.input_fields)

    custom_requested = (
        field_config.gen_func == config.GeneratorFunction.CUSTOM
        or field_config.func_path is not None
    )
    if custom_requested:
        # TODO handle custom case
        raise NotImplementedError("Custom generator functions are not yet supported")

    if "index" not in column_names:
        inputs = data_df[column_names].copy()
    else:
        # "index" stands for the dataframe's own index rather than a column,
        # so it is attached as an extra column next to the regular ones
        regular_columns = [name for name in column_names if name != "index"]
        with_index = data_df[regular_columns].assign(index=data_df.index)
        # restore the ordering requested in the configuration
        inputs = with_index[column_names]

    # look up the generator function matching the configured type
    generator = generate.function_map[field_config.gen_func]

    # fall back to no extra keyword arguments when none are configured
    extra_kwargs = field_config.additional_kwargs
    if extra_kwargs is None:
        extra_kwargs = {}

    # each input column is handed over as its own positional argument
    positional = [inputs[name] for name in inputs]
    return generator(*positional, **extra_kwargs)
def generate_fields(options: DataSet, data_df: pandas.core.frame.DataFrame)
-
Augments an input dataframe with additional fields generated from the existing fields and auxiliary data
Expand source code
def generate_fields(options: config.DataSet, data_df: pd.core.frame.DataFrame):
    """
    Return a copy of the input dataframe extended with the generated fields

    Every entry of ``options.generated_fields`` is computed from the
    original ``data_df`` (not from the augmented copy) and stored in the
    copy under the entry's ``target_name``. When no generated fields are
    configured, an unmodified copy of ``data_df`` is returned.
    """
    augmented_df = data_df.copy()
    field_specs = options.generated_fields
    if field_specs is None:
        # nothing to generate
        return augmented_df

    logger.info("Generating additional fields...")
    for spec in field_specs:
        target = spec.target_name
        logger.debug("Generating field %s...", target)
        augmented_df[target] = generate_field(data_df, spec)
    logger.info("Field Generation complete.")
    return augmented_df