Skip to content

RegularDataProcessor

Bases: BaseProcessor

Main class for Regular/Tabular Data Preprocessing. It works like any other scikit-learn transformer, exposing the methods fit, transform and inverse_transform.

Parameters:

Name Type Description Default
num_cols list of strings

List of names of numerical columns.

None
cat_cols list of strings

List of names of categorical columns.

None
Source code in /opt/hostedtoolcache/Python/3.10.12/x64/lib/python3.10/site-packages/ydata_synthetic/preprocessing/regular/processor.py
@typechecked
class RegularDataProcessor(BaseProcessor):
    """
    Main class for Regular/Tabular Data Preprocessing.
    It works like any other transformer in scikit-learn with the methods fit, transform and inverse_transform.

    Numerical columns are scaled with a MinMaxScaler and categorical columns are
    encoded with a OneHotEncoder. The transformed array holds the numerical block
    first, followed by the categorical block.

    Args:
        num_cols (list of strings):
            List of names of numerical columns.
        cat_cols (list of strings):
            List of names of categorical columns.
    """
    def __init__(self, num_cols: Optional[List[str]] = None, cat_cols: Optional[List[str]] = None):
        super().__init__(num_cols, cat_cols)

        # Populated by fit():
        #   _col_order_   - selected columns, in the order they appear in the fitted frame
        #   _num_col_idx_ - index where the numerical block ends in the transformed array
        #   _cat_col_idx_ - index where the categorical block ends in the transformed array
        self._col_order_ = None
        self._num_col_idx_ = None
        self._cat_col_idx_ = None

    def fit(self, X: DataFrame) -> RegularDataProcessor:
        """Fits the DataProcessor to a passed DataFrame.
        Args:
            X (DataFrame):
                DataFrame used to fit the processor parameters.
                Should be aligned with the num/cat columns defined in initialization.
        Returns:
            self (RegularDataProcessor): The fitted data processor.
        """
        self._validate_cols(X.columns)

        selected = self.num_cols + self.cat_cols
        self._col_order_ = [c for c in X.columns if c in selected]

        # Original dtypes are kept so inverse_transform can cast columns back.
        self._types = X.dtypes

        self._num_pipeline = Pipeline([
            ("scaler", MinMaxScaler()),
        ])
        self._cat_pipeline = Pipeline([
            ("encoder", OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
        ])

        # Each pipeline is fitted only when its own column group is non-empty.
        # An unfitted pipeline contributes zero output columns, so its feature
        # names must not be queried (that would raise on an unfitted estimator).
        num_width = 0
        if self.num_cols:
            self.num_pipeline.fit(X[self.num_cols])
            num_width = len(self.num_pipeline.get_feature_names_out())

        cat_width = 0
        if self.cat_cols:
            self.cat_pipeline.fit(X[self.cat_cols])
            cat_width = len(self.cat_pipeline.get_feature_names_out())

        self._num_col_idx_ = num_width
        self._cat_col_idx_ = num_width + cat_width

        return self

    def transform(self, X: DataFrame) -> ndarray:
        """Transforms the passed DataFrame with the fit DataProcessor.
        Args:
            X (DataFrame):
                DataFrame to be transformed.
                Should be aligned with the columns types defined in initialization.
        Returns:
            transformed (ndarray):
                Processed version of the passed DataFrame
                (numerical block followed by categorical block).
        """
        self._check_is_fitted()

        # An empty column group yields a zero-width block so concatenation still works.
        num_data = self.num_pipeline.transform(X[self.num_cols]) if self.num_cols else zeros([len(X), 0])
        cat_data = self.cat_pipeline.transform(X[self.cat_cols]) if self.cat_cols else zeros([len(X), 0])

        transformed = concatenate([num_data, cat_data], axis=1)

        return transformed

    def inverse_transform(self, X: ndarray) -> DataFrame:
        """Inverts the data transformation pipelines on a passed array.
        Args:
            X (ndarray):
                Numpy array to be brought back to the original data format.
                Should share the schema of data transformed by this DataProcessor.
                Can be used to revert transformations of training data or for synthetic samples.
        Returns:
            result (DataFrame):
                DataFrame with all performed transformations inverted.
        """
        self._check_is_fitted()

        # Split at the block boundaries recorded by fit(); the trailing piece is discarded.
        num_data, cat_data, _ = split(X, [self._num_col_idx_, self._cat_col_idx_], axis=1)

        num_data = self.num_pipeline.inverse_transform(num_data) if self.num_cols else zeros([len(X), 0])
        cat_data = self.cat_pipeline.inverse_transform(cat_data) if self.cat_cols else zeros([len(X), 0])

        result = concat([DataFrame(num_data, columns=self.num_cols),
                         DataFrame(cat_data, columns=self.cat_cols)], axis=1)

        # Restore the column order seen at fit time.
        result = result.loc[:, self._col_order_]

        # Cast every column back to the dtype recorded at fit time.
        for col in result.columns:
            result[col] = result[col].astype(self._types[col])

        return result

fit(X)

Fits the DataProcessor to a passed DataFrame.

Parameters:

Name Type Description Default
X DataFrame

DataFrame used to fit the processor parameters. Should be aligned with the num/cat columns defined in initialization.

required

Returns:

Name Type Description
self RegularDataProcessor

The fitted data processor.

Source code in /opt/hostedtoolcache/Python/3.10.12/x64/lib/python3.10/site-packages/ydata_synthetic/preprocessing/regular/processor.py
def fit(self, X: DataFrame) -> RegularDataProcessor:
    """Fits the DataProcessor to a passed DataFrame.
    Args:
        X (DataFrame):
            DataFrame used to fit the processor parameters.
            Should be aligned with the num/cat columns defined in initialization.
    Returns:
        self (RegularDataProcessor): The fitted data processor.
    """
    self._validate_cols(X.columns)

    selected = self.num_cols + self.cat_cols
    self._col_order_ = [c for c in X.columns if c in selected]

    # Original dtypes are recorded alongside the fitted pipelines.
    self._types = X.dtypes

    self._num_pipeline = Pipeline([
        ("scaler", MinMaxScaler()),
    ])
    self._cat_pipeline = Pipeline([
        ("encoder", OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
    ])

    # Each pipeline is fitted only when its own column group is non-empty.
    # An unfitted pipeline contributes zero output columns, so its feature
    # names must not be queried (that would raise on an unfitted estimator).
    num_width = 0
    if self.num_cols:
        self.num_pipeline.fit(X[self.num_cols])
        num_width = len(self.num_pipeline.get_feature_names_out())

    cat_width = 0
    if self.cat_cols:
        self.cat_pipeline.fit(X[self.cat_cols])
        cat_width = len(self.cat_pipeline.get_feature_names_out())

    # Boundaries of the numerical and categorical blocks in the transformed array.
    self._num_col_idx_ = num_width
    self._cat_col_idx_ = num_width + cat_width

    return self

inverse_transform(X)

Inverts the data transformation pipelines on a passed NumPy array.

Parameters:

Name Type Description Default
X ndarray

Numpy array to be brought back to the original data format. Should share the schema of data transformed by this DataProcessor. Can be used to revert transformations of training data or for synthetic samples.

required

Returns:

Name Type Description
result DataFrame

DataFrame with all performed transformations inverted.

Source code in /opt/hostedtoolcache/Python/3.10.12/x64/lib/python3.10/site-packages/ydata_synthetic/preprocessing/regular/processor.py
def inverse_transform(self, X: ndarray) -> DataFrame:
    """Map a transformed array back to the original tabular format.
    Args:
        X (ndarray):
            Array laid out like the output of this processor's transform
            (numerical block first, categorical block second).
            May hold transformed training data or synthetic samples.
    Returns:
        result (DataFrame):
            DataFrame with the pipelines inverted, columns reordered to the
            order recorded at fit time and cast to the recorded dtypes.
    """
    self._check_is_fitted()

    # Cut the array at the block boundaries; whatever trails them is dropped.
    boundaries = [self._num_col_idx_, self._cat_col_idx_]
    numeric_part, categorical_part, _ = split(X, boundaries, axis=1)

    # An empty column group is represented by a zero-width block.
    if self.num_cols:
        numeric_part = self.num_pipeline.inverse_transform(numeric_part)
    else:
        numeric_part = zeros([len(X), 0])

    if self.cat_cols:
        categorical_part = self.cat_pipeline.inverse_transform(categorical_part)
    else:
        categorical_part = zeros([len(X), 0])

    numeric_frame = DataFrame(numeric_part, columns=self.num_cols)
    categorical_frame = DataFrame(categorical_part, columns=self.cat_cols)
    restored = concat([numeric_frame, categorical_frame], axis=1)

    # Put the columns back in the recorded order.
    restored = restored.loc[:, self._col_order_]

    # Cast each column back to its recorded dtype.
    for name in restored.columns:
        restored[name] = restored[name].astype(self._types[name])

    return restored

transform(X)

Transforms the passed DataFrame with the fit DataProcessor.

Parameters:

Name Type Description Default
X DataFrame

DataFrame to be transformed. Should be aligned with the column types defined at initialization.

required

Returns:

Name Type Description
transformed ndarray

Processed version of the passed DataFrame.

Source code in /opt/hostedtoolcache/Python/3.10.12/x64/lib/python3.10/site-packages/ydata_synthetic/preprocessing/regular/processor.py
def transform(self, X: DataFrame) -> ndarray:
    """Apply the fitted pipelines to a DataFrame.
    Args:
        X (DataFrame):
            DataFrame to be transformed.
            Should be aligned with the column types defined in initialization.
    Returns:
        transformed (ndarray):
            Array with the numerical block first and the categorical block second.
    """
    self._check_is_fitted()

    # An empty column group contributes a zero-width block so the
    # concatenation below works regardless of which groups are present.
    if self.num_cols:
        numeric_block = self.num_pipeline.transform(X[self.num_cols])
    else:
        numeric_block = zeros([len(X), 0])

    if self.cat_cols:
        categorical_block = self.cat_pipeline.transform(X[self.cat_cols])
    else:
        categorical_block = zeros([len(X), 0])

    return concatenate([numeric_block, categorical_block], axis=1)