Skip to content

dataprocess module

ExtendedDataFrame

This ExtendedDataFrame class inherits the pandas DataFrame class

Parameters:

Name Type Description Default
pd pd.DataFrame

A pandas DataFrame

required

deduplicate(self, columns=None)

Drops duplicate records and resets the index of the ExtendedDataFrame

Parameters:

Name Type Description Default
columns list

Columns used to identify duplicate records. Defaults to None.

None

Returns:

Type Description
ExtendedDataFrame

A de-duplicated ExtendedDataFrame with the index reset.

Source code in hagerstrand/dataprocess.py
def deduplicate(self, columns=None):
    """Drop duplicate records and reset the index of the ExtendedDataFrame.

    Args:
        columns (list, optional): Columns used to identify duplicate records.
            When None, every column is compared. Defaults to None.

    Returns:
        ExtendedDataFrame: A de-duplicated ExtendedDataFrame with the index reset.
    """
    # ``subset=None`` is already pandas' "compare all columns" default, so no
    # branching is needed. Avoiding ``columns == None`` also matters: for
    # array-like ``columns`` (pd.Index, np.ndarray) that comparison is
    # evaluated elementwise and its truth value is ambiguous.
    return self.drop_duplicates(subset=columns).reset_index(drop=True)

jsoncol_merge(self)

Unpack a JSON column and merge to the existing DataFrame

Returns:

Type Description
ExtendedDataFrame

A new ExtendedDataFrame that includes the original DataFrame and the unpacked JSON column.

Source code in hagerstrand/dataprocess.py
def jsoncol_merge(self):
    """Expand the JSON column and join the result back onto this frame.

    Delegates to ``unpack_json_and_merge`` using that function's default
    column names.

    Returns:
        ExtendedDataFrame: The original columns together with the unpacked
            JSON key/value columns.
    """
    return unpack_json_and_merge(self)

jsoncol_newdf(self)

Unpack a JSON column and return a new ExtendedDataFrame

Returns:

Type Description
ExtendedDataFrame

A new ExtendedDataFrame of an unpacked JSON column.

Source code in hagerstrand/dataprocess.py
def jsoncol_newdf(self):
    """Expand the JSON column into a separate frame.

    Delegates to ``unpack_json`` using that function's default column names.

    Returns:
        ExtendedDataFrame: The unpacked JSON column, one row per key/value pair.
    """
    return unpack_json(self)

load_json_nan(df, json_col)

Load a JSON column even if it contains NaNs.

Parameters:

Name Type Description Default
df pd.DataFrame

The DataFrame containing the JSON column to be loaded.

required
json_col str

The JSON column to be loaded.

required

Returns:

Type Description
df (pd.Series)

A pd.Series of a JSON column

Source code in hagerstrand/dataprocess.py
def load_json_nan(df, json_col):
    """Parse a column of JSON strings, tolerating missing values.

    Non-empty strings are decoded with ``json.loads``. Everything else
    (NaN/None, empty strings, values that are not strings) is passed through
    unchanged.

    Args:
        df (pd.DataFrame): The DataFrame containing the JSON column to be loaded.
        json_col (str): The JSON column to be loaded.

    Returns:
        pd.Series: The column with every non-empty string replaced by its
            decoded JSON value.

    Raises:
        json.JSONDecodeError: If a non-empty string is not valid JSON.
    """

    def _decode(cell):
        # An empty string is not valid JSON; treat it like a missing value
        # instead of letting json.loads raise on it.
        if isinstance(cell, str) and cell != "":
            return json.loads(cell)
        return cell

    return df[json_col].apply(_decode)

unique_sorted_columns_plus_ALL(gj)

Obtain a sorted array of all columns in a GeoJSON, including an additional value of 'ALL' to denote all values.

Parameters:

Name Type Description Default
gj GeoJSON

A GeoJSON.

required

Returns:

Type Description
list

A sorted array of unique column names including an additional value of 'ALL'.

Source code in hagerstrand/dataprocess.py
def unique_sorted_columns_plus_ALL(gj):
    """Obtain a sorted list of all columns in a GeoJSON, with 'ALL' prepended.

    Args:
        gj (dict | list): Parsed GeoJSON, i.e. a FeatureCollection mapping or
            a list of feature mappings, as accepted by
            ``GeoDataFrame.from_features``.

    Returns:
        list: Sorted unique column names of the resulting GeoDataFrame, with
            the extra value 'ALL' as the first element.

    Raises:
        TypeError: If ``gj`` is neither a dict nor a list.
    """
    # ``json`` is a module, not a type, so it cannot be used with isinstance.
    # Parsed GeoJSON is represented by plain dicts/lists.
    if not isinstance(gj, (dict, list)):
        raise TypeError("The provided argument for gj must be of type json.")

    frame = gpd.GeoDataFrame.from_features(gj)
    # A GeoDataFrame has no ``unique``; the column names live on its
    # ``columns`` index.
    names = sorted(frame.columns.unique().tolist())
    names.insert(0, "ALL")
    return names

unique_sorted_values_plus_ALL(array)

Obtain a sorted array of all unique values in an array, including an additional value of 'ALL' to denote all values.

Parameters:

Name Type Description Default
array list|pd.Series|np.array

An array of values.

required

Returns:

Type Description
list

A sorted array of unique values including an additional value of 'ALL'.

Source code in hagerstrand/dataprocess.py
def unique_sorted_values_plus_ALL(array):
    """Obtain a sorted list of all unique values in an array, with 'ALL' prepended.

    Args:
        array (list|pd.Series|np.array): An array of values.

    Returns:
        list: Sorted unique values, with the extra value 'ALL' as the first
            element.
    """
    # Plain lists and numpy arrays have no ``unique`` method, so wrap them in
    # a Series. Objects that already provide one (e.g. pd.Series) are used
    # directly.
    if hasattr(array, "unique"):
        distinct = array.unique()
    else:
        distinct = pd.Series(array).unique()

    values = sorted(distinct.tolist())
    values.insert(0, "ALL")
    return values

unpack_json(df, json_column='visitor_home_cbgs', index_name=None, key_col_name=None, value_col_name=None)

Unpack a JSON column from a SafeGraph Patterns dataset.

Parameters:

Name Type Description Default
df pd.DataFrame

DataFrame containing the JSON column to be unpacked.

required
json_column str

JSON column to be unpacked. Defaults to 'visitor_home_cbgs'.

'visitor_home_cbgs'
index_name str

Index name for new ExtendedDataFrame. Defaults to None.

None
key_col_name str

Key name for new ExtendedDataFrame. Defaults to None.

None
value_col_name str

Value name for new ExtendedDataFrame. Defaults to None.

None

Returns:

Type Description
pd.DataFrame

DataFrame of an unpacked JSON column.

Source code in hagerstrand/dataprocess.py
def unpack_json(
    df,
    json_column="visitor_home_cbgs",
    index_name=None,
    key_col_name=None,
    value_col_name=None,
):
    """Unpack a JSON column from a SafeGraph Patterns dataset.

    Each key/value pair found in a row's JSON object becomes one row of the
    result, indexed by the index label of the row it came from. Rows whose
    JSON cell is null or an empty string contribute nothing. The input frame
    is not modified.

    Args:
        df (pd.DataFrame): DataFrame containing the JSON column to be unpacked.
            Its index must be unique.
        json_column (str, optional): JSON column to be unpacked. Defaults to 'visitor_home_cbgs'.
        index_name (str, optional): Name of a column of ``df`` whose value is
            copied onto every unpacked row. Defaults to None (no extra column).
        key_col_name (str, optional): Name of the column holding the JSON keys.
            Defaults to None, meaning ``json_column + "_key"``.
        value_col_name (str, optional): Name of the column holding the JSON
            values. Defaults to None, meaning ``json_column + "_value"``.

    Returns:
        pd.DataFrame: DataFrame of the unpacked JSON column, indexed by
            'orig_index'.

    Raises:
        ValueError: If the index of ``df`` contains duplicate labels.
    """
    if key_col_name is None:
        key_col_name = json_column + "_key"
    if value_col_name is None:
        value_col_name = json_column + "_value"
    if not df.index.is_unique:
        # The result is keyed by the original index labels, so duplicates
        # would make the unpacked rows ambiguous.
        raise ValueError("ERROR -- non-unique index found")

    # Drop rows without JSON before parsing, so empty strings never reach the
    # JSON decoder.
    cells = df[json_column]
    has_payload = ~(cells.isnull() | (cells == ""))
    payload_rows = df.loc[has_payload]
    parsed = load_json_nan(payload_rows, json_column)

    records = []  # one dict per unpacked key/value pair
    if index_name is None:
        columns = ["orig_index", key_col_name, value_col_name]
        for label, mapping in zip(parsed.index, parsed):
            records.extend(
                {"orig_index": label, key_col_name: key, value_col_name: value}
                for key, value in mapping.items()
            )
    else:
        columns = ["orig_index", index_name, key_col_name, value_col_name]
        for label, mapping, carried in zip(
            parsed.index, parsed, payload_rows[index_name]
        ):
            records.extend(
                {
                    "orig_index": label,
                    index_name: carried,
                    key_col_name: key,
                    value_col_name: value,
                }
                for key, value in mapping.items()
            )

    if records:
        unpacked = pd.DataFrame(records)
    else:
        # Without explicit columns an empty frame would have no 'orig_index'
        # column to promote to the index.
        unpacked = pd.DataFrame(columns=columns)
    return unpacked.set_index("orig_index")

unpack_json_and_merge(df, json_column='visitor_home_cbgs', key_col_name='visitor_home_cbg', value_col_name='cbg_visitor_name', keep_index=False)

Unpack a JSON column from a SafeGraph Patterns dataset.

Parameters:

Name Type Description Default
df pd.DataFrame

DataFrame containing the JSON column to be unpacked

required
json_column str

JSON column to be unpacked. Defaults to 'visitor_home_cbgs'.

'visitor_home_cbgs'
key_col_name str

Key name for new ExtendedDataFrame. Defaults to 'visitor_home_cbg'.

'visitor_home_cbg'
value_col_name str

Value name for new ExtendedDataFrame. Defaults to 'cbg_visitor_name'.

'cbg_visitor_name'
keep_index bool

Keep or do not keep the original index. Defaults to False.

False

Returns:

Type Description
pd.DataFrame

DataFrame containing the original DataFrame and the unpacked JSON column as additional columns.

Source code in hagerstrand/dataprocess.py
def unpack_json_and_merge(
    df,
    json_column="visitor_home_cbgs",
    key_col_name="visitor_home_cbg",
    value_col_name="cbg_visitor_name",
    keep_index=False,
):
    """Unpack a JSON column and merge the result onto the original rows.

    The input frame is left untouched; all work is done on new frames.

    Args:
        df (pd.DataFrame): DataFrame containing the JSON column to be unpacked
        json_column (str, optional): JSON column to be unpacked. Defaults to 'visitor_home_cbgs'.
        key_col_name (str, optional): Key name for new ExtendedDataFrame. Defaults to 'visitor_home_cbg'.
        value_col_name (str, optional): Value name for new ExtendedDataFrame. Defaults to 'cbg_visitor_name'.
        keep_index (bool, optional): When True, the original index is kept as
            an 'index_original' column. Defaults to False.

    Returns:
        pd.DataFrame: DataFrame containing the original DataFrame and the unpacked JSON column as additional columns.
    """
    # Use non-mutating operations so the caller's frame keeps its columns and
    # index.
    if keep_index:
        df = df.assign(index_original=df.index)
    # unpack_json requires every row to have a unique index label.
    df = df.reset_index(drop=True)
    unpacked = unpack_json(
        df,
        json_column=json_column,
        key_col_name=key_col_name,
        value_col_name=value_col_name,
    )
    merged = df.merge(unpacked, left_index=True, right_index=True)
    return merged.reset_index(drop=True)

Last update: 2021-05-03