""" Data cleaning functions
"""
from typing import Optional, Any
import difflib
import numpy as np
import pandas as pd
import textdistance
def _is_missing_name(name: Any) -> bool:
    """Return True when ``name`` is None or a float NaN.

    NaN is detected by value (``name != name``) rather than by identity with
    ``np.nan``, so NaN objects produced elsewhere are recognised as well.
    """
    return name is None or (isinstance(name, float) and name != name)


def associate_entities(
    game_names: Optional[list[Optional[str]]] = None,
    entity_names: Optional[list[str]] = None,
    metric="lcss",
) -> dict:
    """Match each game name to its closest entity name.

    Args:
        game_names (list, optional): A list of names generated by the demofile.
            ``None`` and NaN entries are skipped. Defaults to []
        entity_names (list, optional): A list of candidate entity names.
            The list passed in is not modified. Defaults to []
        metric (string, optional): A string indicating distance metric,
            one of lcss, hamming, levenshtein, jaro, difflib (case-insensitive).
            Defaults to 'lcss'

    Returns:
        A dictionary where the keys are entries in game_names, values are the
        matched entity names. A ``None -> None`` entry is always present.
        With a distance metric every entity name is used at most once, an
        empty-string game name maps to ``None``, and game names left over once
        the entity names run out get no entry. With ``difflib`` entity names
        may be reused and a game name without any match maps to ``None``.

    Raises:
        ValueError: If ``metric`` is not one of the supported names.
    """
    if game_names is None:
        game_names = []
    # Work on a private copy: the distance branch consumes candidates as they
    # are matched, and that must not empty the caller's list.
    candidates = [] if entity_names is None else list(entity_names)
    metric_key = metric.lower()

    entities: dict[Optional[str], Any] = {}

    if metric_key == "difflib":
        for gn in game_names:
            if _is_missing_name(gn):
                continue
            closest = difflib.get_close_matches(gn, candidates, n=1, cutoff=0.0)
            entities[gn] = closest[0] if closest else None
        entities[None] = None
        return entities

    # Metric name -> attribute of the textdistance module providing .distance
    algorithm_names = {
        "lcss": "lcsseq",
        "hamming": "hamming",
        "levenshtein": "levenshtein",
        "jaro": "jaro",
    }
    if metric_key not in algorithm_names:
        raise ValueError(
            "Metric can only be lcss, hamming, levenshtein, jaro or difflib"
        )
    dist_metric = getattr(textdistance, algorithm_names[metric_key]).distance

    for gn in game_names:
        if _is_missing_name(gn):
            continue
        if gn == "":
            entities[gn] = None
            continue
        if not candidates:
            # Every entity name is already assigned; leave this name unmapped.
            continue
        gn_lower = gn.lower()
        distances = [dist_metric(gn_lower, cand.lower()) for cand in candidates]
        # Remove the winner so each entity name is matched at most once.
        best = int(np.argmin(distances))
        entities[gn] = candidates.pop(best)
    entities[None] = None
    return entities
def replace_entities(
    df: pd.DataFrame, col_name: str, entity_dict: dict
) -> pd.DataFrame:
    """Replace values in a Pandas df column given an entity dict, as created in associate_entities()

    Args:
        df (DataFrame) : A Pandas DataFrame. It is updated in place.
        col_name (string) : A column in the Pandas DataFrame
        entity_dict (dict) : A dictionary as created in the associate_entities() function

    Returns:
        The same dataframe object, with the values of ``col_name`` replaced.

    Raises:
        ValueError: If ``col_name`` is not a column of ``df``.
    """
    if col_name not in df.columns:
        raise ValueError("Column does not exist!")
    # Assign the result back instead of calling replace(inplace=True) on the
    # selected column: the chained in-place form acts on a temporary object
    # under pandas Copy-on-Write and would leave ``df`` unchanged.
    df[col_name] = df[col_name].replace(entity_dict)
    return df