Source code for awpy.parser.cleaning

""" Data cleaning functions
"""

from typing import Optional, Any
import difflib
import numpy as np
import pandas as pd
import textdistance


def _is_missing_name(name: Any) -> bool:
    """Return True when ``name`` is None or a floating-point NaN.

    The NaN test is value-based (NaN is the only float that differs from
    itself), so it also catches NaN objects other than the ``np.nan``
    singleton, e.g. ``float("nan")``.
    """
    if name is None:
        return True
    return isinstance(name, (float, np.floating)) and bool(name != name)


def associate_entities(
    game_names: Optional[list[Optional[str]]] = None,
    entity_names: Optional[list[str]] = None,
    metric: str = "lcss",
) -> dict:
    """Associate each game name with its closest entity name.

    Args:
        game_names (list, optional): A list of names generated by the
            demofile. None and NaN entries are skipped. Defaults to []
        entity_names (list, optional): A list of candidate names to match
            against. The list is never modified. Defaults to []
        metric (string, optional): A string indicating distance metric, one
            of lcss, hamming, levenshtein, jaro, difflib (case-insensitive).
            Defaults to 'lcss'

    Returns:
        A dictionary where the keys are entries in game_names, values are
        the matched entity names. The key None always maps to None.

        With the difflib metric every game name is matched independently
        (an entity name may be reused) and a name without any candidate
        maps to None. With the other metrics matching is greedy: once an
        entity name is matched it is no longer available to later game
        names, an empty game name maps to None, and game names left over
        after all entity names are used up get no entry.

    Raises:
        ValueError: If metric is not one of the supported names.
    """
    if game_names is None:
        game_names = []
    if entity_names is None:
        entity_names = []

    metric_name = metric.lower()
    entities: dict[Optional[str], Any] = {}

    if metric_name == "difflib":
        for gn in game_names:
            if _is_missing_name(gn):
                continue
            # cutoff=0.0 accepts any candidate, so this only comes back
            # empty when there are no entity names at all.
            closest = difflib.get_close_matches(
                gn, entity_names, n=1, cutoff=0.0
            )
            entities[gn] = closest[0] if closest else None
        entities[None] = None
        return entities

    if metric_name == "lcss":
        dist_metric = textdistance.lcsseq.distance
    elif metric_name == "hamming":
        dist_metric = textdistance.hamming.distance
    elif metric_name == "levenshtein":
        dist_metric = textdistance.levenshtein.distance
    elif metric_name == "jaro":
        dist_metric = textdistance.jaro.distance
    else:
        raise ValueError(
            "Metric can only be lcss, hamming, levenshtein, jaro or difflib"
        )

    # Matched candidates are removed so they cannot be assigned twice; do
    # that on a copy so the caller's list is left untouched.
    remaining = list(entity_names)
    for gn in game_names:
        if _is_missing_name(gn):
            continue
        if gn == "":
            entities[gn] = None
            continue
        if not remaining:
            # No candidates left: this game name gets no entry.
            continue
        gn_lower = gn.lower()
        name_distances = [
            dist_metric(gn_lower, candidate.lower()) for candidate in remaining
        ]
        # argmin returns the first minimum, so ties go to the earliest
        # remaining candidate.
        best_idx = int(np.argmin(name_distances))
        entities[gn] = remaining.pop(best_idx)
    entities[None] = None
    return entities
def replace_entities(
    df: pd.DataFrame, col_name: str, entity_dict: dict
) -> pd.DataFrame:
    """Replace the values of a DataFrame column using an entity dict.

    Args:
        df (DataFrame) : A Pandas DataFrame. Its col_name column is
            overwritten with the replaced values.
        col_name (string) : A column in the Pandas DataFrame
        entity_dict (dict) : A dictionary mapping existing values to their
            replacements, as created in the associate_entities() function

    Returns:
        The same DataFrame object that was passed in, with the values of
        col_name replaced.

    Raises:
        ValueError: If col_name is not a column of df.
    """
    if col_name not in df.columns:
        raise ValueError("Column does not exist!")
    # Assign the result back instead of calling replace(inplace=True) on
    # df[col_name]: with pandas Copy-on-Write the chained in-place call
    # would update a temporary object and leave df unchanged.
    df[col_name] = df[col_name].replace(entity_dict)
    return df