Source code for nfp.preprocessing.tokenizer

[docs]class Tokenizer(object): """A class to turn arbitrary inputs into integer classes.""" def __init__(self): # the default class for an unseen entry during test-time self._data = {"unk": 1} self.num_classes = 1 self.train = True self.unknown = []
[docs] def __call__(self, item): """Check to see if the Tokenizer has seen `item` before, and if so, return the integer class associated with it. Otherwise, if we're training, create a new integer class, otherwise return the 'unknown' class. """ try: return self._data[item] except KeyError: if self.train: self._add_token(item) return self(item) else: # Record the unknown item, then return the unknown label self.unknown += [item] return self._data["unk"]
def _add_token(self, item): self.num_classes += 1 self._data[item] = self.num_classes