# cython: embedsignature=True
# coding: utf8
from __future__ import unicode_literals

import ujson

from cython.operator cimport dereference as deref
from cython.operator cimport preincrement as preinc
from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap

from .strings cimport hash_string
cimport cython

from . import util
from .tokens.doc cimport Doc


cdef class Tokenizer:
    """
    Segment text, and create Doc objects with the discovered segment boundaries.
    """
    @classmethod
    def load(cls, path, Vocab vocab, rules=None, prefix_search=None,
             suffix_search=None, infix_finditer=None, token_match=None):
        """
        Load a Tokenizer, reading unsupplied components from the path.

        Arguments:
            path (Path):
                The path to load from.
            vocab (Vocab):
                A storage container for lexical types.
            rules (dict):
                Exceptions and special-cases for the tokenizer.
            token_match:
                A boolean function matching strings that should be kept as
                single tokens.
            prefix_search:
                Signature of re.compile(string).search
            suffix_search:
                Signature of re.compile(string).search
            infix_finditer:
                Signature of re.compile(string).finditer
        Returns Tokenizer
        """
        path = util.ensure_path(path)
        if rules is None:
            with (path / 'tokenizer' / 'specials.json').open('r', encoding='utf8') as file_:
                rules = ujson.load(file_)
        if prefix_search in (None, True):
            with (path / 'tokenizer' / 'prefix.txt').open() as file_:
                entries = file_.read().split('\n')
            prefix_search = util.compile_prefix_regex(entries).search
        if suffix_search in (None, True):
            with (path / 'tokenizer' / 'suffix.txt').open() as file_:
                entries = file_.read().split('\n')
            suffix_search = util.compile_suffix_regex(entries).search
        if infix_finditer in (None, True):
            with (path / 'tokenizer' / 'infix.txt').open() as file_:
                entries = file_.read().split('\n')
            infix_finditer = util.compile_infix_regex(entries).finditer
        return cls(vocab, rules, prefix_search, suffix_search, infix_finditer,
                   token_match)

    def __init__(self, Vocab vocab, rules, prefix_search, suffix_search,
                 infix_finditer, token_match=None):
        """
        Create a Tokenizer, to create Doc objects given unicode text.

        Arguments:
            vocab (Vocab):
                A storage container for lexical types.
            rules (dict):
                Exceptions and special-cases for the tokenizer.
            prefix_search:
                A function matching the signature of re.compile(string).search
                to match prefixes.
            suffix_search:
                A function matching the signature of re.compile(string).search
                to match suffixes.
            infix_finditer:
                A function matching the signature of re.compile(string).finditer
                to find infixes.
            token_match:
                A boolean function matching strings that should be kept as
                single tokens.
        """
        self.mem = Pool()
        self._cache = PreshMap()
        self._specials = PreshMap()
        self.token_match = token_match
        self.prefix_search = prefix_search
        self.suffix_search = suffix_search
        self.infix_finditer = infix_finditer
        self.vocab = vocab
        self._rules = {}
        for chunk, substrings in sorted(rules.items()):
            self.add_special_case(chunk, substrings)

    def __reduce__(self):
        args = (self.vocab, self._rules, self._prefix_re, self._suffix_re,
                self._infix_re, self.token_match)
        return (self.__class__, args, None, None)

    cpdef Doc tokens_from_list(self, list strings):
        return Doc(self.vocab, words=strings)
        #raise NotImplementedError(
        #    "Method deprecated in 1.0.\n"
        #    "Old: tokenizer.tokens_from_list(strings)\n"
        #    "New: Doc(tokenizer.vocab, words=strings)")

    @cython.boundscheck(False)
    def __call__(self, unicode string):
        """
        Tokenize a string.

        Arguments:
            string (unicode): The string to tokenize.
        Returns:
            Doc A container for linguistic annotations.
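        Example (illustrative only; assumes a tokenizer built with the English
        affix rules and exceptions, e.g. `spacy.load('en').tokenizer` -- the
        exact output depends on the rules supplied at construction):
            >>> doc = tokenizer(u'Give it back! He pleaded.')
            >>> [t.text for t in doc]
            [u'Give', u'it', u'back', u'!', u'He', u'pleaded', u'.']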
""" if len(string) >= (2 ** 30): raise ValueError( "String is too long: %d characters. Max is 2**30." % len(string) ) cdef int length = len(string) cdef Doc tokens = Doc(self.vocab) if length == 0: return tokens cdef int i = 0 cdef int start = 0 cdef bint cache_hit cdef bint in_ws = string[0].isspace() cdef unicode span # The task here is much like string.split, but not quite # We find spans of whitespace and non-space characters, and ignore # spans that are exactly ' '. So, our sequences will all be separated # by either ' ' or nothing. for uc in string: if uc.isspace() != in_ws: if start < i: # When we want to make this fast, get the data buffer once # with PyUnicode_AS_DATA, and then maintain a start_byte # and end_byte, so we can call hash64 directly. That way # we don't have to create the slice when we hit the cache. span = string[start:i] key = hash_string(span) cache_hit = self._try_cache(key, tokens) if not cache_hit: self._tokenize(tokens, span, key) if uc == ' ': tokens.c[tokens.length - 1].spacy = True start = i + 1 else: start = i in_ws = not in_ws i += 1 if start < i: span = string[start:] key = hash_string(span) cache_hit = self._try_cache(key, tokens) if not cache_hit: self._tokenize(tokens, span, key) tokens.c[tokens.length - 1].spacy = string[-1] == ' ' and not in_ws return tokens def pipe(self, texts, batch_size=1000, n_threads=2): """ Tokenize a stream of texts. Arguments: texts: A sequence of unicode texts. batch_size (int): The number of texts to accumulate in an internal buffer. n_threads (int): The number of threads to use, if the implementation supports multi-threading. The default tokenizer is single-threaded. Yields: Doc A sequence of Doc objects, in order. """ for text in texts: yield self(text) cdef int _try_cache(self, hash_t key, Doc tokens) except -1: cached = <_Cached*>self._cache.get(key) if cached == NULL: # See 'flush_cache' below for hand-wringing about # how to handle this. 
            cached = <_Cached*>self._specials.get(key)
            if cached == NULL:
                return False
            else:
                self._cache.set(key, cached)
        cdef int i
        if cached.is_lex:
            for i in range(cached.length):
                tokens.push_back(cached.data.lexemes[i], False)
        else:
            for i in range(cached.length):
                tokens.push_back(&cached.data.tokens[i], False)
        return True

    cdef int _tokenize(self, Doc tokens, unicode span, hash_t orig_key) except -1:
        cdef vector[LexemeC*] prefixes
        cdef vector[LexemeC*] suffixes
        cdef int orig_size
        orig_size = tokens.length
        special_case = <_Cached*>self._specials.get(orig_key)
        if special_case is not NULL:
            for i in range(special_case.length):
                tokens.push_back(&special_case.data.tokens[i], False)
            self._cache.set(orig_key, special_case)
        else:
            span = self._split_affixes(tokens.mem, span, &prefixes, &suffixes)
            self._attach_tokens(tokens, span, &prefixes, &suffixes)
            self._save_cached(&tokens.c[orig_size], orig_key,
                              tokens.length - orig_size)

    cdef unicode _split_affixes(self, Pool mem, unicode string,
                                vector[const LexemeC*] *prefixes,
                                vector[const LexemeC*] *suffixes):
        cdef size_t i
        cdef unicode prefix
        cdef unicode suffix
        cdef unicode minus_pre
        cdef unicode minus_suf
        cdef size_t last_size = 0
        while string and len(string) != last_size:
            if self.token_match and self.token_match(string):
                break
            last_size = len(string)
            pre_len = self.find_prefix(string)
            if pre_len != 0:
                prefix = string[:pre_len]
                minus_pre = string[pre_len:]
                # Check whether we've hit a special-case
                if minus_pre and self._specials.get(hash_string(minus_pre)) != NULL:
                    string = minus_pre
                    prefixes.push_back(self.vocab.get(mem, prefix))
                    break
            if self.token_match and self.token_match(string):
                break
            suf_len = self.find_suffix(string)
            if suf_len != 0:
                suffix = string[-suf_len:]
                minus_suf = string[:-suf_len]
                # Check whether we've hit a special-case
                if minus_suf and (self._specials.get(hash_string(minus_suf)) != NULL):
                    string = minus_suf
                    suffixes.push_back(self.vocab.get(mem, suffix))
                    break
            if pre_len and suf_len and (pre_len + suf_len) <= len(string):
                string = string[pre_len:-suf_len]
                prefixes.push_back(self.vocab.get(mem, prefix))
                suffixes.push_back(self.vocab.get(mem, suffix))
            elif pre_len:
                string = minus_pre
                prefixes.push_back(self.vocab.get(mem, prefix))
            elif suf_len:
                string = minus_suf
                suffixes.push_back(self.vocab.get(mem, suffix))
            if string and (self._specials.get(hash_string(string)) != NULL):
                break
        return string

    cdef int _attach_tokens(self, Doc tokens, unicode string,
                            vector[const LexemeC*] *prefixes,
                            vector[const LexemeC*] *suffixes) except -1:
        cdef bint cache_hit
        cdef int split, end
        cdef const LexemeC* const* lexemes
        cdef const LexemeC* lexeme
        cdef unicode span
        cdef int i
        if prefixes.size():
            for i in range(prefixes.size()):
                tokens.push_back(prefixes[0][i], False)
        if string:
            cache_hit = self._try_cache(hash_string(string), tokens)
            if cache_hit:
                pass
            elif self.token_match and self.token_match(string):
                # We're always saying 'no' to spaces here -- the caller will
                # fix up the outermost one, with reference to the original.
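                # A token_match hit is pushed as a single token, with no
                # further affix or infix splitting applied.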
                # See Issue #859
                tokens.push_back(self.vocab.get(tokens.mem, string), False)
            else:
                matches = self.find_infix(string)
                if not matches:
                    tokens.push_back(self.vocab.get(tokens.mem, string), False)
                else:
                    # let's say we have dyn-o-mite-dave
                    # the regex finds the start and end positions of the hyphens
                    start = 0
                    for match in matches:
                        infix_start = match.start()
                        infix_end = match.end()
                        if infix_start == start:
                            continue
                        span = string[start:infix_start]
                        tokens.push_back(self.vocab.get(tokens.mem, span), False)
                        if infix_start != infix_end:
                            # If infix_start != infix_end, it means the infix
                            # token is non-empty. Empty infix tokens are useful
                            # for tokenization in some languages (see
                            # https://github.com/explosion/spaCy/issues/768)
                            infix_span = string[infix_start:infix_end]
                            tokens.push_back(self.vocab.get(tokens.mem, infix_span), False)
                        start = infix_end
                    span = string[start:]
                    tokens.push_back(self.vocab.get(tokens.mem, span), False)
        cdef vector[const LexemeC*].reverse_iterator it = suffixes.rbegin()
        while it != suffixes.rend():
            lexeme = deref(it)
            preinc(it)
            tokens.push_back(lexeme, False)

    cdef int _save_cached(self, const TokenC* tokens, hash_t key, int n) except -1:
        cdef int i
        for i in range(n):
            if tokens[i].lex.id == 0:
                return 0
        cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
        cached.length = n
        cached.is_lex = True
        lexemes = <const LexemeC**>self.mem.alloc(n, sizeof(LexemeC**))
        for i in range(n):
            lexemes[i] = tokens[i].lex
        cached.data.lexemes = <const LexemeC* const*>lexemes
        self._cache.set(key, cached)

    def find_infix(self, unicode string):
        """
        Find internal split points of the string, such as hyphens.

        Arguments:
            string (unicode): The string to segment.
        Returns List[re.MatchObject]
            A list of objects that have .start() and .end() methods, denoting
            the placement of internal segment separators, e.g. hyphens.
        """
        if self.infix_finditer is None:
            return 0
        return list(self.infix_finditer(string))

    def find_prefix(self, unicode string):
        """
        Find the length of a prefix that should be segmented from the string,
        or 0 if no prefix rules match.

        Arguments:
            string (unicode): The string to segment.
        Returns (int): The length of the prefix if present, otherwise 0.
        """
        if self.prefix_search is None:
            return 0
        match = self.prefix_search(string)
        return (match.end() - match.start()) if match is not None else 0

    def find_suffix(self, unicode string):
        """
        Find the length of a suffix that should be segmented from the string,
        or 0 if no suffix rules match.

        Arguments:
            string (unicode): The string to segment.
        Returns (int): The length of the suffix if present, otherwise 0.
        """
        if self.suffix_search is None:
            return 0
        match = self.suffix_search(string)
        return (match.end() - match.start()) if match is not None else 0

    def _load_special_tokenization(self, special_cases):
        """
        Add special-case tokenization rules.
        """
        for chunk, substrings in sorted(special_cases.items()):
            self.add_special_case(chunk, substrings)

    def add_special_case(self, unicode string, substrings):
        """
        Add a special-case tokenization rule.

        Arguments:
            string (unicode): The string to specially tokenize.
            substrings: A sequence of dicts, where each dict describes a token
                and its attributes. The ORTH fields of the attributes must
                exactly match the string when they are concatenated.
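        Example (illustrative only; ORTH and LEMMA are the attribute IDs from
        spacy.attrs, and `tokenizer` is any Tokenizer instance):
            >>> from spacy.attrs import ORTH, LEMMA
            >>> tokenizer.add_special_case(u"don't",
            ...     [{ORTH: u"do"}, {ORTH: u"n't", LEMMA: u"not"}])
            >>> [t.text for t in tokenizer(u"don't")]
            [u'do', u"n't"]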
        Returns None
        """
        substrings = list(substrings)
        cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
        cached.length = len(substrings)
        cached.is_lex = False
        cached.data.tokens = self.vocab.make_fused_token(substrings)
        key = hash_string(string)
        self._specials.set(key, cached)
        self._rules[string] = substrings
        # After changing the tokenization rules, the previous tokenization
        # may be stale.
        self.flush_cache()

    def flush_cache(self):
        '''Flush the tokenizer's cache. May not free memory immediately.

        This is called automatically after `add_special_case`, but if you
        reassign the prefix or suffix functions, you'll have to call this
        yourself. You may also need to flush the tokenizer cache after
        changing the lex_attr_getter functions.
        '''
        cdef hash_t key
        for key in self._cache.keys():
            special_case = <_Cached*>self._specials.get(key)
            # Don't free data shared with special-case rules
            if special_case is not NULL:
                continue
            cached = <_Cached*>self._cache.get(key)
            if cached is not NULL:
                self.mem.free(cached)
        self._cache = PreshMap(1000)
        # We could re-add the data from specials here --- but if we loop over
        # a bunch of special-cases, we'll get quadratic behaviour. The extra
        # lookup isn't so bad? Tough to tell.
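
# Illustrative construction sketch, kept as a comment so importing this module
# has no side effects. The affix arguments are plain callables, so a Tokenizer
# can be built directly from compiled regexes; the entry lists below are
# hypothetical stand-ins for a language's real prefix/suffix/infix data.
#
#     from spacy.vocab import Vocab
#     from spacy import util
#
#     prefix_re = util.compile_prefix_regex([u'\\(', u'"'])
#     suffix_re = util.compile_suffix_regex([u'\\)', u'"', u'\\.'])
#     infix_re = util.compile_infix_regex([u'--', u'\\.\\.\\.'])
#     tokenizer = Tokenizer(Vocab(), rules={},
#                           prefix_search=prefix_re.search,
#                           suffix_search=suffix_re.search,
#                           infix_finditer=infix_re.finditer)
#     doc = tokenizer(u'"Hello" -- world.')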