# cython: profile=True
# cython: embedsignature=True
"""Common classes and utilities across languages.

Provides the main implementation for the spacy tokenizer. Specific languages
subclass the Language class, over-writing the tokenization rules as necessary.
Special-case tokenization rules are read from data/<lang>/tokenization.
"""
from __future__ import unicode_literals

import json
import random
from os import path
import re

from .util import read_lang_data
from spacy.tokens import Tokens
from spacy.lexeme cimport LexemeC, lexeme_init, lexeme_pack, lexeme_unpack

from libc.stdint cimport uint64_t
from libcpp.vector cimport vector

from murmurhash.mrmr cimport hash64
from cpython.ref cimport Py_INCREF
from cpython.unicode cimport Py_UNICODE_ISSPACE
from cymem.cymem cimport Pool

from cython.operator cimport preincrement as preinc
from cython.operator cimport dereference as deref

from preshed.maps cimport PreshMap

from spacy import orth
from spacy import util


cdef enum Flags:
    Flag_IsAlpha
    Flag_IsAscii
    Flag_IsDigit
    Flag_IsLower
    Flag_IsPunct
    Flag_IsSpace
    Flag_IsTitle
    Flag_IsUpper

    Flag_CanAdj
    Flag_CanAdp
    Flag_CanAdv
    Flag_CanConj
    Flag_CanDet
    Flag_CanNoun
    Flag_CanNum
    Flag_CanPdt
    Flag_CanPos
    Flag_CanPron
    Flag_CanPrt
    Flag_CanPunct
    Flag_CanVerb

    Flag_OftLower
    Flag_OftTitle
    Flag_OftUpper
    Flag_N


cdef enum Views:
    View_CanonForm
    View_WordShape
    View_NonSparse
    View_Asciied
    View_N


# Assign the flag and view functions by enum value.
# This is verbose, but it ensures we don't get nasty order sensitivities.
STRING_VIEW_FUNCS = [None] * View_N
STRING_VIEW_FUNCS[View_CanonForm] = orth.canon_case
STRING_VIEW_FUNCS[View_WordShape] = orth.word_shape
STRING_VIEW_FUNCS[View_NonSparse] = orth.non_sparse
STRING_VIEW_FUNCS[View_Asciied] = orth.asciied

FLAG_FUNCS = [None] * Flag_N

FLAG_FUNCS[Flag_IsAlpha] = orth.is_alpha
FLAG_FUNCS[Flag_IsAscii] = orth.is_ascii
FLAG_FUNCS[Flag_IsDigit] = orth.is_digit
FLAG_FUNCS[Flag_IsLower] = orth.is_lower
FLAG_FUNCS[Flag_IsPunct] = orth.is_punct
FLAG_FUNCS[Flag_IsSpace] = orth.is_space
FLAG_FUNCS[Flag_IsTitle] = orth.is_title
FLAG_FUNCS[Flag_IsUpper] = orth.is_upper

FLAG_FUNCS[Flag_CanAdj] = orth.can_tag('ADJ')
FLAG_FUNCS[Flag_CanAdp] = orth.can_tag('ADP')
FLAG_FUNCS[Flag_CanAdv] = orth.can_tag('ADV')
FLAG_FUNCS[Flag_CanConj] = orth.can_tag('CONJ')
FLAG_FUNCS[Flag_CanDet] = orth.can_tag('DET')
FLAG_FUNCS[Flag_CanNoun] = orth.can_tag('NOUN')
FLAG_FUNCS[Flag_CanNum] = orth.can_tag('NUM')
FLAG_FUNCS[Flag_CanPdt] = orth.can_tag('PDT')
FLAG_FUNCS[Flag_CanPos] = orth.can_tag('POS')
FLAG_FUNCS[Flag_CanPron] = orth.can_tag('PRON')
FLAG_FUNCS[Flag_CanPrt] = orth.can_tag('PRT')
FLAG_FUNCS[Flag_CanPunct] = orth.can_tag('PUNCT')
FLAG_FUNCS[Flag_CanVerb] = orth.can_tag('VERB')

FLAG_FUNCS[Flag_OftLower] = orth.oft_case('lower', 0.7)
FLAG_FUNCS[Flag_OftTitle] = orth.oft_case('title', 0.7)
FLAG_FUNCS[Flag_OftUpper] = orth.oft_case('upper', 0.7)
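
# Illustrative sketch only: the orth.* entries registered above are used as
# plain callables by Lexicon.get below (flag functions are called as
# f(string, 0.0, {}, {}), view functions as f(string, 0.0, 0, {}, {})).  The
# factory below is not part of spacy.orth; the parameter names and the
# thresholding logic are assumptions, documenting the shape of callable that
# orth.can_tag is expected to return.
def _example_can_tag_factory(tag, threshold=0.5):
    def can_tag(string, prob, cases, tags):
        # True if the (assumed) tag distribution gives `tag` enough mass.
        return tags.get(tag, 0.0) >= threshold
    return can_tag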

cdef class Language:
    """Base class for language-specific tokenizers.

    Most subclasses will override the _split or _split_one methods, which take
    a string of non-whitespace characters and output a list of strings. These
    are called by _tokenize, which sits behind a cache and turns the list of
    strings into Lexeme objects via the Lexicon. Most languages will not need
    to override _tokenize or tokenize.

    The language is supplied a list of boolean functions, used to compute flag
    features. These are passed to the language's Lexicon object.

    The language's name is used to look up default data-files, found in
    data/<lang>:

    * The data/<lang>/tokenization table, which handles special cases like
      contractions;
    * The appropriate :py:meth:`find_split` function, which is used to split
      off punctuation etc.
    """
    cpdef Tokens tokenize(self, unicode string):
        """Tokenize a string.

        Args:
            string (unicode): The string to be tokenized.

        Returns:
            tokens (Tokens): A Tokens object, giving access to a sequence of
                LexIDs.
        """
        cdef size_t length = len(string)
        cdef Tokens tokens = Tokens(length)
        if length == 0:
            return tokens

        cdef size_t start = 0
        cdef size_t i = 0
        cdef Py_UNICODE* chars = string
        cdef Py_UNICODE c
        cdef String span
        for i in range(length):
            c = chars[i]
            if Py_UNICODE_ISSPACE(c) == 1:
                if start < i:
                    string_from_slice(&span, chars, start, i)
                    try:
                        self._tokenize(tokens.v, &span)
                    except MemoryError:
                        print chars[start:i]
                        raise
                start = i + 1
        i += 1
        if start < i:
            string_from_slice(&span, chars, start, i)
            self._tokenize(tokens.v, &span)
        return tokens

    cdef int _tokenize(self, vector[LexemeC*] *tokens_v, String* string) except -1:
        cdef size_t i
        # The cache stores NULL-terminated arrays of LexemeC pointers, keyed by
        # the hash of the whitespace-delimited chunk.
        cdef LexemeC** lexemes = <LexemeC**>self.cache.get(string.key)
        if lexemes != NULL:
            i = 0
            while lexemes[i] != NULL:
                tokens_v.push_back(lexemes[i])
                i += 1
            return 0

        cdef uint64_t orig_key = string.key
        cdef size_t orig_size = tokens_v.size()

        cdef vector[LexemeC*] prefixes
        cdef vector[LexemeC*] suffixes

        cdef String prefix
        cdef String suffix
        cdef String minus_pre
        cdef String minus_suf
        cdef size_t last_size = 0
        while string.n != 0 and string.n != last_size:
            last_size = string.n
            pre_len = self._find_prefix(string.chars, string.n)
            if pre_len != 0:
                string_from_slice(&prefix, string.chars, 0, pre_len)
                string_from_slice(&minus_pre, string.chars, pre_len, string.n)
                # Check whether we've hit a special-case
                if minus_pre.n >= 1 and self.specials.get(minus_pre.key) != NULL:
                    string = &minus_pre
                    prefixes.push_back(self.lexicon.get(&prefix))
                    break
            suf_len = self._find_suffix(string.chars, string.n)
            if suf_len != 0:
                string_from_slice(&suffix, string.chars, string.n - suf_len, string.n)
                string_from_slice(&minus_suf, string.chars, 0, string.n - suf_len)
                # Check whether we've hit a special-case
                if minus_suf.n >= 1 and self.specials.get(minus_suf.key) != NULL:
                    string = &minus_suf
                    suffixes.push_back(self.lexicon.get(&suffix))
                    break
            if pre_len and suf_len and (pre_len + suf_len) <= string.n:
                string_from_slice(string, string.chars, pre_len, string.n - suf_len)
                prefixes.push_back(self.lexicon.get(&prefix))
                suffixes.push_back(self.lexicon.get(&suffix))
            elif pre_len:
                string = &minus_pre
                prefixes.push_back(self.lexicon.get(&prefix))
            elif suf_len:
                string = &minus_suf
                suffixes.push_back(self.lexicon.get(&suffix))
            if self.specials.get(string.key):
                break
        self._attach_tokens(tokens_v, string, &prefixes, &suffixes)
        self._save_cached(tokens_v, orig_key, orig_size)

    cdef int _check_cache(self, vector[LexemeC*] *tokens, String* string) except -1:
        cdef LexemeC** lexemes = <LexemeC**>self.cache.get(string.key)
        cdef size_t i = 0
        if lexemes != NULL:
            while lexemes[i] != NULL:
                tokens.push_back(lexemes[i])
                i += 1
            string.n = 0
            string.key = 0
            string.chars = NULL

    cdef int _attach_tokens(self, vector[LexemeC*] *tokens, String* string,
                            vector[LexemeC*] *prefixes,
                            vector[LexemeC*] *suffixes) except -1:
        cdef size_t i
        cdef LexemeC** lexemes
        cdef LexemeC* lexeme
        for lexeme in deref(prefixes):
            tokens.push_back(lexeme)
        if string.n != 0:
            lexemes = <LexemeC**>self.specials.get(string.key)
            if lexemes != NULL:
                i = 0
                while lexemes[i] != NULL:
                    tokens.push_back(lexemes[i])
                    i += 1
            else:
                tokens.push_back(self.lexicon.get(string))
        # Suffixes were collected outside-in, so attach them in reverse.
        cdef vector[LexemeC*].reverse_iterator it = suffixes.rbegin()
        while it != suffixes.rend():
            tokens.push_back(deref(it))
            preinc(it)

    cdef int _save_cached(self, vector[LexemeC*] *tokens, uint64_t key, size_t n) except -1:
        assert tokens.size() > n
        # Copy the newly produced tokens into a NULL-terminated array and cache
        # them under the original chunk's key.
        cdef LexemeC** lexemes = <LexemeC**>self._mem.alloc(
            (tokens.size() - n) + 1, sizeof(LexemeC**))
        cdef size_t i, j
        for i, j in enumerate(range(n, tokens.size())):
            lexemes[i] = tokens.at(j)
        lexemes[i + 1] = NULL
        self.cache.set(key, lexemes)

    cdef int _find_prefix(self, Py_UNICODE* chars, size_t length) except -1:
        cdef unicode string = chars[:length]
        match = self.prefix_re.search(string)
        if match is None:
            return 0
        else:
            return match.end() - match.start()

    cdef int _find_suffix(self, Py_UNICODE* chars, size_t length):
        cdef unicode string = chars[:length]
        match = self.suffix_re.search(string)
        if match is None:
            return 0
        else:
            return match.end() - match.start()

    def _load_special_tokenization(self, token_rules):
        '''Load special-case tokenization rules.

        Loads special-case tokenization rules into the Language.cache cache,
        read from data/<lang>/tokenization. The special cases are loaded before
        any language data is tokenized, giving these priority. For instance,
        the English tokenization rules map "ain't" to ["are", "not"].

        Args:
            token_rules (list): A list of (chunk, tokens) pairs, where chunk
                is a string and tokens is a list of strings.
        '''
        cdef LexemeC** lexemes
        cdef uint64_t hashed
        cdef String string
        for uni_string, substrings in token_rules:
            lexemes = <LexemeC**>self._mem.alloc(len(substrings) + 1, sizeof(LexemeC*))
            for i, substring in enumerate(substrings):
                string_from_unicode(&string, substring)
                lexemes[i] = self.lexicon.get(&string)
            lexemes[i + 1] = NULL
            string_from_unicode(&string, uni_string)
            self.specials.set(string.key, lexemes)
            self.cache.set(string.key, lexemes)
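
# Illustrative sketch of the (chunk, tokens) pairs that
# Language._load_special_tokenization above accepts.  The "ain't" mapping is
# the example given in its docstring; the second pair and the variable name
# are assumptions for illustration only -- the real rules are read from
# data/<lang>/tokenization.
_EXAMPLE_TOKEN_RULES = [
    (u"ain't", [u"are", u"not"]),
    (u"don't", [u"do", u"not"]),
]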

cdef class Lexicon:
    def __cinit__(self, lexemes, string_features, flag_features):
        self._mem = Pool()
        self._flag_features = flag_features
        self._string_features = string_features
        self._dict = PreshMap(2 ** 20)
        self.size = 0
        cdef String string
        cdef dict lexeme_dict
        cdef LexemeC* lexeme
        for lexeme_dict in lexemes:
            string_from_unicode(&string, lexeme_dict['string'])
            lexeme = <LexemeC*>self._mem.alloc(1, sizeof(LexemeC))
            lexeme.views = <char**>self._mem.alloc(len(string_features), sizeof(char*))
            lexeme_unpack(lexeme, lexeme_dict)
            self._dict.set(string.key, lexeme)
            self.size += 1

    cdef LexemeC* get(self, String* string) except NULL:
        cdef LexemeC* lexeme
        lexeme = <LexemeC*>self._dict.get(string.key)
        if lexeme != NULL:
            return lexeme

        cdef unicode uni_string = string.chars[:string.n]
        views = [string_view(uni_string, 0.0, 0, {}, {})
                 for string_view in self._string_features]
        flags = set()
        for i, flag_feature in enumerate(self._flag_features):
            if flag_feature(uni_string, 0.0, {}, {}):
                flags.add(i)

        lexeme = lexeme_init(self._mem, self.size, uni_string, 0, 0, views, flags)
        self._dict.set(string.key, lexeme)
        self.size += 1
        return lexeme

    cpdef Lexeme lookup(self, unicode uni_string):
        """Retrieve (or create, if not found) a Lexeme for a string, and
        return it.

        Args:
            string (unicode): The string to be looked up. Must be unicode,
                not bytes.

        Returns:
            lexeme (Lexeme): A reference to a lexical type.
        """
        cdef String string
        string_from_unicode(&string, uni_string)
        cdef LexemeC* lexeme = self.get(&string)
        return Lexeme(lexeme)
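
# Illustrative usage sketch only: how a Lexicon is wired up with the feature
# tables defined at the top of this module and then queried.  The function
# name and the `lexeme_dicts` argument are assumptions for illustration;
# entries are presumably loaded from the language's data files.
def _example_build_lexicon(lexeme_dicts):
    lexicon = Lexicon(lexeme_dicts, STRING_VIEW_FUNCS, FLAG_FUNCS)
    # lookup() returns a Lexeme wrapper, creating the entry if it is unseen.
    return lexicon.lookup(u'apple')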
""" cdef String string string_from_unicode(&string, uni_string) cdef LexemeC* lexeme = self.get(&string) return Lexeme(lexeme) cdef void string_from_unicode(String* s, unicode uni): cdef Py_UNICODE* c_uni = uni string_from_slice(s, c_uni, 0, len(uni)) cdef inline void string_from_slice(String* s, Py_UNICODE* chars, size_t start, size_t end) nogil: s.chars = &chars[start] s.n = end - start s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0) cdef inline void string_slice_prefix(String* s, String* prefix, size_t n) nogil: string_from_slice(prefix, s.chars, 0, n) s.chars += n s.n -= n s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0) cdef inline void string_slice_suffix(String* s, String* suffix, size_t n) nogil: string_from_slice(suffix, s.chars, s.n - n, s.n) s.n -= n s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)