diff --git a/spacy/en.pxd b/spacy/en.pxd
index 6887dbc08..cee754d9c 100644
--- a/spacy/en.pxd
+++ b/spacy/en.pxd
@@ -10,6 +10,7 @@ cpdef enum en_person_t:
     FIRST
     SECOND
     THIRD
+    NON_THIRD
 
 
 cpdef enum en_number_t:
@@ -17,14 +18,22 @@ cpdef enum en_number_t:
     SINGULAR
     PLURAL
     MASS
-    CARDINAL
-    ORDINAL
 
 
 cpdef enum en_gender_t:
     NO_GENDER
     MASCULINE
     FEMININE
+    NEUTER
+
+
+cpdef enum en_case_t:
+    NO_CASE
+    NOMINATIVE
+    GENITIVE
+    ACCUSATIVE
+    REFLEXIVE
+    DEMONYM
 
 
 cpdef enum en_tenspect_t:
@@ -37,23 +46,12 @@ cpdef enum en_tenspect_t:
     MODAL
 
 
-cpdef enum en_case_t:
-    NO_CASE
-    NOMINATIVE
-    ACCUSATIVE
-    GENITIVE
-    DEMONYM
-
-
 cpdef enum misc_t:
     NO_MISC
     COMPARATIVE
     SUPERLATIVE
     RELATIVE
     NAME
-    URL
-    EMAIL
-    EMOTICON
 
 
 # Flags
diff --git a/spacy/en.pyx b/spacy/en.pyx
index fa59ef933..0136818f2 100644
--- a/spacy/en.pyx
+++ b/spacy/en.pyx
@@ -38,6 +38,8 @@ import orth
 from .tagger cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB
 from .tagger cimport X, PUNCT, EOL
 
+from .tokens cimport Morphology
+
 
 POS_TAGS = {
     'NULL': (NO_TAG, {}),
@@ -152,7 +154,8 @@ cdef class English(Language):
         for i in range(tokens.length):
             fill_pos_context(context, i, t)
             t[i].pos = self.pos_tagger.predict(context)
-            #self.morphalyser.set_token(&t[i])
+            _merge_morph(&t[i].morph, &self.pos_tagger.tags[t[i].pos].morph)
+            t[i].lemma = self.lemmatize(self.pos_tagger.tags[t[i].pos].pos, t[i].lex)
 
     def train_pos(self, Tokens tokens, golds):
         cdef int i
@@ -162,11 +165,27 @@ cdef class English(Language):
         for i in range(tokens.length):
             fill_pos_context(context, i, t)
             t[i].pos = self.pos_tagger.predict(context, [golds[i]])
-            t[i].morph = self.pos_tagger.tags[t[i].pos].morph
-            #self.analyse_morph(&t[i].lemma, &t[i].morph, t[i].pos, t[i].lex)
+            _merge_morph(&t[i].morph, &self.pos_tagger.tags[t[i].pos].morph)
+            t[i].lemma = self.lemmatize(self.pos_tagger.tags[t[i].pos].pos, t[i].lex)
             c += t[i].pos == golds[i]
         return c
 
 
+cdef int _merge_morph(Morphology* tok_morph, const Morphology* pos_morph) except -1:
+    if tok_morph.number == 0:
+        tok_morph.number = pos_morph.number
+    if tok_morph.tenspect == 0:
+        tok_morph.tenspect = pos_morph.tenspect
+    if tok_morph.mood == 0:
+        tok_morph.mood = pos_morph.mood
+    if tok_morph.gender == 0:
+        tok_morph.gender = pos_morph.gender
+    if tok_morph.person == 0:
+        tok_morph.person = pos_morph.person
+    if tok_morph.case == 0:
+        tok_morph.case = pos_morph.case
+    if tok_morph.misc == 0:
+        tok_morph.misc = pos_morph.misc
+
 
 EN = English('en')
diff --git a/spacy/lang.pxd b/spacy/lang.pxd
index 124281a6b..0307e12fe 100644
--- a/spacy/lang.pxd
+++ b/spacy/lang.pxd
@@ -9,7 +9,7 @@ from .typedefs cimport hash_t
 from .tokens cimport Tokens, TokenC
 from .lexeme cimport Lexeme
 from .tagger cimport Tagger
-from .tagger cimport PosTag
+from .tagger cimport univ_tag_t
 from .utf8string cimport StringStore, UniStr
 
 
@@ -38,11 +38,12 @@ cdef class Language:
     cdef object _suffix_re
     cdef object _infix_re
 
-    cdef int lemmatize(self, const PosTag* pos, const Lexeme* lex) except -1
+    cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1
 
     cpdef Tokens tokens_from_list(self, list strings)
     cpdef Tokens tokenize(self, unicode text)
 
+    cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1
     cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1
     cdef UniStr* _split_affixes(self, UniStr* string, vector[Lexeme*] *prefixes,
                                 vector[Lexeme*] *suffixes) except NULL
diff --git a/spacy/lang.pyx b/spacy/lang.pyx
index fdeb7df66..cdae8644a 100644
--- a/spacy/lang.pyx
+++ b/spacy/lang.pyx
@@ -28,6 +28,7 @@ from .util import read_lang_data
 from .tokens import Tokens
 
 from .tagger cimport NOUN, VERB, ADJ, N_UNIV_TAGS
+from .tokens cimport Morphology
 
 
 cdef class Language:
@@ -53,27 +54,27 @@ cdef class Language:
         if path.exists(path.join(util.DATA_DIR, self.name, 'pos')):
             self.pos_tagger = Tagger(path.join(util.DATA_DIR, self.name, 'pos'))
 
-    cdef int lemmatize(self, const PosTag* pos, const Lexeme* lex) except -1:
+    cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1:
         if self.lemmatizer is None:
             return lex.sic
-        if pos.pos != NOUN and pos.pos != VERB and pos.pos != ADJ:
+        if pos != NOUN and pos != VERB and pos != ADJ:
             return lex.sic
-        cdef int lemma = self._lemmas.get(pos.pos, lex.sic)
+        cdef int lemma = self._lemmas.get(pos, lex.sic)
         if lemma != 0:
             return lemma
         cdef bytes py_string = self.lexicon.strings[lex.sic]
         cdef set lemma_strings
         cdef bytes lemma_string
-        if pos.pos == NOUN:
+        if pos == NOUN:
             lemma_strings = self.lemmatizer.noun(py_string)
-        elif pos.pos == VERB:
+        elif pos == VERB:
             lemma_strings = self.lemmatizer.verb(py_string)
         else:
-            assert pos.pos == ADJ
+            assert pos == ADJ
             lemma_strings = self.lemmatizer.adj(py_string)
         lemma_string = sorted(lemma_strings)[0]
         lemma = self.lexicon.strings.intern(lemma_string, len(lemma_string)).i
-        self._lemmas.set(pos.pos, lex.sic, lemma)
+        self._lemmas.set(pos, lex.sic, lemma)
         return lemma
 
     cpdef Tokens tokens_from_list(self, list strings):
@@ -111,6 +112,7 @@ cdef class Language:
             return tokens
         cdef int i = 0
         cdef int start = 0
+        cdef bint cache_hit
         cdef Py_UNICODE* chars = string
         cdef bint in_ws = Py_UNICODE_ISSPACE(chars[0])
         cdef UniStr span
@@ -118,10 +120,8 @@ cdef class Language:
             if Py_UNICODE_ISSPACE(chars[i]) != in_ws:
                 if start < i:
                     slice_unicode(&span, chars, start, i)
-                    lexemes = self._cache.get(span.key)
-                    if lexemes != NULL:
-                        tokens.extend(start, lexemes, 0)
-                    else:
+                    cache_hit = self._try_cache(start, span.key, tokens)
+                    if not cache_hit:
                         self._tokenize(tokens, &span, start, i)
                 in_ws = not in_ws
                 start = i
@@ -130,13 +130,32 @@ cdef class Language:
            i += 1
         if start < i:
             slice_unicode(&span, chars, start, i)
-            lexemes = self._cache.get(span.key)
-            if lexemes != NULL:
-                tokens.extend(start, lexemes, 0)
-            else:
+            cache_hit = self._try_cache(start, span.key, tokens)
+            if not cache_hit:
                 self._tokenize(tokens, &span, start, i)
         return tokens
 
+    cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1:
+        cdef int i
+        specials = self._specials.get(key)
+        if specials != NULL:
+            i = 0
+            while specials[i].lex != NULL:
+                tokens.push_back(idx, specials[i].lex)
+                tokens.data[tokens.length - 1].pos = specials[i].pos
+                tokens.data[tokens.length - 1].morph = specials[i].morph
+                tokens.data[tokens.length - 1].lemma = specials[i].lemma
+                tokens.data[tokens.length - 1].sense = specials[i].sense
+                i += 1
+            return True
+        else:
+            cached = self._cache.get(key)
+            if cached != NULL:
+                tokens.extend(i, cached, 0)
+                return True
+            else:
+                return False
+
     cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1:
         cdef vector[Lexeme*] prefixes
         cdef vector[Lexeme*] suffixes
@@ -190,10 +209,10 @@ cdef class Language:
                     break
         return string
 
-    cdef int _attach_tokens(self, Tokens tokens,
-                            int idx, UniStr* string,
+    cdef int _attach_tokens(self, Tokens tokens, int idx, UniStr* string,
                             vector[const Lexeme*] *prefixes,
                             vector[const Lexeme*] *suffixes) except -1:
+        cdef bint cache_hit
         cdef int split
         cdef const Lexeme* const* lexemes
         cdef Lexeme* lexeme
@@ -201,10 +220,9 @@ cdef class Language:
         if prefixes.size():
             idx = tokens.extend(idx, prefixes.data(), prefixes.size())
         if string.n != 0:
-
-            lexemes = self._cache.get(string.key)
-            if lexemes != NULL:
-                idx = tokens.extend(idx, lexemes, 0)
+            cache_hit = self._try_cache(idx, string.key, tokens)
+            if cache_hit:
+                idx = tokens.data[tokens.length - 1].idx + 1
             else:
                 split = self._find_infix(string.chars, string.n)
                 if split == 0 or split == -1:
@@ -247,30 +265,42 @@ cdef class Language:
         match = self._suffix_re.search(string)
         return (match.end() - match.start()) if match is not None else 0
 
-    def _load_special_tokenization(self, token_rules):
-        '''Load special-case tokenization rules.
-
-        Loads special-case tokenization rules into the Language._cache cache,
-        read from data/<lang>/tokenization . The special cases are loaded before
-        any language data is tokenized, giving these priority. For instance,
-        the English tokenization rules map "ain't" to ["are", "not"].
-
-        Args:
-            token_rules (list): A list of (chunk, tokens) pairs, where chunk is
-                a string and tokens is a list of strings.
+    def _load_special_tokenization(self, object rules):
+        '''Add a special-case tokenization rule.
         '''
+        cdef int i
+        cdef unicode chunk
+        cdef list substrings
+        cdef unicode form
+        cdef unicode lemma
+        cdef dict props
         cdef Lexeme** lexemes
         cdef hash_t hashed
         cdef UniStr string
-        for uni_string, substrings in token_rules:
-            lexemes = self.mem.alloc(len(substrings) + 1, sizeof(Lexeme*))
-            for i, substring in enumerate(substrings):
-                slice_unicode(&string, substring, 0, len(substring))
-                lexemes[i] = self.lexicon.get(self.lexicon.mem, &string)
-            lexemes[i + 1] = NULL
-            slice_unicode(&string, uni_string, 0, len(uni_string))
-            self._specials.set(string.key, lexemes)
-            self._cache.set(string.key, lexemes)
+        for chunk, substrings in sorted(rules.items()):
+            tokens = self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
+            for i, props in enumerate(substrings):
+                form = props['F']
+                lemma = props.get("L", None)
+                slice_unicode(&string, form, 0, len(form))
+                tokens[i].lex = self.lexicon.get(self.lexicon.mem, &string)
+                if lemma:
+                    tokens[i].lemma = self.lexicon.strings[lemma]
+                set_morph_from_dict(&tokens[i].morph, props)
+            # Null-terminated array
+            tokens[i+1].lex = NULL
+            slice_unicode(&string, chunk, 0, len(chunk))
+            self._specials.set(string.key, tokens)
+
+
+cdef int set_morph_from_dict(Morphology* morph, dict props) except -1:
+    morph.number = props.get('number', 0)
+    morph.tenspect = props.get('tenspect', 0)
+    morph.mood = props.get('mood', 0)
+    morph.gender = props.get('gender', 0)
+    morph.person = props.get('person', 0)
+    morph.case = props.get('case', 0)
+    morph.misc = props.get('misc', 0)
 
 
 cdef class Lexicon:
diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd
index 6f4691716..f3d6011ec 100644
--- a/spacy/tokens.pxd
+++ b/spacy/tokens.pxd
@@ -21,7 +21,6 @@ cdef struct Morphology:
     uint8_t misc
 
 
-
 cdef struct TokenC:
     const Lexeme* lex
     Morphology morph
diff --git a/spacy/util.py b/spacy/util.py
index 5062ca6db..ff03760a5 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -13,7 +13,8 @@ def utf8open(loc, mode='r'):
 
 def read_lang_data(name):
     data_dir = path.join(DATA_DIR, name)
-    tokenization = read_tokenization(name)
+    with open(path.join(data_dir, 'specials.json')) as file_:
+        tokenization = ujson.load(file_)
     prefix = read_prefix(data_dir)
     suffix = read_suffix(data_dir)
     infix = read_infix(data_dir)
@@ -26,12 +27,17 @@ def read_prefix(data_dir):
     expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
     return expression
 
+
 def read_suffix(data_dir):
-    with utf8open(path.join(data_dir, 'suffix')) as file_:
+    with utf8open(path.join(data_dir, 'suffix')) as file_:
         entries = file_.read().split('\n')
     expression = '|'.join([re.escape(piece) + '$' for piece in entries if piece.strip()])
+    # TODO: Fix this hack!
+    expression += r'|(?<=[a-z0-9])\.$'
+    expression += r'|(?<=[0-9])km$'
     return expression
 
+
 def read_infix(data_dir):
     with utf8open(path.join(data_dir, 'infix')) as file_:
         entries = file_.read().split('\n')
diff --git a/tests/test_contractions.py b/tests/test_contractions.py
index 8334a74a9..1e697afd2 100644
--- a/tests/test_contractions.py
+++ b/tests/test_contractions.py
@@ -20,15 +20,18 @@ def test_apostrophe():
 def test_LL():
     tokens = EN.tokenize("we'll")
     assert len(tokens) == 2
-    assert tokens[1].string == "will"
+    assert tokens[1].string == "'ll"
+    assert tokens[1].lemma == "will"
     assert tokens[0].string == "we"
 
 
 def test_aint():
     tokens = EN.tokenize("ain't")
     assert len(tokens) == 2
-    assert tokens[0].string == "are"
-    assert tokens[1].string == "not"
+    assert tokens[0].string == "ai"
+    assert tokens[0].lemma == "be"
+    assert tokens[1].string == "n't"
+    assert tokens[1].lemma == "not"
 
 
 def test_capitalized():
@@ -38,7 +41,8 @@ def test_capitalized():
     assert len(tokens) == 2
     tokens = EN.tokenize("Ain't")
     assert len(tokens) == 2
-    assert tokens[0].string == "Are"
+    assert tokens[0].string == "Ai"
+    assert tokens[0].lemma == "be"
 
 
 def test_punct():
diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py
index fb5f78ed7..21d115b9b 100644
--- a/tests/test_tokenizer.py
+++ b/tests/test_tokenizer.py
@@ -34,7 +34,7 @@ def test_digits():
 def test_contraction():
     tokens = EN.tokenize("don't giggle")
     assert len(tokens) == 3
-    assert tokens[1].sic == EN.lexicon["not"]['sic']
+    assert tokens[1].sic == EN.lexicon["n't"]['sic']
     tokens = EN.tokenize("i said don't!")
     assert len(tokens) == 5
     assert tokens[4].sic == EN.lexicon['!']['sic']
@@ -71,30 +71,39 @@ def test_cnts1():
     tokens = EN.tokenize(text)
     assert len(tokens) == 8
 
+
 def test_cnts2():
     text = u"""U.N. regulations are not a part of their concern."""
     tokens = EN.tokenize(text)
     assert len(tokens) == 10
 
+
 def test_cnts3():
     text = u"“Isn't it?”"
     tokens = EN.tokenize(text)
-    assert len(tokens) == 6
+    words = [t.string for t in tokens]
+    assert len(words) == 6
+
 
 def test_cnts4():
     text = u"""Yes! "I'd rather have a walk", Ms. Comble sighed. """
     tokens = EN.tokenize(text)
-    assert len(tokens) == 15
+    words = [t.string for t in tokens]
+    assert len(words) == 15
+
 
 def test_cnts5():
     text = """'Me too!', Mr. P. Delaware cried. """
     tokens = EN.tokenize(text)
     assert len(tokens) == 11
 
+
 def test_cnts6():
     text = u'They ran about 10km.'
     tokens = EN.tokenize(text)
-    assert len(tokens) == 6
+    words = [t.string for t in tokens]
+    assert len(words) == 6
+
 
 #def test_cnts7():
#    text = 'But then the 6,000-year ice age came...'
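Note: the new specials.json data file itself is not part of this diff. Below is a minimal sketch of the rule format that _load_special_tokenization and set_morph_from_dict consume, using only the 'F' (form) and 'L' (lemma) keys exercised by the updated tests; any of the Morphology field names ('number', 'tenspect', 'mood', 'gender', 'person', 'case', 'misc') may also be supplied as integers and default to 0. The entries shown are illustrative, not the shipped data file.

import json

# Hypothetical specials.json content -- a sketch, not the real data file.
# Each chunk maps to a list of per-token property dicts: 'F' is the surface
# form, 'L' an optional lemma; the forms and lemmas here mirror the
# assertions in tests/test_contractions.py.
SPECIALS = {
    "ain't": [{"F": "ai", "L": "be"}, {"F": "n't", "L": "not"}],
    "Ain't": [{"F": "Ai", "L": "be"}, {"F": "n't", "L": "not"}],
    "we'll": [{"F": "we"}, {"F": "'ll", "L": "will"}],
}

if __name__ == '__main__':
    # read_lang_data() loads the file with ujson; plain json shows the same shape.
    print(json.dumps(SPECIALS, indent=4))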