Work on fixing special-cases, reading them in as JSON objects so that they can specify lemmas
commit 302e09018b (parent cda9ea9a4a)
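The special-case data now comes from specials.json in the language's data directory, read by read_lang_data and _load_special_tokenization below: each chunk maps to a list of token objects, where 'F' gives the surface form, 'L' optionally gives a lemma, and any of the morphology keys consumed by set_morph_from_dict ('number', 'tenspect', 'mood', 'gender', 'person', 'case', 'misc') may also be set. A sketch of what one entry might look like, inferred from the loader and the updated tests (the actual specials.json contents are not part of this diff):

    {
        "ain't": [
            {"F": "ai", "L": "be"},
            {"F": "n't", "L": "not"}
        ]
    }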
					
				
							
								
								
									
spacy/en.pxd (24 changed lines)

@@ -10,6 +10,7 @@ cpdef enum en_person_t:
    FIRST
    SECOND
    THIRD
    NON_THIRD


cpdef enum en_number_t:
@@ -17,14 +18,22 @@ cpdef enum en_number_t:
    SINGULAR
    PLURAL
    MASS
    CARDINAL
    ORDINAL


cpdef enum en_gender_t:
    NO_GENDER
    MASCULINE
    FEMININE
    NEUTER


cpdef enum en_case_t:
    NO_CASE
    NOMINATIVE
    GENITIVE
    ACCUSATIVE
    REFLEXIVE
    DEMONYM


cpdef enum en_tenspect_t:
@@ -37,23 +46,12 @@ cpdef enum en_tenspect_t:
    MODAL


cpdef enum en_case_t:
    NO_CASE
    NOMINATIVE
    ACCUSATIVE
    GENITIVE
    DEMONYM


cpdef enum misc_t:
    NO_MISC
    COMPARATIVE
    SUPERLATIVE
    RELATIVE
    NAME
    URL
    EMAIL
    EMOTICON


# Flags
							
								
								
									
spacy/en.pyx (25 changed lines)

@@ -38,6 +38,8 @@ import orth
from .tagger cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB
from .tagger cimport X, PUNCT, EOL

from .tokens cimport Morphology


POS_TAGS = {
    'NULL': (NO_TAG, {}),
@@ -152,7 +154,8 @@ cdef class English(Language):
        for i in range(tokens.length):
            fill_pos_context(context, i, t)
            t[i].pos = self.pos_tagger.predict(context)
            #self.morphalyser.set_token(&t[i])
            _merge_morph(&t[i].morph, &self.pos_tagger.tags[t[i].pos].morph)
            t[i].lemma = self.lemmatize(self.pos_tagger.tags[t[i].pos].pos, t[i].lex)

    def train_pos(self, Tokens tokens, golds):
        cdef int i
@@ -162,11 +165,27 @@ cdef class English(Language):
        for i in range(tokens.length):
            fill_pos_context(context, i, t)
            t[i].pos = self.pos_tagger.predict(context, [golds[i]])
            t[i].morph = self.pos_tagger.tags[t[i].pos].morph
            #self.analyse_morph(&t[i].lemma, &t[i].morph, t[i].pos, t[i].lex)
            _merge_morph(&t[i].morph, &self.pos_tagger.tags[t[i].pos].morph)
            t[i].lemma = self.lemmatize(self.pos_tagger.tags[t[i].pos].pos, t[i].lex)
            c += t[i].pos == golds[i]
        return c


cdef int _merge_morph(Morphology* tok_morph, const Morphology* pos_morph) except -1:
    if tok_morph.number == 0:
        tok_morph.number = pos_morph.number
    if tok_morph.tenspect == 0:
        tok_morph.tenspect = pos_morph.tenspect
    if tok_morph.mood == 0:
        tok_morph.mood = pos_morph.mood
    if tok_morph.gender == 0:
        tok_morph.gender = pos_morph.gender
    if tok_morph.person == 0:
        tok_morph.person = pos_morph.person
    if tok_morph.case == 0:
        tok_morph.case = pos_morph.case
    if tok_morph.misc == 0:
        tok_morph.misc = pos_morph.misc


EN = English('en')
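_merge_morph treats zero-valued fields as unset: whatever morphology a special-case rule already wrote onto the token is kept, and the POS tag's morphology only fills the remaining fields. A minimal pure-Python sketch of the same precedence rule, using a dict in place of the C Morphology struct (field names follow set_morph_from_dict in spacy/lang.pyx):

    MORPH_FIELDS = ('number', 'tenspect', 'mood', 'gender', 'person', 'case', 'misc')

    def merge_morph(tok_morph, pos_morph):
        # 0 means "unset": values already set by a special-case rule win
        # over the defaults attached to the predicted POS tag.
        for field in MORPH_FIELDS:
            if tok_morph.get(field, 0) == 0:
                tok_morph[field] = pos_morph.get(field, 0)
        return tok_morph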

spacy/lang.pxd

@@ -9,7 +9,7 @@ from .typedefs cimport hash_t
from .tokens cimport Tokens, TokenC
from .lexeme cimport Lexeme
from .tagger cimport Tagger
from .tagger cimport PosTag
from .tagger cimport univ_tag_t
from .utf8string cimport StringStore, UniStr


@@ -38,11 +38,12 @@ cdef class Language:
    cdef object _suffix_re
    cdef object _infix_re

    cdef int lemmatize(self, const PosTag* pos, const Lexeme* lex) except -1
    cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1

    cpdef Tokens tokens_from_list(self, list strings)
    cpdef Tokens tokenize(self, unicode text)

    cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1
    cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1
    cdef UniStr* _split_affixes(self, UniStr* string, vector[Lexeme*] *prefixes,
                             vector[Lexeme*] *suffixes) except NULL
							
								
								
									
spacy/lang.pyx (112 changed lines)

@@ -28,6 +28,7 @@ from .util import read_lang_data
from .tokens import Tokens

from .tagger cimport NOUN, VERB, ADJ, N_UNIV_TAGS
from .tokens cimport Morphology


cdef class Language:
@@ -53,27 +54,27 @@ cdef class Language:
        if path.exists(path.join(util.DATA_DIR, self.name, 'pos')):
            self.pos_tagger = Tagger(path.join(util.DATA_DIR, self.name, 'pos'))

    cdef int lemmatize(self, const PosTag* pos, const Lexeme* lex) except -1:
    cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1:
        if self.lemmatizer is None:
            return lex.sic
        if pos.pos != NOUN and pos.pos != VERB and pos.pos != ADJ:
        if pos != NOUN and pos != VERB and pos != ADJ:
            return lex.sic
        cdef int lemma = <int><size_t>self._lemmas.get(pos.pos, lex.sic)
        cdef int lemma = <int><size_t>self._lemmas.get(pos, lex.sic)
        if lemma != 0:
            return lemma
        cdef bytes py_string = self.lexicon.strings[lex.sic]
        cdef set lemma_strings
        cdef bytes lemma_string
        if pos.pos == NOUN:
        if pos == NOUN:
            lemma_strings = self.lemmatizer.noun(py_string)
        elif pos.pos == VERB:
        elif pos == VERB:
            lemma_strings = self.lemmatizer.verb(py_string)
        else:
            assert pos.pos == ADJ
            assert pos == ADJ
            lemma_strings = self.lemmatizer.adj(py_string)
        lemma_string = sorted(lemma_strings)[0]
        lemma = self.lexicon.strings.intern(lemma_string, len(lemma_string)).i
        self._lemmas.set(pos.pos, lex.sic, <void*>lemma)
        self._lemmas.set(pos, lex.sic, <void*>lemma)
        return lemma

    cpdef Tokens tokens_from_list(self, list strings):
@@ -111,6 +112,7 @@ cdef class Language:
            return tokens
        cdef int i = 0
        cdef int start = 0
        cdef bint cache_hit
        cdef Py_UNICODE* chars = string
        cdef bint in_ws = Py_UNICODE_ISSPACE(chars[0])
        cdef UniStr span
@@ -118,10 +120,8 @@
            if Py_UNICODE_ISSPACE(chars[i]) != in_ws:
                if start < i:
                    slice_unicode(&span, chars, start, i)
                    lexemes = <const Lexeme* const*>self._cache.get(span.key)
                    if lexemes != NULL:
                        tokens.extend(start, lexemes, 0)
                    else:
                    cache_hit = self._try_cache(start, span.key, tokens)
                    if not cache_hit:
                        self._tokenize(tokens, &span, start, i)
                in_ws = not in_ws
                start = i
@@ -130,13 +130,32 @@
        i += 1
        if start < i:
            slice_unicode(&span, chars, start, i)
            lexemes = <const Lexeme* const*>self._cache.get(span.key)
            if lexemes != NULL:
                tokens.extend(start, lexemes, 0)
            else:
            cache_hit = self._try_cache(start, span.key, tokens)
            if not cache_hit:
                self._tokenize(tokens, &span, start, i)
        return tokens

    cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1:
        cdef int i
        specials = <TokenC*>self._specials.get(key)
        if specials != NULL:
            i = 0
            while specials[i].lex != NULL:
                tokens.push_back(idx, specials[i].lex)
                tokens.data[tokens.length - 1].pos = specials[i].pos
                tokens.data[tokens.length - 1].morph = specials[i].morph
                tokens.data[tokens.length - 1].lemma = specials[i].lemma
                tokens.data[tokens.length - 1].sense = specials[i].sense
                i += 1
            return True
        else:
            cached = <const Lexeme* const*>self._cache.get(key)
            if cached != NULL:
                tokens.extend(i, cached, 0)
                return True
            else:
                return False

    cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1:
        cdef vector[Lexeme*] prefixes
        cdef vector[Lexeme*] suffixes
@@ -190,10 +209,10 @@
                break
        return string

    cdef int _attach_tokens(self, Tokens tokens,
                            int idx, UniStr* string,
    cdef int _attach_tokens(self, Tokens tokens, int idx, UniStr* string,
                            vector[const Lexeme*] *prefixes,
                            vector[const Lexeme*] *suffixes) except -1:
        cdef bint cache_hit
        cdef int split
        cdef const Lexeme* const* lexemes
        cdef Lexeme* lexeme
@@ -201,10 +220,9 @@
        if prefixes.size():
            idx = tokens.extend(idx, prefixes.data(), prefixes.size())
        if string.n != 0:

            lexemes = <const Lexeme* const*>self._cache.get(string.key)
            if lexemes != NULL:
                idx = tokens.extend(idx, lexemes, 0)
            cache_hit = self._try_cache(idx, string.key, tokens)
            if cache_hit:
                idx = tokens.data[tokens.length - 1].idx + 1
            else:
                split = self._find_infix(string.chars, string.n)
                if split == 0 or split == -1:
@@ -247,30 +265,42 @@
        match = self._suffix_re.search(string)
        return (match.end() - match.start()) if match is not None else 0

    def _load_special_tokenization(self, token_rules):
        '''Load special-case tokenization rules.

        Loads special-case tokenization rules into the Language._cache cache,
        read from data/<lang>/tokenization . The special cases are loaded before
        any language data is tokenized, giving these priority.  For instance,
        the English tokenization rules map "ain't" to ["are", "not"].

        Args:
            token_rules (list): A list of (chunk, tokens) pairs, where chunk is
                a string and tokens is a list of strings.
    def _load_special_tokenization(self, object rules):
        '''Add a special-case tokenization rule.
        '''
        cdef int i
        cdef unicode chunk
        cdef list substrings
        cdef unicode form
        cdef unicode lemma
        cdef dict props
        cdef Lexeme** lexemes
        cdef hash_t hashed
        cdef UniStr string
        for uni_string, substrings in token_rules:
            lexemes = <Lexeme**>self.mem.alloc(len(substrings) + 1, sizeof(Lexeme*))
            for i, substring in enumerate(substrings):
                slice_unicode(&string, substring, 0, len(substring))
                lexemes[i] = <Lexeme*>self.lexicon.get(self.lexicon.mem, &string)
            lexemes[i + 1] = NULL
            slice_unicode(&string, uni_string, 0, len(uni_string))
            self._specials.set(string.key, lexemes)
            self._cache.set(string.key, lexemes)
        for chunk, substrings in sorted(rules.items()):
            tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
            for i, props in enumerate(substrings):
                form = props['F']
                lemma = props.get("L", None)
                slice_unicode(&string, form, 0, len(form))
                tokens[i].lex = <Lexeme*>self.lexicon.get(self.lexicon.mem, &string)
                if lemma:
                    tokens[i].lemma = self.lexicon.strings[lemma]
                set_morph_from_dict(&tokens[i].morph, props)
            # Null-terminated array
            tokens[i+1].lex = NULL
            slice_unicode(&string, chunk, 0, len(chunk))
            self._specials.set(string.key, tokens)


cdef int set_morph_from_dict(Morphology* morph, dict props) except -1:
    morph.number = props.get('number', 0)
    morph.tenspect = props.get('tenspect', 0)
    morph.mood = props.get('mood', 0)
    morph.gender = props.get('gender', 0)
    morph.person = props.get('person', 0)
    morph.case = props.get('case', 0)
    morph.misc = props.get('misc', 0)


cdef class Lexicon:
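
Taken together, _load_special_tokenization builds a table from each special-case chunk to a pre-analysed, null-terminated TokenC array (surface form, optional lemma, optional morphology), and _try_cache above copies that analysis straight onto the token stream before the regular tokenizer runs. A rough pure-Python model of the flow, with the C details (hashing, Pool allocation, null termination) left out and plain dicts standing in for the PreshMap tables:

    def load_special_tokenization(specials, lexicon, rules):
        # rules: {chunk: [{'F': form, 'L': lemma, ...morphology keys...}, ...]}
        for chunk, substrings in sorted(rules.items()):
            analysed = []
            for props in substrings:
                analysed.append({
                    'lex': lexicon[props['F']],   # lexeme for the surface form
                    'lemma': props.get('L'),      # optional lemma override
                    'morph': {key: props.get(key, 0)
                              for key in ('number', 'tenspect', 'mood', 'gender',
                                          'person', 'case', 'misc')},
                })
            specials[chunk] = analysed

    def try_cache(specials, chunk, output_tokens):
        analysed = specials.get(chunk)
        if analysed is None:
            return False
        output_tokens.extend(analysed)            # reuse the stored analysis verbatim
        return True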

spacy/tokens.pxd

@@ -21,7 +21,6 @@ cdef struct Morphology:
    uint8_t misc



cdef struct TokenC:
    const Lexeme* lex
    Morphology morph

spacy/util.py

@@ -13,7 +13,8 @@ def utf8open(loc, mode='r'):
def read_lang_data(name):
    data_dir = path.join(DATA_DIR, name)
    tokenization = read_tokenization(name)
    with open(path.join(data_dir, 'specials.json')) as file_:
        tokenization = ujson.load(file_)
    prefix = read_prefix(data_dir)
    suffix = read_suffix(data_dir)
    infix = read_infix(data_dir)
@@ -26,12 +27,17 @@ def read_prefix(data_dir):
        expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
    return expression


def read_suffix(data_dir):
    with utf8open(path.join(data_dir, 'suffix')) as file_:
        entries = file_.read().split('\n')
        expression = '|'.join([re.escape(piece) + '$' for piece in entries if piece.strip()])
    # TODO: Fix this hack!
    expression += r'|(?<=[a-z0-9])\.$'
    expression += r'|(?<=[0-9])km$'
    return expression


def read_infix(data_dir):
    with utf8open(path.join(data_dir, 'infix')) as file_:
        entries = file_.read().split('\n')
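
The two patterns appended in read_suffix are what let the tokenizer tests below split "10km.": a sentence-final period after a letter or digit, and a trailing "km" after a digit, are both peeled off as suffixes. A quick check of just those two additions in isolation (the full suffix expression also includes the entries read from the suffix data file):

    import re

    # Only the two hard-coded additions from read_suffix, not the full expression.
    extra_suffix_re = re.compile(r'(?<=[a-z0-9])\.$|(?<=[0-9])km$')

    assert extra_suffix_re.search('10km.').group() == '.'   # the period is split off first...
    assert extra_suffix_re.search('10km').group() == 'km'   # ...then 'km', leaving '10'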

(tests: contractions and special cases)

@@ -20,15 +20,18 @@ def test_apostrophe():
def test_LL():
    tokens = EN.tokenize("we'll")
    assert len(tokens) == 2
    assert tokens[1].string == "will"
    assert tokens[1].string == "'ll"
    assert tokens[1].lemma == "will"
    assert tokens[0].string == "we"


def test_aint():
    tokens = EN.tokenize("ain't")
    assert len(tokens) == 2
    assert tokens[0].string == "are"
    assert tokens[1].string == "not"
    assert tokens[0].string == "ai"
    assert tokens[0].lemma == "be"
    assert tokens[1].string == "n't"
    assert tokens[1].lemma == "not"


def test_capitalized():
@@ -38,7 +41,8 @@ def test_capitalized():
    assert len(tokens) == 2
    tokens = EN.tokenize("Ain't")
    assert len(tokens) == 2
    assert tokens[0].string == "Are"
    assert tokens[0].string == "Ai"
    assert tokens[0].lemma == "be"


def test_punct():

(tests: tokenizer)

@@ -34,7 +34,7 @@ def test_digits():
def test_contraction():
    tokens = EN.tokenize("don't giggle")
    assert len(tokens) == 3
    assert tokens[1].sic == EN.lexicon["not"]['sic']
    assert tokens[1].sic == EN.lexicon["n't"]['sic']
    tokens = EN.tokenize("i said don't!")
    assert len(tokens) == 5
    assert tokens[4].sic == EN.lexicon['!']['sic']
@@ -71,30 +71,39 @@ def test_cnts1():
    tokens = EN.tokenize(text)
    assert len(tokens) == 8


def test_cnts2():
    text = u"""U.N. regulations are not a part of their concern."""
    tokens = EN.tokenize(text)
    assert len(tokens) == 10


def test_cnts3():
    text = u"“Isn't it?”"
    tokens = EN.tokenize(text)
    assert len(tokens) == 6
    words = [t.string for t in tokens]
    assert len(words) == 6


def test_cnts4():
    text = u"""Yes! "I'd rather have a walk", Ms. Comble sighed. """
    tokens = EN.tokenize(text)
    assert len(tokens) == 15
    words = [t.string for t in tokens]
    assert len(words) == 15


def test_cnts5():
    text = """'Me too!', Mr. P. Delaware cried. """
    tokens = EN.tokenize(text)
    assert len(tokens) == 11


def test_cnts6():
    text = u'They ran about 10km.'
    tokens = EN.tokenize(text)
    assert len(tokens) == 6
    words = [t.string for t in tokens]
    assert len(words) == 6


#def test_cnts7():
#    text = 'But then the 6,000-year ice age came...'