Mirror of https://github.com/explosion/spaCy.git
	* Work on fixing special-cases, reading them in as JSON objects so that they can specify lemmas
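For context, the loader change in spacy/lang.pyx below reads each special case as a list of token property dicts: 'F' is the token's form, 'L' an optional lemma, and the Morphology field names (number, tenspect, mood, gender, person, case, misc) may also be given. A minimal sketch of what one entry of the new specials data might look like, shown as the Python dict ujson.load would produce; the forms and lemmas are taken from the updated tests, and the exact shipped file is not part of this commit:

    # Illustrative only: chunk -> list of token descriptions
    special_cases = {
        "ain't": [
            {"F": "ai", "L": "be"},
            {"F": "n't", "L": "not"},
        ],
        "we'll": [
            {"F": "we"},
            {"F": "'ll", "L": "will"},
        ],
    }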
This commit is contained in: commit 302e09018b (parent cda9ea9a4a)
spacy/en.pxd (24 lines changed)
@@ -10,6 +10,7 @@ cpdef enum en_person_t:
     FIRST
     SECOND
     THIRD
+    NON_THIRD
 
 
 cpdef enum en_number_t:
@@ -17,14 +18,22 @@ cpdef enum en_number_t:
     SINGULAR
     PLURAL
     MASS
-    CARDINAL
-    ORDINAL
 
 
 cpdef enum en_gender_t:
     NO_GENDER
     MASCULINE
     FEMININE
+    NEUTER
+
+
+cpdef enum en_case_t:
+    NO_CASE
+    NOMINATIVE
+    GENITIVE
+    ACCUSATIVE
+    REFLEXIVE
+    DEMONYM
 
 
 cpdef enum en_tenspect_t:
@@ -37,23 +46,12 @@ cpdef enum en_tenspect_t:
     MODAL
 
 
-cpdef enum en_case_t:
-    NO_CASE
-    NOMINATIVE
-    ACCUSATIVE
-    GENITIVE
-    DEMONYM
-
-
 cpdef enum misc_t:
     NO_MISC
     COMPARATIVE
     SUPERLATIVE
     RELATIVE
     NAME
-    URL
-    EMAIL
-    EMOTICON
 
 
 # Flags
spacy/en.pyx (25 lines changed)
@@ -38,6 +38,8 @@ import orth
 from .tagger cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB
 from .tagger cimport X, PUNCT, EOL
 
+from .tokens cimport Morphology
+
 
 POS_TAGS = {
     'NULL': (NO_TAG, {}),
@@ -152,7 +154,8 @@ cdef class English(Language):
         for i in range(tokens.length):
             fill_pos_context(context, i, t)
             t[i].pos = self.pos_tagger.predict(context)
-            #self.morphalyser.set_token(&t[i])
+            _merge_morph(&t[i].morph, &self.pos_tagger.tags[t[i].pos].morph)
+            t[i].lemma = self.lemmatize(self.pos_tagger.tags[t[i].pos].pos, t[i].lex)
 
     def train_pos(self, Tokens tokens, golds):
         cdef int i
@@ -162,11 +165,27 @@ cdef class English(Language):
         for i in range(tokens.length):
             fill_pos_context(context, i, t)
             t[i].pos = self.pos_tagger.predict(context, [golds[i]])
-            t[i].morph = self.pos_tagger.tags[t[i].pos].morph
-            #self.analyse_morph(&t[i].lemma, &t[i].morph, t[i].pos, t[i].lex)
+            _merge_morph(&t[i].morph, &self.pos_tagger.tags[t[i].pos].morph)
+            t[i].lemma = self.lemmatize(self.pos_tagger.tags[t[i].pos].pos, t[i].lex)
             c += t[i].pos == golds[i]
         return c
 
 
+cdef int _merge_morph(Morphology* tok_morph, const Morphology* pos_morph) except -1:
+    if tok_morph.number == 0:
+        tok_morph.number = pos_morph.number
+    if tok_morph.tenspect == 0:
+        tok_morph.tenspect = pos_morph.tenspect
+    if tok_morph.mood == 0:
+        tok_morph.mood = pos_morph.mood
+    if tok_morph.gender == 0:
+        tok_morph.gender = pos_morph.gender
+    if tok_morph.person == 0:
+        tok_morph.person = pos_morph.person
+    if tok_morph.case == 0:
+        tok_morph.case = pos_morph.case
+    if tok_morph.misc == 0:
+        tok_morph.misc = pos_morph.misc
+
 
 EN = English('en')
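The _merge_morph helper added above gives token-level morphology (for example, values set by a special-case rule) precedence over the morphology attached to the predicted POS tag: a field is only copied from the tag when the token's value is still 0, i.e. unset. A rough pure-Python equivalent of that rule, with dicts standing in for the Morphology struct:

    MORPH_FIELDS = ('number', 'tenspect', 'mood', 'gender', 'person', 'case', 'misc')

    def merge_morph(tok_morph, pos_morph):
        # 0 means "unset": keep any value the token already has,
        # fill the rest from the POS tag's morphology.
        for field in MORPH_FIELDS:
            if tok_morph[field] == 0:
                tok_morph[field] = pos_morph[field]
        return tok_morph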
spacy/lang.pxd

@@ -9,7 +9,7 @@ from .typedefs cimport hash_t
 from .tokens cimport Tokens, TokenC
 from .lexeme cimport Lexeme
 from .tagger cimport Tagger
-from .tagger cimport PosTag
+from .tagger cimport univ_tag_t
 from .utf8string cimport StringStore, UniStr
 
 
@@ -38,11 +38,12 @@ cdef class Language:
     cdef object _suffix_re
     cdef object _infix_re
 
-    cdef int lemmatize(self, const PosTag* pos, const Lexeme* lex) except -1
+    cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1
 
     cpdef Tokens tokens_from_list(self, list strings)
     cpdef Tokens tokenize(self, unicode text)
 
+    cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1
     cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1
     cdef UniStr* _split_affixes(self, UniStr* string, vector[Lexeme*] *prefixes,
                              vector[Lexeme*] *suffixes) except NULL
spacy/lang.pyx (112 lines changed)
@@ -28,6 +28,7 @@ from .util import read_lang_data
 from .tokens import Tokens
 
 from .tagger cimport NOUN, VERB, ADJ, N_UNIV_TAGS
+from .tokens cimport Morphology
 
 
 cdef class Language:
@@ -53,27 +54,27 @@ cdef class Language:
         if path.exists(path.join(util.DATA_DIR, self.name, 'pos')):
             self.pos_tagger = Tagger(path.join(util.DATA_DIR, self.name, 'pos'))
 
-    cdef int lemmatize(self, const PosTag* pos, const Lexeme* lex) except -1:
+    cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1:
         if self.lemmatizer is None:
             return lex.sic
-        if pos.pos != NOUN and pos.pos != VERB and pos.pos != ADJ:
+        if pos != NOUN and pos != VERB and pos != ADJ:
             return lex.sic
-        cdef int lemma = <int><size_t>self._lemmas.get(pos.pos, lex.sic)
+        cdef int lemma = <int><size_t>self._lemmas.get(pos, lex.sic)
         if lemma != 0:
             return lemma
         cdef bytes py_string = self.lexicon.strings[lex.sic]
         cdef set lemma_strings
         cdef bytes lemma_string
-        if pos.pos == NOUN:
+        if pos == NOUN:
             lemma_strings = self.lemmatizer.noun(py_string)
-        elif pos.pos == VERB:
+        elif pos == VERB:
             lemma_strings = self.lemmatizer.verb(py_string)
         else:
-            assert pos.pos == ADJ
+            assert pos == ADJ
             lemma_strings = self.lemmatizer.adj(py_string)
         lemma_string = sorted(lemma_strings)[0]
         lemma = self.lexicon.strings.intern(lemma_string, len(lemma_string)).i
-        self._lemmas.set(pos.pos, lex.sic, <void*>lemma)
+        self._lemmas.set(pos, lex.sic, <void*>lemma)
         return lemma
 
     cpdef Tokens tokens_from_list(self, list strings):
@@ -111,6 +112,7 @@ cdef class Language:
             return tokens
         cdef int i = 0
         cdef int start = 0
+        cdef bint cache_hit
         cdef Py_UNICODE* chars = string
         cdef bint in_ws = Py_UNICODE_ISSPACE(chars[0])
         cdef UniStr span
@@ -118,10 +120,8 @@ cdef class Language:
             if Py_UNICODE_ISSPACE(chars[i]) != in_ws:
                 if start < i:
                     slice_unicode(&span, chars, start, i)
-                    lexemes = <const Lexeme* const*>self._cache.get(span.key)
-                    if lexemes != NULL:
-                        tokens.extend(start, lexemes, 0)
-                    else:
+                    cache_hit = self._try_cache(start, span.key, tokens)
+                    if not cache_hit:
                         self._tokenize(tokens, &span, start, i)
                 in_ws = not in_ws
                 start = i
@@ -130,13 +130,32 @@ cdef class Language:
         i += 1
         if start < i:
             slice_unicode(&span, chars, start, i)
-            lexemes = <const Lexeme* const*>self._cache.get(span.key)
-            if lexemes != NULL:
-                tokens.extend(start, lexemes, 0)
-            else:
+            cache_hit = self._try_cache(start, span.key, tokens)
+            if not cache_hit:
                 self._tokenize(tokens, &span, start, i)
         return tokens
 
+    cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1:
+        cdef int i
+        specials = <TokenC*>self._specials.get(key)
+        if specials != NULL:
+            i = 0
+            while specials[i].lex != NULL:
+                tokens.push_back(idx, specials[i].lex)
+                tokens.data[tokens.length - 1].pos = specials[i].pos
+                tokens.data[tokens.length - 1].morph = specials[i].morph
+                tokens.data[tokens.length - 1].lemma = specials[i].lemma
+                tokens.data[tokens.length - 1].sense = specials[i].sense
+                i += 1
+            return True
+        else:
+            cached = <const Lexeme* const*>self._cache.get(key)
+            if cached != NULL:
+                tokens.extend(i, cached, 0)
+                return True
+            else:
+                return False
+
     cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1:
         cdef vector[Lexeme*] prefixes
         cdef vector[Lexeme*] suffixes
@@ -190,10 +209,10 @@ cdef class Language:
                 break
         return string
 
-    cdef int _attach_tokens(self, Tokens tokens,
-                            int idx, UniStr* string,
+    cdef int _attach_tokens(self, Tokens tokens, int idx, UniStr* string,
                             vector[const Lexeme*] *prefixes,
                             vector[const Lexeme*] *suffixes) except -1:
+        cdef bint cache_hit
         cdef int split
         cdef const Lexeme* const* lexemes
         cdef Lexeme* lexeme
@@ -201,10 +220,9 @@ cdef class Language:
         if prefixes.size():
             idx = tokens.extend(idx, prefixes.data(), prefixes.size())
         if string.n != 0:
-
-            lexemes = <const Lexeme* const*>self._cache.get(string.key)
-            if lexemes != NULL:
-                idx = tokens.extend(idx, lexemes, 0)
+            cache_hit = self._try_cache(idx, string.key, tokens)
+            if cache_hit:
+                idx = tokens.data[tokens.length - 1].idx + 1
             else:
                 split = self._find_infix(string.chars, string.n)
                 if split == 0 or split == -1:
@@ -247,30 +265,42 @@ cdef class Language:
         match = self._suffix_re.search(string)
         return (match.end() - match.start()) if match is not None else 0
 
-    def _load_special_tokenization(self, token_rules):
-        '''Load special-case tokenization rules.
-
-        Loads special-case tokenization rules into the Language._cache cache,
-        read from data/<lang>/tokenization . The special cases are loaded before
-        any language data is tokenized, giving these priority.  For instance,
-        the English tokenization rules map "ain't" to ["are", "not"].
-
-        Args:
-            token_rules (list): A list of (chunk, tokens) pairs, where chunk is
-                a string and tokens is a list of strings.
+    def _load_special_tokenization(self, object rules):
+        '''Add a special-case tokenization rule.
         '''
+        cdef int i
+        cdef unicode chunk
+        cdef list substrings
+        cdef unicode form
+        cdef unicode lemma
+        cdef dict props
         cdef Lexeme** lexemes
         cdef hash_t hashed
        cdef UniStr string
-        for uni_string, substrings in token_rules:
-            lexemes = <Lexeme**>self.mem.alloc(len(substrings) + 1, sizeof(Lexeme*))
-            for i, substring in enumerate(substrings):
-                slice_unicode(&string, substring, 0, len(substring))
-                lexemes[i] = <Lexeme*>self.lexicon.get(self.lexicon.mem, &string)
-            lexemes[i + 1] = NULL
-            slice_unicode(&string, uni_string, 0, len(uni_string))
-            self._specials.set(string.key, lexemes)
-            self._cache.set(string.key, lexemes)
+        for chunk, substrings in sorted(rules.items()):
+            tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
+            for i, props in enumerate(substrings):
+                form = props['F']
+                lemma = props.get("L", None)
+                slice_unicode(&string, form, 0, len(form))
+                tokens[i].lex = <Lexeme*>self.lexicon.get(self.lexicon.mem, &string)
+                if lemma:
+                    tokens[i].lemma = self.lexicon.strings[lemma]
+                set_morph_from_dict(&tokens[i].morph, props)
+            # Null-terminated array
+            tokens[i+1].lex = NULL
+            slice_unicode(&string, chunk, 0, len(chunk))
+            self._specials.set(string.key, tokens)
+
+
+cdef int set_morph_from_dict(Morphology* morph, dict props) except -1:
+    morph.number = props.get('number', 0)
+    morph.tenspect = props.get('tenspect', 0)
+    morph.mood = props.get('mood', 0)
+    morph.gender = props.get('gender', 0)
+    morph.person = props.get('person', 0)
+    morph.case = props.get('case', 0)
+    morph.misc = props.get('misc', 0)
 
 
 cdef class Lexicon:
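The new _try_cache method centralises the lookup that tokenize() and _attach_tokens() previously did inline: the _specials table is consulted first and its pre-analysed tokens (with pos, morph, lemma and sense already attached) are copied onto the output; otherwise the plain lexeme cache is used; only on a miss does the caller fall back to full affix splitting. A rough Python sketch of that control flow, with dicts standing in for the two C hash tables:

    def try_cache(key, idx, specials, cache, tokens):
        # Sketch only: dicts stand in for Language._specials and Language._cache.
        if key in specials:
            for tok in specials[key]:       # pre-analysed special-case tokens
                tokens.append(dict(tok, idx=idx))
            return True
        if key in cache:
            for lex in cache[key]:          # bare lexemes, no analysis attached
                tokens.append({'lex': lex, 'idx': idx})
            return True
        return False                        # caller runs the full tokenizer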
spacy/tokens.pxd

@@ -21,7 +21,6 @@ cdef struct Morphology:
     uint8_t misc
 
 
-
 cdef struct TokenC:
     const Lexeme* lex
     Morphology morph
spacy/util.py

@@ -13,7 +13,8 @@ def utf8open(loc, mode='r'):
 
 def read_lang_data(name):
     data_dir = path.join(DATA_DIR, name)
-    tokenization = read_tokenization(name)
+    with open(path.join(data_dir, 'specials.json')) as file_:
+        tokenization = ujson.load(file_)
     prefix = read_prefix(data_dir)
     suffix = read_suffix(data_dir)
     infix = read_infix(data_dir)
@@ -26,12 +27,17 @@ def read_prefix(data_dir):
         expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
     return expression
 
+
 def read_suffix(data_dir):
-    with  utf8open(path.join(data_dir, 'suffix')) as file_:
+    with utf8open(path.join(data_dir, 'suffix')) as file_:
         entries = file_.read().split('\n')
         expression = '|'.join([re.escape(piece) + '$' for piece in entries if piece.strip()])
+    # TODO: Fix this hack!
+    expression += r'|(?<=[a-z0-9])\.$'
+    expression += r'|(?<=[0-9])km$'
     return expression
 
+
 def read_infix(data_dir):
     with utf8open(path.join(data_dir, 'infix')) as file_:
         entries = file_.read().split('\n')
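The two patterns appended in read_suffix() above handle strings like "10km." (see test_cnts6 below): a full stop after a letter or digit, and a trailing "km" after a digit, are now always recognised as suffixes even if they are missing from the suffix data file. A quick illustration with Python's re module; the patterns are copied from the diff and the rest of the suffix list is omitted:

    import re

    suffix_re = re.compile(r'(?<=[a-z0-9])\.$' + '|' + r'(?<=[0-9])km$')

    print(suffix_re.search(u'10km.').group())   # '.'  -> '10km' + '.'
    print(suffix_re.search(u'10km').group())    # 'km' -> '10' + 'km'
    print(suffix_re.search(u'etc.').group())    # '.'  -> 'etc' + '.'
    # Applied repeatedly while splitting affixes, '10km.' ends up as '10', 'km', '.'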
@@ -20,15 +20,18 @@ def test_apostrophe():
 def test_LL():
     tokens = EN.tokenize("we'll")
     assert len(tokens) == 2
-    assert tokens[1].string == "will"
+    assert tokens[1].string == "'ll"
+    assert tokens[1].lemma == "will"
     assert tokens[0].string == "we"
 
 
 def test_aint():
     tokens = EN.tokenize("ain't")
     assert len(tokens) == 2
-    assert tokens[0].string == "are"
-    assert tokens[1].string == "not"
+    assert tokens[0].string == "ai"
+    assert tokens[0].lemma == "be"
+    assert tokens[1].string == "n't"
+    assert tokens[1].lemma == "not"
 
 
 def test_capitalized():
@@ -38,7 +41,8 @@ def test_capitalized():
     assert len(tokens) == 2
     tokens = EN.tokenize("Ain't")
     assert len(tokens) == 2
-    assert tokens[0].string == "Are"
+    assert tokens[0].string == "Ai"
+    assert tokens[0].lemma == "be"
 
 
 def test_punct():
@@ -34,7 +34,7 @@ def test_digits():
 def test_contraction():
     tokens = EN.tokenize("don't giggle")
     assert len(tokens) == 3
-    assert tokens[1].sic == EN.lexicon["not"]['sic']
+    assert tokens[1].sic == EN.lexicon["n't"]['sic']
     tokens = EN.tokenize("i said don't!")
     assert len(tokens) == 5
     assert tokens[4].sic == EN.lexicon['!']['sic']
@@ -71,30 +71,39 @@ def test_cnts1():
     tokens = EN.tokenize(text)
     assert len(tokens) == 8
 
+
 def test_cnts2():
     text = u"""U.N. regulations are not a part of their concern."""
     tokens = EN.tokenize(text)
     assert len(tokens) == 10
 
+
 def test_cnts3():
     text = u"“Isn't it?”"
     tokens = EN.tokenize(text)
-    assert len(tokens) == 6
+    words = [t.string for t in tokens]
+    assert len(words) == 6
+
 
 def test_cnts4():
     text = u"""Yes! "I'd rather have a walk", Ms. Comble sighed. """
     tokens = EN.tokenize(text)
-    assert len(tokens) == 15
+    words = [t.string for t in tokens]
+    assert len(words) == 15
+
 
 def test_cnts5():
     text = """'Me too!', Mr. P. Delaware cried. """
     tokens = EN.tokenize(text)
     assert len(tokens) == 11
 
+
 def test_cnts6():
     text = u'They ran about 10km.'
     tokens = EN.tokenize(text)
-    assert len(tokens) == 6
+    words = [t.string for t in tokens]
+    assert len(words) == 6
+
 
 #def test_cnts7():
 #    text = 'But then the 6,000-year ice age came...'