mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-01 00:17:44 +03:00 
			
		
		
		
	Merge remote-tracking branch 'origin/organize-language-data' into organize-language-data
This commit is contained in:
		
						commit
						b11d8cd3db
					
				|  | @ -21,3 +21,4 @@ class English(Language): | ||||||
|         tokenizer_exceptions = TOKENIZER_EXCEPTIONS |         tokenizer_exceptions = TOKENIZER_EXCEPTIONS | ||||||
|         tag_map = TAG_MAP |         tag_map = TAG_MAP | ||||||
|         stop_words = STOP_WORDS |         stop_words = STOP_WORDS | ||||||
|  |         lemma_rules = LEMMA_RULES | ||||||
|  |  | ||||||
|  | @ -39,7 +39,7 @@ class BaseDefaults(object): | ||||||
|         if nlp is None or nlp.path is None: |         if nlp is None or nlp.path is None: | ||||||
|             return Lemmatizer({}, {}, {}) |             return Lemmatizer({}, {}, {}) | ||||||
|         else: |         else: | ||||||
|             return Lemmatizer.load(nlp.path) |             return Lemmatizer.load(nlp.path, rules=self.lemma_rules) | ||||||
| 
 | 
 | ||||||
|     @classmethod |     @classmethod | ||||||
|     def create_vocab(cls, nlp=None): |     def create_vocab(cls, nlp=None): | ||||||
|  | @ -160,6 +160,8 @@ class BaseDefaults(object): | ||||||
| 
 | 
 | ||||||
|     stop_words = set() |     stop_words = set() | ||||||
| 
 | 
 | ||||||
|  |     lemma_rules = {} | ||||||
|  | 
 | ||||||
|     lex_attr_getters = { |     lex_attr_getters = { | ||||||
|         attrs.LOWER: lambda string: string.lower(), |         attrs.LOWER: lambda string: string.lower(), | ||||||
|         attrs.NORM: lambda string: string, |         attrs.NORM: lambda string: string, | ||||||
|  |  | ||||||
|  | @ -9,7 +9,7 @@ from .symbols import POS, NOUN, VERB, ADJ, PUNCT | ||||||
| 
 | 
 | ||||||
| class Lemmatizer(object): | class Lemmatizer(object): | ||||||
|     @classmethod |     @classmethod | ||||||
|     def load(cls, path): |     def load(cls, path, rules=None): | ||||||
|         index = {} |         index = {} | ||||||
|         exc = {} |         exc = {} | ||||||
|         for pos in ['adj', 'noun', 'verb']: |         for pos in ['adj', 'noun', 'verb']: | ||||||
|  | @ -25,8 +25,11 @@ class Lemmatizer(object): | ||||||
|                     exc[pos] = read_exc(file_) |                     exc[pos] = read_exc(file_) | ||||||
|             else: |             else: | ||||||
|                 exc[pos] = {} |                 exc[pos] = {} | ||||||
|         with (path / 'vocab' / 'lemma_rules.json').open('r', encoding='utf8') as file_: |         if rules is None and (path / 'vocab' / 'lemma_rules.json').exists(): | ||||||
|             rules = json.load(file_) |             with (path / 'vocab' / 'lemma_rules.json').open('r', encoding='utf8') as file_: | ||||||
|  |                 rules = json.load(file_) | ||||||
|  |         elif rules is None: | ||||||
|  |             rules = {} | ||||||
|         return cls(index, exc, rules) |         return cls(index, exc, rules) | ||||||
| 
 | 
 | ||||||
|     def __init__(self, index, exceptions, rules): |     def __init__(self, index, exceptions, rules): | ||||||
|  |  | ||||||
|  | @ -140,6 +140,7 @@ cdef class Morphology: | ||||||
|         lemma = self.strings[lemma_string] |         lemma = self.strings[lemma_string] | ||||||
|         return lemma |         return lemma | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
| IDS = { | IDS = { | ||||||
|     "Animacy_anim": Animacy_anim, |     "Animacy_anim": Animacy_anim, | ||||||
|     "Animacy_inam": Animacy_inam, |     "Animacy_inam": Animacy_inam, | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user