Move tokenizer data for German into spacy.de.language_data

Commit: 7db956133e
Parent: 95aaea0d3f
Repository: https://github.com/explosion/spaCy.git
spacy/de/__init__.py
@@ -3,22 +3,22 @@ from __future__ import unicode_literals, print_function
 from os import path
 
 from ..language import Language
-from ..vocab import Vocab
-from ..attrs import LANG
+from . import language_data
 
 
 class German(Language):
     lang = 'de'
 
     class Defaults(Language.Defaults):
-        def Vocab(self, vectors=None, lex_attr_getters=None):
-            if lex_attr_getters is None:
-                lex_attr_getters = dict(self.lex_attr_getters)
-            if vectors is None:
-                vectors = self.Vectors()
-            # set a dummy lemmatizer for now that simply returns the same string
-            # until the morphology is done for German
-            return Vocab.load(self.path, lex_attr_getters=lex_attr_getters, vectors=vectors,
-                              lemmatizer=False)
-
-        stop_words = set()
+        tokenizer_exceptions = dict(language_data.TOKENIZER_EXCEPTIONS)
+
+        prefixes = tuple(language_data.TOKENIZER_PREFIXES)
+
+        suffixes = tuple(language_data.TOKENIZER_SUFFIXES)
+
+        infixes = tuple(language_data.TOKENIZER_INFIXES)
+
+        tag_map = dict(language_data.TAG_MAP)
+
+        stop_words = set(language_data.STOP_WORDS)
spacy/en/__init__.py
@@ -14,35 +14,6 @@ class English(Language):
     lang = 'en'
 
     class Defaults(Language.Defaults):
-        def Vocab(self, lex_attr_getters=True, tag_map=True,
-                  lemmatizer=True, serializer_freqs=True, vectors=True):
-            if lex_attr_getters is True:
-                lex_attr_getters = self.lex_attr_getters
-            if tag_map is True:
-                tag_map = self.tag_map
-            if lemmatizer is True:
-                lemmatizer = self.Lemmatizer()
-            return Vocab.load(self.path, lex_attr_getters=lex_attr_getters,
-                              tag_map=tag_map, lemmatizer=lemmatizer,
-                              serializer_freqs=serializer_freqs)
-
-        def Tokenizer(self, vocab, rules=None, prefix_search=None, suffix_search=None,
-                infix_finditer=None):
-            if rules is None:
-                rules = self.tokenizer_exceptions
-            if prefix_search is None:
-                prefix_search  = util.compile_prefix_regex(self.prefixes).search
-            if suffix_search is None:
-                suffix_search  = util.compile_suffix_regex(self.suffixes).search
-            if infix_finditer is None:
-                infix_finditer = util.compile_infix_regex(self.infixes).finditer
-            return Tokenizer(vocab, rules=rules,
-                    prefix_search=prefix_search, suffix_search=suffix_search,
-                    infix_finditer=infix_finditer)
-
-        def Lemmatizer(self):
-            return Lemmatizer.load(self.path)
-
         lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
 
         tokenizer_exceptions = dict(language_data.TOKENIZER_EXCEPTIONS)
spacy/language.py
@@ -26,6 +26,7 @@ from . import orth
 from .syntax.ner import BiluoPushDown
 from .syntax.arc_eager import ArcEager
 from . import util
+from .lemmatizer import Lemmatizer
 
 from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD, PROB, LANG, IS_STOP
 
@@ -42,18 +43,39 @@ class BaseDefaults(object):
         self.lex_attr_getters[LANG] = lambda string: lang
         self.lex_attr_getters[IS_STOP] = lambda string: string in self.stop_words
 
+    def Lemmatizer(self):
+        return Lemmatizer.load(self.path)
+
     def Vectors(self):
         return True
 
-    def Vocab(self, vectors=None, lex_attr_getters=None):
-        if lex_attr_getters is None:
-            lex_attr_getters = dict(self.lex_attr_getters)
-        if vectors is None:
+    def Vocab(self, lex_attr_getters=True, tag_map=True,
+              lemmatizer=True, serializer_freqs=True, vectors=True):
+        if lex_attr_getters is True:
+            lex_attr_getters = self.lex_attr_getters
+        if tag_map is True:
+            tag_map = self.tag_map
+        if lemmatizer is True:
+            lemmatizer = self.Lemmatizer()
+        if vectors is True:
             vectors = self.Vectors()
-        return Vocab.load(self.path, lex_attr_getters=self.lex_attr_getters, vectors=vectors)
+        return Vocab.load(self.path, lex_attr_getters=lex_attr_getters,
+                          tag_map=tag_map, lemmatizer=lemmatizer,
+                          serializer_freqs=serializer_freqs)
 
-    def Tokenizer(self, vocab):
-        return Tokenizer.load(self.path, vocab)
+    def Tokenizer(self, vocab, rules=None, prefix_search=None, suffix_search=None,
+                  infix_finditer=None):
+        if rules is None:
+            rules = self.tokenizer_exceptions
+        if prefix_search is None:
+            prefix_search  = util.compile_prefix_regex(self.prefixes).search
+        if suffix_search is None:
+            suffix_search  = util.compile_suffix_regex(self.suffixes).search
+        if infix_finditer is None:
+            infix_finditer = util.compile_infix_regex(self.infixes).finditer
+        return Tokenizer(vocab, rules=rules,
+                prefix_search=prefix_search, suffix_search=suffix_search,
+                infix_finditer=infix_finditer)
 
     def Tagger(self, vocab):
         return Tagger.load(self.path / 'pos', vocab)
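With this refactor, BaseDefaults.Tokenizer becomes the one place where the class-level prefix, suffix and infix patterns are compiled into the search/finditer callables the Tokenizer expects, so language subclasses like German.Defaults only declare data. As a rough illustration of what util.compile_prefix_regex is assumed to do (a standalone sketch with made-up patterns, not spaCy's actual helper):

    import re

    def compile_prefix_regex(entries):
        # anchor each pattern at the start of the string and OR them together
        return re.compile('|'.join('^' + piece for piece in entries))

    # illustrative prefix pieces, not the German data from language_data
    prefix_search = compile_prefix_regex(['\\(', '"', '§']).search
    match = prefix_search('"Hallo')
    print(match.group(0) if match is not None else None)   # prints: "

util.compile_suffix_regex and util.compile_infix_regex would presumably follow the same idea with end-of-string anchoring and unanchored matching respectively.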