Mirror of https://github.com/explosion/spaCy.git (synced 2025-10-31 07:57:35 +03:00)

Commit 468ca6c760: Merge branch 'develop' of https://github.com/explosion/spaCy into develop
@@ -3,7 +3,7 @@ pathlib
 numpy>=1.7
 cymem>=1.30,<1.32
 preshed>=1.0.0,<2.0.0
-thinc>=6.7.1,<6.8.0
+thinc>=6.7.2,<6.8.0
 murmurhash>=0.28,<0.29
 plac<1.0.0,>=0.9.6
 six

setup.py

@@ -191,7 +191,7 @@ def setup_package():
                 'murmurhash>=0.28,<0.29',
                 'cymem>=1.30,<1.32',
                 'preshed>=1.0.0,<2.0.0',
-                'thinc>=6.7.1,<6.8.0',
+                'thinc>=6.7.2,<6.8.0',
                 'plac<1.0.0,>=0.9.6',
                 'pip>=9.0.0,<10.0.0',
                 'six',

@@ -28,15 +28,17 @@ from .. import displacy
     n_iter=("number of iterations", "option", "n", int),
     n_sents=("number of sentences", "option", "ns", int),
     use_gpu=("Use GPU", "flag", "G", bool),
+    resume=("Whether to resume training", "flag", "R", bool),
     no_tagger=("Don't train tagger", "flag", "T", bool),
     no_parser=("Don't train parser", "flag", "P", bool),
     no_entities=("Don't train NER", "flag", "N", bool)
 )
 def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
-          use_gpu=False, no_tagger=False, no_parser=False, no_entities=False):
+          use_gpu=False, resume=False, no_tagger=False, no_parser=False, no_entities=False):
     """
     Train a model. Expects data in spaCy's JSON format.
     """
+    util.set_env_log(True)
     n_sents = n_sents or None
     output_path = util.ensure_path(output_dir)
     train_path = util.ensure_path(train_data)
@@ -66,6 +68,10 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
                                    util.env_opt('batch_to', 64),
                                    util.env_opt('batch_compound', 1.001))

-    nlp = lang_class(pipeline=pipeline)
+    if resume:
+        prints(output_path / 'model19.pickle', title="Resuming training")
+        nlp = dill.load((output_path / 'model19.pickle').open('rb'))
+    else:
+        nlp = lang_class(pipeline=pipeline)
     corpus = GoldCorpus(train_path, dev_path, limit=n_sents)
     n_train_docs = corpus.count_train()
@@ -75,6 +81,8 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
     print("Itn.\tLoss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %")
     try:
         for i in range(n_iter):
+            if resume:
+                i += 20
             with tqdm.tqdm(total=corpus.count_train(), leave=False) as pbar:
                 train_docs = corpus.train_docs(nlp, projectivize=True,
                                                gold_preproc=False, max_length=0)
@@ -86,14 +94,18 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
                     pbar.update(len(docs))

             with nlp.use_params(optimizer.averages):
+                util.set_env_log(False)
+                epoch_model_path = output_path / ('model%d' % i)
+                nlp.to_disk(epoch_model_path)
                 with (output_path / ('model%d.pickle' % i)).open('wb') as file_:
                     dill.dump(nlp, file_, -1)
-                with (output_path / ('model%d.bin' % i)).open('wb') as file_:
-                    file_.write(nlp.to_bytes())
-                with (output_path / ('model%d.bin' % i)).open('rb') as file_:
-                    nlp_loaded = lang_class(pipeline=pipeline)
-                    nlp_loaded.from_bytes(file_.read())
-                scorer = nlp_loaded.evaluate(corpus.dev_docs(nlp_loaded, gold_preproc=False))
+                nlp_loaded = lang_class(pipeline=pipeline)
+                nlp_loaded = nlp_loaded.from_disk(epoch_model_path)
+                scorer = nlp_loaded.evaluate(
+                            corpus.dev_docs(
+                                nlp_loaded,
+                                gold_preproc=False))
+                util.set_env_log(True)
             print_progress(i, losses, scorer.scores)
     finally:
         print("Saving model...")
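The resume branch above reloads a pickled epoch and then numbers further epochs from 20. A rough sketch of what that amounts to outside the CLI (not part of the diff; model19.pickle and the offset of 20 are hard-coded in this version, and the output directory name here is made up):

    # Sketch: load the pickled 20th epoch the same way the resume branch does.
    import dill
    from pathlib import Path

    output_path = Path('training-output')        # assumed directory, not from the diff
    with (output_path / 'model19.pickle').open('rb') as file_:
        nlp = dill.load(file_)                   # restores the partially trained pipeline
    # spacy train ... -R then continues, adding 20 to the epoch counter (i += 20)
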
@@ -56,7 +56,12 @@ def serve(docs, style='dep', page=True, minify=False, options={}, manual=False,
     render(docs, style=style, page=page, minify=minify, options=options, manual=manual)
     httpd = simple_server.make_server('0.0.0.0', port, app)
     prints("Using the '%s' visualizer" % style, title="Serving on port %d..." % port)
-    httpd.serve_forever()
+    try:
+        httpd.serve_forever()
+    except KeyboardInterrupt:
+        prints("Shutting down server on port %d." % port)
+    finally:
+        httpd.server_close()


 def app(environ, start_response):
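The serve() change wraps the blocking loop so that Ctrl-C shuts the HTTP server down instead of leaving the socket bound. A minimal usage sketch (assumes an already loaded pipeline object named nlp; the sentence is only illustrative):

    # Sketch: serve a dependency visualization, then stop it with Ctrl-C.
    from spacy import displacy

    doc = nlp(u"This is a sentence.")
    displacy.serve(doc, style='dep', port=5000)
    # KeyboardInterrupt is caught, a message is printed, and httpd.server_close() runs.
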
@@ -65,12 +70,13 @@ def app(environ, start_response):
     return [res]


-def parse_deps(doc, options={}):
+def parse_deps(orig_doc, options={}):
     """Generate dependency parse in {'words': [], 'arcs': []} format.

     doc (Doc): Document do parse.
     RETURNS (dict): Generated dependency parse keyed by words and arcs.
     """
+    doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes())
     if options.get('collapse_punct', True):
         spans = []
         for word in doc[:-1]:
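parse_deps() now works on a byte-level copy of the input Doc, so collapsing punctuation for display no longer mutates the caller's document. A sketch of the behaviour this is meant to guarantee (not from the diff; assumes a parsed doc from some loaded pipeline):

    # Sketch: rendering should leave the original Doc's token count unchanged.
    from spacy import displacy

    doc = nlp(u"Hello, world.")
    n_tokens = len(doc)
    svg = displacy.render(doc, style='dep')
    assert len(doc) == n_tokens    # punctuation is merged only inside the internal copy
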
@@ -18,12 +18,11 @@ class DependencyRenderer(object):
                         offset_x, color, bg, font)
         """
         self.compact = options.get('compact', False)
-        distance, arrow_width = (85, 8) if self.compact else (175, 10)
         self.word_spacing = options.get('word_spacing', 45)
-        self.arrow_spacing = options.get('arrow_spacing', 20)
-        self.arrow_width = options.get('arrow_width', arrow_width)
+        self.arrow_spacing = options.get('arrow_spacing', 12 if self.compact else 20)
+        self.arrow_width = options.get('arrow_width', 6 if self.compact else 10)
         self.arrow_stroke = options.get('arrow_stroke', 2)
-        self.distance = options.get('distance', distance)
+        self.distance = options.get('distance', 150 if self.compact else 175)
         self.offset_x = options.get('offset_x', 50)
         self.color = options.get('color', '#000000')
         self.bg = options.get('bg', '#ffffff')
@@ -99,6 +98,8 @@ class DependencyRenderer(object):
         x_end = (self.offset_x+(end-start)*self.distance+start*self.distance
                  -self.arrow_spacing*(self.highest_level-level)/4)
         y_curve = self.offset_y-level*self.distance/2
+        if self.compact:
+            y_curve = self.offset_y-level*self.distance/6
         if y_curve == 0 and len(self.levels) > 5:
             y_curve = -self.distance
         arrowhead = self.get_arrowhead(direction, x_start, y, x_end)
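The compact-mode constants move from one hard-coded tuple into per-option defaults, so each value can still be overridden individually. A usage sketch (the option keys are the ones read above; the distance override is just an example value, and doc is assumed to be an already parsed Doc):

    # Sketch: compact rendering with one default overridden.
    from spacy import displacy

    html = displacy.render(doc, style='dep',
                           options={'compact': True, 'distance': 120})
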
@@ -21,7 +21,7 @@ TPL_DEP_WORDS = """
 TPL_DEP_ARCS = """
 <g class="displacy-arrow">
     <path class="displacy-arc" id="arrow-{id}-{i}" stroke-width="{stroke}px" d="{arc}" fill="none" stroke="currentColor"/>
-    <text dy="1.25em" style="font-size: 0.8em">
+    <text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px">
         <textPath xlink:href="#arrow-{id}-{i}" class="displacy-label" startOffset="50%" fill="currentColor" text-anchor="middle">{label}</textPath>
     </text>
     <path class="displacy-arrowhead" d="{head}" fill="currentColor"/>

@@ -212,7 +212,7 @@ class GoldCorpus(object):

     def dev_docs(self, nlp, gold_preproc=False):
         gold_docs = self.iter_gold_docs(nlp, self.dev_tuples, gold_preproc)
-        gold_docs = nlp.preprocess_gold(gold_docs)
+        #gold_docs = nlp.preprocess_gold(gold_docs)
         yield from gold_docs

     @classmethod
@@ -227,7 +227,7 @@ class GoldCorpus(object):
                                   gold_preproc)
             golds = cls._make_golds(docs, paragraph_tuples)
             for doc, gold in zip(docs, golds):
-                if not max_length or len(doc) < max_length:
+                if (not max_length) or len(doc) < max_length:
                     yield doc, gold

     @classmethod
@@ -235,17 +235,17 @@ class GoldCorpus(object):
         if raw_text is not None:
             return [nlp.make_doc(raw_text)]
         else:
-            return [Doc(nlp.vocab, words=sent_tuples[0][1])
-                for sent_tuples in paragraph_tuples]
+            return [Doc(nlp.vocab, words=sent_tuples[1])
+                for (sent_tuples, brackets) in paragraph_tuples]

     @classmethod
     def _make_golds(cls, docs, paragraph_tuples):
+        assert len(docs) == len(paragraph_tuples)
         if len(docs) == 1:
-            return [GoldParse.from_annot_tuples(docs[0], sent_tuples[0])
-                    for sent_tuples in paragraph_tuples]
+            return [GoldParse.from_annot_tuples(docs[0], paragraph_tuples[0][0])]
         else:
-            return [GoldParse.from_annot_tuples(doc, sent_tuples[0])
-                    for doc, sent_tuples in zip(docs, paragraph_tuples)]
+            return [GoldParse.from_annot_tuples(doc, sent_tuples)
+                    for doc, (sent_tuples, brackets) in zip(docs, paragraph_tuples)]

     @staticmethod
     def walk_corpus(path):
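These GoldCorpus changes treat each per-sentence entry as carrying its bracket annotations next to the token annotations, which is why the comprehensions now unpack (sent_tuples, brackets) and read the word list from index 1. A sketch of the assumed layout (an assumption for illustration, not quoted from the diff):

    # Assumed shape of the training tuples that _make_docs/_make_golds unpack:
    #   paragraph_tuples = [(annot_tuples, brackets), ...]
    #   annot_tuples     = (ids, words, tags, heads, dep_labels, entities)
    annot_tuples, brackets = paragraph_tuples[0]
    words = annot_tuples[1]    # hence Doc(nlp.vocab, words=sent_tuples[1])
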
@@ -2,21 +2,25 @@
 from __future__ import unicode_literals

 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+from .norm_exceptions import NORM_EXCEPTIONS
 from .tag_map import TAG_MAP
 from .stop_words import STOP_WORDS
 from .lemmatizer import LOOKUP
 from .syntax_iterators import SYNTAX_ITERATORS

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ..norm_exceptions import BASE_NORMS
 from ...language import Language
 from ...lemmatizerlookup import Lemmatizer
-from ...attrs import LANG
-from ...util import update_exc
+from ...attrs import LANG, NORM
+from ...util import update_exc, add_lookups


 class GermanDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'de'
+    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM],
+                                         BASE_NORMS, NORM_EXCEPTIONS)

     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     tag_map = dict(TAG_MAP)

spacy/lang/de/norm_exceptions.py (new file, 17 lines)

@@ -0,0 +1,17 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+# Here we only want to include the absolute most common words. Otherwise,
+# this list would get impossibly long for German – especially considering the
+# old vs. new spelling rules, and all possible cases.
+
+
+_exc = {
+    "daß": "dass"
+}
+
+
+NORM_EXCEPTIONS = {}
+
+for string, norm in _exc.items():
+    NORM_EXCEPTIONS[string.title()] = norm
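GermanDefaults now chains the language-specific NORM_EXCEPTIONS and the shared BASE_NORMS in front of the default NORM getter via add_lookups above. A small sketch of how the composed lookup resolves (not part of the diff; note that the loop above registers only the title-cased key):

    # Sketch: querying the stacked NORM getter that GermanDefaults builds.
    from spacy.attrs import NORM
    from spacy.lang.de import German

    norm = German.Defaults.lex_attr_getters[NORM]
    print(norm(u"Daß"))     # 'dass', found in NORM_EXCEPTIONS
    print(norm(u"Haus"))    # no exception entry, falls back to the default getter
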
@@ -8,7 +8,7 @@ from ...deprecated import PRON_LEMMA
 _exc = {
     "auf'm": [
         {ORTH: "auf", LEMMA: "auf"},
-        {ORTH: "'m", LEMMA: "der", NORM: "dem" }],
+        {ORTH: "'m", LEMMA: "der", NORM: "dem"}],

     "du's": [
         {ORTH: "du", LEMMA: PRON_LEMMA, TAG: "PPER"},
|  | @ -53,97 +53,97 @@ _exc = { | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| for exc_data in [ | for exc_data in [ | ||||||
|     {ORTH: "'S", LEMMA: PRON_LEMMA, TAG: "PPER"}, |     {ORTH: "'S", LEMMA: PRON_LEMMA, NORM: "'s", TAG: "PPER"}, | ||||||
|     {ORTH: "'s", LEMMA: PRON_LEMMA, TAG: "PPER"}, |     {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "'s", TAG: "PPER"}, | ||||||
|     {ORTH: "S'", LEMMA: PRON_LEMMA, TAG: "PPER"}, |     {ORTH: "S'", LEMMA: PRON_LEMMA, NORM: "'s", TAG: "PPER"}, | ||||||
|     {ORTH: "s'", LEMMA: PRON_LEMMA, TAG: "PPER"}, |     {ORTH: "s'", LEMMA: PRON_LEMMA, NORM: "'s", TAG: "PPER"}, | ||||||
|     {ORTH: "'n", LEMMA: "ein", NORM: "ein"}, |     {ORTH: "'n", LEMMA: "ein", NORM: "ein"}, | ||||||
|     {ORTH: "'ne", LEMMA: "eine", NORM: "eine"}, |     {ORTH: "'ne", LEMMA: "eine", NORM: "eine"}, | ||||||
|     {ORTH: "'nen", LEMMA: "ein", NORM: "einen"}, |     {ORTH: "'nen", LEMMA: "ein", NORM: "einen"}, | ||||||
|     {ORTH: "'nem", LEMMA: "ein", NORM: "einem"}, |     {ORTH: "'nem", LEMMA: "ein", NORM: "einem"}, | ||||||
|     {ORTH: "Abb.", LEMMA: "Abbildung"}, |     {ORTH: "Abb.", LEMMA: "Abbildung", NORM: "Abbildung"}, | ||||||
|     {ORTH: "Abk.", LEMMA: "Abkürzung"}, |     {ORTH: "Abk.", LEMMA: "Abkürzung", NORM: "Abkürzung"}, | ||||||
|     {ORTH: "Abt.", LEMMA: "Abteilung"}, |     {ORTH: "Abt.", LEMMA: "Abteilung", NORM: "Abteilung"}, | ||||||
|     {ORTH: "Apr.", LEMMA: "April"}, |     {ORTH: "Apr.", LEMMA: "April", NORM: "April"}, | ||||||
|     {ORTH: "Aug.", LEMMA: "August"}, |     {ORTH: "Aug.", LEMMA: "August", NORM: "August"}, | ||||||
|     {ORTH: "Bd.", LEMMA: "Band"}, |     {ORTH: "Bd.", LEMMA: "Band", NORM: "Band"}, | ||||||
|     {ORTH: "Betr.", LEMMA: "Betreff"}, |     {ORTH: "Betr.", LEMMA: "Betreff", NORM: "Betreff"}, | ||||||
|     {ORTH: "Bf.", LEMMA: "Bahnhof"}, |     {ORTH: "Bf.", LEMMA: "Bahnhof", NORM: "Bahnhof"}, | ||||||
|     {ORTH: "Bhf.", LEMMA: "Bahnhof"}, |     {ORTH: "Bhf.", LEMMA: "Bahnhof", NORM: "Bahnhof"}, | ||||||
|     {ORTH: "Bsp.", LEMMA: "Beispiel"}, |     {ORTH: "Bsp.", LEMMA: "Beispiel", NORM: "Beispiel"}, | ||||||
|     {ORTH: "Dez.", LEMMA: "Dezember"}, |     {ORTH: "Dez.", LEMMA: "Dezember", NORM: "Dezember"}, | ||||||
|     {ORTH: "Di.", LEMMA: "Dienstag"}, |     {ORTH: "Di.", LEMMA: "Dienstag", NORM: "Dienstag"}, | ||||||
|     {ORTH: "Do.", LEMMA: "Donnerstag"}, |     {ORTH: "Do.", LEMMA: "Donnerstag", NORM: "Donnerstag"}, | ||||||
|     {ORTH: "Fa.", LEMMA: "Firma"}, |     {ORTH: "Fa.", LEMMA: "Firma", NORM: "Firma"}, | ||||||
|     {ORTH: "Fam.", LEMMA: "Familie"}, |     {ORTH: "Fam.", LEMMA: "Familie", NORM: "Familie"}, | ||||||
|     {ORTH: "Feb.", LEMMA: "Februar"}, |     {ORTH: "Feb.", LEMMA: "Februar", NORM: "Februar"}, | ||||||
|     {ORTH: "Fr.", LEMMA: "Frau"}, |     {ORTH: "Fr.", LEMMA: "Frau", NORM: "Frau"}, | ||||||
|     {ORTH: "Frl.", LEMMA: "Fräulein"}, |     {ORTH: "Frl.", LEMMA: "Fräulein", NORM: "Fräulein"}, | ||||||
|     {ORTH: "Hbf.", LEMMA: "Hauptbahnhof"}, |     {ORTH: "Hbf.", LEMMA: "Hauptbahnhof", NORM: "Hauptbahnhof"}, | ||||||
|     {ORTH: "Hr.", LEMMA: "Herr"}, |     {ORTH: "Hr.", LEMMA: "Herr", NORM: "Herr"}, | ||||||
|     {ORTH: "Hrn.", LEMMA: "Herr"}, |     {ORTH: "Hrn.", LEMMA: "Herr", NORM: "Herrn"}, | ||||||
|     {ORTH: "Jan.", LEMMA: "Januar"}, |     {ORTH: "Jan.", LEMMA: "Januar", NORM: "Januar"}, | ||||||
|     {ORTH: "Jh.", LEMMA: "Jahrhundert"}, |     {ORTH: "Jh.", LEMMA: "Jahrhundert", NORM: "Jahrhundert"}, | ||||||
|     {ORTH: "Jhd.", LEMMA: "Jahrhundert"}, |     {ORTH: "Jhd.", LEMMA: "Jahrhundert", NORM: "Jahrhundert"}, | ||||||
|     {ORTH: "Jul.", LEMMA: "Juli"}, |     {ORTH: "Jul.", LEMMA: "Juli", NORM: "Juli"}, | ||||||
|     {ORTH: "Jun.", LEMMA: "Juni"}, |     {ORTH: "Jun.", LEMMA: "Juni", NORM: "Juni"}, | ||||||
|     {ORTH: "Mi.", LEMMA: "Mittwoch"}, |     {ORTH: "Mi.", LEMMA: "Mittwoch", NORM: "Mittwoch"}, | ||||||
|     {ORTH: "Mio.", LEMMA: "Million"}, |     {ORTH: "Mio.", LEMMA: "Million", NORM: "Million"}, | ||||||
|     {ORTH: "Mo.", LEMMA: "Montag"}, |     {ORTH: "Mo.", LEMMA: "Montag", NORM: "Montag"}, | ||||||
|     {ORTH: "Mrd.", LEMMA: "Milliarde"}, |     {ORTH: "Mrd.", LEMMA: "Milliarde", NORM: "Milliarde"}, | ||||||
|     {ORTH: "Mrz.", LEMMA: "März"}, |     {ORTH: "Mrz.", LEMMA: "März", NORM: "März"}, | ||||||
|     {ORTH: "MwSt.", LEMMA: "Mehrwertsteuer"}, |     {ORTH: "MwSt.", LEMMA: "Mehrwertsteuer", NORM: "Mehrwertsteuer"}, | ||||||
|     {ORTH: "Mär.", LEMMA: "März"}, |     {ORTH: "Mär.", LEMMA: "März", NORM: "März"}, | ||||||
|     {ORTH: "Nov.", LEMMA: "November"}, |     {ORTH: "Nov.", LEMMA: "November", NORM: "November"}, | ||||||
|     {ORTH: "Nr.", LEMMA: "Nummer"}, |     {ORTH: "Nr.", LEMMA: "Nummer", NORM: "Nummer"}, | ||||||
|     {ORTH: "Okt.", LEMMA: "Oktober"}, |     {ORTH: "Okt.", LEMMA: "Oktober", NORM: "Oktober"}, | ||||||
|     {ORTH: "Orig.", LEMMA: "Original"}, |     {ORTH: "Orig.", LEMMA: "Original", NORM: "Original"}, | ||||||
|     {ORTH: "Pkt.", LEMMA: "Punkt"}, |     {ORTH: "Pkt.", LEMMA: "Punkt", NORM: "Punkt"}, | ||||||
|     {ORTH: "Prof.", LEMMA: "Professor"}, |     {ORTH: "Prof.", LEMMA: "Professor", NORM: "Professor"}, | ||||||
|     {ORTH: "Red.", LEMMA: "Redaktion"}, |     {ORTH: "Red.", LEMMA: "Redaktion", NORM: "Redaktion"}, | ||||||
|     {ORTH: "Sa.", LEMMA: "Samstag"}, |     {ORTH: "Sa.", LEMMA: "Samstag", NORM: "Samstag"}, | ||||||
|     {ORTH: "Sep.", LEMMA: "September"}, |     {ORTH: "Sep.", LEMMA: "September", NORM: "September"}, | ||||||
|     {ORTH: "Sept.", LEMMA: "September"}, |     {ORTH: "Sept.", LEMMA: "September", NORM: "September"}, | ||||||
|     {ORTH: "So.", LEMMA: "Sonntag"}, |     {ORTH: "So.", LEMMA: "Sonntag", NORM: "Sonntag"}, | ||||||
|     {ORTH: "Std.", LEMMA: "Stunde"}, |     {ORTH: "Std.", LEMMA: "Stunde", NORM: "Stunde"}, | ||||||
|     {ORTH: "Str.", LEMMA: "Straße"}, |     {ORTH: "Str.", LEMMA: "Straße", NORM: "Straße"}, | ||||||
|     {ORTH: "Tel.", LEMMA: "Telefon"}, |     {ORTH: "Tel.", LEMMA: "Telefon", NORM: "Telefon"}, | ||||||
|     {ORTH: "Tsd.", LEMMA: "Tausend"}, |     {ORTH: "Tsd.", LEMMA: "Tausend", NORM: "Tausend"}, | ||||||
|     {ORTH: "Univ.", LEMMA: "Universität"}, |     {ORTH: "Univ.", LEMMA: "Universität", NORM: "Universität"}, | ||||||
|     {ORTH: "abzgl.", LEMMA: "abzüglich"}, |     {ORTH: "abzgl.", LEMMA: "abzüglich", NORM: "abzüglich"}, | ||||||
|     {ORTH: "allg.", LEMMA: "allgemein"}, |     {ORTH: "allg.", LEMMA: "allgemein", NORM: "allgemein"}, | ||||||
|     {ORTH: "bspw.", LEMMA: "beispielsweise"}, |     {ORTH: "bspw.", LEMMA: "beispielsweise", NORM: "beispielsweise"}, | ||||||
|     {ORTH: "bzgl.", LEMMA: "bezüglich"}, |     {ORTH: "bzgl.", LEMMA: "bezüglich", NORM: "bezüglich"}, | ||||||
|     {ORTH: "bzw.", LEMMA: "beziehungsweise"}, |     {ORTH: "bzw.", LEMMA: "beziehungsweise", NORM: "beziehungsweise"}, | ||||||
|     {ORTH: "d.h.", LEMMA: "das heißt"}, |     {ORTH: "d.h.", LEMMA: "das heißt"}, | ||||||
|     {ORTH: "dgl.", LEMMA: "dergleichen"}, |     {ORTH: "dgl.", LEMMA: "dergleichen", NORM: "dergleichen"}, | ||||||
|     {ORTH: "ebd.", LEMMA: "ebenda"}, |     {ORTH: "ebd.", LEMMA: "ebenda", NORM: "ebenda"}, | ||||||
|     {ORTH: "eigtl.", LEMMA: "eigentlich"}, |     {ORTH: "eigtl.", LEMMA: "eigentlich", NORM: "eigentlich"}, | ||||||
|     {ORTH: "engl.", LEMMA: "englisch"}, |     {ORTH: "engl.", LEMMA: "englisch", NORM: "englisch"}, | ||||||
|     {ORTH: "evtl.", LEMMA: "eventuell"}, |     {ORTH: "evtl.", LEMMA: "eventuell", NORM: "eventuell"}, | ||||||
|     {ORTH: "frz.", LEMMA: "französisch"}, |     {ORTH: "frz.", LEMMA: "französisch", NORM: "französisch"}, | ||||||
|     {ORTH: "gegr.", LEMMA: "gegründet"}, |     {ORTH: "gegr.", LEMMA: "gegründet", NORM: "gegründet"}, | ||||||
|     {ORTH: "ggf.", LEMMA: "gegebenenfalls"}, |     {ORTH: "ggf.", LEMMA: "gegebenenfalls", NORM: "gegebenenfalls"}, | ||||||
|     {ORTH: "ggfs.", LEMMA: "gegebenenfalls"}, |     {ORTH: "ggfs.", LEMMA: "gegebenenfalls", NORM: "gegebenenfalls"}, | ||||||
|     {ORTH: "ggü.", LEMMA: "gegenüber"}, |     {ORTH: "ggü.", LEMMA: "gegenüber", NORM: "gegenüber"}, | ||||||
|     {ORTH: "i.O.", LEMMA: "in Ordnung"}, |     {ORTH: "i.O.", LEMMA: "in Ordnung"}, | ||||||
|     {ORTH: "i.d.R.", LEMMA: "in der Regel"}, |     {ORTH: "i.d.R.", LEMMA: "in der Regel"}, | ||||||
|     {ORTH: "incl.", LEMMA: "inklusive"}, |     {ORTH: "incl.", LEMMA: "inklusive", NORM: "inklusive"}, | ||||||
|     {ORTH: "inkl.", LEMMA: "inklusive"}, |     {ORTH: "inkl.", LEMMA: "inklusive", NORM: "inklusive"}, | ||||||
|     {ORTH: "insb.", LEMMA: "insbesondere"}, |     {ORTH: "insb.", LEMMA: "insbesondere", NORM: "insbesondere"}, | ||||||
|     {ORTH: "kath.", LEMMA: "katholisch"}, |     {ORTH: "kath.", LEMMA: "katholisch", NORM: "katholisch"}, | ||||||
|     {ORTH: "lt.", LEMMA: "laut"}, |     {ORTH: "lt.", LEMMA: "laut", NORM: "laut"}, | ||||||
|     {ORTH: "max.", LEMMA: "maximal"}, |     {ORTH: "max.", LEMMA: "maximal", NORM: "maximal"}, | ||||||
|     {ORTH: "min.", LEMMA: "minimal"}, |     {ORTH: "min.", LEMMA: "minimal", NORM: "minimal"}, | ||||||
|     {ORTH: "mind.", LEMMA: "mindestens"}, |     {ORTH: "mind.", LEMMA: "mindestens", NORM: "mindestens"}, | ||||||
|     {ORTH: "mtl.", LEMMA: "monatlich"}, |     {ORTH: "mtl.", LEMMA: "monatlich", NORM: "monatlich"}, | ||||||
|     {ORTH: "n.Chr.", LEMMA: "nach Christus"}, |     {ORTH: "n.Chr.", LEMMA: "nach Christus"}, | ||||||
|     {ORTH: "orig.", LEMMA: "original"}, |     {ORTH: "orig.", LEMMA: "original", NORM: "original"}, | ||||||
|     {ORTH: "röm.", LEMMA: "römisch"}, |     {ORTH: "röm.", LEMMA: "römisch", NORM: "römisch"}, | ||||||
|     {ORTH: "s.o.", LEMMA: "siehe oben"}, |     {ORTH: "s.o.", LEMMA: "siehe oben"}, | ||||||
|     {ORTH: "sog.", LEMMA: "so genannt"}, |     {ORTH: "sog.", LEMMA: "so genannt"}, | ||||||
|     {ORTH: "stellv.", LEMMA: "stellvertretend"}, |     {ORTH: "stellv.", LEMMA: "stellvertretend"}, | ||||||
|     {ORTH: "tägl.", LEMMA: "täglich"}, |     {ORTH: "tägl.", LEMMA: "täglich", NORM: "täglich"}, | ||||||
|     {ORTH: "u.U.", LEMMA: "unter Umständen"}, |     {ORTH: "u.U.", LEMMA: "unter Umständen"}, | ||||||
|     {ORTH: "u.s.w.", LEMMA: "und so weiter"}, |     {ORTH: "u.s.w.", LEMMA: "und so weiter"}, | ||||||
|     {ORTH: "u.v.m.", LEMMA: "und vieles mehr"}, |     {ORTH: "u.v.m.", LEMMA: "und vieles mehr"}, | ||||||
|  | @ -153,9 +153,9 @@ for exc_data in [ | ||||||
|     {ORTH: "v.Chr.", LEMMA: "vor Christus"}, |     {ORTH: "v.Chr.", LEMMA: "vor Christus"}, | ||||||
|     {ORTH: "v.a.", LEMMA: "vor allem"}, |     {ORTH: "v.a.", LEMMA: "vor allem"}, | ||||||
|     {ORTH: "v.l.n.r.", LEMMA: "von links nach rechts"}, |     {ORTH: "v.l.n.r.", LEMMA: "von links nach rechts"}, | ||||||
|     {ORTH: "vgl.", LEMMA: "vergleiche"}, |     {ORTH: "vgl.", LEMMA: "vergleiche", NORM: "vergleiche"}, | ||||||
|     {ORTH: "vllt.", LEMMA: "vielleicht"}, |     {ORTH: "vllt.", LEMMA: "vielleicht", NORM: "vielleicht"}, | ||||||
|     {ORTH: "vlt.", LEMMA: "vielleicht"}, |     {ORTH: "vlt.", LEMMA: "vielleicht", NORM: "vielleicht"}, | ||||||
|     {ORTH: "z.B.", LEMMA: "zum Beispiel"}, |     {ORTH: "z.B.", LEMMA: "zum Beispiel"}, | ||||||
|     {ORTH: "z.Bsp.", LEMMA: "zum Beispiel"}, |     {ORTH: "z.Bsp.", LEMMA: "zum Beispiel"}, | ||||||
|     {ORTH: "z.T.", LEMMA: "zum Teil"}, |     {ORTH: "z.T.", LEMMA: "zum Teil"}, | ||||||
|  | @ -163,7 +163,7 @@ for exc_data in [ | ||||||
|     {ORTH: "z.Zt.", LEMMA: "zur Zeit"}, |     {ORTH: "z.Zt.", LEMMA: "zur Zeit"}, | ||||||
|     {ORTH: "z.b.", LEMMA: "zum Beispiel"}, |     {ORTH: "z.b.", LEMMA: "zum Beispiel"}, | ||||||
|     {ORTH: "zzgl.", LEMMA: "zuzüglich"}, |     {ORTH: "zzgl.", LEMMA: "zuzüglich"}, | ||||||
|     {ORTH: "österr.", LEMMA: "österreichisch"}]: |     {ORTH: "österr.", LEMMA: "österreichisch", NORM: "österreichisch"}]: | ||||||
|     _exc[exc_data[ORTH]] = [dict(exc_data)] |     _exc[exc_data[ORTH]] = [dict(exc_data)] | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
@@ -2,6 +2,7 @@
 from __future__ import unicode_literals

 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+from .norm_exceptions import NORM_EXCEPTIONS
 from .tag_map import TAG_MAP
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
@@ -10,14 +11,17 @@ from .lemmatizer import LEMMA_RULES, LEMMA_INDEX, LEMMA_EXC
 from .syntax_iterators import SYNTAX_ITERATORS

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG
-from ...util import update_exc
+from ...attrs import LANG, NORM
+from ...util import update_exc, add_lookups


 class EnglishDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'en'
+    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM],
+                                         BASE_NORMS, NORM_EXCEPTIONS)
     lex_attr_getters.update(LEX_ATTRS)

     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)

spacy/lang/en/norm_exceptions.py (new file, 1760 lines)
File diff suppressed because it is too large.

							|  | @ -15,20 +15,20 @@ _exclude = ["Ill", "ill", "Its", "its", "Hell", "hell", "Shell", "shell", | ||||||
| for pron in ["i"]: | for pron in ["i"]: | ||||||
|     for orth in [pron, pron.title()]: |     for orth in [pron, pron.title()]: | ||||||
|         _exc[orth + "'m"] = [ |         _exc[orth + "'m"] = [ | ||||||
|             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, |             {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, | ||||||
|             {ORTH: "'m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1}] |             {ORTH: "'m", LEMMA: "be", NORM: "am", TAG: "VBP", "tenspect": 1, "number": 1}] | ||||||
| 
 | 
 | ||||||
|         _exc[orth + "m"] = [ |         _exc[orth + "m"] = [ | ||||||
|             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, |             {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, | ||||||
|             {ORTH: "m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1 }] |             {ORTH: "m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1 }] | ||||||
| 
 | 
 | ||||||
|         _exc[orth + "'ma"] = [ |         _exc[orth + "'ma"] = [ | ||||||
|             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, |             {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, | ||||||
|             {ORTH: "'m", LEMMA: "be", NORM: "am"}, |             {ORTH: "'m", LEMMA: "be", NORM: "am"}, | ||||||
|             {ORTH: "a", LEMMA: "going to", NORM: "gonna"}] |             {ORTH: "a", LEMMA: "going to", NORM: "gonna"}] | ||||||
| 
 | 
 | ||||||
|         _exc[orth + "ma"] = [ |         _exc[orth + "ma"] = [ | ||||||
|             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, |             {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, | ||||||
|             {ORTH: "m", LEMMA: "be", NORM: "am"}, |             {ORTH: "m", LEMMA: "be", NORM: "am"}, | ||||||
|             {ORTH: "a", LEMMA: "going to", NORM: "gonna"}] |             {ORTH: "a", LEMMA: "going to", NORM: "gonna"}] | ||||||
| 
 | 
 | ||||||
|  | @ -36,72 +36,72 @@ for pron in ["i"]: | ||||||
| for pron in ["i", "you", "he", "she", "it", "we", "they"]: | for pron in ["i", "you", "he", "she", "it", "we", "they"]: | ||||||
|     for orth in [pron, pron.title()]: |     for orth in [pron, pron.title()]: | ||||||
|         _exc[orth + "'ll"] = [ |         _exc[orth + "'ll"] = [ | ||||||
|             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, |             {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, | ||||||
|             {ORTH: "'ll", LEMMA: "will", TAG: "MD"}] |             {ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"}] | ||||||
| 
 | 
 | ||||||
|         _exc[orth + "ll"] = [ |         _exc[orth + "ll"] = [ | ||||||
|             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, |             {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, | ||||||
|             {ORTH: "ll", LEMMA: "will", TAG: "MD"}] |             {ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"}] | ||||||
| 
 | 
 | ||||||
|         _exc[orth + "'ll've"] = [ |         _exc[orth + "'ll've"] = [ | ||||||
|             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, |             {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, | ||||||
|             {ORTH: "'ll", LEMMA: "will", TAG: "MD"}, |             {ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"}, | ||||||
|             {ORTH: "'ve", LEMMA: "have", TAG: "VB"}] |             {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}] | ||||||
| 
 | 
 | ||||||
|         _exc[orth + "llve"] = [ |         _exc[orth + "llve"] = [ | ||||||
|             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, |             {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, | ||||||
|             {ORTH: "ll", LEMMA: "will", TAG: "MD"}, |             {ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"}, | ||||||
|             {ORTH: "ve", LEMMA: "have", TAG: "VB"}] |             {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}] | ||||||
| 
 | 
 | ||||||
|         _exc[orth + "'d"] = [ |         _exc[orth + "'d"] = [ | ||||||
|             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, |             {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, | ||||||
|             {ORTH: "'d", LEMMA: "would", TAG: "MD"}] |             {ORTH: "'d", LEMMA: "would", NORM: "would", TAG: "MD"}] | ||||||
| 
 | 
 | ||||||
|         _exc[orth + "d"] = [ |         _exc[orth + "d"] = [ | ||||||
|             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, |             {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, | ||||||
|             {ORTH: "d", LEMMA: "would", TAG: "MD"}] |             {ORTH: "d", LEMMA: "would", NORM: "would", TAG: "MD"}] | ||||||
| 
 | 
 | ||||||
|         _exc[orth + "'d've"] = [ |         _exc[orth + "'d've"] = [ | ||||||
|             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, |             {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, | ||||||
|             {ORTH: "'d", LEMMA: "would", TAG: "MD"}, |             {ORTH: "'d", LEMMA: "would", NORM: "would", TAG: "MD"}, | ||||||
|             {ORTH: "'ve", LEMMA: "have", TAG: "VB"}] |             {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}] | ||||||
| 
 | 
 | ||||||
|         _exc[orth + "dve"] = [ |         _exc[orth + "dve"] = [ | ||||||
|             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, |             {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, | ||||||
|             {ORTH: "d", LEMMA: "would", TAG: "MD"}, |             {ORTH: "d", LEMMA: "would", NORM: "would", TAG: "MD"}, | ||||||
|             {ORTH: "ve", LEMMA: "have", TAG: "VB"}] |             {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}] | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| for pron in ["i", "you", "we", "they"]: | for pron in ["i", "you", "we", "they"]: | ||||||
|     for orth in [pron, pron.title()]: |     for orth in [pron, pron.title()]: | ||||||
|         _exc[orth + "'ve"] = [ |         _exc[orth + "'ve"] = [ | ||||||
|             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, |             {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, | ||||||
|             {ORTH: "'ve", LEMMA: "have", TAG: "VB"}] |             {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}] | ||||||
| 
 | 
 | ||||||
|         _exc[orth + "ve"] = [ |         _exc[orth + "ve"] = [ | ||||||
|             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, |             {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, | ||||||
|             {ORTH: "ve", LEMMA: "have", TAG: "VB"}] |             {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}] | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| for pron in ["you", "we", "they"]: | for pron in ["you", "we", "they"]: | ||||||
|     for orth in [pron, pron.title()]: |     for orth in [pron, pron.title()]: | ||||||
|         _exc[orth + "'re"] = [ |         _exc[orth + "'re"] = [ | ||||||
|             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, |             {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, | ||||||
|             {ORTH: "'re", LEMMA: "be", NORM: "are"}] |             {ORTH: "'re", LEMMA: "be", NORM: "are"}] | ||||||
| 
 | 
 | ||||||
|         _exc[orth + "re"] = [ |         _exc[orth + "re"] = [ | ||||||
|             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, |             {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, | ||||||
|             {ORTH: "re", LEMMA: "be", NORM: "are", TAG: "VBZ"}] |             {ORTH: "re", LEMMA: "be", NORM: "are", TAG: "VBZ"}] | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| for pron in ["he", "she", "it"]: | for pron in ["he", "she", "it"]: | ||||||
|     for orth in [pron, pron.title()]: |     for orth in [pron, pron.title()]: | ||||||
|         _exc[orth + "'s"] = [ |         _exc[orth + "'s"] = [ | ||||||
|             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, |             {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, | ||||||
|             {ORTH: "'s"}] |             {ORTH: "'s", NORM: "'s"}] | ||||||
| 
 | 
 | ||||||
|         _exc[orth + "s"] = [ |         _exc[orth + "s"] = [ | ||||||
|             {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, |             {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, | ||||||
|             {ORTH: "s"}] |             {ORTH: "s"}] | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | @ -110,111 +110,111 @@ for pron in ["he", "she", "it"]: | ||||||
| for word in ["who", "what", "when", "where", "why", "how", "there", "that"]: | for word in ["who", "what", "when", "where", "why", "how", "there", "that"]: | ||||||
|     for orth in [word, word.title()]: |     for orth in [word, word.title()]: | ||||||
|         _exc[orth + "'s"] = [ |         _exc[orth + "'s"] = [ | ||||||
|             {ORTH: orth, LEMMA: word}, |             {ORTH: orth, LEMMA: word, NORM: word}, | ||||||
|             {ORTH: "'s"}] |             {ORTH: "'s", NORM: "'s"}] | ||||||
| 
 | 
 | ||||||
|         _exc[orth + "s"] = [ |         _exc[orth + "s"] = [ | ||||||
|             {ORTH: orth, LEMMA: word}, |             {ORTH: orth, LEMMA: word, NORM: word}, | ||||||
|             {ORTH: "s"}] |             {ORTH: "s"}] | ||||||
| 
 | 
 | ||||||
|         _exc[orth + "'ll"] = [ |         _exc[orth + "'ll"] = [ | ||||||
|             {ORTH: orth, LEMMA: word}, |             {ORTH: orth, LEMMA: word, NORM: word}, | ||||||
|             {ORTH: "'ll", LEMMA: "will", TAG: "MD"}] |             {ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"}] | ||||||
| 
 | 
 | ||||||
|         _exc[orth + "ll"] = [ |         _exc[orth + "ll"] = [ | ||||||
|             {ORTH: orth, LEMMA: word}, |             {ORTH: orth, LEMMA: word, NORM: word}, | ||||||
|             {ORTH: "ll", LEMMA: "will", TAG: "MD"}] |             {ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"}] | ||||||
| 
 | 
 | ||||||
|         _exc[orth + "'ll've"] = [ |         _exc[orth + "'ll've"] = [ | ||||||
|             {ORTH: orth, LEMMA: word}, |             {ORTH: orth, LEMMA: word, NORM: word}, | ||||||
|             {ORTH: "'ll", LEMMA: "will", TAG: "MD"}, |             {ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"}, | ||||||
|             {ORTH: "'ve", LEMMA: "have", TAG: "VB"}] |             {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}] | ||||||
| 
 | 
 | ||||||
|         _exc[orth + "llve"] = [ |         _exc[orth + "llve"] = [ | ||||||
|             {ORTH: orth, LEMMA: word}, |             {ORTH: orth, LEMMA: word, NORM: word}, | ||||||
|             {ORTH: "ll", LEMMA: "will", TAG: "MD"}, |             {ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"}, | ||||||
|             {ORTH: "ve", LEMMA: "have", TAG: "VB"}] |             {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}] | ||||||
| 
 | 
 | ||||||
|         _exc[orth + "'re"] = [ |         _exc[orth + "'re"] = [ | ||||||
|             {ORTH: orth, LEMMA: word}, |             {ORTH: orth, LEMMA: word, NORM: word}, | ||||||
|             {ORTH: "'re", LEMMA: "be", NORM: "are"}] |             {ORTH: "'re", LEMMA: "be", NORM: "are"}] | ||||||
| 
 | 
 | ||||||
|         _exc[orth + "re"] = [ |         _exc[orth + "re"] = [ | ||||||
|             {ORTH: orth, LEMMA: word}, |             {ORTH: orth, LEMMA: word, NORM: word}, | ||||||
|             {ORTH: "re", LEMMA: "be", NORM: "are"}] |             {ORTH: "re", LEMMA: "be", NORM: "are"}] | ||||||
| 
 | 
 | ||||||
|         _exc[orth + "'ve"] = [ |         _exc[orth + "'ve"] = [ | ||||||
|             {ORTH: orth}, |             {ORTH: orth, LEMMA: word, NORM: word}, | ||||||
|             {ORTH: "'ve", LEMMA: "have", TAG: "VB"}] |             {ORTH: "'ve", LEMMA: "have", TAG: "VB"}] | ||||||
| 
 | 
 | ||||||
|         _exc[orth + "ve"] = [ |         _exc[orth + "ve"] = [ | ||||||
|             {ORTH: orth, LEMMA: word}, |             {ORTH: orth, LEMMA: word}, | ||||||
|             {ORTH: "ve", LEMMA: "have", TAG: "VB"}] |             {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}] | ||||||
| 
 | 
 | ||||||
|         _exc[orth + "'d"] = [ |         _exc[orth + "'d"] = [ | ||||||
|             {ORTH: orth, LEMMA: word}, |             {ORTH: orth, LEMMA: word, NORM: word}, | ||||||
|             {ORTH: "'d"}] |             {ORTH: "'d", NORM: "'d"}] | ||||||
| 
 | 
 | ||||||
|         _exc[orth + "d"] = [ |         _exc[orth + "d"] = [ | ||||||
|             {ORTH: orth, LEMMA: word}, |             {ORTH: orth, LEMMA: word, NORM: word}, | ||||||
|             {ORTH: "d"}] |             {ORTH: "d"}] | ||||||
| 
 | 
 | ||||||
|         _exc[orth + "'d've"] = [ |         _exc[orth + "'d've"] = [ | ||||||
|             {ORTH: orth, LEMMA: word}, |             {ORTH: orth, LEMMA: word, NORM: word}, | ||||||
|             {ORTH: "'d", LEMMA: "would", TAG: "MD"}, |             {ORTH: "'d", LEMMA: "would", NORM: "would", TAG: "MD"}, | ||||||
|             {ORTH: "'ve", LEMMA: "have", TAG: "VB"}] |             {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}] | ||||||
| 
 | 
 | ||||||
|         _exc[orth + "dve"] = [ |         _exc[orth + "dve"] = [ | ||||||
|             {ORTH: orth, LEMMA: word}, |             {ORTH: orth, LEMMA: word, NORM: word}, | ||||||
|             {ORTH: "d", LEMMA: "would", TAG: "MD"}, |             {ORTH: "d", LEMMA: "would", NORM: "would", TAG: "MD"}, | ||||||
|             {ORTH: "ve", LEMMA: "have", TAG: "VB"}] |             {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}] | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| # Verbs | # Verbs | ||||||
| 
 | 
 | ||||||
| for verb_data in [ | for verb_data in [ | ||||||
|     {ORTH: "ca", LEMMA: "can", TAG: "MD"}, |     {ORTH: "ca", LEMMA: "can", NORM: "can", TAG: "MD"}, | ||||||
|     {ORTH: "could", TAG: "MD"}, |     {ORTH: "could", NORM: "could", TAG: "MD"}, | ||||||
|     {ORTH: "do", LEMMA: "do"}, |     {ORTH: "do", LEMMA: "do", NORM: "do"}, | ||||||
|     {ORTH: "does", LEMMA: "do"}, |     {ORTH: "does", LEMMA: "do", NORM: "does"}, | ||||||
|     {ORTH: "did", LEMMA: "do", TAG: "VBD"}, |     {ORTH: "did", LEMMA: "do", NORM: "do", TAG: "VBD"}, | ||||||
|     {ORTH: "had", LEMMA: "have", TAG: "VBD"}, |     {ORTH: "had", LEMMA: "have", NORM: "have", TAG: "VBD"}, | ||||||
|     {ORTH: "may", TAG: "MD"}, |     {ORTH: "may", NORM: "may", TAG: "MD"}, | ||||||
|     {ORTH: "might", TAG: "MD"}, |     {ORTH: "might", NORM: "might", TAG: "MD"}, | ||||||
|     {ORTH: "must", TAG: "MD"}, |     {ORTH: "must", NORM: "must", TAG: "MD"}, | ||||||
|     {ORTH: "need"}, |     {ORTH: "need", NORM: "need"}, | ||||||
|     {ORTH: "ought"}, |     {ORTH: "ought", NORM: "ought", TAG: "MD"}, | ||||||
|     {ORTH: "sha", LEMMA: "shall", TAG: "MD"}, |     {ORTH: "sha", LEMMA: "shall", NORM: "shall", TAG: "MD"}, | ||||||
|     {ORTH: "should", TAG: "MD"}, |     {ORTH: "should", NORM: "should", TAG: "MD"}, | ||||||
|     {ORTH: "wo", LEMMA: "will", TAG: "MD"}, |     {ORTH: "wo", LEMMA: "will", NORM: "will", TAG: "MD"}, | ||||||
|     {ORTH: "would", TAG: "MD"}]: |     {ORTH: "would", NORM: "would", TAG: "MD"}]: | ||||||
|     verb_data_tc = dict(verb_data) |     verb_data_tc = dict(verb_data) | ||||||
|     verb_data_tc[ORTH] = verb_data_tc[ORTH].title() |     verb_data_tc[ORTH] = verb_data_tc[ORTH].title() | ||||||
|     for data in [verb_data, verb_data_tc]: |     for data in [verb_data, verb_data_tc]: | ||||||
|         _exc[data[ORTH] + "n't"] = [ |         _exc[data[ORTH] + "n't"] = [ | ||||||
|             dict(data), |             dict(data), | ||||||
|             {ORTH: "n't", LEMMA: "not", TAG: "RB"}] |             {ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"}] | ||||||
| 
 | 
 | ||||||
|         _exc[data[ORTH] + "nt"] = [ |         _exc[data[ORTH] + "nt"] = [ | ||||||
|             dict(data), |             dict(data), | ||||||
|             {ORTH: "nt", LEMMA: "not", TAG: "RB"}] |             {ORTH: "nt", LEMMA: "not", NORM: "not", TAG: "RB"}] | ||||||
| 
 | 
 | ||||||
|         _exc[data[ORTH] + "n't've"] = [ |         _exc[data[ORTH] + "n't've"] = [ | ||||||
|             dict(data), |             dict(data), | ||||||
|             {ORTH: "n't", LEMMA: "not", TAG: "RB"}, |             {ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"}, | ||||||
|             {ORTH: "'ve", LEMMA: "have", TAG: "VB"}] |             {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}] | ||||||
| 
 | 
 | ||||||
|         _exc[data[ORTH] + "ntve"] = [ |         _exc[data[ORTH] + "ntve"] = [ | ||||||
|             dict(data), |             dict(data), | ||||||
|             {ORTH: "nt", LEMMA: "not", TAG: "RB"}, |             {ORTH: "nt", LEMMA: "not", NORM: "not", TAG: "RB"}, | ||||||
|             {ORTH: "ve", LEMMA: "have", TAG: "VB"}] |             {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}] | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| for verb_data in [ | for verb_data in [ | ||||||
|     {ORTH: "could", TAG: "MD"}, |     {ORTH: "could", NORM: "could", TAG: "MD"}, | ||||||
|     {ORTH: "might"}, |     {ORTH: "might", NORM: "might", TAG: "MD"}, | ||||||
|     {ORTH: "must"}, |     {ORTH: "must", NORM: "must", TAG: "MD"}, | ||||||
|     {ORTH: "should"}]: |     {ORTH: "should", NORM: "should", TAG: "MD"}]: | ||||||
|     verb_data_tc = dict(verb_data) |     verb_data_tc = dict(verb_data) | ||||||
|     verb_data_tc[ORTH] = verb_data_tc[ORTH].title() |     verb_data_tc[ORTH] = verb_data_tc[ORTH].title() | ||||||
|     for data in [verb_data, verb_data_tc]: |     for data in [verb_data, verb_data_tc]: | ||||||
|  | @ -228,21 +228,21 @@ for verb_data in [ | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| for verb_data in [ | for verb_data in [ | ||||||
|     {ORTH: "ai", TAG: "VBP", "number": 2, LEMMA: "be"}, |     {ORTH: "ai", LEMMA: "be", TAG: "VBP", "number": 2}, | ||||||
|     {ORTH: "are", LEMMA: "be", TAG: "VBP", "number": 2}, |     {ORTH: "are", LEMMA: "be", NORM: "are", TAG: "VBP", "number": 2}, | ||||||
|     {ORTH: "is", LEMMA: "be", TAG: "VBZ"}, |     {ORTH: "is", LEMMA: "be", NORM: "is", TAG: "VBZ"}, | ||||||
|     {ORTH: "was", LEMMA: "be"}, |     {ORTH: "was", LEMMA: "be", NORM: "was"}, | ||||||
|     {ORTH: "were", LEMMA: "be"}]: |     {ORTH: "were", LEMMA: "be", NORM: "were"}]: | ||||||
|     verb_data_tc = dict(verb_data) |     verb_data_tc = dict(verb_data) | ||||||
|     verb_data_tc[ORTH] = verb_data_tc[ORTH].title() |     verb_data_tc[ORTH] = verb_data_tc[ORTH].title() | ||||||
|     for data in [verb_data, verb_data_tc]: |     for data in [verb_data, verb_data_tc]: | ||||||
|         _exc[data[ORTH] + "n't"] = [ |         _exc[data[ORTH] + "n't"] = [ | ||||||
|             dict(data), |             dict(data), | ||||||
|             {ORTH: "n't", LEMMA: "not", TAG: "RB"}] |             {ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"}] | ||||||
| 
 | 
 | ||||||
|         _exc[data[ORTH] + "nt"] = [ |         _exc[data[ORTH] + "nt"] = [ | ||||||
|             dict(data), |             dict(data), | ||||||
|             {ORTH: "nt", LEMMA: "not", TAG: "RB"}] |             {ORTH: "nt", LEMMA: "not", NORM: "not", TAG: "RB"}] | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| # Other contractions with trailing apostrophe | # Other contractions with trailing apostrophe | ||||||
|  | @ -250,10 +250,10 @@ for verb_data in [ | ||||||
| for exc_data in [ | for exc_data in [ | ||||||
|     {ORTH: "doin", LEMMA: "do", NORM: "doing"}, |     {ORTH: "doin", LEMMA: "do", NORM: "doing"}, | ||||||
|     {ORTH: "goin", LEMMA: "go", NORM: "going"}, |     {ORTH: "goin", LEMMA: "go", NORM: "going"}, | ||||||
|     {ORTH: "nothin", LEMMA: "nothing"}, |     {ORTH: "nothin", LEMMA: "nothing", NORM: "nothing"}, | ||||||
|     {ORTH: "nuthin", LEMMA: "nothing"}, |     {ORTH: "nuthin", LEMMA: "nothing", NORM: "nothing"}, | ||||||
|     {ORTH: "ol", LEMMA: "old"}, |     {ORTH: "ol", LEMMA: "old", NORM: "old"}, | ||||||
|     {ORTH: "somethin", LEMMA: "something"}]: |     {ORTH: "somethin", LEMMA: "something", NORM: "something"}]: | ||||||
|     exc_data_tc = dict(exc_data) |     exc_data_tc = dict(exc_data) | ||||||
|     exc_data_tc[ORTH] = exc_data_tc[ORTH].title() |     exc_data_tc[ORTH] = exc_data_tc[ORTH].title() | ||||||
|     for data in [exc_data, exc_data_tc]: |     for data in [exc_data, exc_data_tc]: | ||||||
|  | @ -266,10 +266,10 @@ for exc_data in [ | ||||||
| # Other contractions with leading apostrophe | # Other contractions with leading apostrophe | ||||||
| 
 | 
 | ||||||
| for exc_data in [ | for exc_data in [ | ||||||
|     {ORTH: "cause", LEMMA: "because"}, |     {ORTH: "cause", LEMMA: "because", NORM: "because"}, | ||||||
|     {ORTH: "em", LEMMA: PRON_LEMMA, NORM: "them"}, |     {ORTH: "em", LEMMA: PRON_LEMMA, NORM: "them"}, | ||||||
|     {ORTH: "ll", LEMMA: "will"}, |     {ORTH: "ll", LEMMA: "will", NORM: "will"}, | ||||||
|     {ORTH: "nuff", LEMMA: "enough"}]: |     {ORTH: "nuff", LEMMA: "enough", NORM: "enough"}]: | ||||||
|     exc_data_apos = dict(exc_data) |     exc_data_apos = dict(exc_data) | ||||||
|     exc_data_apos[ORTH] = "'" + exc_data_apos[ORTH] |     exc_data_apos[ORTH] = "'" + exc_data_apos[ORTH] | ||||||
|     for data in [exc_data, exc_data_apos]: |     for data in [exc_data, exc_data_apos]: | ||||||
|  | @ -282,11 +282,11 @@ for h in range(1, 12 + 1): | ||||||
|     for period in ["a.m.", "am"]: |     for period in ["a.m.", "am"]: | ||||||
|         _exc["%d%s" % (h, period)] = [ |         _exc["%d%s" % (h, period)] = [ | ||||||
|             {ORTH: "%d" % h}, |             {ORTH: "%d" % h}, | ||||||
|             {ORTH: period, LEMMA: "a.m."}] |             {ORTH: period, LEMMA: "a.m.", NORM: "a.m."}] | ||||||
|     for period in ["p.m.", "pm"]: |     for period in ["p.m.", "pm"]: | ||||||
|         _exc["%d%s" % (h, period)] = [ |         _exc["%d%s" % (h, period)] = [ | ||||||
|             {ORTH: "%d" % h}, |             {ORTH: "%d" % h}, | ||||||
|             {ORTH: period, LEMMA: "p.m."}] |             {ORTH: period, LEMMA: "p.m.", NORM: "p.m."}] | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| # Rest | # Rest | ||||||
|  | @ -306,56 +306,56 @@ _other_exc = { | ||||||
|         {ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"}], |         {ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"}], | ||||||
| 
 | 
 | ||||||
|     "How'd'y": [ |     "How'd'y": [ | ||||||
|         {ORTH: "How", LEMMA: "how"}, |         {ORTH: "How", LEMMA: "how", NORM: "how"}, | ||||||
|         {ORTH: "'d", LEMMA: "do"}, |         {ORTH: "'d", LEMMA: "do"}, | ||||||
|         {ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"}], |         {ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"}], | ||||||
| 
 | 
 | ||||||
|     "not've": [ |     "not've": [ | ||||||
|         {ORTH: "not", LEMMA: "not", TAG: "RB"}, |         {ORTH: "not", LEMMA: "not", TAG: "RB"}, | ||||||
|         {ORTH: "'ve", LEMMA: "have", TAG: "VB"}], |         {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}], | ||||||
| 
 | 
 | ||||||
|     "notve": [ |     "notve": [ | ||||||
|         {ORTH: "not", LEMMA: "not", TAG: "RB"}, |         {ORTH: "not", LEMMA: "not", TAG: "RB"}, | ||||||
|         {ORTH: "ve", LEMMA: "have", TAG: "VB"}], |         {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}], | ||||||
| 
 | 
 | ||||||
|     "Not've": [ |     "Not've": [ | ||||||
|         {ORTH: "Not", LEMMA: "not", TAG: "RB"}, |         {ORTH: "Not", LEMMA: "not", NORM: "not", TAG: "RB"}, | ||||||
|         {ORTH: "'ve", LEMMA: "have", TAG: "VB"}], |         {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}], | ||||||
| 
 | 
 | ||||||
|     "Notve": [ |     "Notve": [ | ||||||
|         {ORTH: "Not", LEMMA: "not", TAG: "RB"}, |         {ORTH: "Not", LEMMA: "not", NORM: "not", TAG: "RB"}, | ||||||
|         {ORTH: "ve", LEMMA: "have", TAG: "VB"}], |         {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}], | ||||||
| 
 | 
 | ||||||
|     "cannot": [ |     "cannot": [ | ||||||
|         {ORTH: "can", LEMMA: "can", TAG: "MD"}, |         {ORTH: "can", LEMMA: "can", TAG: "MD"}, | ||||||
|         {ORTH: "not", LEMMA: "not", TAG: "RB"}], |         {ORTH: "not", LEMMA: "not", TAG: "RB"}], | ||||||
| 
 | 
 | ||||||
|     "Cannot": [ |     "Cannot": [ | ||||||
|         {ORTH: "Can", LEMMA: "can", TAG: "MD"}, |         {ORTH: "Can", LEMMA: "can", NORM: "can", TAG: "MD"}, | ||||||
|         {ORTH: "not", LEMMA: "not", TAG: "RB"}], |         {ORTH: "not", LEMMA: "not", TAG: "RB"}], | ||||||
| 
 | 
 | ||||||
|     "gonna": [ |     "gonna": [ | ||||||
|         {ORTH: "gon", LEMMA: "go", NORM: "going"}, |         {ORTH: "gon", LEMMA: "go", NORM: "going"}, | ||||||
|         {ORTH: "na", LEMMA: "to"}], |         {ORTH: "na", LEMMA: "to", NORM: "to"}], | ||||||
| 
 | 
 | ||||||
|     "Gonna": [ |     "Gonna": [ | ||||||
|         {ORTH: "Gon", LEMMA: "go", NORM: "going"}, |         {ORTH: "Gon", LEMMA: "go", NORM: "going"}, | ||||||
|         {ORTH: "na", LEMMA: "to"}], |         {ORTH: "na", LEMMA: "to", NORM: "to"}], | ||||||
| 
 | 
 | ||||||
|     "gotta": [ |     "gotta": [ | ||||||
|         {ORTH: "got"}, |         {ORTH: "got"}, | ||||||
|         {ORTH: "ta", LEMMA: "to"}], |         {ORTH: "ta", LEMMA: "to", NORM: "to"}], | ||||||
| 
 | 
 | ||||||
|     "Gotta": [ |     "Gotta": [ | ||||||
|         {ORTH: "Got"}, |         {ORTH: "Got", NORM: "got"}, | ||||||
|         {ORTH: "ta", LEMMA: "to"}], |         {ORTH: "ta", LEMMA: "to", NORM: "to"}], | ||||||
| 
 | 
 | ||||||
|     "let's": [ |     "let's": [ | ||||||
|         {ORTH: "let"}, |         {ORTH: "let"}, | ||||||
|         {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}], |         {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}], | ||||||
| 
 | 
 | ||||||
|     "Let's": [ |     "Let's": [ | ||||||
|         {ORTH: "Let", LEMMA: "let"}, |         {ORTH: "Let", LEMMA: "let", NORM: "let"}, | ||||||
|         {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}] |         {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}] | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | @ -363,72 +363,80 @@ _exc.update(_other_exc) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| for exc_data in [ | for exc_data in [ | ||||||
|     {ORTH: "'S", LEMMA: "'s"}, |     {ORTH: "'S", LEMMA: "'s", NORM: "'s"}, | ||||||
|     {ORTH: "'s", LEMMA: "'s"}, |     {ORTH: "'s", LEMMA: "'s", NORM: "'s"}, | ||||||
|     {ORTH: "\u2018S", LEMMA: "'s"}, |     {ORTH: "\u2018S", LEMMA: "'s", NORM: "'s"}, | ||||||
|     {ORTH: "\u2018s", LEMMA: "'s"}, |     {ORTH: "\u2018s", LEMMA: "'s", NORM: "'s"}, | ||||||
|     {ORTH: "and/or", LEMMA: "and/or", TAG: "CC"}, |     {ORTH: "and/or", LEMMA: "and/or", NORM: "and/or", TAG: "CC"}, | ||||||
|  |     {ORTH: "w/o", LEMMA: "without", NORM: "without"}, | ||||||
|     {ORTH: "'re", LEMMA: "be", NORM: "are"}, |     {ORTH: "'re", LEMMA: "be", NORM: "are"}, | ||||||
|     {ORTH: "'Cause", LEMMA: "because"}, |     {ORTH: "'Cause", LEMMA: "because", NORM: "because"}, | ||||||
|     {ORTH: "'cause", LEMMA: "because"}, |     {ORTH: "'cause", LEMMA: "because", NORM: "because"}, | ||||||
|     {ORTH: "ma'am", LEMMA: "madam"}, |     {ORTH: "'cos", LEMMA: "because", NORM: "because"}, | ||||||
|     {ORTH: "Ma'am", LEMMA: "madam"}, |     {ORTH: "'Cos", LEMMA: "because", NORM: "because"}, | ||||||
|     {ORTH: "o'clock", LEMMA: "o'clock"}, |     {ORTH: "'coz", LEMMA: "because", NORM: "because"}, | ||||||
|     {ORTH: "O'clock", LEMMA: "o'clock"}, |     {ORTH: "'Coz", LEMMA: "because", NORM: "because"}, | ||||||
|  |     {ORTH: "'cuz", LEMMA: "because", NORM: "because"}, | ||||||
|  |     {ORTH: "'Cuz", LEMMA: "because", NORM: "because"}, | ||||||
|  |     {ORTH: "'bout", LEMMA: "about", NORM: "about"}, | ||||||
|  |     {ORTH: "ma'am", LEMMA: "madam", NORM: "madam"}, | ||||||
|  |     {ORTH: "Ma'am", LEMMA: "madam", NORM: "madam"}, | ||||||
|  |     {ORTH: "o'clock", LEMMA: "o'clock", NORM: "o'clock"}, | ||||||
|  |     {ORTH: "O'clock", LEMMA: "o'clock", NORM: "o'clock"}, | ||||||
| 
 | 
 | ||||||
|     {ORTH: "Mt.", LEMMA: "Mount"}, |     {ORTH: "Mt.", LEMMA: "Mount", NORM: "Mount"}, | ||||||
|     {ORTH: "Ak.", LEMMA: "Alaska"}, |     {ORTH: "Ak.", LEMMA: "Alaska", NORM: "Alaska"}, | ||||||
|     {ORTH: "Ala.", LEMMA: "Alabama"}, |     {ORTH: "Ala.", LEMMA: "Alabama", NORM: "Alabama"}, | ||||||
|     {ORTH: "Apr.", LEMMA: "April"}, |     {ORTH: "Apr.", LEMMA: "April", NORM: "April"}, | ||||||
|     {ORTH: "Ariz.", LEMMA: "Arizona"}, |     {ORTH: "Ariz.", LEMMA: "Arizona", NORM: "Arizona"}, | ||||||
|     {ORTH: "Ark.", LEMMA: "Arkansas"}, |     {ORTH: "Ark.", LEMMA: "Arkansas", NORM: "Arkansas"}, | ||||||
|     {ORTH: "Aug.", LEMMA: "August"}, |     {ORTH: "Aug.", LEMMA: "August", NORM: "August"}, | ||||||
|     {ORTH: "Calif.", LEMMA: "California"}, |     {ORTH: "Calif.", LEMMA: "California", NORM: "California"}, | ||||||
|     {ORTH: "Colo.", LEMMA: "Colorado"}, |     {ORTH: "Colo.", LEMMA: "Colorado", NORM: "Colorado"}, | ||||||
|     {ORTH: "Conn.", LEMMA: "Connecticut"}, |     {ORTH: "Conn.", LEMMA: "Connecticut", NORM: "Connecticut"}, | ||||||
|     {ORTH: "Dec.", LEMMA: "December"}, |     {ORTH: "Dec.", LEMMA: "December", NORM: "December"}, | ||||||
|     {ORTH: "Del.", LEMMA: "Delaware"}, |     {ORTH: "Del.", LEMMA: "Delaware", NORM: "Delaware"}, | ||||||
|     {ORTH: "Feb.", LEMMA: "February"}, |     {ORTH: "Feb.", LEMMA: "February", NORM: "February"}, | ||||||
|     {ORTH: "Fla.", LEMMA: "Florida"}, |     {ORTH: "Fla.", LEMMA: "Florida", NORM: "Florida"}, | ||||||
|     {ORTH: "Ga.", LEMMA: "Georgia"}, |     {ORTH: "Ga.", LEMMA: "Georgia", NORM: "Georgia"}, | ||||||
|     {ORTH: "Ia.", LEMMA: "Iowa"}, |     {ORTH: "Ia.", LEMMA: "Iowa", NORM: "Iowa"}, | ||||||
|     {ORTH: "Id.", LEMMA: "Idaho"}, |     {ORTH: "Id.", LEMMA: "Idaho", NORM: "Idaho"}, | ||||||
|     {ORTH: "Ill.", LEMMA: "Illinois"}, |     {ORTH: "Ill.", LEMMA: "Illinois", NORM: "Illinois"}, | ||||||
|     {ORTH: "Ind.", LEMMA: "Indiana"}, |     {ORTH: "Ind.", LEMMA: "Indiana", NORM: "Indiana"}, | ||||||
|     {ORTH: "Jan.", LEMMA: "January"}, |     {ORTH: "Jan.", LEMMA: "January", NORM: "January"}, | ||||||
|     {ORTH: "Jul.", LEMMA: "July"}, |     {ORTH: "Jul.", LEMMA: "July", NORM: "July"}, | ||||||
|     {ORTH: "Jun.", LEMMA: "June"}, |     {ORTH: "Jun.", LEMMA: "June", NORM: "June"}, | ||||||
|     {ORTH: "Kan.", LEMMA: "Kansas"}, |     {ORTH: "Kan.", LEMMA: "Kansas", NORM: "Kansas"}, | ||||||
|     {ORTH: "Kans.", LEMMA: "Kansas"}, |     {ORTH: "Kans.", LEMMA: "Kansas", NORM: "Kansas"}, | ||||||
|     {ORTH: "Ky.", LEMMA: "Kentucky"}, |     {ORTH: "Ky.", LEMMA: "Kentucky", NORM: "Kentucky"}, | ||||||
|     {ORTH: "La.", LEMMA: "Louisiana"}, |     {ORTH: "La.", LEMMA: "Louisiana", NORM: "Louisiana"}, | ||||||
|     {ORTH: "Mar.", LEMMA: "March"}, |     {ORTH: "Mar.", LEMMA: "March", NORM: "March"}, | ||||||
|     {ORTH: "Mass.", LEMMA: "Massachusetts"}, |     {ORTH: "Mass.", LEMMA: "Massachusetts", NORM: "Massachusetts"}, | ||||||
|     {ORTH: "May.", LEMMA: "May"}, |     {ORTH: "May.", LEMMA: "May", NORM: "May"}, | ||||||
|     {ORTH: "Mich.", LEMMA: "Michigan"}, |     {ORTH: "Mich.", LEMMA: "Michigan", NORM: "Michigan"}, | ||||||
|     {ORTH: "Minn.", LEMMA: "Minnesota"}, |     {ORTH: "Minn.", LEMMA: "Minnesota", NORM: "Minnesota"}, | ||||||
|     {ORTH: "Miss.", LEMMA: "Mississippi"}, |     {ORTH: "Miss.", LEMMA: "Mississippi", NORM: "Mississippi"}, | ||||||
|     {ORTH: "N.C.", LEMMA: "North Carolina"}, |     {ORTH: "N.C.", LEMMA: "North Carolina", NORM: "North Carolina"}, | ||||||
|     {ORTH: "N.D.", LEMMA: "North Dakota"}, |     {ORTH: "N.D.", LEMMA: "North Dakota", NORM: "North Dakota"}, | ||||||
|     {ORTH: "N.H.", LEMMA: "New Hampshire"}, |     {ORTH: "N.H.", LEMMA: "New Hampshire", NORM: "New Hampshire"}, | ||||||
|     {ORTH: "N.J.", LEMMA: "New Jersey"}, |     {ORTH: "N.J.", LEMMA: "New Jersey", NORM: "New Jersey"}, | ||||||
|     {ORTH: "N.M.", LEMMA: "New Mexico"}, |     {ORTH: "N.M.", LEMMA: "New Mexico", NORM: "New Mexico"}, | ||||||
|     {ORTH: "N.Y.", LEMMA: "New York"}, |     {ORTH: "N.Y.", LEMMA: "New York", NORM: "New York"}, | ||||||
|     {ORTH: "Neb.", LEMMA: "Nebraska"}, |     {ORTH: "Neb.", LEMMA: "Nebraska", NORM: "Nebraska"}, | ||||||
|     {ORTH: "Nebr.", LEMMA: "Nebraska"}, |     {ORTH: "Nebr.", LEMMA: "Nebraska", NORM: "Nebraska"}, | ||||||
|     {ORTH: "Nev.", LEMMA: "Nevada"}, |     {ORTH: "Nev.", LEMMA: "Nevada", NORM: "Nevada"}, | ||||||
|     {ORTH: "Nov.", LEMMA: "November"}, |     {ORTH: "Nov.", LEMMA: "November", NORM: "November"}, | ||||||
|     {ORTH: "Oct.", LEMMA: "October"}, |     {ORTH: "Oct.", LEMMA: "October", NORM: "October"}, | ||||||
|     {ORTH: "Okla.", LEMMA: "Oklahoma"}, |     {ORTH: "Okla.", LEMMA: "Oklahoma", NORM: "Oklahoma"}, | ||||||
|     {ORTH: "Ore.", LEMMA: "Oregon"}, |     {ORTH: "Ore.", LEMMA: "Oregon", NORM: "Oregon"}, | ||||||
|     {ORTH: "Pa.", LEMMA: "Pennsylvania"}, |     {ORTH: "Pa.", LEMMA: "Pennsylvania", NORM: "Pennsylvania"}, | ||||||
|     {ORTH: "S.C.", LEMMA: "South Carolina"}, |     {ORTH: "S.C.", LEMMA: "South Carolina", NORM: "South Carolina"}, | ||||||
|     {ORTH: "Sep.", LEMMA: "September"}, |     {ORTH: "Sep.", LEMMA: "September", NORM: "September"}, | ||||||
|     {ORTH: "Sept.", LEMMA: "September"}, |     {ORTH: "Sept.", LEMMA: "September", NORM: "September"}, | ||||||
|     {ORTH: "Tenn.", LEMMA: "Tennessee"}, |     {ORTH: "Tenn.", LEMMA: "Tennessee", NORM: "Tennessee"}, | ||||||
|     {ORTH: "Va.", LEMMA: "Virginia"}, |     {ORTH: "Va.", LEMMA: "Virginia", NORM: "Virginia"}, | ||||||
|     {ORTH: "Wash.", LEMMA: "Washington"}, |     {ORTH: "Wash.", LEMMA: "Washington", NORM: "Washington"}, | ||||||
|     {ORTH: "Wis.", LEMMA: "Wisconsin"}]: |     {ORTH: "Wis.", LEMMA: "Wisconsin", NORM: "Wisconsin"}]: | ||||||
|     _exc[exc_data[ORTH]] = [dict(exc_data)] |     _exc[exc_data[ORTH]] = [dict(exc_data)] | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
							
								
								
									
spacy/lang/norm_exceptions.py (new file, 46 lines)
|  | @ -0,0 +1,46 @@ | ||||||
|  | # coding: utf8 | ||||||
|  | from __future__ import unicode_literals | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # These exceptions are used to add NORM values based on a token's ORTH value. | ||||||
|  | # Individual languages can also add their own exceptions and overwrite them - | ||||||
|  | # for example, British vs. American spelling in English. | ||||||
|  | 
 | ||||||
|  | # Norms are only set if no alternative is provided in the tokenizer exceptions. | ||||||
|  | # Note that this does not change any other token attributes. Its main purpose | ||||||
|  | # is to normalise the word representations so that equivalent tokens receive | ||||||
|  | # similar representations. For example: $ and € are very different, but they're | ||||||
|  | # both currency symbols. By normalising currency symbols to $, all symbols are | ||||||
|  | # seen as similar, no matter how common they are in the training data. | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | BASE_NORMS = { | ||||||
|  |     "'s": "'s", | ||||||
|  |     "'S": "'s", | ||||||
|  |     "’s": "'s", | ||||||
|  |     "’S": "'s", | ||||||
|  |     "’": "'", | ||||||
|  |     "‘": "'", | ||||||
|  |     "´": "'", | ||||||
|  |     "`": "'", | ||||||
|  |     "”": '"', | ||||||
|  |     "“": '"', | ||||||
|  |     "''": '"', | ||||||
|  |     "``": '"', | ||||||
|  |     "´´": '"', | ||||||
|  |     "„": '"', | ||||||
|  |     "»": '"', | ||||||
|  |     "«": '"', | ||||||
|  |     "…": "...", | ||||||
|  |     "—": "-", | ||||||
|  |     "–": "-", | ||||||
|  |     "--": "-", | ||||||
|  |     "---": "-", | ||||||
|  |     "€": "$", | ||||||
|  |     "£": "$", | ||||||
|  |     "¥": "$", | ||||||
|  |     "฿": "$", | ||||||
|  |     "US$": "$", | ||||||
|  |     "C$": "$", | ||||||
|  |     "A$": "$" | ||||||
|  | } | ||||||
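
The comments at the top of this new file describe a precedence scheme: a NORM provided by a tokenizer exception wins, otherwise the base table applies, otherwise the token's own (lowercased) form is used. A minimal, self-contained sketch of that idea in plain Python (re-implemented here for illustration only, with invented table subsets, not spaCy's actual wiring):

    # A NORM set in a tokenizer exception wins over the shared base table,
    # which wins over plain lowercasing.
    BASE_NORMS_SUBSET = {"€": "$", "’s": "'s"}
    EXCEPTION_NORMS = {"'cuz": "because"}   # invented example table

    def norm(string):
        if string in EXCEPTION_NORMS:
            return EXCEPTION_NORMS[string]
        if string in BASE_NORMS_SUBSET:
            return BASE_NORMS_SUBSET[string]
        return string.lower()

    assert norm("'cuz") == "because"   # tokenizer exception takes precedence
    assert norm("€") == "$"            # base table collapses currency symbols
    assert norm("Hello") == "hello"    # fallback: lowercase the surface form
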
|  | @ -301,7 +301,7 @@ class Language(object): | ||||||
|     def evaluate(self, docs_golds): |     def evaluate(self, docs_golds): | ||||||
|         docs, golds = zip(*docs_golds) |         docs, golds = zip(*docs_golds) | ||||||
|         scorer = Scorer() |         scorer = Scorer() | ||||||
|         for doc, gold in zip(self.pipe(docs), golds): |         for doc, gold in zip(self.pipe(docs, batch_size=32), golds): | ||||||
|             scorer.score(doc, gold) |             scorer.score(doc, gold) | ||||||
|             doc.tensor = None |             doc.tensor = None | ||||||
|         return scorer |         return scorer | ||||||
|  |  | ||||||
|  | @ -38,7 +38,7 @@ cdef class Morphology: | ||||||
|         self.strings = string_store |         self.strings = string_store | ||||||
|         self.tag_map = {} |         self.tag_map = {} | ||||||
|         self.lemmatizer = lemmatizer |         self.lemmatizer = lemmatizer | ||||||
|         self.n_tags = len(tag_map) + 1 |         self.n_tags = len(tag_map) | ||||||
|         self.tag_names = tuple(sorted(tag_map.keys())) |         self.tag_names = tuple(sorted(tag_map.keys())) | ||||||
|         self.reverse_index = {} |         self.reverse_index = {} | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -8,20 +8,33 @@ import pytest | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @pytest.mark.parametrize('text', ["auf'm", "du's", "über'm", "wir's"]) | @pytest.mark.parametrize('text', ["auf'm", "du's", "über'm", "wir's"]) | ||||||
| def test_tokenizer_splits_contractions(de_tokenizer, text): | def test_de_tokenizer_splits_contractions(de_tokenizer, text): | ||||||
|     tokens = de_tokenizer(text) |     tokens = de_tokenizer(text) | ||||||
|     assert len(tokens) == 2 |     assert len(tokens) == 2 | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @pytest.mark.parametrize('text', ["z.B.", "d.h.", "Jan.", "Dez.", "Chr."]) | @pytest.mark.parametrize('text', ["z.B.", "d.h.", "Jan.", "Dez.", "Chr."]) | ||||||
| def test_tokenizer_handles_abbr(de_tokenizer, text): | def test_de_tokenizer_handles_abbr(de_tokenizer, text): | ||||||
|     tokens = de_tokenizer(text) |     tokens = de_tokenizer(text) | ||||||
|     assert len(tokens) == 1 |     assert len(tokens) == 1 | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def test_tokenizer_handles_exc_in_text(de_tokenizer): | def test_de_tokenizer_handles_exc_in_text(de_tokenizer): | ||||||
|     text = "Ich bin z.Zt. im Urlaub." |     text = "Ich bin z.Zt. im Urlaub." | ||||||
|     tokens = de_tokenizer(text) |     tokens = de_tokenizer(text) | ||||||
|     assert len(tokens) == 6 |     assert len(tokens) == 6 | ||||||
|     assert tokens[2].text == "z.Zt." |     assert tokens[2].text == "z.Zt." | ||||||
|     assert tokens[2].lemma_ == "zur Zeit" |     assert tokens[2].lemma_ == "zur Zeit" | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @pytest.mark.parametrize('text,norms', [("vor'm", ["vor", "dem"]), ("du's", ["du", "es"])]) | ||||||
|  | def test_de_tokenizer_norm_exceptions(de_tokenizer, text, norms): | ||||||
|  |     tokens = de_tokenizer(text) | ||||||
|  |     assert [token.norm_ for token in tokens] == norms | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @pytest.mark.xfail | ||||||
|  | @pytest.mark.parametrize('text,norm', [("daß", "dass")]) | ||||||
|  | def test_de_lex_attrs_norm_exceptions(de_tokenizer, text, norm): | ||||||
|  |     tokens = de_tokenizer(text) | ||||||
|  |     assert tokens[0].norm_ == norm | ||||||
|  |  | ||||||
|  | @ -102,3 +102,16 @@ def test_en_tokenizer_handles_times(en_tokenizer, text): | ||||||
|     tokens = en_tokenizer(text) |     tokens = en_tokenizer(text) | ||||||
|     assert len(tokens) == 2 |     assert len(tokens) == 2 | ||||||
|     assert tokens[1].lemma_ in ["a.m.", "p.m."] |     assert tokens[1].lemma_ in ["a.m.", "p.m."] | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @pytest.mark.parametrize('text,norms', [("I'm", ["i", "am"]), ("shan't", ["shall", "not"])]) | ||||||
|  | def test_en_tokenizer_norm_exceptions(en_tokenizer, text, norms): | ||||||
|  |     tokens = en_tokenizer(text) | ||||||
|  |     assert [token.norm_ for token in tokens] == norms | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @pytest.mark.xfail | ||||||
|  | @pytest.mark.parametrize('text,norm', [("radicalised", "radicalized"), ("cuz", "because")]) | ||||||
|  | def test_en_lex_attrs_norm_exceptions(en_tokenizer, text, norm): | ||||||
|  |     tokens = en_tokenizer(text) | ||||||
|  |     assert tokens[0].norm_ == norm | ||||||
|  |  | ||||||
							
								
								
									
spacy/tests/serialize/test_serialize_tokenizer.py (new file, 33 lines)
|  | @ -0,0 +1,33 @@ | ||||||
|  | # coding: utf-8 | ||||||
|  | from __future__ import unicode_literals | ||||||
|  | 
 | ||||||
|  | from ...util import get_lang_class | ||||||
|  | from ..util import make_tempdir, assert_packed_msg_equal | ||||||
|  | 
 | ||||||
|  | import pytest | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def load_tokenizer(b): | ||||||
|  |     tok = get_lang_class('en').Defaults.create_tokenizer() | ||||||
|  |     tok.from_bytes(b) | ||||||
|  |     return tok | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @pytest.mark.parametrize('text', ["I💜you", "they’re", "“hello”"]) | ||||||
|  | def test_serialize_tokenizer_roundtrip_bytes(en_tokenizer, text): | ||||||
|  |     tokenizer = en_tokenizer | ||||||
|  |     new_tokenizer = load_tokenizer(tokenizer.to_bytes()) | ||||||
|  |     assert_packed_msg_equal(new_tokenizer.to_bytes(), tokenizer.to_bytes()) | ||||||
|  |     # assert new_tokenizer.to_bytes() == tokenizer.to_bytes() | ||||||
|  |     doc1 = tokenizer(text) | ||||||
|  |     doc2 = new_tokenizer(text) | ||||||
|  |     assert [token.text for token in doc1] == [token.text for token in doc2] | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def test_serialize_tokenizer_roundtrip_disk(en_tokenizer): | ||||||
|  |     tokenizer = en_tokenizer | ||||||
|  |     with make_tempdir() as d: | ||||||
|  |         file_path = d / 'tokenizer' | ||||||
|  |         tokenizer.to_disk(file_path) | ||||||
|  |         tokenizer_d = en_tokenizer.from_disk(file_path) | ||||||
|  |         assert tokenizer.to_bytes() == tokenizer_d.to_bytes() | ||||||
|  | @ -10,6 +10,7 @@ import numpy | ||||||
| import tempfile | import tempfile | ||||||
| import shutil | import shutil | ||||||
| import contextlib | import contextlib | ||||||
|  | import msgpack | ||||||
| from pathlib import Path | from pathlib import Path | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | @ -105,3 +106,13 @@ def assert_docs_equal(doc1, doc2): | ||||||
|     assert [ t.ent_type for t in doc1 ] == [ t.ent_type for t in doc2 ] |     assert [ t.ent_type for t in doc1 ] == [ t.ent_type for t in doc2 ] | ||||||
|     assert [ t.ent_iob for t in doc1 ] == [ t.ent_iob for t in doc2 ] |     assert [ t.ent_iob for t in doc1 ] == [ t.ent_iob for t in doc2 ] | ||||||
|     assert [ ent for ent in doc1.ents ] == [ ent for ent in doc2.ents ] |     assert [ ent for ent in doc1.ents ] == [ ent for ent in doc2.ents ] | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def assert_packed_msg_equal(b1, b2): | ||||||
|  |     """Assert that two packed msgpack messages are equal.""" | ||||||
|  |     msg1 = msgpack.loads(b1, encoding='utf8') | ||||||
|  |     msg2 = msgpack.loads(b2, encoding='utf8') | ||||||
|  |     assert sorted(msg1.keys()) == sorted(msg2.keys()) | ||||||
|  |     for (k1, v1), (k2, v2) in zip(sorted(msg1.items()), sorted(msg2.items())): | ||||||
|  |         assert k1 == k2 | ||||||
|  |         assert v1 == v2 | ||||||
|  |  | ||||||
|  | @ -2,6 +2,7 @@ | ||||||
| # coding: utf8 | # coding: utf8 | ||||||
| from __future__ import unicode_literals | from __future__ import unicode_literals | ||||||
| 
 | 
 | ||||||
|  | from collections import OrderedDict | ||||||
| from cython.operator cimport dereference as deref | from cython.operator cimport dereference as deref | ||||||
| from cython.operator cimport preincrement as preinc | from cython.operator cimport preincrement as preinc | ||||||
| from cymem.cymem cimport Pool | from cymem.cymem cimport Pool | ||||||
|  | @ -355,14 +356,14 @@ cdef class Tokenizer: | ||||||
|         **exclude: Named attributes to prevent from being serialized. |         **exclude: Named attributes to prevent from being serialized. | ||||||
|         RETURNS (bytes): The serialized form of the `Tokenizer` object. |         RETURNS (bytes): The serialized form of the `Tokenizer` object. | ||||||
|         """ |         """ | ||||||
|         serializers = { |         serializers = OrderedDict(( | ||||||
|             'vocab': lambda: self.vocab.to_bytes(), |             ('vocab', lambda: self.vocab.to_bytes()), | ||||||
|             'prefix_search': lambda: self.prefix_search.__self__.pattern, |             ('prefix_search', lambda: self.prefix_search.__self__.pattern), | ||||||
|             'suffix_search': lambda: self.suffix_search.__self__.pattern, |             ('suffix_search', lambda: self.suffix_search.__self__.pattern), | ||||||
|             'infix_finditer': lambda: self.infix_finditer.__self__.pattern, |             ('infix_finditer', lambda: self.infix_finditer.__self__.pattern), | ||||||
|             'token_match': lambda: self.token_match.__self__.pattern, |             ('token_match', lambda: self.token_match.__self__.pattern), | ||||||
|             'exceptions': lambda: self._rules |             ('exceptions', lambda: OrderedDict(sorted(self._rules.items()))) | ||||||
|         } |         )) | ||||||
|         return util.to_bytes(serializers, exclude) |         return util.to_bytes(serializers, exclude) | ||||||
| 
 | 
 | ||||||
|     def from_bytes(self, bytes_data, **exclude): |     def from_bytes(self, bytes_data, **exclude): | ||||||
|  | @ -372,15 +373,15 @@ cdef class Tokenizer: | ||||||
|         **exclude: Named attributes to prevent from being loaded. |         **exclude: Named attributes to prevent from being loaded. | ||||||
|         RETURNS (Tokenizer): The `Tokenizer` object. |         RETURNS (Tokenizer): The `Tokenizer` object. | ||||||
|         """ |         """ | ||||||
|         data = {} |         data = OrderedDict() | ||||||
|         deserializers = { |         deserializers = OrderedDict(( | ||||||
|             'vocab': lambda b: self.vocab.from_bytes(b), |             ('vocab', lambda b: self.vocab.from_bytes(b)), | ||||||
|             'prefix_search': lambda b: data.setdefault('prefix', b), |             ('prefix_search', lambda b: data.setdefault('prefix', b)), | ||||||
|             'suffix_search': lambda b: data.setdefault('suffix_search', b), |             ('suffix_search', lambda b: data.setdefault('suffix_search', b)), | ||||||
|             'infix_finditer': lambda b: data.setdefault('infix_finditer', b), |             ('infix_finditer', lambda b: data.setdefault('infix_finditer', b)), | ||||||
|             'token_match': lambda b: data.setdefault('token_match', b), |             ('token_match', lambda b: data.setdefault('token_match', b)), | ||||||
|             'exceptions': lambda b: data.setdefault('rules', b) |             ('exceptions', lambda b: data.setdefault('rules', b)) | ||||||
|         } |         )) | ||||||
|         msg = util.from_bytes(bytes_data, deserializers, exclude) |         msg = util.from_bytes(bytes_data, deserializers, exclude) | ||||||
|         if 'prefix_search' in data: |         if 'prefix_search' in data: | ||||||
|             self.prefix_search = re.compile(data['prefix_search']).search |             self.prefix_search = re.compile(data['prefix_search']).search | ||||||
|  | @ -392,3 +393,4 @@ cdef class Tokenizer: | ||||||
|             self.token_match = re.compile(data['token_match']).search |             self.token_match = re.compile(data['token_match']).search | ||||||
|         for string, substrings in data.get('rules', {}).items(): |         for string, substrings in data.get('rules', {}).items(): | ||||||
|             self.add_special_case(string, substrings) |             self.add_special_case(string, substrings) | ||||||
|  |         return self | ||||||
|  |  | ||||||
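
The switch to OrderedDict in to_bytes()/from_bytes() above presumably makes the serialised output deterministic: msgpack writes map keys in whatever order the mapping yields them, so mappings with identical contents but different insertion order can pack to different bytes (which is also what the new assert_packed_msg_equal test helper later in this diff has to work around). A small sketch, assuming the msgpack package is installed; the keys below are illustrative:

    from collections import OrderedDict
    import msgpack

    a = OrderedDict([("vocab", b"..."), ("exceptions", b"...")])
    b = OrderedDict([("exceptions", b"..."), ("vocab", b"...")])

    # Same content, different key order: the packed bytes differ ...
    assert msgpack.dumps(a) != msgpack.dumps(b)
    # ... but the unpacked mappings still compare equal.
    assert msgpack.loads(msgpack.dumps(a)) == msgpack.loads(msgpack.dumps(b))

On Pythons of this era, plain dicts do not preserve insertion order, hence the explicit OrderedDict and the sorted() over the exception rules.
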
|  | @ -437,7 +437,8 @@ cdef class Doc: | ||||||
|         """ |         """ | ||||||
|         def __get__(self): |         def __get__(self): | ||||||
|             if 'sents' in self.user_hooks: |             if 'sents' in self.user_hooks: | ||||||
|                 return self.user_hooks['sents'](self) |                 yield from self.user_hooks['sents'](self) | ||||||
|  |                 return | ||||||
| 
 | 
 | ||||||
|             if not self.is_parsed: |             if not self.is_parsed: | ||||||
|                 raise ValueError( |                 raise ValueError( | ||||||
|  |  | ||||||
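
The Doc.sents change above replaces a bare return of the user hook's result with yield from plus an empty return. The point (shown here as a hedged, plain-Python illustration with made-up names, using Python 3 semantics; in Python 2 the return-with-value form would not even compile inside a generator): a function whose body yields is a generator, so returning a value hands nothing to the caller and the hook's sentences would be silently lost.

    def hook(doc):
        return ["sentence one", "sentence two"]

    def sents_wrong(doc, user_hooks):
        if 'sents' in user_hooks:
            return user_hooks['sents'](doc)   # discarded: this function is a generator
        yield "fallback sentence"

    def sents_right(doc, user_hooks):
        if 'sents' in user_hooks:
            yield from user_hooks['sents'](doc)
            return
        yield "fallback sentence"

    print(list(sents_wrong(None, {'sents': hook})))   # []
    print(list(sents_right(None, {'sents': hook})))   # ['sentence one', 'sentence two']
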
|  | @ -299,6 +299,22 @@ def compile_infix_regex(entries): | ||||||
|     return re.compile(expression) |     return re.compile(expression) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | def add_lookups(default_func, *lookups): | ||||||
|  |     """Extend an attribute function with special cases. If a word is in the | ||||||
|  |     lookups, the value is returned. Otherwise the previous function is used. | ||||||
|  | 
 | ||||||
|  |     default_func (callable): The default function to execute. | ||||||
|  |     *lookups (dict): Lookup dictionary mapping string to attribute value. | ||||||
|  |     RETURNS (callable): Lexical attribute getter. | ||||||
|  |     """ | ||||||
|  |     def get_attr(string): | ||||||
|  |         for lookup in lookups: | ||||||
|  |             if string in lookup: | ||||||
|  |                 return lookup[string] | ||||||
|  |         return default_func(string) | ||||||
|  |     return get_attr | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| def update_exc(base_exceptions, *addition_dicts): | def update_exc(base_exceptions, *addition_dicts): | ||||||
|     """Update and validate tokenizer exceptions. Will overwrite exceptions. |     """Update and validate tokenizer exceptions. Will overwrite exceptions. | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
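
A short usage sketch for the add_lookups helper added above: lookup tables are consulted in the order given, and the default getter only runs if none of them match. The import paths assume this commit's tree (both add_lookups and BASE_NORMS are introduced in this diff); the overrides table is invented for illustration.

    from spacy.lang.norm_exceptions import BASE_NORMS   # added in this commit
    from spacy.util import add_lookups                   # added in this commit

    # Language-specific overrides first, then the shared base table, then the default.
    overrides = {"cuz": "because"}
    get_norm = add_lookups(lambda string: string.lower(), overrides, BASE_NORMS)

    assert get_norm("cuz") == "because"   # matched by the first lookup
    assert get_norm("€") == "$"           # falls through to BASE_NORMS
    assert get_norm("spaCy") == "spacy"   # no table matches: default getter applies
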
|  | @ -231,11 +231,13 @@ cdef class Vocab: | ||||||
|             props = intify_attrs(props, strings_map=self.strings, _do_deprecated=True) |             props = intify_attrs(props, strings_map=self.strings, _do_deprecated=True) | ||||||
|             token = &tokens[i] |             token = &tokens[i] | ||||||
|             # Set the special tokens up to have arbitrary attributes |             # Set the special tokens up to have arbitrary attributes | ||||||
|             token.lex = <LexemeC*>self.get_by_orth(self.mem, props[attrs.ORTH]) |             lex = <LexemeC*>self.get_by_orth(self.mem, props[attrs.ORTH]) | ||||||
|  |             token.lex = lex | ||||||
|             if attrs.TAG in props: |             if attrs.TAG in props: | ||||||
|                 self.morphology.assign_tag(token, props[attrs.TAG]) |                 self.morphology.assign_tag(token, props[attrs.TAG]) | ||||||
|             for attr_id, value in props.items(): |             for attr_id, value in props.items(): | ||||||
|                 Token.set_struct_attr(token, attr_id, value) |                 Token.set_struct_attr(token, attr_id, value) | ||||||
|  |                 Lexeme.set_struct_attr(lex, attr_id, value) | ||||||
|         return tokens |         return tokens | ||||||
| 
 | 
 | ||||||
|     @property |     @property | ||||||
|  |  | ||||||
|  | @ -205,7 +205,7 @@ p | ||||||
|         +cell #[code arrow_spacing] |         +cell #[code arrow_spacing] | ||||||
|         +cell int |         +cell int | ||||||
|         +cell Spacing between arrows in px to avoid overlaps. |         +cell Spacing between arrows in px to avoid overlaps. | ||||||
|         +cell #[code 20] |         +cell #[code 20] / #[code 12] (compact) | ||||||
| 
 | 
 | ||||||
|     +row |     +row | ||||||
|         +cell #[code word_spacing] |         +cell #[code word_spacing] | ||||||
|  |  | ||||||
|  | @ -64,7 +64,7 @@ p | ||||||
|     doc = nlp(u'Give it back! He pleaded.') |     doc = nlp(u'Give it back! He pleaded.') | ||||||
|     assert doc[0].text == 'Give' |     assert doc[0].text == 'Give' | ||||||
|     assert doc[-1].text == '.' |     assert doc[-1].text == '.' | ||||||
|     span = doc[1:1] |     span = doc[1:3] | ||||||
|     assert span.text == 'it back' |     assert span.text == 'it back' | ||||||
| 
 | 
 | ||||||
| +table(["Name", "Type", "Description"]) | +table(["Name", "Type", "Description"]) | ||||||
|  |  | ||||||
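
The docs fix above corrects the span example: like Python list slices, a doc slice is end-exclusive, so doc[1:1] is empty while doc[1:3] covers tokens 1 and 2 ("it back"). A plain-list illustration of the same slicing rule:

    words = ['Give', 'it', 'back', '!', 'He', 'pleaded', '.']
    assert words[1:1] == []                # start == stop: empty slice
    assert words[1:3] == ['it', 'back']    # items 1 and 2; the stop index is exclusive
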
|  | @ -141,7 +141,7 @@ p | ||||||
|                 else: |                 else: | ||||||
|                     tokens.append(substring) |                     tokens.append(substring) | ||||||
|                     substring = '' |                     substring = '' | ||||||
|             tokens.extend(suffixes) |             tokens.extend(reversed(suffixes)) | ||||||
|             return tokens |             return tokens | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|  |  | ||||||
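
The reversed() fix above matters because the example tokenizer peels suffixes off the end of the substring, so they are collected in back-to-front order and must be reversed to restore surface order. A toy re-creation of just that step (string endswith checks stand in for the docs' regex-based suffix search):

    substring = "word)."
    suffixes = []
    # Peel suffixes off the end: "." comes off first, then ")".
    while substring.endswith((".", ")")):
        suffixes.append(substring[-1])
        substring = substring[:-1]

    tokens = [substring]
    tokens.extend(reversed(suffixes))      # restore left-to-right surface order
    print(tokens)                          # ['word', ')', '.']
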
|  | @ -59,9 +59,11 @@ p | ||||||
|     |  to customise the layout, for example: |     |  to customise the layout, for example: | ||||||
| 
 | 
 | ||||||
| +aside("Important note") | +aside("Important note") | ||||||
|     |  There's currently a known issue with the #[code compact] mode for long |     |  There's currently a known issue with the #[code compact] mode for | ||||||
|     |  sentences with arrow spacing. If the spacing is larger than the arc |     |  sentences with short arrows and long dependency labels, which causes labels | ||||||
|     |  itself, it'll cause the arc and its label to flip. |     |  longer than the arrow to wrap. So if you come across this problem, | ||||||
|  |     |  especially when using custom labels, you'll have to increase the | ||||||
|  |     |  #[code distance] setting in the #[code options] to allow longer arcs. | ||||||
| 
 | 
 | ||||||
| +table(["Name", "Type", "Description", "Default"]) | +table(["Name", "Type", "Description", "Default"]) | ||||||
|     +row |     +row | ||||||
|  |  | ||||||