mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-31 16:07:41 +03:00 
			
		
		
		
	Tidy up and auto-format
This commit is contained in:
		
							parent
							
								
									a741de7cf6
								
							
						
					
					
						commit
						cb4145adc7
					
				|  | @ -4249,20 +4249,20 @@ TAG_MAP = { | ||||||
|         "Voice": "Act", |         "Voice": "Act", | ||||||
|         "Case": "Nom|Gen|Dat|Acc|Voc", |         "Case": "Nom|Gen|Dat|Acc|Voc", | ||||||
|     }, |     }, | ||||||
|     'ADJ': {POS: ADJ}, |     "ADJ": {POS: ADJ}, | ||||||
|     'ADP': {POS: ADP}, |     "ADP": {POS: ADP}, | ||||||
|     'ADV': {POS: ADV}, |     "ADV": {POS: ADV}, | ||||||
|     'AtDf': {POS: DET}, |     "AtDf": {POS: DET}, | ||||||
|     'AUX': {POS: AUX}, |     "AUX": {POS: AUX}, | ||||||
|     'CCONJ': {POS: CCONJ}, |     "CCONJ": {POS: CCONJ}, | ||||||
|     'DET': {POS: DET}, |     "DET": {POS: DET}, | ||||||
|     'NOUN': {POS: NOUN}, |     "NOUN": {POS: NOUN}, | ||||||
|     'NUM': {POS: NUM}, |     "NUM": {POS: NUM}, | ||||||
|     'PART': {POS: PART}, |     "PART": {POS: PART}, | ||||||
|     'PRON': {POS: PRON}, |     "PRON": {POS: PRON}, | ||||||
|     'PROPN': {POS: PROPN}, |     "PROPN": {POS: PROPN}, | ||||||
|     'SCONJ': {POS: SCONJ}, |     "SCONJ": {POS: SCONJ}, | ||||||
|     'SYM': {POS: SYM}, |     "SYM": {POS: SYM}, | ||||||
|     'VERB': {POS: VERB}, |     "VERB": {POS: VERB}, | ||||||
|     'X': {POS: X}, |     "X": {POS: X}, | ||||||
| } | } | ||||||
|  |  | ||||||
|  | @ -16,7 +16,8 @@ from ...util import DummyTokenizer | ||||||
| # the flow by creating a dummy with the same interface. | # the flow by creating a dummy with the same interface. | ||||||
| DummyNode = namedtuple("DummyNode", ["surface", "pos", "feature"]) | DummyNode = namedtuple("DummyNode", ["surface", "pos", "feature"]) | ||||||
| DummyNodeFeatures = namedtuple("DummyNodeFeatures", ["lemma"]) | DummyNodeFeatures = namedtuple("DummyNodeFeatures", ["lemma"]) | ||||||
| DummySpace = DummyNode(' ', ' ', DummyNodeFeatures(' ')) | DummySpace = DummyNode(" ", " ", DummyNodeFeatures(" ")) | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| def try_fugashi_import(): | def try_fugashi_import(): | ||||||
|     """Fugashi is required for Japanese support, so check for it. |     """Fugashi is required for Japanese support, so check for it. | ||||||
|  | @ -27,8 +28,7 @@ def try_fugashi_import(): | ||||||
|         return fugashi |         return fugashi | ||||||
|     except ImportError: |     except ImportError: | ||||||
|         raise ImportError( |         raise ImportError( | ||||||
|             "Japanese support requires Fugashi: " |             "Japanese support requires Fugashi: " "https://github.com/polm/fugashi" | ||||||
|             "https://github.com/polm/fugashi" |  | ||||||
|         ) |         ) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | @ -55,13 +55,14 @@ def resolve_pos(token): | ||||||
|         return token.pos + ",ADJ" |         return token.pos + ",ADJ" | ||||||
|     return token.pos |     return token.pos | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
| def get_words_and_spaces(tokenizer, text): | def get_words_and_spaces(tokenizer, text): | ||||||
|     """Get the individual tokens that make up the sentence and handle white space. |     """Get the individual tokens that make up the sentence and handle white space. | ||||||
| 
 | 
 | ||||||
|     Japanese doesn't usually use white space, and MeCab's handling of it for |     Japanese doesn't usually use white space, and MeCab's handling of it for | ||||||
|     multiple spaces in a row is somewhat awkward. |     multiple spaces in a row is somewhat awkward. | ||||||
|     """ |     """ | ||||||
|      | 
 | ||||||
|     tokens = tokenizer.parseToNodeList(text) |     tokens = tokenizer.parseToNodeList(text) | ||||||
| 
 | 
 | ||||||
|     words = [] |     words = [] | ||||||
|  | @ -76,6 +77,7 @@ def get_words_and_spaces(tokenizer, text): | ||||||
|         spaces.append(bool(token.white_space)) |         spaces.append(bool(token.white_space)) | ||||||
|     return words, spaces |     return words, spaces | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
| class JapaneseTokenizer(DummyTokenizer): | class JapaneseTokenizer(DummyTokenizer): | ||||||
|     def __init__(self, cls, nlp=None): |     def __init__(self, cls, nlp=None): | ||||||
|         self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp) |         self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp) | ||||||
|  |  | ||||||
|  | @ -1,8 +1,7 @@ | ||||||
| # coding: utf8 | # coding: utf8 | ||||||
| from __future__ import unicode_literals | from __future__ import unicode_literals | ||||||
| 
 | 
 | ||||||
| from ..char_classes import LIST_ELLIPSES, LIST_ICONS | from ..char_classes import LIST_ELLIPSES, LIST_ICONS, ALPHA, ALPHA_LOWER, ALPHA_UPPER | ||||||
| from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER |  | ||||||
| 
 | 
 | ||||||
| ELISION = " ' ’ ".strip().replace(" ", "") | ELISION = " ' ’ ".strip().replace(" ", "") | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -20,7 +20,7 @@ for exc_data in [ | ||||||
|     {ORTH: "asw.", LEMMA: "an sou weider", NORM: "an sou weider"}, |     {ORTH: "asw.", LEMMA: "an sou weider", NORM: "an sou weider"}, | ||||||
|     {ORTH: "etc.", LEMMA: "et cetera", NORM: "et cetera"}, |     {ORTH: "etc.", LEMMA: "et cetera", NORM: "et cetera"}, | ||||||
|     {ORTH: "bzw.", LEMMA: "bezéiungsweis", NORM: "bezéiungsweis"}, |     {ORTH: "bzw.", LEMMA: "bezéiungsweis", NORM: "bezéiungsweis"}, | ||||||
|     {ORTH: "Jan.", LEMMA: "Januar", NORM: "Januar"} |     {ORTH: "Jan.", LEMMA: "Januar", NORM: "Januar"}, | ||||||
| ]: | ]: | ||||||
|     _exc[exc_data[ORTH]] = [exc_data] |     _exc[exc_data[ORTH]] = [exc_data] | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -467,38 +467,110 @@ TAG_MAP = { | ||||||
|     "VERB__VerbForm=Part": {"morph": "VerbForm=Part", POS: VERB}, |     "VERB__VerbForm=Part": {"morph": "VerbForm=Part", POS: VERB}, | ||||||
|     "VERB___": {"morph": "_", POS: VERB}, |     "VERB___": {"morph": "_", POS: VERB}, | ||||||
|     "X___": {"morph": "_", POS: X}, |     "X___": {"morph": "_", POS: X}, | ||||||
|     'CCONJ___': {"morph": "_", POS: CCONJ}, |     "CCONJ___": {"morph": "_", POS: CCONJ}, | ||||||
|     "ADJ__Abbr=Yes": {"morph": "Abbr=Yes", POS: ADJ}, |     "ADJ__Abbr=Yes": {"morph": "Abbr=Yes", POS: ADJ}, | ||||||
|     "ADJ__Abbr=Yes|Degree=Pos": {"morph": "Abbr=Yes|Degree=Pos", POS: ADJ}, |     "ADJ__Abbr=Yes|Degree=Pos": {"morph": "Abbr=Yes|Degree=Pos", POS: ADJ}, | ||||||
|     "ADJ__Case=Gen|Definite=Def|Number=Sing|VerbForm=Part": {"morph": "Case=Gen|Definite=Def|Number=Sing|VerbForm=Part", POS: ADJ}, |     "ADJ__Case=Gen|Definite=Def|Number=Sing|VerbForm=Part": { | ||||||
|     "ADJ__Definite=Def|Number=Sing|VerbForm=Part": {"morph": "Definite=Def|Number=Sing|VerbForm=Part", POS: ADJ}, |         "morph": "Case=Gen|Definite=Def|Number=Sing|VerbForm=Part", | ||||||
|     "ADJ__Definite=Ind|Gender=Masc|Number=Sing|VerbForm=Part": {"morph": "Definite=Ind|Gender=Masc|Number=Sing|VerbForm=Part", POS: ADJ}, |         POS: ADJ, | ||||||
|     "ADJ__Definite=Ind|Gender=Neut|Number=Sing|VerbForm=Part": {"morph": "Definite=Ind|Gender=Neut|Number=Sing|VerbForm=Part", POS: ADJ}, |     }, | ||||||
|     "ADJ__Definite=Ind|Number=Sing|VerbForm=Part": {"morph": "Definite=Ind|Number=Sing|VerbForm=Part", POS: ADJ}, |     "ADJ__Definite=Def|Number=Sing|VerbForm=Part": { | ||||||
|  |         "morph": "Definite=Def|Number=Sing|VerbForm=Part", | ||||||
|  |         POS: ADJ, | ||||||
|  |     }, | ||||||
|  |     "ADJ__Definite=Ind|Gender=Masc|Number=Sing|VerbForm=Part": { | ||||||
|  |         "morph": "Definite=Ind|Gender=Masc|Number=Sing|VerbForm=Part", | ||||||
|  |         POS: ADJ, | ||||||
|  |     }, | ||||||
|  |     "ADJ__Definite=Ind|Gender=Neut|Number=Sing|VerbForm=Part": { | ||||||
|  |         "morph": "Definite=Ind|Gender=Neut|Number=Sing|VerbForm=Part", | ||||||
|  |         POS: ADJ, | ||||||
|  |     }, | ||||||
|  |     "ADJ__Definite=Ind|Number=Sing|VerbForm=Part": { | ||||||
|  |         "morph": "Definite=Ind|Number=Sing|VerbForm=Part", | ||||||
|  |         POS: ADJ, | ||||||
|  |     }, | ||||||
|     "ADJ__Number=Sing|VerbForm=Part": {"morph": "Number=Sing|VerbForm=Part", POS: ADJ}, |     "ADJ__Number=Sing|VerbForm=Part": {"morph": "Number=Sing|VerbForm=Part", POS: ADJ}, | ||||||
|     "ADJ__VerbForm=Part": {"morph": "VerbForm=Part", POS: ADJ}, |     "ADJ__VerbForm=Part": {"morph": "VerbForm=Part", POS: ADJ}, | ||||||
|     "ADP__Abbr=Yes": {"morph": "Abbr=Yes", POS: ADP}, |     "ADP__Abbr=Yes": {"morph": "Abbr=Yes", POS: ADP}, | ||||||
|     "ADV__Abbr=Yes": {"morph": "Abbr=Yes", POS: ADV}, |     "ADV__Abbr=Yes": {"morph": "Abbr=Yes", POS: ADV}, | ||||||
|     "DET__Case=Gen|Gender=Masc|Number=Sing|PronType=Art": {"morph": "Case=Gen|Gender=Masc|Number=Sing|PronType=Art", POS: DET}, |     "DET__Case=Gen|Gender=Masc|Number=Sing|PronType=Art": { | ||||||
|     "DET__Case=Gen|Number=Plur|PronType=Tot": {"morph": "Case=Gen|Number=Plur|PronType=Tot", POS: DET}, |         "morph": "Case=Gen|Gender=Masc|Number=Sing|PronType=Art", | ||||||
|  |         POS: DET, | ||||||
|  |     }, | ||||||
|  |     "DET__Case=Gen|Number=Plur|PronType=Tot": { | ||||||
|  |         "morph": "Case=Gen|Number=Plur|PronType=Tot", | ||||||
|  |         POS: DET, | ||||||
|  |     }, | ||||||
|     "DET__Definite=Def|PronType=Prs": {"morph": "Definite=Def|PronType=Prs", POS: DET}, |     "DET__Definite=Def|PronType=Prs": {"morph": "Definite=Def|PronType=Prs", POS: DET}, | ||||||
|     "DET__Definite=Ind|Gender=Fem|Number=Sing|PronType=Prs": {"morph": "Definite=Ind|Gender=Fem|Number=Sing|PronType=Prs", POS: DET}, |     "DET__Definite=Ind|Gender=Fem|Number=Sing|PronType=Prs": { | ||||||
|     "DET__Definite=Ind|Gender=Masc|Number=Sing|PronType=Prs": {"morph": "Definite=Ind|Gender=Masc|Number=Sing|PronType=Prs", POS: DET}, |         "morph": "Definite=Ind|Gender=Fem|Number=Sing|PronType=Prs", | ||||||
|     "DET__Definite=Ind|Gender=Neut|Number=Sing|PronType=Prs": {"morph": "Definite=Ind|Gender=Neut|Number=Sing|PronType=Prs", POS: DET}, |         POS: DET, | ||||||
|     "DET__Gender=Fem|Number=Sing|PronType=Art": {"morph": "Gender=Fem|Number=Sing|PronType=Art", POS: DET}, |     }, | ||||||
|     "DET__Gender=Fem|Number=Sing|PronType=Ind": {"morph": "Gender=Fem|Number=Sing|PronType=Ind", POS: DET}, |     "DET__Definite=Ind|Gender=Masc|Number=Sing|PronType=Prs": { | ||||||
|     "DET__Gender=Fem|Number=Sing|PronType=Prs": {"morph": "Gender=Fem|Number=Sing|PronType=Prs", POS: DET}, |         "morph": "Definite=Ind|Gender=Masc|Number=Sing|PronType=Prs", | ||||||
|     "DET__Gender=Fem|Number=Sing|PronType=Tot": {"morph": "Gender=Fem|Number=Sing|PronType=Tot", POS: DET}, |         POS: DET, | ||||||
|     "DET__Gender=Masc|Number=Sing|Polarity=Neg|PronType=Neg": {"morph": "Gender=Masc|Number=Sing|Polarity=Neg|PronType=Neg", POS: DET}, |     }, | ||||||
|     "DET__Gender=Masc|Number=Sing|PronType=Art": {"morph": "Gender=Masc|Number=Sing|PronType=Art", POS: DET}, |     "DET__Definite=Ind|Gender=Neut|Number=Sing|PronType=Prs": { | ||||||
|     "DET__Gender=Masc|Number=Sing|PronType=Ind": {"morph": "Gender=Masc|Number=Sing|PronType=Ind", POS: DET}, |         "morph": "Definite=Ind|Gender=Neut|Number=Sing|PronType=Prs", | ||||||
|     "DET__Gender=Masc|Number=Sing|PronType=Tot": {"morph": "Gender=Masc|Number=Sing|PronType=Tot", POS: DET}, |         POS: DET, | ||||||
|     "DET__Gender=Neut|Number=Sing|Polarity=Neg|PronType=Neg": {"morph": "Gender=Neut|Number=Sing|Polarity=Neg|PronType=Neg", POS: DET}, |     }, | ||||||
|     "DET__Gender=Neut|Number=Sing|PronType=Art": {"morph": "Gender=Neut|Number=Sing|PronType=Art", POS: DET}, |     "DET__Gender=Fem|Number=Sing|PronType=Art": { | ||||||
|     "DET__Gender=Neut|Number=Sing|PronType=Dem,Ind": {"morph": "Gender=Neut|Number=Sing|PronType=Dem,Ind", POS: DET}, |         "morph": "Gender=Fem|Number=Sing|PronType=Art", | ||||||
|     "DET__Gender=Neut|Number=Sing|PronType=Ind": {"morph": "Gender=Neut|Number=Sing|PronType=Ind", POS: DET}, |         POS: DET, | ||||||
|     "DET__Gender=Neut|Number=Sing|PronType=Tot": {"morph": "Gender=Neut|Number=Sing|PronType=Tot", POS: DET}, |     }, | ||||||
|     "DET__Number=Plur|Polarity=Neg|PronType=Neg": {"morph": "Number=Plur|Polarity=Neg|PronType=Neg", POS: DET}, |     "DET__Gender=Fem|Number=Sing|PronType=Ind": { | ||||||
|  |         "morph": "Gender=Fem|Number=Sing|PronType=Ind", | ||||||
|  |         POS: DET, | ||||||
|  |     }, | ||||||
|  |     "DET__Gender=Fem|Number=Sing|PronType=Prs": { | ||||||
|  |         "morph": "Gender=Fem|Number=Sing|PronType=Prs", | ||||||
|  |         POS: DET, | ||||||
|  |     }, | ||||||
|  |     "DET__Gender=Fem|Number=Sing|PronType=Tot": { | ||||||
|  |         "morph": "Gender=Fem|Number=Sing|PronType=Tot", | ||||||
|  |         POS: DET, | ||||||
|  |     }, | ||||||
|  |     "DET__Gender=Masc|Number=Sing|Polarity=Neg|PronType=Neg": { | ||||||
|  |         "morph": "Gender=Masc|Number=Sing|Polarity=Neg|PronType=Neg", | ||||||
|  |         POS: DET, | ||||||
|  |     }, | ||||||
|  |     "DET__Gender=Masc|Number=Sing|PronType=Art": { | ||||||
|  |         "morph": "Gender=Masc|Number=Sing|PronType=Art", | ||||||
|  |         POS: DET, | ||||||
|  |     }, | ||||||
|  |     "DET__Gender=Masc|Number=Sing|PronType=Ind": { | ||||||
|  |         "morph": "Gender=Masc|Number=Sing|PronType=Ind", | ||||||
|  |         POS: DET, | ||||||
|  |     }, | ||||||
|  |     "DET__Gender=Masc|Number=Sing|PronType=Tot": { | ||||||
|  |         "morph": "Gender=Masc|Number=Sing|PronType=Tot", | ||||||
|  |         POS: DET, | ||||||
|  |     }, | ||||||
|  |     "DET__Gender=Neut|Number=Sing|Polarity=Neg|PronType=Neg": { | ||||||
|  |         "morph": "Gender=Neut|Number=Sing|Polarity=Neg|PronType=Neg", | ||||||
|  |         POS: DET, | ||||||
|  |     }, | ||||||
|  |     "DET__Gender=Neut|Number=Sing|PronType=Art": { | ||||||
|  |         "morph": "Gender=Neut|Number=Sing|PronType=Art", | ||||||
|  |         POS: DET, | ||||||
|  |     }, | ||||||
|  |     "DET__Gender=Neut|Number=Sing|PronType=Dem,Ind": { | ||||||
|  |         "morph": "Gender=Neut|Number=Sing|PronType=Dem,Ind", | ||||||
|  |         POS: DET, | ||||||
|  |     }, | ||||||
|  |     "DET__Gender=Neut|Number=Sing|PronType=Ind": { | ||||||
|  |         "morph": "Gender=Neut|Number=Sing|PronType=Ind", | ||||||
|  |         POS: DET, | ||||||
|  |     }, | ||||||
|  |     "DET__Gender=Neut|Number=Sing|PronType=Tot": { | ||||||
|  |         "morph": "Gender=Neut|Number=Sing|PronType=Tot", | ||||||
|  |         POS: DET, | ||||||
|  |     }, | ||||||
|  |     "DET__Number=Plur|Polarity=Neg|PronType=Neg": { | ||||||
|  |         "morph": "Number=Plur|Polarity=Neg|PronType=Neg", | ||||||
|  |         POS: DET, | ||||||
|  |     }, | ||||||
|     "DET__Number=Plur|PronType=Art": {"morph": "Number=Plur|PronType=Art", POS: DET}, |     "DET__Number=Plur|PronType=Art": {"morph": "Number=Plur|PronType=Art", POS: DET}, | ||||||
|     "DET__Number=Plur|PronType=Ind": {"morph": "Number=Plur|PronType=Ind", POS: DET}, |     "DET__Number=Plur|PronType=Ind": {"morph": "Number=Plur|PronType=Ind", POS: DET}, | ||||||
|     "DET__Number=Plur|PronType=Prs": {"morph": "Number=Plur|PronType=Prs", POS: DET}, |     "DET__Number=Plur|PronType=Prs": {"morph": "Number=Plur|PronType=Prs", POS: DET}, | ||||||
|  | @ -507,57 +579,183 @@ TAG_MAP = { | ||||||
|     "DET__PronType=Prs": {"morph": "PronType=Prs", POS: DET}, |     "DET__PronType=Prs": {"morph": "PronType=Prs", POS: DET}, | ||||||
|     "NOUN__Abbr=Yes": {"morph": "Abbr=Yes", POS: NOUN}, |     "NOUN__Abbr=Yes": {"morph": "Abbr=Yes", POS: NOUN}, | ||||||
|     "NOUN__Abbr=Yes|Case=Gen": {"morph": "Abbr=Yes|Case=Gen", POS: NOUN}, |     "NOUN__Abbr=Yes|Case=Gen": {"morph": "Abbr=Yes|Case=Gen", POS: NOUN}, | ||||||
|     "NOUN__Abbr=Yes|Definite=Def,Ind|Gender=Masc|Number=Plur,Sing": {"morph": "Abbr=Yes|Definite=Def,Ind|Gender=Masc|Number=Plur,Sing", POS: NOUN}, |     "NOUN__Abbr=Yes|Definite=Def,Ind|Gender=Masc|Number=Plur,Sing": { | ||||||
|     "NOUN__Abbr=Yes|Definite=Def,Ind|Gender=Masc|Number=Sing": {"morph": "Abbr=Yes|Definite=Def,Ind|Gender=Masc|Number=Sing", POS: NOUN}, |         "morph": "Abbr=Yes|Definite=Def,Ind|Gender=Masc|Number=Plur,Sing", | ||||||
|     "NOUN__Abbr=Yes|Definite=Def,Ind|Gender=Neut|Number=Plur,Sing": {"morph": "Abbr=Yes|Definite=Def,Ind|Gender=Neut|Number=Plur,Sing", POS: NOUN}, |         POS: NOUN, | ||||||
|  |     }, | ||||||
|  |     "NOUN__Abbr=Yes|Definite=Def,Ind|Gender=Masc|Number=Sing": { | ||||||
|  |         "morph": "Abbr=Yes|Definite=Def,Ind|Gender=Masc|Number=Sing", | ||||||
|  |         POS: NOUN, | ||||||
|  |     }, | ||||||
|  |     "NOUN__Abbr=Yes|Definite=Def,Ind|Gender=Neut|Number=Plur,Sing": { | ||||||
|  |         "morph": "Abbr=Yes|Definite=Def,Ind|Gender=Neut|Number=Plur,Sing", | ||||||
|  |         POS: NOUN, | ||||||
|  |     }, | ||||||
|     "NOUN__Abbr=Yes|Gender=Masc": {"morph": "Abbr=Yes|Gender=Masc", POS: NOUN}, |     "NOUN__Abbr=Yes|Gender=Masc": {"morph": "Abbr=Yes|Gender=Masc", POS: NOUN}, | ||||||
|     "NUM__Case=Gen|Number=Plur|NumType=Card": {"morph": "Case=Gen|Number=Plur|NumType=Card", POS: NUM}, |     "NUM__Case=Gen|Number=Plur|NumType=Card": { | ||||||
|     "NUM__Definite=Def|Number=Sing|NumType=Card": {"morph": "Definite=Def|Number=Sing|NumType=Card", POS: NUM}, |         "morph": "Case=Gen|Number=Plur|NumType=Card", | ||||||
|  |         POS: NUM, | ||||||
|  |     }, | ||||||
|  |     "NUM__Definite=Def|Number=Sing|NumType=Card": { | ||||||
|  |         "morph": "Definite=Def|Number=Sing|NumType=Card", | ||||||
|  |         POS: NUM, | ||||||
|  |     }, | ||||||
|     "NUM__Definite=Def|NumType=Card": {"morph": "Definite=Def|NumType=Card", POS: NUM}, |     "NUM__Definite=Def|NumType=Card": {"morph": "Definite=Def|NumType=Card", POS: NUM}, | ||||||
|     "NUM__Gender=Fem|Number=Sing|NumType=Card": {"morph": "Gender=Fem|Number=Sing|NumType=Card", POS: NUM}, |     "NUM__Gender=Fem|Number=Sing|NumType=Card": { | ||||||
|     "NUM__Gender=Masc|Number=Sing|NumType=Card": {"morph": "Gender=Masc|Number=Sing|NumType=Card", POS: NUM}, |         "morph": "Gender=Fem|Number=Sing|NumType=Card", | ||||||
|     "NUM__Gender=Neut|Number=Sing|NumType=Card": {"morph": "Gender=Neut|Number=Sing|NumType=Card", POS: NUM}, |         POS: NUM, | ||||||
|  |     }, | ||||||
|  |     "NUM__Gender=Masc|Number=Sing|NumType=Card": { | ||||||
|  |         "morph": "Gender=Masc|Number=Sing|NumType=Card", | ||||||
|  |         POS: NUM, | ||||||
|  |     }, | ||||||
|  |     "NUM__Gender=Neut|Number=Sing|NumType=Card": { | ||||||
|  |         "morph": "Gender=Neut|Number=Sing|NumType=Card", | ||||||
|  |         POS: NUM, | ||||||
|  |     }, | ||||||
|     "NUM__Number=Plur|NumType=Card": {"morph": "Number=Plur|NumType=Card", POS: NUM}, |     "NUM__Number=Plur|NumType=Card": {"morph": "Number=Plur|NumType=Card", POS: NUM}, | ||||||
|     "NUM__Number=Sing|NumType=Card": {"morph": "Number=Sing|NumType=Card", POS: NUM}, |     "NUM__Number=Sing|NumType=Card": {"morph": "Number=Sing|NumType=Card", POS: NUM}, | ||||||
|     "NUM__NumType=Card": {"morph": "NumType=Card", POS: NUM}, |     "NUM__NumType=Card": {"morph": "NumType=Card", POS: NUM}, | ||||||
|     "PART__Polarity=Neg": {"morph": "Polarity=Neg", POS: PART}, |     "PART__Polarity=Neg": {"morph": "Polarity=Neg", POS: PART}, | ||||||
|     "PRON__Animacy=Hum|Case=Acc|Gender=Fem|Number=Sing|Person=3|PronType=Prs": { "morph": "Animacy=Hum|Case=Acc|Gender=Fem|Number=Sing|Person=3|PronType=Prs", POS: PRON}, |     "PRON__Animacy=Hum|Case=Acc|Gender=Fem|Number=Sing|Person=3|PronType=Prs": { | ||||||
|     "PRON__Animacy=Hum|Case=Acc|Gender=Masc|Number=Sing|Person=3|PronType=Prs": { "morph": "Animacy=Hum|Case=Acc|Gender=Masc|Number=Sing|Person=3|PronType=Prs", POS: PRON}, |         "morph": "Animacy=Hum|Case=Acc|Gender=Fem|Number=Sing|Person=3|PronType=Prs", | ||||||
|     "PRON__Animacy=Hum|Case=Acc|Number=Plur|Person=1|PronType=Prs": {"morph": "Animacy=Hum|Case=Acc|Number=Plur|Person=1|PronType=Prs", POS: PRON}, |         POS: PRON, | ||||||
|     "PRON__Animacy=Hum|Case=Acc|Number=Plur|Person=2|PronType=Prs": {"morph": "Animacy=Hum|Case=Acc|Number=Plur|Person=2|PronType=Prs", POS: PRON}, |     }, | ||||||
|     "PRON__Animacy=Hum|Case=Acc|Number=Sing|Person=1|PronType=Prs": {"morph": "Animacy=Hum|Case=Acc|Number=Sing|Person=1|PronType=Prs", POS: PRON}, |     "PRON__Animacy=Hum|Case=Acc|Gender=Masc|Number=Sing|Person=3|PronType=Prs": { | ||||||
|     "PRON__Animacy=Hum|Case=Acc|Number=Sing|Person=2|PronType=Prs": {"morph": "Animacy=Hum|Case=Acc|Number=Sing|Person=2|PronType=Prs", POS: PRON}, |         "morph": "Animacy=Hum|Case=Acc|Gender=Masc|Number=Sing|Person=3|PronType=Prs", | ||||||
|     "PRON__Animacy=Hum|Case=Gen,Nom|Number=Sing|PronType=Art,Prs": {"morph": "Animacy=Hum|Case=Gen,Nom|Number=Sing|PronType=Art,Prs", POS: PRON}, |         POS: PRON, | ||||||
|     "PRON__Animacy=Hum|Case=Gen|Number=Sing|PronType=Art,Prs": {"morph": "Animacy=Hum|Case=Gen|Number=Sing|PronType=Art,Prs", POS: PRON}, |     }, | ||||||
|     "PRON__Animacy=Hum|Case=Nom|Gender=Fem|Number=Sing|Person=3|PronType=Prs": { "morph": "Animacy=Hum|Case=Nom|Gender=Fem|Number=Sing|Person=3|PronType=Prs", POS: PRON}, |     "PRON__Animacy=Hum|Case=Acc|Number=Plur|Person=1|PronType=Prs": { | ||||||
|     "PRON__Animacy=Hum|Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs": { "morph": "Animacy=Hum|Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs", POS: PRON}, |         "morph": "Animacy=Hum|Case=Acc|Number=Plur|Person=1|PronType=Prs", | ||||||
|     "PRON__Animacy=Hum|Case=Nom|Number=Plur|Person=1|PronType=Prs": {"morph": "Animacy=Hum|Case=Nom|Number=Plur|Person=1|PronType=Prs", POS: PRON}, |         POS: PRON, | ||||||
|     "PRON__Animacy=Hum|Case=Nom|Number=Plur|Person=2|PronType=Prs": {"morph": "Animacy=Hum|Case=Nom|Number=Plur|Person=2|PronType=Prs", POS: PRON}, |     }, | ||||||
|     "PRON__Animacy=Hum|Case=Nom|Number=Sing|Person=1|PronType=Prs": {"morph": "Animacy=Hum|Case=Nom|Number=Sing|Person=1|PronType=Prs", POS: PRON}, |     "PRON__Animacy=Hum|Case=Acc|Number=Plur|Person=2|PronType=Prs": { | ||||||
|     "PRON__Animacy=Hum|Case=Nom|Number=Sing|Person=2|PronType=Prs": {"morph": "Animacy=Hum|Case=Nom|Number=Sing|Person=2|PronType=Prs", POS: PRON}, |         "morph": "Animacy=Hum|Case=Acc|Number=Plur|Person=2|PronType=Prs", | ||||||
|     "PRON__Animacy=Hum|Case=Nom|Number=Sing|PronType=Prs": {"morph": "Animacy=Hum|Case=Nom|Number=Sing|PronType=Prs", POS: PRON}, |         POS: PRON, | ||||||
|     "PRON__Animacy=Hum|Number=Plur|PronType=Rcp": {"morph": "Animacy=Hum|Number=Plur|PronType=Rcp", POS: PRON}, |     }, | ||||||
|     "PRON__Animacy=Hum|Number=Sing|PronType=Art,Prs": {"morph": "Animacy=Hum|Number=Sing|PronType=Art,Prs", POS: PRON}, |     "PRON__Animacy=Hum|Case=Acc|Number=Sing|Person=1|PronType=Prs": { | ||||||
|     "PRON__Animacy=Hum|Poss=Yes|PronType=Int": {"morph": "Animacy=Hum|Poss=Yes|PronType=Int", POS: PRON}, |         "morph": "Animacy=Hum|Case=Acc|Number=Sing|Person=1|PronType=Prs", | ||||||
|  |         POS: PRON, | ||||||
|  |     }, | ||||||
|  |     "PRON__Animacy=Hum|Case=Acc|Number=Sing|Person=2|PronType=Prs": { | ||||||
|  |         "morph": "Animacy=Hum|Case=Acc|Number=Sing|Person=2|PronType=Prs", | ||||||
|  |         POS: PRON, | ||||||
|  |     }, | ||||||
|  |     "PRON__Animacy=Hum|Case=Gen,Nom|Number=Sing|PronType=Art,Prs": { | ||||||
|  |         "morph": "Animacy=Hum|Case=Gen,Nom|Number=Sing|PronType=Art,Prs", | ||||||
|  |         POS: PRON, | ||||||
|  |     }, | ||||||
|  |     "PRON__Animacy=Hum|Case=Gen|Number=Sing|PronType=Art,Prs": { | ||||||
|  |         "morph": "Animacy=Hum|Case=Gen|Number=Sing|PronType=Art,Prs", | ||||||
|  |         POS: PRON, | ||||||
|  |     }, | ||||||
|  |     "PRON__Animacy=Hum|Case=Nom|Gender=Fem|Number=Sing|Person=3|PronType=Prs": { | ||||||
|  |         "morph": "Animacy=Hum|Case=Nom|Gender=Fem|Number=Sing|Person=3|PronType=Prs", | ||||||
|  |         POS: PRON, | ||||||
|  |     }, | ||||||
|  |     "PRON__Animacy=Hum|Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs": { | ||||||
|  |         "morph": "Animacy=Hum|Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs", | ||||||
|  |         POS: PRON, | ||||||
|  |     }, | ||||||
|  |     "PRON__Animacy=Hum|Case=Nom|Number=Plur|Person=1|PronType=Prs": { | ||||||
|  |         "morph": "Animacy=Hum|Case=Nom|Number=Plur|Person=1|PronType=Prs", | ||||||
|  |         POS: PRON, | ||||||
|  |     }, | ||||||
|  |     "PRON__Animacy=Hum|Case=Nom|Number=Plur|Person=2|PronType=Prs": { | ||||||
|  |         "morph": "Animacy=Hum|Case=Nom|Number=Plur|Person=2|PronType=Prs", | ||||||
|  |         POS: PRON, | ||||||
|  |     }, | ||||||
|  |     "PRON__Animacy=Hum|Case=Nom|Number=Sing|Person=1|PronType=Prs": { | ||||||
|  |         "morph": "Animacy=Hum|Case=Nom|Number=Sing|Person=1|PronType=Prs", | ||||||
|  |         POS: PRON, | ||||||
|  |     }, | ||||||
|  |     "PRON__Animacy=Hum|Case=Nom|Number=Sing|Person=2|PronType=Prs": { | ||||||
|  |         "morph": "Animacy=Hum|Case=Nom|Number=Sing|Person=2|PronType=Prs", | ||||||
|  |         POS: PRON, | ||||||
|  |     }, | ||||||
|  |     "PRON__Animacy=Hum|Case=Nom|Number=Sing|PronType=Prs": { | ||||||
|  |         "morph": "Animacy=Hum|Case=Nom|Number=Sing|PronType=Prs", | ||||||
|  |         POS: PRON, | ||||||
|  |     }, | ||||||
|  |     "PRON__Animacy=Hum|Number=Plur|PronType=Rcp": { | ||||||
|  |         "morph": "Animacy=Hum|Number=Plur|PronType=Rcp", | ||||||
|  |         POS: PRON, | ||||||
|  |     }, | ||||||
|  |     "PRON__Animacy=Hum|Number=Sing|PronType=Art,Prs": { | ||||||
|  |         "morph": "Animacy=Hum|Number=Sing|PronType=Art,Prs", | ||||||
|  |         POS: PRON, | ||||||
|  |     }, | ||||||
|  |     "PRON__Animacy=Hum|Poss=Yes|PronType=Int": { | ||||||
|  |         "morph": "Animacy=Hum|Poss=Yes|PronType=Int", | ||||||
|  |         POS: PRON, | ||||||
|  |     }, | ||||||
|     "PRON__Animacy=Hum|PronType=Int": {"morph": "Animacy=Hum|PronType=Int", POS: PRON}, |     "PRON__Animacy=Hum|PronType=Int": {"morph": "Animacy=Hum|PronType=Int", POS: PRON}, | ||||||
|     "PRON__Case=Acc|PronType=Prs|Reflex=Yes": {"morph": "Case=Acc|PronType=Prs|Reflex=Yes", POS: PRON}, |     "PRON__Case=Acc|PronType=Prs|Reflex=Yes": { | ||||||
|     "PRON__Gender=Fem,Masc|Number=Sing|Person=3|Polarity=Neg|PronType=Neg,Prs": { "morph": "Gender=Fem,Masc|Number=Sing|Person=3|Polarity=Neg|PronType=Neg,Prs", POS: PRON}, |         "morph": "Case=Acc|PronType=Prs|Reflex=Yes", | ||||||
|     "PRON__Gender=Fem,Masc|Number=Sing|Person=3|PronType=Ind,Prs": {"morph": "Gender=Fem,Masc|Number=Sing|Person=3|PronType=Ind,Prs", POS: PRON}, |         POS: PRON, | ||||||
|     "PRON__Gender=Fem,Masc|Number=Sing|Person=3|PronType=Prs,Tot": {"morph": "Gender=Fem,Masc|Number=Sing|Person=3|PronType=Prs,Tot", POS: PRON}, |     }, | ||||||
|     "PRON__Gender=Fem|Number=Sing|Poss=Yes|PronType=Prs": {"morph": "Gender=Fem|Number=Sing|Poss=Yes|PronType=Prs", POS: PRON}, |     "PRON__Gender=Fem,Masc|Number=Sing|Person=3|Polarity=Neg|PronType=Neg,Prs": { | ||||||
|     "PRON__Gender=Masc|Number=Sing|Poss=Yes|PronType=Prs": {"morph": "Gender=Masc|Number=Sing|Poss=Yes|PronType=Prs", POS: PRON}, |         "morph": "Gender=Fem,Masc|Number=Sing|Person=3|Polarity=Neg|PronType=Neg,Prs", | ||||||
|     "PRON__Gender=Neut|Number=Sing|Person=3|PronType=Ind,Prs": {"morph": "Gender=Neut|Number=Sing|Person=3|PronType=Ind,Prs", POS: PRON}, |         POS: PRON, | ||||||
|     "PRON__Gender=Neut|Number=Sing|Poss=Yes|PronType=Prs": {"morph": "Gender=Neut|Number=Sing|Poss=Yes|PronType=Prs", POS: PRON}, |     }, | ||||||
|     "PRON__Number=Plur|Person=3|Polarity=Neg|PronType=Neg,Prs": {"morph": "Number=Plur|Person=3|Polarity=Neg|PronType=Neg,Prs", POS: PRON}, |     "PRON__Gender=Fem,Masc|Number=Sing|Person=3|PronType=Ind,Prs": { | ||||||
|     "PRON__Number=Plur|Person=3|PronType=Ind,Prs": {"morph": "Number=Plur|Person=3|PronType=Ind,Prs", POS: PRON}, |         "morph": "Gender=Fem,Masc|Number=Sing|Person=3|PronType=Ind,Prs", | ||||||
|     "PRON__Number=Plur|Person=3|PronType=Prs,Tot": {"morph": "Number=Plur|Person=3|PronType=Prs,Tot", POS: PRON}, |         POS: PRON, | ||||||
|     "PRON__Number=Plur|Poss=Yes|PronType=Prs": {"morph": "Number=Plur|Poss=Yes|PronType=Prs", POS: PRON}, |     }, | ||||||
|     "PRON__Number=Plur|Poss=Yes|PronType=Rcp": {"morph": "Number=Plur|Poss=Yes|PronType=Rcp", POS: PRON}, |     "PRON__Gender=Fem,Masc|Number=Sing|Person=3|PronType=Prs,Tot": { | ||||||
|     "PRON__Number=Sing|Polarity=Neg|PronType=Neg": {"morph": "Number=Sing|Polarity=Neg|PronType=Neg", POS: PRON}, |         "morph": "Gender=Fem,Masc|Number=Sing|Person=3|PronType=Prs,Tot", | ||||||
|  |         POS: PRON, | ||||||
|  |     }, | ||||||
|  |     "PRON__Gender=Fem|Number=Sing|Poss=Yes|PronType=Prs": { | ||||||
|  |         "morph": "Gender=Fem|Number=Sing|Poss=Yes|PronType=Prs", | ||||||
|  |         POS: PRON, | ||||||
|  |     }, | ||||||
|  |     "PRON__Gender=Masc|Number=Sing|Poss=Yes|PronType=Prs": { | ||||||
|  |         "morph": "Gender=Masc|Number=Sing|Poss=Yes|PronType=Prs", | ||||||
|  |         POS: PRON, | ||||||
|  |     }, | ||||||
|  |     "PRON__Gender=Neut|Number=Sing|Person=3|PronType=Ind,Prs": { | ||||||
|  |         "morph": "Gender=Neut|Number=Sing|Person=3|PronType=Ind,Prs", | ||||||
|  |         POS: PRON, | ||||||
|  |     }, | ||||||
|  |     "PRON__Gender=Neut|Number=Sing|Poss=Yes|PronType=Prs": { | ||||||
|  |         "morph": "Gender=Neut|Number=Sing|Poss=Yes|PronType=Prs", | ||||||
|  |         POS: PRON, | ||||||
|  |     }, | ||||||
|  |     "PRON__Number=Plur|Person=3|Polarity=Neg|PronType=Neg,Prs": { | ||||||
|  |         "morph": "Number=Plur|Person=3|Polarity=Neg|PronType=Neg,Prs", | ||||||
|  |         POS: PRON, | ||||||
|  |     }, | ||||||
|  |     "PRON__Number=Plur|Person=3|PronType=Ind,Prs": { | ||||||
|  |         "morph": "Number=Plur|Person=3|PronType=Ind,Prs", | ||||||
|  |         POS: PRON, | ||||||
|  |     }, | ||||||
|  |     "PRON__Number=Plur|Person=3|PronType=Prs,Tot": { | ||||||
|  |         "morph": "Number=Plur|Person=3|PronType=Prs,Tot", | ||||||
|  |         POS: PRON, | ||||||
|  |     }, | ||||||
|  |     "PRON__Number=Plur|Poss=Yes|PronType=Prs": { | ||||||
|  |         "morph": "Number=Plur|Poss=Yes|PronType=Prs", | ||||||
|  |         POS: PRON, | ||||||
|  |     }, | ||||||
|  |     "PRON__Number=Plur|Poss=Yes|PronType=Rcp": { | ||||||
|  |         "morph": "Number=Plur|Poss=Yes|PronType=Rcp", | ||||||
|  |         POS: PRON, | ||||||
|  |     }, | ||||||
|  |     "PRON__Number=Sing|Polarity=Neg|PronType=Neg": { | ||||||
|  |         "morph": "Number=Sing|Polarity=Neg|PronType=Neg", | ||||||
|  |         POS: PRON, | ||||||
|  |     }, | ||||||
|     "PRON__PronType=Prs": {"morph": "PronType=Prs", POS: PRON}, |     "PRON__PronType=Prs": {"morph": "PronType=Prs", POS: PRON}, | ||||||
|     "PRON__PronType=Rel": {"morph": "PronType=Rel", POS: PRON}, |     "PRON__PronType=Rel": {"morph": "PronType=Rel", POS: PRON}, | ||||||
|     "PROPN__Abbr=Yes": {"morph": "Abbr=Yes", POS: PROPN}, |     "PROPN__Abbr=Yes": {"morph": "Abbr=Yes", POS: PROPN}, | ||||||
|     "PROPN__Abbr=Yes|Case=Gen": {"morph": "Abbr=Yes|Case=Gen", POS: PROPN}, |     "PROPN__Abbr=Yes|Case=Gen": {"morph": "Abbr=Yes|Case=Gen", POS: PROPN}, | ||||||
|     "VERB__Abbr=Yes|Mood=Ind|Tense=Pres|VerbForm=Fin": {"morph": "Abbr=Yes|Mood=Ind|Tense=Pres|VerbForm=Fin", POS: VERB}, |     "VERB__Abbr=Yes|Mood=Ind|Tense=Pres|VerbForm=Fin": { | ||||||
|     "VERB__Definite=Ind|Number=Sing|VerbForm=Part": {"morph": "Definite=Ind|Number=Sing|VerbForm=Part", POS: VERB}, |         "morph": "Abbr=Yes|Mood=Ind|Tense=Pres|VerbForm=Fin", | ||||||
|  |         POS: VERB, | ||||||
|  |     }, | ||||||
|  |     "VERB__Definite=Ind|Number=Sing|VerbForm=Part": { | ||||||
|  |         "morph": "Definite=Ind|Number=Sing|VerbForm=Part", | ||||||
|  |         POS: VERB, | ||||||
|  |     }, | ||||||
| } | } | ||||||
|  |  | ||||||
|  | @ -295,10 +295,9 @@ class EntityRuler(object): | ||||||
|             deserializers_patterns = { |             deserializers_patterns = { | ||||||
|                 "patterns": lambda p: self.add_patterns( |                 "patterns": lambda p: self.add_patterns( | ||||||
|                     srsly.read_jsonl(p.with_suffix(".jsonl")) |                     srsly.read_jsonl(p.with_suffix(".jsonl")) | ||||||
|                 )} |                 ) | ||||||
|             deserializers_cfg = { |  | ||||||
|                 "cfg": lambda p: cfg.update(srsly.read_json(p)) |  | ||||||
|             } |             } | ||||||
|  |             deserializers_cfg = {"cfg": lambda p: cfg.update(srsly.read_json(p))} | ||||||
|             from_disk(path, deserializers_cfg, {}) |             from_disk(path, deserializers_cfg, {}) | ||||||
|             self.overwrite = cfg.get("overwrite", False) |             self.overwrite = cfg.get("overwrite", False) | ||||||
|             self.phrase_matcher_attr = cfg.get("phrase_matcher_attr") |             self.phrase_matcher_attr = cfg.get("phrase_matcher_attr") | ||||||
|  |  | ||||||
|  | @ -219,14 +219,13 @@ def uk_tokenizer(): | ||||||
| def ur_tokenizer(): | def ur_tokenizer(): | ||||||
|     return get_lang_class("ur").Defaults.create_tokenizer() |     return get_lang_class("ur").Defaults.create_tokenizer() | ||||||
| 
 | 
 | ||||||
|    | 
 | ||||||
| @pytest.fixture(scope="session") | @pytest.fixture(scope="session") | ||||||
| def yo_tokenizer(): | def yo_tokenizer(): | ||||||
|     return get_lang_class("yo").Defaults.create_tokenizer() |     return get_lang_class("yo").Defaults.create_tokenizer() | ||||||
| 
 | 
 | ||||||
|    | 
 | ||||||
| @pytest.fixture(scope="session") | @pytest.fixture(scope="session") | ||||||
| def zh_tokenizer(): | def zh_tokenizer(): | ||||||
|     pytest.importorskip("jieba") |     pytest.importorskip("jieba") | ||||||
|     return get_lang_class("zh").Defaults.create_tokenizer() |     return get_lang_class("zh").Defaults.create_tokenizer() | ||||||
| 
 |  | ||||||
|  |  | ||||||
|  | @ -15,7 +15,7 @@ ABBREVIATION_TESTS = [ | ||||||
| HYPHENATED_TESTS = [ | HYPHENATED_TESTS = [ | ||||||
|     ( |     ( | ||||||
|         "1700-luvulle sijoittuva taide-elokuva", |         "1700-luvulle sijoittuva taide-elokuva", | ||||||
|         ["1700-luvulle", "sijoittuva", "taide-elokuva"] |         ["1700-luvulle", "sijoittuva", "taide-elokuva"], | ||||||
|     ) |     ) | ||||||
| ] | ] | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -3,16 +3,19 @@ from __future__ import unicode_literals | ||||||
| 
 | 
 | ||||||
| import pytest | import pytest | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
| @pytest.mark.parametrize("text", ["z.B.", "Jan."]) | @pytest.mark.parametrize("text", ["z.B.", "Jan."]) | ||||||
| def test_lb_tokenizer_handles_abbr(lb_tokenizer, text): | def test_lb_tokenizer_handles_abbr(lb_tokenizer, text): | ||||||
|     tokens = lb_tokenizer(text) |     tokens = lb_tokenizer(text) | ||||||
|     assert len(tokens) == 1 |     assert len(tokens) == 1 | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
| @pytest.mark.parametrize("text", ["d'Saach", "d'Kanner", "d’Welt", "d’Suen"]) | @pytest.mark.parametrize("text", ["d'Saach", "d'Kanner", "d’Welt", "d’Suen"]) | ||||||
| def test_lb_tokenizer_splits_contractions(lb_tokenizer, text): | def test_lb_tokenizer_splits_contractions(lb_tokenizer, text): | ||||||
|     tokens = lb_tokenizer(text) |     tokens = lb_tokenizer(text) | ||||||
|     assert len(tokens) == 2 |     assert len(tokens) == 2 | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
| def test_lb_tokenizer_handles_exc_in_text(lb_tokenizer): | def test_lb_tokenizer_handles_exc_in_text(lb_tokenizer): | ||||||
|     text = "Mee 't ass net evident, d'Liewen." |     text = "Mee 't ass net evident, d'Liewen." | ||||||
|     tokens = lb_tokenizer(text) |     tokens = lb_tokenizer(text) | ||||||
|  | @ -20,6 +23,7 @@ def test_lb_tokenizer_handles_exc_in_text(lb_tokenizer): | ||||||
|     assert tokens[1].text == "'t" |     assert tokens[1].text == "'t" | ||||||
|     assert tokens[1].lemma_ == "et" |     assert tokens[1].lemma_ == "et" | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
| @pytest.mark.parametrize("text,norm", [("dass", "datt"), ("viläicht", "vläicht")]) | @pytest.mark.parametrize("text,norm", [("dass", "datt"), ("viläicht", "vläicht")]) | ||||||
| def test_lb_norm_exceptions(lb_tokenizer, text, norm): | def test_lb_norm_exceptions(lb_tokenizer, text, norm): | ||||||
|     tokens = lb_tokenizer(text) |     tokens = lb_tokenizer(text) | ||||||
|  |  | ||||||
|  | @ -16,7 +16,7 @@ def test_lb_tokenizer_handles_long_text(lb_tokenizer): | ||||||
|     [ |     [ | ||||||
|         ("»Wat ass mat mir geschitt?«, huet hie geduecht.", 13), |         ("»Wat ass mat mir geschitt?«, huet hie geduecht.", 13), | ||||||
|         ("“Dëst fréi Opstoen”, denkt hien, “mécht ee ganz duercherneen. ", 15), |         ("“Dëst fréi Opstoen”, denkt hien, “mécht ee ganz duercherneen. ", 15), | ||||||
|         ("Am Grand-Duché ass d'Liewen schéin, mee 't gëtt ze vill Autoen.", 14) |         ("Am Grand-Duché ass d'Liewen schéin, mee 't gëtt ze vill Autoen.", 14), | ||||||
|     ], |     ], | ||||||
| ) | ) | ||||||
| def test_lb_tokenizer_handles_examples(lb_tokenizer, text, length): | def test_lb_tokenizer_handles_examples(lb_tokenizer, text, length): | ||||||
|  |  | ||||||
|  | @ -87,4 +87,4 @@ def test_lex_attrs_like_url(text, match): | ||||||
|     ], |     ], | ||||||
| ) | ) | ||||||
| def test_lex_attrs_word_shape(text, shape): | def test_lex_attrs_word_shape(text, shape): | ||||||
|     assert word_shape(text) == shape |     assert word_shape(text) == shape | ||||||
|  |  | ||||||
|  | @ -151,17 +151,17 @@ def test_parser_arc_eager_finalize_state(en_tokenizer, en_parser): | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def test_parser_set_sent_starts(en_vocab): | def test_parser_set_sent_starts(en_vocab): | ||||||
|  |     # fmt: off | ||||||
|     words = ['Ein', 'Satz', '.', 'Außerdem', 'ist', 'Zimmer', 'davon', 'überzeugt', ',', 'dass', 'auch', 'epige-', '\n', 'netische', 'Mechanismen', 'eine', 'Rolle', 'spielen', ',', 'also', 'Vorgänge', ',', 'die', '\n', 'sich', 'darauf', 'auswirken', ',', 'welche', 'Gene', 'abgelesen', 'werden', 'und', '\n', 'welche', 'nicht', '.', '\n'] |     words = ['Ein', 'Satz', '.', 'Außerdem', 'ist', 'Zimmer', 'davon', 'überzeugt', ',', 'dass', 'auch', 'epige-', '\n', 'netische', 'Mechanismen', 'eine', 'Rolle', 'spielen', ',', 'also', 'Vorgänge', ',', 'die', '\n', 'sich', 'darauf', 'auswirken', ',', 'welche', 'Gene', 'abgelesen', 'werden', 'und', '\n', 'welche', 'nicht', '.', '\n'] | ||||||
|     heads = [1, 0, -1, 27, 0, -1, 1, -3, -1, 8, 4, 3, -1, 1, 3, 1, 1, -11, -1, 1, -9, -1, 4, -1, 2, 1, -6, -1, 1, 2, 1, -6, -1, -1, -17, -31, -32, -1] |     heads = [1, 0, -1, 27, 0, -1, 1, -3, -1, 8, 4, 3, -1, 1, 3, 1, 1, -11, -1, 1, -9, -1, 4, -1, 2, 1, -6, -1, 1, 2, 1, -6, -1, -1, -17, -31, -32, -1] | ||||||
|     deps = ['nk', 'ROOT', 'punct', 'mo', 'ROOT', 'sb', 'op', 'pd', 'punct', 'cp', 'mo', 'nk', '', 'nk', 'sb', 'nk', 'oa', 're', 'punct', 'mo', 'app', 'punct', 'sb', '', 'oa', 'op', 'rc', 'punct', 'nk', 'sb', 'oc', 're', 'cd', '', 'oa', 'ng', 'punct', ''] |     deps = ['nk', 'ROOT', 'punct', 'mo', 'ROOT', 'sb', 'op', 'pd', 'punct', 'cp', 'mo', 'nk', '', 'nk', 'sb', 'nk', 'oa', 're', 'punct', 'mo', 'app', 'punct', 'sb', '', 'oa', 'op', 'rc', 'punct', 'nk', 'sb', 'oc', 're', 'cd', '', 'oa', 'ng', 'punct', ''] | ||||||
|     doc = get_doc( |     # fmt: on | ||||||
|         en_vocab, words=words, deps=deps, heads=heads |     doc = get_doc(en_vocab, words=words, deps=deps, heads=heads) | ||||||
|     ) |  | ||||||
|     for i in range(len(words)): |     for i in range(len(words)): | ||||||
|         if i == 0 or i == 3: |         if i == 0 or i == 3: | ||||||
|             assert doc[i].is_sent_start == True |             assert doc[i].is_sent_start is True | ||||||
|         else: |         else: | ||||||
|             assert doc[i].is_sent_start == None |             assert doc[i].is_sent_start is None | ||||||
|     for sent in doc.sents: |     for sent in doc.sents: | ||||||
|         for token in sent: |         for token in sent: | ||||||
|             assert token.head in sent |             assert token.head in sent | ||||||
|  |  | ||||||
|  | @ -3,7 +3,6 @@ from __future__ import unicode_literals | ||||||
| 
 | 
 | ||||||
| import pytest | import pytest | ||||||
| from spacy.language import Language | from spacy.language import Language | ||||||
| from spacy.pipeline import Tagger |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def test_label_types(): | def test_label_types(): | ||||||
|  |  | ||||||
|  | @ -15,7 +15,9 @@ def test_issue4674(): | ||||||
| 
 | 
 | ||||||
|     vector1 = [0.9, 1.1, 1.01] |     vector1 = [0.9, 1.1, 1.01] | ||||||
|     vector2 = [1.8, 2.25, 2.01] |     vector2 = [1.8, 2.25, 2.01] | ||||||
|     kb.set_entities(entity_list=["Q1", "Q1"], freq_list=[32, 111], vector_list=[vector1, vector2]) |     kb.set_entities( | ||||||
|  |         entity_list=["Q1", "Q1"], freq_list=[32, 111], vector_list=[vector1, vector2] | ||||||
|  |     ) | ||||||
| 
 | 
 | ||||||
|     assert kb.get_size_entities() == 1 |     assert kb.get_size_entities() == 1 | ||||||
| 
 | 
 | ||||||
|  | @ -31,4 +33,3 @@ def test_issue4674(): | ||||||
|         kb2.load_bulk(str(file_path)) |         kb2.load_bulk(str(file_path)) | ||||||
| 
 | 
 | ||||||
|     assert kb2.get_size_entities() == 1 |     assert kb2.get_size_entities() == 1 | ||||||
| 
 |  | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user