Mirror of https://github.com/explosion/spaCy.git
Tidy up and fix small bugs and typos

parent 9e652afa4b
commit 25602c794c
				|  | @ -8,15 +8,14 @@ import time | ||||||
| from collections import Counter | from collections import Counter | ||||||
| from pathlib import Path | from pathlib import Path | ||||||
| from thinc.v2v import Affine, Maxout | from thinc.v2v import Affine, Maxout | ||||||
| from thinc.api import wrap, layerize |  | ||||||
| from thinc.misc import LayerNorm as LN | from thinc.misc import LayerNorm as LN | ||||||
| from thinc.neural.util import prefer_gpu, get_array_module | from thinc.neural.util import prefer_gpu | ||||||
| from wasabi import Printer | from wasabi import Printer | ||||||
| import srsly | import srsly | ||||||
| 
 | 
 | ||||||
| from ..tokens import Doc | from ..tokens import Doc | ||||||
| from ..attrs import ID, HEAD | from ..attrs import ID, HEAD | ||||||
| from .._ml import Tok2Vec, flatten, chain, zero_init, create_default_optimizer | from .._ml import Tok2Vec, flatten, chain, create_default_optimizer | ||||||
| from .._ml import masked_language_model | from .._ml import masked_language_model | ||||||
| from .. import util | from .. import util | ||||||
| 
 | 
 | ||||||
|  | @ -136,7 +135,7 @@ def pretrain( | ||||||
|             random.shuffle(texts) |             random.shuffle(texts) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def make_update(model, docs, optimizer, drop=0.0, objective='L2'): | def make_update(model, docs, optimizer, drop=0.0, objective="L2"): | ||||||
|     """Perform an update over a single batch of documents. |     """Perform an update over a single batch of documents. | ||||||
| 
 | 
 | ||||||
|     docs (iterable): A batch of `Doc` objects. |     docs (iterable): A batch of `Doc` objects. | ||||||
|  | @ -171,7 +170,7 @@ def make_docs(nlp, batch, min_length=1, max_length=500): | ||||||
|     return docs |     return docs | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def get_vectors_loss(ops, docs, prediction, objective='L2'): | def get_vectors_loss(ops, docs, prediction, objective="L2"): | ||||||
|     """Compute a mean-squared error loss between the documents' vectors and |     """Compute a mean-squared error loss between the documents' vectors and | ||||||
|     the prediction. |     the prediction. | ||||||
| 
 | 
 | ||||||
|  | @ -185,9 +184,9 @@ def get_vectors_loss(ops, docs, prediction, objective='L2'): | ||||||
|     # and look them up all at once. This prevents data copying. |     # and look them up all at once. This prevents data copying. | ||||||
|     ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs]) |     ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs]) | ||||||
|     target = docs[0].vocab.vectors.data[ids] |     target = docs[0].vocab.vectors.data[ids] | ||||||
|     if objective == 'L2': |     if objective == "L2": | ||||||
|         d_scores = prediction - target |         d_scores = prediction - target | ||||||
|         loss = (d_scores**2).sum() |         loss = (d_scores ** 2).sum() | ||||||
|     else: |     else: | ||||||
|         raise NotImplementedError(objective) |         raise NotImplementedError(objective) | ||||||
|     return loss, d_scores |     return loss, d_scores | ||||||
|  | @ -201,8 +200,7 @@ def create_pretraining_model(nlp, tok2vec): | ||||||
|     """ |     """ | ||||||
|     output_size = nlp.vocab.vectors.data.shape[1] |     output_size = nlp.vocab.vectors.data.shape[1] | ||||||
|     output_layer = chain( |     output_layer = chain( | ||||||
|         LN(Maxout(300, pieces=3)), |         LN(Maxout(300, pieces=3)), Affine(output_size, drop_factor=0.0) | ||||||
|         Affine(output_size, drop_factor=0.0), |  | ||||||
|     ) |     ) | ||||||
|     # This is annoying, but the parser etc have the flatten step after |     # This is annoying, but the parser etc have the flatten step after | ||||||
|     # the tok2vec. To load the weights in cleanly, we need to match |     # the tok2vec. To load the weights in cleanly, we need to match | ||||||
|  |  | ||||||
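
For reference, a minimal NumPy sketch of the L2 objective computed in `get_vectors_loss` above: the per-token gradient with respect to the prediction is simply `prediction - target` (the constant factor of 2 from differentiating the squared error is dropped, which only rescales the learning rate), and the summed squared error is what gets reported as the loss.

```python
import numpy

def l2_loss(prediction, target):
    # Both arrays have shape (n_tokens, width), matching the flattened doc vectors.
    d_scores = prediction - target      # gradient w.r.t. the prediction (factor 2 omitted)
    loss = (d_scores ** 2).sum()        # scalar reported for logging
    return loss, d_scores

rng = numpy.random.RandomState(0)
pred = rng.normal(size=(4, 3)).astype("float32")
targ = rng.normal(size=(4, 3)).astype("float32")
loss, d_scores = l2_loss(pred, targ)
```
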
|  | @ -13,13 +13,7 @@ RENDER_WRAPPER = None | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def render( | def render( | ||||||
|     docs, |     docs, style="dep", page=False, minify=False, jupyter=False, options={}, manual=False | ||||||
|     style="dep", |  | ||||||
|     page=False, |  | ||||||
|     minify=False, |  | ||||||
|     jupyter=False, |  | ||||||
|     options={}, |  | ||||||
|     manual=False, |  | ||||||
| ): | ): | ||||||
|     """Render displaCy visualisation. |     """Render displaCy visualisation. | ||||||
| 
 | 
 | ||||||
|  | @ -80,7 +74,7 @@ def serve( | ||||||
|     """ |     """ | ||||||
|     from wsgiref import simple_server |     from wsgiref import simple_server | ||||||
| 
 | 
 | ||||||
|     if IS_JUPYTER: |     if is_in_jupyter(): | ||||||
|         user_warning(Warnings.W011) |         user_warning(Warnings.W011) | ||||||
| 
 | 
 | ||||||
|     render(docs, style=style, page=page, minify=minify, options=options, manual=manual) |     render(docs, style=style, page=page, minify=minify, options=options, manual=manual) | ||||||
|  |  | ||||||
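
For context, the hunk above swaps a module-level `IS_JUPYTER` constant for a call to `is_in_jupyter()` at serve time, so the environment is probed when `serve()` runs rather than at import. A common way to perform such a runtime check is to look for the IPython kernel shell; this is only a sketch of the general technique, not necessarily spaCy's exact implementation:

```python
def in_jupyter_kernel():
    # get_ipython() exists only when running under IPython; the Jupyter kernel
    # uses a ZMQ-based shell class, while the terminal REPL does not.
    try:
        shell = get_ipython().__class__.__name__  # noqa: F821 (defined only inside IPython)
        return shell == "ZMQInteractiveShell"
    except NameError:
        return False

print(in_jupyter_kernel())  # False outside a notebook
```
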
|  | @ -1,8 +1,9 @@ | ||||||
| # coding: utf8 | # coding: utf8 | ||||||
| from __future__ import unicode_literals | from __future__ import unicode_literals | ||||||
| 
 | 
 | ||||||
| from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS | from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CONCAT_QUOTES | ||||||
| from ..char_classes import CONCAT_QUOTES, CONCAT_ICONS, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER | from ..char_classes import CONCAT_ICONS, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| # removing ° from the special icons to keep e.g. 99° as one token | # removing ° from the special icons to keep e.g. 99° as one token | ||||||
| _concat_icons = CONCAT_ICONS.replace("\u00B0", "") | _concat_icons = CONCAT_ICONS.replace("\u00B0", "") | ||||||
|  | @ -29,7 +30,9 @@ _suffixes = ( | ||||||
|         r"(?<=°[FfCcKk])\.", |         r"(?<=°[FfCcKk])\.", | ||||||
|         r"(?<=[0-9])(?:[{c}])".format(c=_currency), |         r"(?<=[0-9])(?:[{c}])".format(c=_currency), | ||||||
|         r"(?<=[0-9])(?:{u})".format(u=UNITS), |         r"(?<=[0-9])(?:{u})".format(u=UNITS), | ||||||
|         r"(?<=[{al}{e}{q}(?:{c})])\.".format(al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, c=_currency), |         r"(?<=[{al}{e}{q}(?:{c})])\.".format( | ||||||
|  |             al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, c=_currency | ||||||
|  |         ), | ||||||
|         r"(?<=[{al})])-e".format(al=ALPHA_LOWER), |         r"(?<=[{al})])-e".format(al=ALPHA_LOWER), | ||||||
|     ] |     ] | ||||||
| ) | ) | ||||||
|  | @ -40,7 +43,7 @@ _infixes = ( | ||||||
|     + [ |     + [ | ||||||
|         r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER), |         r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER), | ||||||
|         r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA), |         r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA), | ||||||
|         r'(?<=[{a}])[:<>=](?=[{a}])'.format(a=ALPHA), |         r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA), | ||||||
|         r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA), |         r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA), | ||||||
|         r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), |         r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), | ||||||
|         r"(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])".format(a=ALPHA, q=_quotes), |         r"(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])".format(a=ALPHA, q=_quotes), | ||||||
|  |  | ||||||
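
As an illustration of how these `.format()`-built lookaround patterns behave, here is a small self-contained example using simplified ASCII stand-ins for the character classes (the real `ALPHA`/`ALPHA_LOWER`/`ALPHA_UPPER` from `spacy.lang.char_classes` cover the full Unicode ranges):

```python
import re

ALPHA = "a-zA-Z"        # simplified stand-ins, assumption for this sketch only
ALPHA_LOWER = "a-z"
ALPHA_UPPER = "A-Z"

infix_patterns = [
    # Split on a period between a lowercase and an uppercase letter ("end.Start").
    r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
    # Split on a double hyphen surrounded by letters ("word--another").
    r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
]
infix_re = re.compile("|".join(infix_patterns))

# Each match marks a split point between two tokens.
print([m.span() for m in infix_re.finditer("word--another end.Start")])
```
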
|  | @ -5,24 +5,24 @@ import re | ||||||
| from collections import namedtuple | from collections import namedtuple | ||||||
| 
 | 
 | ||||||
| from .tag_map import TAG_MAP | from .tag_map import TAG_MAP | ||||||
| 
 |  | ||||||
| from ...attrs import LANG | from ...attrs import LANG | ||||||
| from ...language import Language | from ...language import Language | ||||||
| from ...tokens import Doc, Token | from ...tokens import Doc, Token | ||||||
| from ...util import DummyTokenizer | from ...util import DummyTokenizer | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
| ShortUnitWord = namedtuple("ShortUnitWord", ["surface", "lemma", "pos"]) | ShortUnitWord = namedtuple("ShortUnitWord", ["surface", "lemma", "pos"]) | ||||||
| 
 | 
 | ||||||
|  | # TODO: Is this the right place for this? | ||||||
|  | Token.set_extension("mecab_tag", default=None) | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| def try_mecab_import(): | def try_mecab_import(): | ||||||
|     """Mecab is required for Japanese support, so check for it. |     """Mecab is required for Japanese support, so check for it. | ||||||
| 
 |  | ||||||
|     If it's not available, blow up and explain how to fix it.""" |     If it's not available, blow up and explain how to fix it.""" | ||||||
|     try: |     try: | ||||||
|         import MeCab |         import MeCab | ||||||
| 
 | 
 | ||||||
|         # XXX Is this the right place for this? |  | ||||||
|         Token.set_extension("mecab_tag", default=None) |  | ||||||
|         return MeCab |         return MeCab | ||||||
|     except ImportError: |     except ImportError: | ||||||
|         raise ImportError( |         raise ImportError( | ||||||
|  | @ -33,14 +33,13 @@ def try_mecab_import(): | ||||||
| 
 | 
 | ||||||
| def resolve_pos(token): | def resolve_pos(token): | ||||||
|     """If necessary, add a field to the POS tag for UD mapping. |     """If necessary, add a field to the POS tag for UD mapping. | ||||||
| 
 |  | ||||||
|     Under Universal Dependencies, sometimes the same Unidic POS tag can |     Under Universal Dependencies, sometimes the same Unidic POS tag can | ||||||
|     be mapped differently depending on the literal token or its context |     be mapped differently depending on the literal token or its context | ||||||
|     in the sentence. This function adds information to the POS tag to |     in the sentence. This function adds information to the POS tag to | ||||||
|     resolve ambiguous mappings. |     resolve ambiguous mappings. | ||||||
|     """ |     """ | ||||||
| 
 | 
 | ||||||
|     # NOTE: This is a first take. The rules here are crude approximations. |     # TODO: This is a first take. The rules here are crude approximations. | ||||||
|     # For many of these, full dependencies are needed to properly resolve |     # For many of these, full dependencies are needed to properly resolve | ||||||
|     # PoS mappings. |     # PoS mappings. | ||||||
| 
 | 
 | ||||||
|  | @ -56,7 +55,7 @@ def resolve_pos(token): | ||||||
| 
 | 
 | ||||||
| def detailed_tokens(tokenizer, text): | def detailed_tokens(tokenizer, text): | ||||||
|     """Format Mecab output into a nice data structure, based on Janome.""" |     """Format Mecab output into a nice data structure, based on Janome.""" | ||||||
|     tokenizer.parse(text) | 
 | ||||||
|     node = tokenizer.parseToNode(text) |     node = tokenizer.parseToNode(text) | ||||||
|     node = node.next  # first node is beginning of sentence and empty, skip it |     node = node.next  # first node is beginning of sentence and empty, skip it | ||||||
|     words = [] |     words = [] | ||||||
|  | @ -98,62 +97,15 @@ class JapaneseTokenizer(DummyTokenizer): | ||||||
|         return doc |         return doc | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class JapaneseCharacterSegmenter(object): |  | ||||||
|     def __init__(self, vocab): |  | ||||||
|         self.vocab = vocab |  | ||||||
|         self._presegmenter = self._make_presegmenter(self.vocab) |  | ||||||
| 
 |  | ||||||
|     def _make_presegmenter(self, vocab): |  | ||||||
|         rules = Japanese.Defaults.tokenizer_exceptions |  | ||||||
|         token_match = Japanese.Defaults.token_match |  | ||||||
|         prefix_search = ( |  | ||||||
|             util.compile_prefix_regex(Japanese.Defaults.prefixes).search |  | ||||||
|             if Japanese.Defaults.prefixes |  | ||||||
|             else None |  | ||||||
|         ) |  | ||||||
|         suffix_search = ( |  | ||||||
|             util.compile_suffix_regex(Japanese.Defaults.suffixes).search |  | ||||||
|             if Japanese.Defaults.suffixes |  | ||||||
|             else None |  | ||||||
|         ) |  | ||||||
|         infix_finditer = ( |  | ||||||
|             util.compile_infix_regex(Japanese.Defaults.infixes).finditer |  | ||||||
|             if Japanese.Defaults.infixes |  | ||||||
|             else None |  | ||||||
|         ) |  | ||||||
|         return Tokenizer( |  | ||||||
|             vocab, |  | ||||||
|             rules=rules, |  | ||||||
|             prefix_search=prefix_search, |  | ||||||
|             suffix_search=suffix_search, |  | ||||||
|             infix_finditer=infix_finditer, |  | ||||||
|             token_match=token_match, |  | ||||||
|         ) |  | ||||||
| 
 |  | ||||||
|     def __call__(self, text): |  | ||||||
|         words = [] |  | ||||||
|         spaces = [] |  | ||||||
|         doc = self._presegmenter(text) |  | ||||||
|         for token in doc: |  | ||||||
|             words.extend(list(token.text)) |  | ||||||
|             spaces.extend([False] * len(token.text)) |  | ||||||
|             spaces[-1] = bool(token.whitespace_) |  | ||||||
|         return Doc(self.vocab, words=words, spaces=spaces) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| class JapaneseDefaults(Language.Defaults): | class JapaneseDefaults(Language.Defaults): | ||||||
|     lex_attr_getters = dict(Language.Defaults.lex_attr_getters) |     lex_attr_getters = dict(Language.Defaults.lex_attr_getters) | ||||||
|     lex_attr_getters[LANG] = lambda _text: "ja" |     lex_attr_getters[LANG] = lambda _text: "ja" | ||||||
| 
 | 
 | ||||||
|     tag_map = TAG_MAP |     tag_map = TAG_MAP | ||||||
|     use_janome = True |  | ||||||
| 
 | 
 | ||||||
|     @classmethod |     @classmethod | ||||||
|     def create_tokenizer(cls, nlp=None): |     def create_tokenizer(cls, nlp=None): | ||||||
|         if cls.use_janome: |         return JapaneseTokenizer(cls, nlp) | ||||||
|             return JapaneseTokenizer(cls, nlp) |  | ||||||
|         else: |  | ||||||
|             return JapaneseCharacterSegmenter(nlp.vocab) |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class Japanese(Language): | class Japanese(Language): | ||||||
|  |  | ||||||
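
For context on the `Token.set_extension("mecab_tag", ...)` call that now runs at module import time above: extensions registered this way become available on every token under the `._` namespace, which the tokenizer can fill in from MeCab output. A minimal sketch, assuming spaCy is installed (`force=True` only keeps the example re-runnable):

```python
from spacy.tokens import Doc, Token
from spacy.vocab import Vocab

Token.set_extension("mecab_tag", default=None, force=True)

doc = Doc(Vocab(), words=["日本語", "です"])
doc[0]._.mecab_tag = "名詞"   # the tokenizer would attach the raw MeCab tag here
print(doc[0]._.mecab_tag, doc[1]._.mecab_tag)  # 名詞 None
```
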
|  | @ -2,10 +2,10 @@ | ||||||
| from __future__ import unicode_literals | from __future__ import unicode_literals | ||||||
| 
 | 
 | ||||||
| from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS | from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS | ||||||
|  | from .punctuation import TOKENIZER_INFIXES | ||||||
| from .tag_map import TAG_MAP | from .tag_map import TAG_MAP | ||||||
| from .stop_words import STOP_WORDS | from .stop_words import STOP_WORDS | ||||||
| from .lex_attrs import LEX_ATTRS | from .lex_attrs import LEX_ATTRS | ||||||
| from .punctuation import TOKENIZER_INFIXES |  | ||||||
| 
 | 
 | ||||||
| from ..tokenizer_exceptions import BASE_EXCEPTIONS | from ..tokenizer_exceptions import BASE_EXCEPTIONS | ||||||
| from ..norm_exceptions import BASE_NORMS | from ..norm_exceptions import BASE_NORMS | ||||||
|  | @ -22,9 +22,9 @@ class PolishDefaults(Language.Defaults): | ||||||
|         Language.Defaults.lex_attr_getters[NORM], BASE_NORMS |         Language.Defaults.lex_attr_getters[NORM], BASE_NORMS | ||||||
|     ) |     ) | ||||||
|     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) |     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) | ||||||
|     infixes = tuple(TOKENIZER_INFIXES) |  | ||||||
|     stop_words = STOP_WORDS |     stop_words = STOP_WORDS | ||||||
|     tag_map = TAG_MAP |     tag_map = TAG_MAP | ||||||
|  |     infixes = TOKENIZER_INFIXES | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class Polish(Language): | class Polish(Language): | ||||||
|  |  | ||||||
|  | @ -1,14 +1,22 @@ | ||||||
| # coding: utf8 | # coding: utf8 | ||||||
| from __future__ import unicode_literals | from __future__ import unicode_literals | ||||||
| from ..char_classes import LIST_ELLIPSES, LIST_ICONS | 
 | ||||||
| from ..char_classes import QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER | from ..char_classes import LIST_ELLIPSES, CONCAT_ICONS | ||||||
| _quotes = QUOTES.replace("'", '') | from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER | ||||||
| _infixes = (LIST_ELLIPSES + LIST_ICONS + | 
 | ||||||
|             [r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER), | _quotes = CONCAT_QUOTES.replace("'", "") | ||||||
|              r'(?<=[{a}])[,!?](?=[{a}])'.format(a=ALPHA), | 
 | ||||||
|              r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA), | _infixes = ( | ||||||
|              r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA), |     LIST_ELLIPSES | ||||||
|              r'(?<=[{a}])([{q}\)\]\(\[])(?=[\{a}])'.format(a=ALPHA, q=_quotes), |     + [CONCAT_ICONS] | ||||||
|              r'(?<=[{a}])--(?=[{a}])'.format(a=ALPHA)]) |     + [ | ||||||
|  |         r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER), | ||||||
|  |         r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA), | ||||||
|  |         r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA), | ||||||
|  |         r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA), | ||||||
|  |         r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), | ||||||
|  |         r"(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])".format(a=ALPHA, q=CONCAT_QUOTES), | ||||||
|  |     ] | ||||||
|  | ) | ||||||
| 
 | 
 | ||||||
| TOKENIZER_INFIXES = _infixes | TOKENIZER_INFIXES = _infixes | ||||||
|  |  | ||||||
|  | @ -2,6 +2,7 @@ | ||||||
| from __future__ import unicode_literals | from __future__ import unicode_literals | ||||||
| 
 | 
 | ||||||
| from ._tokenizer_exceptions_list import PL_BASE_EXCEPTIONS | from ._tokenizer_exceptions_list import PL_BASE_EXCEPTIONS | ||||||
|  | from ...symbols import POS, ADV, NOUN, ORTH, LEMMA, ADJ | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| _exc = {} | _exc = {} | ||||||
|  |  | ||||||
|  | @ -6,7 +6,9 @@ from .tag_map import TAG_MAP | ||||||
| from .stop_words import STOP_WORDS | from .stop_words import STOP_WORDS | ||||||
| from .morph_rules import MORPH_RULES | from .morph_rules import MORPH_RULES | ||||||
| from .lemmatizer import LEMMA_RULES, LOOKUP | from .lemmatizer import LEMMA_RULES, LOOKUP | ||||||
| from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES | 
 | ||||||
|  | # Punctuation stolen from Danish | ||||||
|  | from ..da.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES | ||||||
| 
 | 
 | ||||||
| from ..tokenizer_exceptions import BASE_EXCEPTIONS | from ..tokenizer_exceptions import BASE_EXCEPTIONS | ||||||
| from ..norm_exceptions import BASE_NORMS | from ..norm_exceptions import BASE_NORMS | ||||||
|  | @ -31,6 +33,7 @@ class SwedishDefaults(Language.Defaults): | ||||||
|     lemma_lookup = LOOKUP |     lemma_lookup = LOOKUP | ||||||
|     morph_rules = MORPH_RULES |     morph_rules = MORPH_RULES | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
| class Swedish(Language): | class Swedish(Language): | ||||||
|     lang = "sv" |     lang = "sv" | ||||||
|     Defaults = SwedishDefaults |     Defaults = SwedishDefaults | ||||||
|  |  | ||||||
|  | @ -1,25 +0,0 @@ | ||||||
| # coding: utf8 |  | ||||||
| """Punctuation stolen from Danish""" |  | ||||||
| from __future__ import unicode_literals |  | ||||||
| 
 |  | ||||||
| from ..char_classes import LIST_ELLIPSES, LIST_ICONS |  | ||||||
| from ..char_classes import QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER |  | ||||||
| from ..punctuation import TOKENIZER_SUFFIXES |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| _quotes = QUOTES.replace("'", '') |  | ||||||
| 
 |  | ||||||
| _infixes = (LIST_ELLIPSES + LIST_ICONS + |  | ||||||
|             [r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER), |  | ||||||
|              r'(?<=[{a}])[,!?](?=[{a}])'.format(a=ALPHA), |  | ||||||
|              r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA), |  | ||||||
|              r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA), |  | ||||||
|              r'(?<=[{a}])([{q}\)\]\(\[])(?=[\{a}])'.format(a=ALPHA, q=_quotes), |  | ||||||
|              r'(?<=[{a}])--(?=[{a}])'.format(a=ALPHA)]) |  | ||||||
| 
 |  | ||||||
| _suffixes = [suffix for suffix in TOKENIZER_SUFFIXES if suffix not in ["'s", "'S", "’s", "’S", r"\'"]] |  | ||||||
| _suffixes += [r"(?<=[^sSxXzZ])\'"] |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| TOKENIZER_INFIXES = _infixes |  | ||||||
| TOKENIZER_SUFFIXES = _suffixes |  | ||||||
|  | @ -1,169 +1,191 @@ | ||||||
| # coding: utf8 | # coding: utf8 | ||||||
| 
 |  | ||||||
| """ |  | ||||||
| Tag mappings according to https://universaldependencies.org/tagset-conversion/sv-suc-uposf.html |  | ||||||
| for https://github.com/UniversalDependencies/UD_Swedish-Talbanken |  | ||||||
| """ |  | ||||||
| 
 |  | ||||||
| from __future__ import unicode_literals | from __future__ import unicode_literals | ||||||
| 
 | 
 | ||||||
| from ...symbols import POS, PUNCT, ADJ, CONJ, CCONJ, SCONJ, SYM, NUM, DET, ADV, ADP, X, VERB | from ...symbols import POS, PUNCT, ADJ, CCONJ, SCONJ, NUM, DET, ADV | ||||||
| from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON, AUX | from ...symbols import ADP, X, VERB, NOUN, PROPN, PART, INTJ, PRON | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # Tag mappings according to https://universaldependencies.org/tagset-conversion/sv-suc-uposf.html | ||||||
|  | # for https://github.com/UniversalDependencies/UD_Swedish-Talbanken | ||||||
| 
 | 
 | ||||||
| TAG_MAP = { | TAG_MAP = { | ||||||
|     'AB': { POS: ADV }, # inte, också, så, bara, nu |     "AB": {POS: ADV},  # inte, också, så, bara, nu | ||||||
|     'AB|AN': { POS: ADV }, # t.ex., ca, t_ex, bl.a., s_k |     "AB|AN": {POS: ADV},  # t.ex., ca, t_ex, bl.a., s_k | ||||||
|     'AB|KOM': { POS: ADV }, # mer, tidigare, mindre, vidare, mera |     "AB|KOM": {POS: ADV},  # mer, tidigare, mindre, vidare, mera | ||||||
|     'AB|POS': { POS: ADV }, # mycket, helt, ofta, länge, långt |     "AB|POS": {POS: ADV},  # mycket, helt, ofta, länge, långt | ||||||
|     'AB|SMS': { POS: ADV }, # över-, in- |     "AB|SMS": {POS: ADV},  # över-, in- | ||||||
|     'AB|SUV': { POS: ADV }, # minst, mest, högst, främst, helst |     "AB|SUV": {POS: ADV},  # minst, mest, högst, främst, helst | ||||||
|     'DT|MAS|SIN|DEF': { POS: DET }, |     "DT|MAS|SIN|DEF": {POS: DET}, | ||||||
|     'DT|MAS|SIN|IND': { POS: DET }, |     "DT|MAS|SIN|IND": {POS: DET}, | ||||||
|     'DT|NEU|SIN|DEF': { POS: DET }, # det, detta |     "DT|NEU|SIN|DEF": {POS: DET},  # det, detta | ||||||
|     'DT|NEU|SIN|IND': { POS: DET }, # ett, något, inget, vart, vartannat |     "DT|NEU|SIN|IND": {POS: DET},  # ett, något, inget, vart, vartannat | ||||||
|     'DT|NEU|SIN|IND/DEF': { POS: DET }, # allt |     "DT|NEU|SIN|IND/DEF": {POS: DET},  # allt | ||||||
|     'DT|UTR/NEU|PLU|DEF': { POS: DET }, # de, dessa, bägge, dom |     "DT|UTR/NEU|PLU|DEF": {POS: DET},  # de, dessa, bägge, dom | ||||||
|     'DT|UTR/NEU|PLU|IND': { POS: DET }, # några, inga |     "DT|UTR/NEU|PLU|IND": {POS: DET},  # några, inga | ||||||
|     'DT|UTR/NEU|PLU|IND/DEF': { POS: DET }, # alla |     "DT|UTR/NEU|PLU|IND/DEF": {POS: DET},  # alla | ||||||
|     'DT|UTR/NEU|SIN/PLU|IND': { POS: DET }, # samma |     "DT|UTR/NEU|SIN/PLU|IND": {POS: DET},  # samma | ||||||
|     'DT|UTR/NEU|SIN|DEF': { POS: DET }, # vardera |     "DT|UTR/NEU|SIN|DEF": {POS: DET},  # vardera | ||||||
|     'DT|UTR/NEU|SIN|IND': { POS: DET }, # varje, varenda |     "DT|UTR/NEU|SIN|IND": {POS: DET},  # varje, varenda | ||||||
|     'DT|UTR|SIN|DEF': { POS: DET }, # den, denna |     "DT|UTR|SIN|DEF": {POS: DET},  # den, denna | ||||||
|     'DT|UTR|SIN|IND': { POS: DET }, # en, någon, ingen, var, varannan |     "DT|UTR|SIN|IND": {POS: DET},  # en, någon, ingen, var, varannan | ||||||
|     'DT|UTR|SIN|IND/DEF': { POS: DET }, # all |     "DT|UTR|SIN|IND/DEF": {POS: DET},  # all | ||||||
|     'HA': { POS: ADV }, # när, där, hur, som, då |     "HA": {POS: ADV},  # när, där, hur, som, då | ||||||
|     'HD|NEU|SIN|IND': { POS: DET }, # vilket |     "HD|NEU|SIN|IND": {POS: DET},  # vilket | ||||||
|     'HD|UTR/NEU|PLU|IND': { POS: DET }, # vilka |     "HD|UTR/NEU|PLU|IND": {POS: DET},  # vilka | ||||||
|     'HD|UTR|SIN|IND': { POS: DET }, # vilken |     "HD|UTR|SIN|IND": {POS: DET},  # vilken | ||||||
|     'HP|-|-|-': { POS: PRON }, # som |     "HP|-|-|-": {POS: PRON},  # som | ||||||
|     'HP|NEU|SIN|IND': { POS: PRON }, # vad, vilket |     "HP|NEU|SIN|IND": {POS: PRON},  # vad, vilket | ||||||
|     'HP|NEU|SIN|IND|SMS': { POS: PRON }, |     "HP|NEU|SIN|IND|SMS": {POS: PRON}, | ||||||
|     'HP|UTR/NEU|PLU|IND': { POS: PRON }, # vilka |     "HP|UTR/NEU|PLU|IND": {POS: PRON},  # vilka | ||||||
|     'HP|UTR|SIN|IND': { POS: PRON }, # vilken, vem |     "HP|UTR|SIN|IND": {POS: PRON},  # vilken, vem | ||||||
|     'HS|DEF': { POS: DET }, # vars, vilkas, Vems |     "HS|DEF": {POS: DET},  # vars, vilkas, Vems | ||||||
|     'IE': { POS: PART }, # att |     "IE": {POS: PART},  # att | ||||||
|     'IN': { POS: INTJ }, # Jo, ja, nej, fan, visst |     "IN": {POS: INTJ},  # Jo, ja, nej, fan, visst | ||||||
|     'JJ|AN': { POS: ADJ }, # ev, S:t, Kungl, Kungl., Teol |     "JJ|AN": {POS: ADJ},  # ev, S:t, Kungl, Kungl., Teol | ||||||
|     'JJ|KOM|UTR/NEU|SIN/PLU|IND/DEF|GEN': { POS: ADJ }, # äldres |     "JJ|KOM|UTR/NEU|SIN/PLU|IND/DEF|GEN": {POS: ADJ},  # äldres | ||||||
|     'JJ|KOM|UTR/NEU|SIN/PLU|IND/DEF|NOM': { POS: ADJ }, # större, högre, mindre, bättre, äldre |     "JJ|KOM|UTR/NEU|SIN/PLU|IND/DEF|NOM": { | ||||||
|     'JJ|KOM|UTR/NEU|SIN/PLU|IND/DEF|SMS': { POS: ADJ }, |         POS: ADJ | ||||||
|     'JJ|POS|MAS|SIN|DEF|GEN': { POS: ADJ }, # enskildes, sjukes, andres |     },  # större, högre, mindre, bättre, äldre | ||||||
|     'JJ|POS|MAS|SIN|DEF|NOM': { POS: ADJ }, # enskilde, sjuke, andre, unge, ene |     "JJ|KOM|UTR/NEU|SIN/PLU|IND/DEF|SMS": {POS: ADJ}, | ||||||
|     'JJ|POS|NEU|SIN|IND/DEF|NOM': { POS: ADJ }, # eget |     "JJ|POS|MAS|SIN|DEF|GEN": {POS: ADJ},  # enskildes, sjukes, andres | ||||||
|     'JJ|POS|NEU|SIN|IND|GEN': { POS: ADJ }, |     "JJ|POS|MAS|SIN|DEF|NOM": {POS: ADJ},  # enskilde, sjuke, andre, unge, ene | ||||||
|     'JJ|POS|NEU|SIN|IND|NOM': { POS: ADJ }, # annat, svårt, möjligt, nytt, sådant |     "JJ|POS|NEU|SIN|IND/DEF|NOM": {POS: ADJ},  # eget | ||||||
|     'JJ|POS|UTR/NEU|PLU|IND/DEF|GEN': { POS: ADJ }, # ogiftas, ungas, frånskildas, efterkommandes, färgblindas |     "JJ|POS|NEU|SIN|IND|GEN": {POS: ADJ}, | ||||||
|     'JJ|POS|UTR/NEU|PLU|IND/DEF|NOM': { POS: ADJ }, # olika, andra, många, stora, vissa |     "JJ|POS|NEU|SIN|IND|NOM": {POS: ADJ},  # annat, svårt, möjligt, nytt, sådant | ||||||
|     'JJ|POS|UTR/NEU|PLU|IND|NOM': { POS: ADJ }, # flera, sådana, fler, få, samtliga |     "JJ|POS|UTR/NEU|PLU|IND/DEF|GEN": { | ||||||
|     'JJ|POS|UTR/NEU|SIN/PLU|IND|NOM': { POS: ADJ }, |         POS: ADJ | ||||||
|     'JJ|POS|UTR/NEU|SIN/PLU|IND/DEF|NOM': { POS: ADJ }, # bra, ena, enda, nästa, ringa |     },  # ogiftas, ungas, frånskildas, efterkommandes, färgblindas | ||||||
|     'JJ|POS|UTR/NEU|SIN|DEF|GEN': { POS: ADJ }, |     "JJ|POS|UTR/NEU|PLU|IND/DEF|NOM": {POS: ADJ},  # olika, andra, många, stora, vissa | ||||||
|     'JJ|POS|UTR/NEU|SIN|DEF|NOM': { POS: ADJ }, # hela, nya, andra, svenska, ekonomiska |     "JJ|POS|UTR/NEU|PLU|IND|NOM": {POS: ADJ},  # flera, sådana, fler, få, samtliga | ||||||
|     'JJ|POS|UTR|-|-|SMS': { POS: ADJ }, # fri-, låg-, sexual- |     "JJ|POS|UTR/NEU|SIN/PLU|IND|NOM": {POS: ADJ}, | ||||||
|     'JJ|POS|UTR|SIN|IND/DEF|NOM': { POS: ADJ }, # egen |     "JJ|POS|UTR/NEU|SIN/PLU|IND/DEF|NOM": {POS: ADJ},  # bra, ena, enda, nästa, ringa | ||||||
|     'JJ|POS|UTR|SIN|IND|GEN': { POS: ADJ }, # enskilds |     "JJ|POS|UTR/NEU|SIN|DEF|GEN": {POS: ADJ}, | ||||||
|     'JJ|POS|UTR|SIN|IND|NOM': { POS: ADJ }, # stor, annan, själv, sådan, viss |     "JJ|POS|UTR/NEU|SIN|DEF|NOM": {POS: ADJ},  # hela, nya, andra, svenska, ekonomiska | ||||||
|     'JJ|SUV|MAS|SIN|DEF|GEN': { POS: ADJ }, |     "JJ|POS|UTR|-|-|SMS": {POS: ADJ},  # fri-, låg-, sexual- | ||||||
|     'JJ|SUV|MAS|SIN|DEF|NOM': { POS: ADJ }, # störste, främste, äldste, minste |     "JJ|POS|UTR|SIN|IND/DEF|NOM": {POS: ADJ},  # egen | ||||||
|     'JJ|SUV|UTR/NEU|PLU|DEF|NOM': { POS: ADJ }, # flesta |     "JJ|POS|UTR|SIN|IND|GEN": {POS: ADJ},  # enskilds | ||||||
|     'JJ|SUV|UTR/NEU|PLU|IND|NOM': { POS: ADJ }, |     "JJ|POS|UTR|SIN|IND|NOM": {POS: ADJ},  # stor, annan, själv, sådan, viss | ||||||
|     'JJ|SUV|UTR/NEU|SIN/PLU|DEF|NOM': { POS: ADJ }, # bästa, största, närmaste, viktigaste, högsta |     "JJ|SUV|MAS|SIN|DEF|GEN": {POS: ADJ}, | ||||||
|     'JJ|SUV|UTR/NEU|SIN/PLU|IND|NOM': { POS: ADJ }, # störst, bäst, tidigast, högst, fattigast |     "JJ|SUV|MAS|SIN|DEF|NOM": {POS: ADJ},  # störste, främste, äldste, minste | ||||||
|     'KN': { POS: CCONJ }, # och, eller, som, än, men |     "JJ|SUV|UTR/NEU|PLU|DEF|NOM": {POS: ADJ},  # flesta | ||||||
|     'KN|AN': { POS: CCONJ }, |     "JJ|SUV|UTR/NEU|PLU|IND|NOM": {POS: ADJ}, | ||||||
|     'MAD': { POS: PUNCT }, # ., ?, :, !, ... |     "JJ|SUV|UTR/NEU|SIN/PLU|DEF|NOM": { | ||||||
|     'MID': { POS: PUNCT }, # ,, -, :, *, ; |         POS: ADJ | ||||||
|     'NN|-|-|-|-': { POS: NOUN }, # godo, fjol, fullo, somras, måtto |     },  # bästa, största, närmaste, viktigaste, högsta | ||||||
|     'NN|AN': { POS: NOUN }, # kr, %, s., dr, kap. |     "JJ|SUV|UTR/NEU|SIN/PLU|IND|NOM": { | ||||||
|     'NN|NEU|-|-|-': { POS: NOUN }, |         POS: ADJ | ||||||
|     'NN|NEU|-|-|SMS': { POS: NOUN }, # yrkes-, barn-, hem-, fack-, vatten- |     },  # störst, bäst, tidigast, högst, fattigast | ||||||
|     'NN|NEU|PLU|DEF|GEN': { POS: NOUN }, # barnens, årens, u-ländernas, företagens, århundradenas |     "KN": {POS: CCONJ},  # och, eller, som, än, men | ||||||
|     'NN|NEU|PLU|DEF|NOM': { POS: NOUN }, # barnen, u-länderna, åren, länderna, könen |     "KN|AN": {POS: CCONJ}, | ||||||
|     'NN|NEU|PLU|IND|GEN': { POS: NOUN }, # slags, års, barns, länders, tusentals |     "MAD": {POS: PUNCT},  # ., ?, :, !, ... | ||||||
|     'NN|NEU|PLU|IND|NOM': { POS: NOUN }, # barn, år, fall, länder, problem |     "MID": {POS: PUNCT},  # ,, -, :, *, ; | ||||||
|     'NN|NEU|SIN|DEF|GEN': { POS: NOUN }, # äktenskapets, samhällets, barnets, 1800-talets, 1960-talets |     "NN|-|-|-|-": {POS: NOUN},  # godo, fjol, fullo, somras, måtto | ||||||
|     'NN|NEU|SIN|DEF|NOM': { POS: NOUN }, # äktenskapet, samhället, barnet, stället, hemmet |     "NN|AN": {POS: NOUN},  # kr, %, s., dr, kap. | ||||||
|     'NN|NEU|SIN|IND|GEN': { POS: NOUN }, # års, slags, lands, havs, företags |     "NN|NEU|-|-|-": {POS: NOUN}, | ||||||
|     'NN|NEU|SIN|IND|NOM': { POS: NOUN }, # år, arbete, barn, sätt, äktenskap |     "NN|NEU|-|-|SMS": {POS: NOUN},  # yrkes-, barn-, hem-, fack-, vatten- | ||||||
|     'NN|SMS': { POS: NOUN }, # PCB-, Syd- |     "NN|NEU|PLU|DEF|GEN": { | ||||||
|     'NN|UTR|-|-|-': { POS: NOUN }, # dags, rätta |         POS: NOUN | ||||||
|     'NN|UTR|-|-|SMS': { POS: NOUN }, # far-, kibbutz-, röntgen-, barna-, hälso- |     },  # barnens, årens, u-ländernas, företagens, århundradenas | ||||||
|     'NN|UTR|PLU|DEF|GEN': { POS: NOUN }, # föräldrarnas, kvinnornas, elevernas, kibbutzernas, makarnas |     "NN|NEU|PLU|DEF|NOM": {POS: NOUN},  # barnen, u-länderna, åren, länderna, könen | ||||||
|     'NN|UTR|PLU|DEF|NOM': { POS: NOUN }, # kvinnorna, föräldrarna, makarna, männen, hyrorna |     "NN|NEU|PLU|IND|GEN": {POS: NOUN},  # slags, års, barns, länders, tusentals | ||||||
|     'NN|UTR|PLU|IND|GEN': { POS: NOUN }, # människors, kvinnors, dagars, tiders, månaders |     "NN|NEU|PLU|IND|NOM": {POS: NOUN},  # barn, år, fall, länder, problem | ||||||
|     'NN|UTR|PLU|IND|NOM': { POS: NOUN }, # procent, människor, kvinnor, miljoner, kronor |     "NN|NEU|SIN|DEF|GEN": { | ||||||
|     'NN|UTR|SIN|DEF|GEN': { POS: NOUN }, # kvinnans, världens, familjens, dagens, jordens |         POS: NOUN | ||||||
|     'NN|UTR|SIN|DEF|NOM': { POS: NOUN }, # familjen, kvinnan, mannen, världen, skolan |     },  # äktenskapets, samhällets, barnets, 1800-talets, 1960-talets | ||||||
|     'NN|UTR|SIN|IND|GEN': { POS: NOUN }, # sorts, medelålders, makes, kvinnas, veckas |     "NN|NEU|SIN|DEF|NOM": { | ||||||
|     'NN|UTR|SIN|IND|NOM': { POS: NOUN }, # del, tid, dag, fråga, man |         POS: NOUN | ||||||
|     'PAD': { POS: PUNCT }, # , ), ( |     },  # äktenskapet, samhället, barnet, stället, hemmet | ||||||
|     'PC|AN': { POS: VERB }, |     "NN|NEU|SIN|IND|GEN": {POS: NOUN},  # års, slags, lands, havs, företags | ||||||
|     'PC|PRF|MAS|SIN|DEF|GEN': { POS: VERB }, # avlidnes |     "NN|NEU|SIN|IND|NOM": {POS: NOUN},  # år, arbete, barn, sätt, äktenskap | ||||||
|     'PC|PRF|MAS|SIN|DEF|NOM': { POS: VERB }, |     "NN|SMS": {POS: NOUN},  # PCB-, Syd- | ||||||
|     'PC|PRF|NEU|SIN|IND|NOM': { POS: VERB }, # taget, sett, särskilt, förbjudet, ökat |     "NN|UTR|-|-|-": {POS: NOUN},  # dags, rätta | ||||||
|     'PC|PRF|UTR/NEU|PLU|IND/DEF|GEN': { POS: VERB }, # försäkrades, anställdas |     "NN|UTR|-|-|SMS": {POS: NOUN},  # far-, kibbutz-, röntgen-, barna-, hälso- | ||||||
|     'PC|PRF|UTR/NEU|PLU|IND/DEF|NOM': { POS: VERB }, # särskilda, gifta, ökade, handikappade, skilda |     "NN|UTR|PLU|DEF|GEN": { | ||||||
|     'PC|PRF|UTR/NEU|SIN|DEF|GEN': { POS: VERB }, |         POS: NOUN | ||||||
|     'PC|PRF|UTR/NEU|SIN|DEF|NOM': { POS: VERB }, # ökade, gifta, nämnda, nedärvda, dolda |     },  # föräldrarnas, kvinnornas, elevernas, kibbutzernas, makarnas | ||||||
|     'PC|PRF|UTR|SIN|IND|GEN': { POS: VERB }, |     "NN|UTR|PLU|DEF|NOM": { | ||||||
|     'PC|PRF|UTR|SIN|IND|NOM': { POS: VERB }, # särskild, ökad, beredd, gift, oförändrad |         POS: NOUN | ||||||
|     'PC|PRS|UTR/NEU|SIN/PLU|IND/DEF|GEN': { POS: VERB }, # studerandes, sammanboendes, dubbelarbetandes |     },  # kvinnorna, föräldrarna, makarna, männen, hyrorna | ||||||
|     'PC|PRS|UTR/NEU|SIN/PLU|IND/DEF|NOM': { POS: VERB }, # följande, beroende, nuvarande, motsvarande, liknande |     "NN|UTR|PLU|IND|GEN": {POS: NOUN},  # människors, kvinnors, dagars, tiders, månaders | ||||||
|     'PL': { POS: PART }, # ut, upp, in, till, med |     "NN|UTR|PLU|IND|NOM": {POS: NOUN},  # procent, människor, kvinnor, miljoner, kronor | ||||||
|     'PL|SMS': { POS: PART }, |     "NN|UTR|SIN|DEF|GEN": {POS: NOUN},  # kvinnans, världens, familjens, dagens, jordens | ||||||
|     'PM': { POS: PROPN }, # F, N, Liechtenstein, Danmark, DK |     "NN|UTR|SIN|DEF|NOM": {POS: NOUN},  # familjen, kvinnan, mannen, världen, skolan | ||||||
|     'PM|GEN': { POS: PROPN }, # Sveriges, EEC:s, Guds, Stockholms, Kristi |     "NN|UTR|SIN|IND|GEN": {POS: NOUN},  # sorts, medelålders, makes, kvinnas, veckas | ||||||
|     'PM|NOM': { POS: PROPN }, # Sverige, EEC, Stockholm, USA, ATP |     "NN|UTR|SIN|IND|NOM": {POS: NOUN},  # del, tid, dag, fråga, man | ||||||
|     'PM|SMS': { POS: PROPN }, # Göteborgs-, Nord-, Väst- |     "PAD": {POS: PUNCT},  # , ), ( | ||||||
|     'PN|MAS|SIN|DEF|SUB/OBJ': { POS: PRON }, # denne |     "PC|AN": {POS: VERB}, | ||||||
|     'PN|NEU|SIN|DEF|SUB/OBJ': { POS: PRON }, # det, detta, detsamma |     "PC|PRF|MAS|SIN|DEF|GEN": {POS: VERB},  # avlidnes | ||||||
|     'PN|NEU|SIN|IND|SUB/OBJ': { POS: PRON }, # något, allt, mycket, annat, ingenting |     "PC|PRF|MAS|SIN|DEF|NOM": {POS: VERB}, | ||||||
|     'PN|UTR/NEU|PLU|DEF|OBJ': { POS: PRON }, # dem, varandra, varann |     "PC|PRF|NEU|SIN|IND|NOM": {POS: VERB},  # taget, sett, särskilt, förbjudet, ökat | ||||||
|     'PN|UTR/NEU|PLU|DEF|SUB': { POS: PRON }, # de, bägge |     "PC|PRF|UTR/NEU|PLU|IND/DEF|GEN": {POS: VERB},  # försäkrades, anställdas | ||||||
|     'PN|UTR/NEU|PLU|DEF|SUB/OBJ': { POS: PRON }, # dessa, dom, båda, den, bådadera |     "PC|PRF|UTR/NEU|PLU|IND/DEF|NOM": { | ||||||
|     'PN|UTR/NEU|PLU|IND|SUB/OBJ': { POS: PRON }, # andra, alla, många, sådana, några |         POS: VERB | ||||||
|     'PN|UTR/NEU|SIN/PLU|DEF|OBJ': { POS: PRON }, # sig, sej |     },  # särskilda, gifta, ökade, handikappade, skilda | ||||||
|     'PN|UTR|PLU|DEF|OBJ': { POS: PRON }, # oss, er, eder |     "PC|PRF|UTR/NEU|SIN|DEF|GEN": {POS: VERB}, | ||||||
|     'PN|UTR|PLU|DEF|SUB': { POS: PRON }, # vi |     "PC|PRF|UTR/NEU|SIN|DEF|NOM": {POS: VERB},  # ökade, gifta, nämnda, nedärvda, dolda | ||||||
|     'PN|UTR|SIN|DEF|OBJ': { POS: PRON }, # dig, mig, henne, honom, Er |     "PC|PRF|UTR|SIN|IND|GEN": {POS: VERB}, | ||||||
|     'PN|UTR|SIN|DEF|SUB': { POS: PRON }, # du, han, hon, jag, ni |     "PC|PRF|UTR|SIN|IND|NOM": {POS: VERB},  # särskild, ökad, beredd, gift, oförändrad | ||||||
|     'PN|UTR|SIN|DEF|SUB/OBJ': { POS: PRON }, # den, denna, densamma |     "PC|PRS|UTR/NEU|SIN/PLU|IND/DEF|GEN": { | ||||||
|     'PN|UTR|SIN|IND|SUB': { POS: PRON }, # man |         POS: VERB | ||||||
|     'PN|UTR|SIN|IND|SUB/OBJ': { POS: PRON }, # en, var, någon, ingen, Varannan |     },  # studerandes, sammanboendes, dubbelarbetandes | ||||||
|     'PP': { POS: ADP }, # i, av, på, för, till |     "PC|PRS|UTR/NEU|SIN/PLU|IND/DEF|NOM": { | ||||||
|     'PP|AN': { POS: ADP }, # f |         POS: VERB | ||||||
|     'PS|AN': { POS: DET }, |     },  # följande, beroende, nuvarande, motsvarande, liknande | ||||||
|     'PS|NEU|SIN|DEF': { POS: DET }, # sitt, vårt, ditt, mitt, ert |     "PL": {POS: PART},  # ut, upp, in, till, med | ||||||
|     'PS|UTR/NEU|PLU|DEF': { POS: DET }, # sina, våra, dina, mina |     "PL|SMS": {POS: PART}, | ||||||
|     'PS|UTR/NEU|SIN/PLU|DEF': { POS: DET }, # deras, dess, hans, hennes, varandras |     "PM": {POS: PROPN},  # F, N, Liechtenstein, Danmark, DK | ||||||
|     'PS|UTR|SIN|DEF': { POS: DET }, # sin, vår, din, min, er |     "PM|GEN": {POS: PROPN},  # Sveriges, EEC:s, Guds, Stockholms, Kristi | ||||||
|     'RG': { POS: NUM }, # 2, 17, 20, 1, 18 |     "PM|NOM": {POS: PROPN},  # Sverige, EEC, Stockholm, USA, ATP | ||||||
|     'RG|GEN': { POS: NUM }, |     "PM|SMS": {POS: PROPN},  # Göteborgs-, Nord-, Väst- | ||||||
|     'RG|MAS|SIN|DEF|NOM': { POS: NUM }, |     "PN|MAS|SIN|DEF|SUB/OBJ": {POS: PRON},  # denne | ||||||
|     'RG|NEU|SIN|IND|NOM': { POS: NUM }, # ett |     "PN|NEU|SIN|DEF|SUB/OBJ": {POS: PRON},  # det, detta, detsamma | ||||||
|     'RG|NOM': { POS: NUM }, # två, tre, 1, 20, 2 |     "PN|NEU|SIN|IND|SUB/OBJ": {POS: PRON},  # något, allt, mycket, annat, ingenting | ||||||
|     'RG|SMS': { POS: NUM }, # ett-, 1950-, två-, tre-, 1700- |     "PN|UTR/NEU|PLU|DEF|OBJ": {POS: PRON},  # dem, varandra, varann | ||||||
|     'RG|UTR/NEU|SIN|DEF|NOM': { POS: NUM }, |     "PN|UTR/NEU|PLU|DEF|SUB": {POS: PRON},  # de, bägge | ||||||
|     'RG|UTR|SIN|IND|NOM': { POS: NUM }, # en |     "PN|UTR/NEU|PLU|DEF|SUB/OBJ": {POS: PRON},  # dessa, dom, båda, den, bådadera | ||||||
|     'RO|MAS|SIN|IND/DEF|GEN': { POS: ADJ }, |     "PN|UTR/NEU|PLU|IND|SUB/OBJ": {POS: PRON},  # andra, alla, många, sådana, några | ||||||
|     'RO|MAS|SIN|IND/DEF|NOM': { POS: ADJ }, # förste |     "PN|UTR/NEU|SIN/PLU|DEF|OBJ": {POS: PRON},  # sig, sej | ||||||
|     'RO|GEN': { POS: ADJ }, |     "PN|UTR|PLU|DEF|OBJ": {POS: PRON},  # oss, er, eder | ||||||
|     'RO|NOM': { POS: ADJ }, # första, andra, tredje, fjärde, femte |     "PN|UTR|PLU|DEF|SUB": {POS: PRON},  # vi | ||||||
|     'SN': { POS: SCONJ }, # att, om, innan, eftersom, medan |     "PN|UTR|SIN|DEF|OBJ": {POS: PRON},  # dig, mig, henne, honom, Er | ||||||
|     'UO': { POS: X }, # companionship, vice, versa, family, capita |     "PN|UTR|SIN|DEF|SUB": {POS: PRON},  # du, han, hon, jag, ni | ||||||
|     'VB|AN': { POS: VERB }, # jfr |     "PN|UTR|SIN|DEF|SUB/OBJ": {POS: PRON},  # den, denna, densamma | ||||||
|     'VB|IMP|AKT': { POS: VERB }, # se, Diskutera, låt, Läs, Gå |     "PN|UTR|SIN|IND|SUB": {POS: PRON},  # man | ||||||
|     'VB|IMP|SFO': { POS: VERB }, # tas |     "PN|UTR|SIN|IND|SUB/OBJ": {POS: PRON},  # en, var, någon, ingen, Varannan | ||||||
|     'VB|INF|AKT': { POS: VERB }, # vara, få, ha, bli, kunna |     "PP": {POS: ADP},  # i, av, på, för, till | ||||||
|     'VB|INF|SFO': { POS: VERB }, # användas, finnas, göras, tas, ses |     "PP|AN": {POS: ADP},  # f | ||||||
|     'VB|KON|PRS|AKT': { POS: VERB }, # vare, Gånge |     "PS|AN": {POS: DET}, | ||||||
|     'VB|KON|PRT|AKT': { POS: VERB }, # vore, finge |     "PS|NEU|SIN|DEF": {POS: DET},  # sitt, vårt, ditt, mitt, ert | ||||||
|     'VB|KON|PRT|SFO': { POS: VERB }, |     "PS|UTR/NEU|PLU|DEF": {POS: DET},  # sina, våra, dina, mina | ||||||
|     'VB|PRS|AKT': { POS: VERB }, # är, har, kan, får, måste |     "PS|UTR/NEU|SIN/PLU|DEF": {POS: DET},  # deras, dess, hans, hennes, varandras | ||||||
|     'VB|PRS|SFO': { POS: VERB }, # finns, kallas, behövs, beräknas, används |     "PS|UTR|SIN|DEF": {POS: DET},  # sin, vår, din, min, er | ||||||
|     'VB|PRT|AKT': { POS: VERB }, # skulle, var, hade, kunde, fick |     "RG": {POS: NUM},  # 2, 17, 20, 1, 18 | ||||||
|     'VB|PRT|SFO': { POS: VERB }, # fanns, gjordes, höjdes, användes, infördes |     "RG|GEN": {POS: NUM}, | ||||||
|     'VB|SMS': { POS: VERB }, # läs- |     "RG|MAS|SIN|DEF|NOM": {POS: NUM}, | ||||||
|     'VB|SUP|AKT': { POS: VERB }, # varit, fått, blivit, haft, kommit |     "RG|NEU|SIN|IND|NOM": {POS: NUM},  # ett | ||||||
|     'VB|SUP|SFO': { POS: VERB } # nämnts, gjorts, förändrats, sagts, framhållits |     "RG|NOM": {POS: NUM},  # två, tre, 1, 20, 2 | ||||||
|  |     "RG|SMS": {POS: NUM},  # ett-, 1950-, två-, tre-, 1700- | ||||||
|  |     "RG|UTR/NEU|SIN|DEF|NOM": {POS: NUM}, | ||||||
|  |     "RG|UTR|SIN|IND|NOM": {POS: NUM},  # en | ||||||
|  |     "RO|MAS|SIN|IND/DEF|GEN": {POS: ADJ}, | ||||||
|  |     "RO|MAS|SIN|IND/DEF|NOM": {POS: ADJ},  # förste | ||||||
|  |     "RO|GEN": {POS: ADJ}, | ||||||
|  |     "RO|NOM": {POS: ADJ},  # första, andra, tredje, fjärde, femte | ||||||
|  |     "SN": {POS: SCONJ},  # att, om, innan, eftersom, medan | ||||||
|  |     "UO": {POS: X},  # companionship, vice, versa, family, capita | ||||||
|  |     "VB|AN": {POS: VERB},  # jfr | ||||||
|  |     "VB|IMP|AKT": {POS: VERB},  # se, Diskutera, låt, Läs, Gå | ||||||
|  |     "VB|IMP|SFO": {POS: VERB},  # tas | ||||||
|  |     "VB|INF|AKT": {POS: VERB},  # vara, få, ha, bli, kunna | ||||||
|  |     "VB|INF|SFO": {POS: VERB},  # användas, finnas, göras, tas, ses | ||||||
|  |     "VB|KON|PRS|AKT": {POS: VERB},  # vare, Gånge | ||||||
|  |     "VB|KON|PRT|AKT": {POS: VERB},  # vore, finge | ||||||
|  |     "VB|KON|PRT|SFO": {POS: VERB}, | ||||||
|  |     "VB|PRS|AKT": {POS: VERB},  # är, har, kan, får, måste | ||||||
|  |     "VB|PRS|SFO": {POS: VERB},  # finns, kallas, behövs, beräknas, används | ||||||
|  |     "VB|PRT|AKT": {POS: VERB},  # skulle, var, hade, kunde, fick | ||||||
|  |     "VB|PRT|SFO": {POS: VERB},  # fanns, gjordes, höjdes, användes, infördes | ||||||
|  |     "VB|SMS": {POS: VERB},  # läs- | ||||||
|  |     "VB|SUP|AKT": {POS: VERB},  # varit, fått, blivit, haft, kommit | ||||||
|  |     "VB|SUP|SFO": {POS: VERB},  # nämnts, gjorts, förändrats, sagts, framhållits | ||||||
| } | } | ||||||
|  |  | ||||||
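
A quick check of the mapping direction, assuming a spaCy installation that ships this module: each fine-grained SUC tag on the left maps to a single coarse Universal Dependencies part of speech.

```python
from spacy.lang.sv.tag_map import TAG_MAP
from spacy.symbols import POS, NOUN, ADV

assert TAG_MAP["NN|NEU|PLU|IND|NOM"][POS] == NOUN   # barn, år, fall, länder, problem
assert TAG_MAP["AB"][POS] == ADV                    # inte, också, så, bara, nu
```
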
|  | @ -144,7 +144,7 @@ ABBREVIATIONS = [ | ||||||
| 
 | 
 | ||||||
| # Add abbreviation for trailing punctuation too. If the abbreviation already has a trailing punctuation - skip it. | # Add abbreviation for trailing punctuation too. If the abbreviation already has a trailing punctuation - skip it. | ||||||
| for abbr in ABBREVIATIONS: | for abbr in ABBREVIATIONS: | ||||||
|     if abbr.endswith(".") == False: |     if not abbr.endswith("."): | ||||||
|         ABBREVIATIONS.append(abbr + ".") |         ABBREVIATIONS.append(abbr + ".") | ||||||
| 
 | 
 | ||||||
| for orth in ABBREVIATIONS: | for orth in ABBREVIATIONS: | ||||||
|  |  | ||||||
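
The loop above appends dotted variants to the same list it is iterating over; this terminates because every appended item already ends in "." and is skipped on the next pass, but iterating over a snapshot makes that independence explicit. A stand-alone sketch of the same idea (the sample abbreviations are illustrative only):

```python
abbreviations = ["t.ex", "bl.a", "dvs."]

# Add a trailing-dot variant for every abbreviation that lacks one.
for abbr in list(abbreviations):        # snapshot: the loop never sees the appended items
    if not abbr.endswith("."):
        abbreviations.append(abbr + ".")

print(abbreviations)  # ['t.ex', 'bl.a', 'dvs.', 't.ex.', 'bl.a.']
```
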
|  | @ -4,16 +4,15 @@ from __future__ import unicode_literals | ||||||
| from .stop_words import STOP_WORDS | from .stop_words import STOP_WORDS | ||||||
| from .lex_attrs import LEX_ATTRS | from .lex_attrs import LEX_ATTRS | ||||||
| 
 | 
 | ||||||
| from ..tokenizer_exceptions import BASE_EXCEPTIONS |  | ||||||
| from ...language import Language | from ...language import Language | ||||||
| from ...attrs import LANG | from ...attrs import LANG | ||||||
| from ...util import update_exc |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class TamilDefaults(Language.Defaults): | class TamilDefaults(Language.Defaults): | ||||||
|     lex_attr_getters = dict(Language.Defaults.lex_attr_getters) |     lex_attr_getters = dict(Language.Defaults.lex_attr_getters) | ||||||
|     lex_attr_getters[LANG] = lambda text: "ta" |     lex_attr_getters[LANG] = lambda text: "ta" | ||||||
|     lex_attr_getters.update(LEX_ATTRS) |     lex_attr_getters.update(LEX_ATTRS) | ||||||
|  |     stop_words = STOP_WORDS | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class Tamil(Language): | class Tamil(Language): | ||||||
|  |  | ||||||
|  | @ -4,70 +4,33 @@ from __future__ import unicode_literals | ||||||
| from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS | from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS | ||||||
| from .stop_words import STOP_WORDS | from .stop_words import STOP_WORDS | ||||||
| from .lex_attrs import LEX_ATTRS | from .lex_attrs import LEX_ATTRS | ||||||
| 
 |  | ||||||
| # uncomment if files are available |  | ||||||
| # from .norm_exceptions import NORM_EXCEPTIONS |  | ||||||
| from .tag_map import TAG_MAP |  | ||||||
| # from .morph_rules import MORPH_RULES |  | ||||||
| 
 |  | ||||||
| # uncomment if lookup-based lemmatizer is available |  | ||||||
| from .lemmatizer import LOOKUP | from .lemmatizer import LOOKUP | ||||||
| # from ...lemmatizerlookup import Lemmatizer |  | ||||||
| 
 |  | ||||||
| from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS |  | ||||||
| from ..tokenizer_exceptions import BASE_EXCEPTIONS | from ..tokenizer_exceptions import BASE_EXCEPTIONS | ||||||
| from ..norm_exceptions import BASE_NORMS | from ..norm_exceptions import BASE_NORMS | ||||||
| from ...language import Language | from ...language import Language | ||||||
| from ...attrs import LANG, NORM | from ...attrs import LANG, NORM | ||||||
| from ...util import update_exc, add_lookups | from ...util import update_exc, add_lookups | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
| def _return_tl(_): | def _return_tl(_): | ||||||
|     return 'tl' |     return "tl" | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| # Create a Language subclass |  | ||||||
| # Documentation: https://spacy.io/docs/usage/adding-languages |  | ||||||
| 
 |  | ||||||
| # This file should be placed in spacy/lang/xx (ISO code of language). |  | ||||||
| # Before submitting a pull request, make sure the remove all comments from the |  | ||||||
| # language data files, and run at least the basic tokenizer tests. Simply add the |  | ||||||
| # language ID to the list of languages in spacy/tests/conftest.py to include it |  | ||||||
| # in the basic tokenizer sanity tests. You can optionally add a fixture for the |  | ||||||
| # language's tokenizer and add more specific tests. For more info, see the |  | ||||||
| # tests documentation: https://github.com/explosion/spaCy/tree/master/spacy/tests |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class TagalogDefaults(Language.Defaults): | class TagalogDefaults(Language.Defaults): | ||||||
|     lex_attr_getters = dict(Language.Defaults.lex_attr_getters) |     lex_attr_getters = dict(Language.Defaults.lex_attr_getters) | ||||||
|     lex_attr_getters[LANG] = _return_tl # ISO code |     lex_attr_getters[LANG] = _return_tl | ||||||
|     # add more norm exception dictionaries here |     lex_attr_getters[NORM] = add_lookups( | ||||||
|     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) |         Language.Defaults.lex_attr_getters[NORM], BASE_NORMS | ||||||
| 
 |     ) | ||||||
|     # overwrite functions for lexical attributes |  | ||||||
|     lex_attr_getters.update(LEX_ATTRS) |     lex_attr_getters.update(LEX_ATTRS) | ||||||
| 
 |  | ||||||
|     # add custom tokenizer exceptions to base exceptions |  | ||||||
|     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) |     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) | ||||||
| 
 |  | ||||||
|     # add stop words |  | ||||||
|     stop_words = STOP_WORDS |     stop_words = STOP_WORDS | ||||||
| 
 |     lemma_lookup = LOOKUP | ||||||
|     # if available: add tag map |  | ||||||
|     # tag_map = dict(TAG_MAP) |  | ||||||
| 
 |  | ||||||
|     # if available: add morph rules |  | ||||||
|     # morph_rules = dict(MORPH_RULES) |  | ||||||
| 
 |  | ||||||
|     # if available: add lookup lemmatizer |  | ||||||
|     # @classmethod |  | ||||||
|     # def create_lemmatizer(cls, nlp=None): |  | ||||||
|     #     return Lemmatizer(LOOKUP) |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class Tagalog(Language): | class Tagalog(Language): | ||||||
|     lang = 'tl' # ISO code |     lang = "tl" | ||||||
|     Defaults = TagalogDefaults # set Defaults to custom language defaults |     Defaults = TagalogDefaults | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| # set default export – this allows the language class to be lazy-loaded | __all__ = ["Tagalog"] | ||||||
| __all__ = ['Tagalog'] |  | ||||||
|  |  | ||||||
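
A minimal usage sketch for a language class assembled this way, assuming a spaCy release that includes the Tagalog module as `spacy.lang.tl`:

```python
from spacy.lang.tl import Tagalog

nlp = Tagalog()                          # tokenizer-only pipeline built from TagalogDefaults
doc = nlp("apat na libo")
print([(t.text, t.like_num, t.is_stop) for t in doc])
```
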
|  | @ -2,11 +2,6 @@ | ||||||
| from __future__ import unicode_literals | from __future__ import unicode_literals | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| # Adding a lemmatizer lookup table |  | ||||||
| # Documentation: https://spacy.io/docs/usage/adding-languages#lemmatizer |  | ||||||
| # Entries should be added in the following format: |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| LOOKUP = { | LOOKUP = { | ||||||
|     "kaugnayan": "ugnay", |     "kaugnayan": "ugnay", | ||||||
|     "sangkatauhan": "tao", |     "sangkatauhan": "tao", | ||||||
|  | @ -14,5 +9,5 @@ LOOKUP = { | ||||||
|     "pandaigdigan": "daigdig", |     "pandaigdigan": "daigdig", | ||||||
|     "kasaysayan": "saysay", |     "kasaysayan": "saysay", | ||||||
|     "kabayanihan": "bayani", |     "kabayanihan": "bayani", | ||||||
|     "karuwagan": "duwag" |     "karuwagan": "duwag", | ||||||
| } | } | ||||||
|  |  | ||||||
|  | @ -1,33 +1,55 @@ | ||||||
| # coding: utf8 | # coding: utf8 | ||||||
| from __future__ import unicode_literals | from __future__ import unicode_literals | ||||||
| 
 | 
 | ||||||
| # import the symbols for the attrs you want to overwrite |  | ||||||
| from ...attrs import LIKE_NUM | from ...attrs import LIKE_NUM | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| # Overwriting functions for lexical attributes | _num_words = [ | ||||||
| # Documentation: https://localhost:1234/docs/usage/adding-languages#lex-attrs |     "sero", | ||||||
| # Most of these functions, like is_lower or like_url should be language- |     "isa", | ||||||
| # independent. Others, like like_num (which includes both digits and number |     "dalawa", | ||||||
| # words), requires customisation. |     "tatlo", | ||||||
| 
 |     "apat", | ||||||
| 
 |     "lima", | ||||||
| # Example: check if token resembles a number |     "anim", | ||||||
| 
 |     "pito", | ||||||
| _num_words = ['sero', 'isa', 'dalawa', 'tatlo', 'apat', 'lima', 'anim', 'pito', |     "walo", | ||||||
|               'walo', 'siyam', 'sampu', 'labing-isa', 'labindalawa', 'labintatlo', 'labing-apat', |     "siyam", | ||||||
|               'labinlima', 'labing-anim', 'labimpito', 'labing-walo', 'labinsiyam', 'dalawampu', |     "sampu", | ||||||
|               'tatlumpu', 'apatnapu', 'limampu', 'animnapu', 'pitumpu', 'walumpu', 'siyamnapu', |     "labing-isa", | ||||||
|               'daan', 'libo', 'milyon', 'bilyon', 'trilyon', 'quadrilyon', |     "labindalawa", | ||||||
|               'gajilyon', 'bazilyon'] |     "labintatlo", | ||||||
|  |     "labing-apat", | ||||||
|  |     "labinlima", | ||||||
|  |     "labing-anim", | ||||||
|  |     "labimpito", | ||||||
|  |     "labing-walo", | ||||||
|  |     "labinsiyam", | ||||||
|  |     "dalawampu", | ||||||
|  |     "tatlumpu", | ||||||
|  |     "apatnapu", | ||||||
|  |     "limampu", | ||||||
|  |     "animnapu", | ||||||
|  |     "pitumpu", | ||||||
|  |     "walumpu", | ||||||
|  |     "siyamnapu", | ||||||
|  |     "daan", | ||||||
|  |     "libo", | ||||||
|  |     "milyon", | ||||||
|  |     "bilyon", | ||||||
|  |     "trilyon", | ||||||
|  |     "quadrilyon", | ||||||
|  |     "gajilyon", | ||||||
|  |     "bazilyon", | ||||||
|  | ] | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def like_num(text): | def like_num(text): | ||||||
|     text = text.replace(',', '').replace('.', '') |     text = text.replace(",", "").replace(".", "") | ||||||
|     if text.isdigit(): |     if text.isdigit(): | ||||||
|         return True |         return True | ||||||
|     if text.count('/') == 1: |     if text.count("/") == 1: | ||||||
|         num, denom = text.split('/') |         num, denom = text.split("/") | ||||||
|         if num.isdigit() and denom.isdigit(): |         if num.isdigit() and denom.isdigit(): | ||||||
|             return True |             return True | ||||||
|     if text in _num_words: |     if text in _num_words: | ||||||
|  | @ -35,9 +57,4 @@ def like_num(text): | ||||||
|     return False |     return False | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| # Create dictionary of functions to overwrite. The default lex_attr_getters are | LEX_ATTRS = {LIKE_NUM: like_num} | ||||||
| # updated with this one, so only the functions defined here are overwritten. |  | ||||||
| 
 |  | ||||||
| LEX_ATTRS = { |  | ||||||
|     LIKE_NUM: like_num |  | ||||||
| } |  | ||||||
|  |  | ||||||
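
A few illustrative calls to `like_num` as defined above (the membership test against `_num_words` is exact, so inputs are assumed to be lowercase):

```python
from spacy.lang.tl.lex_attrs import like_num

print(like_num("1,200"))   # True: all digits after stripping "," and "."
print(like_num("3/4"))     # True: numerator and denominator are digits
print(like_num("dalawa"))  # True: listed number word
print(like_num("aso"))     # False
```
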
|  | @ -1,162 +1,154 @@ | ||||||
| # encoding: utf8 | # encoding: utf8 | ||||||
| from __future__ import unicode_literals | from __future__ import unicode_literals | ||||||
| 
 | 
 | ||||||
| 
 | STOP_WORDS = set( | ||||||
| # Add stop words |     """ | ||||||
| # Documentation: https://spacy.io/docs/usage/adding-languages#stop-words | akin | ||||||
| # To improve readability, words should be ordered alphabetically and separated | aking | ||||||
| # by spaces and newlines. When adding stop words from an online source, always | ako | ||||||
| # include the link in a comment. Make sure to proofread and double-check the | alin | ||||||
| # words – lists available online are often known to contain mistakes. | am | ||||||
| 
 | amin | ||||||
| # data from https://github.com/stopwords-iso/stopwords-tl/blob/master/stopwords-tl.txt | aming | ||||||
| 
 | ang | ||||||
| STOP_WORDS = set(""" | ano | ||||||
|     akin | anumang | ||||||
|     aking | apat | ||||||
|     ako | at | ||||||
|     alin | atin | ||||||
|     am | ating | ||||||
|     amin | ay | ||||||
|     aming | bababa | ||||||
|     ang | bago | ||||||
|     ano | bakit | ||||||
|     anumang | bawat | ||||||
|     apat | bilang | ||||||
|     at | dahil | ||||||
|     atin | dalawa | ||||||
|     ating | dapat | ||||||
|     ay | din | ||||||
|     bababa | dito | ||||||
|     bago | doon | ||||||
|     bakit | gagawin | ||||||
|     bawat | gayunman | ||||||
|     bilang | ginagawa | ||||||
|     dahil | ginawa | ||||||
|     dalawa | ginawang | ||||||
|     dapat | gumawa | ||||||
|     din | gusto | ||||||
|     dito | habang | ||||||
|     doon | hanggang | ||||||
|     gagawin | hindi | ||||||
|     gayunman | huwag | ||||||
|     ginagawa | iba | ||||||
|     ginawa | ibaba | ||||||
|     ginawang | ibabaw | ||||||
|     gumawa | ibig | ||||||
|     gusto | ikaw | ||||||
|     habang | ilagay | ||||||
|     hanggang | ilalim | ||||||
|     hindi | ilan | ||||||
|     huwag | inyong | ||||||
|     iba | isa | ||||||
|     ibaba | isang | ||||||
|     ibabaw | itaas | ||||||
|     ibig | ito | ||||||
|     ikaw | iyo | ||||||
|     ilagay | iyon | ||||||
|     ilalim | iyong | ||||||
|     ilan | ka | ||||||
|     inyong | kahit | ||||||
|     isa | kailangan | ||||||
|     isang | kailanman | ||||||
|     itaas | kami | ||||||
|     ito | kanila | ||||||
|     iyo | kanilang | ||||||
|     iyon | kanino | ||||||
|     iyong | kanya | ||||||
|     ka | kanyang | ||||||
|     kahit | kapag | ||||||
|     kailangan | kapwa | ||||||
|     kailanman | karamihan | ||||||
|     kami | katiyakan | ||||||
|     kanila | katulad | ||||||
|     kanilang | kaya | ||||||
|     kanino | kaysa | ||||||
|     kanya | ko | ||||||
|     kanyang | kong | ||||||
|     kapag | kulang | ||||||
|     kapwa | kumuha | ||||||
|     karamihan | kung | ||||||
|     katiyakan | laban | ||||||
|     katulad | lahat | ||||||
|     kaya | lamang | ||||||
|     kaysa | likod | ||||||
|     ko | lima | ||||||
|     kong | maaari | ||||||
|     kulang | maaaring | ||||||
|     kumuha | maging | ||||||
|     kung | mahusay | ||||||
|     laban | makita | ||||||
|     lahat | marami | ||||||
|     lamang | marapat | ||||||
|     likod | masyado | ||||||
|     lima | may | ||||||
|     maaari | mayroon | ||||||
|     maaaring | mga | ||||||
|     maging | minsan | ||||||
|     mahusay | mismo | ||||||
|     makita | mula | ||||||
|     marami | muli | ||||||
|     marapat | na | ||||||
|     masyado | nabanggit | ||||||
|     may | naging | ||||||
|     mayroon | nagkaroon | ||||||
|     mga | nais | ||||||
|     minsan | nakita | ||||||
|     mismo | namin | ||||||
|     mula | napaka | ||||||
|     muli | narito | ||||||
|     na | nasaan | ||||||
|     nabanggit | ng | ||||||
|     naging | ngayon | ||||||
|     nagkaroon | ni | ||||||
|     nais | nila | ||||||
|     nakita | nilang | ||||||
|     namin | nito | ||||||
|     napaka | niya | ||||||
|     narito | niyang | ||||||
|     nasaan | noon | ||||||
|     ng | o | ||||||
|     ngayon | pa | ||||||
|     ni | paano | ||||||
|     nila | pababa | ||||||
|     nilang | paggawa | ||||||
|     nito | pagitan | ||||||
|     niya | pagkakaroon | ||||||
|     niyang | pagkatapos | ||||||
|     noon | palabas | ||||||
|     o | pamamagitan | ||||||
|     pa | panahon | ||||||
|     paano | pangalawa | ||||||
|     pababa | para | ||||||
|     paggawa | paraan | ||||||
|     pagitan | pareho | ||||||
|     pagkakaroon | pataas | ||||||
|     pagkatapos | pero | ||||||
|     palabas | pumunta | ||||||
|     pamamagitan | pumupunta | ||||||
|     panahon | sa | ||||||
|     pangalawa | saan | ||||||
|     para | sabi | ||||||
|     paraan | sabihin | ||||||
|     pareho | sarili | ||||||
|     pataas | sila | ||||||
|     pero | sino | ||||||
|     pumunta | siya | ||||||
|     pumupunta | tatlo | ||||||
|     sa | tayo | ||||||
|     saan | tulad | ||||||
|     sabi | tungkol | ||||||
|     sabihin | una | ||||||
|     sarili | walang | ||||||
|     sila | """.split() | ||||||
|     sino | ) | ||||||
|     siya |  | ||||||
|     tatlo |  | ||||||
|     tayo |  | ||||||
|     tulad |  | ||||||
|     tungkol |  | ||||||
|     una |  | ||||||
|     walang |  | ||||||
| """.split()) |  | ||||||
|  |  | ||||||
|  | @ -1,36 +0,0 @@ | ||||||
| # coding: utf8 |  | ||||||
| from __future__ import unicode_literals |  | ||||||
| 
 |  | ||||||
| from ...symbols import POS, ADV, NOUN, ADP, PRON, SCONJ, PROPN, DET, SYM, INTJ |  | ||||||
| from ...symbols import PUNCT, NUM, AUX, X, CONJ, ADJ, VERB, PART, SPACE, CCONJ |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| # Add a tag map |  | ||||||
| # Documentation: https://spacy.io/docs/usage/adding-languages#tag-map |  | ||||||
| # Universal Dependencies: http://universaldependencies.org/u/pos/all.html |  | ||||||
| # The keys of the tag map should be strings in your tag set. The dictionary must |  | ||||||
| # have an entry POS whose value is one of the Universal Dependencies tags. |  | ||||||
| # Optionally, you can also include morphological features or other attributes. |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| TAG_MAP = { |  | ||||||
|     "ADV":      {POS: ADV}, |  | ||||||
|     "NOUN":     {POS: NOUN}, |  | ||||||
|     "ADP":      {POS: ADP}, |  | ||||||
|     "PRON":     {POS: PRON}, |  | ||||||
|     "SCONJ":    {POS: SCONJ}, |  | ||||||
|     "PROPN":    {POS: PROPN}, |  | ||||||
|     "DET":      {POS: DET}, |  | ||||||
|     "SYM":      {POS: SYM}, |  | ||||||
|     "INTJ":     {POS: INTJ}, |  | ||||||
|     "PUNCT":    {POS: PUNCT}, |  | ||||||
|     "NUM":      {POS: NUM}, |  | ||||||
|     "AUX":      {POS: AUX}, |  | ||||||
|     "X":        {POS: X}, |  | ||||||
|     "CONJ":     {POS: CONJ}, |  | ||||||
|     "CCONJ":    {POS: CCONJ}, |  | ||||||
|     "ADJ":      {POS: ADJ}, |  | ||||||
|     "VERB":     {POS: VERB}, |  | ||||||
|     "PART":     {POS: PART}, |  | ||||||
|     "SP":     	{POS: SPACE} |  | ||||||
| } |  | ||||||
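The comments deleted above describe the tag map convention: each key is a tag from the language's tag set, each value must contain a POS entry set to a Universal Dependencies tag, and may optionally add morphological features. A minimal illustrative sketch, not part of this diff (the "NNS" key and its feature are hypothetical, shown only to illustrate the optional-features case):

    from spacy.symbols import POS, NOUN

    TAG_MAP = {
        "NOUN": {POS: NOUN},                   # coarse tag mapped to a UD POS only
        "NNS": {POS: NOUN, "Number": "plur"},  # hypothetical fine-grained tag with a feature
    }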
|  | @ -1,48 +1,20 @@ | ||||||
| # coding: utf8 | # coding: utf8 | ||||||
| from __future__ import unicode_literals | from __future__ import unicode_literals | ||||||
| 
 | 
 | ||||||
| # import symbols – if you need to use more, add them here | from ...symbols import ORTH, LEMMA | ||||||
| from ...symbols import ORTH, LEMMA, TAG, NORM, ADP, DET |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| # Add tokenizer exceptions |  | ||||||
| # Documentation: https://spacy.io/docs/usage/adding-languages#tokenizer-exceptions |  | ||||||
| # Feel free to use custom logic to generate repetitive exceptions more efficiently. |  | ||||||
| # If an exception is split into more than one token, the ORTH values combined always |  | ||||||
| # need to match the original string. |  | ||||||
| 
 |  | ||||||
| # Exceptions should be added in the following format: |  | ||||||
| 
 |  | ||||||
| _exc = { | _exc = { | ||||||
|     "tayo'y": [ |     "tayo'y": [{ORTH: "tayo", LEMMA: "tayo"}, {ORTH: "'y", LEMMA: "ay"}], | ||||||
|         {ORTH: "tayo", LEMMA: "tayo"}, |     "isa'y": [{ORTH: "isa", LEMMA: "isa"}, {ORTH: "'y", LEMMA: "ay"}], | ||||||
|         {ORTH: "'y", LEMMA: "ay"}], |     "baya'y": [{ORTH: "baya", LEMMA: "bayan"}, {ORTH: "'y", LEMMA: "ay"}], | ||||||
|     "isa'y": [ |     "sa'yo": [{ORTH: "sa", LEMMA: "sa"}, {ORTH: "'yo", LEMMA: "iyo"}], | ||||||
|         {ORTH: "isa", LEMMA: "isa"}, |     "ano'ng": [{ORTH: "ano", LEMMA: "ano"}, {ORTH: "'ng", LEMMA: "ang"}], | ||||||
|         {ORTH: "'y", LEMMA: "ay"}], |     "siya'y": [{ORTH: "siya", LEMMA: "siya"}, {ORTH: "'y", LEMMA: "ay"}], | ||||||
|     "baya'y": [ |     "nawa'y": [{ORTH: "nawa", LEMMA: "nawa"}, {ORTH: "'y", LEMMA: "ay"}], | ||||||
|         {ORTH: "baya", LEMMA: "bayan"}, |     "papa'no": [{ORTH: "papa'no", LEMMA: "papaano"}], | ||||||
|         {ORTH: "'y", LEMMA: "ay"}], |     "'di": [{ORTH: "'di", LEMMA: "hindi"}], | ||||||
|     "sa'yo": [ |  | ||||||
|         {ORTH: "sa", LEMMA: "sa"}, |  | ||||||
|         {ORTH: "'yo", LEMMA: "iyo"}], |  | ||||||
|     "ano'ng": [ |  | ||||||
|         {ORTH: "ano", LEMMA: "ano"}, |  | ||||||
|         {ORTH: "'ng", LEMMA: "ang"}], |  | ||||||
|     "siya'y": [ |  | ||||||
|         {ORTH: "siya", LEMMA: "siya"}, |  | ||||||
|         {ORTH: "'y", LEMMA: "ay"}], |  | ||||||
|     "nawa'y": [ |  | ||||||
|         {ORTH: "nawa", LEMMA: "nawa"}, |  | ||||||
|         {ORTH: "'y", LEMMA: "ay"}], |  | ||||||
|     "papa'no": [ |  | ||||||
|         {ORTH: "papa'no", LEMMA: "papaano"}], |  | ||||||
|     "'di": [ |  | ||||||
|         {ORTH: "'di", LEMMA: "hindi"}] |  | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| # To keep things clean and readable, it's recommended to only declare the |  | ||||||
| # TOKENIZER_EXCEPTIONS at the bottom: |  | ||||||
| 
 |  | ||||||
| TOKENIZER_EXCEPTIONS = _exc | TOKENIZER_EXCEPTIONS = _exc | ||||||
|  |  | ||||||
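One invariant the removed comments spelled out is that, for a multi-token exception, the ORTH values must concatenate back to the original string. A quick standalone check of that invariant (a sketch, assuming the module is importable as spacy.lang.tl.tokenizer_exceptions, as laid out in this diff):

    from spacy.symbols import ORTH
    from spacy.lang.tl.tokenizer_exceptions import TOKENIZER_EXCEPTIONS

    # Every exception's pieces must join back into the key it is registered under.
    for string, substrings in TOKENIZER_EXCEPTIONS.items():
        assert "".join(sub[ORTH] for sub in substrings) == string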
|  | @ -3,7 +3,7 @@ from __future__ import unicode_literals | ||||||
| 
 | 
 | ||||||
| import re | import re | ||||||
| 
 | 
 | ||||||
| from ..symbols import ORTH, POS, TAG, LEMMA, SPACE, PUNCT | from ..symbols import ORTH, POS, TAG, LEMMA, SPACE | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| # URL validation regex courtesy of: https://mathiasbynens.be/demo/url-regex | # URL validation regex courtesy of: https://mathiasbynens.be/demo/url-regex | ||||||
|  |  | ||||||
|  | @ -5,71 +5,32 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS | ||||||
| from .stop_words import STOP_WORDS | from .stop_words import STOP_WORDS | ||||||
| from .lex_attrs import LEX_ATTRS | from .lex_attrs import LEX_ATTRS | ||||||
| 
 | 
 | ||||||
| # uncomment if files are available |  | ||||||
| # from .norm_exceptions import NORM_EXCEPTIONS |  | ||||||
| # from .tag_map import TAG_MAP |  | ||||||
| # from .morph_rules import MORPH_RULES |  | ||||||
| 
 |  | ||||||
| # uncomment if lookup-based lemmatizer is available |  | ||||||
| # from .lemmatizer import LOOKUP |  | ||||||
| # from ...lemmatizerlookup import Lemmatizer |  | ||||||
| 
 |  | ||||||
| from ..tokenizer_exceptions import BASE_EXCEPTIONS | from ..tokenizer_exceptions import BASE_EXCEPTIONS | ||||||
| from ..norm_exceptions import BASE_NORMS | from ..norm_exceptions import BASE_NORMS | ||||||
| from ...util import update_exc, add_lookups | from ...util import update_exc, add_lookups | ||||||
| from ...language import Language | from ...language import Language | ||||||
| from ...attrs import LANG, LIKE_NUM, NORM | from ...attrs import LANG, NORM | ||||||
| # from .tag_map import TAG_MAP |  | ||||||
| from .lemmatizer import UkrainianLemmatizer | from .lemmatizer import UkrainianLemmatizer | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| # Create a Language subclass |  | ||||||
| # Documentation: https://spacy.io/docs/usage/adding-languages |  | ||||||
| 
 |  | ||||||
| # This file should be placed in spacy/lang/xx (ISO code of language). |  | ||||||
| # Before submitting a pull request, make sure the remove all comments from the |  | ||||||
| # language data files, and run at least the basic tokenizer tests. Simply add the |  | ||||||
| # language ID to the list of languages in spacy/tests/conftest.py to include it |  | ||||||
| # in the basic tokenizer sanity tests. You can optionally add a fixture for the |  | ||||||
| # language's tokenizer and add more specific tests. For more info, see the |  | ||||||
| # tests documentation: https://github.com/explosion/spaCy/tree/master/spacy/tests |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| class UkrainianDefaults(Language.Defaults): | class UkrainianDefaults(Language.Defaults): | ||||||
|     lex_attr_getters = dict(Language.Defaults.lex_attr_getters) |     lex_attr_getters = dict(Language.Defaults.lex_attr_getters) | ||||||
|     lex_attr_getters[LANG] = lambda text: 'uk' # ISO code |     lex_attr_getters[LANG] = lambda text: "uk" | ||||||
|     # add more norm exception dictionaries here |     lex_attr_getters[NORM] = add_lookups( | ||||||
|     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) |         Language.Defaults.lex_attr_getters[NORM], BASE_NORMS | ||||||
| 
 |     ) | ||||||
|     # overwrite functions for lexical attributes |  | ||||||
|     lex_attr_getters.update(LEX_ATTRS) |     lex_attr_getters.update(LEX_ATTRS) | ||||||
| 
 |  | ||||||
|     # add custom tokenizer exceptions to base exceptions |  | ||||||
|     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) |     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) | ||||||
| 
 |  | ||||||
|     # add stop words |  | ||||||
|     stop_words = STOP_WORDS |     stop_words = STOP_WORDS | ||||||
| 
 | 
 | ||||||
|     # if available: add tag map |  | ||||||
|     # tag_map = dict(TAG_MAP) |  | ||||||
| 
 |  | ||||||
|     # if available: add morph rules |  | ||||||
|     # morph_rules = dict(MORPH_RULES) |  | ||||||
| 
 |  | ||||||
|     # if available: add lookup lemmatizer |  | ||||||
|     # @classmethod |  | ||||||
|     # def create_lemmatizer(cls, nlp=None): |  | ||||||
|     #     return Lemmatizer(LOOKUP) |  | ||||||
| 
 |  | ||||||
|     @classmethod |     @classmethod | ||||||
|     def create_lemmatizer(cls, nlp=None): |     def create_lemmatizer(cls, nlp=None): | ||||||
|         return UkrainianLemmatizer() |         return UkrainianLemmatizer() | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class Ukrainian(Language): | class Ukrainian(Language): | ||||||
|     lang = 'uk' # ISO code |     lang = "uk" | ||||||
|     Defaults = UkrainianDefaults # set Defaults to custom language defaults |     Defaults = UkrainianDefaults | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| # set default export – this allows the language class to be lazy-loaded | __all__ = ["Ukrainian"] | ||||||
| __all__ = ['Ukrainian'] |  | ||||||
|  |  | ||||||
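For context, the language class declared above is lazy-loaded by its ISO code. A rough usage sketch, not part of the diff; note that instantiating the class builds the vocab, so the custom lemmatizer requires pymorphy2 plus the Ukrainian dictionaries to be installed:

    from spacy.util import get_lang_class

    Ukrainian = get_lang_class("uk")   # imports spacy.lang.uk and returns the class
    nlp = Ukrainian()                  # tokenizer, stop words etc. come from UkrainianDefaults
    doc = nlp("Привіт, світе!")
    print([token.text for token in doc])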
|  | @ -14,10 +14,10 @@ sentences = [ | ||||||
|     "Ніч на середу буде морозною.", |     "Ніч на середу буде морозною.", | ||||||
|     "Чим кращі книги ти читав, тим гірше спиш.",  # Serhiy Zhadan |     "Чим кращі книги ти читав, тим гірше спиш.",  # Serhiy Zhadan | ||||||
|     "Найстаріші ґудзики, відомі людству, археологи знайшли в долині ріки Інд.", |     "Найстаріші ґудзики, відомі людству, археологи знайшли в долині ріки Інд.", | ||||||
|     "Слов'янське слово «Україна» вперше згадується у Київському літописному зводі за Іпатіївським списком під 1187 роком.", # wikipedia |     "Слов'янське слово «Україна» вперше згадується у Київському літописному зводі за Іпатіївським списком під 1187 роком.",  # wikipedia | ||||||
|     "Де у Києві найсмачніша кава?", |     "Де у Києві найсмачніша кава?", | ||||||
|     "Від Нижнього озера довгими дерев’яними сходами, над якими синьо й біло горіли маленькі коробочки-ліхтарики, підіймалися до нього двоє стовусів: найкращий друг Вертутій і його дванадцятилітній онук Чублик.", # blyznets_viktor_semenovych/zemlia_svitliachkiv |     "Від Нижнього озера довгими дерев’яними сходами, над якими синьо й біло горіли маленькі коробочки-ліхтарики, підіймалися до нього двоє стовусів: найкращий друг Вертутій і його дванадцятилітній онук Чублик.",  # blyznets_viktor_semenovych/zemlia_svitliachkiv | ||||||
|     "Китайський космічний зонд \"Чан'е-4\" вперше в історії здійснив м'яку посадку на зворотному боці Місяця.", |     "Китайський космічний зонд \"Чан'е-4\" вперше в історії здійснив м'яку посадку на зворотному боці Місяця.", | ||||||
|     "Коли до губ твоїх лишається півподиху, коли до губ твоїх лишається півкроку – зіниці твої виткані із подиву, в очах у тебе синьо і широко.", # Hryhorij Czubaj |     "Коли до губ твоїх лишається півподиху, коли до губ твоїх лишається півкроку – зіниці твої виткані із подиву, в очах у тебе синьо і широко.",  # Hryhorij Czubaj | ||||||
|     "Дорогу сестру збираю у дорогу, а брати вирішили не брати машину." # homographs |     "Дорогу сестру збираю у дорогу, а брати вирішили не брати машину.",  # homographs | ||||||
| ] | ] | ||||||
|  |  | ||||||
|  | @ -1,12 +1,15 @@ | ||||||
|  | # coding: utf8 | ||||||
|  | from __future__ import unicode_literals | ||||||
|  | 
 | ||||||
| from ..ru.lemmatizer import RussianLemmatizer | from ..ru.lemmatizer import RussianLemmatizer | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class UkrainianLemmatizer(RussianLemmatizer): | class UkrainianLemmatizer(RussianLemmatizer): | ||||||
| 
 |     def __init__(self, pymorphy2_lang="ru"): | ||||||
|     def __init__(self, pymorphy2_lang='ru'): |  | ||||||
|         try: |         try: | ||||||
|             super(UkrainianLemmatizer, self).__init__(pymorphy2_lang='uk') |             super(UkrainianLemmatizer, self).__init__(pymorphy2_lang="uk") | ||||||
|         except ImportError: |         except ImportError: | ||||||
|             raise ImportError( |             raise ImportError( | ||||||
|                 'The Ukrainian lemmatizer requires the pymorphy2 library and dictionaries: ' |                 "The Ukrainian lemmatizer requires the pymorphy2 library and dictionaries: " | ||||||
|                 'try to fix it with "pip install git+https://github.com/kmike/pymorphy2.git pymorphy2-dicts-uk"') |                 'try to fix it with "pip install git+https://github.com/kmike/pymorphy2.git pymorphy2-dicts-uk"' | ||||||
|  |             ) | ||||||
|  |  | ||||||
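The re-raised ImportError carries the install hint, so calling code can surface it directly. A hedged sketch of what a caller sees when pymorphy2 is missing:

    from spacy.lang.uk.lemmatizer import UkrainianLemmatizer

    try:
        lemmatizer = UkrainianLemmatizer()
    except ImportError as err:
        # The message suggests installing pymorphy2 and pymorphy2-dicts-uk.
        print(err)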
|  | @ -1,32 +1,68 @@ | ||||||
| # coding: utf8 | # coding: utf8 | ||||||
| from __future__ import unicode_literals | from __future__ import unicode_literals | ||||||
| 
 | 
 | ||||||
| # import the symbols for the attrs you want to overwrite |  | ||||||
| from ...attrs import LIKE_NUM | from ...attrs import LIKE_NUM | ||||||
| 
 | 
 | ||||||
| 
 | _num_words = [ | ||||||
| # Overwriting functions for lexical attributes |     "більйон", | ||||||
| # Documentation: https://localhost:1234/docs/usage/adding-languages#lex-attrs |     "вісім", | ||||||
| # Most of these functions, like is_lower or like_url should be language- |     "вісімдесят", | ||||||
| # independent. Others, like like_num (which includes both digits and number |     "вісімнадцять", | ||||||
| # words), requires customisation. |     "вісімсот", | ||||||
| 
 |     "восьмий", | ||||||
| 
 |     "два", | ||||||
| # Example: check if token resembles a number |     "двадцять", | ||||||
| _num_words = ["більйон", "вісім", "вісімдесят", "вісімнадцять", "вісімсот", "восьмий", "два", "двадцять", "дванадцять", |     "дванадцять", | ||||||
|               "двісті", "дев'яносто", "дев'ятнадцять", "дев'ятсот", "дев'ять", "десять", "децильйон", "квадрильйон", |     "двісті", | ||||||
|               "квінтильйон", "мільйон", "мільярд", "нонильйон", "один", "одинадцять", "октильйон", "п'ятий", |     "дев'яносто", | ||||||
|               "п'ятисотий", "п'ятнадцять", "п'ятсот", "п'ять", "секстильйон", "септильйон", "сім", "сімдесят", |     "дев'ятнадцять", | ||||||
|               "сімнадцять", "сімсот", "сорок", "сто", "тисяча", "три", "тридцять", "трильйон", "тринадцять", "триста", |     "дев'ятсот", | ||||||
|               "чотири", "чотириста", "чотирнадцять", "шістдесят", "шістнадцять", "шістсот", "шість"] |     "дев'ять", | ||||||
|  |     "десять", | ||||||
|  |     "децильйон", | ||||||
|  |     "квадрильйон", | ||||||
|  |     "квінтильйон", | ||||||
|  |     "мільйон", | ||||||
|  |     "мільярд", | ||||||
|  |     "нонильйон", | ||||||
|  |     "один", | ||||||
|  |     "одинадцять", | ||||||
|  |     "октильйон", | ||||||
|  |     "п'ятий", | ||||||
|  |     "п'ятисотий", | ||||||
|  |     "п'ятнадцять", | ||||||
|  |     "п'ятсот", | ||||||
|  |     "п'ять", | ||||||
|  |     "секстильйон", | ||||||
|  |     "септильйон", | ||||||
|  |     "сім", | ||||||
|  |     "сімдесят", | ||||||
|  |     "сімнадцять", | ||||||
|  |     "сімсот", | ||||||
|  |     "сорок", | ||||||
|  |     "сто", | ||||||
|  |     "тисяча", | ||||||
|  |     "три", | ||||||
|  |     "тридцять", | ||||||
|  |     "трильйон", | ||||||
|  |     "тринадцять", | ||||||
|  |     "триста", | ||||||
|  |     "чотири", | ||||||
|  |     "чотириста", | ||||||
|  |     "чотирнадцять", | ||||||
|  |     "шістдесят", | ||||||
|  |     "шістнадцять", | ||||||
|  |     "шістсот", | ||||||
|  |     "шість", | ||||||
|  | ] | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def like_num(text): | def like_num(text): | ||||||
|     text = text.replace(',', '').replace('.', '') |     text = text.replace(",", "").replace(".", "") | ||||||
|     if text.isdigit(): |     if text.isdigit(): | ||||||
|         return True |         return True | ||||||
|     if text.count('/') == 1: |     if text.count("/") == 1: | ||||||
|         num, denom = text.split('/') |         num, denom = text.split("/") | ||||||
|         if num.isdigit() and denom.isdigit(): |         if num.isdigit() and denom.isdigit(): | ||||||
|             return True |             return True | ||||||
|     if text in _num_words: |     if text in _num_words: | ||||||
|  | @ -34,9 +70,4 @@ def like_num(text): | ||||||
|     return False |     return False | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| # Create dictionary of functions to overwrite. The default lex_attr_getters are | LEX_ATTRS = {LIKE_NUM: like_num} | ||||||
| # updated with this one, so only the functions defined here are overwritten. |  | ||||||
| 
 |  | ||||||
| LEX_ATTRS = { |  | ||||||
|     LIKE_NUM: like_num |  | ||||||
| } |  | ||||||
|  |  | ||||||
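The removed comments noted that like_num is one of the few lexical attributes that needs per-language customisation, since it covers number words as well as digits. A small behavioural sketch (standalone, not part of the diff), based on the function as shown above:

    from spacy.lang.uk.lex_attrs import like_num

    assert like_num("10")          # plain digits
    assert like_num("3/4")         # simple fractions
    assert like_num("п'ять")       # number words listed in _num_words
    assert not like_num("кава")    # everything else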
|  | @ -2,15 +2,8 @@ | ||||||
| from __future__ import unicode_literals | from __future__ import unicode_literals | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| # Add stop words | STOP_WORDS = set( | ||||||
| # Documentation: https://spacy.io/docs/usage/adding-languages#stop-words |     """а | ||||||
| # To improve readability, words should be ordered alphabetically and separated |  | ||||||
| # by spaces and newlines. When adding stop words from an online source, always |  | ||||||
| # include the link in a comment. Make sure to proofread and double-check the |  | ||||||
| # words – lists available online are often known to contain mistakes. |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| STOP_WORDS = set("""а |  | ||||||
| або | або | ||||||
| адже | адже | ||||||
| але | але | ||||||
|  | @ -401,4 +394,5 @@ STOP_WORDS = set("""а | ||||||
| якій | якій | ||||||
| якого | якого | ||||||
| якщо | якщо | ||||||
| """.split()) | """.split() | ||||||
|  | ) | ||||||
|  |  | ||||||
|  | @ -5,32 +5,24 @@ from ..symbols import POS, ADV, NOUN, ADP, PRON, SCONJ, PROPN, DET, SYM, INTJ | ||||||
| from ..symbols import PUNCT, NUM, AUX, X, CONJ, ADJ, VERB, PART, SPACE, CCONJ | from ..symbols import PUNCT, NUM, AUX, X, CONJ, ADJ, VERB, PART, SPACE, CCONJ | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| # Add a tag map |  | ||||||
| # Documentation: https://spacy.io/docs/usage/adding-languages#tag-map |  | ||||||
| # Universal Dependencies: http://universaldependencies.org/u/pos/all.html |  | ||||||
| # The keys of the tag map should be strings in your tag set. The dictionary must |  | ||||||
| # have an entry POS whose value is one of the Universal Dependencies tags. |  | ||||||
| # Optionally, you can also include morphological features or other attributes. |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| TAG_MAP = { | TAG_MAP = { | ||||||
|     "ADV":      {POS: ADV}, |     "ADV": {POS: ADV}, | ||||||
|     "NOUN":     {POS: NOUN}, |     "NOUN": {POS: NOUN}, | ||||||
|     "ADP":      {POS: ADP}, |     "ADP": {POS: ADP}, | ||||||
|     "PRON":     {POS: PRON}, |     "PRON": {POS: PRON}, | ||||||
|     "SCONJ":    {POS: SCONJ}, |     "SCONJ": {POS: SCONJ}, | ||||||
|     "PROPN":    {POS: PROPN}, |     "PROPN": {POS: PROPN}, | ||||||
|     "DET":      {POS: DET}, |     "DET": {POS: DET}, | ||||||
|     "SYM":      {POS: SYM}, |     "SYM": {POS: SYM}, | ||||||
|     "INTJ":     {POS: INTJ}, |     "INTJ": {POS: INTJ}, | ||||||
|     "PUNCT":    {POS: PUNCT}, |     "PUNCT": {POS: PUNCT}, | ||||||
|     "NUM":      {POS: NUM}, |     "NUM": {POS: NUM}, | ||||||
|     "AUX":      {POS: AUX}, |     "AUX": {POS: AUX}, | ||||||
|     "X":        {POS: X}, |     "X": {POS: X}, | ||||||
|     "CONJ":     {POS: CONJ}, |     "CONJ": {POS: CONJ}, | ||||||
|     "CCONJ":    {POS: CCONJ}, |     "CCONJ": {POS: CCONJ}, | ||||||
|     "ADJ":      {POS: ADJ}, |     "ADJ": {POS: ADJ}, | ||||||
|     "VERB":     {POS: VERB}, |     "VERB": {POS: VERB}, | ||||||
|     "PART":     {POS: PART}, |     "PART": {POS: PART}, | ||||||
|     "SP":     	{POS: SPACE} |     "SP": {POS: SPACE}, | ||||||
| } | } | ||||||
|  |  | ||||||
|  | @ -1,18 +1,9 @@ | ||||||
| # coding: utf8 | # coding: utf8 | ||||||
| from __future__ import unicode_literals | from __future__ import unicode_literals | ||||||
| 
 | 
 | ||||||
| # import symbols – if you need to use more, add them here |  | ||||||
| from ...symbols import ORTH, LEMMA, POS, NORM, NOUN | from ...symbols import ORTH, LEMMA, POS, NORM, NOUN | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| # Add tokenizer exceptions |  | ||||||
| # Documentation: https://spacy.io/docs/usage/adding-languages#tokenizer-exceptions |  | ||||||
| # Feel free to use custom logic to generate repetitive exceptions more efficiently. |  | ||||||
| # If an exception is split into more than one token, the ORTH values combined always |  | ||||||
| # need to match the original string. |  | ||||||
| 
 |  | ||||||
| # Exceptions should be added in the following format: |  | ||||||
| 
 |  | ||||||
| _exc = {} | _exc = {} | ||||||
| 
 | 
 | ||||||
| for exc_data in [ | for exc_data in [ | ||||||
|  | @ -28,11 +19,9 @@ for exc_data in [ | ||||||
|     {ORTH: "проф.", LEMMA: "професор", NORM: "професор", POS: NOUN}, |     {ORTH: "проф.", LEMMA: "професор", NORM: "професор", POS: NOUN}, | ||||||
|     {ORTH: "акад.", LEMMA: "академік", NORM: "академік", POS: NOUN}, |     {ORTH: "акад.", LEMMA: "академік", NORM: "академік", POS: NOUN}, | ||||||
|     {ORTH: "доц.", LEMMA: "доцент", NORM: "доцент", POS: NOUN}, |     {ORTH: "доц.", LEMMA: "доцент", NORM: "доцент", POS: NOUN}, | ||||||
|     {ORTH: "оз.", LEMMA: "озеро", NORM: "озеро", POS: NOUN}]: |     {ORTH: "оз.", LEMMA: "озеро", NORM: "озеро", POS: NOUN}, | ||||||
|  | ]: | ||||||
|     _exc[exc_data[ORTH]] = [exc_data] |     _exc[exc_data[ORTH]] = [exc_data] | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| # To keep things clean and readable, it's recommended to only declare the |  | ||||||
| # TOKENIZER_EXCEPTIONS at the bottom: |  | ||||||
| 
 |  | ||||||
| TOKENIZER_EXCEPTIONS = _exc | TOKENIZER_EXCEPTIONS = _exc | ||||||
|  |  | ||||||
|  | @ -1,6 +1,6 @@ | ||||||
| # coding: utf8 | # coding: utf8 | ||||||
| from __future__ import unicode_literals | from __future__ import unicode_literals | ||||||
| 
 | 
 | ||||||
| from .matcher import Matcher | from .matcher import Matcher  # noqa: F401 | ||||||
| from .phrasematcher import PhraseMatcher | from .phrasematcher import PhraseMatcher  # noqa: F401 | ||||||
| from .dependencymatcher import DependencyTreeMatcher | from .dependencymatcher import DependencyTreeMatcher  # noqa: F401 | ||||||
|  |  | ||||||
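The added "# noqa: F401" markers flag deliberate re-exports: the classes are meant to be imported from spacy.matcher directly rather than from the private submodules. A brief hedged usage sketch using the spaCy 2.x Matcher.add signature:

    import spacy
    from spacy.matcher import Matcher

    nlp = spacy.blank("en")
    matcher = Matcher(nlp.vocab)
    matcher.add("HELLO", None, [{"LOWER": "hello"}])  # pattern: one token whose lowercase form is "hello"
    doc = nlp("Hello world")
    print(matcher(doc))  # list of (match_id, start, end) tuples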
|  | @ -119,8 +119,8 @@ def tr_tokenizer(): | ||||||
| 
 | 
 | ||||||
| @pytest.fixture(scope="session") | @pytest.fixture(scope="session") | ||||||
| def uk_tokenizer(): | def uk_tokenizer(): | ||||||
|     pymorphy = pytest.importorskip("pymorphy2") |     pytest.importorskip("pymorphy2") | ||||||
|     return util.get_lang_class("uk").Defaults.create_tokenizer() |     return get_lang_class("uk").Defaults.create_tokenizer() | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @pytest.fixture(scope="session") | @pytest.fixture(scope="session") | ||||||
|  | @ -130,7 +130,7 @@ def ca_tokenizer(): | ||||||
| 
 | 
 | ||||||
| @pytest.fixture(scope="session") | @pytest.fixture(scope="session") | ||||||
| def pl_tokenizer(): | def pl_tokenizer(): | ||||||
|     return util.get_lang_class("pl").Defaults.create_tokenizer() |     return get_lang_class("pl").Defaults.create_tokenizer() | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @pytest.fixture(scope="session") | @pytest.fixture(scope="session") | ||||||
|  |  | ||||||
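The change from assigning pytest.importorskip("pymorphy2") to simply calling it reflects that the return value was unused; the call alone is what skips dependent tests when the optional dependency is missing. Roughly how a dependent test behaves (a sketch with a hypothetical test name):

    def test_uk_tokenizer_basic(uk_tokenizer):
        # Skipped automatically (not failed) on machines without pymorphy2,
        # because the session-scoped fixture calls pytest.importorskip first.
        tokens = uk_tokenizer("Привіт світ")
        assert len(tokens) == 2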
							
								
								
									
0  spacy/tests/lang/pl/__init__.py  (new empty file)
| 
 | 
 | ||||||
| import pytest | import pytest | ||||||
| 
 | 
 | ||||||
| DOT_TESTS =  [ | DOT_TESTS = [ | ||||||
|     ('tel.', ['tel.']), |     ("tel.", ["tel."]), | ||||||
|     ('np.', ['np.']), |     ("np.", ["np."]), | ||||||
|     ('godz. 21:37', ['godz.', '21:37']), |     ("godz. 21:37", ["godz.", "21:37"]), | ||||||
|     ('inż.', ['inż.']), |     ("inż.", ["inż."]), | ||||||
|     ('gosp.-polit.', ['gosp.-polit.']), |     ("gosp.-polit.", ["gosp.-polit."]), | ||||||
|     ('ppoż', ['ppoż']), |     ("ppoż", ["ppoż"]), | ||||||
|     ('płn', ['płn']), |     ("płn", ["płn"]), | ||||||
|     ('ul.', ['ul.']), |     ("ul.", ["ul."]), | ||||||
|     ('jw.', ['jw.']), |     ("jw.", ["jw."]), | ||||||
|     ('itd.', ['itd.']), |     ("itd.", ["itd."]), | ||||||
|     ('cdn.', ['cdn.']), |     ("cdn.", ["cdn."]), | ||||||
|     ('itp.', ['itp.']), |     ("itp.", ["itp."]), | ||||||
|     ('10,- zł', ['10,-', 'zł']), |     ("10,- zł", ["10,-", "zł"]), | ||||||
|     ('0 zł 99 gr', ['0', 'zł', '99', 'gr']), |     ("0 zł 99 gr", ["0", "zł", "99", "gr"]), | ||||||
|     ('0,99 rub.', ['0,99', 'rub.']), |     ("0,99 rub.", ["0,99", "rub."]), | ||||||
|     ('dol.', ['dol.']), |     ("dol.", ["dol."]), | ||||||
|     ('1000 m n.p.m.', ['1000', 'm', 'n.p.m.']), |     ("1000 m n.p.m.", ["1000", "m", "n.p.m."]), | ||||||
|     ('m.in.', ['m.in.']), |     ("m.in.", ["m.in."]), | ||||||
|     ('p.n.e.', ['p.n.e.']), |     ("p.n.e.", ["p.n.e."]), | ||||||
|     ('Sz.P.', ['Sz.P.']), |     ("Sz.P.", ["Sz.P."]), | ||||||
|     ('p.o.', ['p.o.']), |     ("p.o.", ["p.o."]), | ||||||
|     ('k.o.', ['k.o.']), |     ("k.o.", ["k.o."]), | ||||||
|     ('m.st.', ['m.st.']), |     ("m.st.", ["m.st."]), | ||||||
|     ('dra.', ['dra', '.']), |     ("dra.", ["dra", "."]), | ||||||
|     ('pp.', ['pp.']), |     ("pp.", ["pp."]), | ||||||
|     ('oo.', ['oo.']) |     ("oo.", ["oo."]), | ||||||
| ] | ] | ||||||
| 
 | 
 | ||||||
| HYPHEN_TESTS = [ | HYPHEN_TESTS = [ | ||||||
|     ('5-fluoropentylo-3-pirydynyloindol', ['5-fluoropentylo-3-pirydynyloindol']), |     ("5-fluoropentylo-3-pirydynyloindol", ["5-fluoropentylo-3-pirydynyloindol"]), | ||||||
|     ('NESS-040C5', ['NESS-040C5']), |     ("NESS-040C5", ["NESS-040C5"]), | ||||||
|     ('JTE-7-31', ['JTE-7-31']), |     ("JTE-7-31", ["JTE-7-31"]), | ||||||
|     ('BAY-59-3074', ['BAY-59-3074']), |     ("BAY-59-3074", ["BAY-59-3074"]), | ||||||
|     ('BAY-38-7271', ['BAY-38-7271']), |     ("BAY-38-7271", ["BAY-38-7271"]), | ||||||
|     ('STS-135', ['STS-135']), |     ("STS-135", ["STS-135"]), | ||||||
|     ('5F-PB-22', ['5F-PB-22']), |     ("5F-PB-22", ["5F-PB-22"]), | ||||||
|     ('cztero-', ['cztero-']), |     ("cztero-", ["cztero-"]), | ||||||
|     ('jedno-', ['jedno-']), |     ("jedno-", ["jedno-"]), | ||||||
|     ('dwu-', ['dwu-']), |     ("dwu-", ["dwu-"]), | ||||||
|     ('trzy-', ['trzy-']), |     ("trzy-", ["trzy-"]), | ||||||
|     ('b-adoratorzy', ['b-adoratorzy']), |     ("b-adoratorzy", ["b-adoratorzy"]), | ||||||
|     ('2-3-4 drzewa', ['2-3-4', 'drzewa']), |     ("2-3-4 drzewa", ["2-3-4", "drzewa"]), | ||||||
|     ('b-drzewa', ['b-drzewa']) |     ("b-drzewa", ["b-drzewa"]), | ||||||
| ] | ] | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| TESTCASES = DOT_TESTS + HYPHEN_TESTS | TESTCASES = DOT_TESTS + HYPHEN_TESTS | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @pytest.mark.parametrize('text,expected_tokens', TESTCASES) | @pytest.mark.parametrize("text,expected_tokens", TESTCASES) | ||||||
| def test_tokenizer_handles_testcases(pl_tokenizer, text, expected_tokens): | def test_tokenizer_handles_testcases(pl_tokenizer, text, expected_tokens): | ||||||
|     tokens = pl_tokenizer(text) |     tokens = pl_tokenizer(text) | ||||||
|     token_list = [token.text for token in tokens if not token.is_space] |     token_list = [token.text for token in tokens if not token.is_space] | ||||||
|  |  | ||||||
|  | @ -5,34 +5,42 @@ import pytest | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| SV_TOKEN_EXCEPTION_TESTS = [ | SV_TOKEN_EXCEPTION_TESTS = [ | ||||||
|     ('Smörsåsen används bl.a. till fisk', ['Smörsåsen', 'används', 'bl.a.', 'till', 'fisk']), |     ( | ||||||
|     ('Jag kommer först kl. 13 p.g.a. diverse förseningar', ['Jag', 'kommer', 'först', 'kl.', '13', 'p.g.a.', 'diverse', 'förseningar']), |         "Smörsåsen används bl.a. till fisk", | ||||||
|     ('Anders I. tycker om ord med i i.', ["Anders", "I.", "tycker", "om", "ord", "med", "i", "i", "."]) |         ["Smörsåsen", "används", "bl.a.", "till", "fisk"], | ||||||
|  |     ), | ||||||
|  |     ( | ||||||
|  |         "Jag kommer först kl. 13 p.g.a. diverse förseningar", | ||||||
|  |         ["Jag", "kommer", "först", "kl.", "13", "p.g.a.", "diverse", "förseningar"], | ||||||
|  |     ), | ||||||
|  |     ( | ||||||
|  |         "Anders I. tycker om ord med i i.", | ||||||
|  |         ["Anders", "I.", "tycker", "om", "ord", "med", "i", "i", "."], | ||||||
|  |     ), | ||||||
| ] | ] | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @pytest.mark.parametrize('text,expected_tokens', SV_TOKEN_EXCEPTION_TESTS) | @pytest.mark.parametrize("text,expected_tokens", SV_TOKEN_EXCEPTION_TESTS) | ||||||
| def test_sv_tokenizer_handles_exception_cases(sv_tokenizer, text, expected_tokens): | def test_sv_tokenizer_handles_exception_cases(sv_tokenizer, text, expected_tokens): | ||||||
|     tokens = sv_tokenizer(text) |     tokens = sv_tokenizer(text) | ||||||
|     token_list = [token.text for token in tokens if not token.is_space] |     token_list = [token.text for token in tokens if not token.is_space] | ||||||
|     assert expected_tokens == token_list |     assert expected_tokens == token_list | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @pytest.mark.parametrize('text', ["driveru", "hajaru", "Serru", "Fixaru"]) | @pytest.mark.parametrize("text", ["driveru", "hajaru", "Serru", "Fixaru"]) | ||||||
| def test_sv_tokenizer_handles_verb_exceptions(sv_tokenizer, text): | def test_sv_tokenizer_handles_verb_exceptions(sv_tokenizer, text): | ||||||
|     tokens = sv_tokenizer(text) |     tokens = sv_tokenizer(text) | ||||||
|     assert len(tokens) == 2 |     assert len(tokens) == 2 | ||||||
|     assert tokens[1].text == "u" |     assert tokens[1].text == "u" | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @pytest.mark.parametrize('text', | @pytest.mark.parametrize("text", ["bl.a", "m.a.o.", "Jan.", "Dec.", "kr.", "osv."]) | ||||||
|                          ["bl.a", "m.a.o.", "Jan.", "Dec.", "kr.", "osv."]) |  | ||||||
| def test_sv_tokenizer_handles_abbr(sv_tokenizer, text): | def test_sv_tokenizer_handles_abbr(sv_tokenizer, text): | ||||||
|     tokens = sv_tokenizer(text) |     tokens = sv_tokenizer(text) | ||||||
|     assert len(tokens) == 1 |     assert len(tokens) == 1 | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @pytest.mark.parametrize('text', ["Jul.", "jul.", "sön.", "Sön."]) | @pytest.mark.parametrize("text", ["Jul.", "jul.", "sön.", "Sön."]) | ||||||
| def test_sv_tokenizer_handles_ambiguous_abbr(sv_tokenizer, text): | def test_sv_tokenizer_handles_ambiguous_abbr(sv_tokenizer, text): | ||||||
|     tokens = sv_tokenizer(text) |     tokens = sv_tokenizer(text) | ||||||
|     assert len(tokens) == 2 |     assert len(tokens) == 2 | ||||||
|  |  | ||||||
|  | @ -4,12 +4,17 @@ from __future__ import unicode_literals | ||||||
| import pytest | import pytest | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @pytest.mark.parametrize('string,lemma', [('DNA-profilernas', 'DNA-profil'), | @pytest.mark.parametrize( | ||||||
|                                           ('Elfenbenskustens', 'Elfenbenskusten'), |     "string,lemma", | ||||||
|                                           ('abortmotståndarens', 'abortmotståndare'), |     [ | ||||||
|                                           ('kolesterols', 'kolesterol'), |         ("DNA-profilernas", "DNA-profil"), | ||||||
|                                           ('portionssnusernas', 'portionssnus'), |         ("Elfenbenskustens", "Elfenbenskusten"), | ||||||
|                                           ('åsyns', 'åsyn')]) |         ("abortmotståndarens", "abortmotståndare"), | ||||||
|  |         ("kolesterols", "kolesterol"), | ||||||
|  |         ("portionssnusernas", "portionssnus"), | ||||||
|  |         ("åsyns", "åsyn"), | ||||||
|  |     ], | ||||||
|  | ) | ||||||
| def test_lemmatizer_lookup_assigns(sv_tokenizer, string, lemma): | def test_lemmatizer_lookup_assigns(sv_tokenizer, string, lemma): | ||||||
|     tokens = sv_tokenizer(string) |     tokens = sv_tokenizer(string) | ||||||
|     assert tokens[0].lemma_ == lemma |     assert tokens[0].lemma_ == lemma | ||||||
|  |  | ||||||
|  | @ -1,28 +1,28 @@ | ||||||
| # coding: utf-8 | # coding: utf-8 | ||||||
| """Test that tokenizer prefixes, suffixes and infixes are handled correctly.""" |  | ||||||
| from __future__ import unicode_literals | from __future__ import unicode_literals | ||||||
| 
 | 
 | ||||||
| import pytest | import pytest | ||||||
| 
 | 
 | ||||||
| @pytest.mark.parametrize('text', ["(under)"]) | 
 | ||||||
|  | @pytest.mark.parametrize("text", ["(under)"]) | ||||||
| def test_tokenizer_splits_no_special(sv_tokenizer, text): | def test_tokenizer_splits_no_special(sv_tokenizer, text): | ||||||
|     tokens = sv_tokenizer(text) |     tokens = sv_tokenizer(text) | ||||||
|     assert len(tokens) == 3 |     assert len(tokens) == 3 | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @pytest.mark.parametrize('text', ["gitta'r", "Björn's", "Lars'"]) | @pytest.mark.parametrize("text", ["gitta'r", "Björn's", "Lars'"]) | ||||||
| def test_tokenizer_handles_no_punct(sv_tokenizer, text): | def test_tokenizer_handles_no_punct(sv_tokenizer, text): | ||||||
|     tokens = sv_tokenizer(text) |     tokens = sv_tokenizer(text) | ||||||
|     assert len(tokens) == 1 |     assert len(tokens) == 1 | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @pytest.mark.parametrize('text', ["svart.Gul", "Hej.Världen"]) | @pytest.mark.parametrize("text", ["svart.Gul", "Hej.Världen"]) | ||||||
| def test_tokenizer_splits_period_infix(sv_tokenizer, text): | def test_tokenizer_splits_period_infix(sv_tokenizer, text): | ||||||
|     tokens = sv_tokenizer(text) |     tokens = sv_tokenizer(text) | ||||||
|     assert len(tokens) == 3 |     assert len(tokens) == 3 | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @pytest.mark.parametrize('text', ["Hej,Världen", "en,två"]) | @pytest.mark.parametrize("text", ["Hej,Världen", "en,två"]) | ||||||
| def test_tokenizer_splits_comma_infix(sv_tokenizer, text): | def test_tokenizer_splits_comma_infix(sv_tokenizer, text): | ||||||
|     tokens = sv_tokenizer(text) |     tokens = sv_tokenizer(text) | ||||||
|     assert len(tokens) == 3 |     assert len(tokens) == 3 | ||||||
|  | @ -31,7 +31,7 @@ def test_tokenizer_splits_comma_infix(sv_tokenizer, text): | ||||||
|     assert tokens[2].text == text.split(",")[1] |     assert tokens[2].text == text.split(",")[1] | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @pytest.mark.parametrize('text', ["svart...Gul", "svart...gul"]) | @pytest.mark.parametrize("text", ["svart...Gul", "svart...gul"]) | ||||||
| def test_tokenizer_splits_ellipsis_infix(sv_tokenizer, text): | def test_tokenizer_splits_ellipsis_infix(sv_tokenizer, text): | ||||||
|     tokens = sv_tokenizer(text) |     tokens = sv_tokenizer(text) | ||||||
|     assert len(tokens) == 3 |     assert len(tokens) == 3 | ||||||
|  |  | ||||||
|  | @ -1,9 +1,6 @@ | ||||||
| # coding: utf-8 | # coding: utf-8 | ||||||
| """Test that longer and mixed texts are tokenized correctly.""" |  | ||||||
| 
 |  | ||||||
| from __future__ import unicode_literals | from __future__ import unicode_literals | ||||||
| 
 | 
 | ||||||
| import pytest |  | ||||||
| 
 | 
 | ||||||
| def test_sv_tokenizer_handles_long_text(sv_tokenizer): | def test_sv_tokenizer_handles_long_text(sv_tokenizer): | ||||||
|     text = """Det var så härligt ute på landet. Det var sommar, majsen var gul, havren grön, |     text = """Det var så härligt ute på landet. Det var sommar, majsen var gul, havren grön, | ||||||
|  |  | ||||||
|  | @ -1,25 +1,24 @@ | ||||||
| # coding: utf-8 | # coding: utf-8 | ||||||
| """Test that open, closed and paired punctuation is split off correctly.""" |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| from __future__ import unicode_literals | from __future__ import unicode_literals | ||||||
| 
 | 
 | ||||||
| import pytest | import pytest | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| PUNCT_OPEN = ['(', '[', '{', '*'] | PUNCT_OPEN = ["(", "[", "{", "*"] | ||||||
| PUNCT_CLOSE = [')', ']', '}', '*'] | PUNCT_CLOSE = [")", "]", "}", "*"] | ||||||
| PUNCT_PAIRED = [('(', ')'),  ('[', ']'), ('{', '}'), ('*', '*')] | PUNCT_PAIRED = [("(", ")"), ("[", "]"), ("{", "}"), ("*", "*")] | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @pytest.mark.parametrize('text', ["(", "((", "<"]) | @pytest.mark.parametrize("text", ["(", "((", "<"]) | ||||||
| def test_uk_tokenizer_handles_only_punct(uk_tokenizer, text): | def test_uk_tokenizer_handles_only_punct(uk_tokenizer, text): | ||||||
|     tokens = uk_tokenizer(text) |     tokens = uk_tokenizer(text) | ||||||
|     assert len(tokens) == len(text) |     assert len(tokens) == len(text) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @pytest.mark.parametrize('punct', PUNCT_OPEN) | @pytest.mark.parametrize("punct", PUNCT_OPEN) | ||||||
| @pytest.mark.parametrize('text', ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]) | @pytest.mark.parametrize( | ||||||
|  |     "text", ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"] | ||||||
|  | ) | ||||||
| def test_uk_tokenizer_splits_open_punct(uk_tokenizer, punct, text): | def test_uk_tokenizer_splits_open_punct(uk_tokenizer, punct, text): | ||||||
|     tokens = uk_tokenizer(punct + text) |     tokens = uk_tokenizer(punct + text) | ||||||
|     assert len(tokens) == 2 |     assert len(tokens) == 2 | ||||||
|  | @ -27,8 +26,10 @@ def test_uk_tokenizer_splits_open_punct(uk_tokenizer, punct, text): | ||||||
|     assert tokens[1].text == text |     assert tokens[1].text == text | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @pytest.mark.parametrize('punct', PUNCT_CLOSE) | @pytest.mark.parametrize("punct", PUNCT_CLOSE) | ||||||
| @pytest.mark.parametrize('text', ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]) | @pytest.mark.parametrize( | ||||||
|  |     "text", ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"] | ||||||
|  | ) | ||||||
| def test_uk_tokenizer_splits_close_punct(uk_tokenizer, punct, text): | def test_uk_tokenizer_splits_close_punct(uk_tokenizer, punct, text): | ||||||
|     tokens = uk_tokenizer(text + punct) |     tokens = uk_tokenizer(text + punct) | ||||||
|     assert len(tokens) == 2 |     assert len(tokens) == 2 | ||||||
|  | @ -36,9 +37,11 @@ def test_uk_tokenizer_splits_close_punct(uk_tokenizer, punct, text): | ||||||
|     assert tokens[1].text == punct |     assert tokens[1].text == punct | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @pytest.mark.parametrize('punct', PUNCT_OPEN) | @pytest.mark.parametrize("punct", PUNCT_OPEN) | ||||||
| @pytest.mark.parametrize('punct_add', ["`"]) | @pytest.mark.parametrize("punct_add", ["`"]) | ||||||
| @pytest.mark.parametrize('text', ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]) | @pytest.mark.parametrize( | ||||||
|  |     "text", ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"] | ||||||
|  | ) | ||||||
| def test_uk_tokenizer_splits_two_diff_open_punct(uk_tokenizer, punct, punct_add, text): | def test_uk_tokenizer_splits_two_diff_open_punct(uk_tokenizer, punct, punct_add, text): | ||||||
|     tokens = uk_tokenizer(punct + punct_add + text) |     tokens = uk_tokenizer(punct + punct_add + text) | ||||||
|     assert len(tokens) == 3 |     assert len(tokens) == 3 | ||||||
|  | @ -47,9 +50,11 @@ def test_uk_tokenizer_splits_two_diff_open_punct(uk_tokenizer, punct, punct_add, | ||||||
|     assert tokens[2].text == text |     assert tokens[2].text == text | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @pytest.mark.parametrize('punct', PUNCT_CLOSE) | @pytest.mark.parametrize("punct", PUNCT_CLOSE) | ||||||
| @pytest.mark.parametrize('punct_add', ["'"]) | @pytest.mark.parametrize("punct_add", ["'"]) | ||||||
| @pytest.mark.parametrize('text', ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]) | @pytest.mark.parametrize( | ||||||
|  |     "text", ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"] | ||||||
|  | ) | ||||||
| def test_uk_tokenizer_splits_two_diff_close_punct(uk_tokenizer, punct, punct_add, text): | def test_uk_tokenizer_splits_two_diff_close_punct(uk_tokenizer, punct, punct_add, text): | ||||||
|     tokens = uk_tokenizer(text + punct + punct_add) |     tokens = uk_tokenizer(text + punct + punct_add) | ||||||
|     assert len(tokens) == 3 |     assert len(tokens) == 3 | ||||||
|  | @ -58,8 +63,10 @@ def test_uk_tokenizer_splits_two_diff_close_punct(uk_tokenizer, punct, punct_add | ||||||
|     assert tokens[2].text == punct_add |     assert tokens[2].text == punct_add | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @pytest.mark.parametrize('punct', PUNCT_OPEN) | @pytest.mark.parametrize("punct", PUNCT_OPEN) | ||||||
| @pytest.mark.parametrize('text', ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]) | @pytest.mark.parametrize( | ||||||
|  |     "text", ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"] | ||||||
|  | ) | ||||||
| def test_uk_tokenizer_splits_same_open_punct(uk_tokenizer, punct, text): | def test_uk_tokenizer_splits_same_open_punct(uk_tokenizer, punct, text): | ||||||
|     tokens = uk_tokenizer(punct + punct + punct + text) |     tokens = uk_tokenizer(punct + punct + punct + text) | ||||||
|     assert len(tokens) == 4 |     assert len(tokens) == 4 | ||||||
|  | @ -67,8 +74,10 @@ def test_uk_tokenizer_splits_same_open_punct(uk_tokenizer, punct, text): | ||||||
|     assert tokens[3].text == text |     assert tokens[3].text == text | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @pytest.mark.parametrize('punct', PUNCT_CLOSE) | @pytest.mark.parametrize("punct", PUNCT_CLOSE) | ||||||
| @pytest.mark.parametrize('text', ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]) | @pytest.mark.parametrize( | ||||||
|  |     "text", ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"] | ||||||
|  | ) | ||||||
| def test_uk_tokenizer_splits_same_close_punct(uk_tokenizer, punct, text): | def test_uk_tokenizer_splits_same_close_punct(uk_tokenizer, punct, text): | ||||||
|     tokens = uk_tokenizer(text + punct + punct + punct) |     tokens = uk_tokenizer(text + punct + punct + punct) | ||||||
|     assert len(tokens) == 4 |     assert len(tokens) == 4 | ||||||
|  | @ -76,14 +85,14 @@ def test_uk_tokenizer_splits_same_close_punct(uk_tokenizer, punct, text): | ||||||
|     assert tokens[1].text == punct |     assert tokens[1].text == punct | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @pytest.mark.parametrize('text', ["'Тест"]) | @pytest.mark.parametrize("text", ["'Тест"]) | ||||||
| def test_uk_tokenizer_splits_open_appostrophe(uk_tokenizer, text): | def test_uk_tokenizer_splits_open_appostrophe(uk_tokenizer, text): | ||||||
|     tokens = uk_tokenizer(text) |     tokens = uk_tokenizer(text) | ||||||
|     assert len(tokens) == 2 |     assert len(tokens) == 2 | ||||||
|     assert tokens[0].text == "'" |     assert tokens[0].text == "'" | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @pytest.mark.parametrize('text', ["Тест''"]) | @pytest.mark.parametrize("text", ["Тест''"]) | ||||||
| def test_uk_tokenizer_splits_double_end_quote(uk_tokenizer, text): | def test_uk_tokenizer_splits_double_end_quote(uk_tokenizer, text): | ||||||
|     tokens = uk_tokenizer(text) |     tokens = uk_tokenizer(text) | ||||||
|     assert len(tokens) == 2 |     assert len(tokens) == 2 | ||||||
|  | @ -91,10 +100,13 @@ def test_uk_tokenizer_splits_double_end_quote(uk_tokenizer, text): | ||||||
|     assert len(tokens_punct) == 1 |     assert len(tokens_punct) == 1 | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED) | @pytest.mark.parametrize("punct_open,punct_close", PUNCT_PAIRED) | ||||||
| @pytest.mark.parametrize('text', ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]) | @pytest.mark.parametrize( | ||||||
| def test_uk_tokenizer_splits_open_close_punct(uk_tokenizer, punct_open, |     "text", ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"] | ||||||
|                                               punct_close, text): | ) | ||||||
|  | def test_uk_tokenizer_splits_open_close_punct( | ||||||
|  |     uk_tokenizer, punct_open, punct_close, text | ||||||
|  | ): | ||||||
|     tokens = uk_tokenizer(punct_open + text + punct_close) |     tokens = uk_tokenizer(punct_open + text + punct_close) | ||||||
|     assert len(tokens) == 3 |     assert len(tokens) == 3 | ||||||
|     assert tokens[0].text == punct_open |     assert tokens[0].text == punct_open | ||||||
|  | @ -102,11 +114,14 @@ def test_uk_tokenizer_splits_open_close_punct(uk_tokenizer, punct_open, | ||||||
|     assert tokens[2].text == punct_close |     assert tokens[2].text == punct_close | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED) | @pytest.mark.parametrize("punct_open,punct_close", PUNCT_PAIRED) | ||||||
| @pytest.mark.parametrize('punct_open2,punct_close2', [("`", "'")]) | @pytest.mark.parametrize("punct_open2,punct_close2", [("`", "'")]) | ||||||
| @pytest.mark.parametrize('text', ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]) | @pytest.mark.parametrize( | ||||||
| def test_uk_tokenizer_two_diff_punct(uk_tokenizer, punct_open, punct_close, |     "text", ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"] | ||||||
|                                      punct_open2, punct_close2, text): | ) | ||||||
|  | def test_uk_tokenizer_two_diff_punct( | ||||||
|  |     uk_tokenizer, punct_open, punct_close, punct_open2, punct_close2, text | ||||||
|  | ): | ||||||
|     tokens = uk_tokenizer(punct_open2 + punct_open + text + punct_close + punct_close2) |     tokens = uk_tokenizer(punct_open2 + punct_open + text + punct_close + punct_close2) | ||||||
|     assert len(tokens) == 5 |     assert len(tokens) == 5 | ||||||
|     assert tokens[0].text == punct_open2 |     assert tokens[0].text == punct_open2 | ||||||
|  | @ -116,7 +131,9 @@ def test_uk_tokenizer_two_diff_punct(uk_tokenizer, punct_open, punct_close, | ||||||
|     assert tokens[4].text == punct_close2 |     assert tokens[4].text == punct_close2 | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @pytest.mark.parametrize('text', ["Привет.", "Привіт.", "Ґелґотати.", "З'єднання.", "Єдність.", "їхні."]) | @pytest.mark.parametrize( | ||||||
|  |     "text", ["Привет.", "Привіт.", "Ґелґотати.", "З'єднання.", "Єдність.", "їхні."] | ||||||
|  | ) | ||||||
| def test_uk_tokenizer_splits_trailing_dot(uk_tokenizer, text): | def test_uk_tokenizer_splits_trailing_dot(uk_tokenizer, text): | ||||||
|     tokens = uk_tokenizer(text) |     tokens = uk_tokenizer(text) | ||||||
|     assert tokens[1].text == "." |     assert tokens[1].text == "." | ||||||
|  |  | ||||||
|  | @ -1,18 +1,14 @@ | ||||||
| # coding: utf-8 | # coding: utf-8 | ||||||
| """Test that tokenizer exceptions are parsed correctly.""" |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| from __future__ import unicode_literals | from __future__ import unicode_literals | ||||||
| 
 | 
 | ||||||
| import pytest | import pytest | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @pytest.mark.parametrize('text,norms,lemmas', [("ім.", ["імені"], ["ім'я"]), | @pytest.mark.parametrize( | ||||||
|                                         ("проф.", ["професор"], ["професор"])]) |     "text,norms,lemmas", | ||||||
|  |     [("ім.", ["імені"], ["ім'я"]), ("проф.", ["професор"], ["професор"])], | ||||||
|  | ) | ||||||
| def test_uk_tokenizer_abbrev_exceptions(uk_tokenizer, text, norms, lemmas): | def test_uk_tokenizer_abbrev_exceptions(uk_tokenizer, text, norms, lemmas): | ||||||
|     tokens = uk_tokenizer(text) |     tokens = uk_tokenizer(text) | ||||||
|     assert len(tokens) == 1 |     assert len(tokens) == 1 | ||||||
|     assert [token.norm_ for token in tokens] == norms |     assert [token.norm_ for token in tokens] == norms | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
|  |  | ||||||
|  | @ -1,16 +1,16 @@ | ||||||
| # coding: utf-8 | # coding: utf-8 | ||||||
| from __future__ import unicode_literals | from __future__ import unicode_literals | ||||||
|  | 
 | ||||||
| import json | import json | ||||||
| from tempfile import NamedTemporaryFile | from tempfile import NamedTemporaryFile | ||||||
| import pytest |  | ||||||
| 
 | 
 | ||||||
| from ...cli.train import train | from ...cli.train import train | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def test_cli_trained_model_can_be_saved(tmpdir): | def test_cli_trained_model_can_be_saved(tmpdir): | ||||||
|     lang = 'nl' |     lang = "nl" | ||||||
|     output_dir = str(tmpdir) |     output_dir = str(tmpdir) | ||||||
|     train_file = NamedTemporaryFile('wb', dir=output_dir, delete=False) |     train_file = NamedTemporaryFile("wb", dir=output_dir, delete=False) | ||||||
|     train_corpus = [ |     train_corpus = [ | ||||||
|         { |         { | ||||||
|             "id": "identifier_0", |             "id": "identifier_0", | ||||||
|  | @ -26,7 +26,7 @@ def test_cli_trained_model_can_be_saved(tmpdir): | ||||||
|                                     "head": 1, |                                     "head": 1, | ||||||
|                                     "tag": "NOUN", |                                     "tag": "NOUN", | ||||||
|                                     "orth": "Jan", |                                     "orth": "Jan", | ||||||
|                                     "ner": "B-PER" |                                     "ner": "B-PER", | ||||||
|                                 }, |                                 }, | ||||||
|                                 { |                                 { | ||||||
|                                     "id": 1, |                                     "id": 1, | ||||||
|  | @ -34,7 +34,7 @@ def test_cli_trained_model_can_be_saved(tmpdir): | ||||||
|                                     "head": 0, |                                     "head": 0, | ||||||
|                                     "tag": "VERB", |                                     "tag": "VERB", | ||||||
|                                     "orth": "houdt", |                                     "orth": "houdt", | ||||||
|                                     "ner": "O" |                                     "ner": "O", | ||||||
|                                 }, |                                 }, | ||||||
|                                 { |                                 { | ||||||
|                                     "id": 2, |                                     "id": 2, | ||||||
|  | @ -42,7 +42,7 @@ def test_cli_trained_model_can_be_saved(tmpdir): | ||||||
|                                     "head": 1, |                                     "head": 1, | ||||||
|                                     "tag": "ADP", |                                     "tag": "ADP", | ||||||
|                                     "orth": "van", |                                     "orth": "van", | ||||||
|                                     "ner": "O" |                                     "ner": "O", | ||||||
|                                 }, |                                 }, | ||||||
|                                 { |                                 { | ||||||
|                                     "id": 3, |                                     "id": 3, | ||||||
|  | @ -50,7 +50,7 @@ def test_cli_trained_model_can_be_saved(tmpdir): | ||||||
|                                     "head": -2, |                                     "head": -2, | ||||||
|                                     "tag": "NOUN", |                                     "tag": "NOUN", | ||||||
|                                     "orth": "Marie", |                                     "orth": "Marie", | ||||||
|                                     "ner": "B-PER" |                                     "ner": "B-PER", | ||||||
|                                 }, |                                 }, | ||||||
|                                 { |                                 { | ||||||
|                                     "id": 4, |                                     "id": 4, | ||||||
|  | @ -58,7 +58,7 @@ def test_cli_trained_model_can_be_saved(tmpdir): | ||||||
|                                     "head": -3, |                                     "head": -3, | ||||||
|                                     "tag": "PUNCT", |                                     "tag": "PUNCT", | ||||||
|                                     "orth": ".", |                                     "orth": ".", | ||||||
|                                     "ner": "O" |                                     "ner": "O", | ||||||
|                                 }, |                                 }, | ||||||
|                                 { |                                 { | ||||||
|                                     "id": 5, |                                     "id": 5, | ||||||
|  | @ -66,18 +66,18 @@ def test_cli_trained_model_can_be_saved(tmpdir): | ||||||
|                                     "head": -1, |                                     "head": -1, | ||||||
|                                     "tag": "SPACE", |                                     "tag": "SPACE", | ||||||
|                                     "orth": "\n", |                                     "orth": "\n", | ||||||
|                                     "ner": "O" |                                     "ner": "O", | ||||||
|                                 } |                                 }, | ||||||
|                             ], |                             ], | ||||||
|                             "brackets": [] |                             "brackets": [], | ||||||
|                         } |                         } | ||||||
|                     ] |                     ], | ||||||
|                 } |                 } | ||||||
|             ] |             ], | ||||||
|         } |         } | ||||||
|     ] |     ] | ||||||
| 
 | 
 | ||||||
|     train_file.write(json.dumps(train_corpus).encode('utf-8')) |     train_file.write(json.dumps(train_corpus).encode("utf-8")) | ||||||
|     train_file.close() |     train_file.close() | ||||||
|     train_data = train_file.name |     train_data = train_file.name | ||||||
|     dev_data = train_data |     dev_data = train_data | ||||||
|  |  | ||||||
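Note: the token dictionaries tidied above (trailing commas added) follow spaCy's v2 JSON training format, and the test writes them out with `json.dumps(train_corpus).encode("utf-8")`. Below is a minimal, hedged sketch of assembling such a corpus and writing it to a temporary file; the outer "paragraphs"/"sentences" nesting and the "dep" field are assumptions drawn from that format, since they fall outside the visible hunk.

    import json
    import tempfile

    # One training doc: a single sentence with two tokens. Field names mirror
    # the entries visible in the hunk above; "paragraphs", "sentences", and
    # "dep" are assumed from spaCy's v2 JSON training format.
    train_corpus = [
        {
            "id": 0,
            "paragraphs": [
                {
                    "sentences": [
                        {
                            "tokens": [
                                {"id": 0, "dep": "nsubj", "head": 1, "tag": "NOUN",
                                 "orth": "Marie", "ner": "B-PER"},
                                {"id": 1, "dep": "ROOT", "head": 0, "tag": "VERB",
                                 "orth": "wandelt", "ner": "O"},
                            ],
                            "brackets": [],
                        }
                    ]
                }
            ],
        }
    ]

    train_file = tempfile.NamedTemporaryFile(suffix=".json", delete=False)
    train_file.write(json.dumps(train_corpus).encode("utf-8"))
    train_file.close()
    print(train_file.name)  # a path of the kind the test passes on as train/dev data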
|  | @ -155,6 +155,14 @@ def test_issue1758(en_tokenizer): | ||||||
|     assert tokens[1].lemma_ == "have" |     assert tokens[1].lemma_ == "have" | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | def test_issue1773(en_tokenizer): | ||||||
|  |     """Test that spaces don't receive a POS but no TAG. This is the root cause | ||||||
|  |     of the serialization issue reported in #1773.""" | ||||||
|  |     doc = en_tokenizer("\n") | ||||||
|  |     if doc[0].pos_ == "SPACE": | ||||||
|  |         assert doc[0].tag_ != "" | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| def test_issue1799(): | def test_issue1799(): | ||||||
|     """Test sentence boundaries are deserialized correctly, even for |     """Test sentence boundaries are deserialized correctly, even for | ||||||
|     non-projective sentences.""" |     non-projective sentences.""" | ||||||
|  | @ -249,8 +257,8 @@ def test_issue1945(): | ||||||
| 
 | 
 | ||||||
| def test_issue1963(en_tokenizer): | def test_issue1963(en_tokenizer): | ||||||
|     """Test that doc.merge() resizes doc.tensor""" |     """Test that doc.merge() resizes doc.tensor""" | ||||||
|     doc = en_tokenizer('a b c d') |     doc = en_tokenizer("a b c d") | ||||||
|     doc.tensor = numpy.ones((len(doc), 128), dtype='f') |     doc.tensor = numpy.ones((len(doc), 128), dtype="f") | ||||||
|     with doc.retokenize() as retokenizer: |     with doc.retokenize() as retokenizer: | ||||||
|         retokenizer.merge(doc[0:2]) |         retokenizer.merge(doc[0:2]) | ||||||
|     assert len(doc) == 3 |     assert len(doc) == 3 | ||||||
|  |  | ||||||
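As a standalone illustration of what test_issue1963 above checks, here is a rough sketch using a blank English pipeline instead of the test's en_tokenizer fixture. The final shape assertion assumes the resizing behaviour the test was added to cover: after a merge, doc.tensor keeps one row per remaining token.

    import numpy
    from spacy.lang.en import English

    nlp = English()
    doc = nlp("a b c d")
    doc.tensor = numpy.ones((len(doc), 128), dtype="f")
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[0:2])  # merge "a b" into a single token
    assert len(doc) == 3
    assert doc.tensor.shape[0] == len(doc)  # tensor rows track the new length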
|  | @ -1,9 +0,0 @@ | ||||||
| from __future__ import unicode_literals |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def test_issue1773(en_tokenizer): |  | ||||||
|     """Test that spaces don't receive a POS but no TAG. This is the root cause |  | ||||||
|     of the serialization issue reported in #1773.""" |  | ||||||
|     doc = en_tokenizer('\n') |  | ||||||
|     if doc[0].pos_ == 'SPACE': |  | ||||||
|         assert doc[0].tag_ != "" |  | ||||||
|  | @ -6,8 +6,9 @@ from spacy.tokens import Doc | ||||||
| from spacy.displacy import render | from spacy.displacy import render | ||||||
| from spacy.gold import iob_to_biluo | from spacy.gold import iob_to_biluo | ||||||
| from spacy.lang.it import Italian | from spacy.lang.it import Italian | ||||||
|  | import numpy | ||||||
| 
 | 
 | ||||||
| from ..util import add_vecs_to_vocab | from ..util import add_vecs_to_vocab, get_doc | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @pytest.mark.xfail | @pytest.mark.xfail | ||||||
|  | @ -69,6 +70,26 @@ def test_issue2385_biluo(tags): | ||||||
|     assert iob_to_biluo(tags) == list(tags) |     assert iob_to_biluo(tags) == list(tags) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | def test_issue2396(en_vocab): | ||||||
|  |     words = ["She", "created", "a", "test", "for", "spacy"] | ||||||
|  |     heads = [1, 0, 1, -2, -1, -1] | ||||||
|  |     matrix = numpy.array( | ||||||
|  |         [ | ||||||
|  |             [0, 1, 1, 1, 1, 1], | ||||||
|  |             [1, 1, 1, 1, 1, 1], | ||||||
|  |             [1, 1, 2, 3, 3, 3], | ||||||
|  |             [1, 1, 3, 3, 3, 3], | ||||||
|  |             [1, 1, 3, 3, 4, 4], | ||||||
|  |             [1, 1, 3, 3, 4, 5], | ||||||
|  |         ], | ||||||
|  |         dtype=numpy.int32, | ||||||
|  |     ) | ||||||
|  |     doc = get_doc(en_vocab, words=words, heads=heads) | ||||||
|  |     span = doc[:] | ||||||
|  |     assert (doc.get_lca_matrix() == matrix).all() | ||||||
|  |     assert (span.get_lca_matrix() == matrix).all() | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| def test_issue2482(): | def test_issue2482(): | ||||||
|     """Test we can serialize and deserialize a blank NER or parser model.""" |     """Test we can serialize and deserialize a blank NER or parser model.""" | ||||||
|     nlp = Italian() |     nlp = Italian() | ||||||
|  |  | ||||||
|  | @ -1,35 +0,0 @@ | ||||||
| # coding: utf-8 |  | ||||||
| from __future__ import unicode_literals |  | ||||||
| 
 |  | ||||||
| from ..util import get_doc |  | ||||||
| 
 |  | ||||||
| import pytest |  | ||||||
| import numpy |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| @pytest.mark.parametrize( |  | ||||||
|     "sentence,heads,matrix", |  | ||||||
|     [ |  | ||||||
|         ( |  | ||||||
|             "She created a test for spacy", |  | ||||||
|             [1, 0, 1, -2, -1, -1], |  | ||||||
|             numpy.array( |  | ||||||
|                 [ |  | ||||||
|                     [0, 1, 1, 1, 1, 1], |  | ||||||
|                     [1, 1, 1, 1, 1, 1], |  | ||||||
|                     [1, 1, 2, 3, 3, 3], |  | ||||||
|                     [1, 1, 3, 3, 3, 3], |  | ||||||
|                     [1, 1, 3, 3, 4, 4], |  | ||||||
|                     [1, 1, 3, 3, 4, 5], |  | ||||||
|                 ], |  | ||||||
|                 dtype=numpy.int32, |  | ||||||
|             ), |  | ||||||
|         ) |  | ||||||
|     ], |  | ||||||
| ) |  | ||||||
| def test_issue2396(en_tokenizer, sentence, heads, matrix): |  | ||||||
|     tokens = en_tokenizer(sentence) |  | ||||||
|     doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads) |  | ||||||
|     span = doc[:] |  | ||||||
|     assert (doc.get_lca_matrix() == matrix).all() |  | ||||||
|     assert (span.get_lca_matrix() == matrix).all() |  | ||||||
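The matrix used in both versions of test_issue2396 (the new pytest-free test and the deleted parametrized file) encodes lowest common ancestors: entry [i][j] is the index of the lowest token in the parse that dominates both token i and token j. A small sanity-check sketch on that matrix, in plain numpy so it runs without a parse:

    import numpy

    lca = numpy.array(
        [
            [0, 1, 1, 1, 1, 1],
            [1, 1, 1, 1, 1, 1],
            [1, 1, 2, 3, 3, 3],
            [1, 1, 3, 3, 3, 3],
            [1, 1, 3, 3, 4, 4],
            [1, 1, 3, 3, 4, 5],
        ],
        dtype=numpy.int32,
    )
    n = lca.shape[0]
    # The LCA of a token with itself is the token itself ...
    assert all(lca[i, i] == i for i in range(n))
    # ... and the relation is symmetric: lca(i, j) == lca(j, i).
    assert (lca == lca.T).all()
    # For "She created a test for spacy", entry [0, 3] == 1: "created"
    # (token 1) is the lowest token dominating both "She" and "test".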
|  | @ -1,14 +1,10 @@ | ||||||
| # coding: utf8 | # coding: utf8 | ||||||
| from __future__ import unicode_literals | from __future__ import unicode_literals | ||||||
| 
 | 
 | ||||||
| import pytest |  | ||||||
| from spacy.lang.en import English |  | ||||||
| 
 | 
 | ||||||
| def test_issue2754(): | def test_issue2754(en_tokenizer): | ||||||
|     """Test that words like 'a' and 'a.m.' don't get exceptional norm values.""" |     """Test that words like 'a' and 'a.m.' don't get exceptional norm values.""" | ||||||
|     nlp = English() |     a = en_tokenizer("a") | ||||||
|     a = nlp('a') |     assert a[0].norm_ == "a" | ||||||
|     assert a[0].norm_ == 'a' |     am = en_tokenizer("am") | ||||||
|     am = nlp('am') |     assert am[0].norm_ == "am" | ||||||
|     assert am[0].norm_ == 'am' |  | ||||||
| 
 |  | ||||||
|  |  | ||||||
|  | @ -9,4 +9,3 @@ def test_issue2835(en_tokenizer): | ||||||
|     """ |     """ | ||||||
|     doc = en_tokenizer(text) |     doc = en_tokenizer(text) | ||||||
|     assert doc |     assert doc | ||||||
| 
 |  | ||||||
|  |  | ||||||
|  | @ -2,26 +2,24 @@ | ||||||
| from __future__ import unicode_literals | from __future__ import unicode_literals | ||||||
| 
 | 
 | ||||||
| import numpy | import numpy | ||||||
| from spacy.vectors import Vectors |  | ||||||
| from spacy.vocab import Vocab | from spacy.vocab import Vocab | ||||||
| from spacy.tokens import Doc |  | ||||||
| from spacy._ml import link_vectors_to_models | from spacy._ml import link_vectors_to_models | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def test_issue2871(): | def test_issue2871(): | ||||||
|     """Test that vectors recover the correct key for spaCy reserved words.""" |     """Test that vectors recover the correct key for spaCy reserved words.""" | ||||||
|     words = ['dog', 'cat', 'SUFFIX'] |     words = ["dog", "cat", "SUFFIX"] | ||||||
|     vocab = Vocab() |     vocab = Vocab() | ||||||
|     vocab.vectors.resize(shape=(3, 10)) |     vocab.vectors.resize(shape=(3, 10)) | ||||||
|     vector_data = numpy.zeros((3, 10), dtype='f') |     vector_data = numpy.zeros((3, 10), dtype="f") | ||||||
|     for word in words: |     for word in words: | ||||||
|         _ = vocab[word] |         _ = vocab[word]  # noqa: F841 | ||||||
|         vocab.set_vector(word, vector_data[0]) |         vocab.set_vector(word, vector_data[0]) | ||||||
|     vocab.vectors.name = 'dummy_vectors' |     vocab.vectors.name = "dummy_vectors" | ||||||
|     link_vectors_to_models(vocab) |     link_vectors_to_models(vocab) | ||||||
|     assert vocab['dog'].rank == 0 |     assert vocab["dog"].rank == 0 | ||||||
|     assert vocab['cat'].rank == 1 |     assert vocab["cat"].rank == 1 | ||||||
|     assert vocab['SUFFIX'].rank == 2 |     assert vocab["SUFFIX"].rank == 2 | ||||||
|     assert vocab.vectors.find(key='dog') == 0 |     assert vocab.vectors.find(key="dog") == 0 | ||||||
|     assert vocab.vectors.find(key='cat') == 1 |     assert vocab.vectors.find(key="cat") == 1 | ||||||
|     assert vocab.vectors.find(key='SUFFIX') == 2 |     assert vocab.vectors.find(key="SUFFIX") == 2 | ||||||
|  |  | ||||||
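A compact sketch of the key/row bookkeeping that test_issue2871 exercises: set_vector assigns rows in insertion order and find(key=...) recovers the row for a given string. The get_vector comparison at the end is an assumption that the public lookup mirrors those rows; this is an illustrative sketch, not part of the test.

    import numpy
    from spacy.vocab import Vocab

    vocab = Vocab()
    vocab.vectors.resize(shape=(3, 10))
    for i, word in enumerate(["dog", "cat", "SUFFIX"]):
        vocab.set_vector(word, numpy.ones((10,), dtype="f") * i)
    assert vocab.vectors.find(key="dog") == 0  # rows follow insertion order
    assert vocab.vectors.find(key="SUFFIX") == 2
    assert numpy.array_equal(vocab.get_vector("cat"), numpy.ones((10,), dtype="f"))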
|  | @ -58,9 +58,10 @@ def test_issue3009(doc, matcher, pattern): | ||||||
|     matches = matcher(doc) |     matches = matcher(doc) | ||||||
|     assert matches |     assert matches | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
| def test_issue2464(matcher): | def test_issue2464(matcher): | ||||||
|     """Test problem with successive ?. This is the same bug, so putting it here.""" |     """Test problem with successive ?. This is the same bug, so putting it here.""" | ||||||
|     doc = Doc(matcher.vocab, words=['a', 'b']) |     doc = Doc(matcher.vocab, words=["a", "b"]) | ||||||
|     matcher.add('4', None, [{'OP': '?'}, {'OP': '?'}]) |     matcher.add("4", None, [{"OP": "?"}, {"OP": "?"}]) | ||||||
|     matches = matcher(doc) |     matches = matcher(doc) | ||||||
|     assert len(matches) == 3 |     assert len(matches) == 3 | ||||||
|  |  | ||||||
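To make the expected count of three in test_issue2464 concrete: a pattern of two optional tokens can cover "a", "b", or "a b", so three candidate spans should come back once the successive-`?` bug is fixed. A rough standalone sketch with a blank English pipeline, rather than the test's matcher fixture:

    from spacy.lang.en import English
    from spacy.matcher import Matcher

    nlp = English()
    doc = nlp("a b")
    matcher = Matcher(nlp.vocab)
    matcher.add("OPTIONAL", None, [{"OP": "?"}, {"OP": "?"}])
    matches = matcher(doc)
    assert len(matches) == 3
    for match_id, start, end in matches:
        print(doc[start:end].text)  # "a", "b", "a b" (order may vary)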
|  | @ -1,8 +1,6 @@ | ||||||
| # coding: utf8 | # coding: utf8 | ||||||
| from __future__ import unicode_literals | from __future__ import unicode_literals | ||||||
| 
 | 
 | ||||||
| import pytest |  | ||||||
| 
 |  | ||||||
| from ...attrs import ENT_IOB, ENT_TYPE | from ...attrs import ENT_IOB, ENT_TYPE | ||||||
| from ...tokens import Doc | from ...tokens import Doc | ||||||
| from ..util import get_doc | from ..util import get_doc | ||||||
|  | @ -30,4 +28,4 @@ def test_issue3012(en_vocab): | ||||||
|     # serializing then deserializing |     # serializing then deserializing | ||||||
|     doc_bytes = doc.to_bytes() |     doc_bytes = doc.to_bytes() | ||||||
|     doc2 = Doc(en_vocab).from_bytes(doc_bytes) |     doc2 = Doc(en_vocab).from_bytes(doc_bytes) | ||||||
|     assert (doc[2].text, doc[2].pos_, doc[2].tag_, doc[2].ent_type_) == expected |     assert (doc2[2].text, doc2[2].pos_, doc2[2].tag_, doc2[2].ent_type_) == expected | ||||||
|  |  | ||||||
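The one-character fix above (asserting on doc2 rather than doc) is what makes test_issue3012 actually exercise the round trip. A minimal sketch of the same pattern outside the test helpers, assuming only a blank English pipeline and checking the reconstructed doc rather than the original:

    from spacy.lang.en import English
    from spacy.tokens import Doc

    nlp = English()
    doc = nlp("This is a text with a number 10")
    doc_bytes = doc.to_bytes()
    doc2 = Doc(nlp.vocab).from_bytes(doc_bytes)
    # The deserialized doc, not the original, should carry the restored attributes.
    assert [t.text for t in doc2] == [t.text for t in doc]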
|  | @ -1,10 +0,0 @@ | ||||||
| from __future__ import unicode_literals |  | ||||||
| import pytest |  | ||||||
| import spacy |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| @pytest.mark.models('fr') |  | ||||||
| def test_issue1959(FR): |  | ||||||
|     texts = ['Je suis la mauvaise herbe', "Me, myself and moi"] |  | ||||||
|     for text in texts: |  | ||||||
|         FR(text) |  | ||||||