diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index a2ba1d243..2bf4c17c1 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -8,15 +8,14 @@ import time from collections import Counter from pathlib import Path from thinc.v2v import Affine, Maxout -from thinc.api import wrap, layerize from thinc.misc import LayerNorm as LN -from thinc.neural.util import prefer_gpu, get_array_module +from thinc.neural.util import prefer_gpu from wasabi import Printer import srsly from ..tokens import Doc from ..attrs import ID, HEAD -from .._ml import Tok2Vec, flatten, chain, zero_init, create_default_optimizer +from .._ml import Tok2Vec, flatten, chain, create_default_optimizer from .._ml import masked_language_model from .. import util @@ -136,7 +135,7 @@ def pretrain( random.shuffle(texts) -def make_update(model, docs, optimizer, drop=0.0, objective='L2'): +def make_update(model, docs, optimizer, drop=0.0, objective="L2"): """Perform an update over a single batch of documents. docs (iterable): A batch of `Doc` objects. @@ -171,7 +170,7 @@ def make_docs(nlp, batch, min_length=1, max_length=500): return docs -def get_vectors_loss(ops, docs, prediction, objective='L2'): +def get_vectors_loss(ops, docs, prediction, objective="L2"): """Compute a mean-squared error loss between the documents' vectors and the prediction. @@ -185,9 +184,9 @@ def get_vectors_loss(ops, docs, prediction, objective='L2'): # and look them up all at once. This prevents data copying. ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs]) target = docs[0].vocab.vectors.data[ids] - if objective == 'L2': + if objective == "L2": d_scores = prediction - target - loss = (d_scores**2).sum() + loss = (d_scores ** 2).sum() else: raise NotImplementedError(objective) return loss, d_scores @@ -201,8 +200,7 @@ def create_pretraining_model(nlp, tok2vec): """ output_size = nlp.vocab.vectors.data.shape[1] output_layer = chain( - LN(Maxout(300, pieces=3)), - Affine(output_size, drop_factor=0.0), + LN(Maxout(300, pieces=3)), Affine(output_size, drop_factor=0.0) ) # This is annoying, but the parser etc have the flatten step after # the tok2vec. To load the weights in cleanly, we need to match diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py index 1289e5844..3a3cba708 100644 --- a/spacy/displacy/__init__.py +++ b/spacy/displacy/__init__.py @@ -13,13 +13,7 @@ RENDER_WRAPPER = None def render( - docs, - style="dep", - page=False, - minify=False, - jupyter=False, - options={}, - manual=False, + docs, style="dep", page=False, minify=False, jupyter=False, options={}, manual=False ): """Render displaCy visualisation. @@ -80,7 +74,7 @@ def serve( """ from wsgiref import simple_server - if IS_JUPYTER: + if is_in_jupyter(): user_warning(Warnings.W011) render(docs, style=style, page=page, minify=minify, options=options, manual=manual) diff --git a/spacy/lang/hu/punctuation.py b/spacy/lang/hu/punctuation.py index 0b03ce893..4198dcd88 100644 --- a/spacy/lang/hu/punctuation.py +++ b/spacy/lang/hu/punctuation.py @@ -1,8 +1,9 @@ # coding: utf8 from __future__ import unicode_literals -from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS -from ..char_classes import CONCAT_QUOTES, CONCAT_ICONS, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER +from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CONCAT_QUOTES +from ..char_classes import CONCAT_ICONS, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER + # removing ° from the special icons to keep e.g. 
99° as one token _concat_icons = CONCAT_ICONS.replace("\u00B0", "") @@ -29,7 +30,9 @@ _suffixes = ( r"(?<=°[FfCcKk])\.", r"(?<=[0-9])(?:[{c}])".format(c=_currency), r"(?<=[0-9])(?:{u})".format(u=UNITS), - r"(?<=[{al}{e}{q}(?:{c})])\.".format(al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, c=_currency), + r"(?<=[{al}{e}{q}(?:{c})])\.".format( + al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, c=_currency + ), r"(?<=[{al})])-e".format(al=ALPHA_LOWER), ] ) @@ -40,7 +43,7 @@ _infixes = ( + [ r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER), r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA), - r'(?<=[{a}])[:<>=](?=[{a}])'.format(a=ALPHA), + r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA), r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA), r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), r"(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])".format(a=ALPHA, q=_quotes), diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py index cf896ae12..e18d59a4c 100644 --- a/spacy/lang/ja/__init__.py +++ b/spacy/lang/ja/__init__.py @@ -5,24 +5,24 @@ import re from collections import namedtuple from .tag_map import TAG_MAP - from ...attrs import LANG from ...language import Language from ...tokens import Doc, Token from ...util import DummyTokenizer + ShortUnitWord = namedtuple("ShortUnitWord", ["surface", "lemma", "pos"]) +# TODO: Is this the right place for this? +Token.set_extension("mecab_tag", default=None) + def try_mecab_import(): """Mecab is required for Japanese support, so check for it. - It it's not available blow up and explain how to fix it.""" try: import MeCab - # XXX Is this the right place for this? - Token.set_extension("mecab_tag", default=None) return MeCab except ImportError: raise ImportError( @@ -33,14 +33,13 @@ def try_mecab_import(): def resolve_pos(token): """If necessary, add a field to the POS tag for UD mapping. - Under Universal Dependencies, sometimes the same Unidic POS tag can be mapped differently depending on the literal token or its context in the sentence. This function adds information to the POS tag to resolve ambiguous mappings. """ - # NOTE: This is a first take. The rules here are crude approximations. + # TODO: This is a first take. The rules here are crude approximations. # For many of these, full dependencies are needed to properly resolve # PoS mappings. 
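(Annotation, not part of the patch: the next hunk switches detailed_tokens() from tokenizer.parse() to MeCab's parseToNode API and walks the resulting node list. Below is a minimal sketch of that node-walking pattern for context; the Tagger construction and the Unidic feature-field index are assumptions for illustration, not taken from this diff.)

# Illustrative sketch only -- assumes mecab-python3 and a Unidic-style dictionary.
import MeCab

def walk_mecab_nodes(text):
    tagger = MeCab.Tagger()
    node = tagger.parseToNode(text)  # returns the BOS (beginning-of-sentence) node
    node = node.next                 # first node is empty BOS, skip it, as the patch does
    words = []
    while node.posid != 0:           # posid 0 marks the BOS/EOS sentinel nodes
        features = node.feature.split(",")
        # features[0] is the coarse POS field in Unidic's CSV layout (assumed here)
        words.append((node.surface, features[0]))
        node = node.next
    return words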
@@ -56,7 +55,7 @@ def resolve_pos(token): def detailed_tokens(tokenizer, text): """Format Mecab output into a nice data structure, based on Janome.""" - tokenizer.parse(text) + node = tokenizer.parseToNode(text) node = node.next # first node is beginning of sentence and empty, skip it words = [] @@ -98,62 +97,15 @@ class JapaneseTokenizer(DummyTokenizer): return doc -class JapaneseCharacterSegmenter(object): - def __init__(self, vocab): - self.vocab = vocab - self._presegmenter = self._make_presegmenter(self.vocab) - - def _make_presegmenter(self, vocab): - rules = Japanese.Defaults.tokenizer_exceptions - token_match = Japanese.Defaults.token_match - prefix_search = ( - util.compile_prefix_regex(Japanese.Defaults.prefixes).search - if Japanese.Defaults.prefixes - else None - ) - suffix_search = ( - util.compile_suffix_regex(Japanese.Defaults.suffixes).search - if Japanese.Defaults.suffixes - else None - ) - infix_finditer = ( - util.compile_infix_regex(Japanese.Defaults.infixes).finditer - if Japanese.Defaults.infixes - else None - ) - return Tokenizer( - vocab, - rules=rules, - prefix_search=prefix_search, - suffix_search=suffix_search, - infix_finditer=infix_finditer, - token_match=token_match, - ) - - def __call__(self, text): - words = [] - spaces = [] - doc = self._presegmenter(text) - for token in doc: - words.extend(list(token.text)) - spaces.extend([False] * len(token.text)) - spaces[-1] = bool(token.whitespace_) - return Doc(self.vocab, words=words, spaces=spaces) - - class JapaneseDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda _text: "ja" tag_map = TAG_MAP - use_janome = True @classmethod def create_tokenizer(cls, nlp=None): - if cls.use_janome: - return JapaneseTokenizer(cls, nlp) - else: - return JapaneseCharacterSegmenter(nlp.vocab) + return JapaneseTokenizer(cls, nlp) class Japanese(Language): diff --git a/spacy/lang/pl/__init__.py b/spacy/lang/pl/__init__.py index 9922db89e..702a19063 100644 --- a/spacy/lang/pl/__init__.py +++ b/spacy/lang/pl/__init__.py @@ -2,10 +2,10 @@ from __future__ import unicode_literals from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS +from .punctuation import TOKENIZER_INFIXES from .tag_map import TAG_MAP from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS -from .punctuation import TOKENIZER_INFIXES from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..norm_exceptions import BASE_NORMS @@ -22,9 +22,9 @@ class PolishDefaults(Language.Defaults): Language.Defaults.lex_attr_getters[NORM], BASE_NORMS ) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) - infixes = tuple(TOKENIZER_INFIXES) stop_words = STOP_WORDS tag_map = TAG_MAP + infixes = TOKENIZER_INFIXES class Polish(Language): diff --git a/spacy/lang/pl/punctuation.py b/spacy/lang/pl/punctuation.py index 8fdcaca41..4e69a3912 100644 --- a/spacy/lang/pl/punctuation.py +++ b/spacy/lang/pl/punctuation.py @@ -1,14 +1,22 @@ # coding: utf8 from __future__ import unicode_literals -from ..char_classes import LIST_ELLIPSES, LIST_ICONS -from ..char_classes import QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER -_quotes = QUOTES.replace("'", '') -_infixes = (LIST_ELLIPSES + LIST_ICONS + - [r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER), - r'(?<=[{a}])[,!?](?=[{a}])'.format(a=ALPHA), - r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA), - r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA), - r'(?<=[{a}])([{q}\)\]\(\[])(?=[\{a}])'.format(a=ALPHA, q=_quotes), - r'(?<=[{a}])--(?=[{a}])'.format(a=ALPHA)]) + 
+from ..char_classes import LIST_ELLIPSES, CONCAT_ICONS +from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER + +_quotes = CONCAT_QUOTES.replace("'", "") + +_infixes = ( + LIST_ELLIPSES + + [CONCAT_ICONS] + + [ + r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER), + r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA), + r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA), + r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA), + r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), + r"(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])".format(a=ALPHA, q=CONCAT_QUOTES), + ] +) TOKENIZER_INFIXES = _infixes diff --git a/spacy/lang/pl/tokenizer_exceptions.py b/spacy/lang/pl/tokenizer_exceptions.py index c16315804..9e4814b0f 100644 --- a/spacy/lang/pl/tokenizer_exceptions.py +++ b/spacy/lang/pl/tokenizer_exceptions.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals from ._tokenizer_exceptions_list import PL_BASE_EXCEPTIONS +from ...symbols import POS, ADV, NOUN, ORTH, LEMMA, ADJ _exc = {} diff --git a/spacy/lang/sv/__init__.py b/spacy/lang/sv/__init__.py index 67b78e558..675bd794d 100644 --- a/spacy/lang/sv/__init__.py +++ b/spacy/lang/sv/__init__.py @@ -6,7 +6,9 @@ from .tag_map import TAG_MAP from .stop_words import STOP_WORDS from .morph_rules import MORPH_RULES from .lemmatizer import LEMMA_RULES, LOOKUP -from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES + +# Punctuation stolen from Danish +from ..da.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..norm_exceptions import BASE_NORMS @@ -31,6 +33,7 @@ class SwedishDefaults(Language.Defaults): lemma_lookup = LOOKUP morph_rules = MORPH_RULES + class Swedish(Language): lang = "sv" Defaults = SwedishDefaults diff --git a/spacy/lang/sv/punctuation.py b/spacy/lang/sv/punctuation.py deleted file mode 100644 index 3cac2f9ac..000000000 --- a/spacy/lang/sv/punctuation.py +++ /dev/null @@ -1,25 +0,0 @@ -# coding: utf8 -"""Punctuation stolen from Danish""" -from __future__ import unicode_literals - -from ..char_classes import LIST_ELLIPSES, LIST_ICONS -from ..char_classes import QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER -from ..punctuation import TOKENIZER_SUFFIXES - - -_quotes = QUOTES.replace("'", '') - -_infixes = (LIST_ELLIPSES + LIST_ICONS + - [r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER), - r'(?<=[{a}])[,!?](?=[{a}])'.format(a=ALPHA), - r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA), - r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA), - r'(?<=[{a}])([{q}\)\]\(\[])(?=[\{a}])'.format(a=ALPHA, q=_quotes), - r'(?<=[{a}])--(?=[{a}])'.format(a=ALPHA)]) - -_suffixes = [suffix for suffix in TOKENIZER_SUFFIXES if suffix not in ["'s", "'S", "’s", "’S", r"\'"]] -_suffixes += [r"(?<=[^sSxXzZ])\'"] - - -TOKENIZER_INFIXES = _infixes -TOKENIZER_SUFFIXES = _suffixes diff --git a/spacy/lang/sv/tag_map.py b/spacy/lang/sv/tag_map.py index 9fe4d6872..7d4e29030 100644 --- a/spacy/lang/sv/tag_map.py +++ b/spacy/lang/sv/tag_map.py @@ -1,169 +1,191 @@ # coding: utf8 - -""" -Tag mappings according to https://universaldependencies.org/tagset-conversion/sv-suc-uposf.html -for https://github.com/UniversalDependencies/UD_Swedish-Talbanken -""" - from __future__ import unicode_literals -from ...symbols import POS, PUNCT, ADJ, CONJ, CCONJ, SCONJ, SYM, NUM, DET, ADV, ADP, X, VERB -from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON, AUX +from ...symbols import POS, PUNCT, ADJ, CCONJ, SCONJ, NUM, DET, ADV +from ...symbols import ADP, X, VERB, NOUN, PROPN, PART, INTJ, PRON + + +# Tag mappings according to 
https://universaldependencies.org/tagset-conversion/sv-suc-uposf.html +# for https://github.com/UniversalDependencies/UD_Swedish-Talbanken TAG_MAP = { - 'AB': { POS: ADV }, # inte, också, så, bara, nu - 'AB|AN': { POS: ADV }, # t.ex., ca, t_ex, bl.a., s_k - 'AB|KOM': { POS: ADV }, # mer, tidigare, mindre, vidare, mera - 'AB|POS': { POS: ADV }, # mycket, helt, ofta, länge, långt - 'AB|SMS': { POS: ADV }, # över-, in- - 'AB|SUV': { POS: ADV }, # minst, mest, högst, främst, helst - 'DT|MAS|SIN|DEF': { POS: DET }, - 'DT|MAS|SIN|IND': { POS: DET }, - 'DT|NEU|SIN|DEF': { POS: DET }, # det, detta - 'DT|NEU|SIN|IND': { POS: DET }, # ett, något, inget, vart, vartannat - 'DT|NEU|SIN|IND/DEF': { POS: DET }, # allt - 'DT|UTR/NEU|PLU|DEF': { POS: DET }, # de, dessa, bägge, dom - 'DT|UTR/NEU|PLU|IND': { POS: DET }, # några, inga - 'DT|UTR/NEU|PLU|IND/DEF': { POS: DET }, # alla - 'DT|UTR/NEU|SIN/PLU|IND': { POS: DET }, # samma - 'DT|UTR/NEU|SIN|DEF': { POS: DET }, # vardera - 'DT|UTR/NEU|SIN|IND': { POS: DET }, # varje, varenda - 'DT|UTR|SIN|DEF': { POS: DET }, # den, denna - 'DT|UTR|SIN|IND': { POS: DET }, # en, någon, ingen, var, varannan - 'DT|UTR|SIN|IND/DEF': { POS: DET }, # all - 'HA': { POS: ADV }, # när, där, hur, som, då - 'HD|NEU|SIN|IND': { POS: DET }, # vilket - 'HD|UTR/NEU|PLU|IND': { POS: DET }, # vilka - 'HD|UTR|SIN|IND': { POS: DET }, # vilken - 'HP|-|-|-': { POS: PRON }, # som - 'HP|NEU|SIN|IND': { POS: PRON }, # vad, vilket - 'HP|NEU|SIN|IND|SMS': { POS: PRON }, - 'HP|UTR/NEU|PLU|IND': { POS: PRON }, # vilka - 'HP|UTR|SIN|IND': { POS: PRON }, # vilken, vem - 'HS|DEF': { POS: DET }, # vars, vilkas, Vems - 'IE': { POS: PART }, # att - 'IN': { POS: INTJ }, # Jo, ja, nej, fan, visst - 'JJ|AN': { POS: ADJ }, # ev, S:t, Kungl, Kungl., Teol - 'JJ|KOM|UTR/NEU|SIN/PLU|IND/DEF|GEN': { POS: ADJ }, # äldres - 'JJ|KOM|UTR/NEU|SIN/PLU|IND/DEF|NOM': { POS: ADJ }, # större, högre, mindre, bättre, äldre - 'JJ|KOM|UTR/NEU|SIN/PLU|IND/DEF|SMS': { POS: ADJ }, - 'JJ|POS|MAS|SIN|DEF|GEN': { POS: ADJ }, # enskildes, sjukes, andres - 'JJ|POS|MAS|SIN|DEF|NOM': { POS: ADJ }, # enskilde, sjuke, andre, unge, ene - 'JJ|POS|NEU|SIN|IND/DEF|NOM': { POS: ADJ }, # eget - 'JJ|POS|NEU|SIN|IND|GEN': { POS: ADJ }, - 'JJ|POS|NEU|SIN|IND|NOM': { POS: ADJ }, # annat, svårt, möjligt, nytt, sådant - 'JJ|POS|UTR/NEU|PLU|IND/DEF|GEN': { POS: ADJ }, # ogiftas, ungas, frånskildas, efterkommandes, färgblindas - 'JJ|POS|UTR/NEU|PLU|IND/DEF|NOM': { POS: ADJ }, # olika, andra, många, stora, vissa - 'JJ|POS|UTR/NEU|PLU|IND|NOM': { POS: ADJ }, # flera, sådana, fler, få, samtliga - 'JJ|POS|UTR/NEU|SIN/PLU|IND|NOM': { POS: ADJ }, - 'JJ|POS|UTR/NEU|SIN/PLU|IND/DEF|NOM': { POS: ADJ }, # bra, ena, enda, nästa, ringa - 'JJ|POS|UTR/NEU|SIN|DEF|GEN': { POS: ADJ }, - 'JJ|POS|UTR/NEU|SIN|DEF|NOM': { POS: ADJ }, # hela, nya, andra, svenska, ekonomiska - 'JJ|POS|UTR|-|-|SMS': { POS: ADJ }, # fri-, låg-, sexual- - 'JJ|POS|UTR|SIN|IND/DEF|NOM': { POS: ADJ }, # egen - 'JJ|POS|UTR|SIN|IND|GEN': { POS: ADJ }, # enskilds - 'JJ|POS|UTR|SIN|IND|NOM': { POS: ADJ }, # stor, annan, själv, sådan, viss - 'JJ|SUV|MAS|SIN|DEF|GEN': { POS: ADJ }, - 'JJ|SUV|MAS|SIN|DEF|NOM': { POS: ADJ }, # störste, främste, äldste, minste - 'JJ|SUV|UTR/NEU|PLU|DEF|NOM': { POS: ADJ }, # flesta - 'JJ|SUV|UTR/NEU|PLU|IND|NOM': { POS: ADJ }, - 'JJ|SUV|UTR/NEU|SIN/PLU|DEF|NOM': { POS: ADJ }, # bästa, största, närmaste, viktigaste, högsta - 'JJ|SUV|UTR/NEU|SIN/PLU|IND|NOM': { POS: ADJ }, # störst, bäst, tidigast, högst, fattigast - 'KN': { POS: CCONJ }, # och, eller, som, än, men - 
'KN|AN': { POS: CCONJ }, - 'MAD': { POS: PUNCT }, # ., ?, :, !, ... - 'MID': { POS: PUNCT }, # ,, -, :, *, ; - 'NN|-|-|-|-': { POS: NOUN }, # godo, fjol, fullo, somras, måtto - 'NN|AN': { POS: NOUN }, # kr, %, s., dr, kap. - 'NN|NEU|-|-|-': { POS: NOUN }, - 'NN|NEU|-|-|SMS': { POS: NOUN }, # yrkes-, barn-, hem-, fack-, vatten- - 'NN|NEU|PLU|DEF|GEN': { POS: NOUN }, # barnens, årens, u-ländernas, företagens, århundradenas - 'NN|NEU|PLU|DEF|NOM': { POS: NOUN }, # barnen, u-länderna, åren, länderna, könen - 'NN|NEU|PLU|IND|GEN': { POS: NOUN }, # slags, års, barns, länders, tusentals - 'NN|NEU|PLU|IND|NOM': { POS: NOUN }, # barn, år, fall, länder, problem - 'NN|NEU|SIN|DEF|GEN': { POS: NOUN }, # äktenskapets, samhällets, barnets, 1800-talets, 1960-talets - 'NN|NEU|SIN|DEF|NOM': { POS: NOUN }, # äktenskapet, samhället, barnet, stället, hemmet - 'NN|NEU|SIN|IND|GEN': { POS: NOUN }, # års, slags, lands, havs, företags - 'NN|NEU|SIN|IND|NOM': { POS: NOUN }, # år, arbete, barn, sätt, äktenskap - 'NN|SMS': { POS: NOUN }, # PCB-, Syd- - 'NN|UTR|-|-|-': { POS: NOUN }, # dags, rätta - 'NN|UTR|-|-|SMS': { POS: NOUN }, # far-, kibbutz-, röntgen-, barna-, hälso- - 'NN|UTR|PLU|DEF|GEN': { POS: NOUN }, # föräldrarnas, kvinnornas, elevernas, kibbutzernas, makarnas - 'NN|UTR|PLU|DEF|NOM': { POS: NOUN }, # kvinnorna, föräldrarna, makarna, männen, hyrorna - 'NN|UTR|PLU|IND|GEN': { POS: NOUN }, # människors, kvinnors, dagars, tiders, månaders - 'NN|UTR|PLU|IND|NOM': { POS: NOUN }, # procent, människor, kvinnor, miljoner, kronor - 'NN|UTR|SIN|DEF|GEN': { POS: NOUN }, # kvinnans, världens, familjens, dagens, jordens - 'NN|UTR|SIN|DEF|NOM': { POS: NOUN }, # familjen, kvinnan, mannen, världen, skolan - 'NN|UTR|SIN|IND|GEN': { POS: NOUN }, # sorts, medelålders, makes, kvinnas, veckas - 'NN|UTR|SIN|IND|NOM': { POS: NOUN }, # del, tid, dag, fråga, man - 'PAD': { POS: PUNCT }, # , ), ( - 'PC|AN': { POS: VERB }, - 'PC|PRF|MAS|SIN|DEF|GEN': { POS: VERB }, # avlidnes - 'PC|PRF|MAS|SIN|DEF|NOM': { POS: VERB }, - 'PC|PRF|NEU|SIN|IND|NOM': { POS: VERB }, # taget, sett, särskilt, förbjudet, ökat - 'PC|PRF|UTR/NEU|PLU|IND/DEF|GEN': { POS: VERB }, # försäkrades, anställdas - 'PC|PRF|UTR/NEU|PLU|IND/DEF|NOM': { POS: VERB }, # särskilda, gifta, ökade, handikappade, skilda - 'PC|PRF|UTR/NEU|SIN|DEF|GEN': { POS: VERB }, - 'PC|PRF|UTR/NEU|SIN|DEF|NOM': { POS: VERB }, # ökade, gifta, nämnda, nedärvda, dolda - 'PC|PRF|UTR|SIN|IND|GEN': { POS: VERB }, - 'PC|PRF|UTR|SIN|IND|NOM': { POS: VERB }, # särskild, ökad, beredd, gift, oförändrad - 'PC|PRS|UTR/NEU|SIN/PLU|IND/DEF|GEN': { POS: VERB }, # studerandes, sammanboendes, dubbelarbetandes - 'PC|PRS|UTR/NEU|SIN/PLU|IND/DEF|NOM': { POS: VERB }, # följande, beroende, nuvarande, motsvarande, liknande - 'PL': { POS: PART }, # ut, upp, in, till, med - 'PL|SMS': { POS: PART }, - 'PM': { POS: PROPN }, # F, N, Liechtenstein, Danmark, DK - 'PM|GEN': { POS: PROPN }, # Sveriges, EEC:s, Guds, Stockholms, Kristi - 'PM|NOM': { POS: PROPN }, # Sverige, EEC, Stockholm, USA, ATP - 'PM|SMS': { POS: PROPN }, # Göteborgs-, Nord-, Väst- - 'PN|MAS|SIN|DEF|SUB/OBJ': { POS: PRON }, # denne - 'PN|NEU|SIN|DEF|SUB/OBJ': { POS: PRON }, # det, detta, detsamma - 'PN|NEU|SIN|IND|SUB/OBJ': { POS: PRON }, # något, allt, mycket, annat, ingenting - 'PN|UTR/NEU|PLU|DEF|OBJ': { POS: PRON }, # dem, varandra, varann - 'PN|UTR/NEU|PLU|DEF|SUB': { POS: PRON }, # de, bägge - 'PN|UTR/NEU|PLU|DEF|SUB/OBJ': { POS: PRON }, # dessa, dom, båda, den, bådadera - 'PN|UTR/NEU|PLU|IND|SUB/OBJ': { POS: PRON }, # andra, alla, många, sådana, 
några - 'PN|UTR/NEU|SIN/PLU|DEF|OBJ': { POS: PRON }, # sig, sej - 'PN|UTR|PLU|DEF|OBJ': { POS: PRON }, # oss, er, eder - 'PN|UTR|PLU|DEF|SUB': { POS: PRON }, # vi - 'PN|UTR|SIN|DEF|OBJ': { POS: PRON }, # dig, mig, henne, honom, Er - 'PN|UTR|SIN|DEF|SUB': { POS: PRON }, # du, han, hon, jag, ni - 'PN|UTR|SIN|DEF|SUB/OBJ': { POS: PRON }, # den, denna, densamma - 'PN|UTR|SIN|IND|SUB': { POS: PRON }, # man - 'PN|UTR|SIN|IND|SUB/OBJ': { POS: PRON }, # en, var, någon, ingen, Varannan - 'PP': { POS: ADP }, # i, av, på, för, till - 'PP|AN': { POS: ADP }, # f - 'PS|AN': { POS: DET }, - 'PS|NEU|SIN|DEF': { POS: DET }, # sitt, vårt, ditt, mitt, ert - 'PS|UTR/NEU|PLU|DEF': { POS: DET }, # sina, våra, dina, mina - 'PS|UTR/NEU|SIN/PLU|DEF': { POS: DET }, # deras, dess, hans, hennes, varandras - 'PS|UTR|SIN|DEF': { POS: DET }, # sin, vår, din, min, er - 'RG': { POS: NUM }, # 2, 17, 20, 1, 18 - 'RG|GEN': { POS: NUM }, - 'RG|MAS|SIN|DEF|NOM': { POS: NUM }, - 'RG|NEU|SIN|IND|NOM': { POS: NUM }, # ett - 'RG|NOM': { POS: NUM }, # två, tre, 1, 20, 2 - 'RG|SMS': { POS: NUM }, # ett-, 1950-, två-, tre-, 1700- - 'RG|UTR/NEU|SIN|DEF|NOM': { POS: NUM }, - 'RG|UTR|SIN|IND|NOM': { POS: NUM }, # en - 'RO|MAS|SIN|IND/DEF|GEN': { POS: ADJ }, - 'RO|MAS|SIN|IND/DEF|NOM': { POS: ADJ }, # förste - 'RO|GEN': { POS: ADJ }, - 'RO|NOM': { POS: ADJ }, # första, andra, tredje, fjärde, femte - 'SN': { POS: SCONJ }, # att, om, innan, eftersom, medan - 'UO': { POS: X }, # companionship, vice, versa, family, capita - 'VB|AN': { POS: VERB }, # jfr - 'VB|IMP|AKT': { POS: VERB }, # se, Diskutera, låt, Läs, Gå - 'VB|IMP|SFO': { POS: VERB }, # tas - 'VB|INF|AKT': { POS: VERB }, # vara, få, ha, bli, kunna - 'VB|INF|SFO': { POS: VERB }, # användas, finnas, göras, tas, ses - 'VB|KON|PRS|AKT': { POS: VERB }, # vare, Gånge - 'VB|KON|PRT|AKT': { POS: VERB }, # vore, finge - 'VB|KON|PRT|SFO': { POS: VERB }, - 'VB|PRS|AKT': { POS: VERB }, # är, har, kan, får, måste - 'VB|PRS|SFO': { POS: VERB }, # finns, kallas, behövs, beräknas, används - 'VB|PRT|AKT': { POS: VERB }, # skulle, var, hade, kunde, fick - 'VB|PRT|SFO': { POS: VERB }, # fanns, gjordes, höjdes, användes, infördes - 'VB|SMS': { POS: VERB }, # läs- - 'VB|SUP|AKT': { POS: VERB }, # varit, fått, blivit, haft, kommit - 'VB|SUP|SFO': { POS: VERB } # nämnts, gjorts, förändrats, sagts, framhållits + "AB": {POS: ADV}, # inte, också, så, bara, nu + "AB|AN": {POS: ADV}, # t.ex., ca, t_ex, bl.a., s_k + "AB|KOM": {POS: ADV}, # mer, tidigare, mindre, vidare, mera + "AB|POS": {POS: ADV}, # mycket, helt, ofta, länge, långt + "AB|SMS": {POS: ADV}, # över-, in- + "AB|SUV": {POS: ADV}, # minst, mest, högst, främst, helst + "DT|MAS|SIN|DEF": {POS: DET}, + "DT|MAS|SIN|IND": {POS: DET}, + "DT|NEU|SIN|DEF": {POS: DET}, # det, detta + "DT|NEU|SIN|IND": {POS: DET}, # ett, något, inget, vart, vartannat + "DT|NEU|SIN|IND/DEF": {POS: DET}, # allt + "DT|UTR/NEU|PLU|DEF": {POS: DET}, # de, dessa, bägge, dom + "DT|UTR/NEU|PLU|IND": {POS: DET}, # några, inga + "DT|UTR/NEU|PLU|IND/DEF": {POS: DET}, # alla + "DT|UTR/NEU|SIN/PLU|IND": {POS: DET}, # samma + "DT|UTR/NEU|SIN|DEF": {POS: DET}, # vardera + "DT|UTR/NEU|SIN|IND": {POS: DET}, # varje, varenda + "DT|UTR|SIN|DEF": {POS: DET}, # den, denna + "DT|UTR|SIN|IND": {POS: DET}, # en, någon, ingen, var, varannan + "DT|UTR|SIN|IND/DEF": {POS: DET}, # all + "HA": {POS: ADV}, # när, där, hur, som, då + "HD|NEU|SIN|IND": {POS: DET}, # vilket + "HD|UTR/NEU|PLU|IND": {POS: DET}, # vilka + "HD|UTR|SIN|IND": {POS: DET}, # vilken + "HP|-|-|-": {POS: PRON}, # som + 
"HP|NEU|SIN|IND": {POS: PRON}, # vad, vilket + "HP|NEU|SIN|IND|SMS": {POS: PRON}, + "HP|UTR/NEU|PLU|IND": {POS: PRON}, # vilka + "HP|UTR|SIN|IND": {POS: PRON}, # vilken, vem + "HS|DEF": {POS: DET}, # vars, vilkas, Vems + "IE": {POS: PART}, # att + "IN": {POS: INTJ}, # Jo, ja, nej, fan, visst + "JJ|AN": {POS: ADJ}, # ev, S:t, Kungl, Kungl., Teol + "JJ|KOM|UTR/NEU|SIN/PLU|IND/DEF|GEN": {POS: ADJ}, # äldres + "JJ|KOM|UTR/NEU|SIN/PLU|IND/DEF|NOM": { + POS: ADJ + }, # större, högre, mindre, bättre, äldre + "JJ|KOM|UTR/NEU|SIN/PLU|IND/DEF|SMS": {POS: ADJ}, + "JJ|POS|MAS|SIN|DEF|GEN": {POS: ADJ}, # enskildes, sjukes, andres + "JJ|POS|MAS|SIN|DEF|NOM": {POS: ADJ}, # enskilde, sjuke, andre, unge, ene + "JJ|POS|NEU|SIN|IND/DEF|NOM": {POS: ADJ}, # eget + "JJ|POS|NEU|SIN|IND|GEN": {POS: ADJ}, + "JJ|POS|NEU|SIN|IND|NOM": {POS: ADJ}, # annat, svårt, möjligt, nytt, sådant + "JJ|POS|UTR/NEU|PLU|IND/DEF|GEN": { + POS: ADJ + }, # ogiftas, ungas, frånskildas, efterkommandes, färgblindas + "JJ|POS|UTR/NEU|PLU|IND/DEF|NOM": {POS: ADJ}, # olika, andra, många, stora, vissa + "JJ|POS|UTR/NEU|PLU|IND|NOM": {POS: ADJ}, # flera, sådana, fler, få, samtliga + "JJ|POS|UTR/NEU|SIN/PLU|IND|NOM": {POS: ADJ}, + "JJ|POS|UTR/NEU|SIN/PLU|IND/DEF|NOM": {POS: ADJ}, # bra, ena, enda, nästa, ringa + "JJ|POS|UTR/NEU|SIN|DEF|GEN": {POS: ADJ}, + "JJ|POS|UTR/NEU|SIN|DEF|NOM": {POS: ADJ}, # hela, nya, andra, svenska, ekonomiska + "JJ|POS|UTR|-|-|SMS": {POS: ADJ}, # fri-, låg-, sexual- + "JJ|POS|UTR|SIN|IND/DEF|NOM": {POS: ADJ}, # egen + "JJ|POS|UTR|SIN|IND|GEN": {POS: ADJ}, # enskilds + "JJ|POS|UTR|SIN|IND|NOM": {POS: ADJ}, # stor, annan, själv, sådan, viss + "JJ|SUV|MAS|SIN|DEF|GEN": {POS: ADJ}, + "JJ|SUV|MAS|SIN|DEF|NOM": {POS: ADJ}, # störste, främste, äldste, minste + "JJ|SUV|UTR/NEU|PLU|DEF|NOM": {POS: ADJ}, # flesta + "JJ|SUV|UTR/NEU|PLU|IND|NOM": {POS: ADJ}, + "JJ|SUV|UTR/NEU|SIN/PLU|DEF|NOM": { + POS: ADJ + }, # bästa, största, närmaste, viktigaste, högsta + "JJ|SUV|UTR/NEU|SIN/PLU|IND|NOM": { + POS: ADJ + }, # störst, bäst, tidigast, högst, fattigast + "KN": {POS: CCONJ}, # och, eller, som, än, men + "KN|AN": {POS: CCONJ}, + "MAD": {POS: PUNCT}, # ., ?, :, !, ... + "MID": {POS: PUNCT}, # ,, -, :, *, ; + "NN|-|-|-|-": {POS: NOUN}, # godo, fjol, fullo, somras, måtto + "NN|AN": {POS: NOUN}, # kr, %, s., dr, kap. 
+ "NN|NEU|-|-|-": {POS: NOUN}, + "NN|NEU|-|-|SMS": {POS: NOUN}, # yrkes-, barn-, hem-, fack-, vatten- + "NN|NEU|PLU|DEF|GEN": { + POS: NOUN + }, # barnens, årens, u-ländernas, företagens, århundradenas + "NN|NEU|PLU|DEF|NOM": {POS: NOUN}, # barnen, u-länderna, åren, länderna, könen + "NN|NEU|PLU|IND|GEN": {POS: NOUN}, # slags, års, barns, länders, tusentals + "NN|NEU|PLU|IND|NOM": {POS: NOUN}, # barn, år, fall, länder, problem + "NN|NEU|SIN|DEF|GEN": { + POS: NOUN + }, # äktenskapets, samhällets, barnets, 1800-talets, 1960-talets + "NN|NEU|SIN|DEF|NOM": { + POS: NOUN + }, # äktenskapet, samhället, barnet, stället, hemmet + "NN|NEU|SIN|IND|GEN": {POS: NOUN}, # års, slags, lands, havs, företags + "NN|NEU|SIN|IND|NOM": {POS: NOUN}, # år, arbete, barn, sätt, äktenskap + "NN|SMS": {POS: NOUN}, # PCB-, Syd- + "NN|UTR|-|-|-": {POS: NOUN}, # dags, rätta + "NN|UTR|-|-|SMS": {POS: NOUN}, # far-, kibbutz-, röntgen-, barna-, hälso- + "NN|UTR|PLU|DEF|GEN": { + POS: NOUN + }, # föräldrarnas, kvinnornas, elevernas, kibbutzernas, makarnas + "NN|UTR|PLU|DEF|NOM": { + POS: NOUN + }, # kvinnorna, föräldrarna, makarna, männen, hyrorna + "NN|UTR|PLU|IND|GEN": {POS: NOUN}, # människors, kvinnors, dagars, tiders, månaders + "NN|UTR|PLU|IND|NOM": {POS: NOUN}, # procent, människor, kvinnor, miljoner, kronor + "NN|UTR|SIN|DEF|GEN": {POS: NOUN}, # kvinnans, världens, familjens, dagens, jordens + "NN|UTR|SIN|DEF|NOM": {POS: NOUN}, # familjen, kvinnan, mannen, världen, skolan + "NN|UTR|SIN|IND|GEN": {POS: NOUN}, # sorts, medelålders, makes, kvinnas, veckas + "NN|UTR|SIN|IND|NOM": {POS: NOUN}, # del, tid, dag, fråga, man + "PAD": {POS: PUNCT}, # , ), ( + "PC|AN": {POS: VERB}, + "PC|PRF|MAS|SIN|DEF|GEN": {POS: VERB}, # avlidnes + "PC|PRF|MAS|SIN|DEF|NOM": {POS: VERB}, + "PC|PRF|NEU|SIN|IND|NOM": {POS: VERB}, # taget, sett, särskilt, förbjudet, ökat + "PC|PRF|UTR/NEU|PLU|IND/DEF|GEN": {POS: VERB}, # försäkrades, anställdas + "PC|PRF|UTR/NEU|PLU|IND/DEF|NOM": { + POS: VERB + }, # särskilda, gifta, ökade, handikappade, skilda + "PC|PRF|UTR/NEU|SIN|DEF|GEN": {POS: VERB}, + "PC|PRF|UTR/NEU|SIN|DEF|NOM": {POS: VERB}, # ökade, gifta, nämnda, nedärvda, dolda + "PC|PRF|UTR|SIN|IND|GEN": {POS: VERB}, + "PC|PRF|UTR|SIN|IND|NOM": {POS: VERB}, # särskild, ökad, beredd, gift, oförändrad + "PC|PRS|UTR/NEU|SIN/PLU|IND/DEF|GEN": { + POS: VERB + }, # studerandes, sammanboendes, dubbelarbetandes + "PC|PRS|UTR/NEU|SIN/PLU|IND/DEF|NOM": { + POS: VERB + }, # följande, beroende, nuvarande, motsvarande, liknande + "PL": {POS: PART}, # ut, upp, in, till, med + "PL|SMS": {POS: PART}, + "PM": {POS: PROPN}, # F, N, Liechtenstein, Danmark, DK + "PM|GEN": {POS: PROPN}, # Sveriges, EEC:s, Guds, Stockholms, Kristi + "PM|NOM": {POS: PROPN}, # Sverige, EEC, Stockholm, USA, ATP + "PM|SMS": {POS: PROPN}, # Göteborgs-, Nord-, Väst- + "PN|MAS|SIN|DEF|SUB/OBJ": {POS: PRON}, # denne + "PN|NEU|SIN|DEF|SUB/OBJ": {POS: PRON}, # det, detta, detsamma + "PN|NEU|SIN|IND|SUB/OBJ": {POS: PRON}, # något, allt, mycket, annat, ingenting + "PN|UTR/NEU|PLU|DEF|OBJ": {POS: PRON}, # dem, varandra, varann + "PN|UTR/NEU|PLU|DEF|SUB": {POS: PRON}, # de, bägge + "PN|UTR/NEU|PLU|DEF|SUB/OBJ": {POS: PRON}, # dessa, dom, båda, den, bådadera + "PN|UTR/NEU|PLU|IND|SUB/OBJ": {POS: PRON}, # andra, alla, många, sådana, några + "PN|UTR/NEU|SIN/PLU|DEF|OBJ": {POS: PRON}, # sig, sej + "PN|UTR|PLU|DEF|OBJ": {POS: PRON}, # oss, er, eder + "PN|UTR|PLU|DEF|SUB": {POS: PRON}, # vi + "PN|UTR|SIN|DEF|OBJ": {POS: PRON}, # dig, mig, henne, honom, Er + "PN|UTR|SIN|DEF|SUB": {POS: PRON}, # du, han, 
hon, jag, ni + "PN|UTR|SIN|DEF|SUB/OBJ": {POS: PRON}, # den, denna, densamma + "PN|UTR|SIN|IND|SUB": {POS: PRON}, # man + "PN|UTR|SIN|IND|SUB/OBJ": {POS: PRON}, # en, var, någon, ingen, Varannan + "PP": {POS: ADP}, # i, av, på, för, till + "PP|AN": {POS: ADP}, # f + "PS|AN": {POS: DET}, + "PS|NEU|SIN|DEF": {POS: DET}, # sitt, vårt, ditt, mitt, ert + "PS|UTR/NEU|PLU|DEF": {POS: DET}, # sina, våra, dina, mina + "PS|UTR/NEU|SIN/PLU|DEF": {POS: DET}, # deras, dess, hans, hennes, varandras + "PS|UTR|SIN|DEF": {POS: DET}, # sin, vår, din, min, er + "RG": {POS: NUM}, # 2, 17, 20, 1, 18 + "RG|GEN": {POS: NUM}, + "RG|MAS|SIN|DEF|NOM": {POS: NUM}, + "RG|NEU|SIN|IND|NOM": {POS: NUM}, # ett + "RG|NOM": {POS: NUM}, # två, tre, 1, 20, 2 + "RG|SMS": {POS: NUM}, # ett-, 1950-, två-, tre-, 1700- + "RG|UTR/NEU|SIN|DEF|NOM": {POS: NUM}, + "RG|UTR|SIN|IND|NOM": {POS: NUM}, # en + "RO|MAS|SIN|IND/DEF|GEN": {POS: ADJ}, + "RO|MAS|SIN|IND/DEF|NOM": {POS: ADJ}, # förste + "RO|GEN": {POS: ADJ}, + "RO|NOM": {POS: ADJ}, # första, andra, tredje, fjärde, femte + "SN": {POS: SCONJ}, # att, om, innan, eftersom, medan + "UO": {POS: X}, # companionship, vice, versa, family, capita + "VB|AN": {POS: VERB}, # jfr + "VB|IMP|AKT": {POS: VERB}, # se, Diskutera, låt, Läs, Gå + "VB|IMP|SFO": {POS: VERB}, # tas + "VB|INF|AKT": {POS: VERB}, # vara, få, ha, bli, kunna + "VB|INF|SFO": {POS: VERB}, # användas, finnas, göras, tas, ses + "VB|KON|PRS|AKT": {POS: VERB}, # vare, Gånge + "VB|KON|PRT|AKT": {POS: VERB}, # vore, finge + "VB|KON|PRT|SFO": {POS: VERB}, + "VB|PRS|AKT": {POS: VERB}, # är, har, kan, får, måste + "VB|PRS|SFO": {POS: VERB}, # finns, kallas, behövs, beräknas, används + "VB|PRT|AKT": {POS: VERB}, # skulle, var, hade, kunde, fick + "VB|PRT|SFO": {POS: VERB}, # fanns, gjordes, höjdes, användes, infördes + "VB|SMS": {POS: VERB}, # läs- + "VB|SUP|AKT": {POS: VERB}, # varit, fått, blivit, haft, kommit + "VB|SUP|SFO": {POS: VERB}, # nämnts, gjorts, förändrats, sagts, framhållits } diff --git a/spacy/lang/sv/tokenizer_exceptions.py b/spacy/lang/sv/tokenizer_exceptions.py index 3bf932695..dd0976aa6 100644 --- a/spacy/lang/sv/tokenizer_exceptions.py +++ b/spacy/lang/sv/tokenizer_exceptions.py @@ -144,7 +144,7 @@ ABBREVIATIONS = [ # Add abbreviation for trailing punctuation too. If the abbreviation already has a trailing punctuation - skip it. 
for abbr in ABBREVIATIONS: - if abbr.endswith(".") == False: + if not abbr.endswith("."): ABBREVIATIONS.append(abbr + ".") for orth in ABBREVIATIONS: diff --git a/spacy/lang/ta/__init__.py b/spacy/lang/ta/__init__.py index 8b21b70c7..cb23339e6 100644 --- a/spacy/lang/ta/__init__.py +++ b/spacy/lang/ta/__init__.py @@ -4,16 +4,15 @@ from __future__ import unicode_literals from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS -from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...language import Language from ...attrs import LANG -from ...util import update_exc class TamilDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: "ta" lex_attr_getters.update(LEX_ATTRS) + stop_words = STOP_WORDS class Tamil(Language): diff --git a/spacy/lang/tl/__init__.py b/spacy/lang/tl/__init__.py index 407e24bc3..f1d624670 100644 --- a/spacy/lang/tl/__init__.py +++ b/spacy/lang/tl/__init__.py @@ -4,70 +4,33 @@ from __future__ import unicode_literals from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS - -# uncomment if files are available -# from .norm_exceptions import NORM_EXCEPTIONS -from .tag_map import TAG_MAP -# from .morph_rules import MORPH_RULES - -# uncomment if lookup-based lemmatizer is available from .lemmatizer import LOOKUP -# from ...lemmatizerlookup import Lemmatizer - -from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..norm_exceptions import BASE_NORMS from ...language import Language from ...attrs import LANG, NORM from ...util import update_exc, add_lookups + def _return_tl(_): - return 'tl' - - -# Create a Language subclass -# Documentation: https://spacy.io/docs/usage/adding-languages - -# This file should be placed in spacy/lang/xx (ISO code of language). -# Before submitting a pull request, make sure the remove all comments from the -# language data files, and run at least the basic tokenizer tests. Simply add the -# language ID to the list of languages in spacy/tests/conftest.py to include it -# in the basic tokenizer sanity tests. You can optionally add a fixture for the -# language's tokenizer and add more specific tests. 
For more info, see the -# tests documentation: https://github.com/explosion/spaCy/tree/master/spacy/tests + return "tl" class TagalogDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) - lex_attr_getters[LANG] = _return_tl # ISO code - # add more norm exception dictionaries here - lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) - - # overwrite functions for lexical attributes + lex_attr_getters[LANG] = _return_tl + lex_attr_getters[NORM] = add_lookups( + Language.Defaults.lex_attr_getters[NORM], BASE_NORMS + ) lex_attr_getters.update(LEX_ATTRS) - - # add custom tokenizer exceptions to base exceptions tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) - - # add stop words stop_words = STOP_WORDS - - # if available: add tag map - # tag_map = dict(TAG_MAP) - - # if available: add morph rules - # morph_rules = dict(MORPH_RULES) - - # if available: add lookup lemmatizer - # @classmethod - # def create_lemmatizer(cls, nlp=None): - # return Lemmatizer(LOOKUP) + lemma_lookup = LOOKUP class Tagalog(Language): - lang = 'tl' # ISO code - Defaults = TagalogDefaults # set Defaults to custom language defaults + lang = "tl" + Defaults = TagalogDefaults -# set default export – this allows the language class to be lazy-loaded -__all__ = ['Tagalog'] +__all__ = ["Tagalog"] diff --git a/spacy/lang/tl/lemmatizer.py b/spacy/lang/tl/lemmatizer.py index 57045a6a4..4eae45eb6 100644 --- a/spacy/lang/tl/lemmatizer.py +++ b/spacy/lang/tl/lemmatizer.py @@ -2,11 +2,6 @@ from __future__ import unicode_literals -# Adding a lemmatizer lookup table -# Documentation: https://spacy.io/docs/usage/adding-languages#lemmatizer -# Entries should be added in the following format: - - LOOKUP = { "kaugnayan": "ugnay", "sangkatauhan": "tao", @@ -14,5 +9,5 @@ LOOKUP = { "pandaigdigan": "daigdig", "kasaysayan": "saysay", "kabayanihan": "bayani", - "karuwagan": "duwag" + "karuwagan": "duwag", } diff --git a/spacy/lang/tl/lex_attrs.py b/spacy/lang/tl/lex_attrs.py index ba396b48e..61dc9d4f3 100644 --- a/spacy/lang/tl/lex_attrs.py +++ b/spacy/lang/tl/lex_attrs.py @@ -1,33 +1,55 @@ # coding: utf8 from __future__ import unicode_literals -# import the symbols for the attrs you want to overwrite from ...attrs import LIKE_NUM -# Overwriting functions for lexical attributes -# Documentation: https://localhost:1234/docs/usage/adding-languages#lex-attrs -# Most of these functions, like is_lower or like_url should be language- -# independent. Others, like like_num (which includes both digits and number -# words), requires customisation. 
- - -# Example: check if token resembles a number - -_num_words = ['sero', 'isa', 'dalawa', 'tatlo', 'apat', 'lima', 'anim', 'pito', - 'walo', 'siyam', 'sampu', 'labing-isa', 'labindalawa', 'labintatlo', 'labing-apat', - 'labinlima', 'labing-anim', 'labimpito', 'labing-walo', 'labinsiyam', 'dalawampu', - 'tatlumpu', 'apatnapu', 'limampu', 'animnapu', 'pitumpu', 'walumpu', 'siyamnapu', - 'daan', 'libo', 'milyon', 'bilyon', 'trilyon', 'quadrilyon', - 'gajilyon', 'bazilyon'] +_num_words = [ + "sero", + "isa", + "dalawa", + "tatlo", + "apat", + "lima", + "anim", + "pito", + "walo", + "siyam", + "sampu", + "labing-isa", + "labindalawa", + "labintatlo", + "labing-apat", + "labinlima", + "labing-anim", + "labimpito", + "labing-walo", + "labinsiyam", + "dalawampu", + "tatlumpu", + "apatnapu", + "limampu", + "animnapu", + "pitumpu", + "walumpu", + "siyamnapu", + "daan", + "libo", + "milyon", + "bilyon", + "trilyon", + "quadrilyon", + "gajilyon", + "bazilyon", +] def like_num(text): - text = text.replace(',', '').replace('.', '') + text = text.replace(",", "").replace(".", "") if text.isdigit(): return True - if text.count('/') == 1: - num, denom = text.split('/') + if text.count("/") == 1: + num, denom = text.split("/") if num.isdigit() and denom.isdigit(): return True if text in _num_words: @@ -35,9 +57,4 @@ def like_num(text): return False -# Create dictionary of functions to overwrite. The default lex_attr_getters are -# updated with this one, so only the functions defined here are overwritten. - -LEX_ATTRS = { - LIKE_NUM: like_num -} +LEX_ATTRS = {LIKE_NUM: like_num} diff --git a/spacy/lang/tl/stop_words.py b/spacy/lang/tl/stop_words.py index bfda89eb4..510b3a418 100644 --- a/spacy/lang/tl/stop_words.py +++ b/spacy/lang/tl/stop_words.py @@ -1,162 +1,154 @@ # encoding: utf8 from __future__ import unicode_literals - -# Add stop words -# Documentation: https://spacy.io/docs/usage/adding-languages#stop-words -# To improve readability, words should be ordered alphabetically and separated -# by spaces and newlines. When adding stop words from an online source, always -# include the link in a comment. Make sure to proofread and double-check the -# words – lists available online are often known to contain mistakes. 
- -# data from https://github.com/stopwords-iso/stopwords-tl/blob/master/stopwords-tl.txt - -STOP_WORDS = set(""" - akin - aking - ako - alin - am - amin - aming - ang - ano - anumang - apat - at - atin - ating - ay - bababa - bago - bakit - bawat - bilang - dahil - dalawa - dapat - din - dito - doon - gagawin - gayunman - ginagawa - ginawa - ginawang - gumawa - gusto - habang - hanggang - hindi - huwag - iba - ibaba - ibabaw - ibig - ikaw - ilagay - ilalim - ilan - inyong - isa - isang - itaas - ito - iyo - iyon - iyong - ka - kahit - kailangan - kailanman - kami - kanila - kanilang - kanino - kanya - kanyang - kapag - kapwa - karamihan - katiyakan - katulad - kaya - kaysa - ko - kong - kulang - kumuha - kung - laban - lahat - lamang - likod - lima - maaari - maaaring - maging - mahusay - makita - marami - marapat - masyado - may - mayroon - mga - minsan - mismo - mula - muli - na - nabanggit - naging - nagkaroon - nais - nakita - namin - napaka - narito - nasaan - ng - ngayon - ni - nila - nilang - nito - niya - niyang - noon - o - pa - paano - pababa - paggawa - pagitan - pagkakaroon - pagkatapos - palabas - pamamagitan - panahon - pangalawa - para - paraan - pareho - pataas - pero - pumunta - pumupunta - sa - saan - sabi - sabihin - sarili - sila - sino - siya - tatlo - tayo - tulad - tungkol - una - walang -""".split()) +STOP_WORDS = set( + """ +akin +aking +ako +alin +am +amin +aming +ang +ano +anumang +apat +at +atin +ating +ay +bababa +bago +bakit +bawat +bilang +dahil +dalawa +dapat +din +dito +doon +gagawin +gayunman +ginagawa +ginawa +ginawang +gumawa +gusto +habang +hanggang +hindi +huwag +iba +ibaba +ibabaw +ibig +ikaw +ilagay +ilalim +ilan +inyong +isa +isang +itaas +ito +iyo +iyon +iyong +ka +kahit +kailangan +kailanman +kami +kanila +kanilang +kanino +kanya +kanyang +kapag +kapwa +karamihan +katiyakan +katulad +kaya +kaysa +ko +kong +kulang +kumuha +kung +laban +lahat +lamang +likod +lima +maaari +maaaring +maging +mahusay +makita +marami +marapat +masyado +may +mayroon +mga +minsan +mismo +mula +muli +na +nabanggit +naging +nagkaroon +nais +nakita +namin +napaka +narito +nasaan +ng +ngayon +ni +nila +nilang +nito +niya +niyang +noon +o +pa +paano +pababa +paggawa +pagitan +pagkakaroon +pagkatapos +palabas +pamamagitan +panahon +pangalawa +para +paraan +pareho +pataas +pero +pumunta +pumupunta +sa +saan +sabi +sabihin +sarili +sila +sino +siya +tatlo +tayo +tulad +tungkol +una +walang +""".split() +) diff --git a/spacy/lang/tl/tag_map.py b/spacy/lang/tl/tag_map.py deleted file mode 100644 index 38476c6f6..000000000 --- a/spacy/lang/tl/tag_map.py +++ /dev/null @@ -1,36 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - -from ...symbols import POS, ADV, NOUN, ADP, PRON, SCONJ, PROPN, DET, SYM, INTJ -from ...symbols import PUNCT, NUM, AUX, X, CONJ, ADJ, VERB, PART, SPACE, CCONJ - - -# Add a tag map -# Documentation: https://spacy.io/docs/usage/adding-languages#tag-map -# Universal Dependencies: http://universaldependencies.org/u/pos/all.html -# The keys of the tag map should be strings in your tag set. The dictionary must -# have an entry POS whose value is one of the Universal Dependencies tags. -# Optionally, you can also include morphological features or other attributes. 
- - -TAG_MAP = { - "ADV": {POS: ADV}, - "NOUN": {POS: NOUN}, - "ADP": {POS: ADP}, - "PRON": {POS: PRON}, - "SCONJ": {POS: SCONJ}, - "PROPN": {POS: PROPN}, - "DET": {POS: DET}, - "SYM": {POS: SYM}, - "INTJ": {POS: INTJ}, - "PUNCT": {POS: PUNCT}, - "NUM": {POS: NUM}, - "AUX": {POS: AUX}, - "X": {POS: X}, - "CONJ": {POS: CONJ}, - "CCONJ": {POS: CCONJ}, - "ADJ": {POS: ADJ}, - "VERB": {POS: VERB}, - "PART": {POS: PART}, - "SP": {POS: SPACE} -} diff --git a/spacy/lang/tl/tokenizer_exceptions.py b/spacy/lang/tl/tokenizer_exceptions.py index 1df8d6796..77e1fb0c6 100644 --- a/spacy/lang/tl/tokenizer_exceptions.py +++ b/spacy/lang/tl/tokenizer_exceptions.py @@ -1,48 +1,20 @@ # coding: utf8 from __future__ import unicode_literals -# import symbols – if you need to use more, add them here -from ...symbols import ORTH, LEMMA, TAG, NORM, ADP, DET +from ...symbols import ORTH, LEMMA -# Add tokenizer exceptions -# Documentation: https://spacy.io/docs/usage/adding-languages#tokenizer-exceptions -# Feel free to use custom logic to generate repetitive exceptions more efficiently. -# If an exception is split into more than one token, the ORTH values combined always -# need to match the original string. - -# Exceptions should be added in the following format: - _exc = { - "tayo'y": [ - {ORTH: "tayo", LEMMA: "tayo"}, - {ORTH: "'y", LEMMA: "ay"}], - "isa'y": [ - {ORTH: "isa", LEMMA: "isa"}, - {ORTH: "'y", LEMMA: "ay"}], - "baya'y": [ - {ORTH: "baya", LEMMA: "bayan"}, - {ORTH: "'y", LEMMA: "ay"}], - "sa'yo": [ - {ORTH: "sa", LEMMA: "sa"}, - {ORTH: "'yo", LEMMA: "iyo"}], - "ano'ng": [ - {ORTH: "ano", LEMMA: "ano"}, - {ORTH: "'ng", LEMMA: "ang"}], - "siya'y": [ - {ORTH: "siya", LEMMA: "siya"}, - {ORTH: "'y", LEMMA: "ay"}], - "nawa'y": [ - {ORTH: "nawa", LEMMA: "nawa"}, - {ORTH: "'y", LEMMA: "ay"}], - "papa'no": [ - {ORTH: "papa'no", LEMMA: "papaano"}], - "'di": [ - {ORTH: "'di", LEMMA: "hindi"}] + "tayo'y": [{ORTH: "tayo", LEMMA: "tayo"}, {ORTH: "'y", LEMMA: "ay"}], + "isa'y": [{ORTH: "isa", LEMMA: "isa"}, {ORTH: "'y", LEMMA: "ay"}], + "baya'y": [{ORTH: "baya", LEMMA: "bayan"}, {ORTH: "'y", LEMMA: "ay"}], + "sa'yo": [{ORTH: "sa", LEMMA: "sa"}, {ORTH: "'yo", LEMMA: "iyo"}], + "ano'ng": [{ORTH: "ano", LEMMA: "ano"}, {ORTH: "'ng", LEMMA: "ang"}], + "siya'y": [{ORTH: "siya", LEMMA: "siya"}, {ORTH: "'y", LEMMA: "ay"}], + "nawa'y": [{ORTH: "nawa", LEMMA: "nawa"}, {ORTH: "'y", LEMMA: "ay"}], + "papa'no": [{ORTH: "papa'no", LEMMA: "papaano"}], + "'di": [{ORTH: "'di", LEMMA: "hindi"}], } -# To keep things clean and readable, it's recommended to only declare the -# TOKENIZER_EXCEPTIONS at the bottom: - TOKENIZER_EXCEPTIONS = _exc diff --git a/spacy/lang/tokenizer_exceptions.py b/spacy/lang/tokenizer_exceptions.py index 305f01345..90141e81a 100644 --- a/spacy/lang/tokenizer_exceptions.py +++ b/spacy/lang/tokenizer_exceptions.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals import re -from ..symbols import ORTH, POS, TAG, LEMMA, SPACE, PUNCT +from ..symbols import ORTH, POS, TAG, LEMMA, SPACE # URL validation regex courtesy of: https://mathiasbynens.be/demo/url-regex diff --git a/spacy/lang/uk/__init__.py b/spacy/lang/uk/__init__.py index 8ee9dab52..d152c08a4 100644 --- a/spacy/lang/uk/__init__.py +++ b/spacy/lang/uk/__init__.py @@ -5,71 +5,32 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS -# uncomment if files are available -# from .norm_exceptions import NORM_EXCEPTIONS -# from .tag_map import TAG_MAP -# from .morph_rules import 
MORPH_RULES - -# uncomment if lookup-based lemmatizer is available -# from .lemmatizer import LOOKUP -# from ...lemmatizerlookup import Lemmatizer - from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..norm_exceptions import BASE_NORMS from ...util import update_exc, add_lookups from ...language import Language -from ...attrs import LANG, LIKE_NUM, NORM -# from .tag_map import TAG_MAP +from ...attrs import LANG, NORM from .lemmatizer import UkrainianLemmatizer -# Create a Language subclass -# Documentation: https://spacy.io/docs/usage/adding-languages - -# This file should be placed in spacy/lang/xx (ISO code of language). -# Before submitting a pull request, make sure the remove all comments from the -# language data files, and run at least the basic tokenizer tests. Simply add the -# language ID to the list of languages in spacy/tests/conftest.py to include it -# in the basic tokenizer sanity tests. You can optionally add a fixture for the -# language's tokenizer and add more specific tests. For more info, see the -# tests documentation: https://github.com/explosion/spaCy/tree/master/spacy/tests - - class UkrainianDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) - lex_attr_getters[LANG] = lambda text: 'uk' # ISO code - # add more norm exception dictionaries here - lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) - - # overwrite functions for lexical attributes + lex_attr_getters[LANG] = lambda text: "uk" + lex_attr_getters[NORM] = add_lookups( + Language.Defaults.lex_attr_getters[NORM], BASE_NORMS + ) lex_attr_getters.update(LEX_ATTRS) - - # add custom tokenizer exceptions to base exceptions tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) - - # add stop words stop_words = STOP_WORDS - # if available: add tag map - # tag_map = dict(TAG_MAP) - - # if available: add morph rules - # morph_rules = dict(MORPH_RULES) - - # if available: add lookup lemmatizer - # @classmethod - # def create_lemmatizer(cls, nlp=None): - # return Lemmatizer(LOOKUP) - @classmethod def create_lemmatizer(cls, nlp=None): return UkrainianLemmatizer() class Ukrainian(Language): - lang = 'uk' # ISO code - Defaults = UkrainianDefaults # set Defaults to custom language defaults + lang = "uk" + Defaults = UkrainianDefaults -# set default export – this allows the language class to be lazy-loaded -__all__ = ['Ukrainian'] +__all__ = ["Ukrainian"] diff --git a/spacy/lang/uk/examples.py b/spacy/lang/uk/examples.py index 22df2b81c..4f2b034eb 100644 --- a/spacy/lang/uk/examples.py +++ b/spacy/lang/uk/examples.py @@ -14,10 +14,10 @@ sentences = [ "Ніч на середу буде морозною.", "Чим кращі книги ти читав, тим гірше спиш.", # Serhiy Zhadan "Найстаріші ґудзики, відомі людству, археологи знайшли в долині ріки Інд.", - "Слов'янське слово «Україна» вперше згадується у Київському літописному зводі за Іпатіївським списком під 1187 роком.", # wikipedia + "Слов'янське слово «Україна» вперше згадується у Київському літописному зводі за Іпатіївським списком під 1187 роком.", # wikipedia "Де у Києві найсмачніша кава?", - "Від Нижнього озера довгими дерев’яними сходами, над якими синьо й біло горіли маленькі коробочки-ліхтарики, підіймалися до нього двоє стовусів: найкращий друг Вертутій і його дванадцятилітній онук Чублик.", # blyznets_viktor_semenovych/zemlia_svitliachkiv + "Від Нижнього озера довгими дерев’яними сходами, над якими синьо й біло горіли маленькі коробочки-ліхтарики, підіймалися до нього двоє стовусів: найкращий друг 
Вертутій і його дванадцятилітній онук Чублик.", # blyznets_viktor_semenovych/zemlia_svitliachkiv "Китайський космічний зонд \"Чан'е-4\" вперше в історії здійснив м'яку посадку на зворотному боці Місяця.", - "Коли до губ твоїх лишається півподиху, коли до губ твоїх лишається півкроку – зіниці твої виткані із подиву, в очах у тебе синьо і широко.", # Hryhorij Czubaj - "Дорогу сестру збираю у дорогу, а брати вирішили не брати машину." # homographs + "Коли до губ твоїх лишається півподиху, коли до губ твоїх лишається півкроку – зіниці твої виткані із подиву, в очах у тебе синьо і широко.", # Hryhorij Czubaj + "Дорогу сестру збираю у дорогу, а брати вирішили не брати машину.", # homographs ] diff --git a/spacy/lang/uk/lemmatizer.py b/spacy/lang/uk/lemmatizer.py index 8db294507..867cd3943 100644 --- a/spacy/lang/uk/lemmatizer.py +++ b/spacy/lang/uk/lemmatizer.py @@ -1,12 +1,15 @@ +# coding: utf8 +from __future__ import unicode_literals + from ..ru.lemmatizer import RussianLemmatizer class UkrainianLemmatizer(RussianLemmatizer): - - def __init__(self, pymorphy2_lang='ru'): + def __init__(self, pymorphy2_lang="ru"): try: - super(UkrainianLemmatizer, self).__init__(pymorphy2_lang='uk') + super(UkrainianLemmatizer, self).__init__(pymorphy2_lang="uk") except ImportError: raise ImportError( - 'The Ukrainian lemmatizer requires the pymorphy2 library and dictionaries: ' - 'try to fix it with "pip install git+https://github.com/kmike/pymorphy2.git pymorphy2-dicts-uk"') + "The Ukrainian lemmatizer requires the pymorphy2 library and dictionaries: " + 'try to fix it with "pip install git+https://github.com/kmike/pymorphy2.git pymorphy2-dicts-uk"' + ) diff --git a/spacy/lang/uk/lex_attrs.py b/spacy/lang/uk/lex_attrs.py index 98ded5fb8..0ade751d6 100644 --- a/spacy/lang/uk/lex_attrs.py +++ b/spacy/lang/uk/lex_attrs.py @@ -1,32 +1,68 @@ # coding: utf8 from __future__ import unicode_literals -# import the symbols for the attrs you want to overwrite from ...attrs import LIKE_NUM - -# Overwriting functions for lexical attributes -# Documentation: https://localhost:1234/docs/usage/adding-languages#lex-attrs -# Most of these functions, like is_lower or like_url should be language- -# independent. Others, like like_num (which includes both digits and number -# words), requires customisation. 
- - -# Example: check if token resembles a number -_num_words = ["більйон", "вісім", "вісімдесят", "вісімнадцять", "вісімсот", "восьмий", "два", "двадцять", "дванадцять", - "двісті", "дев'яносто", "дев'ятнадцять", "дев'ятсот", "дев'ять", "десять", "децильйон", "квадрильйон", - "квінтильйон", "мільйон", "мільярд", "нонильйон", "один", "одинадцять", "октильйон", "п'ятий", - "п'ятисотий", "п'ятнадцять", "п'ятсот", "п'ять", "секстильйон", "септильйон", "сім", "сімдесят", - "сімнадцять", "сімсот", "сорок", "сто", "тисяча", "три", "тридцять", "трильйон", "тринадцять", "триста", - "чотири", "чотириста", "чотирнадцять", "шістдесят", "шістнадцять", "шістсот", "шість"] +_num_words = [ + "більйон", + "вісім", + "вісімдесят", + "вісімнадцять", + "вісімсот", + "восьмий", + "два", + "двадцять", + "дванадцять", + "двісті", + "дев'яносто", + "дев'ятнадцять", + "дев'ятсот", + "дев'ять", + "десять", + "децильйон", + "квадрильйон", + "квінтильйон", + "мільйон", + "мільярд", + "нонильйон", + "один", + "одинадцять", + "октильйон", + "п'ятий", + "п'ятисотий", + "п'ятнадцять", + "п'ятсот", + "п'ять", + "секстильйон", + "септильйон", + "сім", + "сімдесят", + "сімнадцять", + "сімсот", + "сорок", + "сто", + "тисяча", + "три", + "тридцять", + "трильйон", + "тринадцять", + "триста", + "чотири", + "чотириста", + "чотирнадцять", + "шістдесят", + "шістнадцять", + "шістсот", + "шість", +] def like_num(text): - text = text.replace(',', '').replace('.', '') + text = text.replace(",", "").replace(".", "") if text.isdigit(): return True - if text.count('/') == 1: - num, denom = text.split('/') + if text.count("/") == 1: + num, denom = text.split("/") if num.isdigit() and denom.isdigit(): return True if text in _num_words: @@ -34,9 +70,4 @@ def like_num(text): return False -# Create dictionary of functions to overwrite. The default lex_attr_getters are -# updated with this one, so only the functions defined here are overwritten. - -LEX_ATTRS = { - LIKE_NUM: like_num -} +LEX_ATTRS = {LIKE_NUM: like_num} diff --git a/spacy/lang/uk/stop_words.py b/spacy/lang/uk/stop_words.py index f5b85312f..83e86d937 100644 --- a/spacy/lang/uk/stop_words.py +++ b/spacy/lang/uk/stop_words.py @@ -2,15 +2,8 @@ from __future__ import unicode_literals -# Add stop words -# Documentation: https://spacy.io/docs/usage/adding-languages#stop-words -# To improve readability, words should be ordered alphabetically and separated -# by spaces and newlines. When adding stop words from an online source, always -# include the link in a comment. Make sure to proofread and double-check the -# words – lists available online are often known to contain mistakes. - - -STOP_WORDS = set("""а +STOP_WORDS = set( + """а або адже але @@ -401,4 +394,5 @@ STOP_WORDS = set("""а якій якого якщо -""".split()) +""".split() +) diff --git a/spacy/lang/uk/tag_map.py b/spacy/lang/uk/tag_map.py index f5ae22f43..472e772ef 100644 --- a/spacy/lang/uk/tag_map.py +++ b/spacy/lang/uk/tag_map.py @@ -5,32 +5,24 @@ from ..symbols import POS, ADV, NOUN, ADP, PRON, SCONJ, PROPN, DET, SYM, INTJ from ..symbols import PUNCT, NUM, AUX, X, CONJ, ADJ, VERB, PART, SPACE, CCONJ -# Add a tag map -# Documentation: https://spacy.io/docs/usage/adding-languages#tag-map -# Universal Dependencies: http://universaldependencies.org/u/pos/all.html -# The keys of the tag map should be strings in your tag set. The dictionary must -# have an entry POS whose value is one of the Universal Dependencies tags. -# Optionally, you can also include morphological features or other attributes. 
- - TAG_MAP = { - "ADV": {POS: ADV}, - "NOUN": {POS: NOUN}, - "ADP": {POS: ADP}, - "PRON": {POS: PRON}, - "SCONJ": {POS: SCONJ}, - "PROPN": {POS: PROPN}, - "DET": {POS: DET}, - "SYM": {POS: SYM}, - "INTJ": {POS: INTJ}, - "PUNCT": {POS: PUNCT}, - "NUM": {POS: NUM}, - "AUX": {POS: AUX}, - "X": {POS: X}, - "CONJ": {POS: CONJ}, - "CCONJ": {POS: CCONJ}, - "ADJ": {POS: ADJ}, - "VERB": {POS: VERB}, - "PART": {POS: PART}, - "SP": {POS: SPACE} + "ADV": {POS: ADV}, + "NOUN": {POS: NOUN}, + "ADP": {POS: ADP}, + "PRON": {POS: PRON}, + "SCONJ": {POS: SCONJ}, + "PROPN": {POS: PROPN}, + "DET": {POS: DET}, + "SYM": {POS: SYM}, + "INTJ": {POS: INTJ}, + "PUNCT": {POS: PUNCT}, + "NUM": {POS: NUM}, + "AUX": {POS: AUX}, + "X": {POS: X}, + "CONJ": {POS: CONJ}, + "CCONJ": {POS: CCONJ}, + "ADJ": {POS: ADJ}, + "VERB": {POS: VERB}, + "PART": {POS: PART}, + "SP": {POS: SPACE}, } diff --git a/spacy/lang/uk/tokenizer_exceptions.py b/spacy/lang/uk/tokenizer_exceptions.py index c5e1595eb..a94d77af3 100644 --- a/spacy/lang/uk/tokenizer_exceptions.py +++ b/spacy/lang/uk/tokenizer_exceptions.py @@ -1,18 +1,9 @@ # coding: utf8 from __future__ import unicode_literals -# import symbols – if you need to use more, add them here from ...symbols import ORTH, LEMMA, POS, NORM, NOUN -# Add tokenizer exceptions -# Documentation: https://spacy.io/docs/usage/adding-languages#tokenizer-exceptions -# Feel free to use custom logic to generate repetitive exceptions more efficiently. -# If an exception is split into more than one token, the ORTH values combined always -# need to match the original string. - -# Exceptions should be added in the following format: - _exc = {} for exc_data in [ @@ -28,11 +19,9 @@ for exc_data in [ {ORTH: "проф.", LEMMA: "професор", NORM: "професор", POS: NOUN}, {ORTH: "акад.", LEMMA: "академік", NORM: "академік", POS: NOUN}, {ORTH: "доц.", LEMMA: "доцент", NORM: "доцент", POS: NOUN}, - {ORTH: "оз.", LEMMA: "озеро", NORM: "озеро", POS: NOUN}]: + {ORTH: "оз.", LEMMA: "озеро", NORM: "озеро", POS: NOUN}, +]: _exc[exc_data[ORTH]] = [exc_data] -# To keep things clean and readable, it's recommended to only declare the -# TOKENIZER_EXCEPTIONS at the bottom: - TOKENIZER_EXCEPTIONS = _exc diff --git a/spacy/matcher/__init__.py b/spacy/matcher/__init__.py index e417097b2..e9105d78b 100644 --- a/spacy/matcher/__init__.py +++ b/spacy/matcher/__init__.py @@ -1,6 +1,6 @@ # coding: utf8 from __future__ import unicode_literals -from .matcher import Matcher -from .phrasematcher import PhraseMatcher -from .dependencymatcher import DependencyTreeMatcher +from .matcher import Matcher # noqa: F401 +from .phrasematcher import PhraseMatcher # noqa: F401 +from .dependencymatcher import DependencyTreeMatcher # noqa: F401 diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 5032dd071..3caed87e2 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -119,8 +119,8 @@ def tr_tokenizer(): @pytest.fixture(scope="session") def uk_tokenizer(): - pymorphy = pytest.importorskip("pymorphy2") - return util.get_lang_class("uk").Defaults.create_tokenizer() + pytest.importorskip("pymorphy2") + return get_lang_class("uk").Defaults.create_tokenizer() @pytest.fixture(scope="session") @@ -130,7 +130,7 @@ def ca_tokenizer(): @pytest.fixture(scope="session") def pl_tokenizer(): - return util.get_lang_class("pl").Defaults.create_tokenizer() + return get_lang_class("pl").Defaults.create_tokenizer() @pytest.fixture(scope="session") diff --git a/spacy/tests/lang/pl/__init__.py b/spacy/tests/lang/pl/__init__.py new file mode 
100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/pl/test_tokenizer.py b/spacy/tests/lang/pl/test_tokenizer.py index 27eb9af1c..9d0034589 100644 --- a/spacy/tests/lang/pl/test_tokenizer.py +++ b/spacy/tests/lang/pl/test_tokenizer.py @@ -3,57 +3,57 @@ from __future__ import unicode_literals import pytest -DOT_TESTS = [ - ('tel.', ['tel.']), - ('np.', ['np.']), - ('godz. 21:37', ['godz.', '21:37']), - ('inż.', ['inż.']), - ('gosp.-polit.', ['gosp.-polit.']), - ('ppoż', ['ppoż']), - ('płn', ['płn']), - ('ul.', ['ul.']), - ('jw.', ['jw.']), - ('itd.', ['itd.']), - ('cdn.', ['cdn.']), - ('itp.', ['itp.']), - ('10,- zł', ['10,-', 'zł']), - ('0 zł 99 gr', ['0', 'zł', '99', 'gr']), - ('0,99 rub.', ['0,99', 'rub.']), - ('dol.', ['dol.']), - ('1000 m n.p.m.', ['1000', 'm', 'n.p.m.']), - ('m.in.', ['m.in.']), - ('p.n.e.', ['p.n.e.']), - ('Sz.P.', ['Sz.P.']), - ('p.o.', ['p.o.']), - ('k.o.', ['k.o.']), - ('m.st.', ['m.st.']), - ('dra.', ['dra', '.']), - ('pp.', ['pp.']), - ('oo.', ['oo.']) +DOT_TESTS = [ + ("tel.", ["tel."]), + ("np.", ["np."]), + ("godz. 21:37", ["godz.", "21:37"]), + ("inż.", ["inż."]), + ("gosp.-polit.", ["gosp.-polit."]), + ("ppoż", ["ppoż"]), + ("płn", ["płn"]), + ("ul.", ["ul."]), + ("jw.", ["jw."]), + ("itd.", ["itd."]), + ("cdn.", ["cdn."]), + ("itp.", ["itp."]), + ("10,- zł", ["10,-", "zł"]), + ("0 zł 99 gr", ["0", "zł", "99", "gr"]), + ("0,99 rub.", ["0,99", "rub."]), + ("dol.", ["dol."]), + ("1000 m n.p.m.", ["1000", "m", "n.p.m."]), + ("m.in.", ["m.in."]), + ("p.n.e.", ["p.n.e."]), + ("Sz.P.", ["Sz.P."]), + ("p.o.", ["p.o."]), + ("k.o.", ["k.o."]), + ("m.st.", ["m.st."]), + ("dra.", ["dra", "."]), + ("pp.", ["pp."]), + ("oo.", ["oo."]), ] HYPHEN_TESTS = [ - ('5-fluoropentylo-3-pirydynyloindol', ['5-fluoropentylo-3-pirydynyloindol']), - ('NESS-040C5', ['NESS-040C5']), - ('JTE-7-31', ['JTE-7-31']), - ('BAY-59-3074', ['BAY-59-3074']), - ('BAY-38-7271', ['BAY-38-7271']), - ('STS-135', ['STS-135']), - ('5F-PB-22', ['5F-PB-22']), - ('cztero-', ['cztero-']), - ('jedno-', ['jedno-']), - ('dwu-', ['dwu-']), - ('trzy-', ['trzy-']), - ('b-adoratorzy', ['b-adoratorzy']), - ('2-3-4 drzewa', ['2-3-4', 'drzewa']), - ('b-drzewa', ['b-drzewa']) + ("5-fluoropentylo-3-pirydynyloindol", ["5-fluoropentylo-3-pirydynyloindol"]), + ("NESS-040C5", ["NESS-040C5"]), + ("JTE-7-31", ["JTE-7-31"]), + ("BAY-59-3074", ["BAY-59-3074"]), + ("BAY-38-7271", ["BAY-38-7271"]), + ("STS-135", ["STS-135"]), + ("5F-PB-22", ["5F-PB-22"]), + ("cztero-", ["cztero-"]), + ("jedno-", ["jedno-"]), + ("dwu-", ["dwu-"]), + ("trzy-", ["trzy-"]), + ("b-adoratorzy", ["b-adoratorzy"]), + ("2-3-4 drzewa", ["2-3-4", "drzewa"]), + ("b-drzewa", ["b-drzewa"]), ] TESTCASES = DOT_TESTS + HYPHEN_TESTS -@pytest.mark.parametrize('text,expected_tokens', TESTCASES) +@pytest.mark.parametrize("text,expected_tokens", TESTCASES) def test_tokenizer_handles_testcases(pl_tokenizer, text, expected_tokens): tokens = pl_tokenizer(text) token_list = [token.text for token in tokens if not token.is_space] diff --git a/spacy/tests/lang/sv/test_exceptions.py b/spacy/tests/lang/sv/test_exceptions.py index 992edd98e..c977a4183 100644 --- a/spacy/tests/lang/sv/test_exceptions.py +++ b/spacy/tests/lang/sv/test_exceptions.py @@ -5,34 +5,42 @@ import pytest SV_TOKEN_EXCEPTION_TESTS = [ - ('Smörsåsen används bl.a. till fisk', ['Smörsåsen', 'används', 'bl.a.', 'till', 'fisk']), - ('Jag kommer först kl. 13 p.g.a. diverse förseningar', ['Jag', 'kommer', 'först', 'kl.', '13', 'p.g.a.', 'diverse', 'förseningar']), - ('Anders I. 
tycker om ord med i i.', ["Anders", "I.", "tycker", "om", "ord", "med", "i", "i", "."]) + ( + "Smörsåsen används bl.a. till fisk", + ["Smörsåsen", "används", "bl.a.", "till", "fisk"], + ), + ( + "Jag kommer först kl. 13 p.g.a. diverse förseningar", + ["Jag", "kommer", "först", "kl.", "13", "p.g.a.", "diverse", "förseningar"], + ), + ( + "Anders I. tycker om ord med i i.", + ["Anders", "I.", "tycker", "om", "ord", "med", "i", "i", "."], + ), ] -@pytest.mark.parametrize('text,expected_tokens', SV_TOKEN_EXCEPTION_TESTS) +@pytest.mark.parametrize("text,expected_tokens", SV_TOKEN_EXCEPTION_TESTS) def test_sv_tokenizer_handles_exception_cases(sv_tokenizer, text, expected_tokens): tokens = sv_tokenizer(text) token_list = [token.text for token in tokens if not token.is_space] assert expected_tokens == token_list -@pytest.mark.parametrize('text', ["driveru", "hajaru", "Serru", "Fixaru"]) +@pytest.mark.parametrize("text", ["driveru", "hajaru", "Serru", "Fixaru"]) def test_sv_tokenizer_handles_verb_exceptions(sv_tokenizer, text): tokens = sv_tokenizer(text) assert len(tokens) == 2 assert tokens[1].text == "u" -@pytest.mark.parametrize('text', - ["bl.a", "m.a.o.", "Jan.", "Dec.", "kr.", "osv."]) +@pytest.mark.parametrize("text", ["bl.a", "m.a.o.", "Jan.", "Dec.", "kr.", "osv."]) def test_sv_tokenizer_handles_abbr(sv_tokenizer, text): tokens = sv_tokenizer(text) assert len(tokens) == 1 -@pytest.mark.parametrize('text', ["Jul.", "jul.", "sön.", "Sön."]) +@pytest.mark.parametrize("text", ["Jul.", "jul.", "sön.", "Sön."]) def test_sv_tokenizer_handles_ambiguous_abbr(sv_tokenizer, text): tokens = sv_tokenizer(text) assert len(tokens) == 2 diff --git a/spacy/tests/lang/sv/test_lemmatizer.py b/spacy/tests/lang/sv/test_lemmatizer.py index a96e1edfb..4f47a4a3c 100644 --- a/spacy/tests/lang/sv/test_lemmatizer.py +++ b/spacy/tests/lang/sv/test_lemmatizer.py @@ -4,12 +4,17 @@ from __future__ import unicode_literals import pytest -@pytest.mark.parametrize('string,lemma', [('DNA-profilernas', 'DNA-profil'), - ('Elfenbenskustens', 'Elfenbenskusten'), - ('abortmotståndarens', 'abortmotståndare'), - ('kolesterols', 'kolesterol'), - ('portionssnusernas', 'portionssnus'), - ('åsyns', 'åsyn')]) +@pytest.mark.parametrize( + "string,lemma", + [ + ("DNA-profilernas", "DNA-profil"), + ("Elfenbenskustens", "Elfenbenskusten"), + ("abortmotståndarens", "abortmotståndare"), + ("kolesterols", "kolesterol"), + ("portionssnusernas", "portionssnus"), + ("åsyns", "åsyn"), + ], +) def test_lemmatizer_lookup_assigns(sv_tokenizer, string, lemma): tokens = sv_tokenizer(string) assert tokens[0].lemma_ == lemma diff --git a/spacy/tests/lang/sv/test_prefix_suffix_infix.py b/spacy/tests/lang/sv/test_prefix_suffix_infix.py index c04d501b0..f3fdd9a9e 100644 --- a/spacy/tests/lang/sv/test_prefix_suffix_infix.py +++ b/spacy/tests/lang/sv/test_prefix_suffix_infix.py @@ -1,28 +1,28 @@ # coding: utf-8 -"""Test that tokenizer prefixes, suffixes and infixes are handled correctly.""" from __future__ import unicode_literals import pytest -@pytest.mark.parametrize('text', ["(under)"]) + +@pytest.mark.parametrize("text", ["(under)"]) def test_tokenizer_splits_no_special(sv_tokenizer, text): tokens = sv_tokenizer(text) assert len(tokens) == 3 -@pytest.mark.parametrize('text', ["gitta'r", "Björn's", "Lars'"]) +@pytest.mark.parametrize("text", ["gitta'r", "Björn's", "Lars'"]) def test_tokenizer_handles_no_punct(sv_tokenizer, text): tokens = sv_tokenizer(text) assert len(tokens) == 1 -@pytest.mark.parametrize('text', ["svart.Gul", "Hej.Världen"]) 
+@pytest.mark.parametrize("text", ["svart.Gul", "Hej.Världen"]) def test_tokenizer_splits_period_infix(sv_tokenizer, text): tokens = sv_tokenizer(text) assert len(tokens) == 3 -@pytest.mark.parametrize('text', ["Hej,Världen", "en,två"]) +@pytest.mark.parametrize("text", ["Hej,Världen", "en,två"]) def test_tokenizer_splits_comma_infix(sv_tokenizer, text): tokens = sv_tokenizer(text) assert len(tokens) == 3 @@ -31,7 +31,7 @@ def test_tokenizer_splits_comma_infix(sv_tokenizer, text): assert tokens[2].text == text.split(",")[1] -@pytest.mark.parametrize('text', ["svart...Gul", "svart...gul"]) +@pytest.mark.parametrize("text", ["svart...Gul", "svart...gul"]) def test_tokenizer_splits_ellipsis_infix(sv_tokenizer, text): tokens = sv_tokenizer(text) assert len(tokens) == 3 diff --git a/spacy/tests/lang/sv/test_text.py b/spacy/tests/lang/sv/test_text.py index 953ecc904..9ea1851ae 100644 --- a/spacy/tests/lang/sv/test_text.py +++ b/spacy/tests/lang/sv/test_text.py @@ -1,9 +1,6 @@ # coding: utf-8 -"""Test that longer and mixed texts are tokenized correctly.""" - from __future__ import unicode_literals -import pytest def test_sv_tokenizer_handles_long_text(sv_tokenizer): text = """Det var så härligt ute på landet. Det var sommar, majsen var gul, havren grön, diff --git a/spacy/tests/lang/uk/test_tokenizer.py b/spacy/tests/lang/uk/test_tokenizer.py index ded8e9300..860d21953 100644 --- a/spacy/tests/lang/uk/test_tokenizer.py +++ b/spacy/tests/lang/uk/test_tokenizer.py @@ -1,25 +1,24 @@ # coding: utf-8 -"""Test that open, closed and paired punctuation is split off correctly.""" - - from __future__ import unicode_literals import pytest -PUNCT_OPEN = ['(', '[', '{', '*'] -PUNCT_CLOSE = [')', ']', '}', '*'] -PUNCT_PAIRED = [('(', ')'), ('[', ']'), ('{', '}'), ('*', '*')] +PUNCT_OPEN = ["(", "[", "{", "*"] +PUNCT_CLOSE = [")", "]", "}", "*"] +PUNCT_PAIRED = [("(", ")"), ("[", "]"), ("{", "}"), ("*", "*")] -@pytest.mark.parametrize('text', ["(", "((", "<"]) +@pytest.mark.parametrize("text", ["(", "((", "<"]) def test_uk_tokenizer_handles_only_punct(uk_tokenizer, text): tokens = uk_tokenizer(text) assert len(tokens) == len(text) -@pytest.mark.parametrize('punct', PUNCT_OPEN) -@pytest.mark.parametrize('text', ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]) +@pytest.mark.parametrize("punct", PUNCT_OPEN) +@pytest.mark.parametrize( + "text", ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"] +) def test_uk_tokenizer_splits_open_punct(uk_tokenizer, punct, text): tokens = uk_tokenizer(punct + text) assert len(tokens) == 2 @@ -27,8 +26,10 @@ def test_uk_tokenizer_splits_open_punct(uk_tokenizer, punct, text): assert tokens[1].text == text -@pytest.mark.parametrize('punct', PUNCT_CLOSE) -@pytest.mark.parametrize('text', ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]) +@pytest.mark.parametrize("punct", PUNCT_CLOSE) +@pytest.mark.parametrize( + "text", ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"] +) def test_uk_tokenizer_splits_close_punct(uk_tokenizer, punct, text): tokens = uk_tokenizer(text + punct) assert len(tokens) == 2 @@ -36,9 +37,11 @@ def test_uk_tokenizer_splits_close_punct(uk_tokenizer, punct, text): assert tokens[1].text == punct -@pytest.mark.parametrize('punct', PUNCT_OPEN) -@pytest.mark.parametrize('punct_add', ["`"]) -@pytest.mark.parametrize('text', ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]) +@pytest.mark.parametrize("punct", PUNCT_OPEN) +@pytest.mark.parametrize("punct_add", ["`"]) 
+@pytest.mark.parametrize( + "text", ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"] +) def test_uk_tokenizer_splits_two_diff_open_punct(uk_tokenizer, punct, punct_add, text): tokens = uk_tokenizer(punct + punct_add + text) assert len(tokens) == 3 @@ -47,9 +50,11 @@ def test_uk_tokenizer_splits_two_diff_open_punct(uk_tokenizer, punct, punct_add, assert tokens[2].text == text -@pytest.mark.parametrize('punct', PUNCT_CLOSE) -@pytest.mark.parametrize('punct_add', ["'"]) -@pytest.mark.parametrize('text', ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]) +@pytest.mark.parametrize("punct", PUNCT_CLOSE) +@pytest.mark.parametrize("punct_add", ["'"]) +@pytest.mark.parametrize( + "text", ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"] +) def test_uk_tokenizer_splits_two_diff_close_punct(uk_tokenizer, punct, punct_add, text): tokens = uk_tokenizer(text + punct + punct_add) assert len(tokens) == 3 @@ -58,8 +63,10 @@ def test_uk_tokenizer_splits_two_diff_close_punct(uk_tokenizer, punct, punct_add assert tokens[2].text == punct_add -@pytest.mark.parametrize('punct', PUNCT_OPEN) -@pytest.mark.parametrize('text', ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]) +@pytest.mark.parametrize("punct", PUNCT_OPEN) +@pytest.mark.parametrize( + "text", ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"] +) def test_uk_tokenizer_splits_same_open_punct(uk_tokenizer, punct, text): tokens = uk_tokenizer(punct + punct + punct + text) assert len(tokens) == 4 @@ -67,8 +74,10 @@ def test_uk_tokenizer_splits_same_open_punct(uk_tokenizer, punct, text): assert tokens[3].text == text -@pytest.mark.parametrize('punct', PUNCT_CLOSE) -@pytest.mark.parametrize('text', ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]) +@pytest.mark.parametrize("punct", PUNCT_CLOSE) +@pytest.mark.parametrize( + "text", ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"] +) def test_uk_tokenizer_splits_same_close_punct(uk_tokenizer, punct, text): tokens = uk_tokenizer(text + punct + punct + punct) assert len(tokens) == 4 @@ -76,14 +85,14 @@ def test_uk_tokenizer_splits_same_close_punct(uk_tokenizer, punct, text): assert tokens[1].text == punct -@pytest.mark.parametrize('text', ["'Тест"]) +@pytest.mark.parametrize("text", ["'Тест"]) def test_uk_tokenizer_splits_open_appostrophe(uk_tokenizer, text): tokens = uk_tokenizer(text) assert len(tokens) == 2 assert tokens[0].text == "'" -@pytest.mark.parametrize('text', ["Тест''"]) +@pytest.mark.parametrize("text", ["Тест''"]) def test_uk_tokenizer_splits_double_end_quote(uk_tokenizer, text): tokens = uk_tokenizer(text) assert len(tokens) == 2 @@ -91,10 +100,13 @@ def test_uk_tokenizer_splits_double_end_quote(uk_tokenizer, text): assert len(tokens_punct) == 1 -@pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED) -@pytest.mark.parametrize('text', ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]) -def test_uk_tokenizer_splits_open_close_punct(uk_tokenizer, punct_open, - punct_close, text): +@pytest.mark.parametrize("punct_open,punct_close", PUNCT_PAIRED) +@pytest.mark.parametrize( + "text", ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"] +) +def test_uk_tokenizer_splits_open_close_punct( + uk_tokenizer, punct_open, punct_close, text +): tokens = uk_tokenizer(punct_open + text + punct_close) assert len(tokens) == 3 assert tokens[0].text == punct_open @@ -102,11 +114,14 @@ def test_uk_tokenizer_splits_open_close_punct(uk_tokenizer, punct_open, assert 
tokens[2].text == punct_close -@pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED) -@pytest.mark.parametrize('punct_open2,punct_close2', [("`", "'")]) -@pytest.mark.parametrize('text', ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]) -def test_uk_tokenizer_two_diff_punct(uk_tokenizer, punct_open, punct_close, - punct_open2, punct_close2, text): +@pytest.mark.parametrize("punct_open,punct_close", PUNCT_PAIRED) +@pytest.mark.parametrize("punct_open2,punct_close2", [("`", "'")]) +@pytest.mark.parametrize( + "text", ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"] +) +def test_uk_tokenizer_two_diff_punct( + uk_tokenizer, punct_open, punct_close, punct_open2, punct_close2, text +): tokens = uk_tokenizer(punct_open2 + punct_open + text + punct_close + punct_close2) assert len(tokens) == 5 assert tokens[0].text == punct_open2 @@ -116,7 +131,9 @@ def test_uk_tokenizer_two_diff_punct(uk_tokenizer, punct_open, punct_close, assert tokens[4].text == punct_close2 -@pytest.mark.parametrize('text', ["Привет.", "Привіт.", "Ґелґотати.", "З'єднання.", "Єдність.", "їхні."]) +@pytest.mark.parametrize( + "text", ["Привет.", "Привіт.", "Ґелґотати.", "З'єднання.", "Єдність.", "їхні."] +) def test_uk_tokenizer_splits_trailing_dot(uk_tokenizer, text): tokens = uk_tokenizer(text) assert tokens[1].text == "." diff --git a/spacy/tests/lang/uk/test_tokenizer_exc.py b/spacy/tests/lang/uk/test_tokenizer_exc.py index 88def72e7..328e1d287 100644 --- a/spacy/tests/lang/uk/test_tokenizer_exc.py +++ b/spacy/tests/lang/uk/test_tokenizer_exc.py @@ -1,18 +1,14 @@ # coding: utf-8 -"""Test that tokenizer exceptions are parsed correctly.""" - - from __future__ import unicode_literals import pytest -@pytest.mark.parametrize('text,norms,lemmas', [("ім.", ["імені"], ["ім'я"]), - ("проф.", ["професор"], ["професор"])]) +@pytest.mark.parametrize( + "text,norms,lemmas", + [("ім.", ["імені"], ["ім'я"]), ("проф.", ["професор"], ["професор"])], +) def test_uk_tokenizer_abbrev_exceptions(uk_tokenizer, text, norms, lemmas): tokens = uk_tokenizer(text) assert len(tokens) == 1 assert [token.norm_ for token in tokens] == norms - - - diff --git a/spacy/tests/regression/_test_issue1622.py b/spacy/tests/regression/_test_issue1622.py index 4da0cea85..e8348b508 100644 --- a/spacy/tests/regression/_test_issue1622.py +++ b/spacy/tests/regression/_test_issue1622.py @@ -1,16 +1,16 @@ # coding: utf-8 from __future__ import unicode_literals + import json from tempfile import NamedTemporaryFile -import pytest from ...cli.train import train def test_cli_trained_model_can_be_saved(tmpdir): - lang = 'nl' + lang = "nl" output_dir = str(tmpdir) - train_file = NamedTemporaryFile('wb', dir=output_dir, delete=False) + train_file = NamedTemporaryFile("wb", dir=output_dir, delete=False) train_corpus = [ { "id": "identifier_0", @@ -26,7 +26,7 @@ def test_cli_trained_model_can_be_saved(tmpdir): "head": 1, "tag": "NOUN", "orth": "Jan", - "ner": "B-PER" + "ner": "B-PER", }, { "id": 1, @@ -34,7 +34,7 @@ def test_cli_trained_model_can_be_saved(tmpdir): "head": 0, "tag": "VERB", "orth": "houdt", - "ner": "O" + "ner": "O", }, { "id": 2, @@ -42,7 +42,7 @@ def test_cli_trained_model_can_be_saved(tmpdir): "head": 1, "tag": "ADP", "orth": "van", - "ner": "O" + "ner": "O", }, { "id": 3, @@ -50,7 +50,7 @@ def test_cli_trained_model_can_be_saved(tmpdir): "head": -2, "tag": "NOUN", "orth": "Marie", - "ner": "B-PER" + "ner": "B-PER", }, { "id": 4, @@ -58,7 +58,7 @@ def test_cli_trained_model_can_be_saved(tmpdir): "head": -3, "tag": 
"PUNCT", "orth": ".", - "ner": "O" + "ner": "O", }, { "id": 5, @@ -66,18 +66,18 @@ def test_cli_trained_model_can_be_saved(tmpdir): "head": -1, "tag": "SPACE", "orth": "\n", - "ner": "O" - } + "ner": "O", + }, ], - "brackets": [] + "brackets": [], } - ] + ], } - ] + ], } ] - train_file.write(json.dumps(train_corpus).encode('utf-8')) + train_file.write(json.dumps(train_corpus).encode("utf-8")) train_file.close() train_data = train_file.name dev_data = train_data diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py index 580740a84..1bb6cebe1 100644 --- a/spacy/tests/regression/test_issue1501-2000.py +++ b/spacy/tests/regression/test_issue1501-2000.py @@ -155,6 +155,14 @@ def test_issue1758(en_tokenizer): assert tokens[1].lemma_ == "have" +def test_issue1773(en_tokenizer): + """Test that spaces don't receive a POS but no TAG. This is the root cause + of the serialization issue reported in #1773.""" + doc = en_tokenizer("\n") + if doc[0].pos_ == "SPACE": + assert doc[0].tag_ != "" + + def test_issue1799(): """Test sentence boundaries are deserialized correctly, even for non-projective sentences.""" @@ -249,8 +257,8 @@ def test_issue1945(): def test_issue1963(en_tokenizer): """Test that doc.merge() resizes doc.tensor""" - doc = en_tokenizer('a b c d') - doc.tensor = numpy.ones((len(doc), 128), dtype='f') + doc = en_tokenizer("a b c d") + doc.tensor = numpy.ones((len(doc), 128), dtype="f") with doc.retokenize() as retokenizer: retokenizer.merge(doc[0:2]) assert len(doc) == 3 diff --git a/spacy/tests/regression/test_issue1773.py b/spacy/tests/regression/test_issue1773.py deleted file mode 100644 index 5b4307b44..000000000 --- a/spacy/tests/regression/test_issue1773.py +++ /dev/null @@ -1,9 +0,0 @@ -from __future__ import unicode_literals - - -def test_issue1773(en_tokenizer): - """Test that spaces don't receive a POS but no TAG. 
This is the root cause - of the serialization issue reported in #1773.""" - doc = en_tokenizer('\n') - if doc[0].pos_ == 'SPACE': - assert doc[0].tag_ != "" diff --git a/spacy/tests/regression/test_issue2001-2500.py b/spacy/tests/regression/test_issue2001-2500.py index 580cd77af..70040898f 100644 --- a/spacy/tests/regression/test_issue2001-2500.py +++ b/spacy/tests/regression/test_issue2001-2500.py @@ -6,8 +6,9 @@ from spacy.tokens import Doc from spacy.displacy import render from spacy.gold import iob_to_biluo from spacy.lang.it import Italian +import numpy -from ..util import add_vecs_to_vocab +from ..util import add_vecs_to_vocab, get_doc @pytest.mark.xfail @@ -69,6 +70,26 @@ def test_issue2385_biluo(tags): assert iob_to_biluo(tags) == list(tags) +def test_issue2396(en_vocab): + words = ["She", "created", "a", "test", "for", "spacy"] + heads = [1, 0, 1, -2, -1, -1] + matrix = numpy.array( + [ + [0, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1], + [1, 1, 2, 3, 3, 3], + [1, 1, 3, 3, 3, 3], + [1, 1, 3, 3, 4, 4], + [1, 1, 3, 3, 4, 5], + ], + dtype=numpy.int32, + ) + doc = get_doc(en_vocab, words=words, heads=heads) + span = doc[:] + assert (doc.get_lca_matrix() == matrix).all() + assert (span.get_lca_matrix() == matrix).all() + + def test_issue2482(): """Test we can serialize and deserialize a blank NER or parser model.""" nlp = Italian() diff --git a/spacy/tests/regression/test_issue2396.py b/spacy/tests/regression/test_issue2396.py deleted file mode 100644 index 1df151ced..000000000 --- a/spacy/tests/regression/test_issue2396.py +++ /dev/null @@ -1,35 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from ..util import get_doc - -import pytest -import numpy - - -@pytest.mark.parametrize( - "sentence,heads,matrix", - [ - ( - "She created a test for spacy", - [1, 0, 1, -2, -1, -1], - numpy.array( - [ - [0, 1, 1, 1, 1, 1], - [1, 1, 1, 1, 1, 1], - [1, 1, 2, 3, 3, 3], - [1, 1, 3, 3, 3, 3], - [1, 1, 3, 3, 4, 4], - [1, 1, 3, 3, 4, 5], - ], - dtype=numpy.int32, - ), - ) - ], -) -def test_issue2396(en_tokenizer, sentence, heads, matrix): - tokens = en_tokenizer(sentence) - doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads) - span = doc[:] - assert (doc.get_lca_matrix() == matrix).all() - assert (span.get_lca_matrix() == matrix).all() diff --git a/spacy/tests/regression/test_issue2754.py b/spacy/tests/regression/test_issue2754.py index 5f76727f8..c05006517 100644 --- a/spacy/tests/regression/test_issue2754.py +++ b/spacy/tests/regression/test_issue2754.py @@ -1,14 +1,10 @@ # coding: utf8 from __future__ import unicode_literals -import pytest -from spacy.lang.en import English -def test_issue2754(): +def test_issue2754(en_tokenizer): """Test that words like 'a' and 'a.m.' 
don't get exceptional norm values.""" - nlp = English() - a = nlp('a') - assert a[0].norm_ == 'a' - am = nlp('am') - assert am[0].norm_ == 'am' - + a = en_tokenizer("a") + assert a[0].norm_ == "a" + am = en_tokenizer("am") + assert am[0].norm_ == "am" diff --git a/spacy/tests/regression/test_issue2835.py b/spacy/tests/regression/test_issue2835.py index e186c3b8f..e5734b756 100644 --- a/spacy/tests/regression/test_issue2835.py +++ b/spacy/tests/regression/test_issue2835.py @@ -9,4 +9,3 @@ def test_issue2835(en_tokenizer): """ doc = en_tokenizer(text) assert doc - diff --git a/spacy/tests/regression/test_issue2871.py b/spacy/tests/regression/test_issue2871.py index 2b7d88c01..b71099ed0 100644 --- a/spacy/tests/regression/test_issue2871.py +++ b/spacy/tests/regression/test_issue2871.py @@ -2,26 +2,24 @@ from __future__ import unicode_literals import numpy -from spacy.vectors import Vectors from spacy.vocab import Vocab -from spacy.tokens import Doc from spacy._ml import link_vectors_to_models def test_issue2871(): """Test that vectors recover the correct key for spaCy reserved words.""" - words = ['dog', 'cat', 'SUFFIX'] + words = ["dog", "cat", "SUFFIX"] vocab = Vocab() vocab.vectors.resize(shape=(3, 10)) - vector_data = numpy.zeros((3, 10), dtype='f') + vector_data = numpy.zeros((3, 10), dtype="f") for word in words: - _ = vocab[word] + _ = vocab[word] # noqa: F841 vocab.set_vector(word, vector_data[0]) - vocab.vectors.name = 'dummy_vectors' + vocab.vectors.name = "dummy_vectors" link_vectors_to_models(vocab) - assert vocab['dog'].rank == 0 - assert vocab['cat'].rank == 1 - assert vocab['SUFFIX'].rank == 2 - assert vocab.vectors.find(key='dog') == 0 - assert vocab.vectors.find(key='cat') == 1 - assert vocab.vectors.find(key='SUFFIX') == 2 + assert vocab["dog"].rank == 0 + assert vocab["cat"].rank == 1 + assert vocab["SUFFIX"].rank == 2 + assert vocab.vectors.find(key="dog") == 0 + assert vocab.vectors.find(key="cat") == 1 + assert vocab.vectors.find(key="SUFFIX") == 2 diff --git a/spacy/tests/regression/test_issue3009.py b/spacy/tests/regression/test_issue3009.py index f8407741b..25f208903 100644 --- a/spacy/tests/regression/test_issue3009.py +++ b/spacy/tests/regression/test_issue3009.py @@ -58,9 +58,10 @@ def test_issue3009(doc, matcher, pattern): matches = matcher(doc) assert matches + def test_issue2464(matcher): """Test problem with successive ?. 
This is the same bug, so putting it here.""" - doc = Doc(matcher.vocab, words=['a', 'b']) - matcher.add('4', None, [{'OP': '?'}, {'OP': '?'}]) + doc = Doc(matcher.vocab, words=["a", "b"]) + matcher.add("4", None, [{"OP": "?"}, {"OP": "?"}]) matches = matcher(doc) assert len(matches) == 3 diff --git a/spacy/tests/regression/test_issue3012.py b/spacy/tests/regression/test_issue3012.py index fb69c1b6e..8fdc8b318 100644 --- a/spacy/tests/regression/test_issue3012.py +++ b/spacy/tests/regression/test_issue3012.py @@ -1,8 +1,6 @@ # coding: utf8 from __future__ import unicode_literals -import pytest - from ...attrs import ENT_IOB, ENT_TYPE from ...tokens import Doc from ..util import get_doc @@ -30,4 +28,4 @@ def test_issue3012(en_vocab): # serializing then deserializing doc_bytes = doc.to_bytes() doc2 = Doc(en_vocab).from_bytes(doc_bytes) - assert (doc[2].text, doc[2].pos_, doc[2].tag_, doc[2].ent_type_) == expected + assert (doc2[2].text, doc2[2].pos_, doc2[2].tag_, doc2[2].ent_type_) == expected diff --git a/spacy/tests/regression/test_issue3178.py b/spacy/tests/regression/test_issue3178.py deleted file mode 100644 index fb07f5c03..000000000 --- a/spacy/tests/regression/test_issue3178.py +++ /dev/null @@ -1,10 +0,0 @@ -from __future__ import unicode_literals -import pytest -import spacy - - -@pytest.mark.models('fr') -def test_issue1959(FR): - texts = ['Je suis la mauvaise herbe', "Me, myself and moi"] - for text in texts: - FR(text)
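
Note on the test_issue2464 hunk above: with a pattern of two optional wildcard tokens, the two-token doc can be matched as "a", as "b", or as "a b", which is why the test expects exactly three matches. Below is a minimal standalone sketch of the same check, using the public Matcher API with the spaCy 2.x add() signature shown in the diff (the blank English pipeline is an illustrative assumption, not part of this patch):

from spacy.lang.en import English
from spacy.matcher import Matcher
from spacy.tokens import Doc

nlp = English()
matcher = Matcher(nlp.vocab)
# Two optional wildcard tokens; the second argument (None) is the on-match callback.
matcher.add("4", None, [{"OP": "?"}, {"OP": "?"}])
doc = Doc(nlp.vocab, words=["a", "b"])
matches = matcher(doc)
assert len(matches) == 3  # "a", "b" and "a b"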
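
The test_issue3012 hunk above also corrects the assertion target: the serialization round-trip is only verified if the check runs against the deserialized doc2 rather than the original doc. A minimal sketch of the same to_bytes()/from_bytes() round-trip on a blank pipeline follows (the example text is an illustrative assumption; a blank model assigns empty POS/tag/entity values, which must still survive the round-trip):

from spacy.lang.en import English
from spacy.tokens import Doc

nlp = English()
doc = nlp("This is 10 % of the story.")
doc_bytes = doc.to_bytes()
doc2 = Doc(nlp.vocab).from_bytes(doc_bytes)


def token_attrs(d):
    # Collect the attributes the regression test cares about.
    return [(t.text, t.pos_, t.tag_, t.ent_type_) for t in d]


# Compare the deserialized copy, not the original.
assert token_attrs(doc) == token_attrs(doc2)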
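
Earlier in this diff, the Ukrainian lex_attrs reformatting leaves like_num's behaviour unchanged: "," and "." are stripped, plain digits and simple fractions are accepted, and everything else falls back to the _num_words list. A small sketch of that behaviour, assuming the module lives at the usual spacy/lang/uk/lex_attrs.py path (the sample inputs are illustrative only):

from spacy.lang.uk.lex_attrs import like_num

assert like_num("10")         # plain digits
assert like_num("10.000")     # separators are stripped before isdigit()
assert like_num("3/4")        # fractions where both sides are digits
assert like_num("сім")        # a listed Ukrainian number word ("seven")
assert not like_num("слово")  # anything else is not number-like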