Tidy up and fix small bugs and typos

This commit is contained in:
Ines Montani 2019-02-08 14:14:49 +01:00
parent 9e652afa4b
commit 25602c794c
47 changed files with 751 additions and 933 deletions

View File

@ -8,15 +8,14 @@ import time
from collections import Counter
from pathlib import Path
from thinc.v2v import Affine, Maxout
from thinc.api import wrap, layerize
from thinc.misc import LayerNorm as LN
from thinc.neural.util import prefer_gpu, get_array_module
from thinc.neural.util import prefer_gpu
from wasabi import Printer
import srsly
from ..tokens import Doc
from ..attrs import ID, HEAD
from .._ml import Tok2Vec, flatten, chain, zero_init, create_default_optimizer
from .._ml import Tok2Vec, flatten, chain, create_default_optimizer
from .._ml import masked_language_model
from .. import util
@ -136,7 +135,7 @@ def pretrain(
random.shuffle(texts)
def make_update(model, docs, optimizer, drop=0.0, objective='L2'):
def make_update(model, docs, optimizer, drop=0.0, objective="L2"):
"""Perform an update over a single batch of documents.
docs (iterable): A batch of `Doc` objects.
@ -171,7 +170,7 @@ def make_docs(nlp, batch, min_length=1, max_length=500):
return docs
def get_vectors_loss(ops, docs, prediction, objective='L2'):
def get_vectors_loss(ops, docs, prediction, objective="L2"):
"""Compute a mean-squared error loss between the documents' vectors and
the prediction.
@ -185,9 +184,9 @@ def get_vectors_loss(ops, docs, prediction, objective='L2'):
# and look them up all at once. This prevents data copying.
ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
target = docs[0].vocab.vectors.data[ids]
if objective == 'L2':
if objective == "L2":
d_scores = prediction - target
loss = (d_scores**2).sum()
loss = (d_scores ** 2).sum()
else:
raise NotImplementedError(objective)
return loss, d_scores
@ -201,8 +200,7 @@ def create_pretraining_model(nlp, tok2vec):
"""
output_size = nlp.vocab.vectors.data.shape[1]
output_layer = chain(
LN(Maxout(300, pieces=3)),
Affine(output_size, drop_factor=0.0),
LN(Maxout(300, pieces=3)), Affine(output_size, drop_factor=0.0)
)
# This is annoying, but the parser etc have the flatten step after
# the tok2vec. To load the weights in cleanly, we need to match

View File

@ -13,13 +13,7 @@ RENDER_WRAPPER = None
def render(
docs,
style="dep",
page=False,
minify=False,
jupyter=False,
options={},
manual=False,
docs, style="dep", page=False, minify=False, jupyter=False, options={}, manual=False
):
"""Render displaCy visualisation.
@ -80,7 +74,7 @@ def serve(
"""
from wsgiref import simple_server
if IS_JUPYTER:
if is_in_jupyter():
user_warning(Warnings.W011)
render(docs, style=style, page=page, minify=minify, options=options, manual=manual)

View File

@ -1,8 +1,9 @@
# coding: utf8
from __future__ import unicode_literals
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
from ..char_classes import CONCAT_QUOTES, CONCAT_ICONS, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CONCAT_QUOTES
from ..char_classes import CONCAT_ICONS, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER
# removing ° from the special icons to keep e.g. 99° as one token
_concat_icons = CONCAT_ICONS.replace("\u00B0", "")
@ -29,7 +30,9 @@ _suffixes = (
r"(?<=°[FfCcKk])\.",
r"(?<=[0-9])(?:[{c}])".format(c=_currency),
r"(?<=[0-9])(?:{u})".format(u=UNITS),
r"(?<=[{al}{e}{q}(?:{c})])\.".format(al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, c=_currency),
r"(?<=[{al}{e}{q}(?:{c})])\.".format(
al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, c=_currency
),
r"(?<=[{al})])-e".format(al=ALPHA_LOWER),
]
)
@ -40,7 +43,7 @@ _infixes = (
+ [
r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
r'(?<=[{a}])[:<>=](?=[{a}])'.format(a=ALPHA),
r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA),
r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
r"(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])".format(a=ALPHA, q=_quotes),

View File

@ -5,24 +5,24 @@ import re
from collections import namedtuple
from .tag_map import TAG_MAP
from ...attrs import LANG
from ...language import Language
from ...tokens import Doc, Token
from ...util import DummyTokenizer
ShortUnitWord = namedtuple("ShortUnitWord", ["surface", "lemma", "pos"])
# TODO: Is this the right place for this?
Token.set_extension("mecab_tag", default=None)
def try_mecab_import():
"""Mecab is required for Japanese support, so check for it.
If it's not available, blow up and explain how to fix it."""
try:
import MeCab
# XXX Is this the right place for this?
Token.set_extension("mecab_tag", default=None)
return MeCab
except ImportError:
raise ImportError(
@ -33,14 +33,13 @@ def try_mecab_import():
def resolve_pos(token):
"""If necessary, add a field to the POS tag for UD mapping.
Under Universal Dependencies, sometimes the same Unidic POS tag can
be mapped differently depending on the literal token or its context
in the sentence. This function adds information to the POS tag to
resolve ambiguous mappings.
"""
# NOTE: This is a first take. The rules here are crude approximations.
# TODO: This is a first take. The rules here are crude approximations.
# For many of these, full dependencies are needed to properly resolve
# PoS mappings.
@ -56,7 +55,7 @@ def resolve_pos(token):
def detailed_tokens(tokenizer, text):
"""Format Mecab output into a nice data structure, based on Janome."""
tokenizer.parse(text)
node = tokenizer.parseToNode(text)
node = node.next # first node is beginning of sentence and empty, skip it
words = []
@ -98,62 +97,15 @@ class JapaneseTokenizer(DummyTokenizer):
return doc
class JapaneseCharacterSegmenter(object):
def __init__(self, vocab):
self.vocab = vocab
self._presegmenter = self._make_presegmenter(self.vocab)
def _make_presegmenter(self, vocab):
rules = Japanese.Defaults.tokenizer_exceptions
token_match = Japanese.Defaults.token_match
prefix_search = (
util.compile_prefix_regex(Japanese.Defaults.prefixes).search
if Japanese.Defaults.prefixes
else None
)
suffix_search = (
util.compile_suffix_regex(Japanese.Defaults.suffixes).search
if Japanese.Defaults.suffixes
else None
)
infix_finditer = (
util.compile_infix_regex(Japanese.Defaults.infixes).finditer
if Japanese.Defaults.infixes
else None
)
return Tokenizer(
vocab,
rules=rules,
prefix_search=prefix_search,
suffix_search=suffix_search,
infix_finditer=infix_finditer,
token_match=token_match,
)
def __call__(self, text):
words = []
spaces = []
doc = self._presegmenter(text)
for token in doc:
words.extend(list(token.text))
spaces.extend([False] * len(token.text))
spaces[-1] = bool(token.whitespace_)
return Doc(self.vocab, words=words, spaces=spaces)
class JapaneseDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda _text: "ja"
tag_map = TAG_MAP
use_janome = True
@classmethod
def create_tokenizer(cls, nlp=None):
if cls.use_janome:
return JapaneseTokenizer(cls, nlp)
else:
return JapaneseCharacterSegmenter(nlp.vocab)
class Japanese(Language):

View File

@ -2,10 +2,10 @@
from __future__ import unicode_literals
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_INFIXES
from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
@ -22,9 +22,9 @@ class PolishDefaults(Language.Defaults):
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
infixes = tuple(TOKENIZER_INFIXES)
stop_words = STOP_WORDS
tag_map = TAG_MAP
infixes = TOKENIZER_INFIXES
class Polish(Language):

View File

@ -1,14 +1,22 @@
# coding: utf8
from __future__ import unicode_literals
from ..char_classes import LIST_ELLIPSES, LIST_ICONS
from ..char_classes import QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
_quotes = QUOTES.replace("'", '')
_infixes = (LIST_ELLIPSES + LIST_ICONS +
[r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
r'(?<=[{a}])[,!?](?=[{a}])'.format(a=ALPHA),
r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),
r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
r'(?<=[{a}])([{q}\)\]\(\[])(?=[\{a}])'.format(a=ALPHA, q=_quotes),
r'(?<=[{a}])--(?=[{a}])'.format(a=ALPHA)])
from ..char_classes import LIST_ELLIPSES, CONCAT_ICONS
from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
_quotes = CONCAT_QUOTES.replace("'", "")
_infixes = (
LIST_ELLIPSES
+ [CONCAT_ICONS]
+ [
r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA),
r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
r"(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])".format(a=ALPHA, q=CONCAT_QUOTES),
]
)
TOKENIZER_INFIXES = _infixes

View File

@ -2,6 +2,7 @@
from __future__ import unicode_literals
from ._tokenizer_exceptions_list import PL_BASE_EXCEPTIONS
from ...symbols import POS, ADV, NOUN, ORTH, LEMMA, ADJ
_exc = {}

View File

@ -6,7 +6,9 @@ from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS
from .morph_rules import MORPH_RULES
from .lemmatizer import LEMMA_RULES, LOOKUP
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
# Punctuation stolen from Danish
from ..da.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
@ -31,6 +33,7 @@ class SwedishDefaults(Language.Defaults):
lemma_lookup = LOOKUP
morph_rules = MORPH_RULES
class Swedish(Language):
lang = "sv"
Defaults = SwedishDefaults

View File

@ -1,25 +0,0 @@
# coding: utf8
"""Punctuation stolen from Danish"""
from __future__ import unicode_literals
from ..char_classes import LIST_ELLIPSES, LIST_ICONS
from ..char_classes import QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
from ..punctuation import TOKENIZER_SUFFIXES
_quotes = QUOTES.replace("'", '')
_infixes = (LIST_ELLIPSES + LIST_ICONS +
[r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
r'(?<=[{a}])[,!?](?=[{a}])'.format(a=ALPHA),
r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),
r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
r'(?<=[{a}])([{q}\)\]\(\[])(?=[\{a}])'.format(a=ALPHA, q=_quotes),
r'(?<=[{a}])--(?=[{a}])'.format(a=ALPHA)])
_suffixes = [suffix for suffix in TOKENIZER_SUFFIXES if suffix not in ["'s", "'S", "s", "S", r"\'"]]
_suffixes += [r"(?<=[^sSxXzZ])\'"]
TOKENIZER_INFIXES = _infixes
TOKENIZER_SUFFIXES = _suffixes

View File

@ -1,169 +1,191 @@
# coding: utf8
"""
Tag mappings according to https://universaldependencies.org/tagset-conversion/sv-suc-uposf.html
for https://github.com/UniversalDependencies/UD_Swedish-Talbanken
"""
from __future__ import unicode_literals
from ...symbols import POS, PUNCT, ADJ, CONJ, CCONJ, SCONJ, SYM, NUM, DET, ADV, ADP, X, VERB
from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON, AUX
from ...symbols import POS, PUNCT, ADJ, CCONJ, SCONJ, NUM, DET, ADV
from ...symbols import ADP, X, VERB, NOUN, PROPN, PART, INTJ, PRON
# Tag mappings according to https://universaldependencies.org/tagset-conversion/sv-suc-uposf.html
# for https://github.com/UniversalDependencies/UD_Swedish-Talbanken
TAG_MAP = {
'AB': { POS: ADV }, # inte, också, så, bara, nu
'AB|AN': { POS: ADV }, # t.ex., ca, t_ex, bl.a., s_k
'AB|KOM': { POS: ADV }, # mer, tidigare, mindre, vidare, mera
'AB|POS': { POS: ADV }, # mycket, helt, ofta, länge, långt
'AB|SMS': { POS: ADV }, # över-, in-
'AB|SUV': { POS: ADV }, # minst, mest, högst, främst, helst
'DT|MAS|SIN|DEF': { POS: DET },
'DT|MAS|SIN|IND': { POS: DET },
'DT|NEU|SIN|DEF': { POS: DET }, # det, detta
'DT|NEU|SIN|IND': { POS: DET }, # ett, något, inget, vart, vartannat
'DT|NEU|SIN|IND/DEF': { POS: DET }, # allt
'DT|UTR/NEU|PLU|DEF': { POS: DET }, # de, dessa, bägge, dom
'DT|UTR/NEU|PLU|IND': { POS: DET }, # några, inga
'DT|UTR/NEU|PLU|IND/DEF': { POS: DET }, # alla
'DT|UTR/NEU|SIN/PLU|IND': { POS: DET }, # samma
'DT|UTR/NEU|SIN|DEF': { POS: DET }, # vardera
'DT|UTR/NEU|SIN|IND': { POS: DET }, # varje, varenda
'DT|UTR|SIN|DEF': { POS: DET }, # den, denna
'DT|UTR|SIN|IND': { POS: DET }, # en, någon, ingen, var, varannan
'DT|UTR|SIN|IND/DEF': { POS: DET }, # all
'HA': { POS: ADV }, # när, där, hur, som, då
'HD|NEU|SIN|IND': { POS: DET }, # vilket
'HD|UTR/NEU|PLU|IND': { POS: DET }, # vilka
'HD|UTR|SIN|IND': { POS: DET }, # vilken
'HP|-|-|-': { POS: PRON }, # som
'HP|NEU|SIN|IND': { POS: PRON }, # vad, vilket
'HP|NEU|SIN|IND|SMS': { POS: PRON },
'HP|UTR/NEU|PLU|IND': { POS: PRON }, # vilka
'HP|UTR|SIN|IND': { POS: PRON }, # vilken, vem
'HS|DEF': { POS: DET }, # vars, vilkas, Vems
'IE': { POS: PART }, # att
'IN': { POS: INTJ }, # Jo, ja, nej, fan, visst
'JJ|AN': { POS: ADJ }, # ev, S:t, Kungl, Kungl., Teol
'JJ|KOM|UTR/NEU|SIN/PLU|IND/DEF|GEN': { POS: ADJ }, # äldres
'JJ|KOM|UTR/NEU|SIN/PLU|IND/DEF|NOM': { POS: ADJ }, # större, högre, mindre, bättre, äldre
'JJ|KOM|UTR/NEU|SIN/PLU|IND/DEF|SMS': { POS: ADJ },
'JJ|POS|MAS|SIN|DEF|GEN': { POS: ADJ }, # enskildes, sjukes, andres
'JJ|POS|MAS|SIN|DEF|NOM': { POS: ADJ }, # enskilde, sjuke, andre, unge, ene
'JJ|POS|NEU|SIN|IND/DEF|NOM': { POS: ADJ }, # eget
'JJ|POS|NEU|SIN|IND|GEN': { POS: ADJ },
'JJ|POS|NEU|SIN|IND|NOM': { POS: ADJ }, # annat, svårt, möjligt, nytt, sådant
'JJ|POS|UTR/NEU|PLU|IND/DEF|GEN': { POS: ADJ }, # ogiftas, ungas, frånskildas, efterkommandes, färgblindas
'JJ|POS|UTR/NEU|PLU|IND/DEF|NOM': { POS: ADJ }, # olika, andra, många, stora, vissa
'JJ|POS|UTR/NEU|PLU|IND|NOM': { POS: ADJ }, # flera, sådana, fler, få, samtliga
'JJ|POS|UTR/NEU|SIN/PLU|IND|NOM': { POS: ADJ },
'JJ|POS|UTR/NEU|SIN/PLU|IND/DEF|NOM': { POS: ADJ }, # bra, ena, enda, nästa, ringa
'JJ|POS|UTR/NEU|SIN|DEF|GEN': { POS: ADJ },
'JJ|POS|UTR/NEU|SIN|DEF|NOM': { POS: ADJ }, # hela, nya, andra, svenska, ekonomiska
'JJ|POS|UTR|-|-|SMS': { POS: ADJ }, # fri-, låg-, sexual-
'JJ|POS|UTR|SIN|IND/DEF|NOM': { POS: ADJ }, # egen
'JJ|POS|UTR|SIN|IND|GEN': { POS: ADJ }, # enskilds
'JJ|POS|UTR|SIN|IND|NOM': { POS: ADJ }, # stor, annan, själv, sådan, viss
'JJ|SUV|MAS|SIN|DEF|GEN': { POS: ADJ },
'JJ|SUV|MAS|SIN|DEF|NOM': { POS: ADJ }, # störste, främste, äldste, minste
'JJ|SUV|UTR/NEU|PLU|DEF|NOM': { POS: ADJ }, # flesta
'JJ|SUV|UTR/NEU|PLU|IND|NOM': { POS: ADJ },
'JJ|SUV|UTR/NEU|SIN/PLU|DEF|NOM': { POS: ADJ }, # bästa, största, närmaste, viktigaste, högsta
'JJ|SUV|UTR/NEU|SIN/PLU|IND|NOM': { POS: ADJ }, # störst, bäst, tidigast, högst, fattigast
'KN': { POS: CCONJ }, # och, eller, som, än, men
'KN|AN': { POS: CCONJ },
'MAD': { POS: PUNCT }, # ., ?, :, !, ...
'MID': { POS: PUNCT }, # ,, -, :, *, ;
'NN|-|-|-|-': { POS: NOUN }, # godo, fjol, fullo, somras, måtto
'NN|AN': { POS: NOUN }, # kr, %, s., dr, kap.
'NN|NEU|-|-|-': { POS: NOUN },
'NN|NEU|-|-|SMS': { POS: NOUN }, # yrkes-, barn-, hem-, fack-, vatten-
'NN|NEU|PLU|DEF|GEN': { POS: NOUN }, # barnens, årens, u-ländernas, företagens, århundradenas
'NN|NEU|PLU|DEF|NOM': { POS: NOUN }, # barnen, u-länderna, åren, länderna, könen
'NN|NEU|PLU|IND|GEN': { POS: NOUN }, # slags, års, barns, länders, tusentals
'NN|NEU|PLU|IND|NOM': { POS: NOUN }, # barn, år, fall, länder, problem
'NN|NEU|SIN|DEF|GEN': { POS: NOUN }, # äktenskapets, samhällets, barnets, 1800-talets, 1960-talets
'NN|NEU|SIN|DEF|NOM': { POS: NOUN }, # äktenskapet, samhället, barnet, stället, hemmet
'NN|NEU|SIN|IND|GEN': { POS: NOUN }, # års, slags, lands, havs, företags
'NN|NEU|SIN|IND|NOM': { POS: NOUN }, # år, arbete, barn, sätt, äktenskap
'NN|SMS': { POS: NOUN }, # PCB-, Syd-
'NN|UTR|-|-|-': { POS: NOUN }, # dags, rätta
'NN|UTR|-|-|SMS': { POS: NOUN }, # far-, kibbutz-, röntgen-, barna-, hälso-
'NN|UTR|PLU|DEF|GEN': { POS: NOUN }, # föräldrarnas, kvinnornas, elevernas, kibbutzernas, makarnas
'NN|UTR|PLU|DEF|NOM': { POS: NOUN }, # kvinnorna, föräldrarna, makarna, männen, hyrorna
'NN|UTR|PLU|IND|GEN': { POS: NOUN }, # människors, kvinnors, dagars, tiders, månaders
'NN|UTR|PLU|IND|NOM': { POS: NOUN }, # procent, människor, kvinnor, miljoner, kronor
'NN|UTR|SIN|DEF|GEN': { POS: NOUN }, # kvinnans, världens, familjens, dagens, jordens
'NN|UTR|SIN|DEF|NOM': { POS: NOUN }, # familjen, kvinnan, mannen, världen, skolan
'NN|UTR|SIN|IND|GEN': { POS: NOUN }, # sorts, medelålders, makes, kvinnas, veckas
'NN|UTR|SIN|IND|NOM': { POS: NOUN }, # del, tid, dag, fråga, man
'PAD': { POS: PUNCT }, # , ), (
'PC|AN': { POS: VERB },
'PC|PRF|MAS|SIN|DEF|GEN': { POS: VERB }, # avlidnes
'PC|PRF|MAS|SIN|DEF|NOM': { POS: VERB },
'PC|PRF|NEU|SIN|IND|NOM': { POS: VERB }, # taget, sett, särskilt, förbjudet, ökat
'PC|PRF|UTR/NEU|PLU|IND/DEF|GEN': { POS: VERB }, # försäkrades, anställdas
'PC|PRF|UTR/NEU|PLU|IND/DEF|NOM': { POS: VERB }, # särskilda, gifta, ökade, handikappade, skilda
'PC|PRF|UTR/NEU|SIN|DEF|GEN': { POS: VERB },
'PC|PRF|UTR/NEU|SIN|DEF|NOM': { POS: VERB }, # ökade, gifta, nämnda, nedärvda, dolda
'PC|PRF|UTR|SIN|IND|GEN': { POS: VERB },
'PC|PRF|UTR|SIN|IND|NOM': { POS: VERB }, # särskild, ökad, beredd, gift, oförändrad
'PC|PRS|UTR/NEU|SIN/PLU|IND/DEF|GEN': { POS: VERB }, # studerandes, sammanboendes, dubbelarbetandes
'PC|PRS|UTR/NEU|SIN/PLU|IND/DEF|NOM': { POS: VERB }, # följande, beroende, nuvarande, motsvarande, liknande
'PL': { POS: PART }, # ut, upp, in, till, med
'PL|SMS': { POS: PART },
'PM': { POS: PROPN }, # F, N, Liechtenstein, Danmark, DK
'PM|GEN': { POS: PROPN }, # Sveriges, EEC:s, Guds, Stockholms, Kristi
'PM|NOM': { POS: PROPN }, # Sverige, EEC, Stockholm, USA, ATP
'PM|SMS': { POS: PROPN }, # Göteborgs-, Nord-, Väst-
'PN|MAS|SIN|DEF|SUB/OBJ': { POS: PRON }, # denne
'PN|NEU|SIN|DEF|SUB/OBJ': { POS: PRON }, # det, detta, detsamma
'PN|NEU|SIN|IND|SUB/OBJ': { POS: PRON }, # något, allt, mycket, annat, ingenting
'PN|UTR/NEU|PLU|DEF|OBJ': { POS: PRON }, # dem, varandra, varann
'PN|UTR/NEU|PLU|DEF|SUB': { POS: PRON }, # de, bägge
'PN|UTR/NEU|PLU|DEF|SUB/OBJ': { POS: PRON }, # dessa, dom, båda, den, bådadera
'PN|UTR/NEU|PLU|IND|SUB/OBJ': { POS: PRON }, # andra, alla, många, sådana, några
'PN|UTR/NEU|SIN/PLU|DEF|OBJ': { POS: PRON }, # sig, sej
'PN|UTR|PLU|DEF|OBJ': { POS: PRON }, # oss, er, eder
'PN|UTR|PLU|DEF|SUB': { POS: PRON }, # vi
'PN|UTR|SIN|DEF|OBJ': { POS: PRON }, # dig, mig, henne, honom, Er
'PN|UTR|SIN|DEF|SUB': { POS: PRON }, # du, han, hon, jag, ni
'PN|UTR|SIN|DEF|SUB/OBJ': { POS: PRON }, # den, denna, densamma
'PN|UTR|SIN|IND|SUB': { POS: PRON }, # man
'PN|UTR|SIN|IND|SUB/OBJ': { POS: PRON }, # en, var, någon, ingen, Varannan
'PP': { POS: ADP }, # i, av, på, för, till
'PP|AN': { POS: ADP }, # f
'PS|AN': { POS: DET },
'PS|NEU|SIN|DEF': { POS: DET }, # sitt, vårt, ditt, mitt, ert
'PS|UTR/NEU|PLU|DEF': { POS: DET }, # sina, våra, dina, mina
'PS|UTR/NEU|SIN/PLU|DEF': { POS: DET }, # deras, dess, hans, hennes, varandras
'PS|UTR|SIN|DEF': { POS: DET }, # sin, vår, din, min, er
'RG': { POS: NUM }, # 2, 17, 20, 1, 18
'RG|GEN': { POS: NUM },
'RG|MAS|SIN|DEF|NOM': { POS: NUM },
'RG|NEU|SIN|IND|NOM': { POS: NUM }, # ett
'RG|NOM': { POS: NUM }, # två, tre, 1, 20, 2
'RG|SMS': { POS: NUM }, # ett-, 1950-, två-, tre-, 1700-
'RG|UTR/NEU|SIN|DEF|NOM': { POS: NUM },
'RG|UTR|SIN|IND|NOM': { POS: NUM }, # en
'RO|MAS|SIN|IND/DEF|GEN': { POS: ADJ },
'RO|MAS|SIN|IND/DEF|NOM': { POS: ADJ }, # förste
'RO|GEN': { POS: ADJ },
'RO|NOM': { POS: ADJ }, # första, andra, tredje, fjärde, femte
'SN': { POS: SCONJ }, # att, om, innan, eftersom, medan
'UO': { POS: X }, # companionship, vice, versa, family, capita
'VB|AN': { POS: VERB }, # jfr
'VB|IMP|AKT': { POS: VERB }, # se, Diskutera, låt, Läs, Gå
'VB|IMP|SFO': { POS: VERB }, # tas
'VB|INF|AKT': { POS: VERB }, # vara, få, ha, bli, kunna
'VB|INF|SFO': { POS: VERB }, # användas, finnas, göras, tas, ses
'VB|KON|PRS|AKT': { POS: VERB }, # vare, Gånge
'VB|KON|PRT|AKT': { POS: VERB }, # vore, finge
'VB|KON|PRT|SFO': { POS: VERB },
'VB|PRS|AKT': { POS: VERB }, # är, har, kan, får, måste
'VB|PRS|SFO': { POS: VERB }, # finns, kallas, behövs, beräknas, används
'VB|PRT|AKT': { POS: VERB }, # skulle, var, hade, kunde, fick
'VB|PRT|SFO': { POS: VERB }, # fanns, gjordes, höjdes, användes, infördes
'VB|SMS': { POS: VERB }, # läs-
'VB|SUP|AKT': { POS: VERB }, # varit, fått, blivit, haft, kommit
'VB|SUP|SFO': { POS: VERB } # nämnts, gjorts, förändrats, sagts, framhållits
"AB": {POS: ADV}, # inte, också, så, bara, nu
"AB|AN": {POS: ADV}, # t.ex., ca, t_ex, bl.a., s_k
"AB|KOM": {POS: ADV}, # mer, tidigare, mindre, vidare, mera
"AB|POS": {POS: ADV}, # mycket, helt, ofta, länge, långt
"AB|SMS": {POS: ADV}, # över-, in-
"AB|SUV": {POS: ADV}, # minst, mest, högst, främst, helst
"DT|MAS|SIN|DEF": {POS: DET},
"DT|MAS|SIN|IND": {POS: DET},
"DT|NEU|SIN|DEF": {POS: DET}, # det, detta
"DT|NEU|SIN|IND": {POS: DET}, # ett, något, inget, vart, vartannat
"DT|NEU|SIN|IND/DEF": {POS: DET}, # allt
"DT|UTR/NEU|PLU|DEF": {POS: DET}, # de, dessa, bägge, dom
"DT|UTR/NEU|PLU|IND": {POS: DET}, # några, inga
"DT|UTR/NEU|PLU|IND/DEF": {POS: DET}, # alla
"DT|UTR/NEU|SIN/PLU|IND": {POS: DET}, # samma
"DT|UTR/NEU|SIN|DEF": {POS: DET}, # vardera
"DT|UTR/NEU|SIN|IND": {POS: DET}, # varje, varenda
"DT|UTR|SIN|DEF": {POS: DET}, # den, denna
"DT|UTR|SIN|IND": {POS: DET}, # en, någon, ingen, var, varannan
"DT|UTR|SIN|IND/DEF": {POS: DET}, # all
"HA": {POS: ADV}, # när, där, hur, som, då
"HD|NEU|SIN|IND": {POS: DET}, # vilket
"HD|UTR/NEU|PLU|IND": {POS: DET}, # vilka
"HD|UTR|SIN|IND": {POS: DET}, # vilken
"HP|-|-|-": {POS: PRON}, # som
"HP|NEU|SIN|IND": {POS: PRON}, # vad, vilket
"HP|NEU|SIN|IND|SMS": {POS: PRON},
"HP|UTR/NEU|PLU|IND": {POS: PRON}, # vilka
"HP|UTR|SIN|IND": {POS: PRON}, # vilken, vem
"HS|DEF": {POS: DET}, # vars, vilkas, Vems
"IE": {POS: PART}, # att
"IN": {POS: INTJ}, # Jo, ja, nej, fan, visst
"JJ|AN": {POS: ADJ}, # ev, S:t, Kungl, Kungl., Teol
"JJ|KOM|UTR/NEU|SIN/PLU|IND/DEF|GEN": {POS: ADJ}, # äldres
"JJ|KOM|UTR/NEU|SIN/PLU|IND/DEF|NOM": {
POS: ADJ
}, # större, högre, mindre, bättre, äldre
"JJ|KOM|UTR/NEU|SIN/PLU|IND/DEF|SMS": {POS: ADJ},
"JJ|POS|MAS|SIN|DEF|GEN": {POS: ADJ}, # enskildes, sjukes, andres
"JJ|POS|MAS|SIN|DEF|NOM": {POS: ADJ}, # enskilde, sjuke, andre, unge, ene
"JJ|POS|NEU|SIN|IND/DEF|NOM": {POS: ADJ}, # eget
"JJ|POS|NEU|SIN|IND|GEN": {POS: ADJ},
"JJ|POS|NEU|SIN|IND|NOM": {POS: ADJ}, # annat, svårt, möjligt, nytt, sådant
"JJ|POS|UTR/NEU|PLU|IND/DEF|GEN": {
POS: ADJ
}, # ogiftas, ungas, frånskildas, efterkommandes, färgblindas
"JJ|POS|UTR/NEU|PLU|IND/DEF|NOM": {POS: ADJ}, # olika, andra, många, stora, vissa
"JJ|POS|UTR/NEU|PLU|IND|NOM": {POS: ADJ}, # flera, sådana, fler, få, samtliga
"JJ|POS|UTR/NEU|SIN/PLU|IND|NOM": {POS: ADJ},
"JJ|POS|UTR/NEU|SIN/PLU|IND/DEF|NOM": {POS: ADJ}, # bra, ena, enda, nästa, ringa
"JJ|POS|UTR/NEU|SIN|DEF|GEN": {POS: ADJ},
"JJ|POS|UTR/NEU|SIN|DEF|NOM": {POS: ADJ}, # hela, nya, andra, svenska, ekonomiska
"JJ|POS|UTR|-|-|SMS": {POS: ADJ}, # fri-, låg-, sexual-
"JJ|POS|UTR|SIN|IND/DEF|NOM": {POS: ADJ}, # egen
"JJ|POS|UTR|SIN|IND|GEN": {POS: ADJ}, # enskilds
"JJ|POS|UTR|SIN|IND|NOM": {POS: ADJ}, # stor, annan, själv, sådan, viss
"JJ|SUV|MAS|SIN|DEF|GEN": {POS: ADJ},
"JJ|SUV|MAS|SIN|DEF|NOM": {POS: ADJ}, # störste, främste, äldste, minste
"JJ|SUV|UTR/NEU|PLU|DEF|NOM": {POS: ADJ}, # flesta
"JJ|SUV|UTR/NEU|PLU|IND|NOM": {POS: ADJ},
"JJ|SUV|UTR/NEU|SIN/PLU|DEF|NOM": {
POS: ADJ
}, # bästa, största, närmaste, viktigaste, högsta
"JJ|SUV|UTR/NEU|SIN/PLU|IND|NOM": {
POS: ADJ
}, # störst, bäst, tidigast, högst, fattigast
"KN": {POS: CCONJ}, # och, eller, som, än, men
"KN|AN": {POS: CCONJ},
"MAD": {POS: PUNCT}, # ., ?, :, !, ...
"MID": {POS: PUNCT}, # ,, -, :, *, ;
"NN|-|-|-|-": {POS: NOUN}, # godo, fjol, fullo, somras, måtto
"NN|AN": {POS: NOUN}, # kr, %, s., dr, kap.
"NN|NEU|-|-|-": {POS: NOUN},
"NN|NEU|-|-|SMS": {POS: NOUN}, # yrkes-, barn-, hem-, fack-, vatten-
"NN|NEU|PLU|DEF|GEN": {
POS: NOUN
}, # barnens, årens, u-ländernas, företagens, århundradenas
"NN|NEU|PLU|DEF|NOM": {POS: NOUN}, # barnen, u-länderna, åren, länderna, könen
"NN|NEU|PLU|IND|GEN": {POS: NOUN}, # slags, års, barns, länders, tusentals
"NN|NEU|PLU|IND|NOM": {POS: NOUN}, # barn, år, fall, länder, problem
"NN|NEU|SIN|DEF|GEN": {
POS: NOUN
}, # äktenskapets, samhällets, barnets, 1800-talets, 1960-talets
"NN|NEU|SIN|DEF|NOM": {
POS: NOUN
}, # äktenskapet, samhället, barnet, stället, hemmet
"NN|NEU|SIN|IND|GEN": {POS: NOUN}, # års, slags, lands, havs, företags
"NN|NEU|SIN|IND|NOM": {POS: NOUN}, # år, arbete, barn, sätt, äktenskap
"NN|SMS": {POS: NOUN}, # PCB-, Syd-
"NN|UTR|-|-|-": {POS: NOUN}, # dags, rätta
"NN|UTR|-|-|SMS": {POS: NOUN}, # far-, kibbutz-, röntgen-, barna-, hälso-
"NN|UTR|PLU|DEF|GEN": {
POS: NOUN
}, # föräldrarnas, kvinnornas, elevernas, kibbutzernas, makarnas
"NN|UTR|PLU|DEF|NOM": {
POS: NOUN
}, # kvinnorna, föräldrarna, makarna, männen, hyrorna
"NN|UTR|PLU|IND|GEN": {POS: NOUN}, # människors, kvinnors, dagars, tiders, månaders
"NN|UTR|PLU|IND|NOM": {POS: NOUN}, # procent, människor, kvinnor, miljoner, kronor
"NN|UTR|SIN|DEF|GEN": {POS: NOUN}, # kvinnans, världens, familjens, dagens, jordens
"NN|UTR|SIN|DEF|NOM": {POS: NOUN}, # familjen, kvinnan, mannen, världen, skolan
"NN|UTR|SIN|IND|GEN": {POS: NOUN}, # sorts, medelålders, makes, kvinnas, veckas
"NN|UTR|SIN|IND|NOM": {POS: NOUN}, # del, tid, dag, fråga, man
"PAD": {POS: PUNCT}, # , ), (
"PC|AN": {POS: VERB},
"PC|PRF|MAS|SIN|DEF|GEN": {POS: VERB}, # avlidnes
"PC|PRF|MAS|SIN|DEF|NOM": {POS: VERB},
"PC|PRF|NEU|SIN|IND|NOM": {POS: VERB}, # taget, sett, särskilt, förbjudet, ökat
"PC|PRF|UTR/NEU|PLU|IND/DEF|GEN": {POS: VERB}, # försäkrades, anställdas
"PC|PRF|UTR/NEU|PLU|IND/DEF|NOM": {
POS: VERB
}, # särskilda, gifta, ökade, handikappade, skilda
"PC|PRF|UTR/NEU|SIN|DEF|GEN": {POS: VERB},
"PC|PRF|UTR/NEU|SIN|DEF|NOM": {POS: VERB}, # ökade, gifta, nämnda, nedärvda, dolda
"PC|PRF|UTR|SIN|IND|GEN": {POS: VERB},
"PC|PRF|UTR|SIN|IND|NOM": {POS: VERB}, # särskild, ökad, beredd, gift, oförändrad
"PC|PRS|UTR/NEU|SIN/PLU|IND/DEF|GEN": {
POS: VERB
}, # studerandes, sammanboendes, dubbelarbetandes
"PC|PRS|UTR/NEU|SIN/PLU|IND/DEF|NOM": {
POS: VERB
}, # följande, beroende, nuvarande, motsvarande, liknande
"PL": {POS: PART}, # ut, upp, in, till, med
"PL|SMS": {POS: PART},
"PM": {POS: PROPN}, # F, N, Liechtenstein, Danmark, DK
"PM|GEN": {POS: PROPN}, # Sveriges, EEC:s, Guds, Stockholms, Kristi
"PM|NOM": {POS: PROPN}, # Sverige, EEC, Stockholm, USA, ATP
"PM|SMS": {POS: PROPN}, # Göteborgs-, Nord-, Väst-
"PN|MAS|SIN|DEF|SUB/OBJ": {POS: PRON}, # denne
"PN|NEU|SIN|DEF|SUB/OBJ": {POS: PRON}, # det, detta, detsamma
"PN|NEU|SIN|IND|SUB/OBJ": {POS: PRON}, # något, allt, mycket, annat, ingenting
"PN|UTR/NEU|PLU|DEF|OBJ": {POS: PRON}, # dem, varandra, varann
"PN|UTR/NEU|PLU|DEF|SUB": {POS: PRON}, # de, bägge
"PN|UTR/NEU|PLU|DEF|SUB/OBJ": {POS: PRON}, # dessa, dom, båda, den, bådadera
"PN|UTR/NEU|PLU|IND|SUB/OBJ": {POS: PRON}, # andra, alla, många, sådana, några
"PN|UTR/NEU|SIN/PLU|DEF|OBJ": {POS: PRON}, # sig, sej
"PN|UTR|PLU|DEF|OBJ": {POS: PRON}, # oss, er, eder
"PN|UTR|PLU|DEF|SUB": {POS: PRON}, # vi
"PN|UTR|SIN|DEF|OBJ": {POS: PRON}, # dig, mig, henne, honom, Er
"PN|UTR|SIN|DEF|SUB": {POS: PRON}, # du, han, hon, jag, ni
"PN|UTR|SIN|DEF|SUB/OBJ": {POS: PRON}, # den, denna, densamma
"PN|UTR|SIN|IND|SUB": {POS: PRON}, # man
"PN|UTR|SIN|IND|SUB/OBJ": {POS: PRON}, # en, var, någon, ingen, Varannan
"PP": {POS: ADP}, # i, av, på, för, till
"PP|AN": {POS: ADP}, # f
"PS|AN": {POS: DET},
"PS|NEU|SIN|DEF": {POS: DET}, # sitt, vårt, ditt, mitt, ert
"PS|UTR/NEU|PLU|DEF": {POS: DET}, # sina, våra, dina, mina
"PS|UTR/NEU|SIN/PLU|DEF": {POS: DET}, # deras, dess, hans, hennes, varandras
"PS|UTR|SIN|DEF": {POS: DET}, # sin, vår, din, min, er
"RG": {POS: NUM}, # 2, 17, 20, 1, 18
"RG|GEN": {POS: NUM},
"RG|MAS|SIN|DEF|NOM": {POS: NUM},
"RG|NEU|SIN|IND|NOM": {POS: NUM}, # ett
"RG|NOM": {POS: NUM}, # två, tre, 1, 20, 2
"RG|SMS": {POS: NUM}, # ett-, 1950-, två-, tre-, 1700-
"RG|UTR/NEU|SIN|DEF|NOM": {POS: NUM},
"RG|UTR|SIN|IND|NOM": {POS: NUM}, # en
"RO|MAS|SIN|IND/DEF|GEN": {POS: ADJ},
"RO|MAS|SIN|IND/DEF|NOM": {POS: ADJ}, # förste
"RO|GEN": {POS: ADJ},
"RO|NOM": {POS: ADJ}, # första, andra, tredje, fjärde, femte
"SN": {POS: SCONJ}, # att, om, innan, eftersom, medan
"UO": {POS: X}, # companionship, vice, versa, family, capita
"VB|AN": {POS: VERB}, # jfr
"VB|IMP|AKT": {POS: VERB}, # se, Diskutera, låt, Läs, Gå
"VB|IMP|SFO": {POS: VERB}, # tas
"VB|INF|AKT": {POS: VERB}, # vara, få, ha, bli, kunna
"VB|INF|SFO": {POS: VERB}, # användas, finnas, göras, tas, ses
"VB|KON|PRS|AKT": {POS: VERB}, # vare, Gånge
"VB|KON|PRT|AKT": {POS: VERB}, # vore, finge
"VB|KON|PRT|SFO": {POS: VERB},
"VB|PRS|AKT": {POS: VERB}, # är, har, kan, får, måste
"VB|PRS|SFO": {POS: VERB}, # finns, kallas, behövs, beräknas, används
"VB|PRT|AKT": {POS: VERB}, # skulle, var, hade, kunde, fick
"VB|PRT|SFO": {POS: VERB}, # fanns, gjordes, höjdes, användes, infördes
"VB|SMS": {POS: VERB}, # läs-
"VB|SUP|AKT": {POS: VERB}, # varit, fått, blivit, haft, kommit
"VB|SUP|SFO": {POS: VERB}, # nämnts, gjorts, förändrats, sagts, framhållits
}

View File

@ -144,7 +144,7 @@ ABBREVIATIONS = [
# Also add a variant of each abbreviation with a trailing period. If the abbreviation already ends with a period, skip it.
for abbr in ABBREVIATIONS:
if abbr.endswith(".") == False:
if not abbr.endswith("."):
ABBREVIATIONS.append(abbr + ".")
for orth in ABBREVIATIONS:

View File

@ -4,16 +4,15 @@ from __future__ import unicode_literals
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...attrs import LANG
from ...util import update_exc
class TamilDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: "ta"
lex_attr_getters.update(LEX_ATTRS)
stop_words = STOP_WORDS
class Tamil(Language):

View File

@ -4,70 +4,33 @@ from __future__ import unicode_literals
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
# uncomment if files are available
# from .norm_exceptions import NORM_EXCEPTIONS
from .tag_map import TAG_MAP
# from .morph_rules import MORPH_RULES
# uncomment if lookup-based lemmatizer is available
from .lemmatizer import LOOKUP
# from ...lemmatizerlookup import Lemmatizer
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
def _return_tl(_):
return 'tl'
# Create a Language subclass
# Documentation: https://spacy.io/docs/usage/adding-languages
# This file should be placed in spacy/lang/xx (ISO code of language).
# Before submitting a pull request, make sure to remove all comments from the
# language data files, and run at least the basic tokenizer tests. Simply add the
# language ID to the list of languages in spacy/tests/conftest.py to include it
# in the basic tokenizer sanity tests. You can optionally add a fixture for the
# language's tokenizer and add more specific tests. For more info, see the
# tests documentation: https://github.com/explosion/spaCy/tree/master/spacy/tests
return "tl"
class TagalogDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = _return_tl # ISO code
# add more norm exception dictionaries here
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
# overwrite functions for lexical attributes
lex_attr_getters[LANG] = _return_tl
lex_attr_getters[NORM] = add_lookups(
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
)
lex_attr_getters.update(LEX_ATTRS)
# add custom tokenizer exceptions to base exceptions
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
# add stop words
stop_words = STOP_WORDS
# if available: add tag map
# tag_map = dict(TAG_MAP)
# if available: add morph rules
# morph_rules = dict(MORPH_RULES)
# if available: add lookup lemmatizer
# @classmethod
# def create_lemmatizer(cls, nlp=None):
# return Lemmatizer(LOOKUP)
lemma_lookup = LOOKUP
class Tagalog(Language):
lang = 'tl' # ISO code
Defaults = TagalogDefaults # set Defaults to custom language defaults
lang = "tl"
Defaults = TagalogDefaults
# set default export - this allows the language class to be lazy-loaded
__all__ = ['Tagalog']
__all__ = ["Tagalog"]

View File

@ -2,11 +2,6 @@
from __future__ import unicode_literals
# Adding a lemmatizer lookup table
# Documentation: https://spacy.io/docs/usage/adding-languages#lemmatizer
# Entries should be added in the following format:
LOOKUP = {
"kaugnayan": "ugnay",
"sangkatauhan": "tao",
@ -14,5 +9,5 @@ LOOKUP = {
"pandaigdigan": "daigdig",
"kasaysayan": "saysay",
"kabayanihan": "bayani",
"karuwagan": "duwag"
"karuwagan": "duwag",
}

View File

@ -1,33 +1,55 @@
# coding: utf8
from __future__ import unicode_literals
# import the symbols for the attrs you want to overwrite
from ...attrs import LIKE_NUM
# Overwriting functions for lexical attributes
# Documentation: https://spacy.io/docs/usage/adding-languages#lex-attrs
# Most of these functions, like is_lower or like_url, should be language-
# independent. Others, like like_num (which includes both digits and number
# words), require customisation.
# Example: check if token resembles a number
_num_words = ['sero', 'isa', 'dalawa', 'tatlo', 'apat', 'lima', 'anim', 'pito',
'walo', 'siyam', 'sampu', 'labing-isa', 'labindalawa', 'labintatlo', 'labing-apat',
'labinlima', 'labing-anim', 'labimpito', 'labing-walo', 'labinsiyam', 'dalawampu',
'tatlumpu', 'apatnapu', 'limampu', 'animnapu', 'pitumpu', 'walumpu', 'siyamnapu',
'daan', 'libo', 'milyon', 'bilyon', 'trilyon', 'quadrilyon',
'gajilyon', 'bazilyon']
_num_words = [
"sero",
"isa",
"dalawa",
"tatlo",
"apat",
"lima",
"anim",
"pito",
"walo",
"siyam",
"sampu",
"labing-isa",
"labindalawa",
"labintatlo",
"labing-apat",
"labinlima",
"labing-anim",
"labimpito",
"labing-walo",
"labinsiyam",
"dalawampu",
"tatlumpu",
"apatnapu",
"limampu",
"animnapu",
"pitumpu",
"walumpu",
"siyamnapu",
"daan",
"libo",
"milyon",
"bilyon",
"trilyon",
"quadrilyon",
"gajilyon",
"bazilyon",
]
def like_num(text):
text = text.replace(',', '').replace('.', '')
text = text.replace(",", "").replace(".", "")
if text.isdigit():
return True
if text.count('/') == 1:
num, denom = text.split('/')
if text.count("/") == 1:
num, denom = text.split("/")
if num.isdigit() and denom.isdigit():
return True
if text in _num_words:
@ -35,9 +57,4 @@ def like_num(text):
return False
# Create dictionary of functions to overwrite. The default lex_attr_getters are
# updated with this one, so only the functions defined here are overwritten.
LEX_ATTRS = {
LIKE_NUM: like_num
}
LEX_ATTRS = {LIKE_NUM: like_num}

View File

@ -1,162 +1,154 @@
# encoding: utf8
from __future__ import unicode_literals
# Add stop words
# Documentation: https://spacy.io/docs/usage/adding-languages#stop-words
# To improve readability, words should be ordered alphabetically and separated
# by spaces and newlines. When adding stop words from an online source, always
# include the link in a comment. Make sure to proofread and double-check the
# words - lists available online are often known to contain mistakes.
# data from https://github.com/stopwords-iso/stopwords-tl/blob/master/stopwords-tl.txt
STOP_WORDS = set("""
akin
aking
ako
alin
am
amin
aming
ang
ano
anumang
apat
at
atin
ating
ay
bababa
bago
bakit
bawat
bilang
dahil
dalawa
dapat
din
dito
doon
gagawin
gayunman
ginagawa
ginawa
ginawang
gumawa
gusto
habang
hanggang
hindi
huwag
iba
ibaba
ibabaw
ibig
ikaw
ilagay
ilalim
ilan
inyong
isa
isang
itaas
ito
iyo
iyon
iyong
ka
kahit
kailangan
kailanman
kami
kanila
kanilang
kanino
kanya
kanyang
kapag
kapwa
karamihan
katiyakan
katulad
kaya
kaysa
ko
kong
kulang
kumuha
kung
laban
lahat
lamang
likod
lima
maaari
maaaring
maging
mahusay
makita
marami
marapat
masyado
may
mayroon
mga
minsan
mismo
mula
muli
na
nabanggit
naging
nagkaroon
nais
nakita
namin
napaka
narito
nasaan
ng
ngayon
ni
nila
nilang
nito
niya
niyang
noon
o
pa
paano
pababa
paggawa
pagitan
pagkakaroon
pagkatapos
palabas
pamamagitan
panahon
pangalawa
para
paraan
pareho
pataas
pero
pumunta
pumupunta
sa
saan
sabi
sabihin
sarili
sila
sino
siya
tatlo
tayo
tulad
tungkol
una
walang
""".split())
STOP_WORDS = set(
"""
akin
aking
ako
alin
am
amin
aming
ang
ano
anumang
apat
at
atin
ating
ay
bababa
bago
bakit
bawat
bilang
dahil
dalawa
dapat
din
dito
doon
gagawin
gayunman
ginagawa
ginawa
ginawang
gumawa
gusto
habang
hanggang
hindi
huwag
iba
ibaba
ibabaw
ibig
ikaw
ilagay
ilalim
ilan
inyong
isa
isang
itaas
ito
iyo
iyon
iyong
ka
kahit
kailangan
kailanman
kami
kanila
kanilang
kanino
kanya
kanyang
kapag
kapwa
karamihan
katiyakan
katulad
kaya
kaysa
ko
kong
kulang
kumuha
kung
laban
lahat
lamang
likod
lima
maaari
maaaring
maging
mahusay
makita
marami
marapat
masyado
may
mayroon
mga
minsan
mismo
mula
muli
na
nabanggit
naging
nagkaroon
nais
nakita
namin
napaka
narito
nasaan
ng
ngayon
ni
nila
nilang
nito
niya
niyang
noon
o
pa
paano
pababa
paggawa
pagitan
pagkakaroon
pagkatapos
palabas
pamamagitan
panahon
pangalawa
para
paraan
pareho
pataas
pero
pumunta
pumupunta
sa
saan
sabi
sabihin
sarili
sila
sino
siya
tatlo
tayo
tulad
tungkol
una
walang
""".split()
)

View File

@ -1,36 +0,0 @@
# coding: utf8
from __future__ import unicode_literals
from ...symbols import POS, ADV, NOUN, ADP, PRON, SCONJ, PROPN, DET, SYM, INTJ
from ...symbols import PUNCT, NUM, AUX, X, CONJ, ADJ, VERB, PART, SPACE, CCONJ
# Add a tag map
# Documentation: https://spacy.io/docs/usage/adding-languages#tag-map
# Universal Dependencies: http://universaldependencies.org/u/pos/all.html
# The keys of the tag map should be strings in your tag set. The dictionary must
# have an entry POS whose value is one of the Universal Dependencies tags.
# Optionally, you can also include morphological features or other attributes.
TAG_MAP = {
"ADV": {POS: ADV},
"NOUN": {POS: NOUN},
"ADP": {POS: ADP},
"PRON": {POS: PRON},
"SCONJ": {POS: SCONJ},
"PROPN": {POS: PROPN},
"DET": {POS: DET},
"SYM": {POS: SYM},
"INTJ": {POS: INTJ},
"PUNCT": {POS: PUNCT},
"NUM": {POS: NUM},
"AUX": {POS: AUX},
"X": {POS: X},
"CONJ": {POS: CONJ},
"CCONJ": {POS: CCONJ},
"ADJ": {POS: ADJ},
"VERB": {POS: VERB},
"PART": {POS: PART},
"SP": {POS: SPACE}
}

View File

@ -1,48 +1,20 @@
# coding: utf8
from __future__ import unicode_literals
# import symbols - if you need to use more, add them here
from ...symbols import ORTH, LEMMA, TAG, NORM, ADP, DET
from ...symbols import ORTH, LEMMA
# Add tokenizer exceptions
# Documentation: https://spacy.io/docs/usage/adding-languages#tokenizer-exceptions
# Feel free to use custom logic to generate repetitive exceptions more efficiently.
# If an exception is split into more than one token, the ORTH values combined always
# need to match the original string.
# Exceptions should be added in the following format:
_exc = {
"tayo'y": [
{ORTH: "tayo", LEMMA: "tayo"},
{ORTH: "'y", LEMMA: "ay"}],
"isa'y": [
{ORTH: "isa", LEMMA: "isa"},
{ORTH: "'y", LEMMA: "ay"}],
"baya'y": [
{ORTH: "baya", LEMMA: "bayan"},
{ORTH: "'y", LEMMA: "ay"}],
"sa'yo": [
{ORTH: "sa", LEMMA: "sa"},
{ORTH: "'yo", LEMMA: "iyo"}],
"ano'ng": [
{ORTH: "ano", LEMMA: "ano"},
{ORTH: "'ng", LEMMA: "ang"}],
"siya'y": [
{ORTH: "siya", LEMMA: "siya"},
{ORTH: "'y", LEMMA: "ay"}],
"nawa'y": [
{ORTH: "nawa", LEMMA: "nawa"},
{ORTH: "'y", LEMMA: "ay"}],
"papa'no": [
{ORTH: "papa'no", LEMMA: "papaano"}],
"'di": [
{ORTH: "'di", LEMMA: "hindi"}]
"tayo'y": [{ORTH: "tayo", LEMMA: "tayo"}, {ORTH: "'y", LEMMA: "ay"}],
"isa'y": [{ORTH: "isa", LEMMA: "isa"}, {ORTH: "'y", LEMMA: "ay"}],
"baya'y": [{ORTH: "baya", LEMMA: "bayan"}, {ORTH: "'y", LEMMA: "ay"}],
"sa'yo": [{ORTH: "sa", LEMMA: "sa"}, {ORTH: "'yo", LEMMA: "iyo"}],
"ano'ng": [{ORTH: "ano", LEMMA: "ano"}, {ORTH: "'ng", LEMMA: "ang"}],
"siya'y": [{ORTH: "siya", LEMMA: "siya"}, {ORTH: "'y", LEMMA: "ay"}],
"nawa'y": [{ORTH: "nawa", LEMMA: "nawa"}, {ORTH: "'y", LEMMA: "ay"}],
"papa'no": [{ORTH: "papa'no", LEMMA: "papaano"}],
"'di": [{ORTH: "'di", LEMMA: "hindi"}],
}
# To keep things clean and readable, it's recommended to only declare the
# TOKENIZER_EXCEPTIONS at the bottom:
TOKENIZER_EXCEPTIONS = _exc

View File

@ -3,7 +3,7 @@ from __future__ import unicode_literals
import re
from ..symbols import ORTH, POS, TAG, LEMMA, SPACE, PUNCT
from ..symbols import ORTH, POS, TAG, LEMMA, SPACE
# URL validation regex courtesy of: https://mathiasbynens.be/demo/url-regex

View File

@ -5,71 +5,32 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
# uncomment if files are available
# from .norm_exceptions import NORM_EXCEPTIONS
# from .tag_map import TAG_MAP
# from .morph_rules import MORPH_RULES
# uncomment if lookup-based lemmatizer is available
# from .lemmatizer import LOOKUP
# from ...lemmatizerlookup import Lemmatizer
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...util import update_exc, add_lookups
from ...language import Language
from ...attrs import LANG, LIKE_NUM, NORM
# from .tag_map import TAG_MAP
from ...attrs import LANG, NORM
from .lemmatizer import UkrainianLemmatizer
# Create a Language subclass
# Documentation: https://spacy.io/docs/usage/adding-languages
# This file should be placed in spacy/lang/xx (ISO code of language).
# Before submitting a pull request, make sure to remove all comments from the
# language data files, and run at least the basic tokenizer tests. Simply add the
# language ID to the list of languages in spacy/tests/conftest.py to include it
# in the basic tokenizer sanity tests. You can optionally add a fixture for the
# language's tokenizer and add more specific tests. For more info, see the
# tests documentation: https://github.com/explosion/spaCy/tree/master/spacy/tests
class UkrainianDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'uk' # ISO code
# add more norm exception dictionaries here
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
# overwrite functions for lexical attributes
lex_attr_getters[LANG] = lambda text: "uk"
lex_attr_getters[NORM] = add_lookups(
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
)
lex_attr_getters.update(LEX_ATTRS)
# add custom tokenizer exceptions to base exceptions
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
# add stop words
stop_words = STOP_WORDS
# if available: add tag map
# tag_map = dict(TAG_MAP)
# if available: add morph rules
# morph_rules = dict(MORPH_RULES)
# if available: add lookup lemmatizer
# @classmethod
# def create_lemmatizer(cls, nlp=None):
# return Lemmatizer(LOOKUP)
@classmethod
def create_lemmatizer(cls, nlp=None):
return UkrainianLemmatizer()
class Ukrainian(Language):
lang = 'uk' # ISO code
Defaults = UkrainianDefaults # set Defaults to custom language defaults
lang = "uk"
Defaults = UkrainianDefaults
# set default export - this allows the language class to be lazy-loaded
__all__ = ['Ukrainian']
__all__ = ["Ukrainian"]

View File

@ -19,5 +19,5 @@ sentences = [
"Від Нижнього озера довгими дерев’яними сходами, над якими синьо й біло горіли маленькі коробочки-ліхтарики, підіймалися до нього двоє стовусів: найкращий друг Вертутій і його дванадцятилітній онук Чублик.", # blyznets_viktor_semenovych/zemlia_svitliachkiv
"Китайський космічний зонд \"Чан'е-4\" вперше в історії здійснив м'яку посадку на зворотному боці Місяця.",
"Коли до губ твоїх лишається півподиху, коли до губ твоїх лишається півкроку зіниці твої виткані із подиву, в очах у тебе синьо і широко.", # Hryhorij Czubaj
"Дорогу сестру збираю у дорогу, а брати вирішили не брати машину." # homographs
"Дорогу сестру збираю у дорогу, а брати вирішили не брати машину.", # homographs
]

View File

@ -1,12 +1,15 @@
# coding: utf8
from __future__ import unicode_literals
from ..ru.lemmatizer import RussianLemmatizer
class UkrainianLemmatizer(RussianLemmatizer):
def __init__(self, pymorphy2_lang='ru'):
def __init__(self, pymorphy2_lang="ru"):
try:
super(UkrainianLemmatizer, self).__init__(pymorphy2_lang='uk')
super(UkrainianLemmatizer, self).__init__(pymorphy2_lang="uk")
except ImportError:
raise ImportError(
'The Ukrainian lemmatizer requires the pymorphy2 library and dictionaries: '
'try to fix it with "pip install git+https://github.com/kmike/pymorphy2.git pymorphy2-dicts-uk"')
"The Ukrainian lemmatizer requires the pymorphy2 library and dictionaries: "
'try to fix it with "pip install git+https://github.com/kmike/pymorphy2.git pymorphy2-dicts-uk"'
)

View File

@ -1,32 +1,68 @@
# coding: utf8
from __future__ import unicode_literals
# import the symbols for the attrs you want to overwrite
from ...attrs import LIKE_NUM
# Overwriting functions for lexical attributes
# Documentation: https://spacy.io/docs/usage/adding-languages#lex-attrs
# Most of these functions, like is_lower or like_url, should be language-
# independent. Others, like like_num (which includes both digits and number
# words), require customisation.
# Example: check if token resembles a number
_num_words = ["більйон", "вісім", "вісімдесят", "вісімнадцять", "вісімсот", "восьмий", "два", "двадцять", "дванадцять",
"двісті", "дев'яносто", "дев'ятнадцять", "дев'ятсот", "дев'ять", "десять", "децильйон", "квадрильйон",
"квінтильйон", "мільйон", "мільярд", "нонильйон", "один", "одинадцять", "октильйон", "п'ятий",
"п'ятисотий", "п'ятнадцять", "п'ятсот", "п'ять", "секстильйон", "септильйон", "сім", "сімдесят",
"сімнадцять", "сімсот", "сорок", "сто", "тисяча", "три", "тридцять", "трильйон", "тринадцять", "триста",
"чотири", "чотириста", "чотирнадцять", "шістдесят", "шістнадцять", "шістсот", "шість"]
_num_words = [
"більйон",
"вісім",
"вісімдесят",
"вісімнадцять",
"вісімсот",
"восьмий",
"два",
"двадцять",
"дванадцять",
"двісті",
"дев'яносто",
"дев'ятнадцять",
"дев'ятсот",
"дев'ять",
"десять",
"децильйон",
"квадрильйон",
"квінтильйон",
"мільйон",
"мільярд",
"нонильйон",
"один",
"одинадцять",
"октильйон",
"п'ятий",
"п'ятисотий",
"п'ятнадцять",
"п'ятсот",
"п'ять",
"секстильйон",
"септильйон",
"сім",
"сімдесят",
"сімнадцять",
"сімсот",
"сорок",
"сто",
"тисяча",
"три",
"тридцять",
"трильйон",
"тринадцять",
"триста",
"чотири",
"чотириста",
"чотирнадцять",
"шістдесят",
"шістнадцять",
"шістсот",
"шість",
]
def like_num(text):
text = text.replace(',', '').replace('.', '')
text = text.replace(",", "").replace(".", "")
if text.isdigit():
return True
if text.count('/') == 1:
num, denom = text.split('/')
if text.count("/") == 1:
num, denom = text.split("/")
if num.isdigit() and denom.isdigit():
return True
if text in _num_words:
@ -34,9 +70,4 @@ def like_num(text):
return False
# Create dictionary of functions to overwrite. The default lex_attr_getters are
# updated with this one, so only the functions defined here are overwritten.
LEX_ATTRS = {
LIKE_NUM: like_num
}
LEX_ATTRS = {LIKE_NUM: like_num}

View File

@ -2,15 +2,8 @@
from __future__ import unicode_literals
# Add stop words
# Documentation: https://spacy.io/docs/usage/adding-languages#stop-words
# To improve readability, words should be ordered alphabetically and separated
# by spaces and newlines. When adding stop words from an online source, always
# include the link in a comment. Make sure to proofread and double-check the
# words - lists available online are often known to contain mistakes.
STOP_WORDS = set("""а
STOP_WORDS = set(
"""а
або
адже
але
@ -401,4 +394,5 @@ STOP_WORDS = set("""а
якій
якого
якщо
""".split())
""".split()
)

View File

@ -5,14 +5,6 @@ from ..symbols import POS, ADV, NOUN, ADP, PRON, SCONJ, PROPN, DET, SYM, INTJ
from ..symbols import PUNCT, NUM, AUX, X, CONJ, ADJ, VERB, PART, SPACE, CCONJ
# Add a tag map
# Documentation: https://spacy.io/docs/usage/adding-languages#tag-map
# Universal Dependencies: http://universaldependencies.org/u/pos/all.html
# The keys of the tag map should be strings in your tag set. The dictionary must
# have an entry POS whose value is one of the Universal Dependencies tags.
# Optionally, you can also include morphological features or other attributes.
TAG_MAP = {
"ADV": {POS: ADV},
"NOUN": {POS: NOUN},
@ -32,5 +24,5 @@ TAG_MAP = {
"ADJ": {POS: ADJ},
"VERB": {POS: VERB},
"PART": {POS: PART},
"SP": {POS: SPACE}
"SP": {POS: SPACE},
}

View File

@ -1,18 +1,9 @@
# coding: utf8
from __future__ import unicode_literals
# import symbols - if you need to use more, add them here
from ...symbols import ORTH, LEMMA, POS, NORM, NOUN
# Add tokenizer exceptions
# Documentation: https://spacy.io/docs/usage/adding-languages#tokenizer-exceptions
# Feel free to use custom logic to generate repetitive exceptions more efficiently.
# If an exception is split into more than one token, the ORTH values combined always
# need to match the original string.
# Exceptions should be added in the following format:
_exc = {}
for exc_data in [
@ -28,11 +19,9 @@ for exc_data in [
{ORTH: "проф.", LEMMA: "професор", NORM: "професор", POS: NOUN},
{ORTH: "акад.", LEMMA: "академік", NORM: "академік", POS: NOUN},
{ORTH: "доц.", LEMMA: "доцент", NORM: "доцент", POS: NOUN},
{ORTH: "оз.", LEMMA: "озеро", NORM: "озеро", POS: NOUN}]:
{ORTH: "оз.", LEMMA: "озеро", NORM: "озеро", POS: NOUN},
]:
_exc[exc_data[ORTH]] = [exc_data]
# To keep things clean and readable, it's recommended to only declare the
# TOKENIZER_EXCEPTIONS at the bottom:
TOKENIZER_EXCEPTIONS = _exc

View File

@ -1,6 +1,6 @@
# coding: utf8
from __future__ import unicode_literals
from .matcher import Matcher
from .phrasematcher import PhraseMatcher
from .dependencymatcher import DependencyTreeMatcher
from .matcher import Matcher # noqa: F401
from .phrasematcher import PhraseMatcher # noqa: F401
from .dependencymatcher import DependencyTreeMatcher # noqa: F401

View File

@ -119,8 +119,8 @@ def tr_tokenizer():
@pytest.fixture(scope="session")
def uk_tokenizer():
pymorphy = pytest.importorskip("pymorphy2")
return util.get_lang_class("uk").Defaults.create_tokenizer()
pytest.importorskip("pymorphy2")
return get_lang_class("uk").Defaults.create_tokenizer()
@pytest.fixture(scope="session")
@ -130,7 +130,7 @@ def ca_tokenizer():
@pytest.fixture(scope="session")
def pl_tokenizer():
return util.get_lang_class("pl").Defaults.create_tokenizer()
return get_lang_class("pl").Defaults.create_tokenizer()
@pytest.fixture(scope="session")

View File

View File

@ -4,56 +4,56 @@ from __future__ import unicode_literals
import pytest
DOT_TESTS = [
('tel.', ['tel.']),
('np.', ['np.']),
('godz. 21:37', ['godz.', '21:37']),
('inż.', ['inż.']),
('gosp.-polit.', ['gosp.-polit.']),
('ppoż', ['ppoż']),
('płn', ['płn']),
('ul.', ['ul.']),
('jw.', ['jw.']),
('itd.', ['itd.']),
('cdn.', ['cdn.']),
('itp.', ['itp.']),
('10,- zł', ['10,-', '']),
('0 zł 99 gr', ['0', '', '99', 'gr']),
('0,99 rub.', ['0,99', 'rub.']),
('dol.', ['dol.']),
('1000 m n.p.m.', ['1000', 'm', 'n.p.m.']),
('m.in.', ['m.in.']),
('p.n.e.', ['p.n.e.']),
('Sz.P.', ['Sz.P.']),
('p.o.', ['p.o.']),
('k.o.', ['k.o.']),
('m.st.', ['m.st.']),
('dra.', ['dra', '.']),
('pp.', ['pp.']),
('oo.', ['oo.'])
("tel.", ["tel."]),
("np.", ["np."]),
("godz. 21:37", ["godz.", "21:37"]),
("inż.", ["inż."]),
("gosp.-polit.", ["gosp.-polit."]),
("ppoż", ["ppoż"]),
("płn", ["płn"]),
("ul.", ["ul."]),
("jw.", ["jw."]),
("itd.", ["itd."]),
("cdn.", ["cdn."]),
("itp.", ["itp."]),
("10,- zł", ["10,-", ""]),
("0 zł 99 gr", ["0", "", "99", "gr"]),
("0,99 rub.", ["0,99", "rub."]),
("dol.", ["dol."]),
("1000 m n.p.m.", ["1000", "m", "n.p.m."]),
("m.in.", ["m.in."]),
("p.n.e.", ["p.n.e."]),
("Sz.P.", ["Sz.P."]),
("p.o.", ["p.o."]),
("k.o.", ["k.o."]),
("m.st.", ["m.st."]),
("dra.", ["dra", "."]),
("pp.", ["pp."]),
("oo.", ["oo."]),
]
HYPHEN_TESTS = [
('5-fluoropentylo-3-pirydynyloindol', ['5-fluoropentylo-3-pirydynyloindol']),
('NESS-040C5', ['NESS-040C5']),
('JTE-7-31', ['JTE-7-31']),
('BAY-59-3074', ['BAY-59-3074']),
('BAY-38-7271', ['BAY-38-7271']),
('STS-135', ['STS-135']),
('5F-PB-22', ['5F-PB-22']),
('cztero-', ['cztero-']),
('jedno-', ['jedno-']),
('dwu-', ['dwu-']),
('trzy-', ['trzy-']),
('b-adoratorzy', ['b-adoratorzy']),
('2-3-4 drzewa', ['2-3-4', 'drzewa']),
('b-drzewa', ['b-drzewa'])
("5-fluoropentylo-3-pirydynyloindol", ["5-fluoropentylo-3-pirydynyloindol"]),
("NESS-040C5", ["NESS-040C5"]),
("JTE-7-31", ["JTE-7-31"]),
("BAY-59-3074", ["BAY-59-3074"]),
("BAY-38-7271", ["BAY-38-7271"]),
("STS-135", ["STS-135"]),
("5F-PB-22", ["5F-PB-22"]),
("cztero-", ["cztero-"]),
("jedno-", ["jedno-"]),
("dwu-", ["dwu-"]),
("trzy-", ["trzy-"]),
("b-adoratorzy", ["b-adoratorzy"]),
("2-3-4 drzewa", ["2-3-4", "drzewa"]),
("b-drzewa", ["b-drzewa"]),
]
TESTCASES = DOT_TESTS + HYPHEN_TESTS
@pytest.mark.parametrize('text,expected_tokens', TESTCASES)
@pytest.mark.parametrize("text,expected_tokens", TESTCASES)
def test_tokenizer_handles_testcases(pl_tokenizer, text, expected_tokens):
tokens = pl_tokenizer(text)
token_list = [token.text for token in tokens if not token.is_space]

View File

@ -5,34 +5,42 @@ import pytest
SV_TOKEN_EXCEPTION_TESTS = [
('Smörsåsen används bl.a. till fisk', ['Smörsåsen', 'används', 'bl.a.', 'till', 'fisk']),
('Jag kommer först kl. 13 p.g.a. diverse förseningar', ['Jag', 'kommer', 'först', 'kl.', '13', 'p.g.a.', 'diverse', 'förseningar']),
('Anders I. tycker om ord med i i.', ["Anders", "I.", "tycker", "om", "ord", "med", "i", "i", "."])
(
"Smörsåsen används bl.a. till fisk",
["Smörsåsen", "används", "bl.a.", "till", "fisk"],
),
(
"Jag kommer först kl. 13 p.g.a. diverse förseningar",
["Jag", "kommer", "först", "kl.", "13", "p.g.a.", "diverse", "förseningar"],
),
(
"Anders I. tycker om ord med i i.",
["Anders", "I.", "tycker", "om", "ord", "med", "i", "i", "."],
),
]
@pytest.mark.parametrize('text,expected_tokens', SV_TOKEN_EXCEPTION_TESTS)
@pytest.mark.parametrize("text,expected_tokens", SV_TOKEN_EXCEPTION_TESTS)
def test_sv_tokenizer_handles_exception_cases(sv_tokenizer, text, expected_tokens):
tokens = sv_tokenizer(text)
token_list = [token.text for token in tokens if not token.is_space]
assert expected_tokens == token_list
@pytest.mark.parametrize('text', ["driveru", "hajaru", "Serru", "Fixaru"])
@pytest.mark.parametrize("text", ["driveru", "hajaru", "Serru", "Fixaru"])
def test_sv_tokenizer_handles_verb_exceptions(sv_tokenizer, text):
tokens = sv_tokenizer(text)
assert len(tokens) == 2
assert tokens[1].text == "u"
@pytest.mark.parametrize('text',
["bl.a", "m.a.o.", "Jan.", "Dec.", "kr.", "osv."])
@pytest.mark.parametrize("text", ["bl.a", "m.a.o.", "Jan.", "Dec.", "kr.", "osv."])
def test_sv_tokenizer_handles_abbr(sv_tokenizer, text):
tokens = sv_tokenizer(text)
assert len(tokens) == 1
@pytest.mark.parametrize('text', ["Jul.", "jul.", "sön.", "Sön."])
@pytest.mark.parametrize("text", ["Jul.", "jul.", "sön.", "Sön."])
def test_sv_tokenizer_handles_ambiguous_abbr(sv_tokenizer, text):
tokens = sv_tokenizer(text)
assert len(tokens) == 2

View File

@ -4,12 +4,17 @@ from __future__ import unicode_literals
import pytest
@pytest.mark.parametrize('string,lemma', [('DNA-profilernas', 'DNA-profil'),
('Elfenbenskustens', 'Elfenbenskusten'),
('abortmotståndarens', 'abortmotståndare'),
('kolesterols', 'kolesterol'),
('portionssnusernas', 'portionssnus'),
('åsyns', 'åsyn')])
@pytest.mark.parametrize(
"string,lemma",
[
("DNA-profilernas", "DNA-profil"),
("Elfenbenskustens", "Elfenbenskusten"),
("abortmotståndarens", "abortmotståndare"),
("kolesterols", "kolesterol"),
("portionssnusernas", "portionssnus"),
("åsyns", "åsyn"),
],
)
def test_lemmatizer_lookup_assigns(sv_tokenizer, string, lemma):
tokens = sv_tokenizer(string)
assert tokens[0].lemma_ == lemma

View File

@ -1,28 +1,28 @@
# coding: utf-8
"""Test that tokenizer prefixes, suffixes and infixes are handled correctly."""
from __future__ import unicode_literals
import pytest
@pytest.mark.parametrize('text', ["(under)"])
@pytest.mark.parametrize("text", ["(under)"])
def test_tokenizer_splits_no_special(sv_tokenizer, text):
tokens = sv_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize('text', ["gitta'r", "Björn's", "Lars'"])
@pytest.mark.parametrize("text", ["gitta'r", "Björn's", "Lars'"])
def test_tokenizer_handles_no_punct(sv_tokenizer, text):
tokens = sv_tokenizer(text)
assert len(tokens) == 1
@pytest.mark.parametrize('text', ["svart.Gul", "Hej.Världen"])
@pytest.mark.parametrize("text", ["svart.Gul", "Hej.Världen"])
def test_tokenizer_splits_period_infix(sv_tokenizer, text):
tokens = sv_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize('text', ["Hej,Världen", "en,två"])
@pytest.mark.parametrize("text", ["Hej,Världen", "en,två"])
def test_tokenizer_splits_comma_infix(sv_tokenizer, text):
tokens = sv_tokenizer(text)
assert len(tokens) == 3
@ -31,7 +31,7 @@ def test_tokenizer_splits_comma_infix(sv_tokenizer, text):
assert tokens[2].text == text.split(",")[1]
@pytest.mark.parametrize('text', ["svart...Gul", "svart...gul"])
@pytest.mark.parametrize("text", ["svart...Gul", "svart...gul"])
def test_tokenizer_splits_ellipsis_infix(sv_tokenizer, text):
tokens = sv_tokenizer(text)
assert len(tokens) == 3

View File

@ -1,9 +1,6 @@
# coding: utf-8
"""Test that longer and mixed texts are tokenized correctly."""
from __future__ import unicode_literals
import pytest
def test_sv_tokenizer_handles_long_text(sv_tokenizer):
text = """Det var så härligt ute på landet. Det var sommar, majsen var gul, havren grön,

View File

@ -1,25 +1,24 @@
# coding: utf-8
"""Test that open, closed and paired punctuation is split off correctly."""
from __future__ import unicode_literals
import pytest
PUNCT_OPEN = ['(', '[', '{', '*']
PUNCT_CLOSE = [')', ']', '}', '*']
PUNCT_PAIRED = [('(', ')'), ('[', ']'), ('{', '}'), ('*', '*')]
PUNCT_OPEN = ["(", "[", "{", "*"]
PUNCT_CLOSE = [")", "]", "}", "*"]
PUNCT_PAIRED = [("(", ")"), ("[", "]"), ("{", "}"), ("*", "*")]
@pytest.mark.parametrize('text', ["(", "((", "<"])
@pytest.mark.parametrize("text", ["(", "((", "<"])
def test_uk_tokenizer_handles_only_punct(uk_tokenizer, text):
tokens = uk_tokenizer(text)
assert len(tokens) == len(text)
@pytest.mark.parametrize('punct', PUNCT_OPEN)
@pytest.mark.parametrize('text', ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"])
@pytest.mark.parametrize("punct", PUNCT_OPEN)
@pytest.mark.parametrize(
"text", ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]
)
def test_uk_tokenizer_splits_open_punct(uk_tokenizer, punct, text):
tokens = uk_tokenizer(punct + text)
assert len(tokens) == 2
@ -27,8 +26,10 @@ def test_uk_tokenizer_splits_open_punct(uk_tokenizer, punct, text):
assert tokens[1].text == text
@pytest.mark.parametrize('punct', PUNCT_CLOSE)
@pytest.mark.parametrize('text', ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"])
@pytest.mark.parametrize("punct", PUNCT_CLOSE)
@pytest.mark.parametrize(
"text", ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]
)
def test_uk_tokenizer_splits_close_punct(uk_tokenizer, punct, text):
tokens = uk_tokenizer(text + punct)
assert len(tokens) == 2
@ -36,9 +37,11 @@ def test_uk_tokenizer_splits_close_punct(uk_tokenizer, punct, text):
assert tokens[1].text == punct
@pytest.mark.parametrize('punct', PUNCT_OPEN)
@pytest.mark.parametrize('punct_add', ["`"])
@pytest.mark.parametrize('text', ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"])
@pytest.mark.parametrize("punct", PUNCT_OPEN)
@pytest.mark.parametrize("punct_add", ["`"])
@pytest.mark.parametrize(
"text", ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]
)
def test_uk_tokenizer_splits_two_diff_open_punct(uk_tokenizer, punct, punct_add, text):
tokens = uk_tokenizer(punct + punct_add + text)
assert len(tokens) == 3
@ -47,9 +50,11 @@ def test_uk_tokenizer_splits_two_diff_open_punct(uk_tokenizer, punct, punct_add,
assert tokens[2].text == text
@pytest.mark.parametrize('punct', PUNCT_CLOSE)
@pytest.mark.parametrize('punct_add', ["'"])
@pytest.mark.parametrize('text', ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"])
@pytest.mark.parametrize("punct", PUNCT_CLOSE)
@pytest.mark.parametrize("punct_add", ["'"])
@pytest.mark.parametrize(
"text", ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]
)
def test_uk_tokenizer_splits_two_diff_close_punct(uk_tokenizer, punct, punct_add, text):
tokens = uk_tokenizer(text + punct + punct_add)
assert len(tokens) == 3
@ -58,8 +63,10 @@ def test_uk_tokenizer_splits_two_diff_close_punct(uk_tokenizer, punct, punct_add
assert tokens[2].text == punct_add
@pytest.mark.parametrize('punct', PUNCT_OPEN)
@pytest.mark.parametrize('text', ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"])
@pytest.mark.parametrize("punct", PUNCT_OPEN)
@pytest.mark.parametrize(
"text", ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]
)
def test_uk_tokenizer_splits_same_open_punct(uk_tokenizer, punct, text):
tokens = uk_tokenizer(punct + punct + punct + text)
assert len(tokens) == 4
@ -67,8 +74,10 @@ def test_uk_tokenizer_splits_same_open_punct(uk_tokenizer, punct, text):
assert tokens[3].text == text
@pytest.mark.parametrize('punct', PUNCT_CLOSE)
@pytest.mark.parametrize('text', ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"])
@pytest.mark.parametrize("punct", PUNCT_CLOSE)
@pytest.mark.parametrize(
"text", ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]
)
def test_uk_tokenizer_splits_same_close_punct(uk_tokenizer, punct, text):
tokens = uk_tokenizer(text + punct + punct + punct)
assert len(tokens) == 4
@ -76,14 +85,14 @@ def test_uk_tokenizer_splits_same_close_punct(uk_tokenizer, punct, text):
assert tokens[1].text == punct
@pytest.mark.parametrize('text', ["'Тест"])
@pytest.mark.parametrize("text", ["'Тест"])
def test_uk_tokenizer_splits_open_appostrophe(uk_tokenizer, text):
tokens = uk_tokenizer(text)
assert len(tokens) == 2
assert tokens[0].text == "'"
@pytest.mark.parametrize('text', ["Тест''"])
@pytest.mark.parametrize("text", ["Тест''"])
def test_uk_tokenizer_splits_double_end_quote(uk_tokenizer, text):
tokens = uk_tokenizer(text)
assert len(tokens) == 2
@ -91,10 +100,13 @@ def test_uk_tokenizer_splits_double_end_quote(uk_tokenizer, text):
assert len(tokens_punct) == 1
@pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED)
@pytest.mark.parametrize('text', ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"])
def test_uk_tokenizer_splits_open_close_punct(uk_tokenizer, punct_open,
punct_close, text):
@pytest.mark.parametrize("punct_open,punct_close", PUNCT_PAIRED)
@pytest.mark.parametrize(
"text", ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]
)
def test_uk_tokenizer_splits_open_close_punct(
uk_tokenizer, punct_open, punct_close, text
):
tokens = uk_tokenizer(punct_open + text + punct_close)
assert len(tokens) == 3
assert tokens[0].text == punct_open
@ -102,11 +114,14 @@ def test_uk_tokenizer_splits_open_close_punct(uk_tokenizer, punct_open,
assert tokens[2].text == punct_close
@pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED)
@pytest.mark.parametrize('punct_open2,punct_close2', [("`", "'")])
@pytest.mark.parametrize('text', ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"])
def test_uk_tokenizer_two_diff_punct(uk_tokenizer, punct_open, punct_close,
punct_open2, punct_close2, text):
@pytest.mark.parametrize("punct_open,punct_close", PUNCT_PAIRED)
@pytest.mark.parametrize("punct_open2,punct_close2", [("`", "'")])
@pytest.mark.parametrize(
"text", ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]
)
def test_uk_tokenizer_two_diff_punct(
uk_tokenizer, punct_open, punct_close, punct_open2, punct_close2, text
):
tokens = uk_tokenizer(punct_open2 + punct_open + text + punct_close + punct_close2)
assert len(tokens) == 5
assert tokens[0].text == punct_open2
@ -116,7 +131,9 @@ def test_uk_tokenizer_two_diff_punct(uk_tokenizer, punct_open, punct_close,
assert tokens[4].text == punct_close2
# Only one parametrization of ``text`` is kept: applying two ``parametrize``
# decorators for the same argument name is rejected by pytest as a duplicate.
@pytest.mark.parametrize(
    "text", ["Привет.", "Привіт.", "Ґелґотати.", "З'єднання.", "Єдність.", "їхні."]
)
def test_uk_tokenizer_splits_trailing_dot(uk_tokenizer, text):
    """Check that the second token of a word ending in a full stop is "."."""
    tokens = uk_tokenizer(text)
    assert tokens[1].text == "."

View File

@ -1,18 +1,14 @@
# coding: utf-8
"""Test that tokenizer exceptions are parsed correctly."""
from __future__ import unicode_literals
import pytest
# Only one parametrization of ``text,norms,lemmas`` is kept: applying the same
# argument names through two ``parametrize`` decorators is rejected by pytest.
@pytest.mark.parametrize(
    "text,norms,lemmas",
    [("ім.", ["імені"], ["ім'я"]), ("проф.", ["професор"], ["професор"])],
)
def test_uk_tokenizer_abbrev_exceptions(uk_tokenizer, text, norms, lemmas):
    """Check that an abbreviation stays a single token with the expected norm."""
    # NOTE(review): ``lemmas`` is parametrized but never asserted in this
    # body -- confirm whether a lemma check was intended.
    tokens = uk_tokenizer(text)
    assert len(tokens) == 1
    assert [token.norm_ for token in tokens] == norms

View File

@ -1,16 +1,16 @@
# coding: utf-8
from __future__ import unicode_literals
import json
from tempfile import NamedTemporaryFile
import pytest
from ...cli.train import train
def test_cli_trained_model_can_be_saved(tmpdir):
lang = 'nl'
lang = "nl"
output_dir = str(tmpdir)
train_file = NamedTemporaryFile('wb', dir=output_dir, delete=False)
train_file = NamedTemporaryFile("wb", dir=output_dir, delete=False)
train_corpus = [
{
"id": "identifier_0",
@ -26,7 +26,7 @@ def test_cli_trained_model_can_be_saved(tmpdir):
"head": 1,
"tag": "NOUN",
"orth": "Jan",
"ner": "B-PER"
"ner": "B-PER",
},
{
"id": 1,
@ -34,7 +34,7 @@ def test_cli_trained_model_can_be_saved(tmpdir):
"head": 0,
"tag": "VERB",
"orth": "houdt",
"ner": "O"
"ner": "O",
},
{
"id": 2,
@ -42,7 +42,7 @@ def test_cli_trained_model_can_be_saved(tmpdir):
"head": 1,
"tag": "ADP",
"orth": "van",
"ner": "O"
"ner": "O",
},
{
"id": 3,
@ -50,7 +50,7 @@ def test_cli_trained_model_can_be_saved(tmpdir):
"head": -2,
"tag": "NOUN",
"orth": "Marie",
"ner": "B-PER"
"ner": "B-PER",
},
{
"id": 4,
@ -58,7 +58,7 @@ def test_cli_trained_model_can_be_saved(tmpdir):
"head": -3,
"tag": "PUNCT",
"orth": ".",
"ner": "O"
"ner": "O",
},
{
"id": 5,
@ -66,18 +66,18 @@ def test_cli_trained_model_can_be_saved(tmpdir):
"head": -1,
"tag": "SPACE",
"orth": "\n",
"ner": "O"
"ner": "O",
},
],
"brackets": [],
}
],
"brackets": []
}
]
}
]
],
}
]
train_file.write(json.dumps(train_corpus).encode('utf-8'))
train_file.write(json.dumps(train_corpus).encode("utf-8"))
train_file.close()
train_data = train_file.name
dev_data = train_data

View File

@ -155,6 +155,14 @@ def test_issue1758(en_tokenizer):
assert tokens[1].lemma_ == "have"
def test_issue1773(en_tokenizer):
    """Regression test for #1773: a token with POS "SPACE" must not have an
    empty TAG (the root cause of the reported serialization issue)."""
    first_token = en_tokenizer("\n")[0]
    # Nothing to verify unless the tokenizer assigned the SPACE part of speech.
    if first_token.pos_ != "SPACE":
        return
    assert first_token.tag_ != ""
def test_issue1799():
"""Test sentence boundaries are deserialized correctly, even for
non-projective sentences."""
@ -249,8 +257,8 @@ def test_issue1945():
def test_issue1963(en_tokenizer):
"""Test that doc.merge() resizes doc.tensor"""
doc = en_tokenizer('a b c d')
doc.tensor = numpy.ones((len(doc), 128), dtype='f')
doc = en_tokenizer("a b c d")
doc.tensor = numpy.ones((len(doc), 128), dtype="f")
with doc.retokenize() as retokenizer:
retokenizer.merge(doc[0:2])
assert len(doc) == 3

View File

@ -1,9 +0,0 @@
from __future__ import unicode_literals
def test_issue1773(en_tokenizer):
    """Regression test for #1773: a token with POS 'SPACE' must not have an
    empty TAG (the root cause of the reported serialization issue)."""
    newline_token = en_tokenizer('\n')[0]
    # Nothing to verify unless the tokenizer assigned the SPACE part of speech.
    if newline_token.pos_ != 'SPACE':
        return
    assert newline_token.tag_ != ""

View File

@ -6,8 +6,9 @@ from spacy.tokens import Doc
from spacy.displacy import render
from spacy.gold import iob_to_biluo
from spacy.lang.it import Italian
import numpy
from ..util import add_vecs_to_vocab
from ..util import add_vecs_to_vocab, get_doc
@pytest.mark.xfail
@ -69,6 +70,26 @@ def test_issue2385_biluo(tags):
assert iob_to_biluo(tags) == list(tags)
def test_issue2396(en_vocab):
    """Check get_lca_matrix() of a doc and of its full slice against a fixed
    expected matrix."""
    tokens = ["She", "created", "a", "test", "for", "spacy"]
    rel_heads = [1, 0, 1, -2, -1, -1]
    expected_rows = [
        [0, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1],
        [1, 1, 2, 3, 3, 3],
        [1, 1, 3, 3, 3, 3],
        [1, 1, 3, 3, 4, 4],
        [1, 1, 3, 3, 4, 5],
    ]
    expected = numpy.array(expected_rows, dtype=numpy.int32)
    doc = get_doc(en_vocab, words=tokens, heads=rel_heads)
    # The same matrix is expected for the document and for the slice doc[:].
    for container in (doc, doc[:]):
        assert (container.get_lca_matrix() == expected).all()
def test_issue2482():
"""Test we can serialize and deserialize a blank NER or parser model."""
nlp = Italian()

View File

@ -1,35 +0,0 @@
# coding: utf-8
from __future__ import unicode_literals
from ..util import get_doc
import pytest
import numpy
@pytest.mark.parametrize(
    "sentence,heads,matrix",
    [
        (
            "She created a test for spacy",
            [1, 0, 1, -2, -1, -1],
            numpy.array(
                [
                    [0, 1, 1, 1, 1, 1],
                    [1, 1, 1, 1, 1, 1],
                    [1, 1, 2, 3, 3, 3],
                    [1, 1, 3, 3, 3, 3],
                    [1, 1, 3, 3, 4, 4],
                    [1, 1, 3, 3, 4, 5],
                ],
                dtype=numpy.int32,
            ),
        )
    ],
)
def test_issue2396(en_tokenizer, sentence, heads, matrix):
    """Check get_lca_matrix() of a doc and of its full slice against the
    parametrized expected matrix."""
    tokenized = en_tokenizer(sentence)
    word_texts = [tok.text for tok in tokenized]
    doc = get_doc(tokenized.vocab, word_texts, heads=heads)
    # The same matrix is expected for the document and for the slice doc[:].
    for container in (doc, doc[:]):
        assert (container.get_lca_matrix() == matrix).all()

View File

@ -1,14 +1,10 @@
# coding: utf8
from __future__ import unicode_literals
import pytest
from spacy.lang.en import English
def test_issue2754(en_tokenizer):
    """Test that words like 'a' and 'am' don't get exceptional norm values.

    Each text is tokenized with the ``en_tokenizer`` fixture and the norm of
    its first token must equal the text itself.
    """
    a = en_tokenizer("a")
    assert a[0].norm_ == "a"
    am = en_tokenizer("am")
    assert am[0].norm_ == "am"

View File

@ -9,4 +9,3 @@ def test_issue2835(en_tokenizer):
"""
doc = en_tokenizer(text)
assert doc

View File

@ -2,26 +2,24 @@
from __future__ import unicode_literals
import numpy
from spacy.vectors import Vectors
from spacy.vocab import Vocab
from spacy.tokens import Doc
from spacy._ml import link_vectors_to_models
def test_issue2871():
    """Test that vectors recover the correct key for spaCy reserved words."""
    words = ["dog", "cat", "SUFFIX"]
    vocab = Vocab()
    vocab.vectors.resize(shape=(3, 10))
    vector_data = numpy.zeros((3, 10), dtype="f")
    for word in words:
        # Look the word up in the vocab before assigning its vector; the
        # result itself is not needed.
        _ = vocab[word]  # noqa: F841
        vocab.set_vector(word, vector_data[0])
    vocab.vectors.name = "dummy_vectors"
    link_vectors_to_models(vocab)
    # Each word's rank is expected to match its position in `words` ...
    for position, word in enumerate(words):
        assert vocab[word].rank == position
    # ... and so is the index returned by the vectors table lookup.
    for position, word in enumerate(words):
        assert vocab.vectors.find(key=word) == position

View File

@ -58,9 +58,10 @@ def test_issue3009(doc, matcher, pattern):
matches = matcher(doc)
assert matches
def test_issue2464(matcher):
    """Test problem with successive ?. This is the same bug, so putting it here."""
    doc = Doc(matcher.vocab, words=["a", "b"])
    # Register the pattern exactly once. NOTE(review): adding the same key a
    # second time presumably adds another copy of the pattern and would alter
    # the match count asserted below -- confirm against Matcher.add.
    matcher.add("4", None, [{"OP": "?"}, {"OP": "?"}])
    matches = matcher(doc)
    assert len(matches) == 3

View File

@ -1,8 +1,6 @@
# coding: utf8
from __future__ import unicode_literals
import pytest
from ...attrs import ENT_IOB, ENT_TYPE
from ...tokens import Doc
from ..util import get_doc
@ -30,4 +28,4 @@ def test_issue3012(en_vocab):
# serializing then deserializing
doc_bytes = doc.to_bytes()
doc2 = Doc(en_vocab).from_bytes(doc_bytes)
assert (doc[2].text, doc[2].pos_, doc[2].tag_, doc[2].ent_type_) == expected
assert (doc2[2].text, doc2[2].pos_, doc2[2].tag_, doc2[2].ent_type_) == expected

View File

@ -1,10 +0,0 @@
from __future__ import unicode_literals
import pytest
import spacy
@pytest.mark.models('fr')
def test_issue1959(FR):
    """Call the ``FR`` fixture on two sample texts; there are no assertions,
    so the test passes as long as neither call raises."""
    samples = ('Je suis la mauvaise herbe', "Me, myself and moi")
    for sample in samples:
        FR(sample)