Tidy up and fix small bugs and typos

Ines Montani 2019-02-08 14:14:49 +01:00
parent 9e652afa4b
commit 25602c794c
47 changed files with 751 additions and 933 deletions

View File

@@ -8,15 +8,14 @@ import time
from collections import Counter
from pathlib import Path
from thinc.v2v import Affine, Maxout
-from thinc.api import wrap, layerize
from thinc.misc import LayerNorm as LN
-from thinc.neural.util import prefer_gpu, get_array_module
+from thinc.neural.util import prefer_gpu
from wasabi import Printer
import srsly

from ..tokens import Doc
from ..attrs import ID, HEAD
-from .._ml import Tok2Vec, flatten, chain, zero_init, create_default_optimizer
+from .._ml import Tok2Vec, flatten, chain, create_default_optimizer
from .._ml import masked_language_model
from .. import util

@@ -136,7 +135,7 @@ def pretrain(
        random.shuffle(texts)


-def make_update(model, docs, optimizer, drop=0.0, objective='L2'):
+def make_update(model, docs, optimizer, drop=0.0, objective="L2"):
    """Perform an update over a single batch of documents.
    docs (iterable): A batch of `Doc` objects.

@@ -171,7 +170,7 @@ def make_docs(nlp, batch, min_length=1, max_length=500):
    return docs


-def get_vectors_loss(ops, docs, prediction, objective='L2'):
+def get_vectors_loss(ops, docs, prediction, objective="L2"):
    """Compute a mean-squared error loss between the documents' vectors and
    the prediction.

@@ -185,9 +184,9 @@ def get_vectors_loss(ops, docs, prediction, objective='L2'):
    # and look them up all at once. This prevents data copying.
    ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
    target = docs[0].vocab.vectors.data[ids]
-    if objective == 'L2':
+    if objective == "L2":
        d_scores = prediction - target
-        loss = (d_scores**2).sum()
+        loss = (d_scores ** 2).sum()
    else:
        raise NotImplementedError(objective)
    return loss, d_scores

@@ -201,8 +200,7 @@ def create_pretraining_model(nlp, tok2vec):
    """
    output_size = nlp.vocab.vectors.data.shape[1]
    output_layer = chain(
-        LN(Maxout(300, pieces=3)),
-        Affine(output_size, drop_factor=0.0),
+        LN(Maxout(300, pieces=3)), Affine(output_size, drop_factor=0.0)
    )
    # This is annoying, but the parser etc have the flatten step after
    # the tok2vec. To load the weights in cleanly, we need to match
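For reference, the L2 objective in get_vectors_loss() above reduces to a squared-error loss whose gradient is just the difference between prediction and target. A minimal standalone sketch with NumPy arrays standing in for the Thinc ops and the document vectors (illustrative only, not spaCy code):

import numpy as np

prediction = np.array([[0.1, 0.2], [0.3, 0.4]])  # model output, one row per token
target = np.array([[0.0, 0.2], [0.5, 0.4]])      # pretrained vectors for the same tokens

d_scores = prediction - target   # gradient of the squared-error loss
loss = (d_scores ** 2).sum()     # scalar loss reported during pretraining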

View File

@@ -13,13 +13,7 @@ RENDER_WRAPPER = None


def render(
-    docs,
-    style="dep",
-    page=False,
-    minify=False,
-    jupyter=False,
-    options={},
-    manual=False,
+    docs, style="dep", page=False, minify=False, jupyter=False, options={}, manual=False
):
    """Render displaCy visualisation.

@@ -80,7 +74,7 @@ def serve(
    """
    from wsgiref import simple_server

-    if IS_JUPYTER:
+    if is_in_jupyter():
        user_warning(Warnings.W011)
    render(docs, style=style, page=page, minify=minify, options=options, manual=manual)
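For context, the render() signature shown above is used like this (a usage sketch; it assumes a trained pipeline such as en_core_web_sm is installed):

import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Autonomous cars shift insurance liability toward manufacturers.")
# Returns the rendered markup as a string; jupyter=True would display it inline instead.
html = displacy.render(doc, style="dep", page=True, minify=True)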

View File

@@ -1,8 +1,9 @@
# coding: utf8
from __future__ import unicode_literals

-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
-from ..char_classes import CONCAT_QUOTES, CONCAT_ICONS, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER
+from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CONCAT_QUOTES
+from ..char_classes import CONCAT_ICONS, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER

# removing ° from the special icons to keep e.g. 99° as one token
_concat_icons = CONCAT_ICONS.replace("\u00B0", "")

@@ -29,7 +30,9 @@ _suffixes = (
        r"(?<=°[FfCcKk])\.",
        r"(?<=[0-9])(?:[{c}])".format(c=_currency),
        r"(?<=[0-9])(?:{u})".format(u=UNITS),
-        r"(?<=[{al}{e}{q}(?:{c})])\.".format(al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, c=_currency),
+        r"(?<=[{al}{e}{q}(?:{c})])\.".format(
+            al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, c=_currency
+        ),
        r"(?<=[{al})])-e".format(al=ALPHA_LOWER),
    ]
)

@@ -40,7 +43,7 @@ _infixes = (
    + [
        r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
        r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
-        r'(?<=[{a}])[:<>=](?=[{a}])'.format(a=ALPHA),
+        r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA),
        r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
        r"(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])".format(a=ALPHA, q=_quotes),

View File

@@ -5,24 +5,24 @@ import re
from collections import namedtuple

from .tag_map import TAG_MAP

from ...attrs import LANG
from ...language import Language
from ...tokens import Doc, Token
from ...util import DummyTokenizer

ShortUnitWord = namedtuple("ShortUnitWord", ["surface", "lemma", "pos"])

+# TODO: Is this the right place for this?
+Token.set_extension("mecab_tag", default=None)


def try_mecab_import():
    """Mecab is required for Japanese support, so check for it.
    It it's not available blow up and explain how to fix it."""
    try:
        import MeCab

-        # XXX Is this the right place for this?
-        Token.set_extension("mecab_tag", default=None)
        return MeCab
    except ImportError:
        raise ImportError(

@@ -33,14 +33,13 @@ def try_mecab_import():

def resolve_pos(token):
    """If necessary, add a field to the POS tag for UD mapping.
    Under Universal Dependencies, sometimes the same Unidic POS tag can
    be mapped differently depending on the literal token or its context
    in the sentence. This function adds information to the POS tag to
    resolve ambiguous mappings.
    """
-    # NOTE: This is a first take. The rules here are crude approximations.
+    # TODO: This is a first take. The rules here are crude approximations.
    # For many of these, full dependencies are needed to properly resolve
    # PoS mappings.

@@ -56,7 +55,7 @@ def resolve_pos(token):

def detailed_tokens(tokenizer, text):
    """Format Mecab output into a nice data structure, based on Janome."""
+    tokenizer.parse(text)
    node = tokenizer.parseToNode(text)
    node = node.next  # first node is beginning of sentence and empty, skip it
    words = []

@@ -98,62 +97,15 @@ class JapaneseTokenizer(DummyTokenizer):
        return doc


-class JapaneseCharacterSegmenter(object):
-    def __init__(self, vocab):
-        self.vocab = vocab
-        self._presegmenter = self._make_presegmenter(self.vocab)
-
-    def _make_presegmenter(self, vocab):
-        rules = Japanese.Defaults.tokenizer_exceptions
-        token_match = Japanese.Defaults.token_match
-        prefix_search = (
-            util.compile_prefix_regex(Japanese.Defaults.prefixes).search
-            if Japanese.Defaults.prefixes
-            else None
-        )
-        suffix_search = (
-            util.compile_suffix_regex(Japanese.Defaults.suffixes).search
-            if Japanese.Defaults.suffixes
-            else None
-        )
-        infix_finditer = (
-            util.compile_infix_regex(Japanese.Defaults.infixes).finditer
-            if Japanese.Defaults.infixes
-            else None
-        )
-        return Tokenizer(
-            vocab,
-            rules=rules,
-            prefix_search=prefix_search,
-            suffix_search=suffix_search,
-            infix_finditer=infix_finditer,
-            token_match=token_match,
-        )
-
-    def __call__(self, text):
-        words = []
-        spaces = []
-        doc = self._presegmenter(text)
-        for token in doc:
-            words.extend(list(token.text))
-            spaces.extend([False] * len(token.text))
-            spaces[-1] = bool(token.whitespace_)
-        return Doc(self.vocab, words=words, spaces=spaces)
-
-
class JapaneseDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda _text: "ja"
    tag_map = TAG_MAP
-    use_janome = True

    @classmethod
    def create_tokenizer(cls, nlp=None):
-        if cls.use_janome:
-            return JapaneseTokenizer(cls, nlp)
-        else:
-            return JapaneseCharacterSegmenter(nlp.vocab)
+        return JapaneseTokenizer(cls, nlp)


class Japanese(Language):
class Japanese(Language): class Japanese(Language):

View File

@@ -2,10 +2,10 @@
from __future__ import unicode_literals

from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+from .punctuation import TOKENIZER_INFIXES
from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
-from .punctuation import TOKENIZER_INFIXES
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS

@@ -22,9 +22,9 @@ class PolishDefaults(Language.Defaults):
        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
    )
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    infixes = tuple(TOKENIZER_INFIXES)
    stop_words = STOP_WORDS
    tag_map = TAG_MAP
+    infixes = TOKENIZER_INFIXES


class Polish(Language):

View File

@@ -1,14 +1,22 @@
# coding: utf8
from __future__ import unicode_literals

-from ..char_classes import LIST_ELLIPSES, LIST_ICONS
-from ..char_classes import QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
-
-_quotes = QUOTES.replace("'", '')
-
-_infixes = (LIST_ELLIPSES + LIST_ICONS +
-            [r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
-             r'(?<=[{a}])[,!?](?=[{a}])'.format(a=ALPHA),
-             r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),
-             r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
-             r'(?<=[{a}])([{q}\)\]\(\[])(?=[\{a}])'.format(a=ALPHA, q=_quotes),
-             r'(?<=[{a}])--(?=[{a}])'.format(a=ALPHA)])
+from ..char_classes import LIST_ELLIPSES, CONCAT_ICONS
+from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
+
+_quotes = CONCAT_QUOTES.replace("'", "")
+
+_infixes = (
+    LIST_ELLIPSES
+    + [CONCAT_ICONS]
+    + [
+        r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
+        r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])".format(a=ALPHA, q=CONCAT_QUOTES),
+    ]
+)

TOKENIZER_INFIXES = _infixes

View File

@@ -2,6 +2,7 @@
from __future__ import unicode_literals

from ._tokenizer_exceptions_list import PL_BASE_EXCEPTIONS
+from ...symbols import POS, ADV, NOUN, ORTH, LEMMA, ADJ

_exc = {}

View File

@@ -6,7 +6,9 @@ from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS
from .morph_rules import MORPH_RULES
from .lemmatizer import LEMMA_RULES, LOOKUP
-from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
+
+# Punctuation stolen from Danish
+from ..da.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS

@@ -31,6 +33,7 @@ class SwedishDefaults(Language.Defaults):
    lemma_lookup = LOOKUP
    morph_rules = MORPH_RULES


class Swedish(Language):
    lang = "sv"
    Defaults = SwedishDefaults

View File

@@ -1,25 +0,0 @@
-# coding: utf8
-"""Punctuation stolen from Danish"""
-from __future__ import unicode_literals
-
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS
-from ..char_classes import QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
-from ..punctuation import TOKENIZER_SUFFIXES
-
-_quotes = QUOTES.replace("'", '')
-
-_infixes = (LIST_ELLIPSES + LIST_ICONS +
-            [r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
-             r'(?<=[{a}])[,!?](?=[{a}])'.format(a=ALPHA),
-             r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),
-             r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
-             r'(?<=[{a}])([{q}\)\]\(\[])(?=[\{a}])'.format(a=ALPHA, q=_quotes),
-             r'(?<=[{a}])--(?=[{a}])'.format(a=ALPHA)])
-
-_suffixes = [suffix for suffix in TOKENIZER_SUFFIXES if suffix not in ["'s", "'S", "s", "S", r"\'"]]
-_suffixes += [r"(?<=[^sSxXzZ])\'"]
-
-TOKENIZER_INFIXES = _infixes
-TOKENIZER_SUFFIXES = _suffixes

View File

@@ -1,169 +1,191 @@
# coding: utf8
-"""
-Tag mappings according to https://universaldependencies.org/tagset-conversion/sv-suc-uposf.html
-for https://github.com/UniversalDependencies/UD_Swedish-Talbanken
-"""
from __future__ import unicode_literals

-from ...symbols import POS, PUNCT, ADJ, CONJ, CCONJ, SCONJ, SYM, NUM, DET, ADV, ADP, X, VERB
-from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON, AUX
+from ...symbols import POS, PUNCT, ADJ, CCONJ, SCONJ, NUM, DET, ADV
+from ...symbols import ADP, X, VERB, NOUN, PROPN, PART, INTJ, PRON

+# Tag mappings according to https://universaldependencies.org/tagset-conversion/sv-suc-uposf.html
+# for https://github.com/UniversalDependencies/UD_Swedish-Talbanken

TAG_MAP = {
    "AB": {POS: ADV},  # inte, också, så, bara, nu
    "AB|AN": {POS: ADV},  # t.ex., ca, t_ex, bl.a., s_k
    "AB|KOM": {POS: ADV},  # mer, tidigare, mindre, vidare, mera
    "AB|POS": {POS: ADV},  # mycket, helt, ofta, länge, långt
    "AB|SMS": {POS: ADV},  # över-, in-
    "AB|SUV": {POS: ADV},  # minst, mest, högst, främst, helst
    "DT|MAS|SIN|DEF": {POS: DET},
    "DT|MAS|SIN|IND": {POS: DET},
    "DT|NEU|SIN|DEF": {POS: DET},  # det, detta
    "DT|NEU|SIN|IND": {POS: DET},  # ett, något, inget, vart, vartannat
    "DT|NEU|SIN|IND/DEF": {POS: DET},  # allt
    "DT|UTR/NEU|PLU|DEF": {POS: DET},  # de, dessa, bägge, dom
    "DT|UTR/NEU|PLU|IND": {POS: DET},  # några, inga
    "DT|UTR/NEU|PLU|IND/DEF": {POS: DET},  # alla
    "DT|UTR/NEU|SIN/PLU|IND": {POS: DET},  # samma
    "DT|UTR/NEU|SIN|DEF": {POS: DET},  # vardera
    "DT|UTR/NEU|SIN|IND": {POS: DET},  # varje, varenda
    "DT|UTR|SIN|DEF": {POS: DET},  # den, denna
    "DT|UTR|SIN|IND": {POS: DET},  # en, någon, ingen, var, varannan
    "DT|UTR|SIN|IND/DEF": {POS: DET},  # all
    "HA": {POS: ADV},  # när, där, hur, som, då
    "HD|NEU|SIN|IND": {POS: DET},  # vilket
    "HD|UTR/NEU|PLU|IND": {POS: DET},  # vilka
    "HD|UTR|SIN|IND": {POS: DET},  # vilken
    "HP|-|-|-": {POS: PRON},  # som
    "HP|NEU|SIN|IND": {POS: PRON},  # vad, vilket
    "HP|NEU|SIN|IND|SMS": {POS: PRON},
    "HP|UTR/NEU|PLU|IND": {POS: PRON},  # vilka
    "HP|UTR|SIN|IND": {POS: PRON},  # vilken, vem
    "HS|DEF": {POS: DET},  # vars, vilkas, Vems
    "IE": {POS: PART},  # att
    "IN": {POS: INTJ},  # Jo, ja, nej, fan, visst
    "JJ|AN": {POS: ADJ},  # ev, S:t, Kungl, Kungl., Teol
    "JJ|KOM|UTR/NEU|SIN/PLU|IND/DEF|GEN": {POS: ADJ},  # äldres
    "JJ|KOM|UTR/NEU|SIN/PLU|IND/DEF|NOM": {POS: ADJ},  # större, högre, mindre, bättre, äldre
    "JJ|KOM|UTR/NEU|SIN/PLU|IND/DEF|SMS": {POS: ADJ},
    "JJ|POS|MAS|SIN|DEF|GEN": {POS: ADJ},  # enskildes, sjukes, andres
    "JJ|POS|MAS|SIN|DEF|NOM": {POS: ADJ},  # enskilde, sjuke, andre, unge, ene
    "JJ|POS|NEU|SIN|IND/DEF|NOM": {POS: ADJ},  # eget
    "JJ|POS|NEU|SIN|IND|GEN": {POS: ADJ},
    "JJ|POS|NEU|SIN|IND|NOM": {POS: ADJ},  # annat, svårt, möjligt, nytt, sådant
    "JJ|POS|UTR/NEU|PLU|IND/DEF|GEN": {POS: ADJ},  # ogiftas, ungas, frånskildas, efterkommandes, färgblindas
    "JJ|POS|UTR/NEU|PLU|IND/DEF|NOM": {POS: ADJ},  # olika, andra, många, stora, vissa
    "JJ|POS|UTR/NEU|PLU|IND|NOM": {POS: ADJ},  # flera, sådana, fler, få, samtliga
    "JJ|POS|UTR/NEU|SIN/PLU|IND|NOM": {POS: ADJ},
    "JJ|POS|UTR/NEU|SIN/PLU|IND/DEF|NOM": {POS: ADJ},  # bra, ena, enda, nästa, ringa
    "JJ|POS|UTR/NEU|SIN|DEF|GEN": {POS: ADJ},
    "JJ|POS|UTR/NEU|SIN|DEF|NOM": {POS: ADJ},  # hela, nya, andra, svenska, ekonomiska
    "JJ|POS|UTR|-|-|SMS": {POS: ADJ},  # fri-, låg-, sexual-
    "JJ|POS|UTR|SIN|IND/DEF|NOM": {POS: ADJ},  # egen
    "JJ|POS|UTR|SIN|IND|GEN": {POS: ADJ},  # enskilds
    "JJ|POS|UTR|SIN|IND|NOM": {POS: ADJ},  # stor, annan, själv, sådan, viss
    "JJ|SUV|MAS|SIN|DEF|GEN": {POS: ADJ},
    "JJ|SUV|MAS|SIN|DEF|NOM": {POS: ADJ},  # störste, främste, äldste, minste
    "JJ|SUV|UTR/NEU|PLU|DEF|NOM": {POS: ADJ},  # flesta
    "JJ|SUV|UTR/NEU|PLU|IND|NOM": {POS: ADJ},
    "JJ|SUV|UTR/NEU|SIN/PLU|DEF|NOM": {POS: ADJ},  # bästa, största, närmaste, viktigaste, högsta
    "JJ|SUV|UTR/NEU|SIN/PLU|IND|NOM": {POS: ADJ},  # störst, bäst, tidigast, högst, fattigast
    "KN": {POS: CCONJ},  # och, eller, som, än, men
    "KN|AN": {POS: CCONJ},
    "MAD": {POS: PUNCT},  # ., ?, :, !, ...
    "MID": {POS: PUNCT},  # ,, -, :, *, ;
    "NN|-|-|-|-": {POS: NOUN},  # godo, fjol, fullo, somras, måtto
    "NN|AN": {POS: NOUN},  # kr, %, s., dr, kap.
    "NN|NEU|-|-|-": {POS: NOUN},
    "NN|NEU|-|-|SMS": {POS: NOUN},  # yrkes-, barn-, hem-, fack-, vatten-
    "NN|NEU|PLU|DEF|GEN": {POS: NOUN},  # barnens, årens, u-ländernas, företagens, århundradenas
    "NN|NEU|PLU|DEF|NOM": {POS: NOUN},  # barnen, u-länderna, åren, länderna, könen
    "NN|NEU|PLU|IND|GEN": {POS: NOUN},  # slags, års, barns, länders, tusentals
    "NN|NEU|PLU|IND|NOM": {POS: NOUN},  # barn, år, fall, länder, problem
    "NN|NEU|SIN|DEF|GEN": {POS: NOUN},  # äktenskapets, samhällets, barnets, 1800-talets, 1960-talets
    "NN|NEU|SIN|DEF|NOM": {POS: NOUN},  # äktenskapet, samhället, barnet, stället, hemmet
    "NN|NEU|SIN|IND|GEN": {POS: NOUN},  # års, slags, lands, havs, företags
    "NN|NEU|SIN|IND|NOM": {POS: NOUN},  # år, arbete, barn, sätt, äktenskap
    "NN|SMS": {POS: NOUN},  # PCB-, Syd-
    "NN|UTR|-|-|-": {POS: NOUN},  # dags, rätta
    "NN|UTR|-|-|SMS": {POS: NOUN},  # far-, kibbutz-, röntgen-, barna-, hälso-
    "NN|UTR|PLU|DEF|GEN": {POS: NOUN},  # föräldrarnas, kvinnornas, elevernas, kibbutzernas, makarnas
    "NN|UTR|PLU|DEF|NOM": {POS: NOUN},  # kvinnorna, föräldrarna, makarna, männen, hyrorna
    "NN|UTR|PLU|IND|GEN": {POS: NOUN},  # människors, kvinnors, dagars, tiders, månaders
    "NN|UTR|PLU|IND|NOM": {POS: NOUN},  # procent, människor, kvinnor, miljoner, kronor
    "NN|UTR|SIN|DEF|GEN": {POS: NOUN},  # kvinnans, världens, familjens, dagens, jordens
    "NN|UTR|SIN|DEF|NOM": {POS: NOUN},  # familjen, kvinnan, mannen, världen, skolan
    "NN|UTR|SIN|IND|GEN": {POS: NOUN},  # sorts, medelålders, makes, kvinnas, veckas
    "NN|UTR|SIN|IND|NOM": {POS: NOUN},  # del, tid, dag, fråga, man
    "PAD": {POS: PUNCT},  # , ), (
    "PC|AN": {POS: VERB},
    "PC|PRF|MAS|SIN|DEF|GEN": {POS: VERB},  # avlidnes
    "PC|PRF|MAS|SIN|DEF|NOM": {POS: VERB},
    "PC|PRF|NEU|SIN|IND|NOM": {POS: VERB},  # taget, sett, särskilt, förbjudet, ökat
    "PC|PRF|UTR/NEU|PLU|IND/DEF|GEN": {POS: VERB},  # försäkrades, anställdas
    "PC|PRF|UTR/NEU|PLU|IND/DEF|NOM": {POS: VERB},  # särskilda, gifta, ökade, handikappade, skilda
    "PC|PRF|UTR/NEU|SIN|DEF|GEN": {POS: VERB},
    "PC|PRF|UTR/NEU|SIN|DEF|NOM": {POS: VERB},  # ökade, gifta, nämnda, nedärvda, dolda
    "PC|PRF|UTR|SIN|IND|GEN": {POS: VERB},
    "PC|PRF|UTR|SIN|IND|NOM": {POS: VERB},  # särskild, ökad, beredd, gift, oförändrad
    "PC|PRS|UTR/NEU|SIN/PLU|IND/DEF|GEN": {POS: VERB},  # studerandes, sammanboendes, dubbelarbetandes
    "PC|PRS|UTR/NEU|SIN/PLU|IND/DEF|NOM": {POS: VERB},  # följande, beroende, nuvarande, motsvarande, liknande
    "PL": {POS: PART},  # ut, upp, in, till, med
    "PL|SMS": {POS: PART},
    "PM": {POS: PROPN},  # F, N, Liechtenstein, Danmark, DK
    "PM|GEN": {POS: PROPN},  # Sveriges, EEC:s, Guds, Stockholms, Kristi
    "PM|NOM": {POS: PROPN},  # Sverige, EEC, Stockholm, USA, ATP
    "PM|SMS": {POS: PROPN},  # Göteborgs-, Nord-, Väst-
    "PN|MAS|SIN|DEF|SUB/OBJ": {POS: PRON},  # denne
    "PN|NEU|SIN|DEF|SUB/OBJ": {POS: PRON},  # det, detta, detsamma
    "PN|NEU|SIN|IND|SUB/OBJ": {POS: PRON},  # något, allt, mycket, annat, ingenting
    "PN|UTR/NEU|PLU|DEF|OBJ": {POS: PRON},  # dem, varandra, varann
    "PN|UTR/NEU|PLU|DEF|SUB": {POS: PRON},  # de, bägge
    "PN|UTR/NEU|PLU|DEF|SUB/OBJ": {POS: PRON},  # dessa, dom, båda, den, bådadera
    "PN|UTR/NEU|PLU|IND|SUB/OBJ": {POS: PRON},  # andra, alla, många, sådana, några
    "PN|UTR/NEU|SIN/PLU|DEF|OBJ": {POS: PRON},  # sig, sej
    "PN|UTR|PLU|DEF|OBJ": {POS: PRON},  # oss, er, eder
    "PN|UTR|PLU|DEF|SUB": {POS: PRON},  # vi
    "PN|UTR|SIN|DEF|OBJ": {POS: PRON},  # dig, mig, henne, honom, Er
    "PN|UTR|SIN|DEF|SUB": {POS: PRON},  # du, han, hon, jag, ni
    "PN|UTR|SIN|DEF|SUB/OBJ": {POS: PRON},  # den, denna, densamma
    "PN|UTR|SIN|IND|SUB": {POS: PRON},  # man
    "PN|UTR|SIN|IND|SUB/OBJ": {POS: PRON},  # en, var, någon, ingen, Varannan
    "PP": {POS: ADP},  # i, av, på, för, till
    "PP|AN": {POS: ADP},  # f
    "PS|AN": {POS: DET},
    "PS|NEU|SIN|DEF": {POS: DET},  # sitt, vårt, ditt, mitt, ert
    "PS|UTR/NEU|PLU|DEF": {POS: DET},  # sina, våra, dina, mina
    "PS|UTR/NEU|SIN/PLU|DEF": {POS: DET},  # deras, dess, hans, hennes, varandras
    "PS|UTR|SIN|DEF": {POS: DET},  # sin, vår, din, min, er
    "RG": {POS: NUM},  # 2, 17, 20, 1, 18
    "RG|GEN": {POS: NUM},
    "RG|MAS|SIN|DEF|NOM": {POS: NUM},
    "RG|NEU|SIN|IND|NOM": {POS: NUM},  # ett
    "RG|NOM": {POS: NUM},  # två, tre, 1, 20, 2
    "RG|SMS": {POS: NUM},  # ett-, 1950-, två-, tre-, 1700-
    "RG|UTR/NEU|SIN|DEF|NOM": {POS: NUM},
    "RG|UTR|SIN|IND|NOM": {POS: NUM},  # en
    "RO|MAS|SIN|IND/DEF|GEN": {POS: ADJ},
    "RO|MAS|SIN|IND/DEF|NOM": {POS: ADJ},  # förste
    "RO|GEN": {POS: ADJ},
    "RO|NOM": {POS: ADJ},  # första, andra, tredje, fjärde, femte
    "SN": {POS: SCONJ},  # att, om, innan, eftersom, medan
    "UO": {POS: X},  # companionship, vice, versa, family, capita
    "VB|AN": {POS: VERB},  # jfr
    "VB|IMP|AKT": {POS: VERB},  # se, Diskutera, låt, Läs, Gå
    "VB|IMP|SFO": {POS: VERB},  # tas
    "VB|INF|AKT": {POS: VERB},  # vara, få, ha, bli, kunna
    "VB|INF|SFO": {POS: VERB},  # användas, finnas, göras, tas, ses
    "VB|KON|PRS|AKT": {POS: VERB},  # vare, Gånge
    "VB|KON|PRT|AKT": {POS: VERB},  # vore, finge
    "VB|KON|PRT|SFO": {POS: VERB},
    "VB|PRS|AKT": {POS: VERB},  # är, har, kan, får, måste
    "VB|PRS|SFO": {POS: VERB},  # finns, kallas, behövs, beräknas, används
    "VB|PRT|AKT": {POS: VERB},  # skulle, var, hade, kunde, fick
    "VB|PRT|SFO": {POS: VERB},  # fanns, gjordes, höjdes, användes, infördes
    "VB|SMS": {POS: VERB},  # läs-
    "VB|SUP|AKT": {POS: VERB},  # varit, fått, blivit, haft, kommit
    "VB|SUP|SFO": {POS: VERB},  # nämnts, gjorts, förändrats, sagts, framhållits
}

View File

@@ -144,7 +144,7 @@ ABBREVIATIONS = [
# Add abbreviation for trailing punctuation too. If the abbreviation already has a trailing punctuation - skip it.
for abbr in ABBREVIATIONS:
-    if abbr.endswith(".") == False:
+    if not abbr.endswith("."):
        ABBREVIATIONS.append(abbr + ".")

for orth in ABBREVIATIONS:
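The loop above (shown only in part) simply extends the abbreviation list with dotted variants. A standalone sketch of the same pattern, iterating over a copy of the list for clarity and using made-up sample entries:

ABBREVIATIONS = ["dr", "prof.", "inż"]
for abbr in list(ABBREVIATIONS):
    if not abbr.endswith("."):
        ABBREVIATIONS.append(abbr + ".")
print(ABBREVIATIONS)  # ['dr', 'prof.', 'inż', 'dr.', 'inż.']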

View File

@@ -4,16 +4,15 @@ from __future__ import unicode_literals
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS

-from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...attrs import LANG
-from ...util import update_exc


class TamilDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: "ta"
    lex_attr_getters.update(LEX_ATTRS)
+    stop_words = STOP_WORDS


class Tamil(Language):

View File

@@ -4,70 +4,33 @@ from __future__ import unicode_literals
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
-
-# uncomment if files are available
-# from .norm_exceptions import NORM_EXCEPTIONS
-from .tag_map import TAG_MAP
-# from .morph_rules import MORPH_RULES
-
-# uncomment if lookup-based lemmatizer is available
from .lemmatizer import LOOKUP
-# from ...lemmatizerlookup import Lemmatizer
-
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups


def _return_tl(_):
-    return 'tl'
+    return "tl"


-# Create a Language subclass
-# Documentation: https://spacy.io/docs/usage/adding-languages
-# This file should be placed in spacy/lang/xx (ISO code of language).
-# Before submitting a pull request, make sure the remove all comments from the
-# language data files, and run at least the basic tokenizer tests. Simply add the
-# language ID to the list of languages in spacy/tests/conftest.py to include it
-# in the basic tokenizer sanity tests. You can optionally add a fixture for the
-# language's tokenizer and add more specific tests. For more info, see the
-# tests documentation: https://github.com/explosion/spaCy/tree/master/spacy/tests
-
-
class TagalogDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = _return_tl # ISO code
-    # add more norm exception dictionaries here
-    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
-    # overwrite functions for lexical attributes
+    lex_attr_getters[LANG] = _return_tl
+    lex_attr_getters[NORM] = add_lookups(
+        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
+    )
    lex_attr_getters.update(LEX_ATTRS)
-    # add custom tokenizer exceptions to base exceptions
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    # add stop words
    stop_words = STOP_WORDS
-    # if available: add tag map
-    # tag_map = dict(TAG_MAP)
-    # if available: add morph rules
-    # morph_rules = dict(MORPH_RULES)
-    # if available: add lookup lemmatizer
-    # @classmethod
-    # def create_lemmatizer(cls, nlp=None):
-    #     return Lemmatizer(LOOKUP)
+    lemma_lookup = LOOKUP


class Tagalog(Language):
-    lang = 'tl' # ISO code
-    Defaults = TagalogDefaults # set Defaults to custom language defaults
+    lang = "tl"
+    Defaults = TagalogDefaults


-# set default export this allows the language class to be lazy-loaded
-__all__ = ['Tagalog']
+__all__ = ["Tagalog"]

View File

@@ -2,11 +2,6 @@
from __future__ import unicode_literals

-# Adding a lemmatizer lookup table
-# Documentation: https://spacy.io/docs/usage/adding-languages#lemmatizer
-# Entries should be added in the following format:
-
LOOKUP = {
    "kaugnayan": "ugnay",
    "sangkatauhan": "tao",

@@ -14,5 +9,5 @@ LOOKUP = {
    "pandaigdigan": "daigdig",
    "kasaysayan": "saysay",
    "kabayanihan": "bayani",
-    "karuwagan": "duwag"
+    "karuwagan": "duwag",
}

View File

@@ -1,33 +1,55 @@
# coding: utf8
from __future__ import unicode_literals

-# import the symbols for the attrs you want to overwrite
from ...attrs import LIKE_NUM

-# Overwriting functions for lexical attributes
-# Documentation: https://localhost:1234/docs/usage/adding-languages#lex-attrs
-# Most of these functions, like is_lower or like_url should be language-
-# independent. Others, like like_num (which includes both digits and number
-# words), requires customisation.
-
-# Example: check if token resembles a number

_num_words = [
    "sero", "isa", "dalawa", "tatlo", "apat", "lima", "anim", "pito",
    "walo", "siyam", "sampu", "labing-isa", "labindalawa", "labintatlo",
    "labing-apat", "labinlima", "labing-anim", "labimpito", "labing-walo",
    "labinsiyam", "dalawampu", "tatlumpu", "apatnapu", "limampu", "animnapu",
    "pitumpu", "walumpu", "siyamnapu", "daan", "libo", "milyon", "bilyon",
    "trilyon", "quadrilyon", "gajilyon", "bazilyon",
]


def like_num(text):
-    text = text.replace(',', '').replace('.', '')
+    text = text.replace(",", "").replace(".", "")
    if text.isdigit():
        return True
-    if text.count('/') == 1:
-        num, denom = text.split('/')
+    if text.count("/") == 1:
+        num, denom = text.split("/")
        if num.isdigit() and denom.isdigit():
            return True
    if text in _num_words:

@@ -35,9 +57,4 @@ def like_num(text):
    return False

-# Create dictionary of functions to overwrite. The default lex_attr_getters are
-# updated with this one, so only the functions defined here are overwritten.
-LEX_ATTRS = {
-    LIKE_NUM: like_num
-}
+LEX_ATTRS = {LIKE_NUM: like_num}
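A usage sketch for the attribute getter above, assuming this version of spaCy is importable; the examples mirror the rules visible in like_num:

from spacy.lang.tl.lex_attrs import like_num

assert like_num("10,000")      # digits once separators are stripped
assert like_num("3/4")         # simple fraction
assert like_num("labing-isa")  # listed in _num_words
assert not like_num("aso")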

View File

@@ -1,162 +1,154 @@
# encoding: utf8
from __future__ import unicode_literals

-# Add stop words
-# Documentation: https://spacy.io/docs/usage/adding-languages#stop-words
-# To improve readability, words should be ordered alphabetically and separated
-# by spaces and newlines. When adding stop words from an online source, always
-# include the link in a comment. Make sure to proofread and double-check the
-# words lists available online are often known to contain mistakes.
-
-# data from https://github.com/stopwords-iso/stopwords-tl/blob/master/stopwords-tl.txt
-
-STOP_WORDS = set("""
+STOP_WORDS = set(
+    """
akin aking ako alin am amin aming ang ano anumang apat at atin ating ay
bababa bago bakit bawat bilang dahil dalawa dapat din dito doon gagawin
gayunman ginagawa ginawa ginawang gumawa gusto habang hanggang hindi huwag
iba ibaba ibabaw ibig ikaw ilagay ilalim ilan inyong isa isang itaas ito
iyo iyon iyong ka kahit kailangan kailanman kami kanila kanilang kanino
kanya kanyang kapag kapwa karamihan katiyakan katulad kaya kaysa ko kong
kulang kumuha kung laban lahat lamang likod lima maaari maaaring maging
mahusay makita marami marapat masyado may mayroon mga minsan mismo mula
muli na nabanggit naging nagkaroon nais nakita namin napaka narito nasaan
ng ngayon ni nila nilang nito niya niyang noon o pa paano pababa paggawa
pagitan pagkakaroon pagkatapos palabas pamamagitan panahon pangalawa para
paraan pareho pataas pero pumunta pumupunta sa saan sabi sabihin sarili
sila sino siya tatlo tayo tulad tungkol una walang
-""".split())
+""".split()
+)

View File

@@ -1,36 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-from ...symbols import POS, ADV, NOUN, ADP, PRON, SCONJ, PROPN, DET, SYM, INTJ
-from ...symbols import PUNCT, NUM, AUX, X, CONJ, ADJ, VERB, PART, SPACE, CCONJ
-
-# Add a tag map
-# Documentation: https://spacy.io/docs/usage/adding-languages#tag-map
-# Universal Dependencies: http://universaldependencies.org/u/pos/all.html
-# The keys of the tag map should be strings in your tag set. The dictionary must
-# have an entry POS whose value is one of the Universal Dependencies tags.
-# Optionally, you can also include morphological features or other attributes.
-
-TAG_MAP = {
-    "ADV": {POS: ADV},
-    "NOUN": {POS: NOUN},
-    "ADP": {POS: ADP},
-    "PRON": {POS: PRON},
-    "SCONJ": {POS: SCONJ},
-    "PROPN": {POS: PROPN},
-    "DET": {POS: DET},
-    "SYM": {POS: SYM},
-    "INTJ": {POS: INTJ},
-    "PUNCT": {POS: PUNCT},
-    "NUM": {POS: NUM},
-    "AUX": {POS: AUX},
-    "X": {POS: X},
-    "CONJ": {POS: CONJ},
-    "CCONJ": {POS: CCONJ},
-    "ADJ": {POS: ADJ},
-    "VERB": {POS: VERB},
-    "PART": {POS: PART},
-    "SP": {POS: SPACE}
-}

View File

@@ -1,48 +1,20 @@
# coding: utf8
from __future__ import unicode_literals

-# import symbols if you need to use more, add them here
-from ...symbols import ORTH, LEMMA, TAG, NORM, ADP, DET
-
-# Add tokenizer exceptions
-# Documentation: https://spacy.io/docs/usage/adding-languages#tokenizer-exceptions
-# Feel free to use custom logic to generate repetitive exceptions more efficiently.
-# If an exception is split into more than one token, the ORTH values combined always
-# need to match the original string.
-# Exceptions should be added in the following format:
+from ...symbols import ORTH, LEMMA

_exc = {
    "tayo'y": [{ORTH: "tayo", LEMMA: "tayo"}, {ORTH: "'y", LEMMA: "ay"}],
    "isa'y": [{ORTH: "isa", LEMMA: "isa"}, {ORTH: "'y", LEMMA: "ay"}],
    "baya'y": [{ORTH: "baya", LEMMA: "bayan"}, {ORTH: "'y", LEMMA: "ay"}],
    "sa'yo": [{ORTH: "sa", LEMMA: "sa"}, {ORTH: "'yo", LEMMA: "iyo"}],
    "ano'ng": [{ORTH: "ano", LEMMA: "ano"}, {ORTH: "'ng", LEMMA: "ang"}],
    "siya'y": [{ORTH: "siya", LEMMA: "siya"}, {ORTH: "'y", LEMMA: "ay"}],
    "nawa'y": [{ORTH: "nawa", LEMMA: "nawa"}, {ORTH: "'y", LEMMA: "ay"}],
    "papa'no": [{ORTH: "papa'no", LEMMA: "papaano"}],
    "'di": [{ORTH: "'di", LEMMA: "hindi"}],
}

-# To keep things clean and readable, it's recommended to only declare the
-# TOKENIZER_EXCEPTIONS at the bottom:
TOKENIZER_EXCEPTIONS = _exc
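A quick standalone check of the invariant behind these exceptions: the ORTH values of the sub-tokens must concatenate back to the original string. Here ORTH and LEMMA are plain strings standing in for spaCy's symbol IDs, so this is only an illustration:

ORTH, LEMMA = "orth", "lemma"
exc = {"tayo'y": [{ORTH: "tayo", LEMMA: "tayo"}, {ORTH: "'y", LEMMA: "ay"}]}
for string, tokens in exc.items():
    assert "".join(t[ORTH] for t in tokens) == string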

View File

@@ -3,7 +3,7 @@ from __future__ import unicode_literals
import re

-from ..symbols import ORTH, POS, TAG, LEMMA, SPACE, PUNCT
+from ..symbols import ORTH, POS, TAG, LEMMA, SPACE

# URL validation regex courtesy of: https://mathiasbynens.be/demo/url-regex

View File

@@ -5,71 +5,32 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
-
-# uncomment if files are available
-# from .norm_exceptions import NORM_EXCEPTIONS
-# from .tag_map import TAG_MAP
-# from .morph_rules import MORPH_RULES
-
-# uncomment if lookup-based lemmatizer is available
-# from .lemmatizer import LOOKUP
-# from ...lemmatizerlookup import Lemmatizer
-
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...util import update_exc, add_lookups
from ...language import Language
-from ...attrs import LANG, LIKE_NUM, NORM
-# from .tag_map import TAG_MAP
+from ...attrs import LANG, NORM
from .lemmatizer import UkrainianLemmatizer


-# Create a Language subclass
-# Documentation: https://spacy.io/docs/usage/adding-languages
-# This file should be placed in spacy/lang/xx (ISO code of language).
-# Before submitting a pull request, make sure the remove all comments from the
-# language data files, and run at least the basic tokenizer tests. Simply add the
-# language ID to the list of languages in spacy/tests/conftest.py to include it
-# in the basic tokenizer sanity tests. You can optionally add a fixture for the
-# language's tokenizer and add more specific tests. For more info, see the
-# tests documentation: https://github.com/explosion/spaCy/tree/master/spacy/tests
-
-
class UkrainianDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: 'uk' # ISO code
-    # add more norm exception dictionaries here
-    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
-    # overwrite functions for lexical attributes
+    lex_attr_getters[LANG] = lambda text: "uk"
+    lex_attr_getters[NORM] = add_lookups(
+        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
+    )
    lex_attr_getters.update(LEX_ATTRS)
-    # add custom tokenizer exceptions to base exceptions
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    # add stop words
    stop_words = STOP_WORDS
-    # if available: add tag map
-    # tag_map = dict(TAG_MAP)
-    # if available: add morph rules
-    # morph_rules = dict(MORPH_RULES)
-    # if available: add lookup lemmatizer
-    # @classmethod
-    # def create_lemmatizer(cls, nlp=None):
-    #     return Lemmatizer(LOOKUP)

    @classmethod
    def create_lemmatizer(cls, nlp=None):
        return UkrainianLemmatizer()


class Ukrainian(Language):
-    lang = 'uk' # ISO code
-    Defaults = UkrainianDefaults # set Defaults to custom language defaults
+    lang = "uk"
+    Defaults = UkrainianDefaults


-# set default export this allows the language class to be lazy-loaded
-__all__ = ['Ukrainian']
+__all__ = ["Ukrainian"]

View File

@@ -14,10 +14,10 @@ sentences = [
    "Ніч на середу буде морозною.",
    "Чим кращі книги ти читав, тим гірше спиш.",  # Serhiy Zhadan
    "Найстаріші ґудзики, відомі людству, археологи знайшли в долині ріки Інд.",
    "Слов'янське слово «Україна» вперше згадується у Київському літописному зводі за Іпатіївським списком під 1187 роком.",  # wikipedia
    "Де у Києві найсмачніша кава?",
    "Від Нижнього озера довгими дерев’яними сходами, над якими синьо й біло горіли маленькі коробочки-ліхтарики, підіймалися до нього двоє стовусів: найкращий друг Вертутій і його дванадцятилітній онук Чублик.",  # blyznets_viktor_semenovych/zemlia_svitliachkiv
    "Китайський космічний зонд \"Чан'е-4\" вперше в історії здійснив м'яку посадку на зворотному боці Місяця.",
    "Коли до губ твоїх лишається півподиху, коли до губ твоїх лишається півкроку зіниці твої виткані із подиву, в очах у тебе синьо і широко.",  # Hryhorij Czubaj
-    "Дорогу сестру збираю у дорогу, а брати вирішили не брати машину."  # homographs
+    "Дорогу сестру збираю у дорогу, а брати вирішили не брати машину.",  # homographs
]

View File

@@ -1,12 +1,15 @@
+# coding: utf8
+from __future__ import unicode_literals
+
from ..ru.lemmatizer import RussianLemmatizer


class UkrainianLemmatizer(RussianLemmatizer):
-    def __init__(self, pymorphy2_lang='ru'):
+    def __init__(self, pymorphy2_lang="ru"):
        try:
-            super(UkrainianLemmatizer, self).__init__(pymorphy2_lang='uk')
+            super(UkrainianLemmatizer, self).__init__(pymorphy2_lang="uk")
        except ImportError:
            raise ImportError(
-                'The Ukrainian lemmatizer requires the pymorphy2 library and dictionaries: '
-                'try to fix it with "pip install git+https://github.com/kmike/pymorphy2.git pymorphy2-dicts-uk"')
+                "The Ukrainian lemmatizer requires the pymorphy2 library and dictionaries: "
+                'try to fix it with "pip install git+https://github.com/kmike/pymorphy2.git pymorphy2-dicts-uk"'
+            )
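For background, the pymorphy2 backend the lemmatizer wraps works roughly like this standalone sketch (it assumes pymorphy2 and the pymorphy2-dicts-uk package are installed; this is not spaCy's wrapper code):

import pymorphy2

morph = pymorphy2.MorphAnalyzer(lang="uk")
# parse() returns candidate analyses; the lemma is the normal form of the best one.
print(morph.parse("книги")[0].normal_form)  # книга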

View File

@@ -1,32 +1,68 @@
# coding: utf8
from __future__ import unicode_literals

-# import the symbols for the attrs you want to overwrite
from ...attrs import LIKE_NUM

-# Overwriting functions for lexical attributes
-# Documentation: https://localhost:1234/docs/usage/adding-languages#lex-attrs
-# Most of these functions, like is_lower or like_url should be language-
-# independent. Others, like like_num (which includes both digits and number
-# words), requires customisation.
-
-# Example: check if token resembles a number

_num_words = [
    "більйон", "вісім", "вісімдесят", "вісімнадцять", "вісімсот", "восьмий",
    "два", "двадцять", "дванадцять", "двісті", "дев'яносто", "дев'ятнадцять",
    "дев'ятсот", "дев'ять", "десять", "децильйон", "квадрильйон", "квінтильйон",
    "мільйон", "мільярд", "нонильйон", "один", "одинадцять", "октильйон",
    "п'ятий", "п'ятисотий", "п'ятнадцять", "п'ятсот", "п'ять", "секстильйон",
    "септильйон", "сім", "сімдесят", "сімнадцять", "сімсот", "сорок", "сто",
    "тисяча", "три", "тридцять", "трильйон", "тринадцять", "триста", "чотири",
    "чотириста", "чотирнадцять", "шістдесят", "шістнадцять", "шістсот", "шість",
]


def like_num(text):
-    text = text.replace(',', '').replace('.', '')
+    text = text.replace(",", "").replace(".", "")
    if text.isdigit():
        return True
-    if text.count('/') == 1:
-        num, denom = text.split('/')
+    if text.count("/") == 1:
+        num, denom = text.split("/")
        if num.isdigit() and denom.isdigit():
            return True
    if text in _num_words:

@@ -34,9 +70,4 @@ def like_num(text):
    return False

-# Create dictionary of functions to overwrite. The default lex_attr_getters are
-# updated with this one, so only the functions defined here are overwritten.
-LEX_ATTRS = {
-    LIKE_NUM: like_num
-}
+LEX_ATTRS = {LIKE_NUM: like_num}

View File

@@ -2,15 +2,8 @@
from __future__ import unicode_literals

-# Add stop words
-# Documentation: https://spacy.io/docs/usage/adding-languages#stop-words
-# To improve readability, words should be ordered alphabetically and separated
-# by spaces and newlines. When adding stop words from an online source, always
-# include the link in a comment. Make sure to proofread and double-check the
-# words lists available online are often known to contain mistakes.
-
-STOP_WORDS = set("""а
+STOP_WORDS = set(
+    """а
або
адже
але

@@ -401,4 +394,5 @@ STOP_WORDS = set("""а
якій
якого
якщо
-""".split())
+""".split()
+)

View File

@ -5,32 +5,24 @@ from ..symbols import POS, ADV, NOUN, ADP, PRON, SCONJ, PROPN, DET, SYM, INTJ
from ..symbols import PUNCT, NUM, AUX, X, CONJ, ADJ, VERB, PART, SPACE, CCONJ from ..symbols import PUNCT, NUM, AUX, X, CONJ, ADJ, VERB, PART, SPACE, CCONJ
# Add a tag map
# Documentation: https://spacy.io/docs/usage/adding-languages#tag-map
# Universal Dependencies: http://universaldependencies.org/u/pos/all.html
# The keys of the tag map should be strings in your tag set. The dictionary must
# have an entry POS whose value is one of the Universal Dependencies tags.
# Optionally, you can also include morphological features or other attributes.
TAG_MAP = { TAG_MAP = {
"ADV": {POS: ADV}, "ADV": {POS: ADV},
"NOUN": {POS: NOUN}, "NOUN": {POS: NOUN},
"ADP": {POS: ADP}, "ADP": {POS: ADP},
"PRON": {POS: PRON}, "PRON": {POS: PRON},
"SCONJ": {POS: SCONJ}, "SCONJ": {POS: SCONJ},
"PROPN": {POS: PROPN}, "PROPN": {POS: PROPN},
"DET": {POS: DET}, "DET": {POS: DET},
"SYM": {POS: SYM}, "SYM": {POS: SYM},
"INTJ": {POS: INTJ}, "INTJ": {POS: INTJ},
"PUNCT": {POS: PUNCT}, "PUNCT": {POS: PUNCT},
"NUM": {POS: NUM}, "NUM": {POS: NUM},
"AUX": {POS: AUX}, "AUX": {POS: AUX},
"X": {POS: X}, "X": {POS: X},
"CONJ": {POS: CONJ}, "CONJ": {POS: CONJ},
"CCONJ": {POS: CCONJ}, "CCONJ": {POS: CCONJ},
"ADJ": {POS: ADJ}, "ADJ": {POS: ADJ},
"VERB": {POS: VERB}, "VERB": {POS: VERB},
"PART": {POS: PART}, "PART": {POS: PART},
"SP": {POS: SPACE} "SP": {POS: SPACE},
} }
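Each entry maps a fine-grained tag to a Universal POS under the POS key, and entries may also carry morphological features, as the removed comment noted. A hypothetical entry (TAG_MAP_EXTRA and the "VBD" tag are illustrative, not taken from this file) mirroring that structure:

from spacy.symbols import POS, VERB

TAG_MAP_EXTRA = {"VBD": {POS: VERB, "VerbForm": "fin", "Tense": "past"}}
assert TAG_MAP_EXTRA["VBD"][POS] == VERB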

View File

@ -1,18 +1,9 @@
# coding: utf8 # coding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
# import symbols if you need to use more, add them here
from ...symbols import ORTH, LEMMA, POS, NORM, NOUN from ...symbols import ORTH, LEMMA, POS, NORM, NOUN
# Add tokenizer exceptions
# Documentation: https://spacy.io/docs/usage/adding-languages#tokenizer-exceptions
# Feel free to use custom logic to generate repetitive exceptions more efficiently.
# If an exception is split into more than one token, the ORTH values combined always
# need to match the original string.
# Exceptions should be added in the following format:
_exc = {} _exc = {}
for exc_data in [ for exc_data in [
@ -28,11 +19,9 @@ for exc_data in [
{ORTH: "проф.", LEMMA: "професор", NORM: "професор", POS: NOUN}, {ORTH: "проф.", LEMMA: "професор", NORM: "професор", POS: NOUN},
{ORTH: "акад.", LEMMA: "академік", NORM: "академік", POS: NOUN}, {ORTH: "акад.", LEMMA: "академік", NORM: "академік", POS: NOUN},
{ORTH: "доц.", LEMMA: "доцент", NORM: "доцент", POS: NOUN}, {ORTH: "доц.", LEMMA: "доцент", NORM: "доцент", POS: NOUN},
{ORTH: "оз.", LEMMA: "озеро", NORM: "озеро", POS: NOUN}]: {ORTH: "оз.", LEMMA: "озеро", NORM: "озеро", POS: NOUN},
]:
_exc[exc_data[ORTH]] = [exc_data] _exc[exc_data[ORTH]] = [exc_data]
# To keep things clean and readable, it's recommended to only declare the
# TOKENIZER_EXCEPTIONS at the bottom:
TOKENIZER_EXCEPTIONS = _exc TOKENIZER_EXCEPTIONS = _exc
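The effect of these exceptions can be sketched roughly as follows (assumes pymorphy2 is installed, since the Ukrainian defaults depend on it; this mirrors the abbreviation regression test further down in the diff):

from spacy.util import get_lang_class

uk_tokenizer = get_lang_class("uk").Defaults.create_tokenizer()
tokens = uk_tokenizer("проф.")
assert len(tokens) == 1               # the abbreviation stays a single token
assert tokens[0].norm_ == "професор"  # NORM comes from the exception entry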

View File

@ -1,6 +1,6 @@
# coding: utf8 # coding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
from .matcher import Matcher from .matcher import Matcher # noqa: F401
from .phrasematcher import PhraseMatcher from .phrasematcher import PhraseMatcher # noqa: F401
from .dependencymatcher import DependencyTreeMatcher from .dependencymatcher import DependencyTreeMatcher # noqa: F401
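For completeness, a minimal sketch of the re-exported Matcher API (spaCy v2-style add() with an on_match callback of None; the key "HELLO_PUNCT" and the sample text are illustrative, not part of this commit):

from spacy.lang.en import English
from spacy.matcher import Matcher

nlp = English()
matcher = Matcher(nlp.vocab)
matcher.add("HELLO_PUNCT", None, [{"LOWER": "hello"}, {"IS_PUNCT": True}])
doc = nlp("Hello, world!")
matches = matcher(doc)  # list of (match_id, start, end) tuples
assert len(matches) == 1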

View File

@ -119,8 +119,8 @@ def tr_tokenizer():
@pytest.fixture(scope="session") @pytest.fixture(scope="session")
def uk_tokenizer(): def uk_tokenizer():
pymorphy = pytest.importorskip("pymorphy2") pytest.importorskip("pymorphy2")
return util.get_lang_class("uk").Defaults.create_tokenizer() return get_lang_class("uk").Defaults.create_tokenizer()
@pytest.fixture(scope="session") @pytest.fixture(scope="session")
@ -130,7 +130,7 @@ def ca_tokenizer():
@pytest.fixture(scope="session") @pytest.fixture(scope="session")
def pl_tokenizer(): def pl_tokenizer():
return util.get_lang_class("pl").Defaults.create_tokenizer() return get_lang_class("pl").Defaults.create_tokenizer()
@pytest.fixture(scope="session") @pytest.fixture(scope="session")

View File

View File

@ -3,57 +3,57 @@ from __future__ import unicode_literals
import pytest import pytest
DOT_TESTS = [ DOT_TESTS = [
('tel.', ['tel.']), ("tel.", ["tel."]),
('np.', ['np.']), ("np.", ["np."]),
('godz. 21:37', ['godz.', '21:37']), ("godz. 21:37", ["godz.", "21:37"]),
('inż.', ['inż.']), ("inż.", ["inż."]),
('gosp.-polit.', ['gosp.-polit.']), ("gosp.-polit.", ["gosp.-polit."]),
('ppoż', ['ppoż']), ("ppoż", ["ppoż"]),
('płn', ['płn']), ("płn", ["płn"]),
('ul.', ['ul.']), ("ul.", ["ul."]),
('jw.', ['jw.']), ("jw.", ["jw."]),
('itd.', ['itd.']), ("itd.", ["itd."]),
('cdn.', ['cdn.']), ("cdn.", ["cdn."]),
('itp.', ['itp.']), ("itp.", ["itp."]),
('10,- zł', ['10,-', 'zł']), ("10,- zł", ["10,-", "zł"]),
('0 zł 99 gr', ['0', 'zł', '99', 'gr']), ("0 zł 99 gr", ["0", "zł", "99", "gr"]),
('0,99 rub.', ['0,99', 'rub.']), ("0,99 rub.", ["0,99", "rub."]),
('dol.', ['dol.']), ("dol.", ["dol."]),
('1000 m n.p.m.', ['1000', 'm', 'n.p.m.']), ("1000 m n.p.m.", ["1000", "m", "n.p.m."]),
('m.in.', ['m.in.']), ("m.in.", ["m.in."]),
('p.n.e.', ['p.n.e.']), ("p.n.e.", ["p.n.e."]),
('Sz.P.', ['Sz.P.']), ("Sz.P.", ["Sz.P."]),
('p.o.', ['p.o.']), ("p.o.", ["p.o."]),
('k.o.', ['k.o.']), ("k.o.", ["k.o."]),
('m.st.', ['m.st.']), ("m.st.", ["m.st."]),
('dra.', ['dra', '.']), ("dra.", ["dra", "."]),
('pp.', ['pp.']), ("pp.", ["pp."]),
('oo.', ['oo.']) ("oo.", ["oo."]),
] ]
HYPHEN_TESTS = [ HYPHEN_TESTS = [
('5-fluoropentylo-3-pirydynyloindol', ['5-fluoropentylo-3-pirydynyloindol']), ("5-fluoropentylo-3-pirydynyloindol", ["5-fluoropentylo-3-pirydynyloindol"]),
('NESS-040C5', ['NESS-040C5']), ("NESS-040C5", ["NESS-040C5"]),
('JTE-7-31', ['JTE-7-31']), ("JTE-7-31", ["JTE-7-31"]),
('BAY-59-3074', ['BAY-59-3074']), ("BAY-59-3074", ["BAY-59-3074"]),
('BAY-38-7271', ['BAY-38-7271']), ("BAY-38-7271", ["BAY-38-7271"]),
('STS-135', ['STS-135']), ("STS-135", ["STS-135"]),
('5F-PB-22', ['5F-PB-22']), ("5F-PB-22", ["5F-PB-22"]),
('cztero-', ['cztero-']), ("cztero-", ["cztero-"]),
('jedno-', ['jedno-']), ("jedno-", ["jedno-"]),
('dwu-', ['dwu-']), ("dwu-", ["dwu-"]),
('trzy-', ['trzy-']), ("trzy-", ["trzy-"]),
('b-adoratorzy', ['b-adoratorzy']), ("b-adoratorzy", ["b-adoratorzy"]),
('2-3-4 drzewa', ['2-3-4', 'drzewa']), ("2-3-4 drzewa", ["2-3-4", "drzewa"]),
('b-drzewa', ['b-drzewa']) ("b-drzewa", ["b-drzewa"]),
] ]
TESTCASES = DOT_TESTS + HYPHEN_TESTS TESTCASES = DOT_TESTS + HYPHEN_TESTS
@pytest.mark.parametrize('text,expected_tokens', TESTCASES) @pytest.mark.parametrize("text,expected_tokens", TESTCASES)
def test_tokenizer_handles_testcases(pl_tokenizer, text, expected_tokens): def test_tokenizer_handles_testcases(pl_tokenizer, text, expected_tokens):
tokens = pl_tokenizer(text) tokens = pl_tokenizer(text)
token_list = [token.text for token in tokens if not token.is_space] token_list = [token.text for token in tokens if not token.is_space]

View File

@ -5,34 +5,42 @@ import pytest
SV_TOKEN_EXCEPTION_TESTS = [ SV_TOKEN_EXCEPTION_TESTS = [
('Smörsåsen används bl.a. till fisk', ['Smörsåsen', 'används', 'bl.a.', 'till', 'fisk']), (
('Jag kommer först kl. 13 p.g.a. diverse förseningar', ['Jag', 'kommer', 'först', 'kl.', '13', 'p.g.a.', 'diverse', 'förseningar']), "Smörsåsen används bl.a. till fisk",
('Anders I. tycker om ord med i i.', ["Anders", "I.", "tycker", "om", "ord", "med", "i", "i", "."]) ["Smörsåsen", "används", "bl.a.", "till", "fisk"],
),
(
"Jag kommer först kl. 13 p.g.a. diverse förseningar",
["Jag", "kommer", "först", "kl.", "13", "p.g.a.", "diverse", "förseningar"],
),
(
"Anders I. tycker om ord med i i.",
["Anders", "I.", "tycker", "om", "ord", "med", "i", "i", "."],
),
] ]
@pytest.mark.parametrize('text,expected_tokens', SV_TOKEN_EXCEPTION_TESTS) @pytest.mark.parametrize("text,expected_tokens", SV_TOKEN_EXCEPTION_TESTS)
def test_sv_tokenizer_handles_exception_cases(sv_tokenizer, text, expected_tokens): def test_sv_tokenizer_handles_exception_cases(sv_tokenizer, text, expected_tokens):
tokens = sv_tokenizer(text) tokens = sv_tokenizer(text)
token_list = [token.text for token in tokens if not token.is_space] token_list = [token.text for token in tokens if not token.is_space]
assert expected_tokens == token_list assert expected_tokens == token_list
@pytest.mark.parametrize('text', ["driveru", "hajaru", "Serru", "Fixaru"]) @pytest.mark.parametrize("text", ["driveru", "hajaru", "Serru", "Fixaru"])
def test_sv_tokenizer_handles_verb_exceptions(sv_tokenizer, text): def test_sv_tokenizer_handles_verb_exceptions(sv_tokenizer, text):
tokens = sv_tokenizer(text) tokens = sv_tokenizer(text)
assert len(tokens) == 2 assert len(tokens) == 2
assert tokens[1].text == "u" assert tokens[1].text == "u"
@pytest.mark.parametrize('text', @pytest.mark.parametrize("text", ["bl.a", "m.a.o.", "Jan.", "Dec.", "kr.", "osv."])
["bl.a", "m.a.o.", "Jan.", "Dec.", "kr.", "osv."])
def test_sv_tokenizer_handles_abbr(sv_tokenizer, text): def test_sv_tokenizer_handles_abbr(sv_tokenizer, text):
tokens = sv_tokenizer(text) tokens = sv_tokenizer(text)
assert len(tokens) == 1 assert len(tokens) == 1
@pytest.mark.parametrize('text', ["Jul.", "jul.", "sön.", "Sön."]) @pytest.mark.parametrize("text", ["Jul.", "jul.", "sön.", "Sön."])
def test_sv_tokenizer_handles_ambiguous_abbr(sv_tokenizer, text): def test_sv_tokenizer_handles_ambiguous_abbr(sv_tokenizer, text):
tokens = sv_tokenizer(text) tokens = sv_tokenizer(text)
assert len(tokens) == 2 assert len(tokens) == 2

View File

@ -4,12 +4,17 @@ from __future__ import unicode_literals
import pytest import pytest
@pytest.mark.parametrize('string,lemma', [('DNA-profilernas', 'DNA-profil'), @pytest.mark.parametrize(
('Elfenbenskustens', 'Elfenbenskusten'), "string,lemma",
('abortmotståndarens', 'abortmotståndare'), [
('kolesterols', 'kolesterol'), ("DNA-profilernas", "DNA-profil"),
('portionssnusernas', 'portionssnus'), ("Elfenbenskustens", "Elfenbenskusten"),
('åsyns', 'åsyn')]) ("abortmotståndarens", "abortmotståndare"),
("kolesterols", "kolesterol"),
("portionssnusernas", "portionssnus"),
("åsyns", "åsyn"),
],
)
def test_lemmatizer_lookup_assigns(sv_tokenizer, string, lemma): def test_lemmatizer_lookup_assigns(sv_tokenizer, string, lemma):
tokens = sv_tokenizer(string) tokens = sv_tokenizer(string)
assert tokens[0].lemma_ == lemma assert tokens[0].lemma_ == lemma

View File

@ -1,28 +1,28 @@
# coding: utf-8 # coding: utf-8
"""Test that tokenizer prefixes, suffixes and infixes are handled correctly."""
from __future__ import unicode_literals from __future__ import unicode_literals
import pytest import pytest
@pytest.mark.parametrize('text', ["(under)"])
@pytest.mark.parametrize("text", ["(under)"])
def test_tokenizer_splits_no_special(sv_tokenizer, text): def test_tokenizer_splits_no_special(sv_tokenizer, text):
tokens = sv_tokenizer(text) tokens = sv_tokenizer(text)
assert len(tokens) == 3 assert len(tokens) == 3
@pytest.mark.parametrize('text', ["gitta'r", "Björn's", "Lars'"]) @pytest.mark.parametrize("text", ["gitta'r", "Björn's", "Lars'"])
def test_tokenizer_handles_no_punct(sv_tokenizer, text): def test_tokenizer_handles_no_punct(sv_tokenizer, text):
tokens = sv_tokenizer(text) tokens = sv_tokenizer(text)
assert len(tokens) == 1 assert len(tokens) == 1
@pytest.mark.parametrize('text', ["svart.Gul", "Hej.Världen"]) @pytest.mark.parametrize("text", ["svart.Gul", "Hej.Världen"])
def test_tokenizer_splits_period_infix(sv_tokenizer, text): def test_tokenizer_splits_period_infix(sv_tokenizer, text):
tokens = sv_tokenizer(text) tokens = sv_tokenizer(text)
assert len(tokens) == 3 assert len(tokens) == 3
@pytest.mark.parametrize('text', ["Hej,Världen", "en,två"]) @pytest.mark.parametrize("text", ["Hej,Världen", "en,två"])
def test_tokenizer_splits_comma_infix(sv_tokenizer, text): def test_tokenizer_splits_comma_infix(sv_tokenizer, text):
tokens = sv_tokenizer(text) tokens = sv_tokenizer(text)
assert len(tokens) == 3 assert len(tokens) == 3
@ -31,7 +31,7 @@ def test_tokenizer_splits_comma_infix(sv_tokenizer, text):
assert tokens[2].text == text.split(",")[1] assert tokens[2].text == text.split(",")[1]
@pytest.mark.parametrize('text', ["svart...Gul", "svart...gul"]) @pytest.mark.parametrize("text", ["svart...Gul", "svart...gul"])
def test_tokenizer_splits_ellipsis_infix(sv_tokenizer, text): def test_tokenizer_splits_ellipsis_infix(sv_tokenizer, text):
tokens = sv_tokenizer(text) tokens = sv_tokenizer(text)
assert len(tokens) == 3 assert len(tokens) == 3

View File

@ -1,9 +1,6 @@
# coding: utf-8 # coding: utf-8
"""Test that longer and mixed texts are tokenized correctly."""
from __future__ import unicode_literals from __future__ import unicode_literals
import pytest
def test_sv_tokenizer_handles_long_text(sv_tokenizer): def test_sv_tokenizer_handles_long_text(sv_tokenizer):
text = """Det var så härligt ute på landet. Det var sommar, majsen var gul, havren grön, text = """Det var så härligt ute på landet. Det var sommar, majsen var gul, havren grön,

View File

@ -1,25 +1,24 @@
# coding: utf-8 # coding: utf-8
"""Test that open, closed and paired punctuation is split off correctly."""
from __future__ import unicode_literals from __future__ import unicode_literals
import pytest import pytest
PUNCT_OPEN = ['(', '[', '{', '*'] PUNCT_OPEN = ["(", "[", "{", "*"]
PUNCT_CLOSE = [')', ']', '}', '*'] PUNCT_CLOSE = [")", "]", "}", "*"]
PUNCT_PAIRED = [('(', ')'), ('[', ']'), ('{', '}'), ('*', '*')] PUNCT_PAIRED = [("(", ")"), ("[", "]"), ("{", "}"), ("*", "*")]
@pytest.mark.parametrize('text', ["(", "((", "<"]) @pytest.mark.parametrize("text", ["(", "((", "<"])
def test_uk_tokenizer_handles_only_punct(uk_tokenizer, text): def test_uk_tokenizer_handles_only_punct(uk_tokenizer, text):
tokens = uk_tokenizer(text) tokens = uk_tokenizer(text)
assert len(tokens) == len(text) assert len(tokens) == len(text)
@pytest.mark.parametrize('punct', PUNCT_OPEN) @pytest.mark.parametrize("punct", PUNCT_OPEN)
@pytest.mark.parametrize('text', ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]) @pytest.mark.parametrize(
"text", ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]
)
def test_uk_tokenizer_splits_open_punct(uk_tokenizer, punct, text): def test_uk_tokenizer_splits_open_punct(uk_tokenizer, punct, text):
tokens = uk_tokenizer(punct + text) tokens = uk_tokenizer(punct + text)
assert len(tokens) == 2 assert len(tokens) == 2
@ -27,8 +26,10 @@ def test_uk_tokenizer_splits_open_punct(uk_tokenizer, punct, text):
assert tokens[1].text == text assert tokens[1].text == text
@pytest.mark.parametrize('punct', PUNCT_CLOSE) @pytest.mark.parametrize("punct", PUNCT_CLOSE)
@pytest.mark.parametrize('text', ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]) @pytest.mark.parametrize(
"text", ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]
)
def test_uk_tokenizer_splits_close_punct(uk_tokenizer, punct, text): def test_uk_tokenizer_splits_close_punct(uk_tokenizer, punct, text):
tokens = uk_tokenizer(text + punct) tokens = uk_tokenizer(text + punct)
assert len(tokens) == 2 assert len(tokens) == 2
@ -36,9 +37,11 @@ def test_uk_tokenizer_splits_close_punct(uk_tokenizer, punct, text):
assert tokens[1].text == punct assert tokens[1].text == punct
@pytest.mark.parametrize('punct', PUNCT_OPEN) @pytest.mark.parametrize("punct", PUNCT_OPEN)
@pytest.mark.parametrize('punct_add', ["`"]) @pytest.mark.parametrize("punct_add", ["`"])
@pytest.mark.parametrize('text', ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]) @pytest.mark.parametrize(
"text", ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]
)
def test_uk_tokenizer_splits_two_diff_open_punct(uk_tokenizer, punct, punct_add, text): def test_uk_tokenizer_splits_two_diff_open_punct(uk_tokenizer, punct, punct_add, text):
tokens = uk_tokenizer(punct + punct_add + text) tokens = uk_tokenizer(punct + punct_add + text)
assert len(tokens) == 3 assert len(tokens) == 3
@ -47,9 +50,11 @@ def test_uk_tokenizer_splits_two_diff_open_punct(uk_tokenizer, punct, punct_add,
assert tokens[2].text == text assert tokens[2].text == text
@pytest.mark.parametrize('punct', PUNCT_CLOSE) @pytest.mark.parametrize("punct", PUNCT_CLOSE)
@pytest.mark.parametrize('punct_add', ["'"]) @pytest.mark.parametrize("punct_add", ["'"])
@pytest.mark.parametrize('text', ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]) @pytest.mark.parametrize(
"text", ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]
)
def test_uk_tokenizer_splits_two_diff_close_punct(uk_tokenizer, punct, punct_add, text): def test_uk_tokenizer_splits_two_diff_close_punct(uk_tokenizer, punct, punct_add, text):
tokens = uk_tokenizer(text + punct + punct_add) tokens = uk_tokenizer(text + punct + punct_add)
assert len(tokens) == 3 assert len(tokens) == 3
@ -58,8 +63,10 @@ def test_uk_tokenizer_splits_two_diff_close_punct(uk_tokenizer, punct, punct_add
assert tokens[2].text == punct_add assert tokens[2].text == punct_add
@pytest.mark.parametrize('punct', PUNCT_OPEN) @pytest.mark.parametrize("punct", PUNCT_OPEN)
@pytest.mark.parametrize('text', ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]) @pytest.mark.parametrize(
"text", ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]
)
def test_uk_tokenizer_splits_same_open_punct(uk_tokenizer, punct, text): def test_uk_tokenizer_splits_same_open_punct(uk_tokenizer, punct, text):
tokens = uk_tokenizer(punct + punct + punct + text) tokens = uk_tokenizer(punct + punct + punct + text)
assert len(tokens) == 4 assert len(tokens) == 4
@ -67,8 +74,10 @@ def test_uk_tokenizer_splits_same_open_punct(uk_tokenizer, punct, text):
assert tokens[3].text == text assert tokens[3].text == text
@pytest.mark.parametrize('punct', PUNCT_CLOSE) @pytest.mark.parametrize("punct", PUNCT_CLOSE)
@pytest.mark.parametrize('text', ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]) @pytest.mark.parametrize(
"text", ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]
)
def test_uk_tokenizer_splits_same_close_punct(uk_tokenizer, punct, text): def test_uk_tokenizer_splits_same_close_punct(uk_tokenizer, punct, text):
tokens = uk_tokenizer(text + punct + punct + punct) tokens = uk_tokenizer(text + punct + punct + punct)
assert len(tokens) == 4 assert len(tokens) == 4
@ -76,14 +85,14 @@ def test_uk_tokenizer_splits_same_close_punct(uk_tokenizer, punct, text):
assert tokens[1].text == punct assert tokens[1].text == punct
@pytest.mark.parametrize('text', ["'Тест"]) @pytest.mark.parametrize("text", ["'Тест"])
def test_uk_tokenizer_splits_open_appostrophe(uk_tokenizer, text): def test_uk_tokenizer_splits_open_appostrophe(uk_tokenizer, text):
tokens = uk_tokenizer(text) tokens = uk_tokenizer(text)
assert len(tokens) == 2 assert len(tokens) == 2
assert tokens[0].text == "'" assert tokens[0].text == "'"
@pytest.mark.parametrize('text', ["Тест''"]) @pytest.mark.parametrize("text", ["Тест''"])
def test_uk_tokenizer_splits_double_end_quote(uk_tokenizer, text): def test_uk_tokenizer_splits_double_end_quote(uk_tokenizer, text):
tokens = uk_tokenizer(text) tokens = uk_tokenizer(text)
assert len(tokens) == 2 assert len(tokens) == 2
@ -91,10 +100,13 @@ def test_uk_tokenizer_splits_double_end_quote(uk_tokenizer, text):
assert len(tokens_punct) == 1 assert len(tokens_punct) == 1
@pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED) @pytest.mark.parametrize("punct_open,punct_close", PUNCT_PAIRED)
@pytest.mark.parametrize('text', ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]) @pytest.mark.parametrize(
def test_uk_tokenizer_splits_open_close_punct(uk_tokenizer, punct_open, "text", ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]
punct_close, text): )
def test_uk_tokenizer_splits_open_close_punct(
uk_tokenizer, punct_open, punct_close, text
):
tokens = uk_tokenizer(punct_open + text + punct_close) tokens = uk_tokenizer(punct_open + text + punct_close)
assert len(tokens) == 3 assert len(tokens) == 3
assert tokens[0].text == punct_open assert tokens[0].text == punct_open
@ -102,11 +114,14 @@ def test_uk_tokenizer_splits_open_close_punct(uk_tokenizer, punct_open,
assert tokens[2].text == punct_close assert tokens[2].text == punct_close
@pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED) @pytest.mark.parametrize("punct_open,punct_close", PUNCT_PAIRED)
@pytest.mark.parametrize('punct_open2,punct_close2', [("`", "'")]) @pytest.mark.parametrize("punct_open2,punct_close2", [("`", "'")])
@pytest.mark.parametrize('text', ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]) @pytest.mark.parametrize(
def test_uk_tokenizer_two_diff_punct(uk_tokenizer, punct_open, punct_close, "text", ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]
punct_open2, punct_close2, text): )
def test_uk_tokenizer_two_diff_punct(
uk_tokenizer, punct_open, punct_close, punct_open2, punct_close2, text
):
tokens = uk_tokenizer(punct_open2 + punct_open + text + punct_close + punct_close2) tokens = uk_tokenizer(punct_open2 + punct_open + text + punct_close + punct_close2)
assert len(tokens) == 5 assert len(tokens) == 5
assert tokens[0].text == punct_open2 assert tokens[0].text == punct_open2
@ -116,7 +131,9 @@ def test_uk_tokenizer_two_diff_punct(uk_tokenizer, punct_open, punct_close,
assert tokens[4].text == punct_close2 assert tokens[4].text == punct_close2
@pytest.mark.parametrize('text', ["Привет.", "Привіт.", "Ґелґотати.", "З'єднання.", "Єдність.", "їхні."]) @pytest.mark.parametrize(
"text", ["Привет.", "Привіт.", "Ґелґотати.", "З'єднання.", "Єдність.", "їхні."]
)
def test_uk_tokenizer_splits_trailing_dot(uk_tokenizer, text): def test_uk_tokenizer_splits_trailing_dot(uk_tokenizer, text):
tokens = uk_tokenizer(text) tokens = uk_tokenizer(text)
assert tokens[1].text == "." assert tokens[1].text == "."

View File

@ -1,18 +1,14 @@
# coding: utf-8 # coding: utf-8
"""Test that tokenizer exceptions are parsed correctly."""
from __future__ import unicode_literals from __future__ import unicode_literals
import pytest import pytest
@pytest.mark.parametrize('text,norms,lemmas', [("ім.", ["імені"], ["ім'я"]), @pytest.mark.parametrize(
("проф.", ["професор"], ["професор"])]) "text,norms,lemmas",
[("ім.", ["імені"], ["ім'я"]), ("проф.", ["професор"], ["професор"])],
)
def test_uk_tokenizer_abbrev_exceptions(uk_tokenizer, text, norms, lemmas): def test_uk_tokenizer_abbrev_exceptions(uk_tokenizer, text, norms, lemmas):
tokens = uk_tokenizer(text) tokens = uk_tokenizer(text)
assert len(tokens) == 1 assert len(tokens) == 1
assert [token.norm_ for token in tokens] == norms assert [token.norm_ for token in tokens] == norms

View File

@ -1,16 +1,16 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
import json import json
from tempfile import NamedTemporaryFile from tempfile import NamedTemporaryFile
import pytest
from ...cli.train import train from ...cli.train import train
def test_cli_trained_model_can_be_saved(tmpdir): def test_cli_trained_model_can_be_saved(tmpdir):
lang = 'nl' lang = "nl"
output_dir = str(tmpdir) output_dir = str(tmpdir)
train_file = NamedTemporaryFile('wb', dir=output_dir, delete=False) train_file = NamedTemporaryFile("wb", dir=output_dir, delete=False)
train_corpus = [ train_corpus = [
{ {
"id": "identifier_0", "id": "identifier_0",
@ -26,7 +26,7 @@ def test_cli_trained_model_can_be_saved(tmpdir):
"head": 1, "head": 1,
"tag": "NOUN", "tag": "NOUN",
"orth": "Jan", "orth": "Jan",
"ner": "B-PER" "ner": "B-PER",
}, },
{ {
"id": 1, "id": 1,
@ -34,7 +34,7 @@ def test_cli_trained_model_can_be_saved(tmpdir):
"head": 0, "head": 0,
"tag": "VERB", "tag": "VERB",
"orth": "houdt", "orth": "houdt",
"ner": "O" "ner": "O",
}, },
{ {
"id": 2, "id": 2,
@ -42,7 +42,7 @@ def test_cli_trained_model_can_be_saved(tmpdir):
"head": 1, "head": 1,
"tag": "ADP", "tag": "ADP",
"orth": "van", "orth": "van",
"ner": "O" "ner": "O",
}, },
{ {
"id": 3, "id": 3,
@ -50,7 +50,7 @@ def test_cli_trained_model_can_be_saved(tmpdir):
"head": -2, "head": -2,
"tag": "NOUN", "tag": "NOUN",
"orth": "Marie", "orth": "Marie",
"ner": "B-PER" "ner": "B-PER",
}, },
{ {
"id": 4, "id": 4,
@ -58,7 +58,7 @@ def test_cli_trained_model_can_be_saved(tmpdir):
"head": -3, "head": -3,
"tag": "PUNCT", "tag": "PUNCT",
"orth": ".", "orth": ".",
"ner": "O" "ner": "O",
}, },
{ {
"id": 5, "id": 5,
@ -66,18 +66,18 @@ def test_cli_trained_model_can_be_saved(tmpdir):
"head": -1, "head": -1,
"tag": "SPACE", "tag": "SPACE",
"orth": "\n", "orth": "\n",
"ner": "O" "ner": "O",
} },
], ],
"brackets": [] "brackets": [],
} }
] ],
} }
] ],
} }
] ]
train_file.write(json.dumps(train_corpus).encode('utf-8')) train_file.write(json.dumps(train_corpus).encode("utf-8"))
train_file.close() train_file.close()
train_data = train_file.name train_data = train_file.name
dev_data = train_data dev_data = train_data

View File

@ -155,6 +155,14 @@ def test_issue1758(en_tokenizer):
assert tokens[1].lemma_ == "have" assert tokens[1].lemma_ == "have"
def test_issue1773(en_tokenizer):
"""Test that spaces don't receive a POS but no TAG. This is the root cause
of the serialization issue reported in #1773."""
doc = en_tokenizer("\n")
if doc[0].pos_ == "SPACE":
assert doc[0].tag_ != ""
def test_issue1799(): def test_issue1799():
"""Test sentence boundaries are deserialized correctly, even for """Test sentence boundaries are deserialized correctly, even for
non-projective sentences.""" non-projective sentences."""
@ -249,8 +257,8 @@ def test_issue1945():
def test_issue1963(en_tokenizer): def test_issue1963(en_tokenizer):
"""Test that doc.merge() resizes doc.tensor""" """Test that doc.merge() resizes doc.tensor"""
doc = en_tokenizer('a b c d') doc = en_tokenizer("a b c d")
doc.tensor = numpy.ones((len(doc), 128), dtype='f') doc.tensor = numpy.ones((len(doc), 128), dtype="f")
with doc.retokenize() as retokenizer: with doc.retokenize() as retokenizer:
retokenizer.merge(doc[0:2]) retokenizer.merge(doc[0:2])
assert len(doc) == 3 assert len(doc) == 3

View File

@ -1,9 +0,0 @@
from __future__ import unicode_literals
def test_issue1773(en_tokenizer):
"""Test that spaces don't receive a POS but no TAG. This is the root cause
of the serialization issue reported in #1773."""
doc = en_tokenizer('\n')
if doc[0].pos_ == 'SPACE':
assert doc[0].tag_ != ""

View File

@ -6,8 +6,9 @@ from spacy.tokens import Doc
from spacy.displacy import render from spacy.displacy import render
from spacy.gold import iob_to_biluo from spacy.gold import iob_to_biluo
from spacy.lang.it import Italian from spacy.lang.it import Italian
import numpy
from ..util import add_vecs_to_vocab from ..util import add_vecs_to_vocab, get_doc
@pytest.mark.xfail @pytest.mark.xfail
@ -69,6 +70,26 @@ def test_issue2385_biluo(tags):
assert iob_to_biluo(tags) == list(tags) assert iob_to_biluo(tags) == list(tags)
def test_issue2396(en_vocab):
words = ["She", "created", "a", "test", "for", "spacy"]
heads = [1, 0, 1, -2, -1, -1]
matrix = numpy.array(
[
[0, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1],
[1, 1, 2, 3, 3, 3],
[1, 1, 3, 3, 3, 3],
[1, 1, 3, 3, 4, 4],
[1, 1, 3, 3, 4, 5],
],
dtype=numpy.int32,
)
doc = get_doc(en_vocab, words=words, heads=heads)
span = doc[:]
assert (doc.get_lca_matrix() == matrix).all()
assert (span.get_lca_matrix() == matrix).all()
def test_issue2482(): def test_issue2482():
"""Test we can serialize and deserialize a blank NER or parser model.""" """Test we can serialize and deserialize a blank NER or parser model."""
nlp = Italian() nlp = Italian()

View File

@ -1,35 +0,0 @@
# coding: utf-8
from __future__ import unicode_literals
from ..util import get_doc
import pytest
import numpy
@pytest.mark.parametrize(
"sentence,heads,matrix",
[
(
"She created a test for spacy",
[1, 0, 1, -2, -1, -1],
numpy.array(
[
[0, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1],
[1, 1, 2, 3, 3, 3],
[1, 1, 3, 3, 3, 3],
[1, 1, 3, 3, 4, 4],
[1, 1, 3, 3, 4, 5],
],
dtype=numpy.int32,
),
)
],
)
def test_issue2396(en_tokenizer, sentence, heads, matrix):
tokens = en_tokenizer(sentence)
doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads)
span = doc[:]
assert (doc.get_lca_matrix() == matrix).all()
assert (span.get_lca_matrix() == matrix).all()

View File

@ -1,14 +1,10 @@
# coding: utf8 # coding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
import pytest
from spacy.lang.en import English
def test_issue2754(): def test_issue2754(en_tokenizer):
"""Test that words like 'a' and 'a.m.' don't get exceptional norm values.""" """Test that words like 'a' and 'a.m.' don't get exceptional norm values."""
nlp = English() a = en_tokenizer("a")
a = nlp('a') assert a[0].norm_ == "a"
assert a[0].norm_ == 'a' am = en_tokenizer("am")
am = nlp('am') assert am[0].norm_ == "am"
assert am[0].norm_ == 'am'

View File

@ -9,4 +9,3 @@ def test_issue2835(en_tokenizer):
""" """
doc = en_tokenizer(text) doc = en_tokenizer(text)
assert doc assert doc

View File

@ -2,26 +2,24 @@
from __future__ import unicode_literals from __future__ import unicode_literals
import numpy import numpy
from spacy.vectors import Vectors
from spacy.vocab import Vocab from spacy.vocab import Vocab
from spacy.tokens import Doc
from spacy._ml import link_vectors_to_models from spacy._ml import link_vectors_to_models
def test_issue2871(): def test_issue2871():
"""Test that vectors recover the correct key for spaCy reserved words.""" """Test that vectors recover the correct key for spaCy reserved words."""
words = ['dog', 'cat', 'SUFFIX'] words = ["dog", "cat", "SUFFIX"]
vocab = Vocab() vocab = Vocab()
vocab.vectors.resize(shape=(3, 10)) vocab.vectors.resize(shape=(3, 10))
vector_data = numpy.zeros((3, 10), dtype='f') vector_data = numpy.zeros((3, 10), dtype="f")
for word in words: for word in words:
_ = vocab[word] _ = vocab[word] # noqa: F841
vocab.set_vector(word, vector_data[0]) vocab.set_vector(word, vector_data[0])
vocab.vectors.name = 'dummy_vectors' vocab.vectors.name = "dummy_vectors"
link_vectors_to_models(vocab) link_vectors_to_models(vocab)
assert vocab['dog'].rank == 0 assert vocab["dog"].rank == 0
assert vocab['cat'].rank == 1 assert vocab["cat"].rank == 1
assert vocab['SUFFIX'].rank == 2 assert vocab["SUFFIX"].rank == 2
assert vocab.vectors.find(key='dog') == 0 assert vocab.vectors.find(key="dog") == 0
assert vocab.vectors.find(key='cat') == 1 assert vocab.vectors.find(key="cat") == 1
assert vocab.vectors.find(key='SUFFIX') == 2 assert vocab.vectors.find(key="SUFFIX") == 2

View File

@ -58,9 +58,10 @@ def test_issue3009(doc, matcher, pattern):
matches = matcher(doc) matches = matcher(doc)
assert matches assert matches
def test_issue2464(matcher): def test_issue2464(matcher):
"""Test problem with successive ?. This is the same bug, so putting it here.""" """Test problem with successive ?. This is the same bug, so putting it here."""
doc = Doc(matcher.vocab, words=['a', 'b']) doc = Doc(matcher.vocab, words=["a", "b"])
matcher.add('4', None, [{'OP': '?'}, {'OP': '?'}]) matcher.add("4", None, [{"OP": "?"}, {"OP": "?"}])
matches = matcher(doc) matches = matcher(doc)
assert len(matches) == 3 assert len(matches) == 3

View File

@ -1,8 +1,6 @@
# coding: utf8 # coding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
import pytest
from ...attrs import ENT_IOB, ENT_TYPE from ...attrs import ENT_IOB, ENT_TYPE
from ...tokens import Doc from ...tokens import Doc
from ..util import get_doc from ..util import get_doc
@ -30,4 +28,4 @@ def test_issue3012(en_vocab):
# serializing then deserializing # serializing then deserializing
doc_bytes = doc.to_bytes() doc_bytes = doc.to_bytes()
doc2 = Doc(en_vocab).from_bytes(doc_bytes) doc2 = Doc(en_vocab).from_bytes(doc_bytes)
assert (doc[2].text, doc[2].pos_, doc[2].tag_, doc[2].ent_type_) == expected assert (doc2[2].text, doc2[2].pos_, doc2[2].tag_, doc2[2].ent_type_) == expected

View File

@ -1,10 +0,0 @@
from __future__ import unicode_literals
import pytest
import spacy
@pytest.mark.models('fr')
def test_issue1959(FR):
texts = ['Je suis la mauvaise herbe', "Me, myself and moi"]
for text in texts:
FR(text)