Tidy up and fix small bugs and typos

parent 9e652afa4b
commit 25602c794c
@@ -8,15 +8,14 @@ import time
from collections import Counter
from pathlib import Path
from thinc.v2v import Affine, Maxout
from thinc.api import wrap, layerize
from thinc.misc import LayerNorm as LN
from thinc.neural.util import prefer_gpu, get_array_module
from thinc.neural.util import prefer_gpu
from wasabi import Printer
import srsly

from ..tokens import Doc
from ..attrs import ID, HEAD
from .._ml import Tok2Vec, flatten, chain, zero_init, create_default_optimizer
from .._ml import Tok2Vec, flatten, chain, create_default_optimizer
from .._ml import masked_language_model
from .. import util

@@ -136,7 +135,7 @@ def pretrain(
random.shuffle(texts)

def make_update(model, docs, optimizer, drop=0.0, objective='L2'):
def make_update(model, docs, optimizer, drop=0.0, objective="L2"):
"""Perform an update over a single batch of documents.

docs (iterable): A batch of `Doc` objects.

@@ -171,7 +170,7 @@ def make_docs(nlp, batch, min_length=1, max_length=500):
return docs

def get_vectors_loss(ops, docs, prediction, objective='L2'):
def get_vectors_loss(ops, docs, prediction, objective="L2"):
"""Compute a mean-squared error loss between the documents' vectors and
the prediction.

@@ -185,7 +184,7 @@ def get_vectors_loss(ops, docs, prediction, objective='L2'):
# and look them up all at once. This prevents data copying.
ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
target = docs[0].vocab.vectors.data[ids]
if objective == 'L2':
if objective == "L2":
d_scores = prediction - target
loss = (d_scores ** 2).sum()
else:

@@ -201,8 +200,7 @@ def create_pretraining_model(nlp, tok2vec):
"""
output_size = nlp.vocab.vectors.data.shape[1]
output_layer = chain(
LN(Maxout(300, pieces=3)),
Affine(output_size, drop_factor=0.0),
LN(Maxout(300, pieces=3)), Affine(output_size, drop_factor=0.0)
)
# This is annoying, but the parser etc have the flatten step after
# the tok2vec. To load the weights in cleanly, we need to match

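For context only: the get_vectors_loss hunk above merely changes the quote style around the objective flag, and its L2 branch is a plain squared-error computation over the predicted and pretrained vectors. The following minimal NumPy sketch is illustrative and not part of the commit; the helper name, array shapes, and standalone form are assumptions made for the example.

import numpy

def l2_vectors_loss(prediction, target):
    # Difference between predicted vectors and the pretrained vector targets;
    # this difference is what gets backpropagated as d_scores, and it equals
    # the gradient of a 0.5 * squared-error objective.
    d_scores = prediction - target
    loss = (d_scores ** 2).sum()
    return loss, d_scores

# Toy usage: 4 tokens with 300-dimensional vectors.
pred = numpy.zeros((4, 300), dtype="f")
targ = numpy.ones((4, 300), dtype="f")
loss, grad = l2_vectors_loss(pred, targ)
print(loss)  # 1200.0
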
@@ -13,13 +13,7 @@ RENDER_WRAPPER = None

def render(
docs,
style="dep",
page=False,
minify=False,
jupyter=False,
options={},
manual=False,
docs, style="dep", page=False, minify=False, jupyter=False, options={}, manual=False
):
"""Render displaCy visualisation.

@@ -80,7 +74,7 @@ def serve(
"""
from wsgiref import simple_server

if IS_JUPYTER:
if is_in_jupyter():
user_warning(Warnings.W011)

render(docs, style=style, page=page, minify=minify, options=options, manual=manual)

@@ -1,8 +1,9 @@
# coding: utf8
from __future__ import unicode_literals

from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
from ..char_classes import CONCAT_QUOTES, CONCAT_ICONS, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CONCAT_QUOTES
from ..char_classes import CONCAT_ICONS, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER

# removing ° from the special icons to keep e.g. 99° as one token
_concat_icons = CONCAT_ICONS.replace("\u00B0", "")

@@ -29,7 +30,9 @@ _suffixes = (
r"(?<=°[FfCcKk])\.",
r"(?<=[0-9])(?:[{c}])".format(c=_currency),
r"(?<=[0-9])(?:{u})".format(u=UNITS),
r"(?<=[{al}{e}{q}(?:{c})])\.".format(al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, c=_currency),
r"(?<=[{al}{e}{q}(?:{c})])\.".format(
al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, c=_currency
),
r"(?<=[{al})])-e".format(al=ALPHA_LOWER),
]
)

@@ -40,7 +43,7 @@ _infixes = (
+ [
r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
r'(?<=[{a}])[:<>=](?=[{a}])'.format(a=ALPHA),
r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA),
r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
r"(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])".format(a=ALPHA, q=_quotes),

@@ -5,24 +5,24 @@ import re
from collections import namedtuple

from .tag_map import TAG_MAP

from ...attrs import LANG
from ...language import Language
from ...tokens import Doc, Token
from ...util import DummyTokenizer

ShortUnitWord = namedtuple("ShortUnitWord", ["surface", "lemma", "pos"])

# TODO: Is this the right place for this?
Token.set_extension("mecab_tag", default=None)

def try_mecab_import():
"""Mecab is required for Japanese support, so check for it.

It it's not available blow up and explain how to fix it."""
try:
import MeCab

# XXX Is this the right place for this?
Token.set_extension("mecab_tag", default=None)
return MeCab
except ImportError:
raise ImportError(

@@ -33,14 +33,13 @@ def try_mecab_import():

def resolve_pos(token):
"""If necessary, add a field to the POS tag for UD mapping.

Under Universal Dependencies, sometimes the same Unidic POS tag can
be mapped differently depending on the literal token or its context
in the sentence. This function adds information to the POS tag to
resolve ambiguous mappings.
"""

# NOTE: This is a first take. The rules here are crude approximations.
# TODO: This is a first take. The rules here are crude approximations.
# For many of these, full dependencies are needed to properly resolve
# PoS mappings.

@@ -56,7 +55,7 @@ def resolve_pos(token):

def detailed_tokens(tokenizer, text):
"""Format Mecab output into a nice data structure, based on Janome."""
tokenizer.parse(text)

node = tokenizer.parseToNode(text)
node = node.next # first node is beginning of sentence and empty, skip it
words = []

@@ -98,62 +97,15 @@ class JapaneseTokenizer(DummyTokenizer):
return doc

class JapaneseCharacterSegmenter(object):
def __init__(self, vocab):
self.vocab = vocab
self._presegmenter = self._make_presegmenter(self.vocab)

def _make_presegmenter(self, vocab):
rules = Japanese.Defaults.tokenizer_exceptions
token_match = Japanese.Defaults.token_match
prefix_search = (
util.compile_prefix_regex(Japanese.Defaults.prefixes).search
if Japanese.Defaults.prefixes
else None
)
suffix_search = (
util.compile_suffix_regex(Japanese.Defaults.suffixes).search
if Japanese.Defaults.suffixes
else None
)
infix_finditer = (
util.compile_infix_regex(Japanese.Defaults.infixes).finditer
if Japanese.Defaults.infixes
else None
)
return Tokenizer(
vocab,
rules=rules,
prefix_search=prefix_search,
suffix_search=suffix_search,
infix_finditer=infix_finditer,
token_match=token_match,
)

def __call__(self, text):
words = []
spaces = []
doc = self._presegmenter(text)
for token in doc:
words.extend(list(token.text))
spaces.extend([False] * len(token.text))
spaces[-1] = bool(token.whitespace_)
return Doc(self.vocab, words=words, spaces=spaces)

class JapaneseDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda _text: "ja"

tag_map = TAG_MAP
use_janome = True

@classmethod
def create_tokenizer(cls, nlp=None):
if cls.use_janome:
return JapaneseTokenizer(cls, nlp)
else:
return JapaneseCharacterSegmenter(nlp.vocab)

class Japanese(Language):

@@ -2,10 +2,10 @@
from __future__ import unicode_literals

from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_INFIXES
from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES

from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS

@@ -22,9 +22,9 @@ class PolishDefaults(Language.Defaults):
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
infixes = tuple(TOKENIZER_INFIXES)
stop_words = STOP_WORDS
tag_map = TAG_MAP
infixes = TOKENIZER_INFIXES

class Polish(Language):

@@ -1,14 +1,22 @@
# coding: utf8
from __future__ import unicode_literals
from ..char_classes import LIST_ELLIPSES, LIST_ICONS
from ..char_classes import QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
_quotes = QUOTES.replace("'", '')
_infixes = (LIST_ELLIPSES + LIST_ICONS +
[r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
r'(?<=[{a}])[,!?](?=[{a}])'.format(a=ALPHA),
r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),
r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
r'(?<=[{a}])([{q}\)\]\(\[])(?=[\{a}])'.format(a=ALPHA, q=_quotes),
r'(?<=[{a}])--(?=[{a}])'.format(a=ALPHA)])

from ..char_classes import LIST_ELLIPSES, CONCAT_ICONS
from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER

_quotes = CONCAT_QUOTES.replace("'", "")

_infixes = (
LIST_ELLIPSES
+ [CONCAT_ICONS]
+ [
r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA),
r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
r"(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])".format(a=ALPHA, q=CONCAT_QUOTES),
]
)

TOKENIZER_INFIXES = _infixes

@@ -2,6 +2,7 @@
from __future__ import unicode_literals

from ._tokenizer_exceptions_list import PL_BASE_EXCEPTIONS
from ...symbols import POS, ADV, NOUN, ORTH, LEMMA, ADJ

_exc = {}

@@ -6,7 +6,9 @@ from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS
from .morph_rules import MORPH_RULES
from .lemmatizer import LEMMA_RULES, LOOKUP
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES

# Punctuation stolen from Danish
from ..da.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES

from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS

@@ -31,6 +33,7 @@ class SwedishDefaults(Language.Defaults):
lemma_lookup = LOOKUP
morph_rules = MORPH_RULES

class Swedish(Language):
lang = "sv"
Defaults = SwedishDefaults

@@ -1,25 +0,0 @@
# coding: utf8
"""Punctuation stolen from Danish"""
from __future__ import unicode_literals

from ..char_classes import LIST_ELLIPSES, LIST_ICONS
from ..char_classes import QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
from ..punctuation import TOKENIZER_SUFFIXES

_quotes = QUOTES.replace("'", '')

_infixes = (LIST_ELLIPSES + LIST_ICONS +
[r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
r'(?<=[{a}])[,!?](?=[{a}])'.format(a=ALPHA),
r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),
r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
r'(?<=[{a}])([{q}\)\]\(\[])(?=[\{a}])'.format(a=ALPHA, q=_quotes),
r'(?<=[{a}])--(?=[{a}])'.format(a=ALPHA)])

_suffixes = [suffix for suffix in TOKENIZER_SUFFIXES if suffix not in ["'s", "'S", "’s", "’S", r"\'"]]
_suffixes += [r"(?<=[^sSxXzZ])\'"]

TOKENIZER_INFIXES = _infixes
TOKENIZER_SUFFIXES = _suffixes
@ -1,169 +1,191 @@
|
|||
# coding: utf8
|
||||
|
||||
"""
|
||||
Tag mappings according to https://universaldependencies.org/tagset-conversion/sv-suc-uposf.html
|
||||
for https://github.com/UniversalDependencies/UD_Swedish-Talbanken
|
||||
"""
|
||||
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ...symbols import POS, PUNCT, ADJ, CONJ, CCONJ, SCONJ, SYM, NUM, DET, ADV, ADP, X, VERB
|
||||
from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON, AUX
|
||||
from ...symbols import POS, PUNCT, ADJ, CCONJ, SCONJ, NUM, DET, ADV
|
||||
from ...symbols import ADP, X, VERB, NOUN, PROPN, PART, INTJ, PRON
|
||||
|
||||
|
||||
# Tag mappings according to https://universaldependencies.org/tagset-conversion/sv-suc-uposf.html
|
||||
# for https://github.com/UniversalDependencies/UD_Swedish-Talbanken
|
||||
|
||||
TAG_MAP = {
|
||||
'AB': { POS: ADV }, # inte, också, så, bara, nu
|
||||
'AB|AN': { POS: ADV }, # t.ex., ca, t_ex, bl.a., s_k
|
||||
'AB|KOM': { POS: ADV }, # mer, tidigare, mindre, vidare, mera
|
||||
'AB|POS': { POS: ADV }, # mycket, helt, ofta, länge, långt
|
||||
'AB|SMS': { POS: ADV }, # över-, in-
|
||||
'AB|SUV': { POS: ADV }, # minst, mest, högst, främst, helst
|
||||
'DT|MAS|SIN|DEF': { POS: DET },
|
||||
'DT|MAS|SIN|IND': { POS: DET },
|
||||
'DT|NEU|SIN|DEF': { POS: DET }, # det, detta
|
||||
'DT|NEU|SIN|IND': { POS: DET }, # ett, något, inget, vart, vartannat
|
||||
'DT|NEU|SIN|IND/DEF': { POS: DET }, # allt
|
||||
'DT|UTR/NEU|PLU|DEF': { POS: DET }, # de, dessa, bägge, dom
|
||||
'DT|UTR/NEU|PLU|IND': { POS: DET }, # några, inga
|
||||
'DT|UTR/NEU|PLU|IND/DEF': { POS: DET }, # alla
|
||||
'DT|UTR/NEU|SIN/PLU|IND': { POS: DET }, # samma
|
||||
'DT|UTR/NEU|SIN|DEF': { POS: DET }, # vardera
|
||||
'DT|UTR/NEU|SIN|IND': { POS: DET }, # varje, varenda
|
||||
'DT|UTR|SIN|DEF': { POS: DET }, # den, denna
|
||||
'DT|UTR|SIN|IND': { POS: DET }, # en, någon, ingen, var, varannan
|
||||
'DT|UTR|SIN|IND/DEF': { POS: DET }, # all
|
||||
'HA': { POS: ADV }, # när, där, hur, som, då
|
||||
'HD|NEU|SIN|IND': { POS: DET }, # vilket
|
||||
'HD|UTR/NEU|PLU|IND': { POS: DET }, # vilka
|
||||
'HD|UTR|SIN|IND': { POS: DET }, # vilken
|
||||
'HP|-|-|-': { POS: PRON }, # som
|
||||
'HP|NEU|SIN|IND': { POS: PRON }, # vad, vilket
|
||||
'HP|NEU|SIN|IND|SMS': { POS: PRON },
|
||||
'HP|UTR/NEU|PLU|IND': { POS: PRON }, # vilka
|
||||
'HP|UTR|SIN|IND': { POS: PRON }, # vilken, vem
|
||||
'HS|DEF': { POS: DET }, # vars, vilkas, Vems
|
||||
'IE': { POS: PART }, # att
|
||||
'IN': { POS: INTJ }, # Jo, ja, nej, fan, visst
|
||||
'JJ|AN': { POS: ADJ }, # ev, S:t, Kungl, Kungl., Teol
|
||||
'JJ|KOM|UTR/NEU|SIN/PLU|IND/DEF|GEN': { POS: ADJ }, # äldres
|
||||
'JJ|KOM|UTR/NEU|SIN/PLU|IND/DEF|NOM': { POS: ADJ }, # större, högre, mindre, bättre, äldre
|
||||
'JJ|KOM|UTR/NEU|SIN/PLU|IND/DEF|SMS': { POS: ADJ },
|
||||
'JJ|POS|MAS|SIN|DEF|GEN': { POS: ADJ }, # enskildes, sjukes, andres
|
||||
'JJ|POS|MAS|SIN|DEF|NOM': { POS: ADJ }, # enskilde, sjuke, andre, unge, ene
|
||||
'JJ|POS|NEU|SIN|IND/DEF|NOM': { POS: ADJ }, # eget
|
||||
'JJ|POS|NEU|SIN|IND|GEN': { POS: ADJ },
|
||||
'JJ|POS|NEU|SIN|IND|NOM': { POS: ADJ }, # annat, svårt, möjligt, nytt, sådant
|
||||
'JJ|POS|UTR/NEU|PLU|IND/DEF|GEN': { POS: ADJ }, # ogiftas, ungas, frånskildas, efterkommandes, färgblindas
|
||||
'JJ|POS|UTR/NEU|PLU|IND/DEF|NOM': { POS: ADJ }, # olika, andra, många, stora, vissa
|
||||
'JJ|POS|UTR/NEU|PLU|IND|NOM': { POS: ADJ }, # flera, sådana, fler, få, samtliga
|
||||
'JJ|POS|UTR/NEU|SIN/PLU|IND|NOM': { POS: ADJ },
|
||||
'JJ|POS|UTR/NEU|SIN/PLU|IND/DEF|NOM': { POS: ADJ }, # bra, ena, enda, nästa, ringa
|
||||
'JJ|POS|UTR/NEU|SIN|DEF|GEN': { POS: ADJ },
|
||||
'JJ|POS|UTR/NEU|SIN|DEF|NOM': { POS: ADJ }, # hela, nya, andra, svenska, ekonomiska
|
||||
'JJ|POS|UTR|-|-|SMS': { POS: ADJ }, # fri-, låg-, sexual-
|
||||
'JJ|POS|UTR|SIN|IND/DEF|NOM': { POS: ADJ }, # egen
|
||||
'JJ|POS|UTR|SIN|IND|GEN': { POS: ADJ }, # enskilds
|
||||
'JJ|POS|UTR|SIN|IND|NOM': { POS: ADJ }, # stor, annan, själv, sådan, viss
|
||||
'JJ|SUV|MAS|SIN|DEF|GEN': { POS: ADJ },
|
||||
'JJ|SUV|MAS|SIN|DEF|NOM': { POS: ADJ }, # störste, främste, äldste, minste
|
||||
'JJ|SUV|UTR/NEU|PLU|DEF|NOM': { POS: ADJ }, # flesta
|
||||
'JJ|SUV|UTR/NEU|PLU|IND|NOM': { POS: ADJ },
|
||||
'JJ|SUV|UTR/NEU|SIN/PLU|DEF|NOM': { POS: ADJ }, # bästa, största, närmaste, viktigaste, högsta
|
||||
'JJ|SUV|UTR/NEU|SIN/PLU|IND|NOM': { POS: ADJ }, # störst, bäst, tidigast, högst, fattigast
|
||||
'KN': { POS: CCONJ }, # och, eller, som, än, men
|
||||
'KN|AN': { POS: CCONJ },
|
||||
'MAD': { POS: PUNCT }, # ., ?, :, !, ...
|
||||
'MID': { POS: PUNCT }, # ,, -, :, *, ;
|
||||
'NN|-|-|-|-': { POS: NOUN }, # godo, fjol, fullo, somras, måtto
|
||||
'NN|AN': { POS: NOUN }, # kr, %, s., dr, kap.
|
||||
'NN|NEU|-|-|-': { POS: NOUN },
|
||||
'NN|NEU|-|-|SMS': { POS: NOUN }, # yrkes-, barn-, hem-, fack-, vatten-
|
||||
'NN|NEU|PLU|DEF|GEN': { POS: NOUN }, # barnens, årens, u-ländernas, företagens, århundradenas
|
||||
'NN|NEU|PLU|DEF|NOM': { POS: NOUN }, # barnen, u-länderna, åren, länderna, könen
|
||||
'NN|NEU|PLU|IND|GEN': { POS: NOUN }, # slags, års, barns, länders, tusentals
|
||||
'NN|NEU|PLU|IND|NOM': { POS: NOUN }, # barn, år, fall, länder, problem
|
||||
'NN|NEU|SIN|DEF|GEN': { POS: NOUN }, # äktenskapets, samhällets, barnets, 1800-talets, 1960-talets
|
||||
'NN|NEU|SIN|DEF|NOM': { POS: NOUN }, # äktenskapet, samhället, barnet, stället, hemmet
|
||||
'NN|NEU|SIN|IND|GEN': { POS: NOUN }, # års, slags, lands, havs, företags
|
||||
'NN|NEU|SIN|IND|NOM': { POS: NOUN }, # år, arbete, barn, sätt, äktenskap
|
||||
'NN|SMS': { POS: NOUN }, # PCB-, Syd-
|
||||
'NN|UTR|-|-|-': { POS: NOUN }, # dags, rätta
|
||||
'NN|UTR|-|-|SMS': { POS: NOUN }, # far-, kibbutz-, röntgen-, barna-, hälso-
|
||||
'NN|UTR|PLU|DEF|GEN': { POS: NOUN }, # föräldrarnas, kvinnornas, elevernas, kibbutzernas, makarnas
|
||||
'NN|UTR|PLU|DEF|NOM': { POS: NOUN }, # kvinnorna, föräldrarna, makarna, männen, hyrorna
|
||||
'NN|UTR|PLU|IND|GEN': { POS: NOUN }, # människors, kvinnors, dagars, tiders, månaders
|
||||
'NN|UTR|PLU|IND|NOM': { POS: NOUN }, # procent, människor, kvinnor, miljoner, kronor
|
||||
'NN|UTR|SIN|DEF|GEN': { POS: NOUN }, # kvinnans, världens, familjens, dagens, jordens
|
||||
'NN|UTR|SIN|DEF|NOM': { POS: NOUN }, # familjen, kvinnan, mannen, världen, skolan
|
||||
'NN|UTR|SIN|IND|GEN': { POS: NOUN }, # sorts, medelålders, makes, kvinnas, veckas
|
||||
'NN|UTR|SIN|IND|NOM': { POS: NOUN }, # del, tid, dag, fråga, man
|
||||
'PAD': { POS: PUNCT }, # , ), (
|
||||
'PC|AN': { POS: VERB },
|
||||
'PC|PRF|MAS|SIN|DEF|GEN': { POS: VERB }, # avlidnes
|
||||
'PC|PRF|MAS|SIN|DEF|NOM': { POS: VERB },
|
||||
'PC|PRF|NEU|SIN|IND|NOM': { POS: VERB }, # taget, sett, särskilt, förbjudet, ökat
|
||||
'PC|PRF|UTR/NEU|PLU|IND/DEF|GEN': { POS: VERB }, # försäkrades, anställdas
|
||||
'PC|PRF|UTR/NEU|PLU|IND/DEF|NOM': { POS: VERB }, # särskilda, gifta, ökade, handikappade, skilda
|
||||
'PC|PRF|UTR/NEU|SIN|DEF|GEN': { POS: VERB },
|
||||
'PC|PRF|UTR/NEU|SIN|DEF|NOM': { POS: VERB }, # ökade, gifta, nämnda, nedärvda, dolda
|
||||
'PC|PRF|UTR|SIN|IND|GEN': { POS: VERB },
|
||||
'PC|PRF|UTR|SIN|IND|NOM': { POS: VERB }, # särskild, ökad, beredd, gift, oförändrad
|
||||
'PC|PRS|UTR/NEU|SIN/PLU|IND/DEF|GEN': { POS: VERB }, # studerandes, sammanboendes, dubbelarbetandes
|
||||
'PC|PRS|UTR/NEU|SIN/PLU|IND/DEF|NOM': { POS: VERB }, # följande, beroende, nuvarande, motsvarande, liknande
|
||||
'PL': { POS: PART }, # ut, upp, in, till, med
|
||||
'PL|SMS': { POS: PART },
|
||||
'PM': { POS: PROPN }, # F, N, Liechtenstein, Danmark, DK
|
||||
'PM|GEN': { POS: PROPN }, # Sveriges, EEC:s, Guds, Stockholms, Kristi
|
||||
'PM|NOM': { POS: PROPN }, # Sverige, EEC, Stockholm, USA, ATP
|
||||
'PM|SMS': { POS: PROPN }, # Göteborgs-, Nord-, Väst-
|
||||
'PN|MAS|SIN|DEF|SUB/OBJ': { POS: PRON }, # denne
|
||||
'PN|NEU|SIN|DEF|SUB/OBJ': { POS: PRON }, # det, detta, detsamma
|
||||
'PN|NEU|SIN|IND|SUB/OBJ': { POS: PRON }, # något, allt, mycket, annat, ingenting
|
||||
'PN|UTR/NEU|PLU|DEF|OBJ': { POS: PRON }, # dem, varandra, varann
|
||||
'PN|UTR/NEU|PLU|DEF|SUB': { POS: PRON }, # de, bägge
|
||||
'PN|UTR/NEU|PLU|DEF|SUB/OBJ': { POS: PRON }, # dessa, dom, båda, den, bådadera
|
||||
'PN|UTR/NEU|PLU|IND|SUB/OBJ': { POS: PRON }, # andra, alla, många, sådana, några
|
||||
'PN|UTR/NEU|SIN/PLU|DEF|OBJ': { POS: PRON }, # sig, sej
|
||||
'PN|UTR|PLU|DEF|OBJ': { POS: PRON }, # oss, er, eder
|
||||
'PN|UTR|PLU|DEF|SUB': { POS: PRON }, # vi
|
||||
'PN|UTR|SIN|DEF|OBJ': { POS: PRON }, # dig, mig, henne, honom, Er
|
||||
'PN|UTR|SIN|DEF|SUB': { POS: PRON }, # du, han, hon, jag, ni
|
||||
'PN|UTR|SIN|DEF|SUB/OBJ': { POS: PRON }, # den, denna, densamma
|
||||
'PN|UTR|SIN|IND|SUB': { POS: PRON }, # man
|
||||
'PN|UTR|SIN|IND|SUB/OBJ': { POS: PRON }, # en, var, någon, ingen, Varannan
|
||||
'PP': { POS: ADP }, # i, av, på, för, till
|
||||
'PP|AN': { POS: ADP }, # f
|
||||
'PS|AN': { POS: DET },
|
||||
'PS|NEU|SIN|DEF': { POS: DET }, # sitt, vårt, ditt, mitt, ert
|
||||
'PS|UTR/NEU|PLU|DEF': { POS: DET }, # sina, våra, dina, mina
|
||||
'PS|UTR/NEU|SIN/PLU|DEF': { POS: DET }, # deras, dess, hans, hennes, varandras
|
||||
'PS|UTR|SIN|DEF': { POS: DET }, # sin, vår, din, min, er
|
||||
'RG': { POS: NUM }, # 2, 17, 20, 1, 18
|
||||
'RG|GEN': { POS: NUM },
|
||||
'RG|MAS|SIN|DEF|NOM': { POS: NUM },
|
||||
'RG|NEU|SIN|IND|NOM': { POS: NUM }, # ett
|
||||
'RG|NOM': { POS: NUM }, # två, tre, 1, 20, 2
|
||||
'RG|SMS': { POS: NUM }, # ett-, 1950-, två-, tre-, 1700-
|
||||
'RG|UTR/NEU|SIN|DEF|NOM': { POS: NUM },
|
||||
'RG|UTR|SIN|IND|NOM': { POS: NUM }, # en
|
||||
'RO|MAS|SIN|IND/DEF|GEN': { POS: ADJ },
|
||||
'RO|MAS|SIN|IND/DEF|NOM': { POS: ADJ }, # förste
|
||||
'RO|GEN': { POS: ADJ },
|
||||
'RO|NOM': { POS: ADJ }, # första, andra, tredje, fjärde, femte
|
||||
'SN': { POS: SCONJ }, # att, om, innan, eftersom, medan
|
||||
'UO': { POS: X }, # companionship, vice, versa, family, capita
|
||||
'VB|AN': { POS: VERB }, # jfr
|
||||
'VB|IMP|AKT': { POS: VERB }, # se, Diskutera, låt, Läs, Gå
|
||||
'VB|IMP|SFO': { POS: VERB }, # tas
|
||||
'VB|INF|AKT': { POS: VERB }, # vara, få, ha, bli, kunna
|
||||
'VB|INF|SFO': { POS: VERB }, # användas, finnas, göras, tas, ses
|
||||
'VB|KON|PRS|AKT': { POS: VERB }, # vare, Gånge
|
||||
'VB|KON|PRT|AKT': { POS: VERB }, # vore, finge
|
||||
'VB|KON|PRT|SFO': { POS: VERB },
|
||||
'VB|PRS|AKT': { POS: VERB }, # är, har, kan, får, måste
|
||||
'VB|PRS|SFO': { POS: VERB }, # finns, kallas, behövs, beräknas, används
|
||||
'VB|PRT|AKT': { POS: VERB }, # skulle, var, hade, kunde, fick
|
||||
'VB|PRT|SFO': { POS: VERB }, # fanns, gjordes, höjdes, användes, infördes
|
||||
'VB|SMS': { POS: VERB }, # läs-
|
||||
'VB|SUP|AKT': { POS: VERB }, # varit, fått, blivit, haft, kommit
|
||||
'VB|SUP|SFO': { POS: VERB } # nämnts, gjorts, förändrats, sagts, framhållits
|
||||
"AB": {POS: ADV}, # inte, också, så, bara, nu
|
||||
"AB|AN": {POS: ADV}, # t.ex., ca, t_ex, bl.a., s_k
|
||||
"AB|KOM": {POS: ADV}, # mer, tidigare, mindre, vidare, mera
|
||||
"AB|POS": {POS: ADV}, # mycket, helt, ofta, länge, långt
|
||||
"AB|SMS": {POS: ADV}, # över-, in-
|
||||
"AB|SUV": {POS: ADV}, # minst, mest, högst, främst, helst
|
||||
"DT|MAS|SIN|DEF": {POS: DET},
|
||||
"DT|MAS|SIN|IND": {POS: DET},
|
||||
"DT|NEU|SIN|DEF": {POS: DET}, # det, detta
|
||||
"DT|NEU|SIN|IND": {POS: DET}, # ett, något, inget, vart, vartannat
|
||||
"DT|NEU|SIN|IND/DEF": {POS: DET}, # allt
|
||||
"DT|UTR/NEU|PLU|DEF": {POS: DET}, # de, dessa, bägge, dom
|
||||
"DT|UTR/NEU|PLU|IND": {POS: DET}, # några, inga
|
||||
"DT|UTR/NEU|PLU|IND/DEF": {POS: DET}, # alla
|
||||
"DT|UTR/NEU|SIN/PLU|IND": {POS: DET}, # samma
|
||||
"DT|UTR/NEU|SIN|DEF": {POS: DET}, # vardera
|
||||
"DT|UTR/NEU|SIN|IND": {POS: DET}, # varje, varenda
|
||||
"DT|UTR|SIN|DEF": {POS: DET}, # den, denna
|
||||
"DT|UTR|SIN|IND": {POS: DET}, # en, någon, ingen, var, varannan
|
||||
"DT|UTR|SIN|IND/DEF": {POS: DET}, # all
|
||||
"HA": {POS: ADV}, # när, där, hur, som, då
|
||||
"HD|NEU|SIN|IND": {POS: DET}, # vilket
|
||||
"HD|UTR/NEU|PLU|IND": {POS: DET}, # vilka
|
||||
"HD|UTR|SIN|IND": {POS: DET}, # vilken
|
||||
"HP|-|-|-": {POS: PRON}, # som
|
||||
"HP|NEU|SIN|IND": {POS: PRON}, # vad, vilket
|
||||
"HP|NEU|SIN|IND|SMS": {POS: PRON},
|
||||
"HP|UTR/NEU|PLU|IND": {POS: PRON}, # vilka
|
||||
"HP|UTR|SIN|IND": {POS: PRON}, # vilken, vem
|
||||
"HS|DEF": {POS: DET}, # vars, vilkas, Vems
|
||||
"IE": {POS: PART}, # att
|
||||
"IN": {POS: INTJ}, # Jo, ja, nej, fan, visst
|
||||
"JJ|AN": {POS: ADJ}, # ev, S:t, Kungl, Kungl., Teol
|
||||
"JJ|KOM|UTR/NEU|SIN/PLU|IND/DEF|GEN": {POS: ADJ}, # äldres
|
||||
"JJ|KOM|UTR/NEU|SIN/PLU|IND/DEF|NOM": {
|
||||
POS: ADJ
|
||||
}, # större, högre, mindre, bättre, äldre
|
||||
"JJ|KOM|UTR/NEU|SIN/PLU|IND/DEF|SMS": {POS: ADJ},
|
||||
"JJ|POS|MAS|SIN|DEF|GEN": {POS: ADJ}, # enskildes, sjukes, andres
|
||||
"JJ|POS|MAS|SIN|DEF|NOM": {POS: ADJ}, # enskilde, sjuke, andre, unge, ene
|
||||
"JJ|POS|NEU|SIN|IND/DEF|NOM": {POS: ADJ}, # eget
|
||||
"JJ|POS|NEU|SIN|IND|GEN": {POS: ADJ},
|
||||
"JJ|POS|NEU|SIN|IND|NOM": {POS: ADJ}, # annat, svårt, möjligt, nytt, sådant
|
||||
"JJ|POS|UTR/NEU|PLU|IND/DEF|GEN": {
|
||||
POS: ADJ
|
||||
}, # ogiftas, ungas, frånskildas, efterkommandes, färgblindas
|
||||
"JJ|POS|UTR/NEU|PLU|IND/DEF|NOM": {POS: ADJ}, # olika, andra, många, stora, vissa
|
||||
"JJ|POS|UTR/NEU|PLU|IND|NOM": {POS: ADJ}, # flera, sådana, fler, få, samtliga
|
||||
"JJ|POS|UTR/NEU|SIN/PLU|IND|NOM": {POS: ADJ},
|
||||
"JJ|POS|UTR/NEU|SIN/PLU|IND/DEF|NOM": {POS: ADJ}, # bra, ena, enda, nästa, ringa
|
||||
"JJ|POS|UTR/NEU|SIN|DEF|GEN": {POS: ADJ},
|
||||
"JJ|POS|UTR/NEU|SIN|DEF|NOM": {POS: ADJ}, # hela, nya, andra, svenska, ekonomiska
|
||||
"JJ|POS|UTR|-|-|SMS": {POS: ADJ}, # fri-, låg-, sexual-
|
||||
"JJ|POS|UTR|SIN|IND/DEF|NOM": {POS: ADJ}, # egen
|
||||
"JJ|POS|UTR|SIN|IND|GEN": {POS: ADJ}, # enskilds
|
||||
"JJ|POS|UTR|SIN|IND|NOM": {POS: ADJ}, # stor, annan, själv, sådan, viss
|
||||
"JJ|SUV|MAS|SIN|DEF|GEN": {POS: ADJ},
|
||||
"JJ|SUV|MAS|SIN|DEF|NOM": {POS: ADJ}, # störste, främste, äldste, minste
|
||||
"JJ|SUV|UTR/NEU|PLU|DEF|NOM": {POS: ADJ}, # flesta
|
||||
"JJ|SUV|UTR/NEU|PLU|IND|NOM": {POS: ADJ},
|
||||
"JJ|SUV|UTR/NEU|SIN/PLU|DEF|NOM": {
|
||||
POS: ADJ
|
||||
}, # bästa, största, närmaste, viktigaste, högsta
|
||||
"JJ|SUV|UTR/NEU|SIN/PLU|IND|NOM": {
|
||||
POS: ADJ
|
||||
}, # störst, bäst, tidigast, högst, fattigast
|
||||
"KN": {POS: CCONJ}, # och, eller, som, än, men
|
||||
"KN|AN": {POS: CCONJ},
|
||||
"MAD": {POS: PUNCT}, # ., ?, :, !, ...
|
||||
"MID": {POS: PUNCT}, # ,, -, :, *, ;
|
||||
"NN|-|-|-|-": {POS: NOUN}, # godo, fjol, fullo, somras, måtto
|
||||
"NN|AN": {POS: NOUN}, # kr, %, s., dr, kap.
|
||||
"NN|NEU|-|-|-": {POS: NOUN},
|
||||
"NN|NEU|-|-|SMS": {POS: NOUN}, # yrkes-, barn-, hem-, fack-, vatten-
|
||||
"NN|NEU|PLU|DEF|GEN": {
|
||||
POS: NOUN
|
||||
}, # barnens, årens, u-ländernas, företagens, århundradenas
|
||||
"NN|NEU|PLU|DEF|NOM": {POS: NOUN}, # barnen, u-länderna, åren, länderna, könen
|
||||
"NN|NEU|PLU|IND|GEN": {POS: NOUN}, # slags, års, barns, länders, tusentals
|
||||
"NN|NEU|PLU|IND|NOM": {POS: NOUN}, # barn, år, fall, länder, problem
|
||||
"NN|NEU|SIN|DEF|GEN": {
|
||||
POS: NOUN
|
||||
}, # äktenskapets, samhällets, barnets, 1800-talets, 1960-talets
|
||||
"NN|NEU|SIN|DEF|NOM": {
|
||||
POS: NOUN
|
||||
}, # äktenskapet, samhället, barnet, stället, hemmet
|
||||
"NN|NEU|SIN|IND|GEN": {POS: NOUN}, # års, slags, lands, havs, företags
|
||||
"NN|NEU|SIN|IND|NOM": {POS: NOUN}, # år, arbete, barn, sätt, äktenskap
|
||||
"NN|SMS": {POS: NOUN}, # PCB-, Syd-
|
||||
"NN|UTR|-|-|-": {POS: NOUN}, # dags, rätta
|
||||
"NN|UTR|-|-|SMS": {POS: NOUN}, # far-, kibbutz-, röntgen-, barna-, hälso-
|
||||
"NN|UTR|PLU|DEF|GEN": {
|
||||
POS: NOUN
|
||||
}, # föräldrarnas, kvinnornas, elevernas, kibbutzernas, makarnas
|
||||
"NN|UTR|PLU|DEF|NOM": {
|
||||
POS: NOUN
|
||||
}, # kvinnorna, föräldrarna, makarna, männen, hyrorna
|
||||
"NN|UTR|PLU|IND|GEN": {POS: NOUN}, # människors, kvinnors, dagars, tiders, månaders
|
||||
"NN|UTR|PLU|IND|NOM": {POS: NOUN}, # procent, människor, kvinnor, miljoner, kronor
|
||||
"NN|UTR|SIN|DEF|GEN": {POS: NOUN}, # kvinnans, världens, familjens, dagens, jordens
|
||||
"NN|UTR|SIN|DEF|NOM": {POS: NOUN}, # familjen, kvinnan, mannen, världen, skolan
|
||||
"NN|UTR|SIN|IND|GEN": {POS: NOUN}, # sorts, medelålders, makes, kvinnas, veckas
|
||||
"NN|UTR|SIN|IND|NOM": {POS: NOUN}, # del, tid, dag, fråga, man
|
||||
"PAD": {POS: PUNCT}, # , ), (
|
||||
"PC|AN": {POS: VERB},
|
||||
"PC|PRF|MAS|SIN|DEF|GEN": {POS: VERB}, # avlidnes
|
||||
"PC|PRF|MAS|SIN|DEF|NOM": {POS: VERB},
|
||||
"PC|PRF|NEU|SIN|IND|NOM": {POS: VERB}, # taget, sett, särskilt, förbjudet, ökat
|
||||
"PC|PRF|UTR/NEU|PLU|IND/DEF|GEN": {POS: VERB}, # försäkrades, anställdas
|
||||
"PC|PRF|UTR/NEU|PLU|IND/DEF|NOM": {
|
||||
POS: VERB
|
||||
}, # särskilda, gifta, ökade, handikappade, skilda
|
||||
"PC|PRF|UTR/NEU|SIN|DEF|GEN": {POS: VERB},
|
||||
"PC|PRF|UTR/NEU|SIN|DEF|NOM": {POS: VERB}, # ökade, gifta, nämnda, nedärvda, dolda
|
||||
"PC|PRF|UTR|SIN|IND|GEN": {POS: VERB},
|
||||
"PC|PRF|UTR|SIN|IND|NOM": {POS: VERB}, # särskild, ökad, beredd, gift, oförändrad
|
||||
"PC|PRS|UTR/NEU|SIN/PLU|IND/DEF|GEN": {
|
||||
POS: VERB
|
||||
}, # studerandes, sammanboendes, dubbelarbetandes
|
||||
"PC|PRS|UTR/NEU|SIN/PLU|IND/DEF|NOM": {
|
||||
POS: VERB
|
||||
}, # följande, beroende, nuvarande, motsvarande, liknande
|
||||
"PL": {POS: PART}, # ut, upp, in, till, med
|
||||
"PL|SMS": {POS: PART},
|
||||
"PM": {POS: PROPN}, # F, N, Liechtenstein, Danmark, DK
|
||||
"PM|GEN": {POS: PROPN}, # Sveriges, EEC:s, Guds, Stockholms, Kristi
|
||||
"PM|NOM": {POS: PROPN}, # Sverige, EEC, Stockholm, USA, ATP
|
||||
"PM|SMS": {POS: PROPN}, # Göteborgs-, Nord-, Väst-
|
||||
"PN|MAS|SIN|DEF|SUB/OBJ": {POS: PRON}, # denne
|
||||
"PN|NEU|SIN|DEF|SUB/OBJ": {POS: PRON}, # det, detta, detsamma
|
||||
"PN|NEU|SIN|IND|SUB/OBJ": {POS: PRON}, # något, allt, mycket, annat, ingenting
|
||||
"PN|UTR/NEU|PLU|DEF|OBJ": {POS: PRON}, # dem, varandra, varann
|
||||
"PN|UTR/NEU|PLU|DEF|SUB": {POS: PRON}, # de, bägge
|
||||
"PN|UTR/NEU|PLU|DEF|SUB/OBJ": {POS: PRON}, # dessa, dom, båda, den, bådadera
|
||||
"PN|UTR/NEU|PLU|IND|SUB/OBJ": {POS: PRON}, # andra, alla, många, sådana, några
|
||||
"PN|UTR/NEU|SIN/PLU|DEF|OBJ": {POS: PRON}, # sig, sej
|
||||
"PN|UTR|PLU|DEF|OBJ": {POS: PRON}, # oss, er, eder
|
||||
"PN|UTR|PLU|DEF|SUB": {POS: PRON}, # vi
|
||||
"PN|UTR|SIN|DEF|OBJ": {POS: PRON}, # dig, mig, henne, honom, Er
|
||||
"PN|UTR|SIN|DEF|SUB": {POS: PRON}, # du, han, hon, jag, ni
|
||||
"PN|UTR|SIN|DEF|SUB/OBJ": {POS: PRON}, # den, denna, densamma
|
||||
"PN|UTR|SIN|IND|SUB": {POS: PRON}, # man
|
||||
"PN|UTR|SIN|IND|SUB/OBJ": {POS: PRON}, # en, var, någon, ingen, Varannan
|
||||
"PP": {POS: ADP}, # i, av, på, för, till
|
||||
"PP|AN": {POS: ADP}, # f
|
||||
"PS|AN": {POS: DET},
|
||||
"PS|NEU|SIN|DEF": {POS: DET}, # sitt, vårt, ditt, mitt, ert
|
||||
"PS|UTR/NEU|PLU|DEF": {POS: DET}, # sina, våra, dina, mina
|
||||
"PS|UTR/NEU|SIN/PLU|DEF": {POS: DET}, # deras, dess, hans, hennes, varandras
|
||||
"PS|UTR|SIN|DEF": {POS: DET}, # sin, vår, din, min, er
|
||||
"RG": {POS: NUM}, # 2, 17, 20, 1, 18
|
||||
"RG|GEN": {POS: NUM},
|
||||
"RG|MAS|SIN|DEF|NOM": {POS: NUM},
|
||||
"RG|NEU|SIN|IND|NOM": {POS: NUM}, # ett
|
||||
"RG|NOM": {POS: NUM}, # två, tre, 1, 20, 2
|
||||
"RG|SMS": {POS: NUM}, # ett-, 1950-, två-, tre-, 1700-
|
||||
"RG|UTR/NEU|SIN|DEF|NOM": {POS: NUM},
|
||||
"RG|UTR|SIN|IND|NOM": {POS: NUM}, # en
|
||||
"RO|MAS|SIN|IND/DEF|GEN": {POS: ADJ},
|
||||
"RO|MAS|SIN|IND/DEF|NOM": {POS: ADJ}, # förste
|
||||
"RO|GEN": {POS: ADJ},
|
||||
"RO|NOM": {POS: ADJ}, # första, andra, tredje, fjärde, femte
|
||||
"SN": {POS: SCONJ}, # att, om, innan, eftersom, medan
|
||||
"UO": {POS: X}, # companionship, vice, versa, family, capita
|
||||
"VB|AN": {POS: VERB}, # jfr
|
||||
"VB|IMP|AKT": {POS: VERB}, # se, Diskutera, låt, Läs, Gå
|
||||
"VB|IMP|SFO": {POS: VERB}, # tas
|
||||
"VB|INF|AKT": {POS: VERB}, # vara, få, ha, bli, kunna
|
||||
"VB|INF|SFO": {POS: VERB}, # användas, finnas, göras, tas, ses
|
||||
"VB|KON|PRS|AKT": {POS: VERB}, # vare, Gånge
|
||||
"VB|KON|PRT|AKT": {POS: VERB}, # vore, finge
|
||||
"VB|KON|PRT|SFO": {POS: VERB},
|
||||
"VB|PRS|AKT": {POS: VERB}, # är, har, kan, får, måste
|
||||
"VB|PRS|SFO": {POS: VERB}, # finns, kallas, behövs, beräknas, används
|
||||
"VB|PRT|AKT": {POS: VERB}, # skulle, var, hade, kunde, fick
|
||||
"VB|PRT|SFO": {POS: VERB}, # fanns, gjordes, höjdes, användes, infördes
|
||||
"VB|SMS": {POS: VERB}, # läs-
|
||||
"VB|SUP|AKT": {POS: VERB}, # varit, fått, blivit, haft, kommit
|
||||
"VB|SUP|SFO": {POS: VERB}, # nämnts, gjorts, förändrats, sagts, framhållits
|
||||
}
|
||||
|
|
|
@@ -144,7 +144,7 @@ ABBREVIATIONS = [

# Add abbreviation for trailing punctuation too. If the abbreviation already has a trailing punctuation - skip it.
for abbr in ABBREVIATIONS:
if abbr.endswith(".") == False:
if not abbr.endswith("."):
ABBREVIATIONS.append(abbr + ".")

for orth in ABBREVIATIONS:

@@ -4,16 +4,15 @@ from __future__ import unicode_literals
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS

from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...attrs import LANG
from ...util import update_exc

class TamilDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: "ta"
lex_attr_getters.update(LEX_ATTRS)
stop_words = STOP_WORDS

class Tamil(Language):

@ -4,70 +4,33 @@ from __future__ import unicode_literals
|
|||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
|
||||
# uncomment if files are available
|
||||
# from .norm_exceptions import NORM_EXCEPTIONS
|
||||
from .tag_map import TAG_MAP
|
||||
# from .morph_rules import MORPH_RULES
|
||||
|
||||
# uncomment if lookup-based lemmatizer is available
|
||||
from .lemmatizer import LOOKUP
|
||||
# from ...lemmatizerlookup import Lemmatizer
|
||||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
from ...language import Language
|
||||
from ...attrs import LANG, NORM
|
||||
from ...util import update_exc, add_lookups
|
||||
|
||||
|
||||
def _return_tl(_):
|
||||
return 'tl'
|
||||
|
||||
|
||||
# Create a Language subclass
|
||||
# Documentation: https://spacy.io/docs/usage/adding-languages
|
||||
|
||||
# This file should be placed in spacy/lang/xx (ISO code of language).
|
||||
# Before submitting a pull request, make sure the remove all comments from the
|
||||
# language data files, and run at least the basic tokenizer tests. Simply add the
|
||||
# language ID to the list of languages in spacy/tests/conftest.py to include it
|
||||
# in the basic tokenizer sanity tests. You can optionally add a fixture for the
|
||||
# language's tokenizer and add more specific tests. For more info, see the
|
||||
# tests documentation: https://github.com/explosion/spaCy/tree/master/spacy/tests
|
||||
return "tl"
|
||||
|
||||
|
||||
class TagalogDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = _return_tl # ISO code
|
||||
# add more norm exception dictionaries here
|
||||
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
|
||||
|
||||
# overwrite functions for lexical attributes
|
||||
lex_attr_getters[LANG] = _return_tl
|
||||
lex_attr_getters[NORM] = add_lookups(
|
||||
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
|
||||
)
|
||||
lex_attr_getters.update(LEX_ATTRS)
|
||||
|
||||
# add custom tokenizer exceptions to base exceptions
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
|
||||
# add stop words
|
||||
stop_words = STOP_WORDS
|
||||
|
||||
# if available: add tag map
|
||||
# tag_map = dict(TAG_MAP)
|
||||
|
||||
# if available: add morph rules
|
||||
# morph_rules = dict(MORPH_RULES)
|
||||
|
||||
# if available: add lookup lemmatizer
|
||||
# @classmethod
|
||||
# def create_lemmatizer(cls, nlp=None):
|
||||
# return Lemmatizer(LOOKUP)
|
||||
lemma_lookup = LOOKUP
|
||||
|
||||
|
||||
class Tagalog(Language):
|
||||
lang = 'tl' # ISO code
|
||||
Defaults = TagalogDefaults # set Defaults to custom language defaults
|
||||
lang = "tl"
|
||||
Defaults = TagalogDefaults
|
||||
|
||||
|
||||
# set default export – this allows the language class to be lazy-loaded
|
||||
__all__ = ['Tagalog']
|
||||
__all__ = ["Tagalog"]
|
||||
|
|
|
@@ -2,11 +2,6 @@
from __future__ import unicode_literals

# Adding a lemmatizer lookup table
# Documentation: https://spacy.io/docs/usage/adding-languages#lemmatizer
# Entries should be added in the following format:

LOOKUP = {
"kaugnayan": "ugnay",
"sangkatauhan": "tao",

@@ -14,5 +9,5 @@ LOOKUP = {
"pandaigdigan": "daigdig",
"kasaysayan": "saysay",
"kabayanihan": "bayani",
"karuwagan": "duwag"
"karuwagan": "duwag",
}

@ -1,33 +1,55 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
# import the symbols for the attrs you want to overwrite
|
||||
from ...attrs import LIKE_NUM
|
||||
|
||||
|
||||
# Overwriting functions for lexical attributes
|
||||
# Documentation: https://localhost:1234/docs/usage/adding-languages#lex-attrs
|
||||
# Most of these functions, like is_lower or like_url should be language-
|
||||
# independent. Others, like like_num (which includes both digits and number
|
||||
# words), requires customisation.
|
||||
|
||||
|
||||
# Example: check if token resembles a number
|
||||
|
||||
_num_words = ['sero', 'isa', 'dalawa', 'tatlo', 'apat', 'lima', 'anim', 'pito',
|
||||
'walo', 'siyam', 'sampu', 'labing-isa', 'labindalawa', 'labintatlo', 'labing-apat',
|
||||
'labinlima', 'labing-anim', 'labimpito', 'labing-walo', 'labinsiyam', 'dalawampu',
|
||||
'tatlumpu', 'apatnapu', 'limampu', 'animnapu', 'pitumpu', 'walumpu', 'siyamnapu',
|
||||
'daan', 'libo', 'milyon', 'bilyon', 'trilyon', 'quadrilyon',
|
||||
'gajilyon', 'bazilyon']
|
||||
_num_words = [
|
||||
"sero",
|
||||
"isa",
|
||||
"dalawa",
|
||||
"tatlo",
|
||||
"apat",
|
||||
"lima",
|
||||
"anim",
|
||||
"pito",
|
||||
"walo",
|
||||
"siyam",
|
||||
"sampu",
|
||||
"labing-isa",
|
||||
"labindalawa",
|
||||
"labintatlo",
|
||||
"labing-apat",
|
||||
"labinlima",
|
||||
"labing-anim",
|
||||
"labimpito",
|
||||
"labing-walo",
|
||||
"labinsiyam",
|
||||
"dalawampu",
|
||||
"tatlumpu",
|
||||
"apatnapu",
|
||||
"limampu",
|
||||
"animnapu",
|
||||
"pitumpu",
|
||||
"walumpu",
|
||||
"siyamnapu",
|
||||
"daan",
|
||||
"libo",
|
||||
"milyon",
|
||||
"bilyon",
|
||||
"trilyon",
|
||||
"quadrilyon",
|
||||
"gajilyon",
|
||||
"bazilyon",
|
||||
]
|
||||
|
||||
|
||||
def like_num(text):
|
||||
text = text.replace(',', '').replace('.', '')
|
||||
text = text.replace(",", "").replace(".", "")
|
||||
if text.isdigit():
|
||||
return True
|
||||
if text.count('/') == 1:
|
||||
num, denom = text.split('/')
|
||||
if text.count("/") == 1:
|
||||
num, denom = text.split("/")
|
||||
if num.isdigit() and denom.isdigit():
|
||||
return True
|
||||
if text in _num_words:
|
||||
|
@ -35,9 +57,4 @@ def like_num(text):
|
|||
return False
|
||||
|
||||
|
||||
# Create dictionary of functions to overwrite. The default lex_attr_getters are
|
||||
# updated with this one, so only the functions defined here are overwritten.
|
||||
|
||||
LEX_ATTRS = {
|
||||
LIKE_NUM: like_num
|
||||
}
|
||||
LEX_ATTRS = {LIKE_NUM: like_num}
|
||||
|
|
|
@ -1,17 +1,8 @@
|
|||
# encoding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
# Add stop words
|
||||
# Documentation: https://spacy.io/docs/usage/adding-languages#stop-words
|
||||
# To improve readability, words should be ordered alphabetically and separated
|
||||
# by spaces and newlines. When adding stop words from an online source, always
|
||||
# include the link in a comment. Make sure to proofread and double-check the
|
||||
# words – lists available online are often known to contain mistakes.
|
||||
|
||||
# data from https://github.com/stopwords-iso/stopwords-tl/blob/master/stopwords-tl.txt
|
||||
|
||||
STOP_WORDS = set("""
|
||||
STOP_WORDS = set(
|
||||
"""
|
||||
akin
|
||||
aking
|
||||
ako
|
||||
|
@ -159,4 +150,5 @@ STOP_WORDS = set("""
|
|||
tungkol
|
||||
una
|
||||
walang
|
||||
""".split())
|
||||
""".split()
|
||||
)
|
||||
|
|
|
@ -1,36 +0,0 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ...symbols import POS, ADV, NOUN, ADP, PRON, SCONJ, PROPN, DET, SYM, INTJ
|
||||
from ...symbols import PUNCT, NUM, AUX, X, CONJ, ADJ, VERB, PART, SPACE, CCONJ
|
||||
|
||||
|
||||
# Add a tag map
|
||||
# Documentation: https://spacy.io/docs/usage/adding-languages#tag-map
|
||||
# Universal Dependencies: http://universaldependencies.org/u/pos/all.html
|
||||
# The keys of the tag map should be strings in your tag set. The dictionary must
|
||||
# have an entry POS whose value is one of the Universal Dependencies tags.
|
||||
# Optionally, you can also include morphological features or other attributes.
|
||||
|
||||
|
||||
TAG_MAP = {
|
||||
"ADV": {POS: ADV},
|
||||
"NOUN": {POS: NOUN},
|
||||
"ADP": {POS: ADP},
|
||||
"PRON": {POS: PRON},
|
||||
"SCONJ": {POS: SCONJ},
|
||||
"PROPN": {POS: PROPN},
|
||||
"DET": {POS: DET},
|
||||
"SYM": {POS: SYM},
|
||||
"INTJ": {POS: INTJ},
|
||||
"PUNCT": {POS: PUNCT},
|
||||
"NUM": {POS: NUM},
|
||||
"AUX": {POS: AUX},
|
||||
"X": {POS: X},
|
||||
"CONJ": {POS: CONJ},
|
||||
"CCONJ": {POS: CCONJ},
|
||||
"ADJ": {POS: ADJ},
|
||||
"VERB": {POS: VERB},
|
||||
"PART": {POS: PART},
|
||||
"SP": {POS: SPACE}
|
||||
}
|
|
@ -1,48 +1,20 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
# import symbols – if you need to use more, add them here
|
||||
from ...symbols import ORTH, LEMMA, TAG, NORM, ADP, DET
|
||||
from ...symbols import ORTH, LEMMA
|
||||
|
||||
|
||||
# Add tokenizer exceptions
|
||||
# Documentation: https://spacy.io/docs/usage/adding-languages#tokenizer-exceptions
|
||||
# Feel free to use custom logic to generate repetitive exceptions more efficiently.
|
||||
# If an exception is split into more than one token, the ORTH values combined always
|
||||
# need to match the original string.
|
||||
|
||||
# Exceptions should be added in the following format:
|
||||
|
||||
_exc = {
|
||||
"tayo'y": [
|
||||
{ORTH: "tayo", LEMMA: "tayo"},
|
||||
{ORTH: "'y", LEMMA: "ay"}],
|
||||
"isa'y": [
|
||||
{ORTH: "isa", LEMMA: "isa"},
|
||||
{ORTH: "'y", LEMMA: "ay"}],
|
||||
"baya'y": [
|
||||
{ORTH: "baya", LEMMA: "bayan"},
|
||||
{ORTH: "'y", LEMMA: "ay"}],
|
||||
"sa'yo": [
|
||||
{ORTH: "sa", LEMMA: "sa"},
|
||||
{ORTH: "'yo", LEMMA: "iyo"}],
|
||||
"ano'ng": [
|
||||
{ORTH: "ano", LEMMA: "ano"},
|
||||
{ORTH: "'ng", LEMMA: "ang"}],
|
||||
"siya'y": [
|
||||
{ORTH: "siya", LEMMA: "siya"},
|
||||
{ORTH: "'y", LEMMA: "ay"}],
|
||||
"nawa'y": [
|
||||
{ORTH: "nawa", LEMMA: "nawa"},
|
||||
{ORTH: "'y", LEMMA: "ay"}],
|
||||
"papa'no": [
|
||||
{ORTH: "papa'no", LEMMA: "papaano"}],
|
||||
"'di": [
|
||||
{ORTH: "'di", LEMMA: "hindi"}]
|
||||
"tayo'y": [{ORTH: "tayo", LEMMA: "tayo"}, {ORTH: "'y", LEMMA: "ay"}],
|
||||
"isa'y": [{ORTH: "isa", LEMMA: "isa"}, {ORTH: "'y", LEMMA: "ay"}],
|
||||
"baya'y": [{ORTH: "baya", LEMMA: "bayan"}, {ORTH: "'y", LEMMA: "ay"}],
|
||||
"sa'yo": [{ORTH: "sa", LEMMA: "sa"}, {ORTH: "'yo", LEMMA: "iyo"}],
|
||||
"ano'ng": [{ORTH: "ano", LEMMA: "ano"}, {ORTH: "'ng", LEMMA: "ang"}],
|
||||
"siya'y": [{ORTH: "siya", LEMMA: "siya"}, {ORTH: "'y", LEMMA: "ay"}],
|
||||
"nawa'y": [{ORTH: "nawa", LEMMA: "nawa"}, {ORTH: "'y", LEMMA: "ay"}],
|
||||
"papa'no": [{ORTH: "papa'no", LEMMA: "papaano"}],
|
||||
"'di": [{ORTH: "'di", LEMMA: "hindi"}],
|
||||
}
|
||||
|
||||
|
||||
# To keep things clean and readable, it's recommended to only declare the
|
||||
# TOKENIZER_EXCEPTIONS at the bottom:
|
||||
|
||||
TOKENIZER_EXCEPTIONS = _exc
|
||||
|
|
|
@ -3,7 +3,7 @@ from __future__ import unicode_literals
|
|||
|
||||
import re
|
||||
|
||||
from ..symbols import ORTH, POS, TAG, LEMMA, SPACE, PUNCT
|
||||
from ..symbols import ORTH, POS, TAG, LEMMA, SPACE
|
||||
|
||||
|
||||
# URL validation regex courtesy of: https://mathiasbynens.be/demo/url-regex
|
||||
|
|
|
@ -5,71 +5,32 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
|||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
|
||||
# uncomment if files are available
|
||||
# from .norm_exceptions import NORM_EXCEPTIONS
|
||||
# from .tag_map import TAG_MAP
|
||||
# from .morph_rules import MORPH_RULES
|
||||
|
||||
# uncomment if lookup-based lemmatizer is available
|
||||
# from .lemmatizer import LOOKUP
|
||||
# from ...lemmatizerlookup import Lemmatizer
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
from ...util import update_exc, add_lookups
|
||||
from ...language import Language
|
||||
from ...attrs import LANG, LIKE_NUM, NORM
|
||||
# from .tag_map import TAG_MAP
|
||||
from ...attrs import LANG, NORM
|
||||
from .lemmatizer import UkrainianLemmatizer
|
||||
|
||||
|
||||
# Create a Language subclass
|
||||
# Documentation: https://spacy.io/docs/usage/adding-languages
|
||||
|
||||
# This file should be placed in spacy/lang/xx (ISO code of language).
|
||||
# Before submitting a pull request, make sure the remove all comments from the
|
||||
# language data files, and run at least the basic tokenizer tests. Simply add the
|
||||
# language ID to the list of languages in spacy/tests/conftest.py to include it
|
||||
# in the basic tokenizer sanity tests. You can optionally add a fixture for the
|
||||
# language's tokenizer and add more specific tests. For more info, see the
|
||||
# tests documentation: https://github.com/explosion/spaCy/tree/master/spacy/tests
|
||||
|
||||
|
||||
class UkrainianDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'uk' # ISO code
|
||||
# add more norm exception dictionaries here
|
||||
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
|
||||
|
||||
# overwrite functions for lexical attributes
|
||||
lex_attr_getters[LANG] = lambda text: "uk"
|
||||
lex_attr_getters[NORM] = add_lookups(
|
||||
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
|
||||
)
|
||||
lex_attr_getters.update(LEX_ATTRS)
|
||||
|
||||
# add custom tokenizer exceptions to base exceptions
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
|
||||
# add stop words
|
||||
stop_words = STOP_WORDS
|
||||
|
||||
# if available: add tag map
|
||||
# tag_map = dict(TAG_MAP)
|
||||
|
||||
# if available: add morph rules
|
||||
# morph_rules = dict(MORPH_RULES)
|
||||
|
||||
# if available: add lookup lemmatizer
|
||||
# @classmethod
|
||||
# def create_lemmatizer(cls, nlp=None):
|
||||
# return Lemmatizer(LOOKUP)
|
||||
|
||||
@classmethod
|
||||
def create_lemmatizer(cls, nlp=None):
|
||||
return UkrainianLemmatizer()
|
||||
|
||||
|
||||
class Ukrainian(Language):
|
||||
lang = 'uk' # ISO code
|
||||
Defaults = UkrainianDefaults # set Defaults to custom language defaults
|
||||
lang = "uk"
|
||||
Defaults = UkrainianDefaults
|
||||
|
||||
|
||||
# set default export – this allows the language class to be lazy-loaded
|
||||
__all__ = ['Ukrainian']
|
||||
__all__ = ["Ukrainian"]
|
||||
|
|
|
@ -19,5 +19,5 @@ sentences = [
|
|||
"Від Нижнього озера довгими дерев’яними сходами, над якими синьо й біло горіли маленькі коробочки-ліхтарики, підіймалися до нього двоє стовусів: найкращий друг Вертутій і його дванадцятилітній онук Чублик.", # blyznets_viktor_semenovych/zemlia_svitliachkiv
|
||||
"Китайський космічний зонд \"Чан'е-4\" вперше в історії здійснив м'яку посадку на зворотному боці Місяця.",
|
||||
"Коли до губ твоїх лишається півподиху, коли до губ твоїх лишається півкроку – зіниці твої виткані із подиву, в очах у тебе синьо і широко.", # Hryhorij Czubaj
|
||||
"Дорогу сестру збираю у дорогу, а брати вирішили не брати машину." # homographs
|
||||
"Дорогу сестру збираю у дорогу, а брати вирішили не брати машину.", # homographs
|
||||
]
|
||||
|
|
|
@@ -1,12 +1,15 @@
# coding: utf8
from __future__ import unicode_literals

from ..ru.lemmatizer import RussianLemmatizer

class UkrainianLemmatizer(RussianLemmatizer):

def __init__(self, pymorphy2_lang='ru'):
def __init__(self, pymorphy2_lang="ru"):
try:
super(UkrainianLemmatizer, self).__init__(pymorphy2_lang='uk')
super(UkrainianLemmatizer, self).__init__(pymorphy2_lang="uk")
except ImportError:
raise ImportError(
'The Ukrainian lemmatizer requires the pymorphy2 library and dictionaries: '
'try to fix it with "pip install git+https://github.com/kmike/pymorphy2.git pymorphy2-dicts-uk"')
"The Ukrainian lemmatizer requires the pymorphy2 library and dictionaries: "
'try to fix it with "pip install git+https://github.com/kmike/pymorphy2.git pymorphy2-dicts-uk"'
)

@ -1,32 +1,68 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
# import the symbols for the attrs you want to overwrite
|
||||
from ...attrs import LIKE_NUM
|
||||
|
||||
|
||||
# Overwriting functions for lexical attributes
|
||||
# Documentation: https://localhost:1234/docs/usage/adding-languages#lex-attrs
|
||||
# Most of these functions, like is_lower or like_url should be language-
|
||||
# independent. Others, like like_num (which includes both digits and number
|
||||
# words), requires customisation.
|
||||
|
||||
|
||||
# Example: check if token resembles a number
|
||||
_num_words = ["більйон", "вісім", "вісімдесят", "вісімнадцять", "вісімсот", "восьмий", "два", "двадцять", "дванадцять",
|
||||
"двісті", "дев'яносто", "дев'ятнадцять", "дев'ятсот", "дев'ять", "десять", "децильйон", "квадрильйон",
|
||||
"квінтильйон", "мільйон", "мільярд", "нонильйон", "один", "одинадцять", "октильйон", "п'ятий",
|
||||
"п'ятисотий", "п'ятнадцять", "п'ятсот", "п'ять", "секстильйон", "септильйон", "сім", "сімдесят",
|
||||
"сімнадцять", "сімсот", "сорок", "сто", "тисяча", "три", "тридцять", "трильйон", "тринадцять", "триста",
|
||||
"чотири", "чотириста", "чотирнадцять", "шістдесят", "шістнадцять", "шістсот", "шість"]
|
||||
_num_words = [
|
||||
"більйон",
|
||||
"вісім",
|
||||
"вісімдесят",
|
||||
"вісімнадцять",
|
||||
"вісімсот",
|
||||
"восьмий",
|
||||
"два",
|
||||
"двадцять",
|
||||
"дванадцять",
|
||||
"двісті",
|
||||
"дев'яносто",
|
||||
"дев'ятнадцять",
|
||||
"дев'ятсот",
|
||||
"дев'ять",
|
||||
"десять",
|
||||
"децильйон",
|
||||
"квадрильйон",
|
||||
"квінтильйон",
|
||||
"мільйон",
|
||||
"мільярд",
|
||||
"нонильйон",
|
||||
"один",
|
||||
"одинадцять",
|
||||
"октильйон",
|
||||
"п'ятий",
|
||||
"п'ятисотий",
|
||||
"п'ятнадцять",
|
||||
"п'ятсот",
|
||||
"п'ять",
|
||||
"секстильйон",
|
||||
"септильйон",
|
||||
"сім",
|
||||
"сімдесят",
|
||||
"сімнадцять",
|
||||
"сімсот",
|
||||
"сорок",
|
||||
"сто",
|
||||
"тисяча",
|
||||
"три",
|
||||
"тридцять",
|
||||
"трильйон",
|
||||
"тринадцять",
|
||||
"триста",
|
||||
"чотири",
|
||||
"чотириста",
|
||||
"чотирнадцять",
|
||||
"шістдесят",
|
||||
"шістнадцять",
|
||||
"шістсот",
|
||||
"шість",
|
||||
]
|
||||
|
||||
|
||||
def like_num(text):
|
||||
text = text.replace(',', '').replace('.', '')
|
||||
text = text.replace(",", "").replace(".", "")
|
||||
if text.isdigit():
|
||||
return True
|
||||
if text.count('/') == 1:
|
||||
num, denom = text.split('/')
|
||||
if text.count("/") == 1:
|
||||
num, denom = text.split("/")
|
||||
if num.isdigit() and denom.isdigit():
|
||||
return True
|
||||
if text in _num_words:
|
||||
|
@ -34,9 +70,4 @@ def like_num(text):
|
|||
return False
|
||||
|
||||
|
||||
# Create dictionary of functions to overwrite. The default lex_attr_getters are
|
||||
# updated with this one, so only the functions defined here are overwritten.
|
||||
|
||||
LEX_ATTRS = {
|
||||
LIKE_NUM: like_num
|
||||
}
|
||||
LEX_ATTRS = {LIKE_NUM: like_num}
|
||||
|
|
|
@ -2,15 +2,8 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
# Add stop words
|
||||
# Documentation: https://spacy.io/docs/usage/adding-languages#stop-words
|
||||
# To improve readability, words should be ordered alphabetically and separated
|
||||
# by spaces and newlines. When adding stop words from an online source, always
|
||||
# include the link in a comment. Make sure to proofread and double-check the
|
||||
# words – lists available online are often known to contain mistakes.
|
||||
|
||||
|
||||
STOP_WORDS = set("""а
|
||||
STOP_WORDS = set(
|
||||
"""а
|
||||
або
|
||||
адже
|
||||
але
|
||||
|
@ -401,4 +394,5 @@ STOP_WORDS = set("""а
|
|||
якій
|
||||
якого
|
||||
якщо
|
||||
""".split())
|
||||
""".split()
|
||||
)
|
||||
|
|
|
@ -5,14 +5,6 @@ from ..symbols import POS, ADV, NOUN, ADP, PRON, SCONJ, PROPN, DET, SYM, INTJ
|
|||
from ..symbols import PUNCT, NUM, AUX, X, CONJ, ADJ, VERB, PART, SPACE, CCONJ
|
||||
|
||||
|
||||
# Add a tag map
|
||||
# Documentation: https://spacy.io/docs/usage/adding-languages#tag-map
|
||||
# Universal Dependencies: http://universaldependencies.org/u/pos/all.html
|
||||
# The keys of the tag map should be strings in your tag set. The dictionary must
|
||||
# have an entry POS whose value is one of the Universal Dependencies tags.
|
||||
# Optionally, you can also include morphological features or other attributes.
|
||||
|
||||
|
||||
TAG_MAP = {
|
||||
"ADV": {POS: ADV},
|
||||
"NOUN": {POS: NOUN},
|
||||
|
@ -32,5 +24,5 @@ TAG_MAP = {
|
|||
"ADJ": {POS: ADJ},
|
||||
"VERB": {POS: VERB},
|
||||
"PART": {POS: PART},
|
||||
"SP": {POS: SPACE}
|
||||
"SP": {POS: SPACE},
|
||||
}
|
||||
|
|
|
@ -1,18 +1,9 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
# import symbols – if you need to use more, add them here
|
||||
from ...symbols import ORTH, LEMMA, POS, NORM, NOUN
|
||||
|
||||
|
||||
# Add tokenizer exceptions
|
||||
# Documentation: https://spacy.io/docs/usage/adding-languages#tokenizer-exceptions
|
||||
# Feel free to use custom logic to generate repetitive exceptions more efficiently.
|
||||
# If an exception is split into more than one token, the ORTH values combined always
|
||||
# need to match the original string.
|
||||
|
||||
# Exceptions should be added in the following format:
|
||||
|
||||
_exc = {}
|
||||
|
||||
for exc_data in [
|
||||
|
@ -28,11 +19,9 @@ for exc_data in [
|
|||
{ORTH: "проф.", LEMMA: "професор", NORM: "професор", POS: NOUN},
|
||||
{ORTH: "акад.", LEMMA: "академік", NORM: "академік", POS: NOUN},
|
||||
{ORTH: "доц.", LEMMA: "доцент", NORM: "доцент", POS: NOUN},
|
||||
{ORTH: "оз.", LEMMA: "озеро", NORM: "озеро", POS: NOUN}]:
|
||||
{ORTH: "оз.", LEMMA: "озеро", NORM: "озеро", POS: NOUN},
|
||||
]:
|
||||
_exc[exc_data[ORTH]] = [exc_data]
|
||||
|
||||
|
||||
# To keep things clean and readable, it's recommended to only declare the
|
||||
# TOKENIZER_EXCEPTIONS at the bottom:
|
||||
|
||||
TOKENIZER_EXCEPTIONS = _exc
|
||||
|
|
|
@@ -1,6 +1,6 @@
# coding: utf8
from __future__ import unicode_literals

from .matcher import Matcher
from .phrasematcher import PhraseMatcher
from .dependencymatcher import DependencyTreeMatcher
from .matcher import Matcher # noqa: F401
from .phrasematcher import PhraseMatcher # noqa: F401
from .dependencymatcher import DependencyTreeMatcher # noqa: F401

@ -119,8 +119,8 @@ def tr_tokenizer():
@pytest.fixture(scope="session")
def uk_tokenizer():
    pymorphy = pytest.importorskip("pymorphy2")
    return util.get_lang_class("uk").Defaults.create_tokenizer()
    pytest.importorskip("pymorphy2")
    return get_lang_class("uk").Defaults.create_tokenizer()


@pytest.fixture(scope="session")

@ -130,7 +130,7 @@ def ca_tokenizer():
@pytest.fixture(scope="session")
def pl_tokenizer():
    return util.get_lang_class("pl").Defaults.create_tokenizer()
    return get_lang_class("pl").Defaults.create_tokenizer()


@pytest.fixture(scope="session")
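For context, a self-contained sketch of the fixture pattern above: pytest.importorskip is called purely for its skip side effect, and the unused binding is dropped. The spacy.util import path is assumed here, since the conftest's import lines are not shown in this hunk:

import pytest
from spacy.util import get_lang_class  # assumed import; the conftest drops the util. prefix


@pytest.fixture(scope="session")
def uk_tokenizer():
    # Skip every test that requests this fixture when pymorphy2 is not installed.
    pytest.importorskip("pymorphy2")
    return get_lang_class("uk").Defaults.create_tokenizer()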
spacy/tests/lang/pl/__init__.py (new, empty file)
@ -4,56 +4,56 @@ from __future__ import unicode_literals
import pytest

DOT_TESTS = [
    ('tel.', ['tel.']),
    ('np.', ['np.']),
    ('godz. 21:37', ['godz.', '21:37']),
    ('inż.', ['inż.']),
    ('gosp.-polit.', ['gosp.-polit.']),
    ('ppoż', ['ppoż']),
    ('płn', ['płn']),
    ('ul.', ['ul.']),
    ('jw.', ['jw.']),
    ('itd.', ['itd.']),
    ('cdn.', ['cdn.']),
    ('itp.', ['itp.']),
    ('10,- zł', ['10,-', 'zł']),
    ('0 zł 99 gr', ['0', 'zł', '99', 'gr']),
    ('0,99 rub.', ['0,99', 'rub.']),
    ('dol.', ['dol.']),
    ('1000 m n.p.m.', ['1000', 'm', 'n.p.m.']),
    ('m.in.', ['m.in.']),
    ('p.n.e.', ['p.n.e.']),
    ('Sz.P.', ['Sz.P.']),
    ('p.o.', ['p.o.']),
    ('k.o.', ['k.o.']),
    ('m.st.', ['m.st.']),
    ('dra.', ['dra', '.']),
    ('pp.', ['pp.']),
    ('oo.', ['oo.'])
    ("tel.", ["tel."]),
    ("np.", ["np."]),
    ("godz. 21:37", ["godz.", "21:37"]),
    ("inż.", ["inż."]),
    ("gosp.-polit.", ["gosp.-polit."]),
    ("ppoż", ["ppoż"]),
    ("płn", ["płn"]),
    ("ul.", ["ul."]),
    ("jw.", ["jw."]),
    ("itd.", ["itd."]),
    ("cdn.", ["cdn."]),
    ("itp.", ["itp."]),
    ("10,- zł", ["10,-", "zł"]),
    ("0 zł 99 gr", ["0", "zł", "99", "gr"]),
    ("0,99 rub.", ["0,99", "rub."]),
    ("dol.", ["dol."]),
    ("1000 m n.p.m.", ["1000", "m", "n.p.m."]),
    ("m.in.", ["m.in."]),
    ("p.n.e.", ["p.n.e."]),
    ("Sz.P.", ["Sz.P."]),
    ("p.o.", ["p.o."]),
    ("k.o.", ["k.o."]),
    ("m.st.", ["m.st."]),
    ("dra.", ["dra", "."]),
    ("pp.", ["pp."]),
    ("oo.", ["oo."]),
]

HYPHEN_TESTS = [
    ('5-fluoropentylo-3-pirydynyloindol', ['5-fluoropentylo-3-pirydynyloindol']),
    ('NESS-040C5', ['NESS-040C5']),
    ('JTE-7-31', ['JTE-7-31']),
    ('BAY-59-3074', ['BAY-59-3074']),
    ('BAY-38-7271', ['BAY-38-7271']),
    ('STS-135', ['STS-135']),
    ('5F-PB-22', ['5F-PB-22']),
    ('cztero-', ['cztero-']),
    ('jedno-', ['jedno-']),
    ('dwu-', ['dwu-']),
    ('trzy-', ['trzy-']),
    ('b-adoratorzy', ['b-adoratorzy']),
    ('2-3-4 drzewa', ['2-3-4', 'drzewa']),
    ('b-drzewa', ['b-drzewa'])
    ("5-fluoropentylo-3-pirydynyloindol", ["5-fluoropentylo-3-pirydynyloindol"]),
    ("NESS-040C5", ["NESS-040C5"]),
    ("JTE-7-31", ["JTE-7-31"]),
    ("BAY-59-3074", ["BAY-59-3074"]),
    ("BAY-38-7271", ["BAY-38-7271"]),
    ("STS-135", ["STS-135"]),
    ("5F-PB-22", ["5F-PB-22"]),
    ("cztero-", ["cztero-"]),
    ("jedno-", ["jedno-"]),
    ("dwu-", ["dwu-"]),
    ("trzy-", ["trzy-"]),
    ("b-adoratorzy", ["b-adoratorzy"]),
    ("2-3-4 drzewa", ["2-3-4", "drzewa"]),
    ("b-drzewa", ["b-drzewa"]),
]

TESTCASES = DOT_TESTS + HYPHEN_TESTS

@pytest.mark.parametrize('text,expected_tokens', TESTCASES)
@pytest.mark.parametrize("text,expected_tokens", TESTCASES)
def test_tokenizer_handles_testcases(pl_tokenizer, text, expected_tokens):
    tokens = pl_tokenizer(text)
    token_list = [token.text for token in tokens if not token.is_space]
@ -5,34 +5,42 @@ import pytest

SV_TOKEN_EXCEPTION_TESTS = [
    ('Smörsåsen används bl.a. till fisk', ['Smörsåsen', 'används', 'bl.a.', 'till', 'fisk']),
    ('Jag kommer först kl. 13 p.g.a. diverse förseningar', ['Jag', 'kommer', 'först', 'kl.', '13', 'p.g.a.', 'diverse', 'förseningar']),
    ('Anders I. tycker om ord med i i.', ["Anders", "I.", "tycker", "om", "ord", "med", "i", "i", "."])
    (
        "Smörsåsen används bl.a. till fisk",
        ["Smörsåsen", "används", "bl.a.", "till", "fisk"],
    ),
    (
        "Jag kommer först kl. 13 p.g.a. diverse förseningar",
        ["Jag", "kommer", "först", "kl.", "13", "p.g.a.", "diverse", "förseningar"],
    ),
    (
        "Anders I. tycker om ord med i i.",
        ["Anders", "I.", "tycker", "om", "ord", "med", "i", "i", "."],
    ),
]

@pytest.mark.parametrize('text,expected_tokens', SV_TOKEN_EXCEPTION_TESTS)
@pytest.mark.parametrize("text,expected_tokens", SV_TOKEN_EXCEPTION_TESTS)
def test_sv_tokenizer_handles_exception_cases(sv_tokenizer, text, expected_tokens):
    tokens = sv_tokenizer(text)
    token_list = [token.text for token in tokens if not token.is_space]
    assert expected_tokens == token_list

@pytest.mark.parametrize('text', ["driveru", "hajaru", "Serru", "Fixaru"])
@pytest.mark.parametrize("text", ["driveru", "hajaru", "Serru", "Fixaru"])
def test_sv_tokenizer_handles_verb_exceptions(sv_tokenizer, text):
    tokens = sv_tokenizer(text)
    assert len(tokens) == 2
    assert tokens[1].text == "u"

@pytest.mark.parametrize('text',
                         ["bl.a", "m.a.o.", "Jan.", "Dec.", "kr.", "osv."])
@pytest.mark.parametrize("text", ["bl.a", "m.a.o.", "Jan.", "Dec.", "kr.", "osv."])
def test_sv_tokenizer_handles_abbr(sv_tokenizer, text):
    tokens = sv_tokenizer(text)
    assert len(tokens) == 1

@pytest.mark.parametrize('text', ["Jul.", "jul.", "sön.", "Sön."])
@pytest.mark.parametrize("text", ["Jul.", "jul.", "sön.", "Sön."])
def test_sv_tokenizer_handles_ambiguous_abbr(sv_tokenizer, text):
    tokens = sv_tokenizer(text)
    assert len(tokens) == 2
@ -4,12 +4,17 @@ from __future__ import unicode_literals
import pytest

@pytest.mark.parametrize('string,lemma', [('DNA-profilernas', 'DNA-profil'),
                                          ('Elfenbenskustens', 'Elfenbenskusten'),
                                          ('abortmotståndarens', 'abortmotståndare'),
                                          ('kolesterols', 'kolesterol'),
                                          ('portionssnusernas', 'portionssnus'),
                                          ('åsyns', 'åsyn')])
@pytest.mark.parametrize(
    "string,lemma",
    [
        ("DNA-profilernas", "DNA-profil"),
        ("Elfenbenskustens", "Elfenbenskusten"),
        ("abortmotståndarens", "abortmotståndare"),
        ("kolesterols", "kolesterol"),
        ("portionssnusernas", "portionssnus"),
        ("åsyns", "åsyn"),
    ],
)
def test_lemmatizer_lookup_assigns(sv_tokenizer, string, lemma):
    tokens = sv_tokenizer(string)
    assert tokens[0].lemma_ == lemma
@ -1,28 +1,28 @@
# coding: utf-8
"""Test that tokenizer prefixes, suffixes and infixes are handled correctly."""
from __future__ import unicode_literals

import pytest

@pytest.mark.parametrize('text', ["(under)"])
@pytest.mark.parametrize("text", ["(under)"])
def test_tokenizer_splits_no_special(sv_tokenizer, text):
    tokens = sv_tokenizer(text)
    assert len(tokens) == 3

@pytest.mark.parametrize('text', ["gitta'r", "Björn's", "Lars'"])
@pytest.mark.parametrize("text", ["gitta'r", "Björn's", "Lars'"])
def test_tokenizer_handles_no_punct(sv_tokenizer, text):
    tokens = sv_tokenizer(text)
    assert len(tokens) == 1

@pytest.mark.parametrize('text', ["svart.Gul", "Hej.Världen"])
@pytest.mark.parametrize("text", ["svart.Gul", "Hej.Världen"])
def test_tokenizer_splits_period_infix(sv_tokenizer, text):
    tokens = sv_tokenizer(text)
    assert len(tokens) == 3

@pytest.mark.parametrize('text', ["Hej,Världen", "en,två"])
@pytest.mark.parametrize("text", ["Hej,Världen", "en,två"])
def test_tokenizer_splits_comma_infix(sv_tokenizer, text):
    tokens = sv_tokenizer(text)
    assert len(tokens) == 3

@ -31,7 +31,7 @@ def test_tokenizer_splits_comma_infix(sv_tokenizer, text):
    assert tokens[2].text == text.split(",")[1]

@pytest.mark.parametrize('text', ["svart...Gul", "svart...gul"])
@pytest.mark.parametrize("text", ["svart...Gul", "svart...gul"])
def test_tokenizer_splits_ellipsis_infix(sv_tokenizer, text):
    tokens = sv_tokenizer(text)
    assert len(tokens) == 3
@ -1,9 +1,6 @@
# coding: utf-8
"""Test that longer and mixed texts are tokenized correctly."""
from __future__ import unicode_literals

import pytest

def test_sv_tokenizer_handles_long_text(sv_tokenizer):
    text = """Det var så härligt ute på landet. Det var sommar, majsen var gul, havren grön,
@ -1,25 +1,24 @@
# coding: utf-8
"""Test that open, closed and paired punctuation is split off correctly."""
from __future__ import unicode_literals

import pytest

PUNCT_OPEN = ['(', '[', '{', '*']
PUNCT_CLOSE = [')', ']', '}', '*']
PUNCT_PAIRED = [('(', ')'), ('[', ']'), ('{', '}'), ('*', '*')]
PUNCT_OPEN = ["(", "[", "{", "*"]
PUNCT_CLOSE = [")", "]", "}", "*"]
PUNCT_PAIRED = [("(", ")"), ("[", "]"), ("{", "}"), ("*", "*")]

@pytest.mark.parametrize('text', ["(", "((", "<"])
@pytest.mark.parametrize("text", ["(", "((", "<"])
def test_uk_tokenizer_handles_only_punct(uk_tokenizer, text):
    tokens = uk_tokenizer(text)
    assert len(tokens) == len(text)

@pytest.mark.parametrize('punct', PUNCT_OPEN)
@pytest.mark.parametrize('text', ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"])
@pytest.mark.parametrize("punct", PUNCT_OPEN)
@pytest.mark.parametrize(
    "text", ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]
)
def test_uk_tokenizer_splits_open_punct(uk_tokenizer, punct, text):
    tokens = uk_tokenizer(punct + text)
    assert len(tokens) == 2

@ -27,8 +26,10 @@ def test_uk_tokenizer_splits_open_punct(uk_tokenizer, punct, text):
    assert tokens[1].text == text

@pytest.mark.parametrize('punct', PUNCT_CLOSE)
@pytest.mark.parametrize('text', ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"])
@pytest.mark.parametrize("punct", PUNCT_CLOSE)
@pytest.mark.parametrize(
    "text", ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]
)
def test_uk_tokenizer_splits_close_punct(uk_tokenizer, punct, text):
    tokens = uk_tokenizer(text + punct)
    assert len(tokens) == 2

@ -36,9 +37,11 @@ def test_uk_tokenizer_splits_close_punct(uk_tokenizer, punct, text):
    assert tokens[1].text == punct

@pytest.mark.parametrize('punct', PUNCT_OPEN)
@pytest.mark.parametrize('punct_add', ["`"])
@pytest.mark.parametrize('text', ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"])
@pytest.mark.parametrize("punct", PUNCT_OPEN)
@pytest.mark.parametrize("punct_add", ["`"])
@pytest.mark.parametrize(
    "text", ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]
)
def test_uk_tokenizer_splits_two_diff_open_punct(uk_tokenizer, punct, punct_add, text):
    tokens = uk_tokenizer(punct + punct_add + text)
    assert len(tokens) == 3

@ -47,9 +50,11 @@ def test_uk_tokenizer_splits_two_diff_open_punct(uk_tokenizer, punct, punct_add,
    assert tokens[2].text == text

@pytest.mark.parametrize('punct', PUNCT_CLOSE)
@pytest.mark.parametrize('punct_add', ["'"])
@pytest.mark.parametrize('text', ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"])
@pytest.mark.parametrize("punct", PUNCT_CLOSE)
@pytest.mark.parametrize("punct_add", ["'"])
@pytest.mark.parametrize(
    "text", ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]
)
def test_uk_tokenizer_splits_two_diff_close_punct(uk_tokenizer, punct, punct_add, text):
    tokens = uk_tokenizer(text + punct + punct_add)
    assert len(tokens) == 3

@ -58,8 +63,10 @@ def test_uk_tokenizer_splits_two_diff_close_punct(uk_tokenizer, punct, punct_add
    assert tokens[2].text == punct_add

@pytest.mark.parametrize('punct', PUNCT_OPEN)
@pytest.mark.parametrize('text', ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"])
@pytest.mark.parametrize("punct", PUNCT_OPEN)
@pytest.mark.parametrize(
    "text", ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]
)
def test_uk_tokenizer_splits_same_open_punct(uk_tokenizer, punct, text):
    tokens = uk_tokenizer(punct + punct + punct + text)
    assert len(tokens) == 4

@ -67,8 +74,10 @@ def test_uk_tokenizer_splits_same_open_punct(uk_tokenizer, punct, text):
    assert tokens[3].text == text

@pytest.mark.parametrize('punct', PUNCT_CLOSE)
@pytest.mark.parametrize('text', ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"])
@pytest.mark.parametrize("punct", PUNCT_CLOSE)
@pytest.mark.parametrize(
    "text", ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]
)
def test_uk_tokenizer_splits_same_close_punct(uk_tokenizer, punct, text):
    tokens = uk_tokenizer(text + punct + punct + punct)
    assert len(tokens) == 4

@ -76,14 +85,14 @@ def test_uk_tokenizer_splits_same_close_punct(uk_tokenizer, punct, text):
    assert tokens[1].text == punct

@pytest.mark.parametrize('text', ["'Тест"])
@pytest.mark.parametrize("text", ["'Тест"])
def test_uk_tokenizer_splits_open_appostrophe(uk_tokenizer, text):
    tokens = uk_tokenizer(text)
    assert len(tokens) == 2
    assert tokens[0].text == "'"

@pytest.mark.parametrize('text', ["Тест''"])
@pytest.mark.parametrize("text", ["Тест''"])
def test_uk_tokenizer_splits_double_end_quote(uk_tokenizer, text):
    tokens = uk_tokenizer(text)
    assert len(tokens) == 2

@ -91,10 +100,13 @@ def test_uk_tokenizer_splits_double_end_quote(uk_tokenizer, text):
    assert len(tokens_punct) == 1

@pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED)
@pytest.mark.parametrize('text', ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"])
def test_uk_tokenizer_splits_open_close_punct(uk_tokenizer, punct_open,
                                              punct_close, text):
@pytest.mark.parametrize("punct_open,punct_close", PUNCT_PAIRED)
@pytest.mark.parametrize(
    "text", ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]
)
def test_uk_tokenizer_splits_open_close_punct(
    uk_tokenizer, punct_open, punct_close, text
):
    tokens = uk_tokenizer(punct_open + text + punct_close)
    assert len(tokens) == 3
    assert tokens[0].text == punct_open

@ -102,11 +114,14 @@ def test_uk_tokenizer_splits_open_close_punct(uk_tokenizer, punct_open,
    assert tokens[2].text == punct_close

@pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED)
@pytest.mark.parametrize('punct_open2,punct_close2', [("`", "'")])
@pytest.mark.parametrize('text', ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"])
def test_uk_tokenizer_two_diff_punct(uk_tokenizer, punct_open, punct_close,
                                     punct_open2, punct_close2, text):
@pytest.mark.parametrize("punct_open,punct_close", PUNCT_PAIRED)
@pytest.mark.parametrize("punct_open2,punct_close2", [("`", "'")])
@pytest.mark.parametrize(
    "text", ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]
)
def test_uk_tokenizer_two_diff_punct(
    uk_tokenizer, punct_open, punct_close, punct_open2, punct_close2, text
):
    tokens = uk_tokenizer(punct_open2 + punct_open + text + punct_close + punct_close2)
    assert len(tokens) == 5
    assert tokens[0].text == punct_open2

@ -116,7 +131,9 @@ def test_uk_tokenizer_two_diff_punct(uk_tokenizer, punct_open, punct_close,
    assert tokens[4].text == punct_close2

@pytest.mark.parametrize('text', ["Привет.", "Привіт.", "Ґелґотати.", "З'єднання.", "Єдність.", "їхні."])
@pytest.mark.parametrize(
    "text", ["Привет.", "Привіт.", "Ґелґотати.", "З'єднання.", "Єдність.", "їхні."]
)
def test_uk_tokenizer_splits_trailing_dot(uk_tokenizer, text):
    tokens = uk_tokenizer(text)
    assert tokens[1].text == "."
@ -1,18 +1,14 @@
# coding: utf-8
"""Test that tokenizer exceptions are parsed correctly."""
from __future__ import unicode_literals

import pytest

@pytest.mark.parametrize('text,norms,lemmas', [("ім.", ["імені"], ["ім'я"]),
                                               ("проф.", ["професор"], ["професор"])])
@pytest.mark.parametrize(
    "text,norms,lemmas",
    [("ім.", ["імені"], ["ім'я"]), ("проф.", ["професор"], ["професор"])],
)
def test_uk_tokenizer_abbrev_exceptions(uk_tokenizer, text, norms, lemmas):
    tokens = uk_tokenizer(text)
    assert len(tokens) == 1
    assert [token.norm_ for token in tokens] == norms
@ -1,16 +1,16 @@
# coding: utf-8
from __future__ import unicode_literals

import json
from tempfile import NamedTemporaryFile
import pytest

from ...cli.train import train

def test_cli_trained_model_can_be_saved(tmpdir):
    lang = 'nl'
    lang = "nl"
    output_dir = str(tmpdir)
    train_file = NamedTemporaryFile('wb', dir=output_dir, delete=False)
    train_file = NamedTemporaryFile("wb", dir=output_dir, delete=False)
    train_corpus = [
        {
            "id": "identifier_0",

@ -26,7 +26,7 @@ def test_cli_trained_model_can_be_saved(tmpdir):
            "head": 1,
            "tag": "NOUN",
            "orth": "Jan",
            "ner": "B-PER"
            "ner": "B-PER",
        },
        {
            "id": 1,

@ -34,7 +34,7 @@ def test_cli_trained_model_can_be_saved(tmpdir):
            "head": 0,
            "tag": "VERB",
            "orth": "houdt",
            "ner": "O"
            "ner": "O",
        },
        {
            "id": 2,

@ -42,7 +42,7 @@ def test_cli_trained_model_can_be_saved(tmpdir):
            "head": 1,
            "tag": "ADP",
            "orth": "van",
            "ner": "O"
            "ner": "O",
        },
        {
            "id": 3,

@ -50,7 +50,7 @@ def test_cli_trained_model_can_be_saved(tmpdir):
            "head": -2,
            "tag": "NOUN",
            "orth": "Marie",
            "ner": "B-PER"
            "ner": "B-PER",
        },
        {
            "id": 4,

@ -58,7 +58,7 @@ def test_cli_trained_model_can_be_saved(tmpdir):
            "head": -3,
            "tag": "PUNCT",
            "orth": ".",
            "ner": "O"
            "ner": "O",
        },
        {
            "id": 5,

@ -66,18 +66,18 @@ def test_cli_trained_model_can_be_saved(tmpdir):
            "head": -1,
            "tag": "SPACE",
            "orth": "\n",
            "ner": "O"
            "ner": "O",
        },
    ],
    "brackets": [],
    }
    ],
    "brackets": []
    }
    ]
    }
    ]
    ],
    }
    ]

    train_file.write(json.dumps(train_corpus).encode('utf-8'))
    train_file.write(json.dumps(train_corpus).encode("utf-8"))
    train_file.close()
    train_data = train_file.name
    dev_data = train_data
@ -155,6 +155,14 @@ def test_issue1758(en_tokenizer):
    assert tokens[1].lemma_ == "have"

def test_issue1773(en_tokenizer):
    """Test that spaces don't receive a POS but no TAG. This is the root cause
    of the serialization issue reported in #1773."""
    doc = en_tokenizer("\n")
    if doc[0].pos_ == "SPACE":
        assert doc[0].tag_ != ""

def test_issue1799():
    """Test sentence boundaries are deserialized correctly, even for
    non-projective sentences."""

@ -249,8 +257,8 @@ def test_issue1945():

def test_issue1963(en_tokenizer):
    """Test that doc.merge() resizes doc.tensor"""
    doc = en_tokenizer('a b c d')
    doc.tensor = numpy.ones((len(doc), 128), dtype='f')
    doc = en_tokenizer("a b c d")
    doc.tensor = numpy.ones((len(doc), 128), dtype="f")
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[0:2])
    assert len(doc) == 3
@ -1,9 +0,0 @@
from __future__ import unicode_literals

def test_issue1773(en_tokenizer):
    """Test that spaces don't receive a POS but no TAG. This is the root cause
    of the serialization issue reported in #1773."""
    doc = en_tokenizer('\n')
    if doc[0].pos_ == 'SPACE':
        assert doc[0].tag_ != ""
@ -6,8 +6,9 @@ from spacy.tokens import Doc
from spacy.displacy import render
from spacy.gold import iob_to_biluo
from spacy.lang.it import Italian
import numpy

from ..util import add_vecs_to_vocab
from ..util import add_vecs_to_vocab, get_doc

@pytest.mark.xfail

@ -69,6 +70,26 @@ def test_issue2385_biluo(tags):
    assert iob_to_biluo(tags) == list(tags)

def test_issue2396(en_vocab):
    words = ["She", "created", "a", "test", "for", "spacy"]
    heads = [1, 0, 1, -2, -1, -1]
    matrix = numpy.array(
        [
            [0, 1, 1, 1, 1, 1],
            [1, 1, 1, 1, 1, 1],
            [1, 1, 2, 3, 3, 3],
            [1, 1, 3, 3, 3, 3],
            [1, 1, 3, 3, 4, 4],
            [1, 1, 3, 3, 4, 5],
        ],
        dtype=numpy.int32,
    )
    doc = get_doc(en_vocab, words=words, heads=heads)
    span = doc[:]
    assert (doc.get_lca_matrix() == matrix).all()
    assert (span.get_lca_matrix() == matrix).all()

def test_issue2482():
    """Test we can serialize and deserialize a blank NER or parser model."""
    nlp = Italian()
@ -1,35 +0,0 @@
# coding: utf-8
from __future__ import unicode_literals

from ..util import get_doc

import pytest
import numpy

@pytest.mark.parametrize(
    "sentence,heads,matrix",
    [
        (
            "She created a test for spacy",
            [1, 0, 1, -2, -1, -1],
            numpy.array(
                [
                    [0, 1, 1, 1, 1, 1],
                    [1, 1, 1, 1, 1, 1],
                    [1, 1, 2, 3, 3, 3],
                    [1, 1, 3, 3, 3, 3],
                    [1, 1, 3, 3, 4, 4],
                    [1, 1, 3, 3, 4, 5],
                ],
                dtype=numpy.int32,
            ),
        )
    ],
)
def test_issue2396(en_tokenizer, sentence, heads, matrix):
    tokens = en_tokenizer(sentence)
    doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads)
    span = doc[:]
    assert (doc.get_lca_matrix() == matrix).all()
    assert (span.get_lca_matrix() == matrix).all()
@ -1,14 +1,10 @@
# coding: utf8
from __future__ import unicode_literals

import pytest
from spacy.lang.en import English

def test_issue2754():
def test_issue2754(en_tokenizer):
    """Test that words like 'a' and 'a.m.' don't get exceptional norm values."""
    nlp = English()
    a = nlp('a')
    assert a[0].norm_ == 'a'
    am = nlp('am')
    assert am[0].norm_ == 'am'
    a = en_tokenizer("a")
    assert a[0].norm_ == "a"
    am = en_tokenizer("am")
    assert am[0].norm_ == "am"
@ -9,4 +9,3 @@ def test_issue2835(en_tokenizer):
    """
    doc = en_tokenizer(text)
    assert doc
@ -2,26 +2,24 @@
from __future__ import unicode_literals

import numpy
from spacy.vectors import Vectors
from spacy.vocab import Vocab
from spacy.tokens import Doc
from spacy._ml import link_vectors_to_models

def test_issue2871():
    """Test that vectors recover the correct key for spaCy reserved words."""
    words = ['dog', 'cat', 'SUFFIX']
    words = ["dog", "cat", "SUFFIX"]
    vocab = Vocab()
    vocab.vectors.resize(shape=(3, 10))
    vector_data = numpy.zeros((3, 10), dtype='f')
    vector_data = numpy.zeros((3, 10), dtype="f")
    for word in words:
        _ = vocab[word]
        _ = vocab[word]  # noqa: F841
        vocab.set_vector(word, vector_data[0])
    vocab.vectors.name = 'dummy_vectors'
    vocab.vectors.name = "dummy_vectors"
    link_vectors_to_models(vocab)
    assert vocab['dog'].rank == 0
    assert vocab['cat'].rank == 1
    assert vocab['SUFFIX'].rank == 2
    assert vocab.vectors.find(key='dog') == 0
    assert vocab.vectors.find(key='cat') == 1
    assert vocab.vectors.find(key='SUFFIX') == 2
    assert vocab["dog"].rank == 0
    assert vocab["cat"].rank == 1
    assert vocab["SUFFIX"].rank == 2
    assert vocab.vectors.find(key="dog") == 0
    assert vocab.vectors.find(key="cat") == 1
    assert vocab.vectors.find(key="SUFFIX") == 2
@ -58,9 +58,10 @@ def test_issue3009(doc, matcher, pattern):
    matches = matcher(doc)
    assert matches

def test_issue2464(matcher):
    """Test problem with successive ?. This is the same bug, so putting it here."""
    doc = Doc(matcher.vocab, words=['a', 'b'])
    matcher.add('4', None, [{'OP': '?'}, {'OP': '?'}])
    doc = Doc(matcher.vocab, words=["a", "b"])
    matcher.add("4", None, [{"OP": "?"}, {"OP": "?"}])
    matches = matcher(doc)
    assert len(matches) == 3
@ -1,8 +1,6 @@
# coding: utf8
from __future__ import unicode_literals

import pytest

from ...attrs import ENT_IOB, ENT_TYPE
from ...tokens import Doc
from ..util import get_doc

@ -30,4 +28,4 @@ def test_issue3012(en_vocab):
    # serializing then deserializing
    doc_bytes = doc.to_bytes()
    doc2 = Doc(en_vocab).from_bytes(doc_bytes)
    assert (doc[2].text, doc[2].pos_, doc[2].tag_, doc[2].ent_type_) == expected
    assert (doc2[2].text, doc2[2].pos_, doc2[2].tag_, doc2[2].ent_type_) == expected
|
|||
from __future__ import unicode_literals
|
||||
import pytest
|
||||
import spacy
|
||||
|
||||
|
||||
@pytest.mark.models('fr')
|
||||
def test_issue1959(FR):
|
||||
texts = ['Je suis la mauvaise herbe', "Me, myself and moi"]
|
||||
for text in texts:
|
||||
FR(text)
|