Tidy up and fix small bugs and typos

commit 25602c794c, parent 9e652afa4b
@@ -8,15 +8,14 @@ import time
 from collections import Counter
 from pathlib import Path
 from thinc.v2v import Affine, Maxout
-from thinc.api import wrap, layerize
 from thinc.misc import LayerNorm as LN
-from thinc.neural.util import prefer_gpu, get_array_module
+from thinc.neural.util import prefer_gpu
 from wasabi import Printer
 import srsly

 from ..tokens import Doc
 from ..attrs import ID, HEAD
-from .._ml import Tok2Vec, flatten, chain, zero_init, create_default_optimizer
+from .._ml import Tok2Vec, flatten, chain, create_default_optimizer
 from .._ml import masked_language_model
 from .. import util


@@ -136,7 +135,7 @@ pretrain(
 random.shuffle(texts)


-def make_update(model, docs, optimizer, drop=0.0, objective='L2'):
+def make_update(model, docs, optimizer, drop=0.0, objective="L2"):
 """Perform an update over a single batch of documents.

 docs (iterable): A batch of `Doc` objects.

@@ -171,7 +170,7 @@ def make_docs(nlp, batch, min_length=1, max_length=500):
 return docs


-def get_vectors_loss(ops, docs, prediction, objective='L2'):
+def get_vectors_loss(ops, docs, prediction, objective="L2"):
 """Compute a mean-squared error loss between the documents' vectors and
 the prediction.


@@ -185,9 +184,9 @@ def get_vectors_loss(ops, docs, prediction, objective='L2'):
 # and look them up all at once. This prevents data copying.
 ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
 target = docs[0].vocab.vectors.data[ids]
-if objective == 'L2':
+if objective == "L2":
 d_scores = prediction - target
-loss = (d_scores**2).sum()
+loss = (d_scores ** 2).sum()
 else:
 raise NotImplementedError(objective)
 return loss, d_scores

@@ -201,8 +200,7 @@ def create_pretraining_model(nlp, tok2vec):
 """
 output_size = nlp.vocab.vectors.data.shape[1]
 output_layer = chain(
-LN(Maxout(300, pieces=3)),
-Affine(output_size, drop_factor=0.0),
+LN(Maxout(300, pieces=3)), Affine(output_size, drop_factor=0.0)
 )
 # This is annoying, but the parser etc have the flatten step after
 # the tok2vec. To load the weights in cleanly, we need to match
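
The L2 objective in get_vectors_loss comes down to two array operations: the gradient is the raw difference between prediction and target, and the loss is its squared sum. A minimal numpy sketch with made-up values:

    import numpy

    # Numeric illustration of the "L2" branch above.
    prediction = numpy.asarray([[0.5, 1.0], [0.0, 2.0]])
    target = numpy.asarray([[0.0, 1.0], [1.0, 2.0]])
    d_scores = prediction - target        # gradient passed back to the model
    loss = (d_scores ** 2).sum()          # scalar reported during pretraining
    print(loss)  # 1.25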

@@ -13,13 +13,7 @@ RENDER_WRAPPER = None


 def render(
-docs,
-style="dep",
-page=False,
-minify=False,
-jupyter=False,
-options={},
-manual=False,
+docs, style="dep", page=False, minify=False, jupyter=False, options={}, manual=False
 ):
 """Render displaCy visualisation.


@@ -80,7 +74,7 @@ def serve(
 """
 from wsgiref import simple_server

-if IS_JUPYTER:
+if is_in_jupyter():
 user_warning(Warnings.W011)

 render(docs, style=style, page=page, minify=minify, options=options, manual=manual)
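
The signature above is unchanged in behaviour, only reflowed; callers keep passing the same keyword arguments. A hedged usage sketch (the model name is an assumption, any pipeline with a parser works):

    import spacy
    from spacy import displacy

    nlp = spacy.load("en_core_web_sm")
    doc = nlp("This is a sentence.")
    # Outside Jupyter, render() returns the markup as a string.
    html = displacy.render(doc, style="dep", page=True, minify=True)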

@@ -1,8 +1,9 @@
 # coding: utf8
 from __future__ import unicode_literals

-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
-from ..char_classes import CONCAT_QUOTES, CONCAT_ICONS, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER
+from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CONCAT_QUOTES
+from ..char_classes import CONCAT_ICONS, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER

+
 # removing ° from the special icons to keep e.g. 99° as one token
 _concat_icons = CONCAT_ICONS.replace("\u00B0", "")

@@ -29,7 +30,9 @@ _suffixes = (
 r"(?<=°[FfCcKk])\.",
 r"(?<=[0-9])(?:[{c}])".format(c=_currency),
 r"(?<=[0-9])(?:{u})".format(u=UNITS),
-r"(?<=[{al}{e}{q}(?:{c})])\.".format(al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, c=_currency),
+r"(?<=[{al}{e}{q}(?:{c})])\.".format(
+al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, c=_currency
+),
 r"(?<=[{al})])-e".format(al=ALPHA_LOWER),
 ]
 )

@@ -40,7 +43,7 @@ _infixes = (
 + [
 r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
 r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
-r'(?<=[{a}])[:<>=](?=[{a}])'.format(a=ALPHA),
+r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA),
 r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
 r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
 r"(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])".format(a=ALPHA, q=_quotes),
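
The ° handling is easiest to see with a plain re check. A small sketch of the suffix rule that splits a period only after °F/°C/°K, leaving the digit plus degree sign intact:

    import re

    # Lookbehind: a "." counts as a suffix only when preceded by °F, °f, °C, °c, °K or °k.
    suffix_re = re.compile(r"(?<=°[FfCcKk])\.")
    print(bool(suffix_re.search("100°C.")))  # True  -> the trailing "." is split off
    print(bool(suffix_re.search("99°")))     # False -> "99°" stays one token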

@@ -5,24 +5,24 @@ import re
 from collections import namedtuple

 from .tag_map import TAG_MAP

 from ...attrs import LANG
 from ...language import Language
 from ...tokens import Doc, Token
 from ...util import DummyTokenizer


 ShortUnitWord = namedtuple("ShortUnitWord", ["surface", "lemma", "pos"])

+# TODO: Is this the right place for this?
+Token.set_extension("mecab_tag", default=None)


 def try_mecab_import():
 """Mecab is required for Japanese support, so check for it.

 It it's not available blow up and explain how to fix it."""
 try:
 import MeCab

-# XXX Is this the right place for this?
-Token.set_extension("mecab_tag", default=None)
 return MeCab
 except ImportError:
 raise ImportError(

@@ -33,14 +33,13 @@ def try_mecab_import():

 def resolve_pos(token):
 """If necessary, add a field to the POS tag for UD mapping.

 Under Universal Dependencies, sometimes the same Unidic POS tag can
 be mapped differently depending on the literal token or its context
 in the sentence. This function adds information to the POS tag to
 resolve ambiguous mappings.
 """

-# NOTE: This is a first take. The rules here are crude approximations.
+# TODO: This is a first take. The rules here are crude approximations.
 # For many of these, full dependencies are needed to properly resolve
 # PoS mappings.


@@ -56,7 +55,7 @@ def resolve_pos(token):

 def detailed_tokens(tokenizer, text):
 """Format Mecab output into a nice data structure, based on Janome."""
-tokenizer.parse(text)
 node = tokenizer.parseToNode(text)
 node = node.next # first node is beginning of sentence and empty, skip it
 words = []

@@ -98,62 +97,15 @@ class JapaneseTokenizer(DummyTokenizer):
 return doc


-class JapaneseCharacterSegmenter(object):
-def __init__(self, vocab):
-self.vocab = vocab
-self._presegmenter = self._make_presegmenter(self.vocab)
-
-def _make_presegmenter(self, vocab):
-rules = Japanese.Defaults.tokenizer_exceptions
-token_match = Japanese.Defaults.token_match
-prefix_search = (
-util.compile_prefix_regex(Japanese.Defaults.prefixes).search
-if Japanese.Defaults.prefixes
-else None
-)
-suffix_search = (
-util.compile_suffix_regex(Japanese.Defaults.suffixes).search
-if Japanese.Defaults.suffixes
-else None
-)
-infix_finditer = (
-util.compile_infix_regex(Japanese.Defaults.infixes).finditer
-if Japanese.Defaults.infixes
-else None
-)
-return Tokenizer(
-vocab,
-rules=rules,
-prefix_search=prefix_search,
-suffix_search=suffix_search,
-infix_finditer=infix_finditer,
-token_match=token_match,
-)
-
-def __call__(self, text):
-words = []
-spaces = []
-doc = self._presegmenter(text)
-for token in doc:
-words.extend(list(token.text))
-spaces.extend([False] * len(token.text))
-spaces[-1] = bool(token.whitespace_)
-return Doc(self.vocab, words=words, spaces=spaces)
-
-
 class JapaneseDefaults(Language.Defaults):
 lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
 lex_attr_getters[LANG] = lambda _text: "ja"

 tag_map = TAG_MAP
-use_janome = True

 @classmethod
 def create_tokenizer(cls, nlp=None):
-if cls.use_janome:
-return JapaneseTokenizer(cls, nlp)
-else:
-return JapaneseCharacterSegmenter(nlp.vocab)
+return JapaneseTokenizer(cls, nlp)


 class Japanese(Language):
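
The module-level Token.set_extension call added above follows spaCy's custom-attribute pattern: register the attribute once at import time, then read or write it per token through the underscore namespace. A short sketch (the has_extension guard is an extra precaution, not part of the diff):

    from spacy.tokens import Token

    # Register once; afterwards every token exposes token._.mecab_tag.
    if not Token.has_extension("mecab_tag"):
        Token.set_extension("mecab_tag", default=None)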

@@ -2,10 +2,10 @@
 from __future__ import unicode_literals

 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+from .punctuation import TOKENIZER_INFIXES
 from .tag_map import TAG_MAP
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
-from .punctuation import TOKENIZER_INFIXES

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS

@@ -22,9 +22,9 @@ class PolishDefaults(Language.Defaults):
 Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
 )
 tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-infixes = tuple(TOKENIZER_INFIXES)
 stop_words = STOP_WORDS
 tag_map = TAG_MAP
+infixes = TOKENIZER_INFIXES


 class Polish(Language):

@@ -1,14 +1,22 @@
 # coding: utf8
 from __future__ import unicode_literals
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS
-from ..char_classes import QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
-_quotes = QUOTES.replace("'", '')
-_infixes = (LIST_ELLIPSES + LIST_ICONS +
-[r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
-r'(?<=[{a}])[,!?](?=[{a}])'.format(a=ALPHA),
-r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),
-r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
-r'(?<=[{a}])([{q}\)\]\(\[])(?=[\{a}])'.format(a=ALPHA, q=_quotes),
-r'(?<=[{a}])--(?=[{a}])'.format(a=ALPHA)])
+from ..char_classes import LIST_ELLIPSES, CONCAT_ICONS
+from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
+_quotes = CONCAT_QUOTES.replace("'", "")
+_infixes = (
+LIST_ELLIPSES
++ [CONCAT_ICONS]
++ [
+r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
+r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
+r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA),
+r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
+r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
+r"(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])".format(a=ALPHA, q=CONCAT_QUOTES),
+]
+)

 TOKENIZER_INFIXES = _infixes
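
A hedged sketch of how an infix list like the one above is consumed by the tokenizer machinery; ALPHA here is a simplified stand-in for the real character class:

    from spacy.util import compile_infix_regex

    ALPHA = "a-zA-Z"  # stand-in, not the actual spaCy character class
    infixes = [r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA)]
    infix_re = compile_infix_regex(infixes)
    # The tokenizer calls .finditer on the compiled pattern to split inside tokens.
    print([m.group(0) for m in infix_re.finditer("well--known")])  # ['--']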

@@ -2,6 +2,7 @@
 from __future__ import unicode_literals

 from ._tokenizer_exceptions_list import PL_BASE_EXCEPTIONS
+from ...symbols import POS, ADV, NOUN, ORTH, LEMMA, ADJ


 _exc = {}

@@ -6,7 +6,9 @@ from .tag_map import TAG_MAP
 from .stop_words import STOP_WORDS
 from .morph_rules import MORPH_RULES
 from .lemmatizer import LEMMA_RULES, LOOKUP
-from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
+# Punctuation stolen from Danish
+from ..da.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS

@@ -31,6 +33,7 @@ class SwedishDefaults(Language.Defaults):
 lemma_lookup = LOOKUP
 morph_rules = MORPH_RULES


 class Swedish(Language):
 lang = "sv"
 Defaults = SwedishDefaults

@@ -1,25 +0,0 @@
-# coding: utf8
-"""Punctuation stolen from Danish"""
-from __future__ import unicode_literals
-
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS
-from ..char_classes import QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
-from ..punctuation import TOKENIZER_SUFFIXES
-
-
-_quotes = QUOTES.replace("'", '')
-
-_infixes = (LIST_ELLIPSES + LIST_ICONS +
-[r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
-r'(?<=[{a}])[,!?](?=[{a}])'.format(a=ALPHA),
-r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),
-r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
-r'(?<=[{a}])([{q}\)\]\(\[])(?=[\{a}])'.format(a=ALPHA, q=_quotes),
-r'(?<=[{a}])--(?=[{a}])'.format(a=ALPHA)])
-
-_suffixes = [suffix for suffix in TOKENIZER_SUFFIXES if suffix not in ["'s", "'S", "’s", "’S", r"\'"]]
-_suffixes += [r"(?<=[^sSxXzZ])\'"]
-
-
-TOKENIZER_INFIXES = _infixes
-TOKENIZER_SUFFIXES = _suffixes

@@ -1,169 +1,191 @@
 # coding: utf8

-"""
-Tag mappings according to https://universaldependencies.org/tagset-conversion/sv-suc-uposf.html
-for https://github.com/UniversalDependencies/UD_Swedish-Talbanken
-"""

 from __future__ import unicode_literals

-from ...symbols import POS, PUNCT, ADJ, CONJ, CCONJ, SCONJ, SYM, NUM, DET, ADV, ADP, X, VERB
-from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON, AUX
+from ...symbols import POS, PUNCT, ADJ, CCONJ, SCONJ, NUM, DET, ADV
+from ...symbols import ADP, X, VERB, NOUN, PROPN, PART, INTJ, PRON


+# Tag mappings according to https://universaldependencies.org/tagset-conversion/sv-suc-uposf.html
+# for https://github.com/UniversalDependencies/UD_Swedish-Talbanken

 TAG_MAP = {
-'AB': { POS: ADV }, # inte, också, så, bara, nu
+"AB": {POS: ADV}, # inte, också, så, bara, nu
-'AB|AN': { POS: ADV }, # t.ex., ca, t_ex, bl.a., s_k
+"AB|AN": {POS: ADV}, # t.ex., ca, t_ex, bl.a., s_k
-'AB|KOM': { POS: ADV }, # mer, tidigare, mindre, vidare, mera
+"AB|KOM": {POS: ADV}, # mer, tidigare, mindre, vidare, mera
-'AB|POS': { POS: ADV }, # mycket, helt, ofta, länge, långt
+"AB|POS": {POS: ADV}, # mycket, helt, ofta, länge, långt
-'AB|SMS': { POS: ADV }, # över-, in-
+"AB|SMS": {POS: ADV}, # över-, in-
-'AB|SUV': { POS: ADV }, # minst, mest, högst, främst, helst
+"AB|SUV": {POS: ADV}, # minst, mest, högst, främst, helst
-'DT|MAS|SIN|DEF': { POS: DET },
+"DT|MAS|SIN|DEF": {POS: DET},
-'DT|MAS|SIN|IND': { POS: DET },
+"DT|MAS|SIN|IND": {POS: DET},
-'DT|NEU|SIN|DEF': { POS: DET }, # det, detta
+"DT|NEU|SIN|DEF": {POS: DET}, # det, detta
-'DT|NEU|SIN|IND': { POS: DET }, # ett, något, inget, vart, vartannat
+"DT|NEU|SIN|IND": {POS: DET}, # ett, något, inget, vart, vartannat
-'DT|NEU|SIN|IND/DEF': { POS: DET }, # allt
+"DT|NEU|SIN|IND/DEF": {POS: DET}, # allt
-'DT|UTR/NEU|PLU|DEF': { POS: DET }, # de, dessa, bägge, dom
+"DT|UTR/NEU|PLU|DEF": {POS: DET}, # de, dessa, bägge, dom
-'DT|UTR/NEU|PLU|IND': { POS: DET }, # några, inga
+"DT|UTR/NEU|PLU|IND": {POS: DET}, # några, inga
-'DT|UTR/NEU|PLU|IND/DEF': { POS: DET }, # alla
+"DT|UTR/NEU|PLU|IND/DEF": {POS: DET}, # alla
-'DT|UTR/NEU|SIN/PLU|IND': { POS: DET }, # samma
+"DT|UTR/NEU|SIN/PLU|IND": {POS: DET}, # samma
-'DT|UTR/NEU|SIN|DEF': { POS: DET }, # vardera
+"DT|UTR/NEU|SIN|DEF": {POS: DET}, # vardera
-'DT|UTR/NEU|SIN|IND': { POS: DET }, # varje, varenda
+"DT|UTR/NEU|SIN|IND": {POS: DET}, # varje, varenda
-'DT|UTR|SIN|DEF': { POS: DET }, # den, denna
+"DT|UTR|SIN|DEF": {POS: DET}, # den, denna
-'DT|UTR|SIN|IND': { POS: DET }, # en, någon, ingen, var, varannan
+"DT|UTR|SIN|IND": {POS: DET}, # en, någon, ingen, var, varannan
-'DT|UTR|SIN|IND/DEF': { POS: DET }, # all
+"DT|UTR|SIN|IND/DEF": {POS: DET}, # all
-'HA': { POS: ADV }, # när, där, hur, som, då
+"HA": {POS: ADV}, # när, där, hur, som, då
-'HD|NEU|SIN|IND': { POS: DET }, # vilket
+"HD|NEU|SIN|IND": {POS: DET}, # vilket
-'HD|UTR/NEU|PLU|IND': { POS: DET }, # vilka
+"HD|UTR/NEU|PLU|IND": {POS: DET}, # vilka
-'HD|UTR|SIN|IND': { POS: DET }, # vilken
+"HD|UTR|SIN|IND": {POS: DET}, # vilken
-'HP|-|-|-': { POS: PRON }, # som
+"HP|-|-|-": {POS: PRON}, # som
-'HP|NEU|SIN|IND': { POS: PRON }, # vad, vilket
+"HP|NEU|SIN|IND": {POS: PRON}, # vad, vilket
-'HP|NEU|SIN|IND|SMS': { POS: PRON },
+"HP|NEU|SIN|IND|SMS": {POS: PRON},
-'HP|UTR/NEU|PLU|IND': { POS: PRON }, # vilka
+"HP|UTR/NEU|PLU|IND": {POS: PRON}, # vilka
-'HP|UTR|SIN|IND': { POS: PRON }, # vilken, vem
+"HP|UTR|SIN|IND": {POS: PRON}, # vilken, vem
-'HS|DEF': { POS: DET }, # vars, vilkas, Vems
+"HS|DEF": {POS: DET}, # vars, vilkas, Vems
-'IE': { POS: PART }, # att
+"IE": {POS: PART}, # att
-'IN': { POS: INTJ }, # Jo, ja, nej, fan, visst
+"IN": {POS: INTJ}, # Jo, ja, nej, fan, visst
-'JJ|AN': { POS: ADJ }, # ev, S:t, Kungl, Kungl., Teol
+"JJ|AN": {POS: ADJ}, # ev, S:t, Kungl, Kungl., Teol
-'JJ|KOM|UTR/NEU|SIN/PLU|IND/DEF|GEN': { POS: ADJ }, # äldres
+"JJ|KOM|UTR/NEU|SIN/PLU|IND/DEF|GEN": {POS: ADJ}, # äldres
-'JJ|KOM|UTR/NEU|SIN/PLU|IND/DEF|NOM': { POS: ADJ }, # större, högre, mindre, bättre, äldre
+"JJ|KOM|UTR/NEU|SIN/PLU|IND/DEF|NOM": {
-'JJ|KOM|UTR/NEU|SIN/PLU|IND/DEF|SMS': { POS: ADJ },
+POS: ADJ
-'JJ|POS|MAS|SIN|DEF|GEN': { POS: ADJ }, # enskildes, sjukes, andres
+}, # större, högre, mindre, bättre, äldre
-'JJ|POS|MAS|SIN|DEF|NOM': { POS: ADJ }, # enskilde, sjuke, andre, unge, ene
+"JJ|KOM|UTR/NEU|SIN/PLU|IND/DEF|SMS": {POS: ADJ},
-'JJ|POS|NEU|SIN|IND/DEF|NOM': { POS: ADJ }, # eget
+"JJ|POS|MAS|SIN|DEF|GEN": {POS: ADJ}, # enskildes, sjukes, andres
-'JJ|POS|NEU|SIN|IND|GEN': { POS: ADJ },
+"JJ|POS|MAS|SIN|DEF|NOM": {POS: ADJ}, # enskilde, sjuke, andre, unge, ene
-'JJ|POS|NEU|SIN|IND|NOM': { POS: ADJ }, # annat, svårt, möjligt, nytt, sådant
+"JJ|POS|NEU|SIN|IND/DEF|NOM": {POS: ADJ}, # eget
-'JJ|POS|UTR/NEU|PLU|IND/DEF|GEN': { POS: ADJ }, # ogiftas, ungas, frånskildas, efterkommandes, färgblindas
+"JJ|POS|NEU|SIN|IND|GEN": {POS: ADJ},
-'JJ|POS|UTR/NEU|PLU|IND/DEF|NOM': { POS: ADJ }, # olika, andra, många, stora, vissa
+"JJ|POS|NEU|SIN|IND|NOM": {POS: ADJ}, # annat, svårt, möjligt, nytt, sådant
-'JJ|POS|UTR/NEU|PLU|IND|NOM': { POS: ADJ }, # flera, sådana, fler, få, samtliga
+"JJ|POS|UTR/NEU|PLU|IND/DEF|GEN": {
-'JJ|POS|UTR/NEU|SIN/PLU|IND|NOM': { POS: ADJ },
+POS: ADJ
-'JJ|POS|UTR/NEU|SIN/PLU|IND/DEF|NOM': { POS: ADJ }, # bra, ena, enda, nästa, ringa
+}, # ogiftas, ungas, frånskildas, efterkommandes, färgblindas
-'JJ|POS|UTR/NEU|SIN|DEF|GEN': { POS: ADJ },
+"JJ|POS|UTR/NEU|PLU|IND/DEF|NOM": {POS: ADJ}, # olika, andra, många, stora, vissa
-'JJ|POS|UTR/NEU|SIN|DEF|NOM': { POS: ADJ }, # hela, nya, andra, svenska, ekonomiska
+"JJ|POS|UTR/NEU|PLU|IND|NOM": {POS: ADJ}, # flera, sådana, fler, få, samtliga
-'JJ|POS|UTR|-|-|SMS': { POS: ADJ }, # fri-, låg-, sexual-
+"JJ|POS|UTR/NEU|SIN/PLU|IND|NOM": {POS: ADJ},
-'JJ|POS|UTR|SIN|IND/DEF|NOM': { POS: ADJ }, # egen
+"JJ|POS|UTR/NEU|SIN/PLU|IND/DEF|NOM": {POS: ADJ}, # bra, ena, enda, nästa, ringa
-'JJ|POS|UTR|SIN|IND|GEN': { POS: ADJ }, # enskilds
+"JJ|POS|UTR/NEU|SIN|DEF|GEN": {POS: ADJ},
-'JJ|POS|UTR|SIN|IND|NOM': { POS: ADJ }, # stor, annan, själv, sådan, viss
+"JJ|POS|UTR/NEU|SIN|DEF|NOM": {POS: ADJ}, # hela, nya, andra, svenska, ekonomiska
-'JJ|SUV|MAS|SIN|DEF|GEN': { POS: ADJ },
+"JJ|POS|UTR|-|-|SMS": {POS: ADJ}, # fri-, låg-, sexual-
-'JJ|SUV|MAS|SIN|DEF|NOM': { POS: ADJ }, # störste, främste, äldste, minste
+"JJ|POS|UTR|SIN|IND/DEF|NOM": {POS: ADJ}, # egen
-'JJ|SUV|UTR/NEU|PLU|DEF|NOM': { POS: ADJ }, # flesta
+"JJ|POS|UTR|SIN|IND|GEN": {POS: ADJ}, # enskilds
-'JJ|SUV|UTR/NEU|PLU|IND|NOM': { POS: ADJ },
+"JJ|POS|UTR|SIN|IND|NOM": {POS: ADJ}, # stor, annan, själv, sådan, viss
-'JJ|SUV|UTR/NEU|SIN/PLU|DEF|NOM': { POS: ADJ }, # bästa, största, närmaste, viktigaste, högsta
+"JJ|SUV|MAS|SIN|DEF|GEN": {POS: ADJ},
-'JJ|SUV|UTR/NEU|SIN/PLU|IND|NOM': { POS: ADJ }, # störst, bäst, tidigast, högst, fattigast
+"JJ|SUV|MAS|SIN|DEF|NOM": {POS: ADJ}, # störste, främste, äldste, minste
-'KN': { POS: CCONJ }, # och, eller, som, än, men
+"JJ|SUV|UTR/NEU|PLU|DEF|NOM": {POS: ADJ}, # flesta
-'KN|AN': { POS: CCONJ },
+"JJ|SUV|UTR/NEU|PLU|IND|NOM": {POS: ADJ},
-'MAD': { POS: PUNCT }, # ., ?, :, !, ...
+"JJ|SUV|UTR/NEU|SIN/PLU|DEF|NOM": {
-'MID': { POS: PUNCT }, # ,, -, :, *, ;
+POS: ADJ
-'NN|-|-|-|-': { POS: NOUN }, # godo, fjol, fullo, somras, måtto
+}, # bästa, största, närmaste, viktigaste, högsta
-'NN|AN': { POS: NOUN }, # kr, %, s., dr, kap.
+"JJ|SUV|UTR/NEU|SIN/PLU|IND|NOM": {
-'NN|NEU|-|-|-': { POS: NOUN },
+POS: ADJ
-'NN|NEU|-|-|SMS': { POS: NOUN }, # yrkes-, barn-, hem-, fack-, vatten-
+}, # störst, bäst, tidigast, högst, fattigast
-'NN|NEU|PLU|DEF|GEN': { POS: NOUN }, # barnens, årens, u-ländernas, företagens, århundradenas
+"KN": {POS: CCONJ}, # och, eller, som, än, men
-'NN|NEU|PLU|DEF|NOM': { POS: NOUN }, # barnen, u-länderna, åren, länderna, könen
+"KN|AN": {POS: CCONJ},
-'NN|NEU|PLU|IND|GEN': { POS: NOUN }, # slags, års, barns, länders, tusentals
+"MAD": {POS: PUNCT}, # ., ?, :, !, ...
-'NN|NEU|PLU|IND|NOM': { POS: NOUN }, # barn, år, fall, länder, problem
+"MID": {POS: PUNCT}, # ,, -, :, *, ;
-'NN|NEU|SIN|DEF|GEN': { POS: NOUN }, # äktenskapets, samhällets, barnets, 1800-talets, 1960-talets
+"NN|-|-|-|-": {POS: NOUN}, # godo, fjol, fullo, somras, måtto
-'NN|NEU|SIN|DEF|NOM': { POS: NOUN }, # äktenskapet, samhället, barnet, stället, hemmet
+"NN|AN": {POS: NOUN}, # kr, %, s., dr, kap.
-'NN|NEU|SIN|IND|GEN': { POS: NOUN }, # års, slags, lands, havs, företags
+"NN|NEU|-|-|-": {POS: NOUN},
-'NN|NEU|SIN|IND|NOM': { POS: NOUN }, # år, arbete, barn, sätt, äktenskap
+"NN|NEU|-|-|SMS": {POS: NOUN}, # yrkes-, barn-, hem-, fack-, vatten-
-'NN|SMS': { POS: NOUN }, # PCB-, Syd-
+"NN|NEU|PLU|DEF|GEN": {
-'NN|UTR|-|-|-': { POS: NOUN }, # dags, rätta
+POS: NOUN
-'NN|UTR|-|-|SMS': { POS: NOUN }, # far-, kibbutz-, röntgen-, barna-, hälso-
+}, # barnens, årens, u-ländernas, företagens, århundradenas
-'NN|UTR|PLU|DEF|GEN': { POS: NOUN }, # föräldrarnas, kvinnornas, elevernas, kibbutzernas, makarnas
+"NN|NEU|PLU|DEF|NOM": {POS: NOUN}, # barnen, u-länderna, åren, länderna, könen
-'NN|UTR|PLU|DEF|NOM': { POS: NOUN }, # kvinnorna, föräldrarna, makarna, männen, hyrorna
+"NN|NEU|PLU|IND|GEN": {POS: NOUN}, # slags, års, barns, länders, tusentals
-'NN|UTR|PLU|IND|GEN': { POS: NOUN }, # människors, kvinnors, dagars, tiders, månaders
+"NN|NEU|PLU|IND|NOM": {POS: NOUN}, # barn, år, fall, länder, problem
-'NN|UTR|PLU|IND|NOM': { POS: NOUN }, # procent, människor, kvinnor, miljoner, kronor
+"NN|NEU|SIN|DEF|GEN": {
-'NN|UTR|SIN|DEF|GEN': { POS: NOUN }, # kvinnans, världens, familjens, dagens, jordens
+POS: NOUN
-'NN|UTR|SIN|DEF|NOM': { POS: NOUN }, # familjen, kvinnan, mannen, världen, skolan
+}, # äktenskapets, samhällets, barnets, 1800-talets, 1960-talets
-'NN|UTR|SIN|IND|GEN': { POS: NOUN }, # sorts, medelålders, makes, kvinnas, veckas
+"NN|NEU|SIN|DEF|NOM": {
-'NN|UTR|SIN|IND|NOM': { POS: NOUN }, # del, tid, dag, fråga, man
+POS: NOUN
-'PAD': { POS: PUNCT }, # , ), (
+}, # äktenskapet, samhället, barnet, stället, hemmet
-'PC|AN': { POS: VERB },
+"NN|NEU|SIN|IND|GEN": {POS: NOUN}, # års, slags, lands, havs, företags
-'PC|PRF|MAS|SIN|DEF|GEN': { POS: VERB }, # avlidnes
+"NN|NEU|SIN|IND|NOM": {POS: NOUN}, # år, arbete, barn, sätt, äktenskap
-'PC|PRF|MAS|SIN|DEF|NOM': { POS: VERB },
+"NN|SMS": {POS: NOUN}, # PCB-, Syd-
-'PC|PRF|NEU|SIN|IND|NOM': { POS: VERB }, # taget, sett, särskilt, förbjudet, ökat
+"NN|UTR|-|-|-": {POS: NOUN}, # dags, rätta
-'PC|PRF|UTR/NEU|PLU|IND/DEF|GEN': { POS: VERB }, # försäkrades, anställdas
+"NN|UTR|-|-|SMS": {POS: NOUN}, # far-, kibbutz-, röntgen-, barna-, hälso-
-'PC|PRF|UTR/NEU|PLU|IND/DEF|NOM': { POS: VERB }, # särskilda, gifta, ökade, handikappade, skilda
+"NN|UTR|PLU|DEF|GEN": {
-'PC|PRF|UTR/NEU|SIN|DEF|GEN': { POS: VERB },
+POS: NOUN
-'PC|PRF|UTR/NEU|SIN|DEF|NOM': { POS: VERB }, # ökade, gifta, nämnda, nedärvda, dolda
+}, # föräldrarnas, kvinnornas, elevernas, kibbutzernas, makarnas
-'PC|PRF|UTR|SIN|IND|GEN': { POS: VERB },
+"NN|UTR|PLU|DEF|NOM": {
-'PC|PRF|UTR|SIN|IND|NOM': { POS: VERB }, # särskild, ökad, beredd, gift, oförändrad
+POS: NOUN
-'PC|PRS|UTR/NEU|SIN/PLU|IND/DEF|GEN': { POS: VERB }, # studerandes, sammanboendes, dubbelarbetandes
+}, # kvinnorna, föräldrarna, makarna, männen, hyrorna
-'PC|PRS|UTR/NEU|SIN/PLU|IND/DEF|NOM': { POS: VERB }, # följande, beroende, nuvarande, motsvarande, liknande
+"NN|UTR|PLU|IND|GEN": {POS: NOUN}, # människors, kvinnors, dagars, tiders, månaders
-'PL': { POS: PART }, # ut, upp, in, till, med
+"NN|UTR|PLU|IND|NOM": {POS: NOUN}, # procent, människor, kvinnor, miljoner, kronor
-'PL|SMS': { POS: PART },
+"NN|UTR|SIN|DEF|GEN": {POS: NOUN}, # kvinnans, världens, familjens, dagens, jordens
-'PM': { POS: PROPN }, # F, N, Liechtenstein, Danmark, DK
+"NN|UTR|SIN|DEF|NOM": {POS: NOUN}, # familjen, kvinnan, mannen, världen, skolan
-'PM|GEN': { POS: PROPN }, # Sveriges, EEC:s, Guds, Stockholms, Kristi
+"NN|UTR|SIN|IND|GEN": {POS: NOUN}, # sorts, medelålders, makes, kvinnas, veckas
-'PM|NOM': { POS: PROPN }, # Sverige, EEC, Stockholm, USA, ATP
+"NN|UTR|SIN|IND|NOM": {POS: NOUN}, # del, tid, dag, fråga, man
-'PM|SMS': { POS: PROPN }, # Göteborgs-, Nord-, Väst-
+"PAD": {POS: PUNCT}, # , ), (
-'PN|MAS|SIN|DEF|SUB/OBJ': { POS: PRON }, # denne
+"PC|AN": {POS: VERB},
-'PN|NEU|SIN|DEF|SUB/OBJ': { POS: PRON }, # det, detta, detsamma
+"PC|PRF|MAS|SIN|DEF|GEN": {POS: VERB}, # avlidnes
-'PN|NEU|SIN|IND|SUB/OBJ': { POS: PRON }, # något, allt, mycket, annat, ingenting
+"PC|PRF|MAS|SIN|DEF|NOM": {POS: VERB},
-'PN|UTR/NEU|PLU|DEF|OBJ': { POS: PRON }, # dem, varandra, varann
+"PC|PRF|NEU|SIN|IND|NOM": {POS: VERB}, # taget, sett, särskilt, förbjudet, ökat
-'PN|UTR/NEU|PLU|DEF|SUB': { POS: PRON }, # de, bägge
+"PC|PRF|UTR/NEU|PLU|IND/DEF|GEN": {POS: VERB}, # försäkrades, anställdas
-'PN|UTR/NEU|PLU|DEF|SUB/OBJ': { POS: PRON }, # dessa, dom, båda, den, bådadera
+"PC|PRF|UTR/NEU|PLU|IND/DEF|NOM": {
-'PN|UTR/NEU|PLU|IND|SUB/OBJ': { POS: PRON }, # andra, alla, många, sådana, några
+POS: VERB
-'PN|UTR/NEU|SIN/PLU|DEF|OBJ': { POS: PRON }, # sig, sej
+}, # särskilda, gifta, ökade, handikappade, skilda
-'PN|UTR|PLU|DEF|OBJ': { POS: PRON }, # oss, er, eder
+"PC|PRF|UTR/NEU|SIN|DEF|GEN": {POS: VERB},
-'PN|UTR|PLU|DEF|SUB': { POS: PRON }, # vi
+"PC|PRF|UTR/NEU|SIN|DEF|NOM": {POS: VERB}, # ökade, gifta, nämnda, nedärvda, dolda
-'PN|UTR|SIN|DEF|OBJ': { POS: PRON }, # dig, mig, henne, honom, Er
+"PC|PRF|UTR|SIN|IND|GEN": {POS: VERB},
-'PN|UTR|SIN|DEF|SUB': { POS: PRON }, # du, han, hon, jag, ni
+"PC|PRF|UTR|SIN|IND|NOM": {POS: VERB}, # särskild, ökad, beredd, gift, oförändrad
-'PN|UTR|SIN|DEF|SUB/OBJ': { POS: PRON }, # den, denna, densamma
+"PC|PRS|UTR/NEU|SIN/PLU|IND/DEF|GEN": {
-'PN|UTR|SIN|IND|SUB': { POS: PRON }, # man
+POS: VERB
-'PN|UTR|SIN|IND|SUB/OBJ': { POS: PRON }, # en, var, någon, ingen, Varannan
+}, # studerandes, sammanboendes, dubbelarbetandes
-'PP': { POS: ADP }, # i, av, på, för, till
+"PC|PRS|UTR/NEU|SIN/PLU|IND/DEF|NOM": {
-'PP|AN': { POS: ADP }, # f
+POS: VERB
-'PS|AN': { POS: DET },
+}, # följande, beroende, nuvarande, motsvarande, liknande
-'PS|NEU|SIN|DEF': { POS: DET }, # sitt, vårt, ditt, mitt, ert
+"PL": {POS: PART}, # ut, upp, in, till, med
-'PS|UTR/NEU|PLU|DEF': { POS: DET }, # sina, våra, dina, mina
+"PL|SMS": {POS: PART},
-'PS|UTR/NEU|SIN/PLU|DEF': { POS: DET }, # deras, dess, hans, hennes, varandras
+"PM": {POS: PROPN}, # F, N, Liechtenstein, Danmark, DK
-'PS|UTR|SIN|DEF': { POS: DET }, # sin, vår, din, min, er
+"PM|GEN": {POS: PROPN}, # Sveriges, EEC:s, Guds, Stockholms, Kristi
-'RG': { POS: NUM }, # 2, 17, 20, 1, 18
+"PM|NOM": {POS: PROPN}, # Sverige, EEC, Stockholm, USA, ATP
-'RG|GEN': { POS: NUM },
+"PM|SMS": {POS: PROPN}, # Göteborgs-, Nord-, Väst-
-'RG|MAS|SIN|DEF|NOM': { POS: NUM },
+"PN|MAS|SIN|DEF|SUB/OBJ": {POS: PRON}, # denne
-'RG|NEU|SIN|IND|NOM': { POS: NUM }, # ett
+"PN|NEU|SIN|DEF|SUB/OBJ": {POS: PRON}, # det, detta, detsamma
-'RG|NOM': { POS: NUM }, # två, tre, 1, 20, 2
+"PN|NEU|SIN|IND|SUB/OBJ": {POS: PRON}, # något, allt, mycket, annat, ingenting
-'RG|SMS': { POS: NUM }, # ett-, 1950-, två-, tre-, 1700-
+"PN|UTR/NEU|PLU|DEF|OBJ": {POS: PRON}, # dem, varandra, varann
-'RG|UTR/NEU|SIN|DEF|NOM': { POS: NUM },
+"PN|UTR/NEU|PLU|DEF|SUB": {POS: PRON}, # de, bägge
-'RG|UTR|SIN|IND|NOM': { POS: NUM }, # en
+"PN|UTR/NEU|PLU|DEF|SUB/OBJ": {POS: PRON}, # dessa, dom, båda, den, bådadera
-'RO|MAS|SIN|IND/DEF|GEN': { POS: ADJ },
+"PN|UTR/NEU|PLU|IND|SUB/OBJ": {POS: PRON}, # andra, alla, många, sådana, några
-'RO|MAS|SIN|IND/DEF|NOM': { POS: ADJ }, # förste
+"PN|UTR/NEU|SIN/PLU|DEF|OBJ": {POS: PRON}, # sig, sej
-'RO|GEN': { POS: ADJ },
+"PN|UTR|PLU|DEF|OBJ": {POS: PRON}, # oss, er, eder
-'RO|NOM': { POS: ADJ }, # första, andra, tredje, fjärde, femte
+"PN|UTR|PLU|DEF|SUB": {POS: PRON}, # vi
-'SN': { POS: SCONJ }, # att, om, innan, eftersom, medan
+"PN|UTR|SIN|DEF|OBJ": {POS: PRON}, # dig, mig, henne, honom, Er
-'UO': { POS: X }, # companionship, vice, versa, family, capita
+"PN|UTR|SIN|DEF|SUB": {POS: PRON}, # du, han, hon, jag, ni
-'VB|AN': { POS: VERB }, # jfr
+"PN|UTR|SIN|DEF|SUB/OBJ": {POS: PRON}, # den, denna, densamma
-'VB|IMP|AKT': { POS: VERB }, # se, Diskutera, låt, Läs, Gå
+"PN|UTR|SIN|IND|SUB": {POS: PRON}, # man
-'VB|IMP|SFO': { POS: VERB }, # tas
+"PN|UTR|SIN|IND|SUB/OBJ": {POS: PRON}, # en, var, någon, ingen, Varannan
-'VB|INF|AKT': { POS: VERB }, # vara, få, ha, bli, kunna
+"PP": {POS: ADP}, # i, av, på, för, till
-'VB|INF|SFO': { POS: VERB }, # användas, finnas, göras, tas, ses
+"PP|AN": {POS: ADP}, # f
-'VB|KON|PRS|AKT': { POS: VERB }, # vare, Gånge
+"PS|AN": {POS: DET},
-'VB|KON|PRT|AKT': { POS: VERB }, # vore, finge
+"PS|NEU|SIN|DEF": {POS: DET}, # sitt, vårt, ditt, mitt, ert
-'VB|KON|PRT|SFO': { POS: VERB },
+"PS|UTR/NEU|PLU|DEF": {POS: DET}, # sina, våra, dina, mina
-'VB|PRS|AKT': { POS: VERB }, # är, har, kan, får, måste
+"PS|UTR/NEU|SIN/PLU|DEF": {POS: DET}, # deras, dess, hans, hennes, varandras
-'VB|PRS|SFO': { POS: VERB }, # finns, kallas, behövs, beräknas, används
+"PS|UTR|SIN|DEF": {POS: DET}, # sin, vår, din, min, er
-'VB|PRT|AKT': { POS: VERB }, # skulle, var, hade, kunde, fick
+"RG": {POS: NUM}, # 2, 17, 20, 1, 18
-'VB|PRT|SFO': { POS: VERB }, # fanns, gjordes, höjdes, användes, infördes
+"RG|GEN": {POS: NUM},
-'VB|SMS': { POS: VERB }, # läs-
+"RG|MAS|SIN|DEF|NOM": {POS: NUM},
-'VB|SUP|AKT': { POS: VERB }, # varit, fått, blivit, haft, kommit
+"RG|NEU|SIN|IND|NOM": {POS: NUM}, # ett
-'VB|SUP|SFO': { POS: VERB } # nämnts, gjorts, förändrats, sagts, framhållits
+"RG|NOM": {POS: NUM}, # två, tre, 1, 20, 2
+"RG|SMS": {POS: NUM}, # ett-, 1950-, två-, tre-, 1700-
+"RG|UTR/NEU|SIN|DEF|NOM": {POS: NUM},
+"RG|UTR|SIN|IND|NOM": {POS: NUM}, # en
+"RO|MAS|SIN|IND/DEF|GEN": {POS: ADJ},
+"RO|MAS|SIN|IND/DEF|NOM": {POS: ADJ}, # förste
+"RO|GEN": {POS: ADJ},
+"RO|NOM": {POS: ADJ}, # första, andra, tredje, fjärde, femte
+"SN": {POS: SCONJ}, # att, om, innan, eftersom, medan
+"UO": {POS: X}, # companionship, vice, versa, family, capita
+"VB|AN": {POS: VERB}, # jfr
+"VB|IMP|AKT": {POS: VERB}, # se, Diskutera, låt, Läs, Gå
+"VB|IMP|SFO": {POS: VERB}, # tas
+"VB|INF|AKT": {POS: VERB}, # vara, få, ha, bli, kunna
+"VB|INF|SFO": {POS: VERB}, # användas, finnas, göras, tas, ses
+"VB|KON|PRS|AKT": {POS: VERB}, # vare, Gånge
+"VB|KON|PRT|AKT": {POS: VERB}, # vore, finge
+"VB|KON|PRT|SFO": {POS: VERB},
+"VB|PRS|AKT": {POS: VERB}, # är, har, kan, får, måste
+"VB|PRS|SFO": {POS: VERB}, # finns, kallas, behövs, beräknas, används
+"VB|PRT|AKT": {POS: VERB}, # skulle, var, hade, kunde, fick
+"VB|PRT|SFO": {POS: VERB}, # fanns, gjordes, höjdes, användes, infördes
+"VB|SMS": {POS: VERB}, # läs-
+"VB|SUP|AKT": {POS: VERB}, # varit, fått, blivit, haft, kommit
+"VB|SUP|SFO": {POS: VERB}, # nämnts, gjorts, förändrats, sagts, framhållits
 }

@@ -144,7 +144,7 @@ ABBREVIATIONS = [

 # Add abbreviation for trailing punctuation too. If the abbreviation already has a trailing punctuation - skip it.
 for abbr in ABBREVIATIONS:
-if abbr.endswith(".") == False:
+if not abbr.endswith("."):
 ABBREVIATIONS.append(abbr + ".")

 for orth in ABBREVIATIONS:

@@ -4,16 +4,15 @@ from __future__ import unicode_literals
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS

-from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
 from ...attrs import LANG
-from ...util import update_exc


 class TamilDefaults(Language.Defaults):
 lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
 lex_attr_getters[LANG] = lambda text: "ta"
 lex_attr_getters.update(LEX_ATTRS)
+stop_words = STOP_WORDS


 class Tamil(Language):

@@ -4,70 +4,33 @@ from __future__ import unicode_literals
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS

-# uncomment if files are available
-# from .norm_exceptions import NORM_EXCEPTIONS
-from .tag_map import TAG_MAP
-# from .morph_rules import MORPH_RULES
-
-# uncomment if lookup-based lemmatizer is available
 from .lemmatizer import LOOKUP
-# from ...lemmatizerlookup import Lemmatizer

-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
 from ...attrs import LANG, NORM
 from ...util import update_exc, add_lookups


 def _return_tl(_):
-return 'tl'
+return "tl"


-# Create a Language subclass
-# Documentation: https://spacy.io/docs/usage/adding-languages
-
-# This file should be placed in spacy/lang/xx (ISO code of language).
-# Before submitting a pull request, make sure the remove all comments from the
-# language data files, and run at least the basic tokenizer tests. Simply add the
-# language ID to the list of languages in spacy/tests/conftest.py to include it
-# in the basic tokenizer sanity tests. You can optionally add a fixture for the
-# language's tokenizer and add more specific tests. For more info, see the
-# tests documentation: https://github.com/explosion/spaCy/tree/master/spacy/tests


 class TagalogDefaults(Language.Defaults):
 lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-lex_attr_getters[LANG] = _return_tl # ISO code
-# add more norm exception dictionaries here
-lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
-# overwrite functions for lexical attributes
+lex_attr_getters[LANG] = _return_tl
+lex_attr_getters[NORM] = add_lookups(
+Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
+)
 lex_attr_getters.update(LEX_ATTRS)

-# add custom tokenizer exceptions to base exceptions
 tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)

-# add stop words
 stop_words = STOP_WORDS
+lemma_lookup = LOOKUP
-# if available: add tag map
-# tag_map = dict(TAG_MAP)
-
-# if available: add morph rules
-# morph_rules = dict(MORPH_RULES)
-
-# if available: add lookup lemmatizer
-# @classmethod
-# def create_lemmatizer(cls, nlp=None):
-# return Lemmatizer(LOOKUP)


 class Tagalog(Language):
-lang = 'tl' # ISO code
+lang = "tl"
-Defaults = TagalogDefaults # set Defaults to custom language defaults
+Defaults = TagalogDefaults


-# set default export – this allows the language class to be lazy-loaded
-__all__ = ['Tagalog']
+__all__ = ["Tagalog"]

@@ -2,11 +2,6 @@
 from __future__ import unicode_literals


-# Adding a lemmatizer lookup table
-# Documentation: https://spacy.io/docs/usage/adding-languages#lemmatizer
-# Entries should be added in the following format:
-
-
 LOOKUP = {
 "kaugnayan": "ugnay",
 "sangkatauhan": "tao",

@@ -14,5 +9,5 @@ LOOKUP = {
 "pandaigdigan": "daigdig",
 "kasaysayan": "saysay",
 "kabayanihan": "bayani",
-"karuwagan": "duwag"
+"karuwagan": "duwag",
 }

@@ -1,33 +1,55 @@
 # coding: utf8
 from __future__ import unicode_literals

-# import the symbols for the attrs you want to overwrite
 from ...attrs import LIKE_NUM


-# Overwriting functions for lexical attributes
-# Documentation: https://spacy.io/docs/usage/adding-languages#lex-attrs
-# Most of these functions, like is_lower or like_url should be language-
-# independent. Others, like like_num (which includes both digits and number
-# words), requires customisation.
-
-
-# Example: check if token resembles a number
-
-_num_words = ['sero', 'isa', 'dalawa', 'tatlo', 'apat', 'lima', 'anim', 'pito',
-'walo', 'siyam', 'sampu', 'labing-isa', 'labindalawa', 'labintatlo', 'labing-apat',
-'labinlima', 'labing-anim', 'labimpito', 'labing-walo', 'labinsiyam', 'dalawampu',
-'tatlumpu', 'apatnapu', 'limampu', 'animnapu', 'pitumpu', 'walumpu', 'siyamnapu',
-'daan', 'libo', 'milyon', 'bilyon', 'trilyon', 'quadrilyon',
-'gajilyon', 'bazilyon']
+_num_words = [
+"sero",
+"isa",
+"dalawa",
+"tatlo",
+"apat",
+"lima",
+"anim",
+"pito",
+"walo",
+"siyam",
+"sampu",
+"labing-isa",
+"labindalawa",
+"labintatlo",
+"labing-apat",
+"labinlima",
+"labing-anim",
+"labimpito",
+"labing-walo",
+"labinsiyam",
+"dalawampu",
+"tatlumpu",
+"apatnapu",
+"limampu",
+"animnapu",
+"pitumpu",
+"walumpu",
+"siyamnapu",
+"daan",
+"libo",
+"milyon",
+"bilyon",
+"trilyon",
+"quadrilyon",
+"gajilyon",
+"bazilyon",
+]


 def like_num(text):
-text = text.replace(',', '').replace('.', '')
+text = text.replace(",", "").replace(".", "")
 if text.isdigit():
 return True
-if text.count('/') == 1:
+if text.count("/") == 1:
-num, denom = text.split('/')
+num, denom = text.split("/")
 if num.isdigit() and denom.isdigit():
 return True
 if text in _num_words:

@@ -35,9 +57,4 @@ def like_num(text):
 return False


-# Create dictionary of functions to overwrite. The default lex_attr_getters are
-# updated with this one, so only the functions defined here are overwritten.
-
-LEX_ATTRS = {
-LIKE_NUM: like_num
-}
+LEX_ATTRS = {LIKE_NUM: like_num}
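
A condensed restatement of the like_num logic above, handy as a quick sanity check (the word list is abbreviated):

    _num_words = ["sero", "isa", "dalawa", "tatlo"]  # abbreviated list

    def like_num(text):
        # Strip thousands/decimal separators, then accept digits, simple
        # fractions, and number words from the list.
        text = text.replace(",", "").replace(".", "")
        if text.isdigit():
            return True
        if text.count("/") == 1:
            num, denom = text.split("/")
            if num.isdigit() and denom.isdigit():
                return True
        return text in _num_words

    print(like_num("10,000"), like_num("3/4"), like_num("dalawa"))  # True True True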

@@ -1,162 +1,154 @@
 # encoding: utf8
 from __future__ import unicode_literals

-# Add stop words
-# Documentation: https://spacy.io/docs/usage/adding-languages#stop-words
-# To improve readability, words should be ordered alphabetically and separated
-# by spaces and newlines. When adding stop words from an online source, always
-# include the link in a comment. Make sure to proofread and double-check the
-# words – lists available online are often known to contain mistakes.
-
-# data from https://github.com/stopwords-iso/stopwords-tl/blob/master/stopwords-tl.txt
-
-STOP_WORDS = set("""
+STOP_WORDS = set(
+"""
 akin
 aking
 ako
 alin
 am
 amin
 aming
 ang
 ano
 anumang
 apat
 at
 atin
 ating
 ay
 bababa
 bago
 bakit
 bawat
 bilang
 dahil
 dalawa
 dapat
 din
 dito
 doon
 gagawin
 gayunman
 ginagawa
 ginawa
 ginawang
 gumawa
 gusto
 habang
 hanggang
 hindi
 huwag
 iba
 ibaba
 ibabaw
 ibig
 ikaw
 ilagay
 ilalim
 ilan
 inyong
 isa
 isang
 itaas
 ito
 iyo
 iyon
 iyong
 ka
 kahit
 kailangan
 kailanman
 kami
 kanila
 kanilang
 kanino
 kanya
 kanyang
 kapag
 kapwa
 karamihan
 katiyakan
 katulad
 kaya
 kaysa
 ko
 kong
 kulang
 kumuha
 kung
 laban
 lahat
 lamang
 likod
 lima
 maaari
 maaaring
 maging
 mahusay
 makita
 marami
 marapat
 masyado
 may
 mayroon
 mga
 minsan
 mismo
 mula
 muli
 na
 nabanggit
 naging
 nagkaroon
 nais
 nakita
 namin
 napaka
 narito
 nasaan
 ng
 ngayon
 ni
 nila
 nilang
 nito
 niya
 niyang
 noon
 o
 pa
 paano
 pababa
 paggawa
 pagitan
 pagkakaroon
 pagkatapos
 palabas
 pamamagitan
 panahon
 pangalawa
 para
 paraan
 pareho
 pataas
 pero
 pumunta
 pumupunta
 sa
 saan
 sabi
 sabihin
 sarili
 sila
 sino
 siya
 tatlo
 tayo
 tulad
 tungkol
 una
 walang
-""".split())
+""".split()
+)

@@ -1,36 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-from ...symbols import POS, ADV, NOUN, ADP, PRON, SCONJ, PROPN, DET, SYM, INTJ
-from ...symbols import PUNCT, NUM, AUX, X, CONJ, ADJ, VERB, PART, SPACE, CCONJ
-
-
-# Add a tag map
-# Documentation: https://spacy.io/docs/usage/adding-languages#tag-map
-# Universal Dependencies: http://universaldependencies.org/u/pos/all.html
-# The keys of the tag map should be strings in your tag set. The dictionary must
-# have an entry POS whose value is one of the Universal Dependencies tags.
-# Optionally, you can also include morphological features or other attributes.
-
-
-TAG_MAP = {
-"ADV": {POS: ADV},
-"NOUN": {POS: NOUN},
-"ADP": {POS: ADP},
-"PRON": {POS: PRON},
-"SCONJ": {POS: SCONJ},
-"PROPN": {POS: PROPN},
-"DET": {POS: DET},
-"SYM": {POS: SYM},
-"INTJ": {POS: INTJ},
-"PUNCT": {POS: PUNCT},
-"NUM": {POS: NUM},
-"AUX": {POS: AUX},
-"X": {POS: X},
-"CONJ": {POS: CONJ},
-"CCONJ": {POS: CCONJ},
-"ADJ": {POS: ADJ},
-"VERB": {POS: VERB},
-"PART": {POS: PART},
-"SP": {POS: SPACE}
-}

@@ -1,48 +1,20 @@
 # coding: utf8
 from __future__ import unicode_literals

-# import symbols – if you need to use more, add them here
-from ...symbols import ORTH, LEMMA, TAG, NORM, ADP, DET
+from ...symbols import ORTH, LEMMA


-# Add tokenizer exceptions
-# Documentation: https://spacy.io/docs/usage/adding-languages#tokenizer-exceptions
-# Feel free to use custom logic to generate repetitive exceptions more efficiently.
-# If an exception is split into more than one token, the ORTH values combined always
-# need to match the original string.
-
-# Exceptions should be added in the following format:
-
 _exc = {
-"tayo'y": [
-{ORTH: "tayo", LEMMA: "tayo"},
-{ORTH: "'y", LEMMA: "ay"}],
-"isa'y": [
-{ORTH: "isa", LEMMA: "isa"},
-{ORTH: "'y", LEMMA: "ay"}],
-"baya'y": [
-{ORTH: "baya", LEMMA: "bayan"},
-{ORTH: "'y", LEMMA: "ay"}],
-"sa'yo": [
-{ORTH: "sa", LEMMA: "sa"},
-{ORTH: "'yo", LEMMA: "iyo"}],
-"ano'ng": [
-{ORTH: "ano", LEMMA: "ano"},
-{ORTH: "'ng", LEMMA: "ang"}],
-"siya'y": [
-{ORTH: "siya", LEMMA: "siya"},
-{ORTH: "'y", LEMMA: "ay"}],
-"nawa'y": [
-{ORTH: "nawa", LEMMA: "nawa"},
-{ORTH: "'y", LEMMA: "ay"}],
-"papa'no": [
-{ORTH: "papa'no", LEMMA: "papaano"}],
-"'di": [
-{ORTH: "'di", LEMMA: "hindi"}]
+"tayo'y": [{ORTH: "tayo", LEMMA: "tayo"}, {ORTH: "'y", LEMMA: "ay"}],
+"isa'y": [{ORTH: "isa", LEMMA: "isa"}, {ORTH: "'y", LEMMA: "ay"}],
+"baya'y": [{ORTH: "baya", LEMMA: "bayan"}, {ORTH: "'y", LEMMA: "ay"}],
+"sa'yo": [{ORTH: "sa", LEMMA: "sa"}, {ORTH: "'yo", LEMMA: "iyo"}],
+"ano'ng": [{ORTH: "ano", LEMMA: "ano"}, {ORTH: "'ng", LEMMA: "ang"}],
+"siya'y": [{ORTH: "siya", LEMMA: "siya"}, {ORTH: "'y", LEMMA: "ay"}],
+"nawa'y": [{ORTH: "nawa", LEMMA: "nawa"}, {ORTH: "'y", LEMMA: "ay"}],
+"papa'no": [{ORTH: "papa'no", LEMMA: "papaano"}],
+"'di": [{ORTH: "'di", LEMMA: "hindi"}],
 }


-# To keep things clean and readable, it's recommended to only declare the
-# TOKENIZER_EXCEPTIONS at the bottom:
-
 TOKENIZER_EXCEPTIONS = _exc
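
The rule from the removed comment still holds for the entries above: the ORTH values of the sub-tokens must concatenate back to the original string. A small hypothetical checker:

    from spacy.symbols import ORTH

    def orth_pieces_match(exc):
        # Every exception string must equal the concatenation of its ORTH pieces.
        return all(
            "".join(piece[ORTH] for piece in pieces) == string
            for string, pieces in exc.items()
        )

    print(orth_pieces_match({"tayo'y": [{ORTH: "tayo"}, {ORTH: "'y"}]}))  # True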

@@ -3,7 +3,7 @@ from __future__ import unicode_literals

 import re

-from ..symbols import ORTH, POS, TAG, LEMMA, SPACE, PUNCT
+from ..symbols import ORTH, POS, TAG, LEMMA, SPACE


 # URL validation regex courtesy of: https://mathiasbynens.be/demo/url-regex
@@ -5,71 +5,32 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS

-# uncomment if files are available
-# from .norm_exceptions import NORM_EXCEPTIONS
-# from .tag_map import TAG_MAP
-# from .morph_rules import MORPH_RULES

-# uncomment if lookup-based lemmatizer is available
-# from .lemmatizer import LOOKUP
-# from ...lemmatizerlookup import Lemmatizer

from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...util import update_exc, add_lookups
from ...language import Language
-from ...attrs import LANG, LIKE_NUM, NORM
+from ...attrs import LANG, NORM
# from .tag_map import TAG_MAP
from .lemmatizer import UkrainianLemmatizer

-# Create a Language subclass
-# Documentation: https://spacy.io/docs/usage/adding-languages
-# This file should be placed in spacy/lang/xx (ISO code of language).
-# Before submitting a pull request, make sure to remove all comments from the
-# language data files, and run at least the basic tokenizer tests. Simply add the
-# language ID to the list of languages in spacy/tests/conftest.py to include it
-# in the basic tokenizer sanity tests. You can optionally add a fixture for the
-# language's tokenizer and add more specific tests. For more info, see the
-# tests documentation: https://github.com/explosion/spaCy/tree/master/spacy/tests


class UkrainianDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: 'uk'  # ISO code
+    lex_attr_getters[LANG] = lambda text: "uk"
-    # add more norm exception dictionaries here
    lex_attr_getters[NORM] = add_lookups(
        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
    )
-    # overwrite functions for lexical attributes
    lex_attr_getters.update(LEX_ATTRS)
-    # add custom tokenizer exceptions to base exceptions
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    # add stop words
    stop_words = STOP_WORDS
-    # if available: add tag map
-    # tag_map = dict(TAG_MAP)
-    # if available: add morph rules
-    # morph_rules = dict(MORPH_RULES)
-    # if available: add lookup lemmatizer
-    # @classmethod
-    # def create_lemmatizer(cls, nlp=None):
-    #     return Lemmatizer(LOOKUP)

    @classmethod
    def create_lemmatizer(cls, nlp=None):
        return UkrainianLemmatizer()


class Ukrainian(Language):
-    lang = 'uk'  # ISO code
+    lang = "uk"
-    Defaults = UkrainianDefaults  # set Defaults to custom language defaults
+    Defaults = UkrainianDefaults


-# set default export – this allows the language class to be lazy-loaded
__all__ = ["Ukrainian"]
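A short usage sketch of the class assembled above (assumed setup: pymorphy2 and pymorphy2-dicts-uk are installed, since create_lemmatizer instantiates UkrainianLemmatizer when the vocab is built):

from spacy.lang.uk import Ukrainian

nlp = Ukrainian()  # bare pipeline: tokenizer, lexical attributes, stop words
doc = nlp("Де у Києві найсмачніша кава?")
print([token.text for token in doc])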
@@ -14,10 +14,10 @@ sentences = [
    "Ніч на середу буде морозною.",
    "Чим кращі книги ти читав, тим гірше спиш.",  # Serhiy Zhadan
    "Найстаріші ґудзики, відомі людству, археологи знайшли в долині ріки Інд.",
    "Слов'янське слово «Україна» вперше згадується у Київському літописному зводі за Іпатіївським списком під 1187 роком.",  # wikipedia
    "Де у Києві найсмачніша кава?",
    "Від Нижнього озера довгими дерев’яними сходами, над якими синьо й біло горіли маленькі коробочки-ліхтарики, підіймалися до нього двоє стовусів: найкращий друг Вертутій і його дванадцятилітній онук Чублик.",  # blyznets_viktor_semenovych/zemlia_svitliachkiv
    "Китайський космічний зонд \"Чан'е-4\" вперше в історії здійснив м'яку посадку на зворотному боці Місяця.",
    "Коли до губ твоїх лишається півподиху, коли до губ твоїх лишається півкроку – зіниці твої виткані із подиву, в очах у тебе синьо і широко.",  # Hryhorij Czubaj
    "Дорогу сестру збираю у дорогу, а брати вирішили не брати машину.",  # homographs
]
@@ -1,12 +1,15 @@
# coding: utf8
from __future__ import unicode_literals

from ..ru.lemmatizer import RussianLemmatizer


class UkrainianLemmatizer(RussianLemmatizer):
    def __init__(self, pymorphy2_lang="ru"):
        try:
            super(UkrainianLemmatizer, self).__init__(pymorphy2_lang="uk")
        except ImportError:
            raise ImportError(
                "The Ukrainian lemmatizer requires the pymorphy2 library and dictionaries: "
                'try to fix it with "pip install git+https://github.com/kmike/pymorphy2.git pymorphy2-dicts-uk"'
            )
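The lemmatizer can also be driven directly; a sketch under the assumption that the inherited RussianLemmatizer call signature is (string, univ_pos) and that the pymorphy2 Ukrainian dictionaries named in the error message above are installed:

from spacy.lang.uk.lemmatizer import UkrainianLemmatizer

lemmatizer = UkrainianLemmatizer()
# Assumed to return a list of candidate lemmas for the given Universal POS tag.
print(lemmatizer("ґудзики", "NOUN"))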
@@ -1,32 +1,68 @@
# coding: utf8
from __future__ import unicode_literals

-# import the symbols for the attrs you want to overwrite
from ...attrs import LIKE_NUM

-# Overwriting functions for lexical attributes
-# Documentation: https://localhost:1234/docs/usage/adding-languages#lex-attrs
-# Most of these functions, like is_lower or like_url should be language-
-# independent. Others, like like_num (which includes both digits and number
-# words), requires customisation.

-# Example: check if token resembles a number

_num_words = [
    "більйон",
    "вісім",
    "вісімдесят",
    "вісімнадцять",
    "вісімсот",
    "восьмий",
    "два",
    "двадцять",
    "дванадцять",
    "двісті",
    "дев'яносто",
    "дев'ятнадцять",
    "дев'ятсот",
    "дев'ять",
    "десять",
    "децильйон",
    "квадрильйон",
    "квінтильйон",
    "мільйон",
    "мільярд",
    "нонильйон",
    "один",
    "одинадцять",
    "октильйон",
    "п'ятий",
    "п'ятисотий",
    "п'ятнадцять",
    "п'ятсот",
    "п'ять",
    "секстильйон",
    "септильйон",
    "сім",
    "сімдесят",
    "сімнадцять",
    "сімсот",
    "сорок",
    "сто",
    "тисяча",
    "три",
    "тридцять",
    "трильйон",
    "тринадцять",
    "триста",
    "чотири",
    "чотириста",
    "чотирнадцять",
    "шістдесят",
    "шістнадцять",
    "шістсот",
    "шість",
]


def like_num(text):
    text = text.replace(",", "").replace(".", "")
    if text.isdigit():
        return True
    if text.count("/") == 1:
        num, denom = text.split("/")
        if num.isdigit() and denom.isdigit():
            return True
    if text in _num_words:

@@ -34,9 +70,4 @@ def like_num(text):
    return False


-# Create dictionary of functions to overwrite. The default lex_attr_getters are
-# updated with this one, so only the functions defined here are overwritten.

LEX_ATTRS = {LIKE_NUM: like_num}
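A quick check of the behaviour encoded above; the fraction branch is the easiest part to miss:

from spacy.lang.uk.lex_attrs import like_num

assert like_num("1,000")     # separators are stripped before isdigit()
assert like_num("3/4")       # a fraction counts when both sides are digits
assert like_num("сорок")     # number words listed in _num_words
assert not like_num("кава")  # ordinary words fall through to return False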
@@ -2,15 +2,8 @@
from __future__ import unicode_literals


-# Add stop words
-# Documentation: https://spacy.io/docs/usage/adding-languages#stop-words
-# To improve readability, words should be ordered alphabetically and separated
-# by spaces and newlines. When adding stop words from an online source, always
-# include the link in a comment. Make sure to proofread and double-check the
-# words – lists available online are often known to contain mistakes.

STOP_WORDS = set(
    """а
або
адже
але

@@ -401,4 +394,5 @@ STOP_WORDS = set("""а
якій
якого
якщо
""".split()
)
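The list above is what ends up on the language defaults as stop_words; a trivial sanity check that needs nothing but the module itself:

from spacy.lang.uk.stop_words import STOP_WORDS

assert "адже" in STOP_WORDS
assert "якщо" in STOP_WORDS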
@@ -5,32 +5,24 @@ from ..symbols import POS, ADV, NOUN, ADP, PRON, SCONJ, PROPN, DET, SYM, INTJ
from ..symbols import PUNCT, NUM, AUX, X, CONJ, ADJ, VERB, PART, SPACE, CCONJ


-# Add a tag map
-# Documentation: https://spacy.io/docs/usage/adding-languages#tag-map
-# Universal Dependencies: http://universaldependencies.org/u/pos/all.html
-# The keys of the tag map should be strings in your tag set. The dictionary must
-# have an entry POS whose value is one of the Universal Dependencies tags.
-# Optionally, you can also include morphological features or other attributes.

TAG_MAP = {
    "ADV": {POS: ADV},
    "NOUN": {POS: NOUN},
    "ADP": {POS: ADP},
    "PRON": {POS: PRON},
    "SCONJ": {POS: SCONJ},
    "PROPN": {POS: PROPN},
    "DET": {POS: DET},
    "SYM": {POS: SYM},
    "INTJ": {POS: INTJ},
    "PUNCT": {POS: PUNCT},
    "NUM": {POS: NUM},
    "AUX": {POS: AUX},
    "X": {POS: X},
    "CONJ": {POS: CONJ},
    "CCONJ": {POS: CCONJ},
    "ADJ": {POS: ADJ},
    "VERB": {POS: VERB},
    "PART": {POS: PART},
    "SP": {POS: SPACE},
}
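The comments dropped above documented the expected shape of a tag map entry: the key is a tag from the treebank tag set, the value must map POS to a Universal Dependencies tag, and may add morphological features. A minimal illustration with a hypothetical fine-grained tag (not part of the data in this commit):

from spacy.symbols import POS, NOUN

EXAMPLE_TAG_MAP = {"NNS": {POS: NOUN, "Number": "plur"}}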
@@ -1,18 +1,9 @@
# coding: utf8
from __future__ import unicode_literals

-# import symbols – if you need to use more, add them here
from ...symbols import ORTH, LEMMA, POS, NORM, NOUN

-# Add tokenizer exceptions
-# Documentation: https://spacy.io/docs/usage/adding-languages#tokenizer-exceptions
-# Feel free to use custom logic to generate repetitive exceptions more efficiently.
-# If an exception is split into more than one token, the ORTH values combined always
-# need to match the original string.
-# Exceptions should be added in the following format:

_exc = {}

for exc_data in [

@@ -28,11 +19,9 @@ for exc_data in [
    {ORTH: "проф.", LEMMA: "професор", NORM: "професор", POS: NOUN},
    {ORTH: "акад.", LEMMA: "академік", NORM: "академік", POS: NOUN},
    {ORTH: "доц.", LEMMA: "доцент", NORM: "доцент", POS: NOUN},
    {ORTH: "оз.", LEMMA: "озеро", NORM: "озеро", POS: NOUN},
]:
    _exc[exc_data[ORTH]] = [exc_data]


-# To keep things clean and readable, it's recommended to only declare the
-# TOKENIZER_EXCEPTIONS at the bottom:

TOKENIZER_EXCEPTIONS = _exc
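Spelled out for a single entry, the loop above produces the same {string: [attribute dicts]} mapping used in the Tagalog file earlier; a one-token exception keeps the abbreviation together and attaches its own LEMMA and NORM (self-contained restatement, values taken from the list above):

from spacy.symbols import ORTH, LEMMA, NORM, POS, NOUN

exc_data = {ORTH: "оз.", LEMMA: "озеро", NORM: "озеро", POS: NOUN}
_exc = {exc_data[ORTH]: [exc_data]}
assert _exc["оз."][0][NORM] == "озеро"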
@@ -1,6 +1,6 @@
# coding: utf8
from __future__ import unicode_literals

from .matcher import Matcher  # noqa: F401
from .phrasematcher import PhraseMatcher  # noqa: F401
from .dependencymatcher import DependencyTreeMatcher  # noqa: F401
@@ -119,8 +119,8 @@ def tr_tokenizer():

@pytest.fixture(scope="session")
def uk_tokenizer():
-    pymorphy = pytest.importorskip("pymorphy2")
-    return util.get_lang_class("uk").Defaults.create_tokenizer()
+    pytest.importorskip("pymorphy2")
+    return get_lang_class("uk").Defaults.create_tokenizer()


@pytest.fixture(scope="session")

@@ -130,7 +130,7 @@ def ca_tokenizer():

@pytest.fixture(scope="session")
def pl_tokenizer():
-    return util.get_lang_class("pl").Defaults.create_tokenizer()
+    return get_lang_class("pl").Defaults.create_tokenizer()


@pytest.fixture(scope="session")

spacy/tests/lang/pl/__init__.py (new, empty file)
@@ -3,57 +3,57 @@ from __future__ import unicode_literals

import pytest

DOT_TESTS = [
    ("tel.", ["tel."]),
    ("np.", ["np."]),
    ("godz. 21:37", ["godz.", "21:37"]),
    ("inż.", ["inż."]),
    ("gosp.-polit.", ["gosp.-polit."]),
    ("ppoż", ["ppoż"]),
    ("płn", ["płn"]),
    ("ul.", ["ul."]),
    ("jw.", ["jw."]),
    ("itd.", ["itd."]),
    ("cdn.", ["cdn."]),
    ("itp.", ["itp."]),
    ("10,- zł", ["10,-", "zł"]),
    ("0 zł 99 gr", ["0", "zł", "99", "gr"]),
    ("0,99 rub.", ["0,99", "rub."]),
    ("dol.", ["dol."]),
    ("1000 m n.p.m.", ["1000", "m", "n.p.m."]),
    ("m.in.", ["m.in."]),
    ("p.n.e.", ["p.n.e."]),
    ("Sz.P.", ["Sz.P."]),
    ("p.o.", ["p.o."]),
    ("k.o.", ["k.o."]),
    ("m.st.", ["m.st."]),
    ("dra.", ["dra", "."]),
    ("pp.", ["pp."]),
    ("oo.", ["oo."]),
]

HYPHEN_TESTS = [
    ("5-fluoropentylo-3-pirydynyloindol", ["5-fluoropentylo-3-pirydynyloindol"]),
    ("NESS-040C5", ["NESS-040C5"]),
    ("JTE-7-31", ["JTE-7-31"]),
    ("BAY-59-3074", ["BAY-59-3074"]),
    ("BAY-38-7271", ["BAY-38-7271"]),
    ("STS-135", ["STS-135"]),
    ("5F-PB-22", ["5F-PB-22"]),
    ("cztero-", ["cztero-"]),
    ("jedno-", ["jedno-"]),
    ("dwu-", ["dwu-"]),
    ("trzy-", ["trzy-"]),
    ("b-adoratorzy", ["b-adoratorzy"]),
    ("2-3-4 drzewa", ["2-3-4", "drzewa"]),
    ("b-drzewa", ["b-drzewa"]),
]


TESTCASES = DOT_TESTS + HYPHEN_TESTS


@pytest.mark.parametrize("text,expected_tokens", TESTCASES)
def test_tokenizer_handles_testcases(pl_tokenizer, text, expected_tokens):
    tokens = pl_tokenizer(text)
    token_list = [token.text for token in tokens if not token.is_space]
@@ -5,34 +5,42 @@ import pytest


SV_TOKEN_EXCEPTION_TESTS = [
    (
        "Smörsåsen används bl.a. till fisk",
        ["Smörsåsen", "används", "bl.a.", "till", "fisk"],
    ),
    (
        "Jag kommer först kl. 13 p.g.a. diverse förseningar",
        ["Jag", "kommer", "först", "kl.", "13", "p.g.a.", "diverse", "förseningar"],
    ),
    (
        "Anders I. tycker om ord med i i.",
        ["Anders", "I.", "tycker", "om", "ord", "med", "i", "i", "."],
    ),
]


@pytest.mark.parametrize("text,expected_tokens", SV_TOKEN_EXCEPTION_TESTS)
def test_sv_tokenizer_handles_exception_cases(sv_tokenizer, text, expected_tokens):
    tokens = sv_tokenizer(text)
    token_list = [token.text for token in tokens if not token.is_space]
    assert expected_tokens == token_list


@pytest.mark.parametrize("text", ["driveru", "hajaru", "Serru", "Fixaru"])
def test_sv_tokenizer_handles_verb_exceptions(sv_tokenizer, text):
    tokens = sv_tokenizer(text)
    assert len(tokens) == 2
    assert tokens[1].text == "u"


@pytest.mark.parametrize("text", ["bl.a", "m.a.o.", "Jan.", "Dec.", "kr.", "osv."])
def test_sv_tokenizer_handles_abbr(sv_tokenizer, text):
    tokens = sv_tokenizer(text)
    assert len(tokens) == 1


@pytest.mark.parametrize("text", ["Jul.", "jul.", "sön.", "Sön."])
def test_sv_tokenizer_handles_ambiguous_abbr(sv_tokenizer, text):
    tokens = sv_tokenizer(text)
    assert len(tokens) == 2
@@ -4,12 +4,17 @@ from __future__ import unicode_literals
import pytest


@pytest.mark.parametrize(
    "string,lemma",
    [
        ("DNA-profilernas", "DNA-profil"),
        ("Elfenbenskustens", "Elfenbenskusten"),
        ("abortmotståndarens", "abortmotståndare"),
        ("kolesterols", "kolesterol"),
        ("portionssnusernas", "portionssnus"),
        ("åsyns", "åsyn"),
    ],
)
def test_lemmatizer_lookup_assigns(sv_tokenizer, string, lemma):
    tokens = sv_tokenizer(string)
    assert tokens[0].lemma_ == lemma
@@ -1,28 +1,28 @@
# coding: utf-8
-"""Test that tokenizer prefixes, suffixes and infixes are handled correctly."""
from __future__ import unicode_literals

import pytest


@pytest.mark.parametrize("text", ["(under)"])
def test_tokenizer_splits_no_special(sv_tokenizer, text):
    tokens = sv_tokenizer(text)
    assert len(tokens) == 3


@pytest.mark.parametrize("text", ["gitta'r", "Björn's", "Lars'"])
def test_tokenizer_handles_no_punct(sv_tokenizer, text):
    tokens = sv_tokenizer(text)
    assert len(tokens) == 1


@pytest.mark.parametrize("text", ["svart.Gul", "Hej.Världen"])
def test_tokenizer_splits_period_infix(sv_tokenizer, text):
    tokens = sv_tokenizer(text)
    assert len(tokens) == 3


@pytest.mark.parametrize("text", ["Hej,Världen", "en,två"])
def test_tokenizer_splits_comma_infix(sv_tokenizer, text):
    tokens = sv_tokenizer(text)
    assert len(tokens) == 3

@@ -31,7 +31,7 @@ def test_tokenizer_splits_comma_infix(sv_tokenizer, text):
    assert tokens[2].text == text.split(",")[1]


@pytest.mark.parametrize("text", ["svart...Gul", "svart...gul"])
def test_tokenizer_splits_ellipsis_infix(sv_tokenizer, text):
    tokens = sv_tokenizer(text)
    assert len(tokens) == 3
@@ -1,9 +1,6 @@
# coding: utf-8
-"""Test that longer and mixed texts are tokenized correctly."""
from __future__ import unicode_literals

-import pytest


def test_sv_tokenizer_handles_long_text(sv_tokenizer):
    text = """Det var så härligt ute på landet. Det var sommar, majsen var gul, havren grön,
@@ -1,25 +1,24 @@
# coding: utf-8
-"""Test that open, closed and paired punctuation is split off correctly."""
from __future__ import unicode_literals

import pytest


PUNCT_OPEN = ["(", "[", "{", "*"]
PUNCT_CLOSE = [")", "]", "}", "*"]
PUNCT_PAIRED = [("(", ")"), ("[", "]"), ("{", "}"), ("*", "*")]


@pytest.mark.parametrize("text", ["(", "((", "<"])
def test_uk_tokenizer_handles_only_punct(uk_tokenizer, text):
    tokens = uk_tokenizer(text)
    assert len(tokens) == len(text)


@pytest.mark.parametrize("punct", PUNCT_OPEN)
@pytest.mark.parametrize(
    "text", ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]
)
def test_uk_tokenizer_splits_open_punct(uk_tokenizer, punct, text):
    tokens = uk_tokenizer(punct + text)
    assert len(tokens) == 2

@@ -27,8 +26,10 @@ def test_uk_tokenizer_splits_open_punct(uk_tokenizer, punct, text):
    assert tokens[1].text == text


@pytest.mark.parametrize("punct", PUNCT_CLOSE)
@pytest.mark.parametrize(
    "text", ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]
)
def test_uk_tokenizer_splits_close_punct(uk_tokenizer, punct, text):
    tokens = uk_tokenizer(text + punct)
    assert len(tokens) == 2

@@ -36,9 +37,11 @@ def test_uk_tokenizer_splits_close_punct(uk_tokenizer, punct, text):
    assert tokens[1].text == punct


@pytest.mark.parametrize("punct", PUNCT_OPEN)
@pytest.mark.parametrize("punct_add", ["`"])
@pytest.mark.parametrize(
    "text", ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]
)
def test_uk_tokenizer_splits_two_diff_open_punct(uk_tokenizer, punct, punct_add, text):
    tokens = uk_tokenizer(punct + punct_add + text)
    assert len(tokens) == 3

@@ -47,9 +50,11 @@ def test_uk_tokenizer_splits_two_diff_open_punct(uk_tokenizer, punct, punct_add,
    assert tokens[2].text == text


@pytest.mark.parametrize("punct", PUNCT_CLOSE)
@pytest.mark.parametrize("punct_add", ["'"])
@pytest.mark.parametrize(
    "text", ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]
)
def test_uk_tokenizer_splits_two_diff_close_punct(uk_tokenizer, punct, punct_add, text):
    tokens = uk_tokenizer(text + punct + punct_add)
    assert len(tokens) == 3

@@ -58,8 +63,10 @@ def test_uk_tokenizer_splits_two_diff_close_punct(uk_tokenizer, punct, punct_add
    assert tokens[2].text == punct_add


@pytest.mark.parametrize("punct", PUNCT_OPEN)
@pytest.mark.parametrize(
    "text", ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]
)
def test_uk_tokenizer_splits_same_open_punct(uk_tokenizer, punct, text):
    tokens = uk_tokenizer(punct + punct + punct + text)
    assert len(tokens) == 4

@@ -67,8 +74,10 @@ def test_uk_tokenizer_splits_same_open_punct(uk_tokenizer, punct, text):
    assert tokens[3].text == text


@pytest.mark.parametrize("punct", PUNCT_CLOSE)
@pytest.mark.parametrize(
    "text", ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]
)
def test_uk_tokenizer_splits_same_close_punct(uk_tokenizer, punct, text):
    tokens = uk_tokenizer(text + punct + punct + punct)
    assert len(tokens) == 4

@@ -76,14 +85,14 @@ def test_uk_tokenizer_splits_same_close_punct(uk_tokenizer, punct, text):
    assert tokens[1].text == punct


@pytest.mark.parametrize("text", ["'Тест"])
def test_uk_tokenizer_splits_open_appostrophe(uk_tokenizer, text):
    tokens = uk_tokenizer(text)
    assert len(tokens) == 2
    assert tokens[0].text == "'"


@pytest.mark.parametrize("text", ["Тест''"])
def test_uk_tokenizer_splits_double_end_quote(uk_tokenizer, text):
    tokens = uk_tokenizer(text)
    assert len(tokens) == 2

@@ -91,10 +100,13 @@ def test_uk_tokenizer_splits_double_end_quote(uk_tokenizer, text):
    assert len(tokens_punct) == 1


@pytest.mark.parametrize("punct_open,punct_close", PUNCT_PAIRED)
@pytest.mark.parametrize(
    "text", ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]
)
def test_uk_tokenizer_splits_open_close_punct(
    uk_tokenizer, punct_open, punct_close, text
):
    tokens = uk_tokenizer(punct_open + text + punct_close)
    assert len(tokens) == 3
    assert tokens[0].text == punct_open

@@ -102,11 +114,14 @@ def test_uk_tokenizer_splits_open_close_punct(uk_tokenizer, punct_open,
    assert tokens[2].text == punct_close


@pytest.mark.parametrize("punct_open,punct_close", PUNCT_PAIRED)
@pytest.mark.parametrize("punct_open2,punct_close2", [("`", "'")])
@pytest.mark.parametrize(
    "text", ["Привет", "Привіт", "Ґелґотати", "З'єднання", "Єдність", "їхні"]
)
def test_uk_tokenizer_two_diff_punct(
    uk_tokenizer, punct_open, punct_close, punct_open2, punct_close2, text
):
    tokens = uk_tokenizer(punct_open2 + punct_open + text + punct_close + punct_close2)
    assert len(tokens) == 5
    assert tokens[0].text == punct_open2

@@ -116,7 +131,9 @@ def test_uk_tokenizer_two_diff_punct(uk_tokenizer, punct_open, punct_close,
    assert tokens[4].text == punct_close2


@pytest.mark.parametrize(
    "text", ["Привет.", "Привіт.", "Ґелґотати.", "З'єднання.", "Єдність.", "їхні."]
)
def test_uk_tokenizer_splits_trailing_dot(uk_tokenizer, text):
    tokens = uk_tokenizer(text)
    assert tokens[1].text == "."
@@ -1,18 +1,14 @@
# coding: utf-8
-"""Test that tokenizer exceptions are parsed correctly."""
from __future__ import unicode_literals

import pytest


@pytest.mark.parametrize(
    "text,norms,lemmas",
    [("ім.", ["імені"], ["ім'я"]), ("проф.", ["професор"], ["професор"])],
)
def test_uk_tokenizer_abbrev_exceptions(uk_tokenizer, text, norms, lemmas):
    tokens = uk_tokenizer(text)
    assert len(tokens) == 1
    assert [token.norm_ for token in tokens] == norms
@@ -1,16 +1,16 @@
# coding: utf-8
from __future__ import unicode_literals

import json
from tempfile import NamedTemporaryFile
-import pytest

from ...cli.train import train


def test_cli_trained_model_can_be_saved(tmpdir):
    lang = "nl"
    output_dir = str(tmpdir)
    train_file = NamedTemporaryFile("wb", dir=output_dir, delete=False)
    train_corpus = [
        {
            "id": "identifier_0",

@@ -26,7 +26,7 @@ def test_cli_trained_model_can_be_saved(tmpdir):
                                "head": 1,
                                "tag": "NOUN",
                                "orth": "Jan",
                                "ner": "B-PER",
                            },
                            {
                                "id": 1,

@@ -34,7 +34,7 @@ def test_cli_trained_model_can_be_saved(tmpdir):
                                "head": 0,
                                "tag": "VERB",
                                "orth": "houdt",
                                "ner": "O",
                            },
                            {
                                "id": 2,

@@ -42,7 +42,7 @@ def test_cli_trained_model_can_be_saved(tmpdir):
                                "head": 1,
                                "tag": "ADP",
                                "orth": "van",
                                "ner": "O",
                            },
                            {
                                "id": 3,

@@ -50,7 +50,7 @@ def test_cli_trained_model_can_be_saved(tmpdir):
                                "head": -2,
                                "tag": "NOUN",
                                "orth": "Marie",
                                "ner": "B-PER",
                            },
                            {
                                "id": 4,

@@ -58,7 +58,7 @@ def test_cli_trained_model_can_be_saved(tmpdir):
                                "head": -3,
                                "tag": "PUNCT",
                                "orth": ".",
                                "ner": "O",
                            },
                            {
                                "id": 5,

@@ -66,18 +66,18 @@ def test_cli_trained_model_can_be_saved(tmpdir):
                                "head": -1,
                                "tag": "SPACE",
                                "orth": "\n",
                                "ner": "O",
                            },
                        ],
                        "brackets": [],
                    }
                ],
            }
        ],
    }
]

    train_file.write(json.dumps(train_corpus).encode("utf-8"))
    train_file.close()
    train_data = train_file.name
    dev_data = train_data
@@ -155,6 +155,14 @@ def test_issue1758(en_tokenizer):
    assert tokens[1].lemma_ == "have"


def test_issue1773(en_tokenizer):
    """Test that spaces don't receive a POS but no TAG. This is the root cause
    of the serialization issue reported in #1773."""
    doc = en_tokenizer("\n")
    if doc[0].pos_ == "SPACE":
        assert doc[0].tag_ != ""


def test_issue1799():
    """Test sentence boundaries are deserialized correctly, even for
    non-projective sentences."""

@@ -249,8 +257,8 @@ def test_issue1945():

def test_issue1963(en_tokenizer):
    """Test that doc.merge() resizes doc.tensor"""
    doc = en_tokenizer("a b c d")
    doc.tensor = numpy.ones((len(doc), 128), dtype="f")
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[0:2])
    assert len(doc) == 3
@@ -1,9 +0,0 @@
-from __future__ import unicode_literals
-
-
-def test_issue1773(en_tokenizer):
-    """Test that spaces don't receive a POS but no TAG. This is the root cause
-    of the serialization issue reported in #1773."""
-    doc = en_tokenizer('\n')
-    if doc[0].pos_ == 'SPACE':
-        assert doc[0].tag_ != ""
@@ -6,8 +6,9 @@ from spacy.tokens import Doc
from spacy.displacy import render
from spacy.gold import iob_to_biluo
from spacy.lang.it import Italian
+import numpy

-from ..util import add_vecs_to_vocab
+from ..util import add_vecs_to_vocab, get_doc


@pytest.mark.xfail

@@ -69,6 +70,26 @@ def test_issue2385_biluo(tags):
    assert iob_to_biluo(tags) == list(tags)


def test_issue2396(en_vocab):
    words = ["She", "created", "a", "test", "for", "spacy"]
    heads = [1, 0, 1, -2, -1, -1]
    matrix = numpy.array(
        [
            [0, 1, 1, 1, 1, 1],
            [1, 1, 1, 1, 1, 1],
            [1, 1, 2, 3, 3, 3],
            [1, 1, 3, 3, 3, 3],
            [1, 1, 3, 3, 4, 4],
            [1, 1, 3, 3, 4, 5],
        ],
        dtype=numpy.int32,
    )
    doc = get_doc(en_vocab, words=words, heads=heads)
    span = doc[:]
    assert (doc.get_lca_matrix() == matrix).all()
    assert (span.get_lca_matrix() == matrix).all()


def test_issue2482():
    """Test we can serialize and deserialize a blank NER or parser model."""
    nlp = Italian()
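For readers unfamiliar with the matrix asserted in test_issue2396 above: entry [i, j] of get_lca_matrix() is the index of the lowest common ancestor of tokens i and j. Two spelled-out cells, checked against the expected matrix with plain numpy (no spaCy objects involved):

import numpy

expected = numpy.array(
    [
        [0, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1],
        [1, 1, 2, 3, 3, 3],
        [1, 1, 3, 3, 3, 3],
        [1, 1, 3, 3, 4, 4],
        [1, 1, 3, 3, 4, 5],
    ],
    dtype=numpy.int32,
)
# "a" (index 2) and "for" (index 4) both attach under "test" (index 3),
# so their lowest common ancestor is token 3.
assert expected[2, 4] == 3
# The root "created" (index 1) is its own lowest common ancestor.
assert expected[1, 1] == 1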
@@ -1,35 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-from ..util import get_doc
-
-import pytest
-import numpy
-
-
-@pytest.mark.parametrize(
-    "sentence,heads,matrix",
-    [
-        (
-            "She created a test for spacy",
-            [1, 0, 1, -2, -1, -1],
-            numpy.array(
-                [
-                    [0, 1, 1, 1, 1, 1],
-                    [1, 1, 1, 1, 1, 1],
-                    [1, 1, 2, 3, 3, 3],
-                    [1, 1, 3, 3, 3, 3],
-                    [1, 1, 3, 3, 4, 4],
-                    [1, 1, 3, 3, 4, 5],
-                ],
-                dtype=numpy.int32,
-            ),
-        )
-    ],
-)
-def test_issue2396(en_tokenizer, sentence, heads, matrix):
-    tokens = en_tokenizer(sentence)
-    doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads)
-    span = doc[:]
-    assert (doc.get_lca_matrix() == matrix).all()
-    assert (span.get_lca_matrix() == matrix).all()
@@ -1,14 +1,10 @@
# coding: utf8
from __future__ import unicode_literals

-import pytest
-from spacy.lang.en import English

-def test_issue2754():
+def test_issue2754(en_tokenizer):
    """Test that words like 'a' and 'a.m.' don't get exceptional norm values."""
-    nlp = English()
-    a = nlp('a')
-    assert a[0].norm_ == 'a'
-    am = nlp('am')
-    assert am[0].norm_ == 'am'
+    a = en_tokenizer("a")
+    assert a[0].norm_ == "a"
+    am = en_tokenizer("am")
+    assert am[0].norm_ == "am"
@@ -9,4 +9,3 @@ def test_issue2835(en_tokenizer):
    """
    doc = en_tokenizer(text)
    assert doc
@@ -2,26 +2,24 @@
from __future__ import unicode_literals

import numpy
-from spacy.vectors import Vectors
from spacy.vocab import Vocab
-from spacy.tokens import Doc
from spacy._ml import link_vectors_to_models


def test_issue2871():
    """Test that vectors recover the correct key for spaCy reserved words."""
    words = ["dog", "cat", "SUFFIX"]
    vocab = Vocab()
    vocab.vectors.resize(shape=(3, 10))
    vector_data = numpy.zeros((3, 10), dtype="f")
    for word in words:
        _ = vocab[word]  # noqa: F841
        vocab.set_vector(word, vector_data[0])
    vocab.vectors.name = "dummy_vectors"
    link_vectors_to_models(vocab)
    assert vocab["dog"].rank == 0
    assert vocab["cat"].rank == 1
    assert vocab["SUFFIX"].rank == 2
    assert vocab.vectors.find(key="dog") == 0
    assert vocab.vectors.find(key="cat") == 1
    assert vocab.vectors.find(key="SUFFIX") == 2
@@ -58,9 +58,10 @@ def test_issue3009(doc, matcher, pattern):
    matches = matcher(doc)
    assert matches


def test_issue2464(matcher):
    """Test problem with successive ?. This is the same bug, so putting it here."""
    doc = Doc(matcher.vocab, words=["a", "b"])
    matcher.add("4", None, [{"OP": "?"}, {"OP": "?"}])
    matches = matcher(doc)
    assert len(matches) == 3
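For context on the Matcher API exercised by test_issue2464, a standalone sketch using the same v2-style add(key, on_match, pattern) call; the pattern is a list of per-token attribute dicts, and "OP": "?" makes a token optional:

from spacy.matcher import Matcher
from spacy.tokens import Doc
from spacy.vocab import Vocab

vocab = Vocab()
matcher = Matcher(vocab)
matcher.add("OPTIONAL_PAIR", None, [{"OP": "?"}, {"OP": "?"}])
doc = Doc(vocab, words=["a", "b"])
# Each match is a (match_id, start, end) triple; the test above expects
# three overlapping matches for this two-token doc.
print(matcher(doc))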
@@ -1,8 +1,6 @@
# coding: utf8
from __future__ import unicode_literals

-import pytest

from ...attrs import ENT_IOB, ENT_TYPE
from ...tokens import Doc
from ..util import get_doc

@@ -30,4 +28,4 @@ def test_issue3012(en_vocab):
    # serializing then deserializing
    doc_bytes = doc.to_bytes()
    doc2 = Doc(en_vocab).from_bytes(doc_bytes)
-    assert (doc[2].text, doc[2].pos_, doc[2].tag_, doc[2].ent_type_) == expected
+    assert (doc2[2].text, doc2[2].pos_, doc2[2].tag_, doc2[2].ent_type_) == expected
@@ -1,10 +0,0 @@
-from __future__ import unicode_literals
-
-import pytest
-import spacy
-
-
-@pytest.mark.models('fr')
-def test_issue1959(FR):
-    texts = ['Je suis la mauvaise herbe', "Me, myself and moi"]
-    for text in texts:
-        FR(text)