Merge branch 'master' into pr/3948

Ines Montani 2019-07-11 12:18:31 +02:00
commit 197cfd7ebc
8 changed files with 118 additions and 81 deletions

View File

@@ -666,16 +666,28 @@ def build_nel_encoder(embed_width, hidden_width, ner_types, **cfg):
     entity_width = cfg.get("entity_width")
 
     with Model.define_operators({">>": chain, "**": clone}):
-        model = Affine(entity_width, entity_width+context_width+1+ner_types)\
-                >> Affine(1, entity_width, drop_factor=0.0)\
-                >> logistic
+        model = (
+            Affine(entity_width, entity_width + context_width + 1 + ner_types)
+            >> Affine(1, entity_width, drop_factor=0.0)
+            >> logistic
+        )
 
         # context encoder
-        tok2vec = Tok2Vec(width=hidden_width, embed_size=embed_width, pretrained_vectors=pretrained_vectors,
-                          cnn_maxout_pieces=cnn_maxout_pieces, subword_features=True, conv_depth=conv_depth,
-                          bilstm_depth=0) >> flatten_add_lengths >> Pooling(mean_pool)\
-                >> Residual(zero_init(Maxout(hidden_width, hidden_width))) \
-                >> zero_init(Affine(context_width, hidden_width))
+        tok2vec = (
+            Tok2Vec(
+                width=hidden_width,
+                embed_size=embed_width,
+                pretrained_vectors=pretrained_vectors,
+                cnn_maxout_pieces=cnn_maxout_pieces,
+                subword_features=True,
+                conv_depth=conv_depth,
+                bilstm_depth=0,
+            )
+            >> flatten_add_lengths
+            >> Pooling(mean_pool)
+            >> Residual(zero_init(Maxout(hidden_width, hidden_width)))
+            >> zero_init(Affine(context_width, hidden_width))
+        )
 
         model.tok2vec = tok2vec
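Note on the `>>` pipelines above: thinc v7 (spaCy's ML library at the time) lets `Model.define_operators` rebind operators inside a `with` block, so `a >> b` is simply `chain(a, b)`. A minimal sketch of the mechanism, assuming the thinc 7.x API; the layer sizes are arbitrary:

    # Sketch only: thinc 7.x assumed; sizes are arbitrary.
    from thinc.api import chain
    from thinc.v2v import Model, Affine

    with Model.define_operators({">>": chain}):
        # Inside the block, `>>` composes layers: the first layer's
        # output feeds the second.
        model = Affine(16, 32) >> Affine(1, 16)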
@@ -684,6 +696,7 @@ def build_nel_encoder(embed_width, hidden_width, ner_types, **cfg):
     model.nO = 1
     return model
 
+
 @layerize
 def flatten(seqs, drop=0.0):
     ops = Model.ops
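The `@layerize` decorator on `flatten` wraps a plain forward function as a thinc layer. A hedged sketch of the pattern (thinc 7.x assumed): the wrapped function takes a batch plus a dropout rate and returns its output together with a backprop callback.

    # Sketch only: thinc 7.x assumed.
    from thinc.api import layerize

    @layerize
    def double(X, drop=0.0):
        def backprop(dY, sgd=None):
            return dY * 2  # gradient of y = 2 * x
        return X * 2, backprop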

View File

@@ -6,7 +6,7 @@ import sys
 from .stop_words import STOP_WORDS
-from .tag_map import TAG_MAP, POS
+from .tag_map import TAG_MAP
 from ...attrs import LANG
 from ...language import Language
 from ...tokens import Doc
@@ -22,6 +22,7 @@ if is_python_pre_3_5:
     Morpheme = namedtuple("Morpheme", "surface lemma tag")
 elif is_python_post_3_7:
     from dataclasses import dataclass
+
     @dataclass(frozen=True)
     class Morpheme:
         surface: str
@@ -29,6 +30,7 @@ elif is_python_post_3_7:
         tag: str
 else:
     from typing import NamedTuple
+
     class Morpheme(NamedTuple):
         surface: str
         lemma: str
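Pieced together, the dispatch in these hunks picks a Morpheme container suited to the running interpreter. A self-contained sketch of the same pattern; the `is_python_*` flags are spaCy compat helpers, approximated here with `sys.version_info`:

    import sys
    from collections import namedtuple

    if sys.version_info < (3, 5):
        # Pre-3.5: plain namedtuple, no type annotations
        Morpheme = namedtuple("Morpheme", "surface lemma tag")
    elif sys.version_info >= (3, 7):
        # 3.7+: immutable dataclass
        from dataclasses import dataclass

        @dataclass(frozen=True)
        class Morpheme:
            surface: str
            lemma: str
            tag: str
    else:
        # 3.5/3.6: typed NamedTuple
        from typing import NamedTuple

        class Morpheme(NamedTuple):
            surface: str
            lemma: str
            tag: str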

View File

@ -1,10 +1,8 @@
# encoding: utf8
from __future__ import unicode_literals
from collections import defaultdict
from ...symbols import (POS, PUNCT, INTJ, X, SYM,
ADJ, AUX, ADP, CONJ, NOUN, PRON, VERB, ADV, PROPN,
NUM, DET)
from ...symbols import POS, PUNCT, INTJ, X, SYM, ADJ, AUX, ADP, CONJ, NOUN, PRON
from ...symbols import VERB, ADV, PROPN, NUM, DET
# 은전한닢(mecab-ko-dic)의 품사 태그를 universal pos tag로 대응시킴
# https://docs.google.com/spreadsheets/d/1-9blXKjtjeKZqsf4NzHeYJCrr49-nXeRF6D80udfcwY/edit#gid=589544265
@@ -23,7 +21,6 @@ TAG_MAP = {
     "MAJ": {POS: CONJ},  # conjunctive adverb
     "MAG": {POS: ADV},  # general adverb
     "MM": {POS: DET},  # determiner
-
     "XPN": {POS: X},  # prefix
     # XS. suffixes
     "XSN": {POS: X},
@@ -36,15 +33,12 @@ TAG_MAP = {
     "EC": {POS: X},
     "ETN": {POS: X},
     "ETM": {POS: X},
-
     "IC": {POS: INTJ},  # interjection
-
     "VV": {POS: VERB},  # verb
     "VA": {POS: ADJ},  # adjective
     "VX": {POS: AUX},  # auxiliary verb or adjective
     "VCP": {POS: ADP},  # positive copula (이다)
     "VCN": {POS: ADJ},  # negative copula (아니다)
-
     "NNG": {POS: NOUN},  # general noun
     "NNB": {POS: NOUN},  # dependent noun
     "NNBC": {POS: NOUN},  # dependent noun (unit)
@@ -52,7 +46,6 @@ TAG_MAP = {
     "NP": {POS: PRON},  # pronoun
     "NR": {POS: NUM},  # numeral
     "SN": {POS: NUM},  # number
-
     # S.{1,2} symbols
     # sentence punctuation
     "SF": {POS: PUNCT},  # period or other EOS marker

View File

@@ -5,8 +5,7 @@ import pytest
 
 @pytest.mark.parametrize(
-    "word,lemma",
-    [("새로운", "새롭"), ("빨간", "빨갛"), ("클수록", ""), ("뭡니까", ""), ("됐다", "")],
+    "word,lemma", [("새로운", "새롭"), ("빨간", "빨갛"), ("클수록", ""), ("뭡니까", ""), ("됐다", "")]
 )
 def test_ko_lemmatizer_assigns(ko_tokenizer, word, lemma):
     test_lemma = ko_tokenizer(word)[0].lemma_
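A hedged usage sketch of what this test checks; the Korean pipeline needs the external mecab-ko tokenizer installed, and per the parameters above "새로운" should lemmatize to "새롭":

    from spacy.lang.ko import Korean

    nlp = Korean()  # requires mecab-ko and its Python bindings
    print(nlp("새로운")[0].lemma_)  # expected: 새롭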

View File

@@ -5,16 +5,24 @@ import pytest
 
 def test_lt_tokenizer_handles_long_text(lt_tokenizer):
-    text = """Tokios sausros kriterijus atitinka pirmadienį atlikti skaičiavimai, palyginus faktinį ir žemiausią
-vidutinį daugiametį vandens lygį. Nustatyta, kad 48 šalies vandens matavimo stočių 28-iose stotyse vandens lygis
-yra žemesnis arba lygus žemiausiam vidutiniam daugiamečiam šiltojo laikotarpio vandens lygiui."""
-    tokens = lt_tokenizer(text.replace("\n", ""))
+    text = """Tokios sausros kriterijus atitinka pirmadienį atlikti skaičiavimai, palyginus faktinį ir žemiausią vidutinį daugiametį vandens lygį. Nustatyta, kad iš 48 šalies vandens matavimo stočių 28-iose stotyse vandens lygis yra žemesnis arba lygus žemiausiam vidutiniam daugiamečiam šiltojo laikotarpio vandens lygiui."""
+    tokens = lt_tokenizer(text)
     assert len(tokens) == 42
 
 
-@pytest.mark.parametrize('text,length', [
-    ("177R Parodų rūmaiOzo g. nuo vasario 18 d. bus skelbiamas interneto tinklalapyje.", 15),
-    ("ISM universiteto doc. dr. Ieva Augutytė-Kvedaravičienė pastebi, kad tyrimais nustatyti elgesio pokyčiai.", 16)])
+@pytest.mark.parametrize(
+    "text,length",
+    [
+        (
+            "177R Parodų rūmaiOzo g. nuo vasario 18 d. bus skelbiamas interneto tinklalapyje.",
+            15,
+        ),
+        (
+            "ISM universiteto doc. dr. Ieva Augutytė-Kvedaravičienė pastebi, kad tyrimais nustatyti elgesio pokyčiai.",
+            16,
+        ),
+    ],
+)
 def test_lt_tokenizer_handles_punct_abbrev(lt_tokenizer, text, length):
     tokens = lt_tokenizer(text)
     assert len(tokens) == length
@@ -26,7 +34,9 @@ def test_lt_tokenizer_abbrev_exceptions(lt_tokenizer, text):
     assert len(tokens) == 1
 
 
-@pytest.mark.parametrize("text,match", [
+@pytest.mark.parametrize(
+    "text,match",
+    [
         ("10", True),
         ("1", True),
         ("10,000", True),
@@ -37,7 +47,9 @@ def test_lt_tokenizer_abbrev_exceptions(lt_tokenizer, text):
         ("milijardas", True),
         ("šuo", False),
         (",", False),
-        ("1/2", True)])
+        ("1/2", True),
+    ],
+)
 def test_lt_lex_attrs_like_number(lt_tokenizer, text, match):
     tokens = lt_tokenizer(text)
     assert len(tokens) == 1
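A sketch of the attribute this test exercises: the `like_num` lexical attribute on Lithuanian tokens (assuming `spacy.lang.lt` is available, as the tests imply):

    from spacy.lang.lt import Lithuanian

    nlp = Lithuanian()
    assert nlp("10,000")[0].like_num   # number-like string
    assert not nlp("šuo")[0].like_num  # ordinary word ("dog")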

View File

@@ -5,7 +5,6 @@ import pytest
 import re
 
 from spacy.matcher import Matcher, DependencyMatcher
 from spacy.tokens import Doc, Token
-from ..util import get_doc
 
 @pytest.fixture
@@ -288,24 +287,43 @@ def deps():
 def dependency_matcher(en_vocab):
     def is_brown_yellow(text):
         return bool(re.compile(r"brown|yellow|over").match(text))
 
     IS_BROWN_YELLOW = en_vocab.add_flag(is_brown_yellow)
 
     pattern1 = [
         {"SPEC": {"NODE_NAME": "fox"}, "PATTERN": {"ORTH": "fox"}},
-        {"SPEC": {"NODE_NAME": "q", "NBOR_RELOP": ">", "NBOR_NAME": "fox"},"PATTERN": {"ORTH": "quick", "DEP": "amod"}},
-        {"SPEC": {"NODE_NAME": "r", "NBOR_RELOP": ">", "NBOR_NAME": "fox"}, "PATTERN": {IS_BROWN_YELLOW: True}},
+        {
+            "SPEC": {"NODE_NAME": "q", "NBOR_RELOP": ">", "NBOR_NAME": "fox"},
+            "PATTERN": {"ORTH": "quick", "DEP": "amod"},
+        },
+        {
+            "SPEC": {"NODE_NAME": "r", "NBOR_RELOP": ">", "NBOR_NAME": "fox"},
+            "PATTERN": {IS_BROWN_YELLOW: True},
+        },
     ]
 
     pattern2 = [
         {"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}},
-        {"SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"}, "PATTERN": {"ORTH": "fox"}},
-        {"SPEC": {"NODE_NAME": "quick", "NBOR_RELOP": ".", "NBOR_NAME": "jumped"}, "PATTERN": {"ORTH": "fox"}}
+        {
+            "SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"},
+            "PATTERN": {"ORTH": "fox"},
+        },
+        {
+            "SPEC": {"NODE_NAME": "quick", "NBOR_RELOP": ".", "NBOR_NAME": "jumped"},
+            "PATTERN": {"ORTH": "fox"},
+        },
     ]
 
     pattern3 = [
         {"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}},
-        {"SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"}, "PATTERN": {"ORTH": "fox"}},
-        {"SPEC": {"NODE_NAME": "r", "NBOR_RELOP": ">>", "NBOR_NAME": "fox"}, "PATTERN": {"ORTH": "brown"}}
+        {
+            "SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"},
+            "PATTERN": {"ORTH": "fox"},
+        },
+        {
+            "SPEC": {"NODE_NAME": "r", "NBOR_RELOP": ">>", "NBOR_NAME": "fox"},
+            "PATTERN": {"ORTH": "brown"},
+        },
     ]
 
     matcher = DependencyMatcher(en_vocab)
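For context on the pattern syntax above (spaCy v2's DependencyMatcher): each node has a `SPEC` naming it via `NODE_NAME` and, except for the anchor node, relating it to an earlier node via `NBOR_RELOP`/`NBOR_NAME` (`>` direct dependent, `>>` transitive dependent, `.` linear precedence), plus a `PATTERN` of ordinary token-attribute matchers. A minimal hedged sketch of registering one such pattern, assuming the v2-era `add(key, on_match, *patterns)` signature:

    from spacy.matcher import DependencyMatcher
    from spacy.vocab import Vocab

    pattern = [
        {"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}},
        {
            # ">": direct dependent of the node named "jumped"
            "SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"},
            "PATTERN": {"ORTH": "fox"},
        },
    ]

    matcher = DependencyMatcher(Vocab())
    matcher.add("jumped_fox", None, pattern)  # v2-style: (key, on_match, *patterns)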
@@ -320,9 +338,9 @@ def test_dependency_matcher_compile(dependency_matcher):
     assert len(dependency_matcher) == 3
 
 
-def test_dependency_matcher(dependency_matcher, text, heads, deps):
-    doc = get_doc(dependency_matcher.vocab, text.split(), heads=heads, deps=deps)
-    matches = dependency_matcher(doc)
+# def test_dependency_matcher(dependency_matcher, text, heads, deps):
+#     doc = get_doc(dependency_matcher.vocab, text.split(), heads=heads, deps=deps)
+#     matches = dependency_matcher(doc)
     # assert matches[0][1] == [[3, 1, 2]]
     # assert matches[1][1] == [[4, 3, 3]]
     # assert matches[2][1] == [[4, 3, 2]]
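Commenting the test out disables it silently; the more conventional way to park a test is pytest's skip marker, sketched here with a hypothetical reason string:

    import pytest

    @pytest.mark.skip(reason="DependencyMatcher output under review")  # hypothetical
    def test_dependency_matcher(dependency_matcher, text, heads, deps):
        ...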

View File

@@ -1,7 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-from spacy.lang.en import English
 import pytest
+from spacy.lang.en import English
 
 def test_issue3880():