Merge branch 'master' into pr/3948

This commit is contained in:
Ines Montani 2019-07-11 12:18:31 +02:00
commit 197cfd7ebc
8 changed files with 118 additions and 81 deletions

View File

@ -661,21 +661,33 @@ def build_nel_encoder(embed_width, hidden_width, ner_types, **cfg):
conv_depth = cfg.get("conv_depth", 2) conv_depth = cfg.get("conv_depth", 2)
cnn_maxout_pieces = cfg.get("cnn_maxout_pieces", 3) cnn_maxout_pieces = cfg.get("cnn_maxout_pieces", 3)
pretrained_vectors = cfg.get("pretrained_vectors") # self.nlp.vocab.vectors.name pretrained_vectors = cfg.get("pretrained_vectors") # self.nlp.vocab.vectors.name
context_width = cfg.get("context_width") context_width = cfg.get("context_width")
entity_width = cfg.get("entity_width") entity_width = cfg.get("entity_width")
with Model.define_operators({">>": chain, "**": clone}): with Model.define_operators({">>": chain, "**": clone}):
model = Affine(entity_width, entity_width+context_width+1+ner_types)\ model = (
>> Affine(1, entity_width, drop_factor=0.0)\ Affine(entity_width, entity_width + context_width + 1 + ner_types)
>> logistic >> Affine(1, entity_width, drop_factor=0.0)
>> logistic
)
# context encoder # context encoder
tok2vec = Tok2Vec(width=hidden_width, embed_size=embed_width, pretrained_vectors=pretrained_vectors, tok2vec = (
cnn_maxout_pieces=cnn_maxout_pieces, subword_features=True, conv_depth=conv_depth, Tok2Vec(
bilstm_depth=0) >> flatten_add_lengths >> Pooling(mean_pool)\ width=hidden_width,
>> Residual(zero_init(Maxout(hidden_width, hidden_width))) \ embed_size=embed_width,
>> zero_init(Affine(context_width, hidden_width)) pretrained_vectors=pretrained_vectors,
cnn_maxout_pieces=cnn_maxout_pieces,
subword_features=True,
conv_depth=conv_depth,
bilstm_depth=0,
)
>> flatten_add_lengths
>> Pooling(mean_pool)
>> Residual(zero_init(Maxout(hidden_width, hidden_width)))
>> zero_init(Affine(context_width, hidden_width))
)
model.tok2vec = tok2vec model.tok2vec = tok2vec
@ -684,6 +696,7 @@ def build_nel_encoder(embed_width, hidden_width, ner_types, **cfg):
model.nO = 1 model.nO = 1
return model return model
@layerize @layerize
def flatten(seqs, drop=0.0): def flatten(seqs, drop=0.0):
ops = Model.ops ops = Model.ops

View File

@ -6,7 +6,7 @@ import sys
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .tag_map import TAG_MAP, POS from .tag_map import TAG_MAP
from ...attrs import LANG from ...attrs import LANG
from ...language import Language from ...language import Language
from ...tokens import Doc from ...tokens import Doc
@ -22,6 +22,7 @@ if is_python_pre_3_5:
Morpheme = namedtuple("Morpheme", "surface lemma tag") Morpheme = namedtuple("Morpheme", "surface lemma tag")
elif is_python_post_3_7: elif is_python_post_3_7:
from dataclasses import dataclass from dataclasses import dataclass
@dataclass(frozen=True) @dataclass(frozen=True)
class Morpheme: class Morpheme:
surface: str surface: str
@ -29,6 +30,7 @@ elif is_python_post_3_7:
tag: str tag: str
else: else:
from typing import NamedTuple from typing import NamedTuple
class Morpheme(NamedTuple): class Morpheme(NamedTuple):
surface: str surface: str
lemma: str lemma: str

View File

@ -1,66 +1,59 @@
# encoding: utf8 # encoding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
from collections import defaultdict
from ...symbols import (POS, PUNCT, INTJ, X, SYM, from ...symbols import POS, PUNCT, INTJ, X, SYM, ADJ, AUX, ADP, CONJ, NOUN, PRON
ADJ, AUX, ADP, CONJ, NOUN, PRON, VERB, ADV, PROPN, from ...symbols import VERB, ADV, PROPN, NUM, DET
NUM, DET)
# 은전한닢(mecab-ko-dic)의 품사 태그를 universal pos tag로 대응시킴 # 은전한닢(mecab-ko-dic)의 품사 태그를 universal pos tag로 대응시킴
# https://docs.google.com/spreadsheets/d/1-9blXKjtjeKZqsf4NzHeYJCrr49-nXeRF6D80udfcwY/edit#gid=589544265 # https://docs.google.com/spreadsheets/d/1-9blXKjtjeKZqsf4NzHeYJCrr49-nXeRF6D80udfcwY/edit#gid=589544265
# https://universaldependencies.org/u/pos/ # https://universaldependencies.org/u/pos/
TAG_MAP = { TAG_MAP = {
# J.{1,2} 조사 # J.{1,2} 조사
"JKS": {POS: ADP}, "JKS": {POS: ADP},
"JKC": {POS: ADP}, "JKC": {POS: ADP},
"JKG": {POS: ADP}, "JKG": {POS: ADP},
"JKO": {POS: ADP}, "JKO": {POS: ADP},
"JKB": {POS: ADP}, "JKB": {POS: ADP},
"JKV": {POS: ADP}, "JKV": {POS: ADP},
"JKQ": {POS: ADP}, "JKQ": {POS: ADP},
"JX": {POS: ADP}, # 보조사 "JX": {POS: ADP}, # 보조사
"JC": {POS: CONJ}, # 접속 조사 "JC": {POS: CONJ}, # 접속 조사
"MAJ": {POS: CONJ}, # 접속 부사 "MAJ": {POS: CONJ}, # 접속 부사
"MAG": {POS: ADV}, # 일반 부사 "MAG": {POS: ADV}, # 일반 부사
"MM": {POS: DET}, # 관형사 "MM": {POS: DET}, # 관형사
"XPN": {POS: X}, # 접두사 "XPN": {POS: X}, # 접두사
# XS. 접미사 # XS. 접미사
"XSN": {POS: X}, "XSN": {POS: X},
"XSV": {POS: X}, "XSV": {POS: X},
"XSA": {POS: X}, "XSA": {POS: X},
"XR": {POS: X}, # 어근 "XR": {POS: X}, # 어근
# E.{1,2} 어미 # E.{1,2} 어미
"EP": {POS: X}, "EP": {POS: X},
"EF": {POS: X}, "EF": {POS: X},
"EC": {POS: X}, "EC": {POS: X},
"ETN": {POS: X}, "ETN": {POS: X},
"ETM": {POS: X}, "ETM": {POS: X},
"IC": {POS: INTJ}, # 감탄사 "IC": {POS: INTJ}, # 감탄사
"VV": {POS: VERB}, # 동사 "VV": {POS: VERB}, # 동사
"VA": {POS: ADJ}, # 형용사 "VA": {POS: ADJ}, # 형용사
"VX": {POS: AUX}, # 보조 용언 "VX": {POS: AUX}, # 보조 용언
"VCP": {POS: ADP}, # 긍정 지정사(이다) "VCP": {POS: ADP}, # 긍정 지정사(이다)
"VCN": {POS: ADJ}, # 부정 지정사(아니다) "VCN": {POS: ADJ}, # 부정 지정사(아니다)
"NNG": {POS: NOUN}, # 일반 명사(general noun)
"NNG": {POS: NOUN}, # 일반 명사(general noun) "NNB": {POS: NOUN}, # 의존 명사
"NNB": {POS: NOUN}, # 의존 명사 "NNBC": {POS: NOUN}, # 의존 명사(단위: unit)
"NNBC": {POS: NOUN}, # 의존 명사(단위: unit) "NNP": {POS: PROPN}, # 고유 명사(proper noun)
"NNP": {POS: PROPN}, # 고유 명사(proper noun)
"NP": {POS: PRON}, # 대명사 "NP": {POS: PRON}, # 대명사
"NR": {POS: NUM}, # 수사(numerals) "NR": {POS: NUM}, # 수사(numerals)
"SN": {POS: NUM}, # 숫자 "SN": {POS: NUM}, # 숫자
# S.{1,2} 부호 # S.{1,2} 부호
# 문장 부호 # 문장 부호
"SF": {POS: PUNCT}, # period or other EOS marker "SF": {POS: PUNCT}, # period or other EOS marker
"SE": {POS: PUNCT}, "SE": {POS: PUNCT},
"SC": {POS: PUNCT}, # comma, etc. "SC": {POS: PUNCT}, # comma, etc.
"SSO": {POS: PUNCT}, # open bracket "SSO": {POS: PUNCT}, # open bracket
"SSC": {POS: PUNCT}, # close bracket "SSC": {POS: PUNCT}, # close bracket
"SY": {POS: SYM}, # 기타 기호 "SY": {POS: SYM}, # 기타 기호
"SL": {POS: X}, # 외국어 "SL": {POS: X}, # 외국어
"SH": {POS: X}, # 한자 "SH": {POS: X}, # 한자
} }

View File

@ -5,8 +5,7 @@ import pytest
@pytest.mark.parametrize( @pytest.mark.parametrize(
"word,lemma", "word,lemma", [("새로운", "새롭"), ("빨간", "빨갛"), ("클수록", ""), ("뭡니까", ""), ("됐다", "")]
[("새로운", "새롭"), ("빨간", "빨갛"), ("클수록", ""), ("뭡니까", ""), ("됐다", "")],
) )
def test_ko_lemmatizer_assigns(ko_tokenizer, word, lemma): def test_ko_lemmatizer_assigns(ko_tokenizer, word, lemma):
test_lemma = ko_tokenizer(word)[0].lemma_ test_lemma = ko_tokenizer(word)[0].lemma_

View File

@ -7,15 +7,15 @@ import pytest
TOKENIZER_TESTS = [("서울 타워 근처에 살고 있습니다.", "서울 타워 근처 에 살 고 있 습니다 ."), TOKENIZER_TESTS = [("서울 타워 근처에 살고 있습니다.", "서울 타워 근처 에 살 고 있 습니다 ."),
("영등포구에 있는 맛집 좀 알려주세요.", "영등포구 에 있 는 맛집 좀 알려 주 세요 .")] ("영등포구에 있는 맛집 좀 알려주세요.", "영등포구 에 있 는 맛집 좀 알려 주 세요 .")]
TAG_TESTS = [("서울 타워 근처에 살고 있습니다.", TAG_TESTS = [("서울 타워 근처에 살고 있습니다.",
"NNP NNG NNG JKB VV EC VX EF SF"), "NNP NNG NNG JKB VV EC VX EF SF"),
("영등포구에 있는 맛집 좀 알려주세요.", ("영등포구에 있는 맛집 좀 알려주세요.",
"NNP JKB VV ETM NNG MAG VV VX EP SF")] "NNP JKB VV ETM NNG MAG VV VX EP SF")]
FULL_TAG_TESTS = [("영등포구에 있는 맛집 좀 알려주세요.", FULL_TAG_TESTS = [("영등포구에 있는 맛집 좀 알려주세요.",
"NNP JKB VV ETM NNG MAG VV+EC VX EP+EF SF")] "NNP JKB VV ETM NNG MAG VV+EC VX EP+EF SF")]
POS_TESTS = [("서울 타워 근처에 살고 있습니다.", POS_TESTS = [("서울 타워 근처에 살고 있습니다.",
"PROPN NOUN NOUN ADP VERB X AUX X PUNCT"), "PROPN NOUN NOUN ADP VERB X AUX X PUNCT"),
("영등포구에 있는 맛집 좀 알려주세요.", ("영등포구에 있는 맛집 좀 알려주세요.",
"PROPN ADP VERB X NOUN ADV VERB AUX X PUNCT")] "PROPN ADP VERB X NOUN ADV VERB AUX X PUNCT")]

View File

@ -5,16 +5,24 @@ import pytest
def test_lt_tokenizer_handles_long_text(lt_tokenizer): def test_lt_tokenizer_handles_long_text(lt_tokenizer):
text = """Tokios sausros kriterijus atitinka pirmadienį atlikti skaičiavimai, palyginus faktinį ir žemiausią text = """Tokios sausros kriterijus atitinka pirmadienį atlikti skaičiavimai, palyginus faktinį ir žemiausią vidutinį daugiametį vandens lygį. Nustatyta, kad iš 48 šalies vandens matavimo stočių 28-iose stotyse vandens lygis yra žemesnis arba lygus žemiausiam vidutiniam daugiamečiam šiltojo laikotarpio vandens lygiui."""
vidutinį daugiametį vandens lygį. Nustatyta, kad 48 šalies vandens matavimo stočių 28-iose stotyse vandens lygis tokens = lt_tokenizer(text)
yra žemesnis arba lygus žemiausiam vidutiniam daugiamečiam šiltojo laikotarpio vandens lygiui."""
tokens = lt_tokenizer(text.replace("\n", ""))
assert len(tokens) == 42 assert len(tokens) == 42
@pytest.mark.parametrize('text,length', [ @pytest.mark.parametrize(
("177R Parodų rūmaiOzo g. nuo vasario 18 d. bus skelbiamas interneto tinklalapyje.", 15), "text,length",
("ISM universiteto doc. dr. Ieva Augutytė-Kvedaravičienė pastebi, kad tyrimais nustatyti elgesio pokyčiai.", 16)]) [
(
"177R Parodų rūmaiOzo g. nuo vasario 18 d. bus skelbiamas interneto tinklalapyje.",
15,
),
(
"ISM universiteto doc. dr. Ieva Augutytė-Kvedaravičienė pastebi, kad tyrimais nustatyti elgesio pokyčiai.",
16,
),
],
)
def test_lt_tokenizer_handles_punct_abbrev(lt_tokenizer, text, length): def test_lt_tokenizer_handles_punct_abbrev(lt_tokenizer, text, length):
tokens = lt_tokenizer(text) tokens = lt_tokenizer(text)
assert len(tokens) == length assert len(tokens) == length
@ -26,18 +34,22 @@ def test_lt_tokenizer_abbrev_exceptions(lt_tokenizer, text):
assert len(tokens) == 1 assert len(tokens) == 1
@pytest.mark.parametrize("text,match", [ @pytest.mark.parametrize(
("10", True), "text,match",
("1", True), [
("10,000", True), ("10", True),
("10,00", True), ("1", True),
("999.0", True), ("10,000", True),
("vienas", True), ("10,00", True),
("du", True), ("999.0", True),
("milijardas", True), ("vienas", True),
("šuo", False), ("du", True),
(",", False), ("milijardas", True),
("1/2", True)]) ("šuo", False),
(",", False),
("1/2", True),
],
)
def test_lt_lex_attrs_like_number(lt_tokenizer, text, match): def test_lt_lex_attrs_like_number(lt_tokenizer, text, match):
tokens = lt_tokenizer(text) tokens = lt_tokenizer(text)
assert len(tokens) == 1 assert len(tokens) == 1

View File

@ -5,7 +5,6 @@ import pytest
import re import re
from spacy.matcher import Matcher, DependencyMatcher from spacy.matcher import Matcher, DependencyMatcher
from spacy.tokens import Doc, Token from spacy.tokens import Doc, Token
from ..util import get_doc
@pytest.fixture @pytest.fixture
@ -288,24 +287,43 @@ def deps():
def dependency_matcher(en_vocab): def dependency_matcher(en_vocab):
def is_brown_yellow(text): def is_brown_yellow(text):
return bool(re.compile(r"brown|yellow|over").match(text)) return bool(re.compile(r"brown|yellow|over").match(text))
IS_BROWN_YELLOW = en_vocab.add_flag(is_brown_yellow) IS_BROWN_YELLOW = en_vocab.add_flag(is_brown_yellow)
pattern1 = [ pattern1 = [
{"SPEC": {"NODE_NAME": "fox"}, "PATTERN": {"ORTH": "fox"}}, {"SPEC": {"NODE_NAME": "fox"}, "PATTERN": {"ORTH": "fox"}},
{"SPEC": {"NODE_NAME": "q", "NBOR_RELOP": ">", "NBOR_NAME": "fox"},"PATTERN": {"ORTH": "quick", "DEP": "amod"}}, {
{"SPEC": {"NODE_NAME": "r", "NBOR_RELOP": ">", "NBOR_NAME": "fox"}, "PATTERN": {IS_BROWN_YELLOW: True}}, "SPEC": {"NODE_NAME": "q", "NBOR_RELOP": ">", "NBOR_NAME": "fox"},
"PATTERN": {"ORTH": "quick", "DEP": "amod"},
},
{
"SPEC": {"NODE_NAME": "r", "NBOR_RELOP": ">", "NBOR_NAME": "fox"},
"PATTERN": {IS_BROWN_YELLOW: True},
},
] ]
pattern2 = [ pattern2 = [
{"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}}, {"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}},
{"SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"}, "PATTERN": {"ORTH": "fox"}}, {
{"SPEC": {"NODE_NAME": "quick", "NBOR_RELOP": ".", "NBOR_NAME": "jumped"}, "PATTERN": {"ORTH": "fox"}} "SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"},
"PATTERN": {"ORTH": "fox"},
},
{
"SPEC": {"NODE_NAME": "quick", "NBOR_RELOP": ".", "NBOR_NAME": "jumped"},
"PATTERN": {"ORTH": "fox"},
},
] ]
pattern3 = [ pattern3 = [
{"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}}, {"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}},
{"SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"}, "PATTERN": {"ORTH": "fox"}}, {
{"SPEC": {"NODE_NAME": "r", "NBOR_RELOP": ">>", "NBOR_NAME": "fox"}, "PATTERN": {"ORTH": "brown"}} "SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"},
"PATTERN": {"ORTH": "fox"},
},
{
"SPEC": {"NODE_NAME": "r", "NBOR_RELOP": ">>", "NBOR_NAME": "fox"},
"PATTERN": {"ORTH": "brown"},
},
] ]
matcher = DependencyMatcher(en_vocab) matcher = DependencyMatcher(en_vocab)
@ -320,9 +338,9 @@ def test_dependency_matcher_compile(dependency_matcher):
assert len(dependency_matcher) == 3 assert len(dependency_matcher) == 3
def test_dependency_matcher(dependency_matcher, text, heads, deps): # def test_dependency_matcher(dependency_matcher, text, heads, deps):
doc = get_doc(dependency_matcher.vocab, text.split(), heads=heads, deps=deps) # doc = get_doc(dependency_matcher.vocab, text.split(), heads=heads, deps=deps)
matches = dependency_matcher(doc) # matches = dependency_matcher(doc)
# assert matches[0][1] == [[3, 1, 2]] # assert matches[0][1] == [[3, 1, 2]]
# assert matches[1][1] == [[4, 3, 3]] # assert matches[1][1] == [[4, 3, 3]]
# assert matches[2][1] == [[4, 3, 2]] # assert matches[2][1] == [[4, 3, 2]]

View File

@ -1,7 +1,7 @@
# coding: utf8 # coding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
from spacy.lang.en import English from spacy.lang.en import English
import pytest
def test_issue3880(): def test_issue3880():