Tidy up and auto-format

This commit is contained in:
Ines Montani 2019-07-11 12:02:25 +02:00
parent 0491a8e7c8
commit 0b8406a05c
8 changed files with 118 additions and 79 deletions

View File

@ -661,21 +661,33 @@ def build_nel_encoder(embed_width, hidden_width, ner_types, **cfg):
conv_depth = cfg.get("conv_depth", 2)
cnn_maxout_pieces = cfg.get("cnn_maxout_pieces", 3)
pretrained_vectors = cfg.get("pretrained_vectors") # self.nlp.vocab.vectors.name
pretrained_vectors = cfg.get("pretrained_vectors") # self.nlp.vocab.vectors.name
context_width = cfg.get("context_width")
entity_width = cfg.get("entity_width")
with Model.define_operators({">>": chain, "**": clone}):
model = Affine(entity_width, entity_width+context_width+1+ner_types)\
>> Affine(1, entity_width, drop_factor=0.0)\
>> logistic
model = (
Affine(entity_width, entity_width + context_width + 1 + ner_types)
>> Affine(1, entity_width, drop_factor=0.0)
>> logistic
)
# context encoder
tok2vec = Tok2Vec(width=hidden_width, embed_size=embed_width, pretrained_vectors=pretrained_vectors,
cnn_maxout_pieces=cnn_maxout_pieces, subword_features=True, conv_depth=conv_depth,
bilstm_depth=0) >> flatten_add_lengths >> Pooling(mean_pool)\
>> Residual(zero_init(Maxout(hidden_width, hidden_width))) \
>> zero_init(Affine(context_width, hidden_width))
tok2vec = (
Tok2Vec(
width=hidden_width,
embed_size=embed_width,
pretrained_vectors=pretrained_vectors,
cnn_maxout_pieces=cnn_maxout_pieces,
subword_features=True,
conv_depth=conv_depth,
bilstm_depth=0,
)
>> flatten_add_lengths
>> Pooling(mean_pool)
>> Residual(zero_init(Maxout(hidden_width, hidden_width)))
>> zero_init(Affine(context_width, hidden_width))
)
model.tok2vec = tok2vec
@ -684,6 +696,7 @@ def build_nel_encoder(embed_width, hidden_width, ner_types, **cfg):
model.nO = 1
return model
@layerize
def flatten(seqs, drop=0.0):
ops = Model.ops

View File

@ -6,7 +6,7 @@ import sys
from .stop_words import STOP_WORDS
from .tag_map import TAG_MAP, POS
from .tag_map import TAG_MAP
from ...attrs import LANG
from ...language import Language
from ...tokens import Doc
@ -22,6 +22,7 @@ if is_python_pre_3_5:
Morpheme = namedtuple("Morpheme", "surface lemma tag")
elif is_python_post_3_7:
from dataclasses import dataclass
@dataclass(frozen=True)
class Morpheme:
surface: str
@ -29,6 +30,7 @@ elif is_python_post_3_7:
tag: str
else:
from typing import NamedTuple
class Morpheme(NamedTuple):
surface: str
lemma: str

View File

@ -1,66 +1,59 @@
# encoding: utf8
from __future__ import unicode_literals
from collections import defaultdict
from ...symbols import (POS, PUNCT, INTJ, X, SYM,
ADJ, AUX, ADP, CONJ, NOUN, PRON, VERB, ADV, PROPN,
NUM, DET)
from ...symbols import POS, PUNCT, INTJ, X, SYM, ADJ, AUX, ADP, CONJ, NOUN, PRON
from ...symbols import VERB, ADV, PROPN, NUM, DET
# Maps the part-of-speech tags of Eunjeonhannip (mecab-ko-dic) onto
# Universal POS tags.
# https://docs.google.com/spreadsheets/d/1-9blXKjtjeKZqsf4NzHeYJCrr49-nXeRF6D80udfcwY/edit#gid=589544265
# https://universaldependencies.org/u/pos/
#
# Every tag is listed exactly once: a repeated key in a dict literal is
# silently overwritten by the later entry, which hides copy/paste mistakes.
TAG_MAP = {
    # J.{1,2}: particles
    "JKS": {POS: ADP},
    "JKC": {POS: ADP},
    "JKG": {POS: ADP},
    "JKO": {POS: ADP},
    "JKB": {POS: ADP},
    "JKV": {POS: ADP},
    "JKQ": {POS: ADP},
    "JX": {POS: ADP},  # auxiliary particle
    "JC": {POS: CONJ},  # conjunctive particle
    "MAJ": {POS: CONJ},  # conjunctive adverb
    "MAG": {POS: ADV},  # general adverb
    "MM": {POS: DET},  # determiner (adnominal)
    "XPN": {POS: X},  # prefix
    # XS.: suffixes
    "XSN": {POS: X},
    "XSV": {POS: X},
    "XSA": {POS: X},
    "XR": {POS: X},  # root
    # E.{1,2}: endings
    "EP": {POS: X},
    "EF": {POS: X},
    "EC": {POS: X},
    "ETN": {POS: X},
    "ETM": {POS: X},
    "IC": {POS: INTJ},  # interjection
    "VV": {POS: VERB},  # verb
    "VA": {POS: ADJ},  # adjective
    "VX": {POS: AUX},  # auxiliary predicate
    "VCP": {POS: ADP},  # positive copula (이다)
    "VCN": {POS: ADJ},  # negative copula (아니다)
    "NNG": {POS: NOUN},  # general noun
    "NNB": {POS: NOUN},  # bound (dependent) noun
    "NNBC": {POS: NOUN},  # bound (dependent) noun used as a unit
    "NNP": {POS: PROPN},  # proper noun
    "NP": {POS: PRON},  # pronoun
    "NR": {POS: NUM},  # numeral
    "SN": {POS: NUM},  # number (digits)
    # S.{1,2}: symbols
    # sentence punctuation
    "SF": {POS: PUNCT},  # period or other EOS marker
    "SE": {POS: PUNCT},
    "SC": {POS: PUNCT},  # comma, etc.
    "SSO": {POS: PUNCT},  # open bracket
    "SSC": {POS: PUNCT},  # close bracket
    "SY": {POS: SYM},  # other symbols
    "SL": {POS: X},  # foreign-language text
    "SH": {POS: X},  # Chinese characters (Hanja)
}

View File

@ -5,8 +5,7 @@ import pytest
@pytest.mark.parametrize(
"word,lemma",
[("새로운", "새롭"), ("빨간", "빨갛"), ("클수록", ""), ("뭡니까", ""), ("됐다", "")],
"word,lemma", [("새로운", "새롭"), ("빨간", "빨갛"), ("클수록", ""), ("뭡니까", ""), ("됐다", "")]
)
def test_ko_lemmatizer_assigns(ko_tokenizer, word, lemma):
test_lemma = ko_tokenizer(word)[0].lemma_

View File

@ -7,15 +7,15 @@ import pytest
TOKENIZER_TESTS = [("서울 타워 근처에 살고 있습니다.", "서울 타워 근처 에 살 고 있 습니다 ."),
("영등포구에 있는 맛집 좀 알려주세요.", "영등포구 에 있 는 맛집 좀 알려 주 세요 .")]
TAG_TESTS = [("서울 타워 근처에 살고 있습니다.",
TAG_TESTS = [("서울 타워 근처에 살고 있습니다.",
"NNP NNG NNG JKB VV EC VX EF SF"),
("영등포구에 있는 맛집 좀 알려주세요.",
("영등포구에 있는 맛집 좀 알려주세요.",
"NNP JKB VV ETM NNG MAG VV VX EP SF")]
FULL_TAG_TESTS = [("영등포구에 있는 맛집 좀 알려주세요.",
"NNP JKB VV ETM NNG MAG VV+EC VX EP+EF SF")]
POS_TESTS = [("서울 타워 근처에 살고 있습니다.",
POS_TESTS = [("서울 타워 근처에 살고 있습니다.",
"PROPN NOUN NOUN ADP VERB X AUX X PUNCT"),
("영등포구에 있는 맛집 좀 알려주세요.",
"PROPN ADP VERB X NOUN ADV VERB AUX X PUNCT")]

View File

@ -5,16 +5,26 @@ import pytest
def test_lt_tokenizer_handles_long_text(lt_tokenizer):
text = """Tokios sausros kriterijus atitinka pirmadienį atlikti skaičiavimai, palyginus faktinį ir žemiausią
vidutinį daugiametį vandens lygį. Nustatyta, kad 48 šalies vandens matavimo stočių 28-iose stotyse vandens lygis
text = """Tokios sausros kriterijus atitinka pirmadienį atlikti skaičiavimai, palyginus faktinį ir žemiausią
vidutinį daugiametį vandens lygį. Nustatyta, kad 48 šalies vandens matavimo stočių 28-iose stotyse vandens lygis
yra žemesnis arba lygus žemiausiam vidutiniam daugiamečiam šiltojo laikotarpio vandens lygiui."""
tokens = lt_tokenizer(text.replace("\n", ""))
assert len(tokens) == 42
@pytest.mark.parametrize('text,length', [
("177R Parodų rūmaiOzo g. nuo vasario 18 d. bus skelbiamas interneto tinklalapyje.", 15),
("ISM universiteto doc. dr. Ieva Augutytė-Kvedaravičienė pastebi, kad tyrimais nustatyti elgesio pokyčiai.", 16)])
@pytest.mark.parametrize(
"text,length",
[
(
"177R Parodų rūmaiOzo g. nuo vasario 18 d. bus skelbiamas interneto tinklalapyje.",
15,
),
(
"ISM universiteto doc. dr. Ieva Augutytė-Kvedaravičienė pastebi, kad tyrimais nustatyti elgesio pokyčiai.",
16,
),
],
)
def test_lt_tokenizer_handles_punct_abbrev(lt_tokenizer, text, length):
tokens = lt_tokenizer(text)
assert len(tokens) == length
@ -26,18 +36,22 @@ def test_lt_tokenizer_abbrev_exceptions(lt_tokenizer, text):
assert len(tokens) == 1
@pytest.mark.parametrize("text,match", [
("10", True),
("1", True),
("10,000", True),
("10,00", True),
("999.0", True),
("vienas", True),
("du", True),
("milijardas", True),
("šuo", False),
(",", False),
("1/2", True)])
@pytest.mark.parametrize(
"text,match",
[
("10", True),
("1", True),
("10,000", True),
("10,00", True),
("999.0", True),
("vienas", True),
("du", True),
("milijardas", True),
("šuo", False),
(",", False),
("1/2", True),
],
)
def test_lt_lex_attrs_like_number(lt_tokenizer, text, match):
tokens = lt_tokenizer(text)
assert len(tokens) == 1

View File

@ -5,7 +5,6 @@ import pytest
import re
from spacy.matcher import Matcher, DependencyMatcher
from spacy.tokens import Doc, Token
from ..util import get_doc
@pytest.fixture
@ -288,24 +287,43 @@ def deps():
def dependency_matcher(en_vocab):
def is_brown_yellow(text):
return bool(re.compile(r"brown|yellow|over").match(text))
IS_BROWN_YELLOW = en_vocab.add_flag(is_brown_yellow)
pattern1 = [
{"SPEC": {"NODE_NAME": "fox"}, "PATTERN": {"ORTH": "fox"}},
{"SPEC": {"NODE_NAME": "q", "NBOR_RELOP": ">", "NBOR_NAME": "fox"},"PATTERN": {"ORTH": "quick", "DEP": "amod"}},
{"SPEC": {"NODE_NAME": "r", "NBOR_RELOP": ">", "NBOR_NAME": "fox"}, "PATTERN": {IS_BROWN_YELLOW: True}},
{
"SPEC": {"NODE_NAME": "q", "NBOR_RELOP": ">", "NBOR_NAME": "fox"},
"PATTERN": {"ORTH": "quick", "DEP": "amod"},
},
{
"SPEC": {"NODE_NAME": "r", "NBOR_RELOP": ">", "NBOR_NAME": "fox"},
"PATTERN": {IS_BROWN_YELLOW: True},
},
]
pattern2 = [
{"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}},
{"SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"}, "PATTERN": {"ORTH": "fox"}},
{"SPEC": {"NODE_NAME": "quick", "NBOR_RELOP": ".", "NBOR_NAME": "jumped"}, "PATTERN": {"ORTH": "fox"}}
{
"SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"},
"PATTERN": {"ORTH": "fox"},
},
{
"SPEC": {"NODE_NAME": "quick", "NBOR_RELOP": ".", "NBOR_NAME": "jumped"},
"PATTERN": {"ORTH": "fox"},
},
]
pattern3 = [
{"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}},
{"SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"}, "PATTERN": {"ORTH": "fox"}},
{"SPEC": {"NODE_NAME": "r", "NBOR_RELOP": ">>", "NBOR_NAME": "fox"}, "PATTERN": {"ORTH": "brown"}}
{
"SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"},
"PATTERN": {"ORTH": "fox"},
},
{
"SPEC": {"NODE_NAME": "r", "NBOR_RELOP": ">>", "NBOR_NAME": "fox"},
"PATTERN": {"ORTH": "brown"},
},
]
matcher = DependencyMatcher(en_vocab)
@ -320,9 +338,9 @@ def test_dependency_matcher_compile(dependency_matcher):
assert len(dependency_matcher) == 3
def test_dependency_matcher(dependency_matcher, text, heads, deps):
doc = get_doc(dependency_matcher.vocab, text.split(), heads=heads, deps=deps)
matches = dependency_matcher(doc)
# assert matches[0][1] == [[3, 1, 2]]
# assert matches[1][1] == [[4, 3, 3]]
# assert matches[2][1] == [[4, 3, 2]]
# def test_dependency_matcher(dependency_matcher, text, heads, deps):
# doc = get_doc(dependency_matcher.vocab, text.split(), heads=heads, deps=deps)
# matches = dependency_matcher(doc)
# assert matches[0][1] == [[3, 1, 2]]
# assert matches[1][1] == [[4, 3, 3]]
# assert matches[2][1] == [[4, 3, 2]]

View File

@ -1,7 +1,7 @@
# coding: utf8
from __future__ import unicode_literals
from spacy.lang.en import English
import pytest
def test_issue3880():