Mirror of https://github.com/explosion/spaCy.git
Merge branch 'master' into pr/3948
commit 197cfd7ebc
spacy/_ml.py (31 lines changed)

@@ -661,21 +661,33 @@ def build_nel_encoder(embed_width, hidden_width, ner_types, **cfg):

     conv_depth = cfg.get("conv_depth", 2)
     cnn_maxout_pieces = cfg.get("cnn_maxout_pieces", 3)
     pretrained_vectors = cfg.get("pretrained_vectors")  # self.nlp.vocab.vectors.name
     context_width = cfg.get("context_width")
     entity_width = cfg.get("entity_width")

     with Model.define_operators({">>": chain, "**": clone}):
-        model = Affine(entity_width, entity_width+context_width+1+ner_types)\
-            >> Affine(1, entity_width, drop_factor=0.0)\
-            >> logistic
+        model = (
+            Affine(entity_width, entity_width + context_width + 1 + ner_types)
+            >> Affine(1, entity_width, drop_factor=0.0)
+            >> logistic
+        )

         # context encoder
-        tok2vec = Tok2Vec(width=hidden_width, embed_size=embed_width, pretrained_vectors=pretrained_vectors,
-                          cnn_maxout_pieces=cnn_maxout_pieces, subword_features=True, conv_depth=conv_depth,
-                          bilstm_depth=0) >> flatten_add_lengths >> Pooling(mean_pool)\
-            >> Residual(zero_init(Maxout(hidden_width, hidden_width))) \
-            >> zero_init(Affine(context_width, hidden_width))
+        tok2vec = (
+            Tok2Vec(
+                width=hidden_width,
+                embed_size=embed_width,
+                pretrained_vectors=pretrained_vectors,
+                cnn_maxout_pieces=cnn_maxout_pieces,
+                subword_features=True,
+                conv_depth=conv_depth,
+                bilstm_depth=0,
+            )
+            >> flatten_add_lengths
+            >> Pooling(mean_pool)
+            >> Residual(zero_init(Maxout(hidden_width, hidden_width)))
+            >> zero_init(Affine(context_width, hidden_width))
+        )

         model.tok2vec = tok2vec

@@ -684,6 +696,7 @@ def build_nel_encoder(embed_width, hidden_width, ner_types, **cfg):
     model.nO = 1
     return model

+
 @layerize
 def flatten(seqs, drop=0.0):
     ops = Model.ops
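
Both _ml.py hunks above are black-style reformatting of build_nel_encoder; the composed model itself is unchanged. A minimal sketch, not part of the diff, of the operator-overloading pattern the function relies on, assuming the Thinc 7.x imports that spacy/_ml.py already uses (Model and Affine from thinc.v2v, chain and clone from thinc.api):

    from thinc.api import chain, clone
    from thinc.v2v import Affine, Model

    with Model.define_operators({">>": chain, "**": clone}):
        # Inside this block ">>" is sugar for chain() and "**" for clone(), so the
        # parenthesized layout above builds exactly the same feed-forward model as
        # the old backslash-continued one.
        small = Affine(2, 4) >> Affine(1, 2)  # equivalent to chain(Affine(2, 4), Affine(1, 2))
        stacked = Affine(4, 4) ** 2           # equivalent to clone(Affine(4, 4), 2)

    assert isinstance(small, Model)
    assert isinstance(stacked, Model)
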
@@ -6,7 +6,7 @@ import sys


 from .stop_words import STOP_WORDS
-from .tag_map import TAG_MAP, POS
+from .tag_map import TAG_MAP
 from ...attrs import LANG
 from ...language import Language
 from ...tokens import Doc
@@ -22,6 +22,7 @@ if is_python_pre_3_5:
     Morpheme = namedtuple("Morpheme", "surface lemma tag")
 elif is_python_post_3_7:
     from dataclasses import dataclass
+
     @dataclass(frozen=True)
     class Morpheme:
         surface: str
@@ -29,6 +30,7 @@ elif is_python_post_3_7:
         tag: str
 else:
     from typing import NamedTuple
+
     class Morpheme(NamedTuple):
         surface: str
         lemma: str
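
The Morpheme changes above are whitespace-only; all three version branches (namedtuple before Python 3.5, a frozen dataclass on 3.7+, and the typing.NamedTuple fallback) are meant to yield the same immutable three-field record. An illustration, not part of the diff, using the plain namedtuple branch with made-up values:

    from collections import namedtuple

    Morpheme = namedtuple("Morpheme", "surface lemma tag")

    # Hypothetical analysis of one Korean surface form (values are illustrative only).
    m = Morpheme(surface="먹었다", lemma="먹다", tag="VV+EP+EF")
    print(m.surface, m.lemma, m.tag)
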
@@ -1,10 +1,8 @@
 # encoding: utf8
 from __future__ import unicode_literals
-from collections import defaultdict

-from ...symbols import (POS, PUNCT, INTJ, X, SYM,
-                        ADJ, AUX, ADP, CONJ, NOUN, PRON, VERB, ADV, PROPN,
-                        NUM, DET)
+from ...symbols import POS, PUNCT, INTJ, X, SYM, ADJ, AUX, ADP, CONJ, NOUN, PRON
+from ...symbols import VERB, ADV, PROPN, NUM, DET

 # 은전한닢(mecab-ko-dic)의 품사 태그를 universal pos tag로 대응시킴
 # https://docs.google.com/spreadsheets/d/1-9blXKjtjeKZqsf4NzHeYJCrr49-nXeRF6D80udfcwY/edit#gid=589544265
@@ -18,49 +16,44 @@ TAG_MAP = {
     "JKB": {POS: ADP},
     "JKV": {POS: ADP},
     "JKQ": {POS: ADP},
     "JX": {POS: ADP},  # 보조사
     "JC": {POS: CONJ},  # 접속 조사
     "MAJ": {POS: CONJ},  # 접속 부사
     "MAG": {POS: ADV},  # 일반 부사
     "MM": {POS: DET},  # 관형사
-
     "XPN": {POS: X},  # 접두사
     # XS. 접미사
     "XSN": {POS: X},
     "XSV": {POS: X},
     "XSA": {POS: X},
     "XR": {POS: X},  # 어근
     # E.{1,2} 어미
     "EP": {POS: X},
     "EF": {POS: X},
     "EC": {POS: X},
     "ETN": {POS: X},
     "ETM": {POS: X},
-
     "IC": {POS: INTJ},  # 감탄사
-
     "VV": {POS: VERB},  # 동사
     "VA": {POS: ADJ},  # 형용사
     "VX": {POS: AUX},  # 보조 용언
     "VCP": {POS: ADP},  # 긍정 지정사(이다)
     "VCN": {POS: ADJ},  # 부정 지정사(아니다)
-
     "NNG": {POS: NOUN},  # 일반 명사(general noun)
     "NNB": {POS: NOUN},  # 의존 명사
     "NNBC": {POS: NOUN},  # 의존 명사(단위: unit)
     "NNP": {POS: PROPN},  # 고유 명사(proper noun)
     "NP": {POS: PRON},  # 대명사
     "NR": {POS: NUM},  # 수사(numerals)
     "SN": {POS: NUM},  # 숫자
-
     # S.{1,2} 부호
     # 문장 부호
     "SF": {POS: PUNCT},  # period or other EOS marker
     "SE": {POS: PUNCT},
     "SC": {POS: PUNCT},  # comma, etc.
     "SSO": {POS: PUNCT},  # open bracket
     "SSC": {POS: PUNCT},  # close bracket
     "SY": {POS: SYM},  # 기타 기호
     "SL": {POS: X},  # 외국어
     "SH": {POS: X},  # 한자
 }
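
The TAG_MAP hunk above only drops blank lines; per its header comment, the table maps 은전한닢 (mecab-ko-dic) part-of-speech tags onto Universal POS tags. A sketch, not from the diff, of how such a table is typically consumed, using a hypothetical fallback helper:

    from spacy.symbols import NOUN, POS, VERB, X

    TAG_MAP = {
        "NNG": {POS: NOUN},  # general noun
        "VV": {POS: VERB},   # verb
    }

    def universal_pos(mecab_tag):
        # Unlisted fine-grained tags fall back to X, mirroring the catch-all X entries above.
        return TAG_MAP.get(mecab_tag, {POS: X})[POS]

    assert universal_pos("NNG") == NOUN
    assert universal_pos("ZZZ") == X
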
@@ -5,8 +5,7 @@ import pytest


 @pytest.mark.parametrize(
-    "word,lemma",
-    [("새로운", "새롭"), ("빨간", "빨갛"), ("클수록", "크"), ("뭡니까", "뭣"), ("됐다", "되")],
+    "word,lemma", [("새로운", "새롭"), ("빨간", "빨갛"), ("클수록", "크"), ("뭡니까", "뭣"), ("됐다", "되")]
 )
 def test_ko_lemmatizer_assigns(ko_tokenizer, word, lemma):
     test_lemma = ko_tokenizer(word)[0].lemma_
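
The assertion that closes test_ko_lemmatizer_assigns lies outside this hunk; under that caveat, each parametrized case boils down to a check along these lines (ko_tokenizer stands in for the module's Korean tokenizer fixture):

    def check_lemma(ko_tokenizer, word, lemma):
        # e.g. word="새로운" is expected to lemmatize to "새롭", as in the decorator above
        test_lemma = ko_tokenizer(word)[0].lemma_
        assert test_lemma == lemma
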
@@ -5,16 +5,24 @@ import pytest


 def test_lt_tokenizer_handles_long_text(lt_tokenizer):
-    text = """Tokios sausros kriterijus atitinka pirmadienį atlikti skaičiavimai, palyginus faktinį ir žemiausią
-    vidutinį daugiametį vandens lygį. Nustatyta, kad iš 48 šalies vandens matavimo stočių 28-iose stotyse vandens lygis
-    yra žemesnis arba lygus žemiausiam vidutiniam daugiamečiam šiltojo laikotarpio vandens lygiui."""
-    tokens = lt_tokenizer(text.replace("\n", ""))
+    text = """Tokios sausros kriterijus atitinka pirmadienį atlikti skaičiavimai, palyginus faktinį ir žemiausią vidutinį daugiametį vandens lygį. Nustatyta, kad iš 48 šalies vandens matavimo stočių 28-iose stotyse vandens lygis yra žemesnis arba lygus žemiausiam vidutiniam daugiamečiam šiltojo laikotarpio vandens lygiui."""
+    tokens = lt_tokenizer(text)
     assert len(tokens) == 42


-@pytest.mark.parametrize('text,length', [
-    ("177R Parodų rūmai–Ozo g. nuo vasario 18 d. bus skelbiamas interneto tinklalapyje.", 15),
-    ("ISM universiteto doc. dr. Ieva Augutytė-Kvedaravičienė pastebi, kad tyrimais nustatyti elgesio pokyčiai.", 16)])
+@pytest.mark.parametrize(
+    "text,length",
+    [
+        (
+            "177R Parodų rūmai–Ozo g. nuo vasario 18 d. bus skelbiamas interneto tinklalapyje.",
+            15,
+        ),
+        (
+            "ISM universiteto doc. dr. Ieva Augutytė-Kvedaravičienė pastebi, kad tyrimais nustatyti elgesio pokyčiai.",
+            16,
+        ),
+    ],
+)
 def test_lt_tokenizer_handles_punct_abbrev(lt_tokenizer, text, length):
     tokens = lt_tokenizer(text)
     assert len(tokens) == length
@@ -26,18 +34,22 @@ def test_lt_tokenizer_abbrev_exceptions(lt_tokenizer, text):
     assert len(tokens) == 1


-@pytest.mark.parametrize("text,match", [
-    ("10", True),
-    ("1", True),
-    ("10,000", True),
-    ("10,00", True),
-    ("999.0", True),
-    ("vienas", True),
-    ("du", True),
-    ("milijardas", True),
-    ("šuo", False),
-    (",", False),
-    ("1/2", True)])
+@pytest.mark.parametrize(
+    "text,match",
+    [
+        ("10", True),
+        ("1", True),
+        ("10,000", True),
+        ("10,00", True),
+        ("999.0", True),
+        ("vienas", True),
+        ("du", True),
+        ("milijardas", True),
+        ("šuo", False),
+        (",", False),
+        ("1/2", True),
+    ],
+)
 def test_lt_lex_attrs_like_number(lt_tokenizer, text, match):
     tokens = lt_tokenizer(text)
     assert len(tokens) == 1
@@ -5,7 +5,6 @@ import pytest
 import re
 from spacy.matcher import Matcher, DependencyMatcher
 from spacy.tokens import Doc, Token
-from ..util import get_doc


 @pytest.fixture
@@ -288,24 +287,43 @@ def deps():
 def dependency_matcher(en_vocab):
     def is_brown_yellow(text):
         return bool(re.compile(r"brown|yellow|over").match(text))

     IS_BROWN_YELLOW = en_vocab.add_flag(is_brown_yellow)

     pattern1 = [
         {"SPEC": {"NODE_NAME": "fox"}, "PATTERN": {"ORTH": "fox"}},
-        {"SPEC": {"NODE_NAME": "q", "NBOR_RELOP": ">", "NBOR_NAME": "fox"},"PATTERN": {"ORTH": "quick", "DEP": "amod"}},
-        {"SPEC": {"NODE_NAME": "r", "NBOR_RELOP": ">", "NBOR_NAME": "fox"}, "PATTERN": {IS_BROWN_YELLOW: True}},
+        {
+            "SPEC": {"NODE_NAME": "q", "NBOR_RELOP": ">", "NBOR_NAME": "fox"},
+            "PATTERN": {"ORTH": "quick", "DEP": "amod"},
+        },
+        {
+            "SPEC": {"NODE_NAME": "r", "NBOR_RELOP": ">", "NBOR_NAME": "fox"},
+            "PATTERN": {IS_BROWN_YELLOW: True},
+        },
     ]

     pattern2 = [
         {"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}},
-        {"SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"}, "PATTERN": {"ORTH": "fox"}},
-        {"SPEC": {"NODE_NAME": "quick", "NBOR_RELOP": ".", "NBOR_NAME": "jumped"}, "PATTERN": {"ORTH": "fox"}}
+        {
+            "SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"},
+            "PATTERN": {"ORTH": "fox"},
+        },
+        {
+            "SPEC": {"NODE_NAME": "quick", "NBOR_RELOP": ".", "NBOR_NAME": "jumped"},
+            "PATTERN": {"ORTH": "fox"},
+        },
     ]

     pattern3 = [
         {"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}},
-        {"SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"}, "PATTERN": {"ORTH": "fox"}},
-        {"SPEC": {"NODE_NAME": "r", "NBOR_RELOP": ">>", "NBOR_NAME": "fox"}, "PATTERN": {"ORTH": "brown"}}
+        {
+            "SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"},
+            "PATTERN": {"ORTH": "fox"},
+        },
+        {
+            "SPEC": {"NODE_NAME": "r", "NBOR_RELOP": ">>", "NBOR_NAME": "fox"},
+            "PATTERN": {"ORTH": "brown"},
+        },
     ]

     matcher = DependencyMatcher(en_vocab)
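
The SPEC/PATTERN dicts above are only re-indented. A sketch, not from the diff, of how one such pattern is registered, assuming the v2-style DependencyMatcher.add(key, on_match, *patterns) call and the len() behaviour that test_dependency_matcher_compile checks:

    from spacy.lang.en import English
    from spacy.matcher import DependencyMatcher

    nlp = English()
    matcher = DependencyMatcher(nlp.vocab)

    pattern = [
        {"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}},
        {
            "SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"},
            "PATTERN": {"ORTH": "fox"},
        },
    ]
    matcher.add("fox_jumped", None, pattern)  # key, on_match callback, pattern
    assert len(matcher) == 1  # the pattern compiled and was registered
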
@@ -320,9 +338,9 @@ def test_dependency_matcher_compile(dependency_matcher):
     assert len(dependency_matcher) == 3


-def test_dependency_matcher(dependency_matcher, text, heads, deps):
-    doc = get_doc(dependency_matcher.vocab, text.split(), heads=heads, deps=deps)
-    matches = dependency_matcher(doc)
+# def test_dependency_matcher(dependency_matcher, text, heads, deps):
+#     doc = get_doc(dependency_matcher.vocab, text.split(), heads=heads, deps=deps)
+#     matches = dependency_matcher(doc)
     # assert matches[0][1] == [[3, 1, 2]]
     # assert matches[1][1] == [[4, 3, 3]]
     # assert matches[2][1] == [[4, 3, 2]]
@@ -1,7 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals

 from spacy.lang.en import English
-import pytest
+

 def test_issue3880():