mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 18:26:30 +03:00
Auto-format
This commit is contained in:
parent
235fe6fe3b
commit
6e303de717
|
@ -206,7 +206,15 @@ _upper = LATIN_UPPER + _russian_upper + _tatar_upper + _greek_upper + _ukrainian
|
||||||
_lower = LATIN_LOWER + _russian_lower + _tatar_lower + _greek_lower + _ukrainian_lower
|
_lower = LATIN_LOWER + _russian_lower + _tatar_lower + _greek_lower + _ukrainian_lower
|
||||||
|
|
||||||
_uncased = (
|
_uncased = (
|
||||||
_bengali + _hebrew + _persian + _sinhala + _hindi + _kannada + _tamil + _telugu + _hangul
|
_bengali
|
||||||
|
+ _hebrew
|
||||||
|
+ _persian
|
||||||
|
+ _sinhala
|
||||||
|
+ _hindi
|
||||||
|
+ _kannada
|
||||||
|
+ _tamil
|
||||||
|
+ _telugu
|
||||||
|
+ _hangul
|
||||||
)
|
)
|
||||||
|
|
||||||
ALPHA = group_chars(LATIN + _russian + _tatar + _greek + _ukrainian + _uncased)
|
ALPHA = group_chars(LATIN + _russian + _tatar + _greek + _ukrainian + _uncased)
|
||||||
|
|
|
@ -5,47 +5,47 @@ from ...attrs import LIKE_NUM
|
||||||
|
|
||||||
|
|
||||||
_num_words = [
|
_num_words = [
|
||||||
"영",
|
"영",
|
||||||
"공",
|
"공",
|
||||||
# Native Korean number system
|
# Native Korean number system
|
||||||
"하나",
|
"하나",
|
||||||
"둘",
|
"둘",
|
||||||
"셋",
|
"셋",
|
||||||
"넷",
|
"넷",
|
||||||
"다섯",
|
"다섯",
|
||||||
"여섯",
|
"여섯",
|
||||||
"일곱",
|
"일곱",
|
||||||
"여덟",
|
"여덟",
|
||||||
"아홉",
|
"아홉",
|
||||||
"열",
|
"열",
|
||||||
"스물",
|
"스물",
|
||||||
"서른",
|
"서른",
|
||||||
"마흔",
|
"마흔",
|
||||||
"쉰",
|
"쉰",
|
||||||
"예순",
|
"예순",
|
||||||
"일흔",
|
"일흔",
|
||||||
"여든",
|
"여든",
|
||||||
"아흔",
|
"아흔",
|
||||||
# Sino-Korean number system
|
# Sino-Korean number system
|
||||||
"일",
|
"일",
|
||||||
"이",
|
"이",
|
||||||
"삼",
|
"삼",
|
||||||
"사",
|
"사",
|
||||||
"오",
|
"오",
|
||||||
"육",
|
"육",
|
||||||
"칠",
|
"칠",
|
||||||
"팔",
|
"팔",
|
||||||
"구",
|
"구",
|
||||||
"십",
|
"십",
|
||||||
"백",
|
"백",
|
||||||
"천",
|
"천",
|
||||||
"만",
|
"만",
|
||||||
"십만",
|
"십만",
|
||||||
"백만",
|
"백만",
|
||||||
"천만",
|
"천만",
|
||||||
"일억",
|
"일억",
|
||||||
"십억",
|
"십억",
|
||||||
"백억"
|
"백억",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -6,9 +6,7 @@ from ...symbols import ORTH, LEMMA, NORM
|
||||||
# TODO
|
# TODO
|
||||||
# treat other apostrophes within words as part of the word: [op d'mannst], [fir d'éischt] (= exceptions)
|
# treat other apostrophes within words as part of the word: [op d'mannst], [fir d'éischt] (= exceptions)
|
||||||
|
|
||||||
_exc = {
|
_exc = {}
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
# translate / delete what is not necessary
|
# translate / delete what is not necessary
|
||||||
for exc_data in [
|
for exc_data in [
|
||||||
|
|
|
@ -14,6 +14,7 @@ from .tag_map import TAG_MAP
|
||||||
def try_jieba_import(use_jieba):
|
def try_jieba_import(use_jieba):
|
||||||
try:
|
try:
|
||||||
import jieba
|
import jieba
|
||||||
|
|
||||||
return jieba
|
return jieba
|
||||||
except ImportError:
|
except ImportError:
|
||||||
if use_jieba:
|
if use_jieba:
|
||||||
|
@ -34,7 +35,9 @@ class ChineseTokenizer(DummyTokenizer):
|
||||||
def __call__(self, text):
|
def __call__(self, text):
|
||||||
# use jieba
|
# use jieba
|
||||||
if self.use_jieba:
|
if self.use_jieba:
|
||||||
jieba_words = list([x for x in self.jieba_seg.cut(text, cut_all=False) if x])
|
jieba_words = list(
|
||||||
|
[x for x in self.jieba_seg.cut(text, cut_all=False) if x]
|
||||||
|
)
|
||||||
words = [jieba_words[0]]
|
words = [jieba_words[0]]
|
||||||
spaces = [False]
|
spaces = [False]
|
||||||
for i in range(1, len(jieba_words)):
|
for i in range(1, len(jieba_words)):
|
||||||
|
|
|
@ -271,7 +271,9 @@ class Scorer(object):
|
||||||
self.labelled_per_dep[token.dep_.lower()] = PRFScore()
|
self.labelled_per_dep[token.dep_.lower()] = PRFScore()
|
||||||
if token.dep_.lower() not in cand_deps_per_dep:
|
if token.dep_.lower() not in cand_deps_per_dep:
|
||||||
cand_deps_per_dep[token.dep_.lower()] = set()
|
cand_deps_per_dep[token.dep_.lower()] = set()
|
||||||
cand_deps_per_dep[token.dep_.lower()].add((gold_i, gold_head, token.dep_.lower()))
|
cand_deps_per_dep[token.dep_.lower()].add(
|
||||||
|
(gold_i, gold_head, token.dep_.lower())
|
||||||
|
)
|
||||||
if "-" not in [token[-1] for token in gold.orig_annot]:
|
if "-" not in [token[-1] for token in gold.orig_annot]:
|
||||||
# Find all NER labels in gold and doc
|
# Find all NER labels in gold and doc
|
||||||
ent_labels = set([x[0] for x in gold_ents] + [k.label_ for k in doc.ents])
|
ent_labels = set([x[0] for x in gold_ents] + [k.label_ for k in doc.ents])
|
||||||
|
@ -304,7 +306,9 @@ class Scorer(object):
|
||||||
self.tags.score_set(cand_tags, gold_tags)
|
self.tags.score_set(cand_tags, gold_tags)
|
||||||
self.labelled.score_set(cand_deps, gold_deps)
|
self.labelled.score_set(cand_deps, gold_deps)
|
||||||
for dep in self.labelled_per_dep:
|
for dep in self.labelled_per_dep:
|
||||||
self.labelled_per_dep[dep].score_set(cand_deps_per_dep.get(dep, set()), gold_deps_per_dep.get(dep, set()))
|
self.labelled_per_dep[dep].score_set(
|
||||||
|
cand_deps_per_dep.get(dep, set()), gold_deps_per_dep.get(dep, set())
|
||||||
|
)
|
||||||
self.unlabelled.score_set(
|
self.unlabelled.score_set(
|
||||||
set(item[:2] for item in cand_deps), set(item[:2] for item in gold_deps)
|
set(item[:2] for item in cand_deps), set(item[:2] for item in gold_deps)
|
||||||
)
|
)
|
||||||
|
|
|
@ -11,8 +11,14 @@ def test_issue4590(en_vocab):
|
||||||
"""Test that matches param in on_match method are the same as matches run with no on_match method"""
|
"""Test that matches param in on_match method are the same as matches run with no on_match method"""
|
||||||
pattern = [
|
pattern = [
|
||||||
{"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}},
|
{"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}},
|
||||||
{"SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"}, "PATTERN": {"ORTH": "fox"}},
|
{
|
||||||
{"SPEC": {"NODE_NAME": "quick", "NBOR_RELOP": ".", "NBOR_NAME": "jumped"}, "PATTERN": {"ORTH": "fox"}},
|
"SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"},
|
||||||
|
"PATTERN": {"ORTH": "fox"},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"SPEC": {"NODE_NAME": "quick", "NBOR_RELOP": ".", "NBOR_NAME": "jumped"},
|
||||||
|
"PATTERN": {"ORTH": "fox"},
|
||||||
|
},
|
||||||
]
|
]
|
||||||
|
|
||||||
on_match = Mock()
|
on_match = Mock()
|
||||||
|
@ -23,12 +29,11 @@ def test_issue4590(en_vocab):
|
||||||
text = "The quick brown fox jumped over the lazy fox"
|
text = "The quick brown fox jumped over the lazy fox"
|
||||||
heads = [3, 2, 1, 1, 0, -1, 2, 1, -3]
|
heads = [3, 2, 1, 1, 0, -1, 2, 1, -3]
|
||||||
deps = ["det", "amod", "amod", "nsubj", "prep", "pobj", "det", "amod"]
|
deps = ["det", "amod", "amod", "nsubj", "prep", "pobj", "det", "amod"]
|
||||||
|
|
||||||
doc = get_doc(en_vocab, text.split(), heads=heads, deps=deps)
|
doc = get_doc(en_vocab, text.split(), heads=heads, deps=deps)
|
||||||
|
|
||||||
matches = matcher(doc)
|
matches = matcher(doc)
|
||||||
|
|
||||||
on_match_args = on_match.call_args
|
on_match_args = on_match.call_args
|
||||||
|
|
||||||
assert on_match_args[0][3] == matches
|
assert on_match_args[0][3] == matches
|
||||||
|
|
||||||
|
|
|
@ -12,8 +12,22 @@ from .util import get_doc
|
||||||
test_las_apple = [
|
test_las_apple = [
|
||||||
[
|
[
|
||||||
"Apple is looking at buying U.K. startup for $ 1 billion",
|
"Apple is looking at buying U.K. startup for $ 1 billion",
|
||||||
{"heads": [2, 2, 2, 2, 3, 6, 4, 4, 10, 10, 7],
|
{
|
||||||
"deps": ['nsubj', 'aux', 'ROOT', 'prep', 'pcomp', 'compound', 'dobj', 'prep', 'quantmod', 'compound', 'pobj']},
|
"heads": [2, 2, 2, 2, 3, 6, 4, 4, 10, 10, 7],
|
||||||
|
"deps": [
|
||||||
|
"nsubj",
|
||||||
|
"aux",
|
||||||
|
"ROOT",
|
||||||
|
"prep",
|
||||||
|
"pcomp",
|
||||||
|
"compound",
|
||||||
|
"dobj",
|
||||||
|
"prep",
|
||||||
|
"quantmod",
|
||||||
|
"compound",
|
||||||
|
"pobj",
|
||||||
|
],
|
||||||
|
},
|
||||||
]
|
]
|
||||||
]
|
]
|
||||||
|
|
||||||
|
@ -59,7 +73,7 @@ def test_las_per_type(en_vocab):
|
||||||
en_vocab,
|
en_vocab,
|
||||||
words=input_.split(" "),
|
words=input_.split(" "),
|
||||||
heads=([h - i for i, h in enumerate(annot["heads"])]),
|
heads=([h - i for i, h in enumerate(annot["heads"])]),
|
||||||
deps=annot["deps"]
|
deps=annot["deps"],
|
||||||
)
|
)
|
||||||
gold = GoldParse(doc, heads=annot["heads"], deps=annot["deps"])
|
gold = GoldParse(doc, heads=annot["heads"], deps=annot["deps"])
|
||||||
doc[0].dep_ = "compound"
|
doc[0].dep_ = "compound"
|
||||||
|
|
Loading…
Reference in New Issue
Block a user