Auto-format

This commit is contained in:
Ines Montani 2019-11-20 13:15:24 +01:00
parent 235fe6fe3b
commit 6e303de717
7 changed files with 89 additions and 57 deletions

View File

@@ -206,7 +206,15 @@ _upper = LATIN_UPPER + _russian_upper + _tatar_upper + _greek_upper + _ukrainian
_lower = LATIN_LOWER + _russian_lower + _tatar_lower + _greek_lower + _ukrainian_lower _lower = LATIN_LOWER + _russian_lower + _tatar_lower + _greek_lower + _ukrainian_lower
_uncased = ( _uncased = (
_bengali + _hebrew + _persian + _sinhala + _hindi + _kannada + _tamil + _telugu + _hangul _bengali
+ _hebrew
+ _persian
+ _sinhala
+ _hindi
+ _kannada
+ _tamil
+ _telugu
+ _hangul
) )
ALPHA = group_chars(LATIN + _russian + _tatar + _greek + _ukrainian + _uncased) ALPHA = group_chars(LATIN + _russian + _tatar + _greek + _ukrainian + _uncased)

View File

@@ -5,47 +5,47 @@ from ...attrs import LIKE_NUM
_num_words = [ _num_words = [
"영", "영",
"공", "공",
# Native Korean number system # Native Korean number system
"하나", "하나",
"둘", "둘",
"셋", "셋",
"넷", "넷",
"다섯", "다섯",
"여섯", "여섯",
"일곱", "일곱",
"여덟", "여덟",
"아홉", "아홉",
"열", "열",
"스물", "스물",
"서른", "서른",
"마흔", "마흔",
"쉰", "쉰",
"예순", "예순",
"일흔", "일흔",
"여든", "여든",
"아흔", "아흔",
# Sino-Korean number system # Sino-Korean number system
"일", "일",
"이", "이",
"삼", "삼",
"사", "사",
"오", "오",
"육", "육",
"칠", "칠",
"팔", "팔",
"구", "구",
"십", "십",
"백", "백",
"천", "천",
"만", "만",
"십만", "십만",
"백만", "백만",
"천만", "천만",
"일억", "일억",
"십억", "십억",
"백억" "백억",
] ]

View File

@@ -6,9 +6,7 @@ from ...symbols import ORTH, LEMMA, NORM
# TODO # TODO
# treat other apostrophes within words as part of the word: [op d'mannst], [fir d'éischt] (= exceptions) # treat other apostrophes within words as part of the word: [op d'mannst], [fir d'éischt] (= exceptions)
_exc = { _exc = {}
}
# translate / delete what is not necessary # translate / delete what is not necessary
for exc_data in [ for exc_data in [

View File

@@ -14,6 +14,7 @@ from .tag_map import TAG_MAP
def try_jieba_import(use_jieba): def try_jieba_import(use_jieba):
try: try:
import jieba import jieba
return jieba return jieba
except ImportError: except ImportError:
if use_jieba: if use_jieba:
@@ -34,7 +35,9 @@ class ChineseTokenizer(DummyTokenizer):
def __call__(self, text): def __call__(self, text):
# use jieba # use jieba
if self.use_jieba: if self.use_jieba:
jieba_words = list([x for x in self.jieba_seg.cut(text, cut_all=False) if x]) jieba_words = list(
[x for x in self.jieba_seg.cut(text, cut_all=False) if x]
)
words = [jieba_words[0]] words = [jieba_words[0]]
spaces = [False] spaces = [False]
for i in range(1, len(jieba_words)): for i in range(1, len(jieba_words)):

View File

@@ -271,7 +271,9 @@ class Scorer(object):
self.labelled_per_dep[token.dep_.lower()] = PRFScore() self.labelled_per_dep[token.dep_.lower()] = PRFScore()
if token.dep_.lower() not in cand_deps_per_dep: if token.dep_.lower() not in cand_deps_per_dep:
cand_deps_per_dep[token.dep_.lower()] = set() cand_deps_per_dep[token.dep_.lower()] = set()
cand_deps_per_dep[token.dep_.lower()].add((gold_i, gold_head, token.dep_.lower())) cand_deps_per_dep[token.dep_.lower()].add(
(gold_i, gold_head, token.dep_.lower())
)
if "-" not in [token[-1] for token in gold.orig_annot]: if "-" not in [token[-1] for token in gold.orig_annot]:
# Find all NER labels in gold and doc # Find all NER labels in gold and doc
ent_labels = set([x[0] for x in gold_ents] + [k.label_ for k in doc.ents]) ent_labels = set([x[0] for x in gold_ents] + [k.label_ for k in doc.ents])
@@ -304,7 +306,9 @@ class Scorer(object):
self.tags.score_set(cand_tags, gold_tags) self.tags.score_set(cand_tags, gold_tags)
self.labelled.score_set(cand_deps, gold_deps) self.labelled.score_set(cand_deps, gold_deps)
for dep in self.labelled_per_dep: for dep in self.labelled_per_dep:
self.labelled_per_dep[dep].score_set(cand_deps_per_dep.get(dep, set()), gold_deps_per_dep.get(dep, set())) self.labelled_per_dep[dep].score_set(
cand_deps_per_dep.get(dep, set()), gold_deps_per_dep.get(dep, set())
)
self.unlabelled.score_set( self.unlabelled.score_set(
set(item[:2] for item in cand_deps), set(item[:2] for item in gold_deps) set(item[:2] for item in cand_deps), set(item[:2] for item in gold_deps)
) )

View File

@@ -11,8 +11,14 @@ def test_issue4590(en_vocab):
"""Test that matches param in on_match method are the same as matches run with no on_match method""" """Test that matches param in on_match method are the same as matches run with no on_match method"""
pattern = [ pattern = [
{"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}}, {"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}},
{"SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"}, "PATTERN": {"ORTH": "fox"}}, {
{"SPEC": {"NODE_NAME": "quick", "NBOR_RELOP": ".", "NBOR_NAME": "jumped"}, "PATTERN": {"ORTH": "fox"}}, "SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"},
"PATTERN": {"ORTH": "fox"},
},
{
"SPEC": {"NODE_NAME": "quick", "NBOR_RELOP": ".", "NBOR_NAME": "jumped"},
"PATTERN": {"ORTH": "fox"},
},
] ]
on_match = Mock() on_match = Mock()
@@ -23,12 +29,11 @@ def test_issue4590(en_vocab):
text = "The quick brown fox jumped over the lazy fox" text = "The quick brown fox jumped over the lazy fox"
heads = [3, 2, 1, 1, 0, -1, 2, 1, -3] heads = [3, 2, 1, 1, 0, -1, 2, 1, -3]
deps = ["det", "amod", "amod", "nsubj", "prep", "pobj", "det", "amod"] deps = ["det", "amod", "amod", "nsubj", "prep", "pobj", "det", "amod"]
doc = get_doc(en_vocab, text.split(), heads=heads, deps=deps) doc = get_doc(en_vocab, text.split(), heads=heads, deps=deps)
matches = matcher(doc) matches = matcher(doc)
on_match_args = on_match.call_args on_match_args = on_match.call_args
assert on_match_args[0][3] == matches assert on_match_args[0][3] == matches

View File

@@ -12,8 +12,22 @@ from .util import get_doc
test_las_apple = [ test_las_apple = [
[ [
"Apple is looking at buying U.K. startup for $ 1 billion", "Apple is looking at buying U.K. startup for $ 1 billion",
{"heads": [2, 2, 2, 2, 3, 6, 4, 4, 10, 10, 7], {
"deps": ['nsubj', 'aux', 'ROOT', 'prep', 'pcomp', 'compound', 'dobj', 'prep', 'quantmod', 'compound', 'pobj']}, "heads": [2, 2, 2, 2, 3, 6, 4, 4, 10, 10, 7],
"deps": [
"nsubj",
"aux",
"ROOT",
"prep",
"pcomp",
"compound",
"dobj",
"prep",
"quantmod",
"compound",
"pobj",
],
},
] ]
] ]
@@ -59,7 +73,7 @@ def test_las_per_type(en_vocab):
en_vocab, en_vocab,
words=input_.split(" "), words=input_.split(" "),
heads=([h - i for i, h in enumerate(annot["heads"])]), heads=([h - i for i, h in enumerate(annot["heads"])]),
deps=annot["deps"] deps=annot["deps"],
) )
gold = GoldParse(doc, heads=annot["heads"], deps=annot["deps"]) gold = GoldParse(doc, heads=annot["heads"], deps=annot["deps"])
doc[0].dep_ = "compound" doc[0].dep_ = "compound"