Tidy up and auto-format [ci skip]

Ines Montani 2019-09-14 12:58:06 +02:00
parent bcbb9f5119
commit 3126dd0904
5 changed files with 12 additions and 9 deletions

View File

@@ -39,8 +39,8 @@ def resolve_pos(token):
     """
     # this is only used for consecutive ascii spaces
-    if token.pos == '空白':
-        return '空白'
+    if token.pos == "空白":
+        return "空白"
     # TODO: This is a first take. The rules here are crude approximations.
     # For many of these, full dependencies are needed to properly resolve
@@ -78,7 +78,7 @@ def detailed_tokens(tokenizer, text):
         scount = node.next.rlength - node.next.length
         spaces.append(bool(scount))
         while scount > 1:
-            words.append(ShortUnitWord(' ', ' ', '空白'))
+            words.append(ShortUnitWord(" ", " ", "空白"))
             spaces.append(False)
             scount -= 1
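Note: the hunks above only swap quote style, but for context, here is a minimal standalone sketch of the space-padding logic they touch. ShortUnitWord and its field names are an assumption modeled on how the tuple is constructed in the diff, not the module's actual definition:

from collections import namedtuple

# Assumed shape of the word tuple; the real definition lives in the
# Japanese language data and may differ.
ShortUnitWord = namedtuple("ShortUnitWord", ["surface", "lemma", "pos"])

def pad_extra_spaces(words, spaces, scount):
    # One trailing space is recorded as whitespace on the previous token;
    # every additional space becomes its own 空白 ("whitespace") token.
    spaces.append(bool(scount))
    while scount > 1:
        words.append(ShortUnitWord(" ", " ", "空白"))
        spaces.append(False)
        scount -= 1
    return words, spaces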

View File

@@ -48,8 +48,9 @@ def test_ja_tokenizer_pos(ja_tokenizer, text, expected_pos):
     pos = [token.pos_ for token in ja_tokenizer(text)]
     assert pos == expected_pos
 
+
 def test_extra_spaces(ja_tokenizer):
     # note: three spaces after "I"
     tokens = ja_tokenizer("I   like cheese.")
-    assert tokens[1].orth_ == ' '
-    assert tokens[2].orth_ == ' '
+    assert tokens[1].orth_ == " "
+    assert tokens[2].orth_ == " "
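For reference, a hedged sketch of the behavior this test exercises; it assumes a Japanese pipeline whose tokenizer backend (MeCab at the time of this commit) is installed, which the snippet does not verify:

from spacy.lang.ja import Japanese

nlp = Japanese()  # raises if the tokenizer dependency is missing
doc = nlp("I   like cheese.")  # three spaces after "I"
# One space stays attached to "I" as trailing whitespace; the two extra
# spaces surface as standalone single-space tokens at indices 1 and 2.
print([t.orth_ for t in doc])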

View File

@@ -17,4 +17,6 @@ TEST_CASES = [
 
 @pytest.mark.parametrize("tokens,lemmas", TEST_CASES)
 def test_lt_lemmatizer(lt_lemmatizer, tokens, lemmas):
-    assert lemmas == [lt_lemmatizer.lookup_table.get_string(token, token) for token in tokens]
+    assert lemmas == [
+        lt_lemmatizer.lookup_table.get_string(token, token) for token in tokens
+    ]
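The reflowed assertion above looks up each token with the token itself as the default. A minimal sketch of that fallback pattern using a plain dict; the table entries here are invented for illustration, the real ones come from the Lithuanian lemma data:

# Hypothetical lookup table standing in for the real lemma data.
LOOKUP = {"vyrai": "vyras", "moterys": "moteris"}

def lemmatize(token):
    # Fall back to the surface form when the table has no entry.
    return LOOKUP.get(token, token)

assert lemmatize("vyrai") == "vyras"
assert lemmatize("nežinomas") == "nežinomas"  # unknown word passes through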

View File

@@ -78,7 +78,6 @@ def test_lookups_to_from_disk():
     assert table2.get_string("b") == 2
 
 
-# This fails on Python 3.5
 @pytest.mark.xfail
 def test_lookups_to_from_bytes_via_vocab():
@@ -97,6 +96,7 @@ def test_lookups_to_from_bytes_via_vocab():
     assert table.get_string("hello") == "world"
     assert new_vocab.to_bytes() == vocab_bytes
 
 
+# This fails on Python 3.5
 @pytest.mark.xfail
 def test_lookups_to_from_disk_via_vocab():
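Both hunks keep the @pytest.mark.xfail marker while the "fails on Python 3.5" comment moves from one test to the other. A small self-contained sketch of how xfail behaves; the test body and reason string are invented for illustration:

import pytest

@pytest.mark.xfail(reason="serialization via Vocab fails on Python 3.5")
def test_expected_failure():
    # Reported as XFAIL rather than a failure; an unexpected pass
    # shows up as XPASS in the test summary.
    assert False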

View File

@@ -80,8 +80,8 @@ training corpus and can be defined in the respective language data's
 
 <Accordion title="Universal Part-of-speech Tags" id="pos-universal">
 
-spaCy also maps all language-specific part-of-speech tags to a small, fixed set
-of word type tags following the
+spaCy maps all language-specific part-of-speech tags to a small, fixed set of
+word type tags following the
 [Universal Dependencies scheme](http://universaldependencies.org/u/pos/). The
 universal tags don't code for any morphological features and only cover the word
 type. They're available as the [`Token.pos`](/api/token#attributes) and
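The docs passage describes the coarse universal tags exposed alongside the fine-grained ones. A quick hedged sketch of reading both attributes; it assumes the small English model is installed:

import spacy

nlp = spacy.load("en_core_web_sm")  # assumes this model is installed
doc = nlp("She ate the pizza")
for token in doc:
    # token.pos_ is the universal tag, token.tag_ the fine-grained one.
    print(token.text, token.pos_, token.tag_)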