Tidy up and auto-format [ci skip]

Ines Montani 2019-09-14 12:58:06 +02:00
parent bcbb9f5119
commit 3126dd0904
5 changed files with 12 additions and 9 deletions

View File

@@ -39,8 +39,8 @@ def resolve_pos(token):
     """

     # this is only used for consecutive ascii spaces
-    if token.pos == '空白':
-        return '空白'
+    if token.pos == "空白":
+        return "空白"

     # TODO: This is a first take. The rules here are crude approximations.
     # For many of these, full dependencies are needed to properly resolve
@@ -78,7 +78,7 @@ def detailed_tokens(tokenizer, text):
         scount = node.next.rlength - node.next.length
         spaces.append(bool(scount))
         while scount > 1:
-            words.append(ShortUnitWord(' ', ' ', '空白'))
+            words.append(ShortUnitWord(" ", " ", "空白"))
             spaces.append(False)
             scount -= 1

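For reference, a minimal standalone sketch of the whitespace handling in the hunk above: each space beyond the first between two tokens is emitted as its own single-space 空白 token. ShortUnitWord is stubbed as a namedtuple and scount stands in for MeCab's rlength - length whitespace count; this is an illustration, not the module itself.

from collections import namedtuple

# Stand-in for the tuple used by the tokenizer code above.
ShortUnitWord = namedtuple("ShortUnitWord", ["surface", "lemma", "pos"])


def extra_space_tokens(scount):
    # One explicit single-space 空白 token per space beyond the first;
    # the first space is only recorded as a trailing-whitespace flag.
    words = []
    while scount > 1:
        words.append(ShortUnitWord(" ", " ", "空白"))
        scount -= 1
    return words


assert len(extra_space_tokens(3)) == 2  # three spaces -> two space tokens

This is also why the tokenizer test below expects tokens[1] and tokens[2] to each be a single space when the input has three spaces after "I".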

View File

@@ -48,8 +48,9 @@ def test_ja_tokenizer_pos(ja_tokenizer, text, expected_pos):
     pos = [token.pos_ for token in ja_tokenizer(text)]
     assert pos == expected_pos

+
 def test_extra_spaces(ja_tokenizer):
     # note: three spaces after "I"
     tokens = ja_tokenizer("I   like cheese.")
-    assert tokens[1].orth_ == ' '
-    assert tokens[2].orth_ == ' '
+    assert tokens[1].orth_ == " "
+    assert tokens[2].orth_ == " "

View File

@@ -17,4 +17,6 @@ TEST_CASES = [

 @pytest.mark.parametrize("tokens,lemmas", TEST_CASES)
 def test_lt_lemmatizer(lt_lemmatizer, tokens, lemmas):
-    assert lemmas == [lt_lemmatizer.lookup_table.get_string(token, token) for token in tokens]
+    assert lemmas == [
+        lt_lemmatizer.lookup_table.get_string(token, token) for token in tokens
+    ]
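The Lithuanian lemmatizer test above relies on a lookup table where a missing entry falls back to the surface form itself (the second argument to get_string). A tiny framework-free sketch of that fallback pattern; the two table entries are invented placeholders, not real Lithuanian data:

# Invented toy table; the real one ships with the Lithuanian language data.
LEMMA_LOOKUP = {"geese": "goose", "better": "good"}


def lookup_lemma(token):
    # Mirrors get_string(token, token): unknown tokens lemmatize to themselves.
    return LEMMA_LOOKUP.get(token, token)


assert lookup_lemma("geese") == "goose"
assert lookup_lemma("spaCy") == "spaCy"  # not in the table -> unchanged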

View File

@@ -78,7 +78,6 @@ def test_lookups_to_from_disk():
     assert table2.get_string("b") == 2


-
 # This fails on Python 3.5
 @pytest.mark.xfail
 def test_lookups_to_from_bytes_via_vocab():
@@ -97,6 +96,7 @@ def test_lookups_to_from_bytes_via_vocab():
     assert table.get_string("hello") == "world"
     assert new_vocab.to_bytes() == vocab_bytes

+
 # This fails on Python 3.5
 @pytest.mark.xfail
 def test_lookups_to_from_disk_via_vocab():
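Both xfail tests above check the same round-trip property: serialize the lookups (directly or via the vocab), load them back, and expect identical entries and an identical re-serialized payload. A framework-free sketch of that property, using a plain dict and json in place of spaCy's Lookups/Vocab API:

import json

table = {"hello": "world", "a": 1, "b": 2}
data = json.dumps(table).encode("utf8")  # to_bytes analogue
table2 = json.loads(data)  # from_bytes analogue
assert table2["hello"] == "world"
assert json.dumps(table2).encode("utf8") == data  # same payload after reload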

View File

@@ -80,8 +80,8 @@ training corpus and can be defined in the respective language data's

 <Accordion title="Universal Part-of-speech Tags" id="pos-universal">

-spaCy also maps all language-specific part-of-speech tags to a small, fixed set
-of word type tags following the
+spaCy maps all language-specific part-of-speech tags to a small, fixed set of
+word type tags following the
 [Universal Dependencies scheme](http://universaldependencies.org/u/pos/). The
 universal tags don't code for any morphological features and only cover the word
 type. They're available as the [`Token.pos`](/api/token#attributes) and
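A quick illustration of the attributes this passage describes, assuming the small English pipeline is installed (python -m spacy download en_core_web_sm); any pretrained pipeline works the same way:

import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("I like cheese.")
for token in doc:
    # pos_ is the coarse Universal Dependencies tag; tag_ is the fine-grained,
    # language-specific tag it was mapped from.
    print(token.text, token.pos_, token.tag_)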