Mirror of https://github.com/explosion/spaCy.git
Tidy up and auto-format [ci skip]
parent bcbb9f5119
commit 3126dd0904
@@ -39,8 +39,8 @@ def resolve_pos(token):
     """

     # this is only used for consecutive ascii spaces
-    if token.pos == '空白':
-        return '空白'
+    if token.pos == "空白":
+        return "空白"

     # TODO: This is a first take. The rules here are crude approximations.
     # For many of these, full dependencies are needed to properly resolve
@@ -78,7 +78,7 @@ def detailed_tokens(tokenizer, text):
         scount = node.next.rlength - node.next.length
         spaces.append(bool(scount))
         while scount > 1:
-            words.append(ShortUnitWord(' ', ' ', '空白'))
+            words.append(ShortUnitWord(" ", " ", "空白"))
             spaces.append(False)
             scount -= 1

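For context on the whitespace handling touched here: a run of ASCII spaces reported by MeCab is folded into the previous word's trailing-space flag plus standalone 空白 (whitespace) words, so the word and space lists stay aligned. Below is a minimal, self-contained sketch of just that expansion step; ShortUnitWord is recreated as a plain namedtuple, and the helper name and placeholder POS values are illustrative, not the actual spaCy implementation.

from collections import namedtuple

# Simplified stand-in for the tuple used in the Japanese language data; the real
# code derives the space count from MeCab node lengths
# (node.next.rlength - node.next.length), as shown in the hunk above.
ShortUnitWord = namedtuple("ShortUnitWord", ["surface", "lemma", "pos"])


def expand_trailing_spaces(words, spaces, scount):
    # One trailing space is recorded as the `spaces` flag of the current word;
    # each additional space becomes its own 空白 word with no trailing space.
    spaces.append(bool(scount))
    while scount > 1:
        words.append(ShortUnitWord(" ", " ", "空白"))
        spaces.append(False)
        scount -= 1


words = [ShortUnitWord("I", "I", "dummy-pos")]  # placeholder POS, not a real Unidic tag
spaces = []
expand_trailing_spaces(words, spaces, 3)  # three spaces after "I", as in the test below
print(len(words), spaces)  # -> 3 [True, False, False]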
@@ -48,8 +48,9 @@ def test_ja_tokenizer_pos(ja_tokenizer, text, expected_pos):
     pos = [token.pos_ for token in ja_tokenizer(text)]
     assert pos == expected_pos

+
 def test_extra_spaces(ja_tokenizer):
     # note: three spaces after "I"
     tokens = ja_tokenizer("I   like cheese.")
-    assert tokens[1].orth_ == ' '
-    assert tokens[2].orth_ == ' '
+    assert tokens[1].orth_ == " "
+    assert tokens[2].orth_ == " "
@@ -17,4 +17,6 @@ TEST_CASES = [

 @pytest.mark.parametrize("tokens,lemmas", TEST_CASES)
 def test_lt_lemmatizer(lt_lemmatizer, tokens, lemmas):
-    assert lemmas == [lt_lemmatizer.lookup_table.get_string(token, token) for token in tokens]
+    assert lemmas == [
+        lt_lemmatizer.lookup_table.get_string(token, token) for token in tokens
+    ]
@@ -78,7 +78,6 @@ def test_lookups_to_from_disk():
     assert table2.get_string("b") == 2


-
 # This fails on Python 3.5
 @pytest.mark.xfail
 def test_lookups_to_from_bytes_via_vocab():
@@ -97,6 +96,7 @@ def test_lookups_to_from_bytes_via_vocab():
     assert table.get_string("hello") == "world"
     assert new_vocab.to_bytes() == vocab_bytes

+
 # This fails on Python 3.5
 @pytest.mark.xfail
 def test_lookups_to_from_disk_via_vocab():
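These lookups tests revolve around serializing lookup tables and getting identical data back. The snippet below is a minimal sketch of that round trip, assuming the spacy.lookups API as exercised by the tests above (Lookups.add_table, to_bytes/from_bytes, get_table, and Table.get_string); it is illustrative for this point in spaCy's history, and the table name and contents here are arbitrary.

from spacy.lookups import Lookups

# Build a Lookups container with one table; name and data are just examples.
lookups = Lookups()
lookups.add_table("some_table", {"a": 1, "b": 2, "hello": "world"})

# Serialize to bytes and restore into a fresh container.
lookups_bytes = lookups.to_bytes()
new_lookups = Lookups()
new_lookups.from_bytes(lookups_bytes)

# The restored table answers the same queries the tests assert on.
table = new_lookups.get_table("some_table")
assert table.get_string("b") == 2
assert table.get_string("hello") == "world"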
@@ -80,8 +80,8 @@ training corpus and can be defined in the respective language data's

 <Accordion title="Universal Part-of-speech Tags" id="pos-universal">

-spaCy also maps all language-specific part-of-speech tags to a small, fixed set
-of word type tags following the
+spaCy maps all language-specific part-of-speech tags to a small, fixed set of
+word type tags following the
 [Universal Dependencies scheme](http://universaldependencies.org/u/pos/). The
 universal tags don't code for any morphological features and only cover the word
 type. They're available as the [`Token.pos`](/api/token#attributes) and
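To make the documentation passage above concrete, here is a small illustrative snippet contrasting the coarse universal tag with the fine-grained, treebank-specific one; the pipeline name "en_core_web_sm" is only an example and has to be installed separately.

import spacy

nlp = spacy.load("en_core_web_sm")  # any installed pipeline with a tagger works
doc = nlp("Apple is looking at buying a U.K. startup.")
for token in doc:
    # token.pos_ is the universal tag (e.g. PROPN, VERB); token.tag_ is the
    # fine-grained, language-specific tag (e.g. NNP, VBZ for English).
    print(token.text, token.pos_, token.tag_)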