Mirror of https://github.com/explosion/spaCy.git
Tidy up and auto-format [ci skip]
parent bcbb9f5119
commit 3126dd0904
@@ -39,8 +39,8 @@ def resolve_pos(token):
     """
 
     # this is only used for consecutive ascii spaces
-    if token.pos == '空白':
-        return '空白'
+    if token.pos == "空白":
+        return "空白"
 
     # TODO: This is a first take. The rules here are crude approximations.
     # For many of these, full dependencies are needed to properly resolve
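The guard above simply returns the dedicated 空白 tag before any of the cruder mapping rules run. A minimal standalone sketch of that behaviour, using a made-up FakeToken stand-in rather than a real MeCab node:

    from collections import namedtuple

    # FakeToken is a hypothetical stand-in for tokenizer output; only .pos matters here
    FakeToken = namedtuple("FakeToken", ["surface", "pos"])

    def resolve_pos_sketch(token):
        # consecutive ascii spaces keep their dedicated tag and skip the UD rules
        if token.pos == "空白":
            return "空白"
        # ...the crude UD-mapping approximations would follow here...
        return token.pos

    print(resolve_pos_sketch(FakeToken(" ", "空白")))       # 空白
    print(resolve_pos_sketch(FakeToken("犬", "名詞,一般")))  # falls through unchanged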
@@ -78,7 +78,7 @@ def detailed_tokens(tokenizer, text):
         scount = node.next.rlength - node.next.length
         spaces.append(bool(scount))
         while scount > 1:
-            words.append(ShortUnitWord(' ', ' ', '空白'))
+            words.append(ShortUnitWord(" ", " ", "空白"))
             spaces.append(False)
             scount -= 1
         node = node.next
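For context on the line being reformatted: MeCab reports each node's rlength (token plus any preceding ASCII whitespace, in bytes) alongside its length, so rlength - length counts the spaces before the next token. The first space becomes the trailing-space flag of the word just appended; every additional space is padded out as a standalone 空白 word. A self-contained sketch of that padding step (the real function reads these counts off MeCab nodes):

    from collections import namedtuple

    ShortUnitWord = namedtuple("ShortUnitWord", ["surface", "lemma", "pos"])

    def pad_spaces(words, spaces, scount):
        """Record scount preceding spaces: one trailing-space flag, then 空白 fillers."""
        spaces.append(bool(scount))  # the word already in `words` absorbs the first space
        while scount > 1:
            words.append(ShortUnitWord(" ", " ", "空白"))
            spaces.append(False)
            scount -= 1
        return words, spaces

    # made-up seed word; with three spaces we get one trailing-space flag plus two fillers
    words, spaces = pad_spaces([ShortUnitWord("日本", "日本", "名詞")], [], 3)
    print(words)   # the seed word followed by two standalone space words
    print(spaces)  # [True, False, False]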
@@ -48,8 +48,9 @@ def test_ja_tokenizer_pos(ja_tokenizer, text, expected_pos):
     pos = [token.pos_ for token in ja_tokenizer(text)]
     assert pos == expected_pos
 
+
 def test_extra_spaces(ja_tokenizer):
     # note: three spaces after "I"
     tokens = ja_tokenizer("I   like cheese.")
-    assert tokens[1].orth_ == ' '
-    assert tokens[2].orth_ == ' '
+    assert tokens[1].orth_ == " "
+    assert tokens[2].orth_ == " "
@@ -17,4 +17,6 @@ TEST_CASES = [
 
 @pytest.mark.parametrize("tokens,lemmas", TEST_CASES)
 def test_lt_lemmatizer(lt_lemmatizer, tokens, lemmas):
-    assert lemmas == [lt_lemmatizer.lookup_table.get_string(token, token) for token in tokens]
+    assert lemmas == [
+        lt_lemmatizer.lookup_table.get_string(token, token) for token in tokens
+    ]
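The assertion being re-wrapped is a table lookup with the surface form as fallback: get_string(token, token) returns the stored lemma when the token has an entry and the token itself otherwise. A rough standalone sketch with a made-up two-entry table (not the real Lithuanian lookup data):

    lookup = {"vyrai": "vyras", "buvo": "būti"}  # made-up miniature lookup table

    def lemmatize(tokens, table):
        # mirrors get_string(token, token): fall back to the surface form if missing
        return [table.get(token, token) for token in tokens]

    print(lemmatize(["vyrai", "buvo", "namas"], lookup))
    # ['vyras', 'būti', 'namas'] -- 'namas' has no entry and passes through unchanged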
@@ -78,7 +78,6 @@ def test_lookups_to_from_disk():
     assert table2.get_string("b") == 2
 
 
-
 # This fails on Python 3.5
 @pytest.mark.xfail
 def test_lookups_to_from_bytes_via_vocab():
@@ -97,6 +96,7 @@ def test_lookups_to_from_bytes_via_vocab():
     assert table.get_string("hello") == "world"
     assert new_vocab.to_bytes() == vocab_bytes
 
+
 # This fails on Python 3.5
 @pytest.mark.xfail
 def test_lookups_to_from_disk_via_vocab():
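Both xfail-marked tests exercise the same serialize/restore roundtrip for Lookups, once via bytes and once via disk, routed through the vocab. A minimal sketch of the plain bytes roundtrip using the public Lookups API; it uses dict-style indexing on the table, which may differ from the get_string accessor these tests call on this branch:

    from spacy.lookups import Lookups

    lookups = Lookups()
    lookups.add_table("my_table", {"hello": "world"})

    new_lookups = Lookups()
    new_lookups.from_bytes(lookups.to_bytes())  # serialize and restore

    table = new_lookups.get_table("my_table")
    assert table["hello"] == "world"
    assert len(new_lookups) == 1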
@@ -80,8 +80,8 @@ training corpus and can be defined in the respective language data's
 
 <Accordion title="Universal Part-of-speech Tags" id="pos-universal">
 
-spaCy also maps all language-specific part-of-speech tags to a small, fixed set
-of word type tags following the
+spaCy maps all language-specific part-of-speech tags to a small, fixed set of
+word type tags following the
 [Universal Dependencies scheme](http://universaldependencies.org/u/pos/). The
 universal tags don't code for any morphological features and only cover the word
 type. They're available as the [`Token.pos`](/api/token#attributes) and
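As a quick illustration of the paragraph being edited: the coarse universal tag is exposed as token.pos / token.pos_ next to the fine-grained treebank tag in token.tag_. This assumes an English model such as en_core_web_sm is installed; exact tags vary by model version:

    import spacy

    nlp = spacy.load("en_core_web_sm")
    doc = nlp("spaCy maps detailed tags to universal ones.")
    for token in doc:
        print(token.text, token.pos_, token.tag_)
    # e.g. "maps VERB VBZ" -- pos_ is the universal tag, tag_ the treebank tag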