diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py
index 791b1ec33..056a6893b 100644
--- a/spacy/lang/ja/__init__.py
+++ b/spacy/lang/ja/__init__.py
@@ -39,8 +39,8 @@ def resolve_pos(token):
     """
 
     # this is only used for consecutive ascii spaces
-    if token.pos == '空白':
-        return '空白'
+    if token.pos == "空白":
+        return "空白"
 
     # TODO: This is a first take. The rules here are crude approximations.
     # For many of these, full dependencies are needed to properly resolve
@@ -78,7 +78,7 @@ def detailed_tokens(tokenizer, text):
         scount = node.next.rlength - node.next.length
         spaces.append(bool(scount))
         while scount > 1:
-            words.append(ShortUnitWord(' ', ' ', '空白'))
+            words.append(ShortUnitWord(" ", " ", "空白"))
             spaces.append(False)
             scount -= 1
 
diff --git a/spacy/tests/lang/ja/test_tokenizer.py b/spacy/tests/lang/ja/test_tokenizer.py
index 38ca37bc9..ad8bfaa00 100644
--- a/spacy/tests/lang/ja/test_tokenizer.py
+++ b/spacy/tests/lang/ja/test_tokenizer.py
@@ -48,8 +48,9 @@ def test_ja_tokenizer_pos(ja_tokenizer, text, expected_pos):
     pos = [token.pos_ for token in ja_tokenizer(text)]
     assert pos == expected_pos
 
+
 def test_extra_spaces(ja_tokenizer):
     # note: three spaces after "I"
     tokens = ja_tokenizer("I   like cheese.")
-    assert tokens[1].orth_ == ' '
-    assert tokens[2].orth_ == ' '
+    assert tokens[1].orth_ == " "
+    assert tokens[2].orth_ == " "
diff --git a/spacy/tests/lang/lt/test_lemmatizer.py b/spacy/tests/lang/lt/test_lemmatizer.py
index 5c3ed34f8..b98d63935 100644
--- a/spacy/tests/lang/lt/test_lemmatizer.py
+++ b/spacy/tests/lang/lt/test_lemmatizer.py
@@ -17,4 +17,6 @@ TEST_CASES = [
 
 @pytest.mark.parametrize("tokens,lemmas", TEST_CASES)
 def test_lt_lemmatizer(lt_lemmatizer, tokens, lemmas):
-    assert lemmas == [lt_lemmatizer.lookup_table.get_string(token, token) for token in tokens]
+    assert lemmas == [
+        lt_lemmatizer.lookup_table.get_string(token, token) for token in tokens
+    ]
diff --git a/spacy/tests/vocab_vectors/test_lookups.py b/spacy/tests/vocab_vectors/test_lookups.py
index 7cdf8ff68..daab5e585 100644
--- a/spacy/tests/vocab_vectors/test_lookups.py
+++ b/spacy/tests/vocab_vectors/test_lookups.py
@@ -78,7 +78,6 @@ def test_lookups_to_from_disk():
     assert table2.get_string("b") == 2
 
 
-
 # This fails on Python 3.5
 @pytest.mark.xfail
 def test_lookups_to_from_bytes_via_vocab():
@@ -97,6 +96,7 @@ def test_lookups_to_from_bytes_via_vocab():
     assert table.get_string("hello") == "world"
     assert new_vocab.to_bytes() == vocab_bytes
 
+
 # This fails on Python 3.5
 @pytest.mark.xfail
 def test_lookups_to_from_disk_via_vocab():
diff --git a/website/docs/api/annotation.md b/website/docs/api/annotation.md
index 2c52d197a..f44019752 100644
--- a/website/docs/api/annotation.md
+++ b/website/docs/api/annotation.md
@@ -80,8 +80,8 @@ training corpus and can be defined in the respective language data's
 
-spaCy also maps all language-specific part-of-speech tags to a small, fixed set
-of word type tags following the
+spaCy maps all language-specific part-of-speech tags to a small, fixed set of
+word type tags following the
 [Universal Dependencies scheme](http://universaldependencies.org/u/pos/). The
 universal tags don't code for any morphological features and only cover the
 word type. They're available as the [`Token.pos`](/api/token#attributes) and
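
For context on the annotation.md paragraph edited above: it describes how spaCy exposes the Universal Dependencies mapping through `Token.pos` / `Token.pos_` alongside the fine-grained, corpus-specific `Token.tag_`. A minimal sketch of what that looks like in practice, assuming a trained English pipeline such as `en_core_web_sm` is installed (the model name is only an example, not part of this patch):

    import spacy

    # Load any trained pipeline that includes a tagger.
    nlp = spacy.load("en_core_web_sm")
    doc = nlp("I like cheese.")

    for token in doc:
        # tag_ is the language-specific tag from the training corpus;
        # pos_ is the coarse Universal Dependencies tag described in annotation.md.
        print(token.text, token.tag_, token.pos_)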