mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-09 16:58:17 +03:00
Merge branch 'master' into spacy.io
This commit is contained in:
commit
61eaaac98d
|
@ -58,7 +58,8 @@ def check_spaces(text, tokens):
|
||||||
yield prev_end != idx
|
yield prev_end != idx
|
||||||
prev_end = idx + len(token)
|
prev_end = idx + len(token)
|
||||||
start = prev_end
|
start = prev_end
|
||||||
yield False
|
if start > 0:
|
||||||
|
yield False
|
||||||
|
|
||||||
|
|
||||||
class KoreanTokenizer(DummyTokenizer):
|
class KoreanTokenizer(DummyTokenizer):
|
||||||
|
|
|
@ -45,3 +45,8 @@ def test_ko_tokenizer_full_tags(ko_tokenizer, text, expected_tags):
|
||||||
def test_ko_tokenizer_pos(ko_tokenizer, text, expected_pos):
|
def test_ko_tokenizer_pos(ko_tokenizer, text, expected_pos):
|
||||||
pos = [token.pos_ for token in ko_tokenizer(text)]
|
pos = [token.pos_ for token in ko_tokenizer(text)]
|
||||||
assert pos == expected_pos.split()
|
assert pos == expected_pos.split()
|
||||||
|
|
||||||
|
|
||||||
|
def test_ko_empty_doc(ko_tokenizer):
|
||||||
|
tokens = ko_tokenizer("")
|
||||||
|
assert len(tokens) == 0
|
||||||
|
|
|
@ -26,6 +26,14 @@ import PosDeps101 from 'usage/101/\_pos-deps.md'
|
||||||
|
|
||||||
<PosDeps101 />
|
<PosDeps101 />
|
||||||
|
|
||||||
|
<Infobox title="📖 Part-of-speech tag scheme">
|
||||||
|
|
||||||
|
For a list of the fine-grained and coarse-grained part-of-speech tags assigned
|
||||||
|
by spaCy's models across different languages, see the
|
||||||
|
[POS tag scheme documentation](/api/annotation#pos-tagging).
|
||||||
|
|
||||||
|
</Infobox>
|
||||||
|
|
||||||
### Rule-based morphology {#rule-based-morphology}
|
### Rule-based morphology {#rule-based-morphology}
|
||||||
|
|
||||||
Inflectional morphology is the process by which a root form of a word is
|
Inflectional morphology is the process by which a root form of a word is
|
||||||
|
@ -61,14 +69,7 @@ of the two. The system works as follows:
|
||||||
morphological information, without consulting the context of the token. The
|
morphological information, without consulting the context of the token. The
|
||||||
lemmatizer also accepts list-based exception files, acquired from
|
lemmatizer also accepts list-based exception files, acquired from
|
||||||
[WordNet](https://wordnet.princeton.edu/).
|
[WordNet](https://wordnet.princeton.edu/).
|
||||||
|
|
||||||
<Infobox title="📖 Part-of-speech tag scheme">
|
|
||||||
|
|
||||||
For a list of the fine-grained and coarse-grained part-of-speech tags assigned
|
|
||||||
by spaCy's models across different languages, see the
|
|
||||||
[POS tag scheme documentation](/api/annotation#pos-tagging).
|
|
||||||
|
|
||||||
</Infobox>
|
|
||||||
|
|
||||||
## Dependency Parsing {#dependency-parse model="parser"}
|
## Dependency Parsing {#dependency-parse model="parser"}
|
||||||
|
|
||||||
|
@ -289,7 +290,7 @@ for token in doc:
|
||||||
|
|
||||||
For a list of the syntactic dependency labels assigned by spaCy's models across
|
For a list of the syntactic dependency labels assigned by spaCy's models across
|
||||||
different languages, see the
|
different languages, see the
|
||||||
[dependency label scheme documentation](/api/annotation#pos-tagging).
|
[dependency label scheme documentation](/api/annotation#dependency-parsing).
|
||||||
|
|
||||||
</Infobox>
|
</Infobox>
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user