mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-11 20:28:20 +03:00
39 lines
1.5 KiB
Plaintext
39 lines
1.5 KiB
Plaintext
//- 💫 DOCS > USAGE > LINGUISTIC FEATURES
|
|
|
|
include ../_includes/_mixins
|
|
|
|
p
|
|
| Processing raw text intelligently is difficult: most words are rare, and
|
|
| it's common for words that look completely different to mean almost the
|
|
| same thing. The same words in a different order can mean something
|
|
| completely different. Even splitting text into useful word-like units can
|
|
| be difficult in many languages. While it's possible to solve some
|
|
| problems starting from only the raw characters, it's usually better to
|
|
| use linguistic knowledge to add useful information. That's exactly what
|
|
| spaCy is designed to do: you put in raw text, and get back a
|
|
| #[+api("doc") #[code Doc]] object that comes with a variety of
|
|
| annotations.
|
|
|
|
+section("pos-tagging")
|
|
+h(2, "pos-tagging") Part-of-speech tagging
|
|
+tag-model("tagger", "dependency parse")
|
|
include _linguistic-features/_pos-tagging
|
|
|
|
+section("dependency-parse")
|
|
+h(2, "dependency-parse") Dependency parsing
|
|
+tag-model("dependency parse")
|
|
include _linguistic-features/_dependency-parse
|
|
|
|
+section("named-entities")
|
|
+h(2, "named-entities") Named entities
|
|
+tag-model("named entities")
|
|
include _linguistic-features/_named-entities
|
|
|
|
+section("tokenization")
|
|
+h(2, "tokenization") Tokenization
|
|
include _linguistic-features/_tokenization
|
|
|
|
+section("rule-based-matching")
|
|
+h(2, "rule-based-matching") Rule-based matching
|
|
include _linguistic-features/_rule-based-matching
|