Add details on syntax iterators

2025-12-18 23:54:35 +03:00 · 2017-06-04 23:16:33 +02:00 · 2017-06-04 23:16:33 +02:00 · e9816daa6a
commit e9816daa6a
parent 8a29308d0b
1 changed files with 35 additions and 0 deletions
--- a/website/docs/usage/adding-languages.jade
+++ b/website/docs/usage/adding-languages.jade
@@ -42,6 +42,7 @@ p
        +item #[+a("#tokenizer-exceptions") Tokenizer exceptions]
        +item #[+a("#norm-exceptions") Norm exceptions]
        +item #[+a("#lex-attrs") Lexical attributes]
+        +item #[+a("#syntax-iterators") Syntax iterators]
        +item #[+a("#lemmatizer") Lemmatizer]
        +item #[+a("#tag-map") Tag map]
        +item #[+a("#morph-rules") Morph rules]
@@ -104,6 +105,13 @@ p
        +cell dict
        +cell Attribute ID mapped to function.

+    +row
+        +cell #[code SYNTAX_ITERATORS]
+        +cell dict
+        +cell
+            |  Iterator ID mapped to function. Currently only supports
+            |  #[code 'noun_chunks'].
+
    +row
        +cell #[code LOOKUP]
        +cell dict
@@ -449,6 +457,33 @@ p
    |  #[code lex_attr_getters.update(LEX_ATTRS)], only the new custom functions
    |  are overwritten.

++h(3, "syntax-iterators") Syntax iterators
+
+p
+    |  Syntax iterators are functions that compute views of a #[code Doc]
+    |  object based on its syntax. At the moment, they are only used for
+    |  extracting
+    |  #[+a("/docs/usage/dependency-parse#noun-chunks") noun chunks], which
+    |  are available as the #[+api("doc#noun_chunks") #[code Doc.noun_chunks]]
+    |  property. Because base noun phrases work differently across languages,
+    |  the rules to compute them are part of the individual language's data. If
+    |  a language does not include a noun chunks iterator, the property won't
+    |  be available. For examples, see the existing syntax iterators:
+
++aside-code("Noun chunks example").
+    doc = nlp(u'A phrase with another phrase occurs.')
+    chunks = list(doc.noun_chunks)
+    assert chunks[0].text == "A phrase"
+    assert chunks[1].text == "another phrase"
+
++table(["Language", "Source"])
+    for lang, lang_id in {en: "English", de: "German", es: "Spanish"}
+        +row
+            +cell=lang
+            +cell
+                +src(gh("spaCy", "spacy/lang/" + lang_id + "/syntax_iterators.py"))
+                    |  lang/#{lang_id}/syntax_iterators.py
+
 +h(3, "lemmatizer") Lemmatizer

 p