Document lemmatizer

2026-02-08 08:19:45 +03:00 · 2017-10-24 15:51:05 +02:00 · 2017-10-24 15:51:05 +02:00 · 3944c1d6e7
commit 3944c1d6e7
parent c9dc88ddfc
2 changed files with 159 additions and 2 deletions
--- a/website/api/_data.json
+++ b/website/api/_data.json
@@ -160,7 +160,9 @@

    "lemmatizer": {
        "title": "Lemmatizer",
-        "tag": "class"
+        "teaser": "Assign the base forms of words.",
+        "tag": "class",
+        "source": "spacy/lemmatizer.py"
    },

    "tagger": {
--- a/website/api/lemmatizer.jade
+++ b/website/api/lemmatizer.jade
@@ -2,4 +2,159 @@

 include ../_includes/_mixins

-+under-construction
+p
+    |  The #[code Lemmatizer] supports simple part-of-speech-sensitive suffix
+    |  rules and lookup tables.
+
++h(2, "init") Lemmatizer.__init__
+    +tag method
+
+p Create a #[code Lemmatizer].
+
++aside-code("Example").
+    from spacy.lemmatizer import Lemmatizer
+    lemmatizer = Lemmatizer()
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code index]
+        +cell dict / #[code None]
+        +cell Inventory of lemmas in the language.
+
+    +row
+        +cell #[code exceptions]
+        +cell dict / #[code None]
+        +cell Mapping of string forms to lemmas that bypass the #[code rules].
+
+    +row
+        +cell #[code rules]
+        +cell dict / #[code None]
+        +cell Part-of-speech-sensitive suffix rewrite rules.
+
+    +row
+        +cell #[code lookup]
+        +cell dict / #[code None]
+        +cell Lookup table mapping strings to their lemmas.
+
+    +row("foot")
+        +cell returns
+        +cell #[code Lemmatizer]
+        +cell The newly created object.
+
++h(2, "call") Lemmatizer.__call__
+    +tag method
+
+p Lemmatize a string.
+
++aside-code("Example").
+    from spacy.lemmatizer import Lemmatizer
+    from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES
+    lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
+    lemmas = lemmatizer(u'ducks', u'NOUN')
+    assert lemmas == [u'duck']
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code string]
+        +cell unicode
+        +cell The string to lemmatize, e.g. the token text.
+
+    +row
+        +cell #[code univ_pos]
+        +cell unicode / int
+        +cell The token's universal part-of-speech tag.
+
+    +row
+        +cell #[code morphology]
+        +cell dict / #[code None]
+        +cell
+            |  Morphological features following the
+            |  #[+a("http://universaldependencies.org/") Universal Dependencies]
+            |  scheme.
+
+    +row("foot")
+        +cell returns
+        +cell list
+        +cell The available lemmas for the string.
+
++h(2, "lookup") Lemmatizer.lookup
+    +tag method
+    +tag-new(2)
+
+p
+    |  Look up a lemma in the lookup table, if available. If no lemma is found,
+    |  the original string is returned. Languages can provide a
+    |  #[+a("/usage/adding-languages#lemmatizer") lookup table] via the
+    |  #[code lemma_lookup] variable, set on the individual #[code Language]
+    |  class.
+
++aside-code("Example").
+    lookup = {u'going': u'go'}
+    lemmatizer = Lemmatizer(lookup=lookup)
+    assert lemmatizer.lookup(u'going') == u'go'
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code string]
+        +cell unicode
+        +cell The string to look up.
+
+    +row("foot")
+        +cell returns
+        +cell unicode
+        +cell The lemma if the string was found, otherwise the original string.
+
++h(2, "is_base_form") Lemmatizer.is_base_form
+    +tag method
+
+p
+    |  Check whether we're dealing with an uninflected paradigm, so we can
+    |  avoid lemmatization entirely.
+
++aside-code("Example").
+    pos = 'verb'
+    morph = {'VerbForm': 'inf'}
+    is_base_form = lemmatizer.is_base_form(pos, morph)
+    assert is_base_form == True
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code univ_pos]
+        +cell unicode / int
+        +cell The token's universal part-of-speech tag.
+
+    +row
+        +cell #[code morphology]
+        +cell dict
+        +cell The token's morphological features.
+
+    +row("foot")
+        +cell returns
+        +cell bool
+        +cell
+            |  Whether the token's part-of-speech tag and morphological features
+            |  describe a base form.
+
++h(2, "attributes") Attributes
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code index]
+        +cell dict / #[code None]
+        +cell Inventory of lemmas in the language.
+
+    +row
+        +cell #[code exc]
+        +cell dict / #[code None]
+        +cell Mapping of string forms to lemmas that bypass the #[code rules].
+
+    +row
+        +cell #[code rules]
+        +cell dict / #[code None]
+        +cell Part-of-speech-sensitive suffix rewrite rules.
+
+    +row
+        +cell #[code lookup_table]
+            +tag-new(2)
+        +cell dict / #[code None]
+        +cell The lemma lookup table, if available.