Document lemmatizer

2025-11-02 00:47:52 +03:00 · 2017-10-24 15:51:05 +02:00 · 2017-10-24 15:51:05 +02:00 · 3944c1d6e7
commit 3944c1d6e7
parent c9dc88ddfc
2 changed files with 159 additions and 2 deletions
--- a/website/api/_data.json
+++ b/website/api/_data.json
@ -160,7 +160,9 @@
    "lemmatizer": {
        "title": "Lemmatizer",
-        "tag": "class"
+        "teaser": "Assign the base forms of words.",
        "tag": "class",
        "source": "spacy/lemmatizer.py"
    },
    "tagger": {
--- a/website/api/lemmatizer.jade
+++ b/website/api/lemmatizer.jade
@ -2,4 +2,159 @@
 include ../_includes/_mixins
-+under-construction
+p
    |  The #[code Lemmatizer] supports simple part-of-speech-sensitive suffix
    |  rules and lookup tables.
 +h(2, "init") Lemmatizer.__init__
    +tag method
 p Create a #[code Lemmatizer].
 +aside-code("Example").
    from spacy.lemmatizer import Lemmatizer
    lemmatizer = Lemmatizer()
 +table(["Name", "Type", "Description"])
    +row
        +cell #[code index]
        +cell dict / #[code None]
        +cell Inventory of lemmas in the language.
    +row
        +cell #[code exceptions]
        +cell dict / #[code None]
        +cell Mapping of string forms to lemmas that bypass the #[code rules].
    +row
        +cell #[code rules]
        +cell dict / #[code None]
        +cell List of suffix rewrite rules.
    +row
        +cell #[code lookup]
        +cell dict / #[code None]
        +cell Lookup table mapping string to their lemmas.
    +row("foot")
        +cell returns
        +cell #[code Lemmatizer]
        +cell The newly created object.
 +h(2, "call") Lemmatizer.__call__
    +tag method
 p Lemmatize a string.
 +aside-code("Example").
    from spacy.lemmatizer import Lemmatizer
    from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES
    lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
    lemmas = lemmatizer(u'ducks', u'NOUN')
    assert lemmas == [u'duck']
 +table(["Name", "Type", "Description"])
    +row
        +cell #[code string]
        +cell unicode
        +cell The string to lemmatize, e.g. the token text.
    +row
        +cell #[code univ_pos]
        +cell unicode / int
        +cell The token's universal part-of-speech tag.
    +row
        +cell #[code morphology]
        +cell dict / #[code None]
        +cell
            |  Morphological features following the
            |  #[+a("http://universaldependencies.org/") Universal Dependencies]
            |  scheme.
    +row("foot")
        +cell returns
        +cell list
        +cell The available lemmas for the string.
 +h(2, "lookup") Lemmatizer.lookup
    +tag method
    +tag-new(2)
 p
    |  Look up a lemma in the lookup table, if available. If no lemma is found,
    |  the original string is returned. Languages can provide a
    |  #[+a("/usage/adding-languages#lemmatizer") lookup table] via the
    |  #[code lemma_lookup] variable, set on the individual #[code Language]
    |  class.
 +aside-code("Example").
    lookup = {u'going': u'go'}
    lemmatizer = Lemmatizer(lookup=lookup)
    assert lemmatizer.lookup(u'going') == u'go'
 +table(["Name", "Type", "Description"])
    +row
        +cell #[code string]
        +cell unicode
        +cell The string to look up.
    +row("foot")
        +cell returns
        +cell unicode
        +cell The lemma if the string was found, otherwise the original string.
 +h(2, "is_base_form") Lemmatizer.is_base_form
    +tag method
 p
    |  Check whether we're dealing with an uninflected paradigm, so we can
    |  avoid lemmatization entirely.
 +aside-code("Example").
    pos = 'verb'
    morph = {'VerbForm': 'inf'}
    is_base_form = lemmatizer.is_base_form(pos, morph)
    assert is_base_form == True
 +table(["Name", "Type", "Description"])
    +row
        +cell #[code univ_pos]
        +cell unicode / int
        +cell The token's universal part-of-speech tag.
    +row
        +cell #[code morphology]
        +cell dict
        +cell The token's morphological features.
    +row("foot")
        +cell returns
        +cell bool
        +cell
            |  Whether the token's part-of-speech tag and morphological features
            |  describe a base form.
 +h(2, "attributes") Attributes
 +table(["Name", "Type", "Description"])
    +row
        +cell #[code index]
        +cell dict / #[code None]
        +cell Inventory of lemmas in the language.
    +row
        +cell #[code exc]
        +cell dict / #[code None]
        +cell Mapping of string forms to lemmas that bypass the #[code rules].
    +row
        +cell #[code rules]
        +cell dict / #[code None]
        +cell List of suffix rewrite rules.
    +row
        +cell #[code lookup_table]
            +tag-new(2)
        +cell dict / #[code None]
        +cell The lemma lookup table, if available.