spaCy/website/api/lemmatizer.jade

//- 💫 DOCS > API > LEMMATIZER

include ../_includes/_mixins

p
    |  The #[code Lemmatizer] supports simple part-of-speech-sensitive suffix
    |  rules and lookup tables.

+h(2, "init") Lemmatizer.__init__
    +tag method

p Create a #[code Lemmatizer].

+aside-code("Example").
    from spacy.lemmatizer import Lemmatizer
    lemmatizer = Lemmatizer()

+table(["Name", "Type", "Description"])
    +row
        +cell #[code index]
        +cell dict / #[code None]
        +cell Inventory of lemmas in the language.

    +row
        +cell #[code exceptions]
        +cell dict / #[code None]
        +cell Mapping of string forms to lemmas that bypass the #[code rules].

    +row
        +cell #[code rules]
        +cell dict / #[code None]
        +cell Mapping of part-of-speech tags to lists of suffix rewrite rules.

    +row
        +cell #[code lookup]
        +cell dict / #[code None]
        +cell Lookup table mapping strings to their lemmas.

    +row("foot")
        +cell returns
        +cell #[code Lemmatizer]
        +cell The newly created object.

+h(2, "call") Lemmatizer.__call__
    +tag method

p Lemmatize a string.

+aside-code("Example").
    from spacy.lemmatizer import Lemmatizer
    from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES
    lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
    lemmas = lemmatizer(u'ducks', u'NOUN')
    assert lemmas == [u'duck']

+table(["Name", "Type", "Description"])
    +row
        +cell #[code string]
        +cell unicode
        +cell The string to lemmatize, e.g. the token text.

    +row
        +cell #[code univ_pos]
        +cell unicode / int
        +cell The token's universal part-of-speech tag.

    +row
        +cell #[code morphology]
        +cell dict / #[code None]
        +cell
            |  Morphological features following the
            |  #[+a("http://universaldependencies.org/") Universal Dependencies]
            |  scheme.

    +row("foot")
        +cell returns
        +cell list
        +cell The available lemmas for the string.

+h(2, "lookup") Lemmatizer.lookup
    +tag method
    +tag-new(2)

p
    |  Look up a lemma in the lookup table, if available. If no lemma is found,
    |  the original string is returned. Languages can provide a
    |  #[+a("/usage/adding-languages#lemmatizer") lookup table] via the
    |  #[code lemma_lookup] variable, set on the individual #[code Language]
    |  class.

+aside-code("Example").
    lookup = {u'going': u'go'}
    lemmatizer = Lemmatizer(lookup=lookup)
    assert lemmatizer.lookup(u'going') == u'go'

+table(["Name", "Type", "Description"])
    +row
        +cell #[code string]
        +cell unicode
        +cell The string to look up.

    +row("foot")
        +cell returns
        +cell unicode
        +cell The lemma if the string was found, otherwise the original string.

+h(2, "is_base_form") Lemmatizer.is_base_form
    +tag method

p
    |  Check whether we're dealing with an uninflected paradigm, so we can
    |  avoid lemmatization entirely.

+aside-code("Example").
    pos = 'verb'
    morph = {'VerbForm': 'inf'}
    is_base_form = lemmatizer.is_base_form(pos, morph)
    assert is_base_form == True

+table(["Name", "Type", "Description"])
    +row
        +cell #[code univ_pos]
        +cell unicode / int
        +cell The token's universal part-of-speech tag.

    +row
        +cell #[code morphology]
        +cell dict
        +cell The token's morphological features.

    +row("foot")
        +cell returns
        +cell bool
        +cell
            |  Whether the token's part-of-speech tag and morphological features
            |  describe a base form.

+h(2, "attributes") Attributes

+table(["Name", "Type", "Description"])
    +row
        +cell #[code index]
        +cell dict / #[code None]
        +cell Inventory of lemmas in the language.

    +row
        +cell #[code exc]
        +cell dict / #[code None]
        +cell Mapping of string forms to lemmas that bypass the #[code rules].

    +row
        +cell #[code rules]
        +cell dict / #[code None]
        +cell Mapping of part-of-speech tags to lists of suffix rewrite rules.

    +row
        +cell #[code lookup_table]
            +tag-new(2)
        +cell dict / #[code None]
        +cell The lemma lookup table, if available.
Update API documentation 2017-10-03 15:27:22 +03:00			`//- 💫 DOCS > API > LEMMATIZER`

			`include ../_includes/_mixins`

Document lemmatizer 2017-10-24 16:51:05 +03:00			`p`
			`\| The #[code Lemmatizer] supports simple part-of-speech-sensitive suffix`
			`\| rules and lookup tables.`

			`+h(2, "init") Lemmatizer.__init__`
			`+tag method`

			`p Create a #[code Lemmatizer].`

			`+aside-code("Example").`
			`from spacy.lemmatizer import Lemmatizer`
			`lemmatizer = Lemmatizer()`

			`+table(["Name", "Type", "Description"])`
			`+row`
			`+cell #[code index]`
			`+cell dict / #[code None]`
			`+cell Inventory of lemmas in the language.`

			`+row`
			`+cell #[code exceptions]`
			`+cell dict / #[code None]`
			`+cell Mapping of string forms to lemmas that bypass the #[code rules].`

			`+row`
			`+cell #[code rules]`
			`+cell dict / #[code None]`
			`+cell List of suffix rewrite rules.`

			`+row`
			`+cell #[code lookup]`
			`+cell dict / #[code None]`
			`+cell Lookup table mapping strings to their lemmas.`

			`+row("foot")`
			`+cell returns`
			`+cell #[code Lemmatizer]`
			`+cell The newly created object.`

			`+h(2, "call") Lemmatizer.__call__`
			`+tag method`

			`p Lemmatize a string.`

			`+aside-code("Example").`
			`from spacy.lemmatizer import Lemmatizer`
			`from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES`
			`lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)`
			`lemmas = lemmatizer(u'ducks', u'NOUN')`
			`assert lemmas == [u'duck']`

			`+table(["Name", "Type", "Description"])`
			`+row`
			`+cell #[code string]`
			`+cell unicode`
			`+cell The string to lemmatize, e.g. the token text.`

			`+row`
			`+cell #[code univ_pos]`
			`+cell unicode / int`
			`+cell The token's universal part-of-speech tag.`

			`+row`
			`+cell #[code morphology]`
			`+cell dict / #[code None]`
			`+cell`
			`\| Morphological features following the`
			`\| #[+a("http://universaldependencies.org/") Universal Dependencies]`
			`\| scheme.`

			`+row("foot")`
			`+cell returns`
			`+cell list`
			`+cell The available lemmas for the string.`

			`+h(2, "lookup") Lemmatizer.lookup`
			`+tag method`
			`+tag-new(2)`

			`p`
			`\| Look up a lemma in the lookup table, if available. If no lemma is found,`
			`\| the original string is returned. Languages can provide a`
			`\| #[+a("/usage/adding-languages#lemmatizer") lookup table] via the`
			`\| #[code lemma_lookup] variable, set on the individual #[code Language]`
			`\| class.`

			`+aside-code("Example").`
			`lookup = {u'going': u'go'}`
			`lemmatizer = Lemmatizer(lookup=lookup)`
			`assert lemmatizer.lookup(u'going') == u'go'`

			`+table(["Name", "Type", "Description"])`
			`+row`
			`+cell #[code string]`
			`+cell unicode`
			`+cell The string to look up.`

			`+row("foot")`
			`+cell returns`
			`+cell unicode`
			`+cell The lemma if the string was found, otherwise the original string.`

			`+h(2, "is_base_form") Lemmatizer.is_base_form`
			`+tag method`

			`p`
			`\| Check whether we're dealing with an uninflected paradigm, so we can`
			`\| avoid lemmatization entirely.`

			`+aside-code("Example").`
			`pos = 'verb'`
			`morph = {'VerbForm': 'inf'}`
			`is_base_form = lemmatizer.is_base_form(pos, morph)`
			`assert is_base_form == True`

			`+table(["Name", "Type", "Description"])`
			`+row`
			`+cell #[code univ_pos]`
			`+cell unicode / int`
			`+cell The token's universal part-of-speech tag.`

			`+row`
			`+cell #[code morphology]`
			`+cell dict`
			`+cell The token's morphological features.`

			`+row("foot")`
			`+cell returns`
			`+cell bool`
			`+cell`
			`\| Whether the token's part-of-speech tag and morphological features`
			`\| describe a base form.`

			`+h(2, "attributes") Attributes`

			`+table(["Name", "Type", "Description"])`
			`+row`
			`+cell #[code index]`
			`+cell dict / #[code None]`
			`+cell Inventory of lemmas in the language.`

			`+row`
			`+cell #[code exc]`
			`+cell dict / #[code None]`
			`+cell Mapping of string forms to lemmas that bypass the #[code rules].`

			`+row`
			`+cell #[code rules]`
			`+cell dict / #[code None]`
			`+cell List of suffix rewrite rules.`

			`+row`
			`+cell #[code lookup_table]`
			`+tag-new(2)`
			`+cell dict / #[code None]`
			`+cell The lemma lookup table, if available.`