# spaCy/spacy/lexeme.pyx

# cython: embedsignature=True
# Compiler crashes on memory view coercion without this. Should report bug.
from cython.view cimport array as cvarray
from libc.string cimport memset
cimport numpy as np
np.import_array()

import numpy
from thinc.api import get_array_module
import warnings

from .typedefs cimport attr_t, flags_t
from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
from .attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT
from .attrs cimport IS_CURRENCY

from .attrs import intify_attrs
from .errors import Errors, Warnings


# Rank value assigned to the empty lexeme below; equal to the largest
# unsigned 64-bit integer.
OOV_RANK = 0xffffffffffffffff # UINT64_MAX
# Zero every byte of the EMPTY_LEXEME struct (declared outside this file
# section), then overwrite its id with the OOV rank so that it is not 0.
memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
EMPTY_LEXEME.id = OOV_RANK


cdef class Lexeme:
    """An entry in the vocabulary. A `Lexeme` has no string context – it's a
    word-type, as opposed to a word token.  It therefore has no part-of-speech
    tag, dependency parse, or lemma (lemmatization depends on the
    part-of-speech tag).

    DOCS: https://spacy.io/api/lexeme
    """
    def __init__(self, Vocab vocab, attr_t orth):
        """Create a Lexeme object.

        vocab (Vocab): The parent vocabulary.
        orth (uint64): The orth id of the lexeme.
        RETURNS (Lexeme): The newly constructed object.
        RAISES (ValueError): If the entry returned by the vocab carries a
            different orth id than the one requested.
        """
        self.vocab = vocab
        self.orth = orth
        self.c = <LexemeC*><void*>vocab.get_by_orth(vocab.mem, orth)
        if self.c.orth != orth:
            raise ValueError(Errors.E071.format(orth=orth, vocab_orth=self.c.orth))

    def __richcmp__(self, other, int op):
        """Compare the lexeme with another object.

        Lexemes and integers are compared by orth id, strings by the
        lexeme's text. `op` is the rich comparison code passed in by Cython
        (0: <, 1: <=, 2: ==, 3: !=, 4: >, 5: >=).
        """
        if other is None:
            # Against None: <, <= and == are False; !=, > and >= are True.
            return op not in (0, 1, 2)
        if isinstance(other, Lexeme):
            a = self.orth
            b = other.orth
        elif isinstance(other, long):
            a = self.orth
            b = other
        elif isinstance(other, str):
            a = self.orth_
            b = other
        else:
            # Unsupported type: compare the placeholders 0 and 1, so the
            # lexeme is reported as unequal to (and smaller than) `other`.
            a = 0
            b = 1
        if op == 2:  # ==
            return a == b
        elif op == 3:  # !=
            return a != b
        elif op == 0:  # <
            return a < b
        elif op == 1:  # <=
            return a <= b
        elif op == 4:  # >
            return a > b
        elif op == 5:  # >=
            return a >= b
        else:
            raise NotImplementedError(op)

    def __hash__(self):
        """RETURNS (int): The orth id stored in the underlying struct."""
        return self.c.orth

    def set_attrs(self, **attrs):
        """Set several lexical attributes at once.

        **attrs: Attribute names or IDs mapped to their new values. The keys
            are normalised with `intify_attrs`. Float values are skipped,
            integer values are stored as given, and any other value is added
            to the string store and stored as the resulting ID.
        """
        cdef attr_id_t attr
        attrs = intify_attrs(attrs)
        for attr, value in attrs.items():
            # skip PROB, e.g. from lexemes.jsonl
            if isinstance(value, float):
                continue
            elif isinstance(value, (int, long)):
                Lexeme.set_struct_attr(self.c, attr, value)
            else:
                Lexeme.set_struct_attr(self.c, attr, self.vocab.strings.add(value))

    def set_flag(self, attr_id_t flag_id, bint value):
        """Change the value of a boolean flag.

        flag_id (int): The attribute ID of the flag to set.
        value (bool): The new value of the flag.
        """
        Lexeme.c_set_flag(self.c, flag_id, value)

    def check_flag(self, attr_id_t flag_id):
        """Check the value of a boolean flag.

        flag_id (int): The attribute ID of the flag to query.
        RETURNS (bool): The value of the flag.
        """
        return True if Lexeme.c_check_flag(self.c, flag_id) else False

    def similarity(self, other):
        """Compute a semantic similarity estimate. Defaults to cosine over
        vectors.

        other (object): The object to compare with. By default, accepts `Doc`,
            `Span`, `Token` and `Lexeme` objects.
        RETURNS (float): A scalar similarity score. Higher is more similar.
        """
        # Return 1.0 similarity for matches
        if hasattr(other, "orth"):
            if self.c.orth == other.orth:
                return 1.0
        elif (hasattr(other, "__len__") and len(other) == 1
                and hasattr(other[0], "orth")):
            if self.c.orth == other[0].orth:
                return 1.0
        # A zero norm on either side would divide by zero below: warn and
        # report no similarity instead.
        if self.vector_norm == 0 or other.vector_norm == 0:
            warnings.warn(Warnings.W008.format(obj="Lexeme"))
            return 0.0
        vector = self.vector
        xp = get_array_module(vector)
        result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)
        # ensure we get a scalar back (numpy does this automatically but cupy doesn't)
        return result.item()

    @property
    def has_vector(self):
        """RETURNS (bool): Whether a word vector is associated with the object.
        """
        return self.vocab.has_vector(self.c.orth)

    @property
    def vector_norm(self):
        """RETURNS (float): The L2 norm of the vector representation."""
        vector = self.vector
        # Use the array module that owns the vector, as `similarity` does,
        # rather than hard-coding numpy.
        xp = get_array_module(vector)
        return xp.sqrt((vector**2).sum())

    property vector:
        """A real-valued meaning representation.

        RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array
            representing the lexeme's semantics.
        """
        def __get__(self):
            cdef int length = self.vocab.vectors_length
            if length == 0:
                raise ValueError(Errors.E010)
            return self.vocab.get_vector(self.c.orth)

        def __set__(self, vector):
            if len(vector) != self.vocab.vectors_length:
                raise ValueError(Errors.E073.format(new_length=len(vector),
                                                    length=self.vocab.vectors_length))
            self.vocab.set_vector(self.c.orth, vector)

    property rank:
        """RETURNS (int): Sequential ID of the lexeme's lexical type, used
            to index into tables, e.g. for word vectors."""
        def __get__(self):
            return self.c.id

        def __set__(self, value):
            self.c.id = value

    @property
    def orth_(self):
        """RETURNS (str): The original verbatim text of the lexeme
            (identical to `Lexeme.text`). Exists mostly for consistency with
            the other attributes."""
        return self.vocab.strings[self.c.orth]

    @property
    def text(self):
        """RETURNS (str): The original verbatim text of the lexeme."""
        return self.orth_

    property lower:
        """RETURNS (uint64): Lowercase form of the lexeme."""
        def __get__(self):
            return self.c.lower

        def __set__(self, attr_t x):
            self.c.lower = x

    property norm:
        """RETURNS (uint64): The lexeme's norm, i.e. a normalised form of the
            lexeme text.
        """
        def __get__(self):
            return self.c.norm

        def __set__(self, attr_t x):
            # The norm is kept both in the "lexeme_norm" lookups table (as a
            # string) and in the struct (as an ID).
            if "lexeme_norm" not in self.vocab.lookups:
                self.vocab.lookups.add_table("lexeme_norm")
            norm_table = self.vocab.lookups.get_table("lexeme_norm")
            norm_table[self.c.orth] = self.vocab.strings[x]
            self.c.norm = x

    property shape:
        """RETURNS (uint64): Transform of the word's string, to show
            orthographic features.
        """
        def __get__(self):
            return self.c.shape

        def __set__(self, attr_t x):
            self.c.shape = x

    property prefix:
        """RETURNS (uint64): Length-N substring from the start of the word.
            Defaults to `N=1`.
        """
        def __get__(self):
            return self.c.prefix

        def __set__(self, attr_t x):
            self.c.prefix = x

    property suffix:
        """RETURNS (uint64): Length-N substring from the end of the word.
            Defaults to `N=3`.
        """
        def __get__(self):
            return self.c.suffix

        def __set__(self, attr_t x):
            self.c.suffix = x

    property cluster:
        """RETURNS (int): Brown cluster ID. Read from the "lexeme_cluster"
            lookups table; 0 if the table or the entry is missing.
        """
        def __get__(self):
            cluster_table = self.vocab.lookups.get_table("lexeme_cluster", {})
            return cluster_table.get(self.c.orth, 0)

        def __set__(self, int x):
            # Create the table on first write (as the `norm` setter does);
            # writing into a throwaway default dict would lose the value.
            if "lexeme_cluster" not in self.vocab.lookups:
                self.vocab.lookups.add_table("lexeme_cluster")
            cluster_table = self.vocab.lookups.get_table("lexeme_cluster")
            cluster_table[self.c.orth] = x

    property lang:
        """RETURNS (uint64): Language of the parent vocabulary."""
        def __get__(self):
            return self.c.lang

        def __set__(self, attr_t x):
            self.c.lang = x

    property prob:
        """RETURNS (float): Smoothed log probability estimate of the lexeme's
            type. Read from the "lexeme_prob" lookups table; falls back to
            the "oov_prob" entry of "lexeme_settings", or -20.0.
        """
        def __get__(self):
            prob_table = self.vocab.lookups.get_table("lexeme_prob", {})
            settings_table = self.vocab.lookups.get_table("lexeme_settings", {})
            default_oov_prob = settings_table.get("oov_prob", -20.0)
            return prob_table.get(self.c.orth, default_oov_prob)

        def __set__(self, float x):
            # Create the table on first write (as the `norm` setter does);
            # writing into a throwaway default dict would lose the value.
            if "lexeme_prob" not in self.vocab.lookups:
                self.vocab.lookups.add_table("lexeme_prob")
            prob_table = self.vocab.lookups.get_table("lexeme_prob")
            prob_table[self.c.orth] = x

    property lower_:
        """RETURNS (str): Lowercase form of the word."""
        def __get__(self):
            return self.vocab.strings[self.c.lower]

        def __set__(self, str x):
            self.c.lower = self.vocab.strings.add(x)

    property norm_:
        """RETURNS (str): The lexeme's norm, i.e. a normalised form of the
            lexeme text.
        """
        def __get__(self):
            return self.vocab.strings[self.c.norm]

        def __set__(self, str x):
            # Goes through the `norm` setter so the lookups table is updated.
            self.norm = self.vocab.strings.add(x)

    property shape_:
        """RETURNS (str): Transform of the word's string, to show
            orthographic features.
        """
        def __get__(self):
            return self.vocab.strings[self.c.shape]

        def __set__(self, str x):
            self.c.shape = self.vocab.strings.add(x)

    property prefix_:
        """RETURNS (str): Length-N substring from the start of the word.
            Defaults to `N=1`.
        """
        def __get__(self):
            return self.vocab.strings[self.c.prefix]

        def __set__(self, str x):
            self.c.prefix = self.vocab.strings.add(x)

    property suffix_:
        """RETURNS (str): Length-N substring from the end of the word.
            Defaults to `N=3`.
        """
        def __get__(self):
            return self.vocab.strings[self.c.suffix]

        def __set__(self, str x):
            self.c.suffix = self.vocab.strings.add(x)

    property lang_:
        """RETURNS (str): Language of the parent vocabulary."""
        def __get__(self):
            return self.vocab.strings[self.c.lang]

        def __set__(self, str x):
            self.c.lang = self.vocab.strings.add(x)

    property flags:
        """RETURNS (uint64): Container of the lexeme's binary flags."""
        def __get__(self):
            return self.c.flags

        def __set__(self, flags_t x):
            self.c.flags = x

    @property
    def is_oov(self):
        """RETURNS (bool): Whether the lexeme is out-of-vocabulary, i.e.
            whether its orth id is absent from the vocab's vectors.
        """
        return self.orth not in self.vocab.vectors

    property is_stop:
        """RETURNS (bool): Whether the lexeme is a stop word."""
        def __get__(self):
            return Lexeme.c_check_flag(self.c, IS_STOP)

        def __set__(self, bint x):
            Lexeme.c_set_flag(self.c, IS_STOP, x)

    property is_alpha:
        """RETURNS (bool): Whether the lexeme consists of alphabetic
            characters. Equivalent to `lexeme.text.isalpha()`.
        """
        def __get__(self):
            return Lexeme.c_check_flag(self.c, IS_ALPHA)

        def __set__(self, bint x):
            Lexeme.c_set_flag(self.c, IS_ALPHA, x)

    property is_ascii:
        """RETURNS (bool): Whether the lexeme consists of ASCII characters.
            Equivalent to `all(ord(c) < 128 for c in lexeme.text)`.
        """
        def __get__(self):
            return Lexeme.c_check_flag(self.c, IS_ASCII)

        def __set__(self, bint x):
            Lexeme.c_set_flag(self.c, IS_ASCII, x)

    property is_digit:
        """RETURNS (bool): Whether the lexeme consists of digits. Equivalent
            to `lexeme.text.isdigit()`.
        """
        def __get__(self):
            return Lexeme.c_check_flag(self.c, IS_DIGIT)

        def __set__(self, bint x):
            Lexeme.c_set_flag(self.c, IS_DIGIT, x)

    property is_lower:
        """RETURNS (bool): Whether the lexeme is in lowercase. Equivalent to
            `lexeme.text.islower()`.
        """
        def __get__(self):
            return Lexeme.c_check_flag(self.c, IS_LOWER)

        def __set__(self, bint x):
            Lexeme.c_set_flag(self.c, IS_LOWER, x)

    property is_upper:
        """RETURNS (bool): Whether the lexeme is in uppercase. Equivalent to
            `lexeme.text.isupper()`.
        """
        def __get__(self):
            return Lexeme.c_check_flag(self.c, IS_UPPER)

        def __set__(self, bint x):
            Lexeme.c_set_flag(self.c, IS_UPPER, x)

    property is_title:
        """RETURNS (bool): Whether the lexeme is in titlecase. Equivalent to
            `lexeme.text.istitle()`.
        """
        def __get__(self):
            return Lexeme.c_check_flag(self.c, IS_TITLE)

        def __set__(self, bint x):
            Lexeme.c_set_flag(self.c, IS_TITLE, x)

    property is_punct:
        """RETURNS (bool): Whether the lexeme is punctuation."""
        def __get__(self):
            return Lexeme.c_check_flag(self.c, IS_PUNCT)

        def __set__(self, bint x):
            Lexeme.c_set_flag(self.c, IS_PUNCT, x)

    property is_space:
        """RETURNS (bool): Whether the lexeme consists of whitespace characters.
            Equivalent to `lexeme.text.isspace()`.
        """
        def __get__(self):
            return Lexeme.c_check_flag(self.c, IS_SPACE)

        def __set__(self, bint x):
            Lexeme.c_set_flag(self.c, IS_SPACE, x)

    property is_bracket:
        """RETURNS (bool): Whether the lexeme is a bracket."""
        def __get__(self):
            return Lexeme.c_check_flag(self.c, IS_BRACKET)

        def __set__(self, bint x):
            Lexeme.c_set_flag(self.c, IS_BRACKET, x)

    property is_quote:
        """RETURNS (bool): Whether the lexeme is a quotation mark."""
        def __get__(self):
            return Lexeme.c_check_flag(self.c, IS_QUOTE)

        def __set__(self, bint x):
            Lexeme.c_set_flag(self.c, IS_QUOTE, x)

    property is_left_punct:
        """RETURNS (bool): Whether the lexeme is left punctuation, e.g. (."""
        def __get__(self):
            return Lexeme.c_check_flag(self.c, IS_LEFT_PUNCT)

        def __set__(self, bint x):
            Lexeme.c_set_flag(self.c, IS_LEFT_PUNCT, x)

    property is_right_punct:
        """RETURNS (bool): Whether the lexeme is right punctuation, e.g. )."""
        def __get__(self):
            return Lexeme.c_check_flag(self.c, IS_RIGHT_PUNCT)

        def __set__(self, bint x):
            Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, x)

    property is_currency:
        """RETURNS (bool): Whether the lexeme is a currency symbol, e.g. $, €."""
        def __get__(self):
            return Lexeme.c_check_flag(self.c, IS_CURRENCY)

        def __set__(self, bint x):
            Lexeme.c_set_flag(self.c, IS_CURRENCY, x)

    property like_url:
        """RETURNS (bool): Whether the lexeme resembles a URL."""
        def __get__(self):
            return Lexeme.c_check_flag(self.c, LIKE_URL)

        def __set__(self, bint x):
            Lexeme.c_set_flag(self.c, LIKE_URL, x)

    property like_num:
        """RETURNS (bool): Whether the lexeme represents a number, e.g. "10.9",
            "10", "ten", etc.
        """
        def __get__(self):
            return Lexeme.c_check_flag(self.c, LIKE_NUM)

        def __set__(self, bint x):
            Lexeme.c_set_flag(self.c, LIKE_NUM, x)

    property like_email:
        """RETURNS (bool): Whether the lexeme resembles an email address."""
        def __get__(self):
            return Lexeme.c_check_flag(self.c, LIKE_EMAIL)

        def __set__(self, bint x):
            Lexeme.c_set_flag(self.c, LIKE_EMAIL, x)
-												* Tests passing after refactor. API has obvious warts, particularly in Token and Lexeme

											
										
										
											2015-01-14 16:33:16 +03:00
+								# cython: embedsignature=True
-												* Extend word vectors support, with .similarity() function, vector_norm property, and rename repvec to vector. Keep repvec name as well for now for backwards compatibility.

											
										
										
											2015-09-14 10:49:58 +03:00
+								# Compiler crashes on memory view coercion without this. Should report bug.
 								from cython.view cimport array as cvarray
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								from libc.string cimport memset
-												* Extend word vectors support, with .similarity() function, vector_norm property, and rename repvec to vector. Keep repvec name as well for now for backwards compatibility.

											
										
										
											2015-09-14 10:49:58 +03:00
+								cimport numpy as np
 								np.import_array()
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
-												Clean up imports, unused code, whitespace, docstrings

											
										
										
											2017-04-15 13:05:47 +03:00
+								import numpy
-												Tidy up and auto-format

											
										
										
											2020-02-18 17:38:18 +03:00
+								from thinc.api import get_array_module
-												Simplify warnings

											
										
										
											2020-04-28 14:37:37 +03:00
+								import warnings
-												* Remove the feature array stuff from Tokens class, and replace vector with array-based implementation, with padding.

											
										
										
											2014-10-22 18:57:59 +04:00
-												* Add supersense data to Lexeme objects. Add simple has_sense method to check the flag.

											
										
										
											2015-07-01 19:50:37 +03:00
+								from .typedefs cimport attr_t, flags_t
-												* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects

											
										
										
											2015-07-26 17:37:16 +03:00
+								from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
 								from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								from .attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT
-												Reduce stored lexemes data, move feats to lookups (#5238)

* Reduce stored lexemes data, move feats to lookups

* Move non-derivable lexemes features (`norm / cluster / prob`) to
`spacy-lookups-data` as lookups
  * Get/set `norm` in both lookups and `LexemeC`, serialize in lookups
  * Remove `cluster` and `prob` from `LexemesC`, get/set/serialize in
    lookups only
* Remove serialization of lexemes data as `vocab/lexemes.bin`
  * Remove `SerializedLexemeC`
  * Remove `Lexeme.to_bytes/from_bytes`
* Modify normalization exception loading:
  * Always create `Vocab.lookups` table `lexeme_norm` for
    normalization exceptions
  * Load base exceptions from `lang.norm_exceptions`, but load
    language-specific exceptions from lookups
  * Set `lex_attr_getter[NORM]` including new lookups table in
    `BaseDefaults.create_vocab()` and when deserializing `Vocab`
* Remove all cached lexemes when deserializing vocab to override
  existing normalizations with the new normalizations (as a replacement
  for the previous step that replaced all lexemes data with the
  deserialized data)

* Skip English normalization test

Skip English normalization test because the data is now in
`spacy-lookups-data`.

* Remove norm exceptions

Moved to spacy-lookups-data.

* Move norm exceptions test to spacy-lookups-data

* Load extra lookups from spacy-lookups-data lazily

Load extra lookups (currently for cluster and prob) lazily from the
entry point `lg_extra` as `Vocab.lookups_extra`.

* Skip creating lexeme cache on load

To improve model loading times, do not create the full lexeme cache when
loading. The lexemes will be created on demand when processing.

* Identify numeric values in Lexeme.set_attrs()

With the removal of a special case for `PROB`, also identify `float` to
avoid trying to convert it with the `StringStore`.

* Skip lexeme cache init in from_bytes

* Unskip and update lookups tests for python3.6+

* Update vocab pickle to include lookups_extra

* Update vocab serialization tests

Check strings rather than lexemes since lexemes aren't initialized
automatically, account for addition of "_SP".

* Re-skip lookups test because of python3.5

* Skip PROB/float values in Lexeme.set_attrs

* Convert is_oov from lexeme flag to lex in vectors

Instead of storing `is_oov` as a lexeme flag, `is_oov` reports whether
the lexeme has a vector.

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-05-19 16:59:14 +03:00
+								from .attrs cimport IS_CURRENCY
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
-												Improve Lexeme.set_attrs method

											
										
										
											2017-10-30 13:49:11 +03:00
+								from .attrs import intify_attrs
-												Simplify warnings

											
										
										
											2020-04-28 14:37:37 +03:00
+								from .errors import Errors, Warnings
-												* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects

											
										
										
											2015-07-26 17:37:16 +03:00
-												* Revising data model of lexeme. Compiles.

											
										
										
											2014-10-09 12:53:30 +04:00
-												Avoid libc.stdint for UINT64_MAX (#5545)


											
										
										
											2020-06-04 21:02:05 +03:00
+								OOV_RANK = 0xffffffffffffffff # UINT64_MAX
-												* Tmp commit. Refactoring to create a Python Lexeme class.

											
										
										
											2015-01-12 02:26:22 +03:00
+								memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
-												Use max(uint64) for OOV lexeme rank (#5303)

* Use max(uint64) for OOV lexeme rank

* Add test for default OOV rank

* Revert back to thinc==7.4.0

Requiring the updated version of thinc was unnecessary.

* Define OOV_RANK in one place

Define OOV_RANK in one place in `util`.

* Fix formatting [ci skip]

* Switch to external definitions of max(uint64)

Switch to external definitions of max(uint64) and confirm that they are
equal.
											
										
										
											2020-04-15 14:49:47 +03:00
+								EMPTY_LEXEME.id = OOV_RANK
-												* Revising data model of lexeme. Compiles.

											
										
										
											2014-10-09 12:53:30 +04:00
-												* Tmp. Refactoring, introducing a Lexeme PyObject.

											
										
										
											2015-01-12 03:23:44 +03:00
+								cdef class Lexeme:
-												Update docstrings and API docs for Lexeme

											
										
										
											2017-05-20 16:13:42 +03:00
+								    """An entry in the vocabulary. A `Lexeme` has no string context – it's a
-												* Add docstring to Lexeme

											
										
										
											2015-01-24 12:48:34 +03:00
+								    word-type, as opposed to a word token.  It therefore has no part-of-speech
-												Tidy up Lexeme and update docs

											
										
										
											2017-10-27 22:07:50 +03:00
+								    tag, dependency parse, or lemma (lemmatization depends on the
 								    part-of-speech tag).
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
-												Replace links to nightly docs [ci skip]

											
										
										
											2021-01-30 12:09:38 +03:00
+								    DOCS: https://spacy.io/api/lexeme
-												* Add docstring to Lexeme

											
										
										
											2015-01-24 12:48:34 +03:00
+								    """
-												Adjust lexeme sizing for attr_t being 64 bit

											
										
										
											2017-05-28 13:51:09 +03:00
+								    def __init__(self, Vocab vocab, attr_t orth):
-												Update docstrings and API docs for Lexeme

											
										
										
											2017-05-20 16:13:42 +03:00
+								        """Create a Lexeme object.
-												Fix doc strings

											
										
										
											2016-11-01 14:25:36 +03:00
-												Update docstrings and API docs for Lexeme

											
										
										
											2017-05-20 16:13:42 +03:00
+								        vocab (Vocab): The parent vocabulary
-												Adjust lexeme sizing for attr_t being 64 bit

											
										
										
											2017-05-28 13:51:09 +03:00
+								        orth (uint64): The orth id of the lexeme.
-												Fix doc strings

											
										
										
											2016-11-01 14:25:36 +03:00
+								        Returns (Lexeme): The newly constructd object.
 								        """
-												* Tmp

											
										
										
											2015-08-22 23:04:34 +03:00
+								        self.vocab = vocab
 								        self.orth = orth
-												* Begin merge of Gazetteer and DE branches

											
										
										
											2015-09-06 20:45:15 +03:00
+								        self.c = <LexemeC*><void*>vocab.get_by_orth(vocab.mem, orth)
-												💫 New system for error messages and warnings (#2163)

* Add spacy.errors module

* Update deprecation and user warnings

* Replace errors and asserts with new error message system

* Remove redundant asserts

* Fix whitespace

* Add messages for print/util.prints statements

* Fix typo

* Fix typos

* Move CLI messages to spacy.cli._messages

* Add decorator to display error code with message

An implementation like this is nice because it only modifies the string when it's retrieved from the containing class – so we don't have to worry about manipulating tracebacks etc.

* Remove unused link in spacy.about

* Update errors for invalid pipeline components

* Improve error for unknown factories

* Add displaCy warnings

* Update formatting consistency

* Move error message to spacy.errors

* Update errors and check if doc returned by component is None

											
										
										
											2018-04-03 16:50:31 +03:00
+								        if self.c.orth != orth:
 								            raise ValueError(Errors.E071.format(orth=orth, vocab_orth=self.c.orth))
-												* Tmp

											
										
										
											2015-08-22 23:04:34 +03:00
-												* Fix Issue #361: Lexemes didn't have rich comparison.

											
										
										
											2016-05-05 02:32:26 +03:00
+								    def __richcmp__(self, other, int op):
-												Add test for #1757: Comparison against None

											
										
										
											2018-01-15 17:54:25 +03:00
+								        if other is None:
 								            if op == 0 or op == 1 or op == 2:
 								                return False
 								            else:
 								                return True
-												* Fix Issue #361: Lexemes didn't have rich comparison.

											
										
										
											2016-05-05 02:32:26 +03:00
+								        if isinstance(other, Lexeme):
 								            a = self.orth
 								            b = other.orth
-												Adjust lexeme sizing for attr_t being 64 bit

											
										
										
											2017-05-28 13:51:09 +03:00
+								        elif isinstance(other, long):
-												* Fix Issue #361: Lexemes didn't have rich comparison.

											
										
										
											2016-05-05 02:32:26 +03:00
+								            a = self.orth
 								            b = other
 								        elif isinstance(other, str):
-												* Fix issue #372: mistake in Lexeme rich comparison

											
										
										
											2016-05-12 13:58:57 +03:00
+								            a = self.orth_
-												* Fix Issue #361: Lexemes didn't have rich comparison.

											
										
										
											2016-05-05 02:32:26 +03:00
+								            b = other
 								        else:
 								            a = 0
 								            b = 1
-												Tidy up Lexeme and update docs

											
										
										
											2017-10-27 22:07:50 +03:00
+								        if op == 2:  # ==
-												* Fix Issue #361: Lexemes didn't have rich comparison.

											
										
										
											2016-05-05 02:32:26 +03:00
+								            return a == b
-												Tidy up Lexeme and update docs

											
										
										
											2017-10-27 22:07:50 +03:00
+								        elif op == 3:  # !=
-												* Fix Issue #361: Lexemes didn't have rich comparison.

											
										
										
											2016-05-05 02:32:26 +03:00
+								            return a != b
-												Tidy up Lexeme and update docs

											
										
										
											2017-10-27 22:07:50 +03:00
+								        elif op == 0:  # <
-												* Fix Issue #361: Lexemes didn't have rich comparison.

											
										
										
											2016-05-05 02:32:26 +03:00
+								            return a < b
-												Tidy up Lexeme and update docs

											
										
										
											2017-10-27 22:07:50 +03:00
+								        elif op == 1:  # <=
-												* Fix Issue #361: Lexemes didn't have rich comparison.

											
										
										
											2016-05-05 02:32:26 +03:00
+								            return a <= b
-												Tidy up Lexeme and update docs

											
										
										
											2017-10-27 22:07:50 +03:00
+								        elif op == 4:  # >
-												* Fix Issue #361: Lexemes didn't have rich comparison.

											
										
										
											2016-05-05 02:32:26 +03:00
+								            return a > b
-												Tidy up Lexeme and update docs

											
										
										
											2017-10-27 22:07:50 +03:00
+								        elif op == 5:  # >=
-												* Fix Issue #361: Lexemes didn't have rich comparison.

											
										
										
											2016-05-05 02:32:26 +03:00
+								            return a >= b
 								        else:
 								            raise NotImplementedError(op)
-												Fix Issue #371: Lexeme objects were unhashable.

											
										
										
											2016-09-27 14:22:30 +03:00
+								    def __hash__(self):
 								        return self.c.orth
-												Improve Lexeme.set_attrs method

											
										
										
											2017-10-30 13:49:11 +03:00
+								    def set_attrs(self, **attrs):
 								        cdef attr_id_t attr
 								        attrs = intify_attrs(attrs)
 								        for attr, value in attrs.items():
-												Reduce stored lexemes data, move feats to lookups (#5238)

* Reduce stored lexemes data, move feats to lookups

* Move non-derivable lexemes features (`norm / cluster / prob`) to
`spacy-lookups-data` as lookups
  * Get/set `norm` in both lookups and `LexemeC`, serialize in lookups
  * Remove `cluster` and `prob` from `LexemesC`, get/set/serialize in
    lookups only
* Remove serialization of lexemes data as `vocab/lexemes.bin`
  * Remove `SerializedLexemeC`
  * Remove `Lexeme.to_bytes/from_bytes`
* Modify normalization exception loading:
  * Always create `Vocab.lookups` table `lexeme_norm` for
    normalization exceptions
  * Load base exceptions from `lang.norm_exceptions`, but load
    language-specific exceptions from lookups
  * Set `lex_attr_getter[NORM]` including new lookups table in
    `BaseDefaults.create_vocab()` and when deserializing `Vocab`
* Remove all cached lexemes when deserializing vocab to override
  existing normalizations with the new normalizations (as a replacement
  for the previous step that replaced all lexemes data with the
  deserialized data)

* Skip English normalization test

Skip English normalization test because the data is now in
`spacy-lookups-data`.

* Remove norm exceptions

Moved to spacy-lookups-data.

* Move norm exceptions test to spacy-lookups-data

* Load extra lookups from spacy-lookups-data lazily

Load extra lookups (currently for cluster and prob) lazily from the
entry point `lg_extra` as `Vocab.lookups_extra`.

* Skip creating lexeme cache on load

To improve model loading times, do not create the full lexeme cache when
loading. The lexemes will be created on demand when processing.

* Identify numeric values in Lexeme.set_attrs()

With the removal of a special case for `PROB`, also identify `float` to
avoid trying to convert it with the `StringStore`.

* Skip lexeme cache init in from_bytes

* Unskip and update lookups tests for python3.6+

* Update vocab pickle to include lookups_extra

* Update vocab serialization tests

Check strings rather than lexemes since lexemes aren't initialized
automatically, account for addition of "_SP".

* Re-skip lookups test because of python3.5

* Skip PROB/float values in Lexeme.set_attrs

* Convert is_oov from lexeme flag to lex in vectors

Instead of storing `is_oov` as a lexeme flag, `is_oov` reports whether
the lexeme has a vector.

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-05-19 16:59:14 +03:00
+								            # skip PROB, e.g. from lexemes.jsonl
 								            if isinstance(value, float):
 								                continue
 								            elif isinstance(value, (int, long)):
 								                 Lexeme.set_struct_attr(self.c, attr, value)
-												Improve Lexeme.set_attrs method

											
										
										
											2017-10-30 13:49:11 +03:00
+								            else:
 								                Lexeme.set_struct_attr(self.c, attr, self.vocab.strings.add(value))
-												* Fix ugly py_check_flag and py_set_flag functions in Lexeme

											
										
										
											2015-09-15 06:06:18 +03:00
+								    def set_flag(self, attr_id_t flag_id, bint value):
-												Update docstrings and API docs for Lexeme

											
										
										
											2017-05-20 16:13:42 +03:00
+								        """Change the value of a boolean flag.
-												Fix doc strings

											
										
										
											2016-11-01 14:25:36 +03:00
-												Update docstrings and API docs for Lexeme

											
										
										
											2017-05-20 16:13:42 +03:00
+								        flag_id (int): The attribute ID of the flag to set.
 								        value (bool): The new value of the flag.
-												Fix doc strings

											
										
										
											2016-11-01 14:25:36 +03:00
+								        """
-												* Fix ugly py_check_flag and py_set_flag functions in Lexeme

											
										
										
											2015-09-15 06:06:18 +03:00
+								        Lexeme.c_set_flag(self.c, flag_id, value)
-												Remove whitespace

											
										
										
											2017-04-01 11:19:01 +03:00
-												* Fix ugly py_check_flag and py_set_flag functions in Lexeme

											
										
										
											2015-09-15 06:06:18 +03:00
+								    def check_flag(self, attr_id_t flag_id):
-												Update docstrings and API docs for Lexeme

											
										
										
											2017-05-20 16:13:42 +03:00
+								        """Check the value of a boolean flag.
-												Fix doc strings

											
										
										
											2016-11-01 14:25:36 +03:00
-												Update docstrings and API docs for Lexeme

											
										
										
											2017-05-20 16:13:42 +03:00
+								        flag_id (int): The attribute ID of the flag to query.
 								        RETURNS (bool): The value of the flag.
-												Fix doc strings

											
										
										
											2016-11-01 14:25:36 +03:00
+								        """
-												* Fix ugly py_check_flag and py_set_flag functions in Lexeme

											
										
										
											2015-09-15 06:06:18 +03:00
+								        return True if Lexeme.c_check_flag(self.c, flag_id) else False
-												* Temporarily add py_set_flag attribute in Lexeme

											
										
										
											2015-09-06 18:52:51 +03:00
-												* Extend word vectors support, with .similarity() function, vector_norm property, and rename repvec to vector. Keep repvec name as well for now for backwards compatibility.

											
										
										
											2015-09-14 10:49:58 +03:00
+								    def similarity(self, other):
-												Update docstrings and API docs for Lexeme

											
										
										
											2017-05-20 16:13:42 +03:00
+								        """Compute a semantic similarity estimate. Defaults to cosine over
 								        vectors.
 								        other (object): The object to compare with. By default, accepts `Doc`,
 								            `Span`, `Token` and `Lexeme` objects.
 								        RETURNS (float): A scalar similarity score. Higher is more similar.
-												Use consistent formatting for docstrings

											
										
										
											2017-04-15 12:59:21 +03:00
+								        """
-												Make .similarity() return 1.0 if all orth attrs match

											
										
										
											2018-01-15 18:29:48 +03:00
+								        # Return 1.0 similarity for matches
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        if hasattr(other, "orth"):
-												Make .similarity() return 1.0 if all orth attrs match

											
										
										
											2018-01-15 18:29:48 +03:00
+								            if self.c.orth == other.orth:
 								                return 1.0
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        elif hasattr(other, "__len__") and len(other) == 1 \
 								        and hasattr(other[0], "orth"):
-												Make .similarity() return 1.0 if all orth attrs match

											
										
										
											2018-01-15 18:29:48 +03:00
+								            if self.c.orth == other[0].orth:
 								                return 1.0
-												* Fix vectors bugs for OOV words

											
										
										
											2015-09-22 03:10:01 +03:00
+								        if self.vector_norm == 0 or other.vector_norm == 0:
-												Simplify warnings

											
										
										
											2020-04-28 14:37:37 +03:00
+								            warnings.warn(Warnings.W008.format(obj="Lexeme"))
-												* Fix vectors bugs for OOV words

											
										
										
											2015-09-22 03:10:01 +03:00
+								            return 0.0
-												Don't use numpy directly for similarity (#3362)

* Don't use numpy directly for similarity

* Contributor agreement

											
										
										
											2019-03-07 01:58:38 +03:00
+								        vector = self.vector
 								        xp = get_array_module(vector)
-												Bugfix for similarity return types (#10051)


											
										
										
											2022-01-20 13:40:46 +03:00
+								        result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)
 								        # ensure we get a scalar back (numpy does this automatically but cupy doesn't)
 								        return result.item()
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								    @property
 								    def has_vector(self):
-												Tidy up Lexeme and update docs

											
										
										
											2017-10-27 22:07:50 +03:00
+								        """RETURNS (bool): Whether a word vector is associated with the object.
-												Update docstrings and API docs for Lexeme

											
										
										
											2017-05-20 16:13:42 +03:00
+								        """
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								        return self.vocab.has_vector(self.c.orth)
-												* Add has_vector attribute to Token and Lexeme

											
										
										
											2015-09-21 12:52:43 +03:00
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								    @property
 								    def vector_norm(self):
-												Tidy up Lexeme and update docs

											
										
										
											2017-10-27 22:07:50 +03:00
+								        """RETURNS (float): The L2 norm of the vector representation."""
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								        vector = self.vector
 								        return numpy.sqrt((vector**2).sum())
-												* Extend word vectors support, with .similarity() function, vector_norm property, and rename repvec to vector. Keep repvec name as well for now for backwards compatibility.

											
										
										
											2015-09-14 10:49:58 +03:00
 								    property vector:
-												Update docstrings and API docs for Lexeme

											
										
										
											2017-05-20 16:13:42 +03:00
+								        """A real-valued meaning representation.
 								        RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array
 								            representing the lexeme's semantics.
 								        """
-												* Extend word vectors support, with .similarity() function, vector_norm property, and rename repvec to vector. Keep repvec name as well for now for backwards compatibility.

											
										
										
											2015-09-14 10:49:58 +03:00
+								        def __get__(self):
-												* Fix vectors bug in lexeme

											
										
										
											2015-09-15 12:05:11 +03:00
+								            cdef int length = self.vocab.vectors_length
-												* Raise exceptions if attempt to access parse, but data is not installed. This partly but not fully addresses Issue #97. Still need exceptions on the various Token attributes that access the parse tree, e.g. token.head, token.lefts, token.rights, etc. Exceptions should be centralized, too.

											
										
										
											2015-09-21 11:35:40 +03:00
+								            if length == 0:
-												💫 New system for error messages and warnings (#2163)

* Add spacy.errors module

* Update deprecation and user warnings

* Replace errors and asserts with new error message system

* Remove redundant asserts

* Fix whitespace

* Add messages for print/util.prints statements

* Fix typo

* Fix typos

* Move CLI messages to spacy.cli._messages

* Add decorator to display error code with message

An implementation like this is nice because it only modifies the string when it's retrieved from the containing class – so we don't have to worry about manipulating tracebacks etc.

* Remove unused link in spacy.about

* Update errors for invalid pipeline components

* Improve error for unknown factories

* Add displaCy warnings

* Update formatting consistency

* Move error message to spacy.errors

* Update errors and check if doc returned by component is None

											
										
										
											2018-04-03 16:50:31 +03:00
+								                raise ValueError(Errors.E010)
-												Remove vectors from lexeme

											
										
										
											2017-05-28 12:45:48 +03:00
+								            return self.vocab.get_vector(self.c.orth)
-												* Extend word vectors support, with .similarity() function, vector_norm property, and rename repvec to vector. Keep repvec name as well for now for backwards compatibility.

											
										
										
											2015-09-14 10:49:58 +03:00
-												* Support setting of word vectors on Lexeme object.

											
										
										
											2015-09-15 07:42:27 +03:00
+								        def __set__(self, vector):
-												💫 New system for error messages and warnings (#2163)

* Add spacy.errors module

* Update deprecation and user warnings

* Replace errors and asserts with new error message system

* Remove redundant asserts

* Fix whitespace

* Add messages for print/util.prints statements

* Fix typo

* Fix typos

* Move CLI messages to spacy.cli._messages

* Add decorator to display error code with message

An implementation like this is nice because it only modifies the string when it's retrieved from the containing class – so we don't have to worry about manipulating tracebacks etc.

* Remove unused link in spacy.about

* Update errors for invalid pipeline components

* Improve error for unknown factories

* Add displaCy warnings

* Update formatting consistency

* Move error message to spacy.errors

* Update errors and check if doc returned by component is None

											
										
										
											2018-04-03 16:50:31 +03:00
+								            if len(vector) != self.vocab.vectors_length:
 								                raise ValueError(Errors.E073.format(new_length=len(vector),
 								                                                    length=self.vocab.vectors_length))
-												Remove vectors from lexeme

											
										
										
											2017-05-28 12:45:48 +03:00
+								            self.vocab.set_vector(self.c.orth, vector)
-												* Support setting of word vectors on Lexeme object.

											
										
										
											2015-09-15 07:42:27 +03:00
-												* Add .rank property to Token and Lexeme, for frequency rank

											
										
										
											2015-11-08 18:18:25 +03:00
+								    property rank:
-												fix 's typo's across code base (#8384)


											
										
										
											2021-06-15 11:57:08 +03:00
+								        """RETURNS (int): Sequential ID of the lexeme's lexical type, used
-												Tidy up Lexeme and update docs

											
										
										
											2017-10-27 22:07:50 +03:00
+								            to index into tables, e.g. for word vectors."""
-												* Add .rank property to Token and Lexeme, for frequency rank

											
										
										
											2015-11-08 18:18:25 +03:00
+								        def __get__(self):
 								            return self.c.id
-												Tidy up Lexeme and update docs

											
										
										
											2017-10-27 22:07:50 +03:00
-												Allow Lexeme.rank to be set

											
										
										
											2017-08-24 22:43:00 +03:00
+								        def __set__(self, value):
 								            self.c.id = value
-												* Add .rank property to Token and Lexeme, for frequency rank

											
										
										
											2015-11-08 18:18:25 +03:00
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								    @property
 								    def orth_(self):
-												unicode -> str consistency

											
										
										
											2020-05-24 18:20:58 +03:00
+								        """RETURNS (str): The original verbatim text of the lexeme
-												Tidy up Lexeme and update docs

											
										
										
											2017-10-27 22:07:50 +03:00
+								            (identical to `Lexeme.text`). Exists mostly for consistency with
 								            the other attributes."""
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								        return self.vocab.strings[self.c.orth]
-												* Tmp

											
										
										
											2015-08-22 23:04:34 +03:00
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								    @property
 								    def text(self):
-												unicode -> str consistency

											
										
										
											2020-05-24 18:20:58 +03:00
+								        """RETURNS (str): The original verbatim text of the lexeme."""
-												Tidy up property code style (#3391)

Use decorator if properties only have a getter and existing syntax if there's getter and setter
											
										
										
											2019-03-11 17:59:09 +03:00
+								        return self.orth_
-												Update docstrings and API docs for Lexeme

											
										
										
											2017-05-20 16:13:42 +03:00
-												* Tmp

											
										
										
											2015-08-22 23:04:34 +03:00
+								    property lower:
-												unicode -> str consistency

											
										
										
											2020-05-24 18:20:58 +03:00
+								        """RETURNS (uint64): Lowercase form of the lexeme."""
-												Tidy up Lexeme and update docs

											
										
										
											2017-10-27 22:07:50 +03:00
+								        def __get__(self):
 								            return self.c.lower
 								        def __set__(self, attr_t x):
 								            self.c.lower = x
-												Remove whitespace

											
										
										
											2017-04-01 11:19:01 +03:00
-												* Tmp

											
										
										
											2015-08-22 23:04:34 +03:00
+								    property norm:
-												fix 's typo's across code base (#8384)


											
										
										
											2021-06-15 11:57:08 +03:00
+								        """RETURNS (uint64): The lexeme's norm, i.e. a normalised form of the
-												Tidy up Lexeme and update docs

											
										
										
											2017-10-27 22:07:50 +03:00
+								            lexeme text.
 								        """
 								        def __get__(self):
-												Reduce stored lexemes data, move feats to lookups (#5238)

* Reduce stored lexemes data, move feats to lookups

* Move non-derivable lexemes features (`norm / cluster / prob`) to
`spacy-lookups-data` as lookups
  * Get/set `norm` in both lookups and `LexemeC`, serialize in lookups
  * Remove `cluster` and `prob` from `LexemesC`, get/set/serialize in
    lookups only
* Remove serialization of lexemes data as `vocab/lexemes.bin`
  * Remove `SerializedLexemeC`
  * Remove `Lexeme.to_bytes/from_bytes`
* Modify normalization exception loading:
  * Always create `Vocab.lookups` table `lexeme_norm` for
    normalization exceptions
  * Load base exceptions from `lang.norm_exceptions`, but load
    language-specific exceptions from lookups
  * Set `lex_attr_getter[NORM]` including new lookups table in
    `BaseDefaults.create_vocab()` and when deserializing `Vocab`
* Remove all cached lexemes when deserializing vocab to override
  existing normalizations with the new normalizations (as a replacement
  for the previous step that replaced all lexemes data with the
  deserialized data)

* Skip English normalization test

Skip English normalization test because the data is now in
`spacy-lookups-data`.

* Remove norm exceptions

Moved to spacy-lookups-data.

* Move norm exceptions test to spacy-lookups-data

* Load extra lookups from spacy-lookups-data lazily

Load extra lookups (currently for cluster and prob) lazily from the
entry point `lg_extra` as `Vocab.lookups_extra`.

* Skip creating lexeme cache on load

To improve model loading times, do not create the full lexeme cache when
loading. The lexemes will be created on demand when processing.

* Identify numeric values in Lexeme.set_attrs()

With the removal of a special case for `PROB`, also identify `float` to
avoid trying to convert it with the `StringStore`.

* Skip lexeme cache init in from_bytes

* Unskip and update lookups tests for python3.6+

* Update vocab pickle to include lookups_extra

* Update vocab serialization tests

Check strings rather than lexemes since lexemes aren't initialized
automatically, account for addition of "_SP".

* Re-skip lookups test because of python3.5

* Skip PROB/float values in Lexeme.set_attrs

* Convert is_oov from lexeme flag to lex in vectors

Instead of storing `is_oov` as a lexeme flag, `is_oov` reports whether
the lexeme has a vector.

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-05-19 16:59:14 +03:00
+								            return self.c.norm
-												Tidy up Lexeme and update docs

											
										
										
											2017-10-27 22:07:50 +03:00
 								        def __set__(self, attr_t x):
-												Reduce stored lexemes data, move feats to lookups (#5238)

* Reduce stored lexemes data, move feats to lookups

* Move non-derivable lexemes features (`norm / cluster / prob`) to
`spacy-lookups-data` as lookups
  * Get/set `norm` in both lookups and `LexemeC`, serialize in lookups
  * Remove `cluster` and `prob` from `LexemesC`, get/set/serialize in
    lookups only
* Remove serialization of lexemes data as `vocab/lexemes.bin`
  * Remove `SerializedLexemeC`
  * Remove `Lexeme.to_bytes/from_bytes`
* Modify normalization exception loading:
  * Always create `Vocab.lookups` table `lexeme_norm` for
    normalization exceptions
  * Load base exceptions from `lang.norm_exceptions`, but load
    language-specific exceptions from lookups
  * Set `lex_attr_getter[NORM]` including new lookups table in
    `BaseDefaults.create_vocab()` and when deserializing `Vocab`
* Remove all cached lexemes when deserializing vocab to override
  existing normalizations with the new normalizations (as a replacement
  for the previous step that replaced all lexemes data with the
  deserialized data)

* Skip English normalization test

Skip English normalization test because the data is now in
`spacy-lookups-data`.

* Remove norm exceptions

Moved to spacy-lookups-data.

* Move norm exceptions test to spacy-lookups-data

* Load extra lookups from spacy-lookups-data lazily

Load extra lookups (currently for cluster and prob) lazily from the
entry point `lg_extra` as `Vocab.lookups_extra`.

* Skip creating lexeme cache on load

To improve model loading times, do not create the full lexeme cache when
loading. The lexemes will be created on demand when processing.

* Identify numeric values in Lexeme.set_attrs()

With the removal of a special case for `PROB`, also identify `float` to
avoid trying to convert it with the `StringStore`.

* Skip lexeme cache init in from_bytes

* Unskip and update lookups tests for python3.6+

* Update vocab pickle to include lookups_extra

* Update vocab serialization tests

Check strings rather than lexemes since lexemes aren't initialized
automatically, account for addition of "_SP".

* Re-skip lookups test because of python3.5

* Skip PROB/float values in Lexeme.set_attrs

* Convert is_oov from lexeme flag to lex in vectors

Instead of storing `is_oov` as a lexeme flag, `is_oov` reports whether
the lexeme has a vector.

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-05-19 16:59:14 +03:00
+								            if "lexeme_norm" not in self.vocab.lookups:
 								                self.vocab.lookups.add_table("lexeme_norm")
 								            norm_table = self.vocab.lookups.get_table("lexeme_norm")
 								            norm_table[self.c.orth] = self.vocab.strings[x]
-												Tidy up Lexeme and update docs

											
										
										
											2017-10-27 22:07:50 +03:00
+								            self.c.norm = x
-												* Tmp

											
										
										
											2015-08-22 23:04:34 +03:00
 								    property shape:
-												Tidy up Lexeme and update docs

											
										
										
											2017-10-27 22:07:50 +03:00
+								        """RETURNS (uint64): Transform of the word's string, to show
 								            orthographic features.
 								        """
 								        def __get__(self):
 								            return self.c.shape
 								        def __set__(self, attr_t x):
 								            self.c.shape = x
-												* Tmp

											
										
										
											2015-08-22 23:04:34 +03:00
 								    property prefix:
-												Tidy up Lexeme and update docs

											
										
										
											2017-10-27 22:07:50 +03:00
+								        """RETURNS (uint64): Length-N substring from the start of the word.
 								            Defaults to `N=1`.
 								        """
 								        def __get__(self):
 								            return self.c.prefix
 								        def __set__(self, attr_t x):
 								            self.c.prefix = x
-												* Tmp

											
										
										
											2015-08-22 23:04:34 +03:00
 								    property suffix:
-												Tidy up Lexeme and update docs

											
										
										
											2017-10-27 22:07:50 +03:00
+								        """RETURNS (uint64): Length-N substring from the end of the word.
 								            Defaults to `N=3`.
 								        """
 								        def __get__(self):
 								            return self.c.suffix
 								        def __set__(self, attr_t x):
 								            self.c.suffix = x
-												Remove whitespace

											
										
										
											2017-04-01 11:19:01 +03:00
-												* Add missing properties in Lexeme class

											
										
										
											2015-08-26 20:16:28 +03:00
+								    property cluster:
-												Tidy up Lexeme and update docs

											
										
										
											2017-10-27 22:07:50 +03:00
+								        """RETURNS (int): Brown cluster ID."""
 								        def __get__(self):
-												WIP: move more language data to config

											
										
										
											2020-07-22 16:59:37 +03:00
+								            cluster_table = self.vocab.lookups.get_table("lexeme_cluster", {})
-												Reduce stored lexemes data, move feats to lookups (#5238)

* Reduce stored lexemes data, move feats to lookups

* Move non-derivable lexemes features (`norm / cluster / prob`) to
`spacy-lookups-data` as lookups
  * Get/set `norm` in both lookups and `LexemeC`, serialize in lookups
  * Remove `cluster` and `prob` from `LexemeC`, get/set/serialize in
    lookups only
* Remove serialization of lexemes data as `vocab/lexemes.bin`
  * Remove `SerializedLexemeC`
  * Remove `Lexeme.to_bytes/from_bytes`
* Modify normalization exception loading:
  * Always create `Vocab.lookups` table `lexeme_norm` for
    normalization exceptions
  * Load base exceptions from `lang.norm_exceptions`, but load
    language-specific exceptions from lookups
  * Set `lex_attr_getter[NORM]` including new lookups table in
    `BaseDefaults.create_vocab()` and when deserializing `Vocab`
* Remove all cached lexemes when deserializing vocab to override
  existing normalizations with the new normalizations (as a replacement
  for the previous step that replaced all lexemes data with the
  deserialized data)

* Skip English normalization test

Skip English normalization test because the data is now in
`spacy-lookups-data`.

* Remove norm exceptions

Moved to spacy-lookups-data.

* Move norm exceptions test to spacy-lookups-data

* Load extra lookups from spacy-lookups-data lazily

Load extra lookups (currently for cluster and prob) lazily from the
entry point `lg_extra` as `Vocab.lookups_extra`.

* Skip creating lexeme cache on load

To improve model loading times, do not create the full lexeme cache when
loading. The lexemes will be created on demand when processing.

* Identify numeric values in Lexeme.set_attrs()

With the removal of a special case for `PROB`, also identify `float` to
avoid trying to convert it with the `StringStore`.

* Skip lexeme cache init in from_bytes

* Unskip and update lookups tests for python3.6+

* Update vocab pickle to include lookups_extra

* Update vocab serialization tests

Check strings rather than lexemes since lexemes aren't initialized
automatically, account for addition of "_SP".

* Re-skip lookups test because of python3.5

* Skip PROB/float values in Lexeme.set_attrs

* Convert is_oov from lexeme flag to lex in vectors

Instead of storing `is_oov` as a lexeme flag, `is_oov` reports whether
the lexeme has a vector.

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-05-19 16:59:14 +03:00
+								            return cluster_table.get(self.c.orth, 0)
-												Tidy up Lexeme and update docs

											
										
										
											2017-10-27 22:07:50 +03:00
-												Reduce stored lexemes data, move feats to lookups (#5238)

* Reduce stored lexemes data, move feats to lookups

* Move non-derivable lexemes features (`norm / cluster / prob`) to
`spacy-lookups-data` as lookups
  * Get/set `norm` in both lookups and `LexemeC`, serialize in lookups
  * Remove `cluster` and `prob` from `LexemeC`, get/set/serialize in
    lookups only
* Remove serialization of lexemes data as `vocab/lexemes.bin`
  * Remove `SerializedLexemeC`
  * Remove `Lexeme.to_bytes/from_bytes`
* Modify normalization exception loading:
  * Always create `Vocab.lookups` table `lexeme_norm` for
    normalization exceptions
  * Load base exceptions from `lang.norm_exceptions`, but load
    language-specific exceptions from lookups
  * Set `lex_attr_getter[NORM]` including new lookups table in
    `BaseDefaults.create_vocab()` and when deserializing `Vocab`
* Remove all cached lexemes when deserializing vocab to override
  existing normalizations with the new normalizations (as a replacement
  for the previous step that replaced all lexemes data with the
  deserialized data)

* Skip English normalization test

Skip English normalization test because the data is now in
`spacy-lookups-data`.

* Remove norm exceptions

Moved to spacy-lookups-data.

* Move norm exceptions test to spacy-lookups-data

* Load extra lookups from spacy-lookups-data lazily

Load extra lookups (currently for cluster and prob) lazily from the
entry point `lg_extra` as `Vocab.lookups_extra`.

* Skip creating lexeme cache on load

To improve model loading times, do not create the full lexeme cache when
loading. The lexemes will be created on demand when processing.

* Identify numeric values in Lexeme.set_attrs()

With the removal of a special case for `PROB`, also identify `float` to
avoid trying to convert it with the `StringStore`.

* Skip lexeme cache init in from_bytes

* Unskip and update lookups tests for python3.6+

* Update vocab pickle to include lookups_extra

* Update vocab serialization tests

Check strings rather than lexemes since lexemes aren't initialized
automatically, account for addition of "_SP".

* Re-skip lookups test because of python3.5

* Skip PROB/float values in Lexeme.set_attrs

* Convert is_oov from lexeme flag to lex in vectors

Instead of storing `is_oov` as a lexeme flag, `is_oov` reports whether
the lexeme has a vector.

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-05-19 16:59:14 +03:00
+								        def __set__(self, int x):
-												WIP: move more language data to config

											
										
										
											2020-07-22 16:59:37 +03:00
+								            cluster_table = self.vocab.lookups.get_table("lexeme_cluster", {})
-												Reduce stored lexemes data, move feats to lookups (#5238)

* Reduce stored lexemes data, move feats to lookups

* Move non-derivable lexemes features (`norm / cluster / prob`) to
`spacy-lookups-data` as lookups
  * Get/set `norm` in both lookups and `LexemeC`, serialize in lookups
  * Remove `cluster` and `prob` from `LexemeC`, get/set/serialize in
    lookups only
* Remove serialization of lexemes data as `vocab/lexemes.bin`
  * Remove `SerializedLexemeC`
  * Remove `Lexeme.to_bytes/from_bytes`
* Modify normalization exception loading:
  * Always create `Vocab.lookups` table `lexeme_norm` for
    normalization exceptions
  * Load base exceptions from `lang.norm_exceptions`, but load
    language-specific exceptions from lookups
  * Set `lex_attr_getter[NORM]` including new lookups table in
    `BaseDefaults.create_vocab()` and when deserializing `Vocab`
* Remove all cached lexemes when deserializing vocab to override
  existing normalizations with the new normalizations (as a replacement
  for the previous step that replaced all lexemes data with the
  deserialized data)

* Skip English normalization test

Skip English normalization test because the data is now in
`spacy-lookups-data`.

* Remove norm exceptions

Moved to spacy-lookups-data.

* Move norm exceptions test to spacy-lookups-data

* Load extra lookups from spacy-lookups-data lazily

Load extra lookups (currently for cluster and prob) lazily from the
entry point `lg_extra` as `Vocab.lookups_extra`.

* Skip creating lexeme cache on load

To improve model loading times, do not create the full lexeme cache when
loading. The lexemes will be created on demand when processing.

* Identify numeric values in Lexeme.set_attrs()

With the removal of a special case for `PROB`, also identify `float` to
avoid trying to convert it with the `StringStore`.

* Skip lexeme cache init in from_bytes

* Unskip and update lookups tests for python3.6+

* Update vocab pickle to include lookups_extra

* Update vocab serialization tests

Check strings rather than lexemes since lexemes aren't initialized
automatically, account for addition of "_SP".

* Re-skip lookups test because of python3.5

* Skip PROB/float values in Lexeme.set_attrs

* Convert is_oov from lexeme flag to lex in vectors

Instead of storing `is_oov` as a lexeme flag, `is_oov` reports whether
the lexeme has a vector.

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-05-19 16:59:14 +03:00
+								            cluster_table[self.c.orth] = x
-												Remove whitespace

											
										
										
											2017-04-01 11:19:01 +03:00
-												introduce lang field for LexemeC to hold language id
put noun_chunk logic into iterators.py for each language separately

											
										
										
											2016-03-10 15:01:34 +03:00
+								    property lang:
-												Tidy up Lexeme and update docs

											
										
										
											2017-10-27 22:07:50 +03:00
+								        """RETURNS (uint64): Language of the parent vocabulary."""
 								        def __get__(self):
 								            return self.c.lang
 								        def __set__(self, attr_t x):
 								            self.c.lang = x
-												introduce lang field for LexemeC to hold language id
put noun_chunk logic into iterators.py for each language separately

											
										
										
											2016-03-10 15:01:34 +03:00
-												* Add missing properties in Lexeme class

											
										
										
											2015-08-26 20:16:28 +03:00
+								    property prob:
-												Tidy up Lexeme and update docs

											
										
										
											2017-10-27 22:07:50 +03:00
+								        """RETURNS (float): Smoothed log probability estimate of the lexeme's
 								            type."""
 								        def __get__(self):
-												WIP: move more language data to config

											
										
										
											2020-07-22 16:59:37 +03:00
+								            prob_table = self.vocab.lookups.get_table("lexeme_prob", {})
 								            settings_table = self.vocab.lookups.get_table("lexeme_settings", {})
-												Reduce stored lexemes data, move feats to lookups (#5238)

* Reduce stored lexemes data, move feats to lookups

* Move non-derivable lexemes features (`norm / cluster / prob`) to
`spacy-lookups-data` as lookups
  * Get/set `norm` in both lookups and `LexemeC`, serialize in lookups
  * Remove `cluster` and `prob` from `LexemeC`, get/set/serialize in
    lookups only
* Remove serialization of lexemes data as `vocab/lexemes.bin`
  * Remove `SerializedLexemeC`
  * Remove `Lexeme.to_bytes/from_bytes`
* Modify normalization exception loading:
  * Always create `Vocab.lookups` table `lexeme_norm` for
    normalization exceptions
  * Load base exceptions from `lang.norm_exceptions`, but load
    language-specific exceptions from lookups
  * Set `lex_attr_getter[NORM]` including new lookups table in
    `BaseDefaults.create_vocab()` and when deserializing `Vocab`
* Remove all cached lexemes when deserializing vocab to override
  existing normalizations with the new normalizations (as a replacement
  for the previous step that replaced all lexemes data with the
  deserialized data)

* Skip English normalization test

Skip English normalization test because the data is now in
`spacy-lookups-data`.

* Remove norm exceptions

Moved to spacy-lookups-data.

* Move norm exceptions test to spacy-lookups-data

* Load extra lookups from spacy-lookups-data lazily

Load extra lookups (currently for cluster and prob) lazily from the
entry point `lg_extra` as `Vocab.lookups_extra`.

* Skip creating lexeme cache on load

To improve model loading times, do not create the full lexeme cache when
loading. The lexemes will be created on demand when processing.

* Identify numeric values in Lexeme.set_attrs()

With the removal of a special case for `PROB`, also identify `float` to
avoid trying to convert it with the `StringStore`.

* Skip lexeme cache init in from_bytes

* Unskip and update lookups tests for python3.6+

* Update vocab pickle to include lookups_extra

* Update vocab serialization tests

Check strings rather than lexemes since lexemes aren't initialized
automatically, account for addition of "_SP".

* Re-skip lookups test because of python3.5

* Skip PROB/float values in Lexeme.set_attrs

* Convert is_oov from lexeme flag to lex in vectors

Instead of storing `is_oov` as a lexeme flag, `is_oov` reports whether
the lexeme has a vector.

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-05-19 16:59:14 +03:00
+								            default_oov_prob = settings_table.get("oov_prob", -20.0)
 								            return prob_table.get(self.c.orth, default_oov_prob)
-												Tidy up Lexeme and update docs

											
										
										
											2017-10-27 22:07:50 +03:00
 								        def __set__(self, float x):
-												WIP: move more language data to config

											
										
										
											2020-07-22 16:59:37 +03:00
+								            prob_table = self.vocab.lookups.get_table("lexeme_prob", {})
-												Reduce stored lexemes data, move feats to lookups (#5238)

* Reduce stored lexemes data, move feats to lookups

* Move non-derivable lexemes features (`norm / cluster / prob`) to
`spacy-lookups-data` as lookups
  * Get/set `norm` in both lookups and `LexemeC`, serialize in lookups
  * Remove `cluster` and `prob` from `LexemeC`, get/set/serialize in
    lookups only
* Remove serialization of lexemes data as `vocab/lexemes.bin`
  * Remove `SerializedLexemeC`
  * Remove `Lexeme.to_bytes/from_bytes`
* Modify normalization exception loading:
  * Always create `Vocab.lookups` table `lexeme_norm` for
    normalization exceptions
  * Load base exceptions from `lang.norm_exceptions`, but load
    language-specific exceptions from lookups
  * Set `lex_attr_getter[NORM]` including new lookups table in
    `BaseDefaults.create_vocab()` and when deserializing `Vocab`
* Remove all cached lexemes when deserializing vocab to override
  existing normalizations with the new normalizations (as a replacement
  for the previous step that replaced all lexemes data with the
  deserialized data)

* Skip English normalization test

Skip English normalization test because the data is now in
`spacy-lookups-data`.

* Remove norm exceptions

Moved to spacy-lookups-data.

* Move norm exceptions test to spacy-lookups-data

* Load extra lookups from spacy-lookups-data lazily

Load extra lookups (currently for cluster and prob) lazily from the
entry point `lg_extra` as `Vocab.lookups_extra`.

* Skip creating lexeme cache on load

To improve model loading times, do not create the full lexeme cache when
loading. The lexemes will be created on demand when processing.

* Identify numeric values in Lexeme.set_attrs()

With the removal of a special case for `PROB`, also identify `float` to
avoid trying to convert it with the `StringStore`.

* Skip lexeme cache init in from_bytes

* Unskip and update lookups tests for python3.6+

* Update vocab pickle to include lookups_extra

* Update vocab serialization tests

Check strings rather than lexemes since lexemes aren't initialized
automatically, account for addition of "_SP".

* Re-skip lookups test because of python3.5

* Skip PROB/float values in Lexeme.set_attrs

* Convert is_oov from lexeme flag to lex in vectors

Instead of storing `is_oov` as a lexeme flag, `is_oov` reports whether
the lexeme has a vector.

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-05-19 16:59:14 +03:00
+								            prob_table[self.c.orth] = x
-												* Tmp

											
										
										
											2015-08-22 23:04:34 +03:00
 								    property lower_:
-												unicode -> str consistency

											
										
										
											2020-05-24 18:20:58 +03:00
+								        """RETURNS (str): Lowercase form of the word."""
-												Tidy up Lexeme and update docs

											
										
										
											2017-10-27 22:07:50 +03:00
+								        def __get__(self):
 								            return self.vocab.strings[self.c.lower]
-												Update Cython string types (#9143)

* Replace all basestring references with unicode

`basestring` was a compatibility type introduced by Cython to make
dealing with utf-8 strings in Python2 easier. In Python3 it is
equivalent to the unicode (or str) type.

I replaced all references to basestring with unicode, since that was
used elsewhere, but we could also just replace them with str, which
should also be equivalent.

All tests pass locally.

* Replace all references to unicode type with str

Since we only support python3 this is simpler.

* Remove all references to unicode type

This removes all references to the unicode type across the codebase and
replaces them with `str`, which makes it more drastic than the prior
commits. In order to make this work importing `unicode_literals` had to
be removed, and one explicit unicode literal also had to be removed (it
is unclear why this is necessary in Cython with language level 3, but
without doing it there were errors about implicit conversion).

When `unicode` is used as a type in comments it was also edited to be
`str`.

Additionally `coding: utf8` headers were removed from a few files.
											
										
										
											2021-09-13 18:02:17 +03:00
+								        def __set__(self, str x):
-												Tidy up Lexeme and update docs

											
										
										
											2017-10-27 22:07:50 +03:00
+								            self.c.lower = self.vocab.strings.add(x)
-												Remove whitespace

											
										
										
											2017-04-01 11:19:01 +03:00
-												* Tmp

											
										
										
											2015-08-22 23:04:34 +03:00
+								    property norm_:
-												fix 's typo's across code base (#8384)


											
										
										
											2021-06-15 11:57:08 +03:00
+								        """RETURNS (str): The lexeme's norm, i.e. a normalised form of the
-												Tidy up Lexeme and update docs

											
										
										
											2017-10-27 22:07:50 +03:00
+								            lexeme text.
 								        """
 								        def __get__(self):
 								            return self.vocab.strings[self.c.norm]
-												Update Cython string types (#9143)

* Replace all basestring references with unicode

`basestring` was a compatibility type introduced by Cython to make
dealing with utf-8 strings in Python2 easier. In Python3 it is
equivalent to the unicode (or str) type.

I replaced all references to basestring with unicode, since that was
used elsewhere, but we could also just replace them with str, which
should also be equivalent.

All tests pass locally.

* Replace all references to unicode type with str

Since we only support python3 this is simpler.

* Remove all references to unicode type

This removes all references to the unicode type across the codebase and
replaces them with `str`, which makes it more drastic than the prior
commits. In order to make this work importing `unicode_literals` had to
be removed, and one explicit unicode literal also had to be removed (it
is unclear why this is necessary in Cython with language level 3, but
without doing it there were errors about implicit conversion).

When `unicode` is used as a type in comments it was also edited to be
`str`.

Additionally `coding: utf8` headers were removed from a few files.
											
										
										
											2021-09-13 18:02:17 +03:00
+								        def __set__(self, str x):
-												Reduce stored lexemes data, move feats to lookups (#5238)

* Reduce stored lexemes data, move feats to lookups

* Move non-derivable lexemes features (`norm / cluster / prob`) to
`spacy-lookups-data` as lookups
  * Get/set `norm` in both lookups and `LexemeC`, serialize in lookups
  * Remove `cluster` and `prob` from `LexemeC`, get/set/serialize in
    lookups only
* Remove serialization of lexemes data as `vocab/lexemes.bin`
  * Remove `SerializedLexemeC`
  * Remove `Lexeme.to_bytes/from_bytes`
* Modify normalization exception loading:
  * Always create `Vocab.lookups` table `lexeme_norm` for
    normalization exceptions
  * Load base exceptions from `lang.norm_exceptions`, but load
    language-specific exceptions from lookups
  * Set `lex_attr_getter[NORM]` including new lookups table in
    `BaseDefaults.create_vocab()` and when deserializing `Vocab`
* Remove all cached lexemes when deserializing vocab to override
  existing normalizations with the new normalizations (as a replacement
  for the previous step that replaced all lexemes data with the
  deserialized data)

* Skip English normalization test

Skip English normalization test because the data is now in
`spacy-lookups-data`.

* Remove norm exceptions

Moved to spacy-lookups-data.

* Move norm exceptions test to spacy-lookups-data

* Load extra lookups from spacy-lookups-data lazily

Load extra lookups (currently for cluster and prob) lazily from the
entry point `lg_extra` as `Vocab.lookups_extra`.

* Skip creating lexeme cache on load

To improve model loading times, do not create the full lexeme cache when
loading. The lexemes will be created on demand when processing.

* Identify numeric values in Lexeme.set_attrs()

With the removal of a special case for `PROB`, also identify `float` to
avoid trying to convert it with the `StringStore`.

* Skip lexeme cache init in from_bytes

* Unskip and update lookups tests for python3.6+

* Update vocab pickle to include lookups_extra

* Update vocab serialization tests

Check strings rather than lexemes since lexemes aren't initialized
automatically, account for addition of "_SP".

* Re-skip lookups test because of python3.5

* Skip PROB/float values in Lexeme.set_attrs

* Convert is_oov from lexeme flag to lex in vectors

Instead of storing `is_oov` as a lexeme flag, `is_oov` reports whether
the lexeme has a vector.

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-05-19 16:59:14 +03:00
+								            self.norm = self.vocab.strings.add(x)
-												Remove whitespace

											
										
										
											2017-04-01 11:19:01 +03:00
-												* Tmp

											
										
										
											2015-08-22 23:04:34 +03:00
+								    property shape_:
-												unicode -> str consistency

											
										
										
											2020-05-24 18:20:58 +03:00
+								        """RETURNS (str): Transform of the word's string, to show
-												Tidy up Lexeme and update docs

											
										
										
											2017-10-27 22:07:50 +03:00
+								            orthographic features.
 								        """
 								        def __get__(self):
 								            return self.vocab.strings[self.c.shape]
-												Update Cython string types (#9143)

* Replace all basestring references with unicode

`basestring` was a compatibility type introduced by Cython to make
dealing with utf-8 strings in Python2 easier. In Python3 it is
equivalent to the unicode (or str) type.

I replaced all references to basestring with unicode, since that was
used elsewhere, but we could also just replace them with str, which
should also be equivalent.

All tests pass locally.

* Replace all references to unicode type with str

Since we only support python3 this is simpler.

* Remove all references to unicode type

This removes all references to the unicode type across the codebase and
replaces them with `str`, which makes it more drastic than the prior
commits. In order to make this work importing `unicode_literals` had to
be removed, and one explicit unicode literal also had to be removed (it
is unclear why this is necessary in Cython with language level 3, but
without doing it there were errors about implicit conversion).

When `unicode` is used as a type in comments it was also edited to be
`str`.

Additionally `coding: utf8` headers were removed from a few files.
											
										
										
											2021-09-13 18:02:17 +03:00
+								        def __set__(self, str x):
-												Tidy up Lexeme and update docs

											
										
										
											2017-10-27 22:07:50 +03:00
+								            self.c.shape = self.vocab.strings.add(x)
-												* Add a has_repvec property to Lexeme, and a check function to check flags

											
										
										
											2015-02-07 16:42:44 +03:00
-												* Tmp

											
										
										
											2015-08-22 23:04:34 +03:00
+								    property prefix_:
-												unicode -> str consistency

											
										
										
											2020-05-24 18:20:58 +03:00
+								        """RETURNS (str): Length-N substring from the start of the word.
-												Tidy up Lexeme and update docs

											
										
										
											2017-10-27 22:07:50 +03:00
+								            Defaults to `N=1`.
 								        """
 								        def __get__(self):
 								            return self.vocab.strings[self.c.prefix]
-												Update Cython string types (#9143)

* Replace all basestring references with unicode

`basestring` was a compatibility type introduced by Cython to make
dealing with utf-8 strings in Python2 easier. In Python3 it is
equivalent to the unicode (or str) type.

I replaced all references to basestring with unicode, since that was
used elsewhere, but we could also just replace them with str, which
should also be equivalent.

All tests pass locally.

* Replace all references to unicode type with str

Since we only support python3 this is simpler.

* Remove all references to unicode type

This removes all references to the unicode type across the codebase and
replaces them with `str`, which makes it more drastic than the prior
commits. In order to make this work importing `unicode_literals` had to
be removed, and one explicit unicode literal also had to be removed (it
is unclear why this is necessary in Cython with language level 3, but
without doing it there were errors about implicit conversion).

When `unicode` is used as a type in comments it was also edited to be
`str`.

Additionally `coding: utf8` headers were removed from a few files.
											
										
										
											2021-09-13 18:02:17 +03:00
+								        def __set__(self, str x):
-												Tidy up Lexeme and update docs

											
										
										
											2017-10-27 22:07:50 +03:00
+								            self.c.prefix = self.vocab.strings.add(x)
-												* Add a has_repvec property to Lexeme, and a check function to check flags

											
										
										
											2015-02-07 16:42:44 +03:00
-												* Tmp

											
										
										
											2015-08-22 23:04:34 +03:00
+								    property suffix_:
-												unicode -> str consistency

											
										
										
											2020-05-24 18:20:58 +03:00
+								        """RETURNS (str): Length-N substring from the end of the word.
-												Tidy up Lexeme and update docs

											
										
										
											2017-10-27 22:07:50 +03:00
+								            Defaults to `N=3`.
 								        """
 								        def __get__(self):
 								            return self.vocab.strings[self.c.suffix]
-												Update Cython string types (#9143)

* Replace all basestring references with unicode

`basestring` was a compatibility type introduced by Cython to make
dealing with utf-8 strings in Python2 easier. In Python3 it is
equivalent to the unicode (or str) type.

I replaced all references to basestring with unicode, since that was
used elsewhere, but we could also just replace them with str, which
should also be equivalent.

All tests pass locally.

* Replace all references to unicode type with str

Since we only support python3 this is simpler.

* Remove all references to unicode type

This removes all references to the unicode type across the codebase and
replaces them with `str`, which makes it more drastic than the prior
commits. In order to make this work importing `unicode_literals` had to
be removed, and one explicit unicode literal also had to be removed (it
is unclear why this is necessary in Cython with language level 3, but
without doing it there were errors about implicit conversion).

When `unicode` is used as a type in comments it was also edited to be
`str`.

Additionally `coding: utf8` headers were removed from a few files.
											
										
										
											2021-09-13 18:02:17 +03:00
+								        def __set__(self, str x):
-												Tidy up Lexeme and update docs

											
										
										
											2017-10-27 22:07:50 +03:00
+								            self.c.suffix = self.vocab.strings.add(x)
-												* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects

											
										
										
											2015-07-26 17:37:16 +03:00
-												introduce lang field for LexemeC to hold language id
put noun_chunk logic into iterators.py for each language separately

											
										
										
											2016-03-10 15:01:34 +03:00
+								    property lang_:
-												unicode -> str consistency

											
										
										
											2020-05-24 18:20:58 +03:00
+								        """RETURNS (str): Language of the parent vocabulary."""
-												Tidy up Lexeme and update docs

											
										
										
											2017-10-27 22:07:50 +03:00
+								        def __get__(self):
 								            return self.vocab.strings[self.c.lang]
-												Update Cython string types (#9143)

* Replace all basestring references with unicode

`basestring` was a compatibility type introduced by Cython to make
dealing with utf-8 strings in Python2 easier. In Python3 it is
equivalent to the unicode (or str) type.

I replaced all references to basestring with unicode, since that was
used elsewhere, but we could also just replace them with str, which
should also be equivalent.

All tests pass locally.

* Replace all references to unicode type with str

Since we only support python3 this is simpler.

* Remove all references to unicode type

This removes all references to the unicode type across the codebase and
replaces them with `str`, which makes it more drastic than the prior
commits. In order to make this work importing `unicode_literals` had to
be removed, and one explicit unicode literal also had to be removed (it
is unclear why this is necessary in Cython with language level 3, but
without doing it there were errors about implicit conversion).

When `unicode` is used as a type in comments it was also edited to be
`str`.

Additionally `coding: utf8` headers were removed from a few files.
											
										
										
											2021-09-13 18:02:17 +03:00
+								        def __set__(self, str x):
-												Tidy up Lexeme and update docs

											
										
										
											2017-10-27 22:07:50 +03:00
+								            self.c.lang = self.vocab.strings.add(x)
-												introduce lang field for LexemeC to hold language id
put noun_chunk logic into iterators.py for each language separately

											
										
										
											2016-03-10 15:01:34 +03:00
-												* Add missing properties in Lexeme class

											
										
										
											2015-08-26 20:16:28 +03:00
+								    property flags:
-												Tidy up Lexeme and update docs

											
										
										
											2017-10-27 22:07:50 +03:00
+								        """RETURNS (uint64): Container of the lexeme's binary flags."""
 								        def __get__(self):
 								            return self.c.flags
 								        def __set__(self, flags_t x):
 								            self.c.flags = x
-												* Add missing properties in Lexeme class

											
										
										
											2015-08-26 20:16:28 +03:00
-												Reduce stored lexemes data, move feats to lookups (#5238)

* Reduce stored lexemes data, move feats to lookups

* Move non-derivable lexemes features (`norm / cluster / prob`) to
`spacy-lookups-data` as lookups
  * Get/set `norm` in both lookups and `LexemeC`, serialize in lookups
  * Remove `cluster` and `prob` from `LexemeC`, get/set/serialize in
    lookups only
* Remove serialization of lexemes data as `vocab/lexemes.bin`
  * Remove `SerializedLexemeC`
  * Remove `Lexeme.to_bytes/from_bytes`
* Modify normalization exception loading:
  * Always create `Vocab.lookups` table `lexeme_norm` for
    normalization exceptions
  * Load base exceptions from `lang.norm_exceptions`, but load
    language-specific exceptions from lookups
  * Set `lex_attr_getter[NORM]` including new lookups table in
    `BaseDefaults.create_vocab()` and when deserializing `Vocab`
* Remove all cached lexemes when deserializing vocab to override
  existing normalizations with the new normalizations (as a replacement
  for the previous step that replaced all lexemes data with the
  deserialized data)

* Skip English normalization test

Skip English normalization test because the data is now in
`spacy-lookups-data`.

* Remove norm exceptions

Moved to spacy-lookups-data.

* Move norm exceptions test to spacy-lookups-data

* Load extra lookups from spacy-lookups-data lazily

Load extra lookups (currently for cluster and prob) lazily from the
entry point `lg_extra` as `Vocab.lookups_extra`.

* Skip creating lexeme cache on load

To improve model loading times, do not create the full lexeme cache when
loading. The lexemes will be created on demand when processing.

* Identify numeric values in Lexeme.set_attrs()

With the removal of a special case for `PROB`, also identify `float` to
avoid trying to convert it with the `StringStore`.

* Skip lexeme cache init in from_bytes

* Unskip and update lookups tests for python3.6+

* Update vocab pickle to include lookups_extra

* Update vocab serialization tests

Check strings rather than lexemes since lexemes aren't initialized
automatically, account for addition of "_SP".

* Re-skip lookups test because of python3.5

* Skip PROB/float values in Lexeme.set_attrs

* Convert is_oov from lexeme flag to lex in vectors

Instead of storing `is_oov` as a lexeme flag, `is_oov` reports whether
the lexeme has a vector.

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-05-19 16:59:14 +03:00
+								    @property
 								    def is_oov(self):
-												Tidy up Lexeme and update docs

											
										
										
											2017-10-27 22:07:50 +03:00
+								        """RETURNS (bool): Whether the lexeme is out-of-vocabulary."""
-												Fix polarity of Token.is_oov and Lexeme.is_oov (#5634)

Fix `Token.is_oov` and `Lexeme.is_oov` so they return `True` when the
lexeme does **not** have a vector.
											
										
										
											2020-06-23 14:29:51 +03:00
+								        return self.orth not in self.vocab.vectors
-												* Add is_oov property, and fix up handling of attributes

											
										
										
											2015-07-27 02:50:06 +03:00
-												* Add is_stop to Python API

											
										
										
											2015-09-14 11:25:40 +03:00
+								    property is_stop:
-												Tidy up Lexeme and update docs

											
										
										
											2017-10-27 22:07:50 +03:00
+								        """RETURNS (bool): Whether the lexeme is a stop word."""
 								        def __get__(self):
 								            return Lexeme.c_check_flag(self.c, IS_STOP)
 								        def __set__(self, bint x):
 								            Lexeme.c_set_flag(self.c, IS_STOP, x)
-												* Add is_stop to Python API

											
										
										
											2015-09-14 11:25:40 +03:00
-												* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects

											
										
										
											2015-07-26 17:37:16 +03:00
+								    property is_alpha:
-												Alphanumeric -> alphabetic [ci skip]

see ines/spacy-course#38

											
										
										
											2019-10-06 14:30:01 +03:00
+								        """RETURNS (bool): Whether the lexeme consists of alphabetic
-												Tidy up Lexeme and update docs

											
										
										
											2017-10-27 22:07:50 +03:00
+								            characters. Equivalent to `lexeme.text.isalpha()`.
 								        """
 								        def __get__(self):
 								            return Lexeme.c_check_flag(self.c, IS_ALPHA)
 								        def __set__(self, bint x):
 								            Lexeme.c_set_flag(self.c, IS_ALPHA, x)
-												Remove whitespace

											
										
										
											2017-04-01 11:19:01 +03:00
-												* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects

											
										
										
											2015-07-26 17:37:16 +03:00
+								    property is_ascii:
-												Tidy up Lexeme and update docs

											
										
										
											2017-10-27 22:07:50 +03:00
+								        """RETURNS (bool): Whether the lexeme consists of ASCII characters.
 								            Equivalent to `all(ord(c) < 128 for c in lexeme.text)`.
 								        """
 								        def __get__(self):
 								            return Lexeme.c_check_flag(self.c, IS_ASCII)
 								        def __set__(self, bint x):
 								            Lexeme.c_set_flag(self.c, IS_ASCII, x)
-												* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects

											
										
										
											2015-07-26 17:37:16 +03:00
 								    property is_digit:
-												Tidy up Lexeme and update docs

											
										
										
											2017-10-27 22:07:50 +03:00
+								        """RETURNS (bool): Whether the lexeme consists of digits. Equivalent
 								            to `lexeme.text.isdigit()`.
 								        """
 								        def __get__(self):
 								            return Lexeme.c_check_flag(self.c, IS_DIGIT)
 								        def __set__(self, bint x):
 								            Lexeme.c_set_flag(self.c, IS_DIGIT, x)
-												* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects

											
										
										
											2015-07-26 17:37:16 +03:00
 								    property is_lower:
-												Tidy up Lexeme and update docs

											
										
										
											2017-10-27 22:07:50 +03:00
+								        """RETURNS (bool): Whether the lexeme is in lowercase. Equivalent to
 								            `lexeme.text.islower()`.
 								        """
 								        def __get__(self):
 								            return Lexeme.c_check_flag(self.c, IS_LOWER)
 								        def __set__(self, bint x):
 								            Lexeme.c_set_flag(self.c, IS_LOWER, x)
 								    property is_upper:
 								        """RETURNS (bool): Whether the lexeme is in uppercase. Equivalent to
 								            `lexeme.text.isupper()`.
 								            The value is readable and writable: reading checks the IS_UPPER
 								            flag via `Lexeme.c_check_flag`, assigning writes it via
 								            `Lexeme.c_set_flag`.
 								        """
 								        def __get__(self):
 								            # Query the IS_UPPER flag on this lexeme's underlying C data.
 								            return Lexeme.c_check_flag(self.c, IS_UPPER)
 								        def __set__(self, bint x):
 								            # `x` is declared `bint`, so the assigned value is coerced to a
 								            # C boolean before being passed to the flag setter.
 								            Lexeme.c_set_flag(self.c, IS_UPPER, x)
-												* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects

											
										
										
											2015-07-26 17:37:16 +03:00
 								    property is_title:
-												Tidy up Lexeme and update docs

											
										
										
											2017-10-27 22:07:50 +03:00
+								        """RETURNS (bool): Whether the lexeme is in titlecase. Equivalent to
 								            `lexeme.text.istitle()`.
 								        """
 								        def __get__(self):
 								            return Lexeme.c_check_flag(self.c, IS_TITLE)
 								        def __set__(self, bint x):
 								            Lexeme.c_set_flag(self.c, IS_TITLE, x)
-												* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects

											
										
										
											2015-07-26 17:37:16 +03:00
 								    property is_punct:
-												Tidy up Lexeme and update docs

											
										
										
											2017-10-27 22:07:50 +03:00
+								        """RETURNS (bool): Whether the lexeme is punctuation."""
 								        def __get__(self):
 								            return Lexeme.c_check_flag(self.c, IS_PUNCT)
 								        def __set__(self, bint x):
 								            Lexeme.c_set_flag(self.c, IS_PUNCT, x)
-												* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects

											
										
										
											2015-07-26 17:37:16 +03:00
-												Remove whitespace

											
										
										
											2017-04-01 11:19:01 +03:00
+								    property is_space:
-												Tidy up Lexeme and update docs

											
										
										
											2017-10-27 22:07:50 +03:00
+								        """RETURNS (bool): Whether the lexeme consists of whitespace characters.
 								            Equivalent to `lexeme.text.isspace()`.
 								        """
 								        def __get__(self):
 								            return Lexeme.c_check_flag(self.c, IS_SPACE)
 								        def __set__(self, bint x):
 								            Lexeme.c_set_flag(self.c, IS_SPACE, x)
-												* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects

											
										
										
											2015-07-26 17:37:16 +03:00
-												Remove whitespace

											
										
										
											2017-04-01 11:19:01 +03:00
+								    property is_bracket:
-												Tidy up Lexeme and update docs

											
										
										
											2017-10-27 22:07:50 +03:00
+								        """RETURNS (bool): Whether the lexeme is a bracket."""
 								        def __get__(self):
 								            return Lexeme.c_check_flag(self.c, IS_BRACKET)
 								        def __set__(self, bint x):
 								            Lexeme.c_set_flag(self.c, IS_BRACKET, x)
-												* Add Python hooks for is_bracket/is_quote/is_left_punct/is_right_punct

											
										
										
											2016-02-04 15:04:16 +03:00
-												Remove whitespace

											
										
										
											2017-04-01 11:19:01 +03:00
+								    property is_quote:
-												Tidy up Lexeme and update docs

											
										
										
											2017-10-27 22:07:50 +03:00
+								        """RETURNS (bool): Whether the lexeme is a quotation mark."""
 								        def __get__(self):
 								            return Lexeme.c_check_flag(self.c, IS_QUOTE)
 								        def __set__(self, bint x):
 								            Lexeme.c_set_flag(self.c, IS_QUOTE, x)
-												* Add Python hooks for is_bracket/is_quote/is_left_punct/is_right_punct

											
										
										
											2016-02-04 15:04:16 +03:00
-												Remove whitespace

											
										
										
											2017-04-01 11:19:01 +03:00
+								    property is_left_punct:
-												reduce memory load when reading all vectors from file (#6945)

* reduce memory load when reading all vectors from file

* one more small typo fix
											
										
										
											2021-02-07 03:05:43 +03:00
+								        """RETURNS (bool): Whether the lexeme is left punctuation, e.g. (."""
-												Tidy up Lexeme and update docs

											
										
										
											2017-10-27 22:07:50 +03:00
+								        def __get__(self):
 								            return Lexeme.c_check_flag(self.c, IS_LEFT_PUNCT)
 								        def __set__(self, bint x):
 								            Lexeme.c_set_flag(self.c, IS_LEFT_PUNCT, x)
-												* Add Python hooks for is_bracket/is_quote/is_left_punct/is_right_punct

											
										
										
											2016-02-04 15:04:16 +03:00
-												Remove whitespace

											
										
										
											2017-04-01 11:19:01 +03:00
+								    property is_right_punct:
-												Tidy up Lexeme and update docs

											
										
										
											2017-10-27 22:07:50 +03:00
+								        """RETURNS (bool): Whether the lexeme is right punctuation, e.g. )."""
 								        def __get__(self):
 								            return Lexeme.c_check_flag(self.c, IS_RIGHT_PUNCT)
 								        def __set__(self, bint x):
 								            Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, x)
-												* Add Python hooks for is_bracket/is_quote/is_left_punct/is_right_punct

											
										
										
											2016-02-04 15:04:16 +03:00
-												added new lexical feat to lexeme

											
										
										
											2018-02-11 20:51:48 +03:00
+								    property is_currency:
 								        """RETURNS (bool): Whether the lexeme is a currency symbol, e.g. $, €."""
 								        def __get__(self):
 								            # Query the IS_CURRENCY flag on this lexeme's underlying C data.
 								            return Lexeme.c_check_flag(self.c, IS_CURRENCY)
 								        def __set__(self, bint x):
 								            # `x` is declared `bint`, so the assigned value is coerced to a
 								            # C boolean before being passed to the flag setter.
 								            Lexeme.c_set_flag(self.c, IS_CURRENCY, x)
-												* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects

											
										
										
											2015-07-26 17:37:16 +03:00
+								    property like_url:
-												Tidy up Lexeme and update docs

											
										
										
											2017-10-27 22:07:50 +03:00
+								        """RETURNS (bool): Whether the lexeme resembles a URL."""
 								        def __get__(self):
 								            return Lexeme.c_check_flag(self.c, LIKE_URL)
 								        def __set__(self, bint x):
 								            Lexeme.c_set_flag(self.c, LIKE_URL, x)
-												Remove whitespace

											
										
										
											2017-04-01 11:19:01 +03:00
-												* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects

											
										
										
											2015-07-26 17:37:16 +03:00
+								    property like_num:
-												Tidy up Lexeme and update docs

											
										
										
											2017-10-27 22:07:50 +03:00
+								        """RETURNS (bool): Whether the lexeme represents a number, e.g. "10.9",
 								            "10", "ten", etc.
 								        """
 								        def __get__(self):
 								            return Lexeme.c_check_flag(self.c, LIKE_NUM)
 								        def __set__(self, bint x):
 								            Lexeme.c_set_flag(self.c, LIKE_NUM, x)
-												* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects

											
										
										
											2015-07-26 17:37:16 +03:00
 								    property like_email:
-												Tidy up Lexeme and update docs

											
										
										
											2017-10-27 22:07:50 +03:00
+								        """RETURNS (bool): Whether the lexeme resembles an email address."""
 								        def __get__(self):
 								            return Lexeme.c_check_flag(self.c, LIKE_EMAIL)
 								        def __set__(self, bint x):
 								            Lexeme.c_set_flag(self.c, LIKE_EMAIL, x)