spaCy/spacy/lexeme.pyx

# cython: embedsignature=True
from libc.math cimport sqrt
from cpython.ref cimport Py_INCREF
from cymem.cymem cimport Pool
from murmurhash.mrmr cimport hash64

# Compiler crashes on memory view coercion without this. Should report bug.
from cython.view cimport array as cvarray
cimport numpy as np
np.import_array()


from libc.string cimport memset

from .orth cimport word_shape
from .typedefs cimport attr_t, flags_t
import numpy

from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
from .attrs cimport IS_BRACKET
from .attrs cimport IS_QUOTE
from .attrs cimport IS_LEFT_PUNCT
from .attrs cimport IS_RIGHT_PUNCT
from .attrs cimport IS_OOV


# Zero-fill EMPTY_LEXEME (sizeof(LexemeC) bytes) when this module is
# initialised, so every field of it starts out as 0.
# NOTE(review): EMPTY_LEXEME is not declared in this file -- presumably it
# lives in the matching .pxd; confirm.
memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))


cdef class Lexeme:
    """An entry in the vocabulary.  A Lexeme has no string context --- it's a
    word-type, as opposed to a word token.  It therefore has no part-of-speech
    tag, dependency parse, or lemma (lemmatization depends on the part-of-speech
    tag).
    """
    def __init__(self, Vocab vocab, int orth):
        """Create a Lexeme object.

        Arguments:
            vocab (Vocab): The parent vocabulary
            orth (int): The orth id of the lexeme.
        Returns (Lexeme): The newly constructed object.
        """
        self.vocab = vocab
        self.orth = orth
        # Point at the struct the vocabulary hands back for this orth ID.
        self.c = <LexemeC*><void*>vocab.get_by_orth(vocab.mem, orth)
        # Internal sanity check: the struct must describe the requested ID.
        assert self.c.orth == orth

    def __richcmp__(self, other, int op):
        """Compare this lexeme with another Lexeme, an int or a str.

        Lexemes and ints are compared by orth ID; strings are compared
        against the lexeme's text (`orth_`).

        Arguments:
            other: The object to compare with.
            op (int): The rich-comparison opcode (0: <, 1: <=, 2: ==,
                3: !=, 4: >, 5: >=).
        Returns: The result of the comparison.
        """
        if isinstance(other, Lexeme):
            a = self.orth
            b = other.orth
        elif isinstance(other, int):
            a = self.orth
            b = other
        elif isinstance(other, str):
            a = self.orth_
            b = other
        else:
            # Unsupported operand type: compare the constants 0 and 1 instead,
            # so == is False, != is True, < and <= are True, > and >= are
            # False, regardless of the lexeme's value.
            a = 0
            b = 1
        if op == 2: # ==
            return a == b
        elif op == 3: # !=
            return a != b
        elif op == 0: # <
            return a < b
        elif op == 1: # <=
            return a <= b
        elif op == 4: # >
            return a > b
        elif op == 5: # >=
            return a >= b
        else:
            raise NotImplementedError(op)

    def __hash__(self):
        # Hash on the orth ID stored in the underlying struct.
        return self.c.orth

    def set_flag(self, attr_id_t flag_id, bint value):
        """Change the value of a boolean flag.

        Arguments:
            flag_id (int): The attribute ID of the flag to set.
            value (bool): The new value of the flag.
        """
        Lexeme.c_set_flag(self.c, flag_id, value)

    def check_flag(self, attr_id_t flag_id):
        """Check the value of a boolean flag.

        Arguments:
            flag_id (int): The attribute ID of the flag to query.
        Returns (bool): The value of the flag.
        """
        # Normalise whatever c_check_flag returns to a Python bool.
        return bool(Lexeme.c_check_flag(self.c, flag_id))

    def similarity(self, other):
        """Compute a semantic similarity estimate. Defaults to cosine over vectors.

        Arguments:
            other:
                The object to compare with. By default, accepts Doc, Span,
                Token and Lexeme objects.
        Returns:
            score (float): A scalar similarity score. Higher is more similar.
        """
        # A zero norm on either side would divide by zero below; report 0.0.
        if self.vector_norm == 0 or other.vector_norm == 0:
            return 0.0
        return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)

    property has_vector:
        # True if any of the first `vocab.vectors_length` components of the
        # stored vector is non-zero, False otherwise (including length 0).
        def __get__(self):
            cdef int i
            for i in range(self.vocab.vectors_length):
                if self.c.vector[i] != 0:
                    return True
            return False

    property vector_norm:
        # The L2 norm cached in the struct. Assigning to `vector` refreshes
        # it; it can also be overwritten directly here.
        def __get__(self):
            return self.c.l2_norm

        def __set__(self, float value):
            self.c.l2_norm = value

    property vector:
        def __get__(self):
            cdef int length = self.vocab.vectors_length
            if length == 0:
                raise ValueError(
                    "Word vectors set to length 0. This may be because the "
                    "data is not installed. If you haven't already, run"
                    "\npython -m spacy download %s\n"
                    "to install the data." % self.vocab.lang
                )

            # Expose the C float array as a numpy array of `length` items.
            # NOTE(review): presumably this shares memory with the struct
            # rather than copying -- confirm before mutating the result.
            vector_view = <float[:length,]>self.c.vector
            return numpy.asarray(vector_view)

        def __set__(self, vector):
            cdef int i
            cdef float value
            cdef double norm = 0.0
            cdef int expected = self.vocab.vectors_length
            # Explicit check rather than `assert`: assertions can be disabled
            # (e.g. `python -O`), and the loop below writes straight into a C
            # array, so a wrong-length input must never get through.
            if len(vector) != expected:
                raise ValueError(
                    "Cannot assign vector of length %d to a Lexeme: the "
                    "vocabulary's vectors have length %d."
                    % (len(vector), expected)
                )
            for i, value in enumerate(vector):
                self.c.vector[i] = value
                norm += value * value
            # Keep the cached norm consistent with the new components.
            self.c.l2_norm = sqrt(norm)

    property rank:
        # Read-only view of the struct's `id` field.
        def __get__(self):
            return self.c.id

    property repvec:
        # Former name of `vector`; always raises to point callers at it.
        def __get__(self):
            raise AttributeError("lex.repvec has been renamed to lex.vector")

    property sentiment:
        def __get__(self):
            return self.c.sentiment
        def __set__(self, float sentiment):
            self.c.sentiment = sentiment

    property orth_:
        # The text of the lexeme: the orth ID looked up in vocab.strings.
        def __get__(self):
            return self.vocab.strings[self.c.orth]

    # ------------------------------------------------------------------
    # Plain accessors: each reads/writes the same-named field of the
    # underlying LexemeC struct.
    # ------------------------------------------------------------------

    property lower:
        def __get__(self): return self.c.lower
        def __set__(self, int x): self.c.lower = x

    property norm:
        def __get__(self): return self.c.norm
        def __set__(self, int x): self.c.norm = x

    property shape:
        def __get__(self): return self.c.shape
        def __set__(self, int x): self.c.shape = x

    property prefix:
        def __get__(self): return self.c.prefix
        def __set__(self, int x): self.c.prefix = x

    property suffix:
        def __get__(self): return self.c.suffix
        def __set__(self, int x): self.c.suffix = x

    property cluster:
        def __get__(self): return self.c.cluster
        def __set__(self, int x): self.c.cluster = x

    property lang:
        def __get__(self): return self.c.lang
        def __set__(self, int x): self.c.lang = x

    property prob:
        def __get__(self): return self.c.prob
        def __set__(self, float x): self.c.prob = x

    # ------------------------------------------------------------------
    # String variants of the accessors above: getters look the stored ID
    # up in vocab.strings, setters store what vocab.strings[x] returns for
    # the given unicode string.
    # ------------------------------------------------------------------

    property lower_:
        def __get__(self): return self.vocab.strings[self.c.lower]
        def __set__(self, unicode x): self.c.lower = self.vocab.strings[x]

    property norm_:
        def __get__(self): return self.vocab.strings[self.c.norm]
        def __set__(self, unicode x): self.c.norm = self.vocab.strings[x]

    property shape_:
        def __get__(self): return self.vocab.strings[self.c.shape]
        def __set__(self, unicode x): self.c.shape = self.vocab.strings[x]

    property prefix_:
        def __get__(self): return self.vocab.strings[self.c.prefix]
        def __set__(self, unicode x): self.c.prefix = self.vocab.strings[x]

    property suffix_:
        def __get__(self): return self.vocab.strings[self.c.suffix]
        def __set__(self, unicode x): self.c.suffix = self.vocab.strings[x]

    property lang_:
        def __get__(self): return self.vocab.strings[self.c.lang]
        def __set__(self, unicode x): self.c.lang = self.vocab.strings[x]

    property flags:
        # The raw `flags` field of the struct, read or replaced as a whole.
        def __get__(self): return self.c.flags
        def __set__(self, flags_t x): self.c.flags = x

    # ------------------------------------------------------------------
    # Boolean flags: each property checks/sets one flag, identified by its
    # attribute ID, through Lexeme.c_check_flag / Lexeme.c_set_flag.
    # ------------------------------------------------------------------

    property is_oov:
        def __get__(self): return Lexeme.c_check_flag(self.c, IS_OOV)
        def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_OOV, x)

    property is_stop:
        def __get__(self): return Lexeme.c_check_flag(self.c, IS_STOP)
        def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_STOP, x)

    property is_alpha:
        def __get__(self): return Lexeme.c_check_flag(self.c, IS_ALPHA)
        def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_ALPHA, x)

    property is_ascii:
        def __get__(self): return Lexeme.c_check_flag(self.c, IS_ASCII)
        def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_ASCII, x)

    property is_digit:
        def __get__(self): return Lexeme.c_check_flag(self.c, IS_DIGIT)
        def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_DIGIT, x)

    property is_lower:
        def __get__(self): return Lexeme.c_check_flag(self.c, IS_LOWER)
        def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_LOWER, x)

    property is_title:
        def __get__(self): return Lexeme.c_check_flag(self.c, IS_TITLE)
        def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_TITLE, x)

    property is_punct:
        def __get__(self): return Lexeme.c_check_flag(self.c, IS_PUNCT)
        def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_PUNCT, x)

    property is_space:
        def __get__(self): return Lexeme.c_check_flag(self.c, IS_SPACE)
        def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_SPACE, x)

    property is_bracket:
        def __get__(self): return Lexeme.c_check_flag(self.c, IS_BRACKET)
        def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_BRACKET, x)

    property is_quote:
        def __get__(self): return Lexeme.c_check_flag(self.c, IS_QUOTE)
        def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_QUOTE, x)

    property is_left_punct:
        def __get__(self): return Lexeme.c_check_flag(self.c, IS_LEFT_PUNCT)
        def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_LEFT_PUNCT, x)

    property is_right_punct:
        def __get__(self): return Lexeme.c_check_flag(self.c, IS_RIGHT_PUNCT)
        def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, x)

    property like_url:
        def __get__(self): return Lexeme.c_check_flag(self.c, LIKE_URL)
        def __set__(self, bint x): Lexeme.c_set_flag(self.c, LIKE_URL, x)

    property like_num:
        def __get__(self): return Lexeme.c_check_flag(self.c, LIKE_NUM)
        def __set__(self, bint x): Lexeme.c_set_flag(self.c, LIKE_NUM, x)

    property like_email:
        def __get__(self): return Lexeme.c_check_flag(self.c, LIKE_EMAIL)
        def __set__(self, bint x): Lexeme.c_set_flag(self.c, LIKE_EMAIL, x)
* Tests passing after refactor. API has obvious warts, particularly in Token and Lexeme 2015-01-14 16:33:16 +03:00			`# cython: embedsignature=True`
Fix vector_norm when vector is assigned to Lexeme. 2016-10-23 15:23:56 +03:00			`from libc.math cimport sqrt`
* Upd Tokens to use vector, with bounds checking. 2014-09-15 05:22:40 +04:00			`from cpython.ref cimport Py_INCREF`
* Switch from own memory class to cymem, in pip 2014-09-18 01:09:24 +04:00			`from cymem.cymem cimport Pool`
* Rewriting Lexeme serialization. 2014-10-29 15:19:38 +03:00			`from murmurhash.mrmr cimport hash64`
* Upd Tokens to use vector, with bounds checking. 2014-09-15 05:22:40 +04:00
* Extend word vectors support, with .similarity() function, vector_norm property, and rename repvec to vector. Keep repvec name as well for now for backwards compatibility. 2015-09-14 10:49:58 +03:00			`# Compiler crashes on memory view coercion without this. Should report bug.`
			`from cython.view cimport array as cvarray`
			`cimport numpy as np`
			`np.import_array()`



* Remove the feature array stuff from Tokens class, and replace vector with array-based implementation, with padding. 2014-10-22 18:57:59 +04:00			`from libc.string cimport memset`

* Fix orth import 2015-01-05 10:49:19 +03:00			`from .orth cimport word_shape`
* Add supersense data to Lexeme objects. Add simple has_sense method to check the flag. 2015-07-01 19:50:37 +03:00			`from .typedefs cimport attr_t, flags_t`
* Work on word vectors, and other stuff 2015-01-17 08:21:17 +03:00			`import numpy`
* Restoring Lexeme-as-struct 2014-09-10 22:41:37 +04:00
* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects 2015-07-26 17:37:16 +03:00			`from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE`
			`from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP`
introduce lang field for LexemeC to hold language id put noun_chunk logic into iterators.py for each language separately 2016-03-10 15:01:34 +03:00			`from .attrs cimport IS_BRACKET`
			`from .attrs cimport IS_QUOTE`
			`from .attrs cimport IS_LEFT_PUNCT`
			`from .attrs cimport IS_RIGHT_PUNCT`
* Add is_oov property, and fix up handling of attributes 2015-07-27 02:50:06 +03:00			`from .attrs cimport IS_OOV`
* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects 2015-07-26 17:37:16 +03:00
* Revising data model of lexeme. Compiles. 2014-10-09 12:53:30 +04:00
* Tmp commit. Refactoring to create a Python Lexeme class. 2015-01-12 02:26:22 +03:00			`memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))`
* Revising data model of lexeme. Compiles. 2014-10-09 12:53:30 +04:00

* Tmp. Refactoring, introducing a Lexeme PyObject. 2015-01-12 03:23:44 +03:00			`cdef class Lexeme:`
* Add docstring to Lexeme 2015-01-24 12:48:34 +03:00			`"""An entry in the vocabulary. A Lexeme has no string context --- it's a`
			`word-type, as opposed to a word token. It therefore has no part-of-speech`
			`tag, dependency parse, or lemma (lemmatization depends on the part-of-speech`
			`tag).`
			`"""`
* Tmp 2015-08-22 23:04:34 +03:00			`def __init__(self, Vocab vocab, int orth):`
Fix doc strings 2016-11-01 14:25:36 +03:00			`"""Create a Lexeme object.`

			`Arguments:`
			`vocab (Vocab): The parent vocabulary`
			`orth (int): The orth id of the lexeme.`
			`Returns (Lexeme): The newly constructd object.`
			`"""`
* Tmp 2015-08-22 23:04:34 +03:00			`self.vocab = vocab`
			`self.orth = orth`
* Begin merge of Gazetteer and DE branches 2015-09-06 20:45:15 +03:00			`self.c = <LexemeC><void>vocab.get_by_orth(vocab.mem, orth)`
* Work on language-independent refactoring 2015-08-23 21:49:18 +03:00			`assert self.c.orth == orth`
* Tmp 2015-08-22 23:04:34 +03:00
* Fix Issue #361: Lexemes didn't have rich comparison. 2016-05-05 02:32:26 +03:00			`def __richcmp__(self, other, int op):`
			`if isinstance(other, Lexeme):`
			`a = self.orth`
			`b = other.orth`
			`elif isinstance(other, int):`
			`a = self.orth`
			`b = other`
			`elif isinstance(other, str):`
* Fix issue #372: mistake in Lexeme rich comparison 2016-05-12 13:58:57 +03:00			`a = self.orth_`
* Fix Issue #361: Lexemes didn't have rich comparison. 2016-05-05 02:32:26 +03:00			`b = other`
			`else:`
			`a = 0`
			`b = 1`
			`if op == 2: # ==`
			`return a == b`
			`elif op == 3: # !=`
			`return a != b`
			`elif op == 0: # <`
			`return a < b`
			`elif op == 1: # <=`
			`return a <= b`
			`elif op == 4: # >`
			`return a > b`
			`elif op == 5: # >=`
			`return a >= b`
			`else:`
			`raise NotImplementedError(op)`

Fix Issue #371: Lexeme objects were unhashable. 2016-09-27 14:22:30 +03:00			`def __hash__(self):`
			`return self.c.orth`

* Fix ugly py_check_flag and py_set_flag functions in Lexeme 2015-09-15 06:06:18 +03:00			`def set_flag(self, attr_id_t flag_id, bint value):`
Fix doc strings 2016-11-01 14:25:36 +03:00			`"""Change the value of a boolean flag.`

			`Arguments:`
			`flag_id (int): The attribute ID of the flag to set.`
			`value (bool): The new value of the flag.`
			`"""`
* Fix ugly py_check_flag and py_set_flag functions in Lexeme 2015-09-15 06:06:18 +03:00			`Lexeme.c_set_flag(self.c, flag_id, value)`
Remove whitespace 2017-04-01 11:19:01 +03:00
* Fix ugly py_check_flag and py_set_flag functions in Lexeme 2015-09-15 06:06:18 +03:00			`def check_flag(self, attr_id_t flag_id):`
Fix doc strings 2016-11-01 14:25:36 +03:00			`"""Check the value of a boolean flag.`

			`Arguments:`
			`flag_id (int): The attribute ID of the flag to query.`
			`Returns (bool): The value of the flag.`
			`"""`
* Fix ugly py_check_flag and py_set_flag functions in Lexeme 2015-09-15 06:06:18 +03:00			`return True if Lexeme.c_check_flag(self.c, flag_id) else False`
* Temporarily add py_set_flag attribute in Lexeme 2015-09-06 18:52:51 +03:00
* Extend word vectors support, with .similarity() function, vector_norm property, and rename repvec to vector. Keep repvec name as well for now for backwards compatibility. 2015-09-14 10:49:58 +03:00			`def similarity(self, other):`
Fix doc strings 2016-11-01 14:25:36 +03:00			`'''Compute a semantic similarity estimate. Defaults to cosine over vectors.`

			`Arguments:`
			`other:`
			`The object to compare with. By default, accepts Doc, Span,`
			`Token and Lexeme objects.`
			`Returns:`
			`score (float): A scalar similarity score. Higher is more similar.`
			`'''`
* Fix vectors bugs for OOV words 2015-09-22 03:10:01 +03:00			`if self.vector_norm == 0 or other.vector_norm == 0:`
			`return 0.0`
* Extend word vectors support, with .similarity() function, vector_norm property, and rename repvec to vector. Keep repvec name as well for now for backwards compatibility. 2015-09-14 10:49:58 +03:00			`return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)`

* Add has_vector attribute to Token and Lexeme 2015-09-21 12:52:43 +03:00			`property has_vector:`
			`def __get__(self):`
* Fix Token.has_vector and Lexeme.has_vector 2015-09-22 02:43:16 +03:00			`cdef int i`
			`for i in range(self.vocab.vectors_length):`
* Rename .repvec to .vector in C API 2015-11-03 15:47:59 +03:00			`if self.c.vector[i] != 0:`
* Fix Token.has_vector and Lexeme.has_vector 2015-09-22 02:43:16 +03:00			`return True`
			`else:`
			`return False`
* Add has_vector attribute to Token and Lexeme 2015-09-21 12:52:43 +03:00
* Extend word vectors support, with .similarity() function, vector_norm property, and rename repvec to vector. Keep repvec name as well for now for backwards compatibility. 2015-09-14 10:49:58 +03:00			`property vector_norm:`
			`def __get__(self):`
			`return self.c.l2_norm`

			`def __set__(self, float value):`
			`self.c.l2_norm = value`

			`property vector:`
			`def __get__(self):`
* Fix vectors bug in lexeme 2015-09-15 12:05:11 +03:00			`cdef int length = self.vocab.vectors_length`
* Raise exceptions if attempt to access parse, but data is not installed. This partly but not fully addresses Issue #97. Still need exceptions on the various Token attributes that access the parse tree, e.g. token.head, token.lefts, token.rights, etc. Exceptions should be centralized, too. 2015-09-21 11:35:40 +03:00			`if length == 0:`
			`raise ValueError(`
			`"Word vectors set to length 0. This may be because the "`
			`"data is not installed. If you haven't already, run"`
Fix download commands in error messages (see #946) 2017-04-01 11:19:32 +03:00			`"\npython -m spacy download %s\n"`
make error messages language independent 2016-03-24 13:47:09 +03:00			`"to install the data." % self.vocab.lang`
* Raise exceptions if attempt to access parse, but data is not installed. This partly but not fully addresses Issue #97. Still need exceptions on the various Token attributes that access the parse tree, e.g. token.head, token.lefts, token.rights, etc. Exceptions should be centralized, too. 2015-09-21 11:35:40 +03:00			`)`
Remove whitespace 2017-04-01 11:19:01 +03:00
* Rename .repvec to .vector in C API 2015-11-03 15:47:59 +03:00			`vector_view = <float[:length,]>self.c.vector`
			`return numpy.asarray(vector_view)`
* Extend word vectors support, with .similarity() function, vector_norm property, and rename repvec to vector. Keep repvec name as well for now for backwards compatibility. 2015-09-14 10:49:58 +03:00
* Support setting of word vectors on Lexeme object. 2015-09-15 07:42:27 +03:00			`def __set__(self, vector):`
* Fix vectors bug in lexeme 2015-09-15 12:05:11 +03:00			`assert len(vector) == self.vocab.vectors_length`
* Support setting of word vectors on Lexeme object. 2015-09-15 07:42:27 +03:00			`cdef float value`
Fix vector_norm when vector is assigned to Lexeme. 2016-10-23 15:23:56 +03:00			`cdef double norm = 0.0`
* Support setting of word vectors on Lexeme object. 2015-09-15 07:42:27 +03:00			`for i, value in enumerate(vector):`
* Rename .repvec to .vector in C API 2015-11-03 15:47:59 +03:00			`self.c.vector[i] = value`
Fix vector_norm when vector is assigned to Lexeme. 2016-10-23 15:23:56 +03:00			`norm += value * value`
			`self.c.l2_norm = sqrt(norm)`
* Support setting of word vectors on Lexeme object. 2015-09-15 07:42:27 +03:00
* Add .rank property to Token and Lexeme, for frequency rank 2015-11-08 18:18:25 +03:00			`property rank:`
			`def __get__(self):`
			`return self.c.id`

* Extend word vectors support, with .similarity() function, vector_norm property, and rename repvec to vector. Keep repvec name as well for now for backwards compatibility. 2015-09-14 10:49:58 +03:00			`property repvec:`
			`def __get__(self):`
Add sentiment property on lexeme object 2016-10-19 21:52:52 +03:00			`raise AttributeError("lex.repvec has been renamed to lex.vector")`

			`property sentiment:`
			`def __get__(self):`
			`return self.c.sentiment`
			`def __set__(self, float sentiment):`
			`self.c.sentiment = sentiment`
Remove whitespace 2017-04-01 11:19:01 +03:00
* Add missing properties in Lexeme class 2015-08-26 20:16:28 +03:00			`property orth_:`
			`def __get__(self):`
			`return self.vocab.strings[self.c.orth]`
* Tmp 2015-08-22 23:04:34 +03:00
			`property lower:`
			`def __get__(self): return self.c.lower`
			`def __set__(self, int x): self.c.lower = x`
Remove whitespace 2017-04-01 11:19:01 +03:00
* Tmp 2015-08-22 23:04:34 +03:00			`property norm:`
			`def __get__(self): return self.c.norm`
			`def __set__(self, int x): self.c.norm = x`

			`property shape:`
			`def __get__(self): return self.c.shape`
			`def __set__(self, int x): self.c.shape = x`

			`property prefix:`
			`def __get__(self): return self.c.prefix`
			`def __set__(self, int x): self.c.prefix = x`

			`property suffix:`
			`def __get__(self): return self.c.suffix`
			`def __set__(self, int x): self.c.suffix = x`
Remove whitespace 2017-04-01 11:19:01 +03:00
* Add missing properties in Lexeme class 2015-08-26 20:16:28 +03:00			`property cluster:`
* Fix attribute getters and setters in Lexeme 2015-09-09 15:29:22 +03:00			`def __get__(self): return self.c.cluster`
			`def __set__(self, int x): self.c.cluster = x`
Remove whitespace 2017-04-01 11:19:01 +03:00
introduce lang field for LexemeC to hold language id put noun_chunk logic into iterators.py for each language separately 2016-03-10 15:01:34 +03:00			`property lang:`
			`def __get__(self): return self.c.lang`
			`def __set__(self, int x): self.c.lang = x`

* Add missing properties in Lexeme class 2015-08-26 20:16:28 +03:00			`property prob:`
* Fix attribute getters and setters in Lexeme 2015-09-09 15:29:22 +03:00			`def __get__(self): return self.c.prob`
			`def __set__(self, float x): self.c.prob = x`
* Tmp 2015-08-22 23:04:34 +03:00
			`property lower_:`
			`def __get__(self): return self.vocab.strings[self.c.lower]`
			`def __set__(self, unicode x): self.c.lower = self.vocab.strings[x]`
Remove whitespace 2017-04-01 11:19:01 +03:00
* Tmp 2015-08-22 23:04:34 +03:00			`property norm_:`
* Fix attribute getters and setters in Lexeme 2015-09-09 15:29:22 +03:00			`def __get__(self): return self.vocab.strings[self.c.norm]`
* Tmp 2015-08-22 23:04:34 +03:00			`def __set__(self, unicode x): self.c.norm = self.vocab.strings[x]`
Remove whitespace 2017-04-01 11:19:01 +03:00
* Tmp 2015-08-22 23:04:34 +03:00			`property shape_:`
			`def __get__(self): return self.vocab.strings[self.c.shape]`
			`def __set__(self, unicode x): self.c.shape = self.vocab.strings[x]`
* Add a has_repvec property to Lexeme, and a check function to check flags 2015-02-07 16:42:44 +03:00
* Tmp 2015-08-22 23:04:34 +03:00			`property prefix_:`
* Fix attribute getters and setters in Lexeme 2015-09-09 15:29:22 +03:00			`def __get__(self): return self.vocab.strings[self.c.prefix]`
* Tmp 2015-08-22 23:04:34 +03:00			`def __set__(self, unicode x): self.c.prefix = self.vocab.strings[x]`
* Add a has_repvec property to Lexeme, and a check function to check flags 2015-02-07 16:42:44 +03:00
* Tmp 2015-08-22 23:04:34 +03:00			`property suffix_:`
* Fix attribute getters and setters in Lexeme 2015-09-09 15:29:22 +03:00			`def __get__(self): return self.vocab.strings[self.c.suffix]`
* Tmp 2015-08-22 23:04:34 +03:00			`def __set__(self, unicode x): self.c.suffix = self.vocab.strings[x]`
* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects 2015-07-26 17:37:16 +03:00
introduce lang field for LexemeC to hold language id put noun_chunk logic into iterators.py for each language separately 2016-03-10 15:01:34 +03:00			`property lang_:`
			`def __get__(self): return self.vocab.strings[self.c.lang]`
			`def __set__(self, unicode x): self.c.lang = self.vocab.strings[x]`

* Add missing properties in Lexeme class 2015-08-26 20:16:28 +03:00			`property flags:`
			`def __get__(self): return self.c.flags`
			`def __set__(self, flags_t x): self.c.flags = x`

* Add is_oov property, and fix up handling of attributes 2015-07-27 02:50:06 +03:00			`property is_oov:`
* Fix ugly py_check_flag and py_set_flag functions in Lexeme 2015-09-15 06:06:18 +03:00			`def __get__(self): return Lexeme.c_check_flag(self.c, IS_OOV)`
			`def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_OOV, x)`
* Add is_oov property, and fix up handling of attributes 2015-07-27 02:50:06 +03:00
* Add is_stop to Python API 2015-09-14 11:25:40 +03:00			`property is_stop:`
* Fix ugly py_check_flag and py_set_flag functions in Lexeme 2015-09-15 06:06:18 +03:00			`def __get__(self): return Lexeme.c_check_flag(self.c, IS_STOP)`
			`def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_STOP, x)`
* Add is_stop to Python API 2015-09-14 11:25:40 +03:00
* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects 2015-07-26 17:37:16 +03:00			`property is_alpha:`
* Fix ugly py_check_flag and py_set_flag functions in Lexeme 2015-09-15 06:06:18 +03:00			`def __get__(self): return Lexeme.c_check_flag(self.c, IS_ALPHA)`
			`def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_ALPHA, x)`
Remove whitespace 2017-04-01 11:19:01 +03:00
* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects 2015-07-26 17:37:16 +03:00			`property is_ascii:`
* Fix ugly py_check_flag and py_set_flag functions in Lexeme 2015-09-15 06:06:18 +03:00			`def __get__(self): return Lexeme.c_check_flag(self.c, IS_ASCII)`
			`def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_ASCII, x)`
* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects 2015-07-26 17:37:16 +03:00
			`property is_digit:`
* Fix ugly py_check_flag and py_set_flag functions in Lexeme 2015-09-15 06:06:18 +03:00			`def __get__(self): return Lexeme.c_check_flag(self.c, IS_DIGIT)`
			`def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_DIGIT, x)`
* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects 2015-07-26 17:37:16 +03:00
			`property is_lower:`
* Fix ugly py_check_flag and py_set_flag functions in Lexeme 2015-09-15 06:06:18 +03:00			`def __get__(self): return Lexeme.c_check_flag(self.c, IS_LOWER)`
			`def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_LOWER, x)`
* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects 2015-07-26 17:37:16 +03:00
			`property is_title:`
* Fix ugly py_check_flag and py_set_flag functions in Lexeme 2015-09-15 06:06:18 +03:00			`def __get__(self): return Lexeme.c_check_flag(self.c, IS_TITLE)`
			`def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_TITLE, x)`
* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects 2015-07-26 17:37:16 +03:00
			`property is_punct:`
* Fix ugly py_check_flag and py_set_flag functions in Lexeme 2015-09-15 06:06:18 +03:00			`def __get__(self): return Lexeme.c_check_flag(self.c, IS_PUNCT)`
			`def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_PUNCT, x)`
* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects 2015-07-26 17:37:16 +03:00
Remove whitespace 2017-04-01 11:19:01 +03:00			`property is_space:`
* Fix ugly py_check_flag and py_set_flag functions in Lexeme 2015-09-15 06:06:18 +03:00			`def __get__(self): return Lexeme.c_check_flag(self.c, IS_SPACE)`
			`def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_SPACE, x)`
* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects 2015-07-26 17:37:16 +03:00
Remove whitespace 2017-04-01 11:19:01 +03:00			`property is_bracket:`
* Add Python hooks for is_bracket/is_quote/is_left_punct/is_right_punct 2016-02-04 15:04:16 +03:00			`def __get__(self): return Lexeme.c_check_flag(self.c, IS_BRACKET)`
			`def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_BRACKET, x)`

Remove whitespace 2017-04-01 11:19:01 +03:00			`property is_quote:`
* Add Python hooks for is_bracket/is_quote/is_left_punct/is_right_punct 2016-02-04 15:04:16 +03:00			`def __get__(self): return Lexeme.c_check_flag(self.c, IS_QUOTE)`
			`def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_QUOTE, x)`

Remove whitespace 2017-04-01 11:19:01 +03:00			`property is_left_punct:`
* Add Python hooks for is_bracket/is_quote/is_left_punct/is_right_punct 2016-02-04 15:04:16 +03:00			`def __get__(self): return Lexeme.c_check_flag(self.c, IS_LEFT_PUNCT)`
			`def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_LEFT_PUNCT, x)`

Remove whitespace 2017-04-01 11:19:01 +03:00			`property is_right_punct:`
* Add Python hooks for is_bracket/is_quote/is_left_punct/is_right_punct 2016-02-04 15:04:16 +03:00			`def __get__(self): return Lexeme.c_check_flag(self.c, IS_RIGHT_PUNCT)`
			`def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, x)`


* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects 2015-07-26 17:37:16 +03:00			`property like_url:`
* Fix ugly py_check_flag and py_set_flag functions in Lexeme 2015-09-15 06:06:18 +03:00			`def __get__(self): return Lexeme.c_check_flag(self.c, LIKE_URL)`
			`def __set__(self, bint x): Lexeme.c_set_flag(self.c, LIKE_URL, x)`
Remove whitespace 2017-04-01 11:19:01 +03:00
* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects 2015-07-26 17:37:16 +03:00			`property like_num:`
* Fix ugly py_check_flag and py_set_flag functions in Lexeme 2015-09-15 06:06:18 +03:00			`def __get__(self): return Lexeme.c_check_flag(self.c, LIKE_NUM)`
			`def __set__(self, bint x): Lexeme.c_set_flag(self.c, LIKE_NUM, x)`
* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects 2015-07-26 17:37:16 +03:00
			`property like_email:`
* Fix ugly py_check_flag and py_set_flag functions in Lexeme 2015-09-15 06:06:18 +03:00			`def __get__(self): return Lexeme.c_check_flag(self.c, LIKE_EMAIL)`
			`def __set__(self, bint x): Lexeme.c_set_flag(self.c, LIKE_EMAIL, x)`