spaCy/spacy/tokens/token.pyx

# cython: infer_types=True
# coding: utf8
from __future__ import unicode_literals

from libc.string cimport memcpy
from cpython.mem cimport PyMem_Malloc, PyMem_Free
# Compiler crashes on memory view coercion without this. Should report bug.
from cython.view cimport array as cvarray
cimport numpy as np
np.import_array()
import numpy

from ..typedefs cimport hash_t
from ..lexeme cimport Lexeme
from .. import parts_of_speech
from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
from ..attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT, IS_OOV
from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
from ..attrs cimport LEMMA, POS, TAG, DEP
from ..compat import is_config
from .. import about


cdef class Token:
    """An individual token – i.e. a word, punctuation symbol, whitespace, etc."""
    def __cinit__(self, Vocab vocab, Doc doc, int offset):
        """Bind a new `Token` to one position of its parent document.

        vocab (Vocab): A storage container for lexical types.
        doc (Doc): The parent document.
        offset (int): The index of the token within the document.
        """
        self.i = offset
        self.doc = doc
        self.vocab = vocab
        # Address of the doc's `offset`-th entry in `doc.c`; nothing is copied.
        self.c = &doc.c[offset]

    def __hash__(self):
        """Hash the token by its parent document and its index within it.

        RETURNS (int): `hash()` of the `(doc, i)` tuple.
        """
        key = (self.doc, self.i)
        return hash(key)

    def __len__(self):
        """The number of unicode characters in the token, i.e. `len(token.text)`.

        RETURNS (int): The number of unicode characters in the token.
        """
        # The value is read from the `length` field of the token's lexeme.
        return self.c.lex.length

    def __unicode__(self):
        """The token as a unicode string.

        RETURNS (unicode): The same value as `token.text`.
        """
        return self.text

    def __bytes__(self):
        """The token as a byte string.

        RETURNS (bytes): `token.text` encoded as UTF-8.
        """
        return self.text.encode('utf8')

    def __str__(self):
        """The token as the interpreter's native `str` type.

        RETURNS: The result of `__unicode__()` when `is_config(python3=True)`
            holds, otherwise the result of `__bytes__()`.
        """
        if not is_config(python3=True):
            return self.__bytes__()
        return self.__unicode__()

    def __repr__(self):
        """The printable representation of the token.

        RETURNS: The same value as `__str__()`.
        """
        return self.__str__()

    def __richcmp__(self, Token other, int op):
        """Compare this token with another token (or `None`).

        Tokens of the same document are ordered by `idx`. Tokens that belong
        to different documents are never equal and have no defined order, so
        every comparison except `!=` is False; this keeps `==` in agreement
        with `__hash__`, which hashes `(doc, i)`. `None` sorts before every
        token, so comparing against it does not raise `TypeError` on
        Python 3.

        other (Token): The token to compare with, or `None`.
        op (int): Rich-comparison code passed by Cython: 0 `<`, 1 `<=`,
            2 `==`, 3 `!=`, 4 `>`, 5 `>=`.
        RETURNS (bool): The result of the comparison.
        RAISES (ValueError): If `op` is not one of the six codes above.
        """
        # http://cython.readthedocs.io/en/latest/src/userguide/special_methods.html
        if op < 0 or op > 5:
            raise ValueError(op)
        if other is None:
            # Only `!=`, `>` and `>=` (codes 3, 4, 5) hold against None.
            return op >= 3
        if other.doc is not self.doc:
            # Different documents: only `!=` (code 3) holds.
            return op == 3
        my = self.idx
        their = other.idx
        if op == 0:
            return my < their
        elif op == 1:
            return my <= their
        elif op == 2:
            return my == their
        elif op == 3:
            return my != their
        elif op == 4:
            return my > their
        return my >= their

    cpdef bint check_flag(self, attr_id_t flag_id) except -1:
        """Check the value of a boolean flag.

        flag_id (int): The ID of the flag attribute.
        RETURNS (bool): Whether the flag is set.

        EXAMPLE:
            >>> from spacy.attrs import IS_TITLE
            >>> doc = nlp(u'Give it back! He pleaded.')
            >>> token = doc[0]
            >>> token.check_flag(IS_TITLE)
            True
        """
        # The flag is looked up on the token's lexeme, not on the token itself.
        return Lexeme.c_check_flag(self.c.lex, flag_id)

    def nbor(self, int i=1):
        """Get a neighboring token.

        i (int): The relative position of the token to get. Defaults to 1.
        RETURNS (Token): The token at position `self.doc[self.i+i]`.
        RAISES (IndexError): If `self.i+i` falls outside the document.
        """
        cdef int target = self.i + i
        # Check the bounds explicitly. A negative index handed to the doc
        # would presumably be read relative to its end (Python-style indexing
        # -- confirm against Doc.__getitem__) and silently return a token that
        # is not a neighbor at all.
        if target < 0 or target >= self.doc.length:
            msg = "Error accessing doc[%d].nbor(%d), for doc of length %d"
            raise IndexError(msg % (self.i, i, self.doc.length))
        return self.doc[target]

    def similarity(self, other):
        """Make a semantic similarity estimate. The default estimate is cosine
        similarity using an average of word vectors.

        If a 'similarity' hook is registered in `doc.user_token_hooks`, it is
        called with both objects and its result is returned instead.

        other (object): The object to compare with. By default, accepts `Doc`,
            `Span`, `Token` and `Lexeme` objects.
        RETURNS (float): A scalar similarity score. Higher is more similar.
        """
        if 'similarity' in self.doc.user_token_hooks:
            # The hook compares two objects, so it must receive `other` too.
            return self.doc.user_token_hooks['similarity'](self, other)
        # Read each norm once; `other` is only touched when our own norm is
        # non-zero, as before.
        self_norm = self.vector_norm
        if self_norm == 0:
            return 0.0
        other_norm = other.vector_norm
        if other_norm == 0:
            return 0.0
        return numpy.dot(self.vector, other.vector) / (self_norm * other_norm)

    property lex_id:
        """ID of the token's lexical type.

        RETURNS (int): The `id` field of the token's lexeme.
        """
        def __get__(self):
            return self.c.lex.id

    property rank:
        """ID of the token's lexical type. Reads the same lexeme field (`id`)
        as `lex_id`.

        RETURNS (int): The `id` field of the token's lexeme.
        """
        def __get__(self):
            return self.c.lex.id

    property string:
        """The token text with its trailing whitespace, if any.

        RETURNS (unicode): The same value as `text_with_ws`.
        """
        def __get__(self):
            return self.text_with_ws

    property text:
        """A unicode representation of the token text.

        RETURNS (unicode): The original verbatim text of the token (the same
            value as `orth_`).
        """
        def __get__(self):
            return self.orth_

    property text_with_ws:
        """The text content of the token, followed by a single space when the
        token's `spacy` flag is set.

        RETURNS (unicode): The text content of the token (with trailing whitespace).
        """
        def __get__(self):
            cdef unicode orth = self.vocab.strings[self.c.lex.orth]
            return orth + u' ' if self.c.spacy else orth

    property prob:
        """The `prob` field of the token's lexeme.

        NOTE(review): presumably a (log) probability estimate of the word
        type -- confirm against the lexeme definition.
        """
        def __get__(self):
            return self.c.lex.prob

    property sentiment:
        """The sentiment value of the token.

        RETURNS: The result of the 'sentiment' hook in `doc.user_token_hooks`
            when one is registered, otherwise the `sentiment` field of the
            token's lexeme.
        """
        def __get__(self):
            hooks = self.doc.user_token_hooks
            if 'sentiment' not in hooks:
                return self.c.lex.sentiment
            return hooks['sentiment'](self)

    property lang:
        """The `lang` field of the token's lexeme; `lang_` resolves the same
        value to a string through `vocab.strings`.
        """
        def __get__(self):
            return self.c.lex.lang

    property idx:
        """The `idx` field of the underlying token struct. Token comparison
        (`__richcmp__`) is based on this value.

        NOTE(review): presumably the character offset of the token within the
        parent document -- confirm.
        """
        def __get__(self):
            return self.c.idx

    property cluster:
        """The `cluster` field of the token's lexeme.

        NOTE(review): presumably a word-cluster ID -- confirm.
        """
        def __get__(self):
            return self.c.lex.cluster

    property orth:
        """ID of the verbatim token text; `orth_` resolves the same value to a
        string through `vocab.strings`.
        """
        def __get__(self):
            return self.c.lex.orth

    property lower:
        """The `lower` field of the token's lexeme; `lower_` resolves the same
        value to a string through `vocab.strings`.
        """
        def __get__(self):
            return self.c.lex.lower

    property norm:
        """The `norm` field of the token's lexeme; `norm_` resolves the same
        value to a string through `vocab.strings`.
        """
        def __get__(self):
            return self.c.lex.norm

    property shape:
        """The `shape` field of the token's lexeme; `shape_` resolves the same
        value to a string through `vocab.strings`.
        """
        def __get__(self):
            return self.c.lex.shape

    property prefix:
        """The `prefix` field of the token's lexeme; `prefix_` resolves the
        same value to a string through `vocab.strings`.
        """
        def __get__(self):
            return self.c.lex.prefix

    property suffix:
        """The `suffix` field of the token's lexeme; `suffix_` resolves the
        same value to a string through `vocab.strings`.
        """
        def __get__(self):
            return self.c.lex.suffix

    property lemma:
        """Base form of the word, with no inflectional suffixes.

        Stored per token (on the token struct, not on the lexeme) and
        writable; `lemma_` resolves the same value through `vocab.strings`.

        RETURNS (uint64): Token lemma.
        """
        def __get__(self):
            return self.c.lemma
        def __set__(self, attr_t lemma):
            self.c.lemma = lemma

    property pos:
        """The `pos` field of the token struct; `pos_` maps the same value to
        a name through `parts_of_speech.NAMES`. Read-only.
        """
        def __get__(self):
            return self.c.pos

    property tag:
        """The `tag` field of the token struct; `tag_` resolves the same value
        to a string through `vocab.strings`.
        """
        def __get__(self):
            return self.c.tag
        def __set__(self, attr_t tag):
            # Assignment goes through the vocab's morphology rather than
            # writing `self.c.tag` directly.
            self.vocab.morphology.assign_tag(self.c, tag)

    property dep:
        """The `dep` field of the token struct; `dep_` resolves the same value
        to a string through `vocab.strings`. Writable.
        """
        def __get__(self):
            return self.c.dep
        def __set__(self, attr_t label):
            self.c.dep = label

    property has_vector:
        """Whether a word vector is associated with the token.

        A 'has_vector' hook registered in `doc.user_token_hooks` takes
        precedence over the vocab lookup.

        RETURNS (bool): Whether a word vector is associated with the object.
        """
        def __get__(self):
            hooks = self.doc.user_token_hooks
            if 'has_vector' not in hooks:
                return self.vocab.has_vector(self.c.lex.orth)
            return hooks['has_vector'](self)

    property vector:
        """A real-valued meaning representation.

        Resolution order: the 'vector' hook in `doc.user_token_hooks` if
        registered; else the vocab's vector for the token text when
        `has_vector` is true; else this token's row of `doc.tensor`.

        RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array
            representing the token's semantics.
        """
        def __get__(self):
            hooks = self.doc.user_token_hooks
            if 'vector' in hooks:
                return hooks['vector'](self)
            if not self.has_vector:
                return self.doc.tensor[self.i]
            return self.vocab.get_vector(self.c.lex.orth)

    property vector_norm:
        """The L2 norm of the token's vector representation.

        A 'vector_norm' hook registered in `doc.user_token_hooks` takes
        precedence; otherwise the norm is recomputed from `vector` on every
        access.

        RETURNS (float): The L2 norm of the vector representation.
        """
        def __get__(self):
            hooks = self.doc.user_token_hooks
            if 'vector_norm' in hooks:
                return hooks['vector_norm'](self)
            squares = self.vector ** 2
            return numpy.sqrt(squares.sum())

    property n_lefts:
        """The `l_kids` counter of the token struct, i.e. the number of left
        dependents as maintained by the `head` setter.
        """
        def __get__(self):
            return self.c.l_kids

    property n_rights:
        """The `r_kids` counter of the token struct, i.e. the number of right
        dependents as maintained by the `head` setter.
        """
        def __get__(self):
            return self.c.r_kids

    property sent_start:
        """The `sent_start` field of the token struct.

        Writing is refused with a `ValueError` once the parent document
        reports `is_parsed`.
        """
        def __get__(self):
            return self.c.sent_start

        def __set__(self, bint value):
            if not self.doc.is_parsed:
                self.c.sent_start = value
                return
            raise ValueError(
                'Refusing to write to token.sent_start if its document is parsed, '
                'because this may cause inconsistent state. '
                'See https://github.com/spacy-io/spaCy/issues/235 for workarounds.')

    property lefts:
        def __get__(self):
            """
            The leftward immediate children of the word, in the syntactic
            dependency parse.

            YIELDS (Token): Each token between this token's left edge and the
                token itself whose head is this token, in document order.
            """
            cdef int nr_iter = 0
            # Start at the token struct whose index is `self.c.l_edge`.
            cdef const TokenC* ptr = self.c - (self.i - self.c.l_edge)
            while ptr < self.c:
                # `head` is a relative offset, so `ptr + ptr.head` is the
                # struct of ptr's head.
                if ptr + ptr.head == self.c:
                    # `self.c - self.i` is the start of the token array, so
                    # the difference is ptr's index in the doc.
                    yield self.doc[ptr - (self.c - self.i)]
                ptr += 1
                nr_iter += 1
                # This is ugly, but it's a way to guard out infinite loops
                if nr_iter >= 10000000:
                    raise RuntimeError(
                        "Possibly infinite loop encountered while looking for token.lefts")

    property rights:
        def __get__(self):
            """
            The rightward immediate children of the word, in the syntactic
            dependency parse.

            YIELDS (Token): Each token between the token itself and its right
                edge whose head is this token, in document order.
            """
            # Start at the token struct whose index is `self.c.r_edge` and
            # walk leftwards towards this token.
            cdef const TokenC* ptr = self.c + (self.c.r_edge - self.i)
            tokens = []
            cdef int nr_iter = 0
            while ptr > self.c:
                # `head` is a relative offset, so `ptr + ptr.head` is the
                # struct of ptr's head.
                if ptr + ptr.head == self.c:
                    tokens.append(self.doc[ptr - (self.c - self.i)])
                ptr -= 1
                nr_iter += 1
                # Same guard against a runaway loop as in `lefts`.
                if nr_iter >= 10000000:
                    raise RuntimeError(
                        "Possibly infinite loop encountered while looking for token.rights")
            # Children were collected right-to-left; restore document order.
            tokens.reverse()
            for t in tokens:
                yield t

    property children:
        """A sequence of the token's immediate syntactic children.

        YIELDS (Token): Every token of `lefts`, followed by every token of
            `rights`.
        """
        def __get__(self):
            for left_child in self.lefts:
                yield left_child
            for right_child in self.rights:
                yield right_child

    property subtree:
        """A sequence of the token itself and all its syntactic descendants.

        YIELDS (Token): The subtrees of the left children, then the token
            itself, then the subtrees of the right children.
        """
        def __get__(self):
            for left_child in self.lefts:
                for descendant in left_child.subtree:
                    yield descendant
            yield self
            for right_child in self.rights:
                for descendant in right_child.subtree:
                    yield descendant

    property left_edge:
        """The leftmost token of this token's syntactic descendents.

        RETURNS (Token): The token at the index stored in the token struct's
            `l_edge` field.
        """
        def __get__(self):
            edge_index = self.c.l_edge
            return self.doc[edge_index]

    property right_edge:
        """The rightmost token of this token's syntactic descendents.

        RETURNS (Token): The token at the index stored in the token struct's
            `r_edge` field.
        """
        def __get__(self):
            edge_index = self.c.r_edge
            return self.doc[edge_index]

    property ancestors:
        """A sequence of this token's syntactic ancestors, nearest first.

        YIELDS (Token): A sequence of ancestor tokens such that
            `ancestor.is_ancestor(self)`.
        """
        def __get__(self):
            cdef const TokenC* cursor = self.c
            cdef int steps = 0
            while True:
                # A head offset of 0 ends the upward walk.
                if cursor.head == 0:
                    break
                # Guard against cycles: a token cannot have more ancestors
                # than there are tokens in the doc.
                if steps >= self.doc.length:
                    break
                cursor += cursor.head
                yield self.doc[cursor - (self.c - self.i)]
                steps += 1

    def is_ancestor(self, descendant):
        """Check whether this token dominates another token in the dependency
        tree, i.e. is its parent, grandparent, etc.

        descendant (Token): Another token.
        RETURNS (bool): Whether this token is the ancestor of the descendant.
            Always False for tokens of a different document.
        """
        if descendant.doc is not self.doc:
            return False
        for ancestor in descendant.ancestors:
            if ancestor.i == self.i:
                return True
        return False

    property head:
        """The syntactic parent, or "governor", of this token.

        Assigning a new head also updates the child counters (`l_kids`,
        `r_kids`) and the subtree edges (`l_edge`, `r_edge`) of the old head,
        the new head and their ancestors.

        RETURNS (Token): The token head.
        """
        def __get__(self):
            """The token predicted by the parser to be the head of the current
            token.
            """
            # `self.c.head` is an offset relative to this token's own index.
            return self.doc[self.i + self.c.head]
        def __set__(self, Token new_head):
            # this function sets the head of self to new_head
            # and updates the counters for left/right dependents
            # and left/right corner for the new and the old head

            # do nothing if old head is new head
            if self.i + self.c.head == new_head.i:
                return

            cdef Token old_head = self.head
            cdef int rel_newhead_i = new_head.i - self.i

            # is the new head a descendant of the old head
            # (evaluated before any field below is modified)
            cdef bint is_desc = old_head.is_ancestor(new_head)

            cdef int new_edge
            cdef Token anc, child

            # update number of deps of old head
            # (a positive offset means the old head is to the right of self)
            if self.c.head > 0: # left dependent
                old_head.c.l_kids -= 1
                if self.c.l_edge == old_head.c.l_edge:
                    # the token dominates the left edge so the left edge of the head
                    # may change when the token is reattached
                    # it may not change if the new head is a descendant of the current head

                    new_edge = self.c.l_edge
                    # the new l_edge is the left-most l_edge on any of the other dependents
                    # where the l_edge is left of the head, otherwise it is the head
                    if not is_desc:
                        new_edge = old_head.i
                        for child in old_head.children:
                            if child == self:
                                continue
                            if child.c.l_edge < new_edge:
                                new_edge = child.c.l_edge
                        old_head.c.l_edge = new_edge

                    # walk up the tree from old_head and assign new l_edge to ancestors
                    # until an ancestor already has an l_edge that's further left
                    for anc in old_head.ancestors:
                        if anc.c.l_edge <= new_edge:
                            break
                        anc.c.l_edge = new_edge

            elif self.c.head < 0: # right dependent
                old_head.c.r_kids -= 1
                # do the same thing as for l_edge
                if self.c.r_edge == old_head.c.r_edge:
                    new_edge = self.c.r_edge

                    if not is_desc:
                        new_edge = old_head.i
                        for child in old_head.children:
                            if child == self:
                                continue
                            if child.c.r_edge > new_edge:
                                new_edge = child.c.r_edge
                        old_head.c.r_edge = new_edge

                    for anc in old_head.ancestors:
                        if anc.c.r_edge >= new_edge:
                            break
                        anc.c.r_edge = new_edge

            # update number of deps of new head
            if rel_newhead_i > 0: # left dependent
                new_head.c.l_kids += 1
                # walk up the tree from new head and set l_edge to self.l_edge
                # until you hit a token with an l_edge further to the left
                if self.c.l_edge < new_head.c.l_edge:
                    new_head.c.l_edge = self.c.l_edge
                    for anc in new_head.ancestors:
                        if anc.c.l_edge <= self.c.l_edge:
                            break
                        anc.c.l_edge = self.c.l_edge

            elif rel_newhead_i < 0: # right dependent
                new_head.c.r_kids += 1
                # do the same as for l_edge
                if self.c.r_edge > new_head.c.r_edge:
                    new_head.c.r_edge = self.c.r_edge
                    for anc in new_head.ancestors:
                        if anc.c.r_edge >= self.c.r_edge:
                            break
                        anc.c.r_edge = self.c.r_edge

            # set new head
            # (done last: the ancestor walks above rely on the old offset)
            self.c.head = rel_newhead_i

    property conjuncts:
        """A sequence of tokens coordinated with this token. The token itself
        is not part of the sequence.

        If a 'conjuncts' hook is registered in `doc.user_token_hooks`, its
        result is yielded instead. Otherwise, for a token whose `dep_` is not
        'conj', every right child labelled 'conj' is yielded.

        YIELDS (Token): A coordinated token.
        """
        def __get__(self):
            """Yield the conjoined words."""
            cdef Token word
            if 'conjuncts' in self.doc.user_token_hooks:
                yield from self.doc.user_token_hooks['conjuncts'](self)
            else:
                if self.dep_ != 'conj':
                    for word in self.rights:
                        if word.dep_ == 'conj':
                            yield word
                            # NOTE(review): `word` is labelled 'conj' here, so
                            # its own `dep_ != 'conj'` test fails and this
                            # nested call appears to yield nothing -- confirm
                            # whether chained conjuncts should be followed.
                            yield from word.conjuncts

    property ent_type:
        """Named entity type. Writable; `ent_type_` resolves the same value to
        a string through `vocab.strings`.

        RETURNS (uint64): Named entity type.
        """
        def __get__(self):
            return self.c.ent_type
        def __set__(self, ent_type):
            self.c.ent_type = ent_type

    property ent_iob:
        """IOB code of named entity tag. `1="I", 2="O", 3="B"`. 0 means no tag
        is assigned. `ent_iob_` gives the same code as a string.

        RETURNS (uint64): IOB code of named entity tag.
        """
        def __get__(self):
            return self.c.ent_iob

    property ent_type_:
        """Named entity type, as a string. Setting it adds the string to
        `vocab.strings` and stores the resulting ID on the token.

        RETURNS (unicode): Named entity type.
        """
        def __get__(self):
            key = self.c.ent_type
            return self.vocab.strings[key]
        def __set__(self, ent_type):
            key = self.vocab.strings.add(ent_type)
            self.c.ent_type = key

    property ent_iob_:
        """IOB code of named entity tag, as a string: "B" means the token
        begins an entity, "I" means it is inside an entity, "O" means it is
        outside an entity, and "" means no entity tag is set.

        RETURNS (unicode): IOB code of named entity tag.
        """
        def __get__(self):
            # Indexed by the numeric code: 0 -> '', 1 -> 'I', 2 -> 'O', 3 -> 'B'.
            return ('', 'I', 'O', 'B')[self.c.ent_iob]

    property ent_id:
        """ID of the entity the token is an instance of, if any. Usually
        assigned by patterns in the Matcher. Writable; `ent_id_` resolves the
        same value to a string through `vocab.strings`.

        RETURNS (uint64): ID of the entity.
        """
        def __get__(self):
            return self.c.ent_id

        def __set__(self, hash_t key):
            self.c.ent_id = key

    property ent_id_:
        """ID of the entity the token is an instance of, if any, as a string.
        Usually assigned by patterns in the Matcher. Setting it adds the
        string to `vocab.strings` and stores the resulting ID on the token.

        RETURNS (unicode): ID of the entity.
        """
        def __get__(self):
            key = self.c.ent_id
            return self.vocab.strings[key]

        def __set__(self, name):
            key = self.vocab.strings.add(name)
            self.c.ent_id = key

    property whitespace_:
        """The trailing whitespace of the token.

        RETURNS (unicode): A single space when the token's `spacy` flag is
            set, otherwise the empty string.
        """
        def __get__(self):
            if self.c.spacy:
                return ' '
            return ''

    property orth_:
        """The lexeme's `orth` value, resolved to a string.

        RETURNS (unicode): The entry of `vocab.strings` for `self.c.lex.orth`.
        """
        def __get__(self):
            key = self.c.lex.orth
            return self.vocab.strings[key]

    property lower_:
        """The lexeme's `lower` value, resolved to a string.

        RETURNS (unicode): The entry of `vocab.strings` for `self.c.lex.lower`.
        """
        def __get__(self):
            key = self.c.lex.lower
            return self.vocab.strings[key]

    property norm_:
        """The lexeme's `norm` value, resolved to a string.

        RETURNS (unicode): The entry of `vocab.strings` for `self.c.lex.norm`.
        """
        def __get__(self):
            key = self.c.lex.norm
            return self.vocab.strings[key]

    property shape_:
        """The lexeme's `shape` value, resolved to a string.

        RETURNS (unicode): The entry of `vocab.strings` for `self.c.lex.shape`.
        """
        def __get__(self):
            key = self.c.lex.shape
            return self.vocab.strings[key]

    property prefix_:
        """The lexeme's `prefix` value, resolved to a string.

        RETURNS (unicode): The entry of `vocab.strings` for `self.c.lex.prefix`.
        """
        def __get__(self):
            key = self.c.lex.prefix
            return self.vocab.strings[key]

    property suffix_:
        """The lexeme's `suffix` value, resolved to a string.

        RETURNS (unicode): The entry of `vocab.strings` for `self.c.lex.suffix`.
        """
        def __get__(self):
            key = self.c.lex.suffix
            return self.vocab.strings[key]

    property lang_:
        """The lexeme's `lang` value, resolved to a string.

        RETURNS (unicode): The entry of `vocab.strings` for `self.c.lex.lang`.
        """
        def __get__(self):
            key = self.c.lex.lang
            return self.vocab.strings[key]

    property lemma_:
        """Base form of the word, with no inflectional suffixes, as a string.
        Setting it adds the string to `vocab.strings` and stores the
        resulting ID on the token.

        RETURNS (unicode): Token lemma.
        """
        def __get__(self):
            key = self.c.lemma
            return self.vocab.strings[key]
        def __set__(self, unicode lemma_):
            key = self.vocab.strings.add(lemma_)
            self.c.lemma = key

    property pos_:
        """The token's `pos` value as a name.

        RETURNS: The entry of `parts_of_speech.NAMES` for `self.c.pos`.
        """
        def __get__(self):
            pos_id = self.c.pos
            return parts_of_speech.NAMES[pos_id]

    property tag_:
        """The token's `tag` value, resolved to a string.

        Setting it adds the string to `vocab.strings` and assigns the
        resulting ID through the `tag` property.

        RETURNS (unicode): The entry of `vocab.strings` for `self.c.tag`.
        """
        def __get__(self):
            key = self.c.tag
            return self.vocab.strings[key]
        def __set__(self, tag):
            key = self.vocab.strings.add(tag)
            self.tag = key

    property dep_:
        """The token's `dep` value, resolved to a string. Setting it adds the
        string to `vocab.strings` and stores the resulting ID on the token.

        RETURNS (unicode): The entry of `vocab.strings` for `self.c.dep`.
        """
        def __get__(self):
            key = self.c.dep
            return self.vocab.strings[key]
        def __set__(self, unicode label):
            key = self.vocab.strings.add(label)
            self.c.dep = key

    property is_oov:
        """Whether the `IS_OOV` flag is set on the token's lexeme.

        RETURNS (bool): The value of the flag.
        """
        def __get__(self):
            return Lexeme.c_check_flag(self.c.lex, IS_OOV)

    property is_stop:
        """Whether the `IS_STOP` flag is set on the token's lexeme.

        RETURNS (bool): The value of the flag.
        """
        def __get__(self):
            return Lexeme.c_check_flag(self.c.lex, IS_STOP)

    property is_alpha:
        """Whether the `IS_ALPHA` flag is set on the token's lexeme.

        RETURNS (bool): The value of the flag.
        """
        def __get__(self):
            return Lexeme.c_check_flag(self.c.lex, IS_ALPHA)

    property is_ascii:
        """Whether the `IS_ASCII` flag is set on the token's lexeme.

        RETURNS (bool): The value of the flag.
        """
        def __get__(self):
            return Lexeme.c_check_flag(self.c.lex, IS_ASCII)

    property is_digit:
        """Whether the `IS_DIGIT` flag is set on the token's lexeme.

        RETURNS (bool): The value of the flag.
        """
        def __get__(self):
            return Lexeme.c_check_flag(self.c.lex, IS_DIGIT)

    property is_lower:
        """Whether the `IS_LOWER` flag is set on the token's lexeme.

        RETURNS (bool): The value of the flag.
        """
        def __get__(self):
            return Lexeme.c_check_flag(self.c.lex, IS_LOWER)

    property is_title:
        """Whether the `IS_TITLE` flag is set on the token's lexeme.

        RETURNS (bool): The value of the flag.
        """
        def __get__(self):
            return Lexeme.c_check_flag(self.c.lex, IS_TITLE)

    property is_punct:
        """Whether the `IS_PUNCT` flag is set on the token's lexeme.

        RETURNS (bool): The value of the flag.
        """
        def __get__(self):
            return Lexeme.c_check_flag(self.c.lex, IS_PUNCT)

    property is_space:
        """Whether the `IS_SPACE` flag is set on the token's lexeme.

        RETURNS (bool): The value of the flag.
        """
        def __get__(self):
            return Lexeme.c_check_flag(self.c.lex, IS_SPACE)

    property is_bracket:
        """Whether the `IS_BRACKET` flag is set on the token's lexeme.

        RETURNS (bool): The value of the flag.
        """
        def __get__(self):
            return Lexeme.c_check_flag(self.c.lex, IS_BRACKET)

    property is_quote:
        """Whether the `IS_QUOTE` flag is set on the token's lexeme.

        RETURNS (bool): The value of the flag.
        """
        def __get__(self):
            return Lexeme.c_check_flag(self.c.lex, IS_QUOTE)

    property is_left_punct:
        """Whether the `IS_LEFT_PUNCT` flag is set on the token's lexeme.

        RETURNS (bool): The value of the flag.
        """
        def __get__(self):
            return Lexeme.c_check_flag(self.c.lex, IS_LEFT_PUNCT)

    property is_right_punct:
        """Whether the `IS_RIGHT_PUNCT` flag is set on the token's lexeme.

        RETURNS (bool): The value of the flag.
        """
        def __get__(self):
            return Lexeme.c_check_flag(self.c.lex, IS_RIGHT_PUNCT)

    property like_url:
        """Whether the `LIKE_URL` flag is set on the token's lexeme.

        RETURNS (bool): The value of the flag.
        """
        def __get__(self):
            return Lexeme.c_check_flag(self.c.lex, LIKE_URL)

    property like_num:
        """Whether the `LIKE_NUM` flag is set on the token's lexeme.

        RETURNS (bool): The value of the flag.
        """
        def __get__(self):
            return Lexeme.c_check_flag(self.c.lex, LIKE_NUM)

    property like_email:
        """Whether the `LIKE_EMAIL` flag is set on the token's lexeme.

        RETURNS (bool): The value of the flag.
        """
        def __get__(self):
            return Lexeme.c_check_flag(self.c.lex, LIKE_EMAIL)
-												Fix issue #672: ent_iob_ was a string, not unicode, due to missing unicode_literals statement.

											
										
										
											2016-12-19 00:33:53 +03:00
+								# cython: infer_types=True
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								# coding: utf8
-												Fix issue #672: ent_iob_ was a string, not unicode, due to missing unicode_literals statement.

											
										
										
											2016-12-19 00:33:53 +03:00
+								from __future__ import unicode_literals
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								from libc.string cimport memcpy
 								from cpython.mem cimport PyMem_Malloc, PyMem_Free
 								# Compiler crashes on memory view coercion without this. Should report bug.
 								from cython.view cimport array as cvarray
-												* Break up tokens.pyx into tokens/doc.pyx, tokens/token.pyx, tokens/spans.pyx

											
										
										
											2015-07-13 21:20:58 +03:00
+								cimport numpy as np
 								np.import_array()
 								import numpy
-												Import hash_t typedef in token.pyx

											
										
										
											2016-09-23 15:22:06 +03:00
+								from ..typedefs cimport hash_t
-												* Begin merge of Gazetteer and DE branches

											
										
										
											2015-09-06 20:45:15 +03:00
+								from ..lexeme cimport Lexeme
-												* Rename ATTR_IDS to attrs.IDS. Rename ATTR_NAMES to attrs.NAMES. Rename UNIV_POS_IDS to parts_of_speech.IDS

											
										
										
											2015-10-10 09:55:55 +03:00
+								from .. import parts_of_speech
-												* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects

											
										
										
											2015-07-26 17:37:16 +03:00
+								from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
-												Tidy up imports

											
										
										
											2017-05-13 14:04:40 +03:00
+								from ..attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT, IS_OOV
-												* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects

											
										
										
											2015-07-26 17:37:16 +03:00
+								from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
-												Tidy up imports

											
										
										
											2017-05-13 14:04:40 +03:00
+								from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
 								from ..attrs cimport LEMMA, POS, TAG, DEP
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								from ..compat import is_config
-												Fix models error message and use about.__docs_models__ (see #1051)

											
										
										
											2017-05-13 14:05:47 +03:00
+								from .. import about
-												* Work on language-independent refactoring

											
										
										
											2015-08-23 21:49:18 +03:00
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
 								cdef class Token:
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								    """An individual token – i.e. a word, punctuation symbol, whitespace, etc."""
-												* Go back to having token reference doc, instead of complicated gymnastics. Rename the attr 'doc', to expose it in the API

											
										
										
											2015-07-14 01:10:11 +03:00
+								    def __cinit__(self, Vocab vocab, Doc doc, int offset):
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        """Construct a `Token` object.
 								        vocab (Vocab): A storage container for lexical types.
 								        doc (Doc): The parent document.
 								        offset (int): The index of the token within the document.
 								        """
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        self.vocab = vocab
-												* Go back to having token reference doc, instead of complicated gymnastics. Rename the attr 'doc', to expose it in the API

											
										
										
											2015-07-14 01:10:11 +03:00
+								        self.doc = doc
-												* Rename Doc.data to Doc.c

											
										
										
											2015-11-03 16:15:14 +03:00
+								        self.c = &self.doc.c[offset]
-												* Go back to having token reference doc, instead of complicated gymnastics. Rename the attr 'doc', to expose it in the API

											
										
										
											2015-07-14 01:10:11 +03:00
+								        self.i = offset
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
-												Make Token hashable. Fixes #743

											
										
										
											2017-01-16 15:27:57 +03:00
+								    def __hash__(self):
 								        return hash((self.doc, self.i))
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								    def __len__(self):
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        """The number of unicode characters in the token, i.e. `token.text`.
 								        RETURNS (int): The number of unicode characters in the token.
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								        """
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        return self.c.lex.length
 								    def __unicode__(self):
-												Remove deprecation shim around str/bytes in Token.

											
										
										
											2016-10-17 15:02:47 +03:00
+								        return self.text
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
-												fixed error when printing unicode

											
										
										
											2015-11-02 21:22:18 +03:00
+								    def __bytes__(self):
-												Remove deprecation shim around str/bytes in Token.

											
										
										
											2016-10-17 15:02:47 +03:00
+								        return self.text.encode('utf8')
-												fixed error when printing unicode

											
										
										
											2015-11-02 21:22:18 +03:00
-												* Fix string coercion for Python 3

											
										
										
											2015-07-24 04:49:30 +03:00
+								    def __str__(self):
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								        if is_config(python3=True):
-												fixed error when printing unicode

											
										
										
											2015-11-02 21:22:18 +03:00
+								            return self.__unicode__()
 								        return self.__bytes__()
-												* Fix string coercion for Python 3

											
										
										
											2015-07-24 04:49:30 +03:00
-												added __repr__ that prints text in ipython for doc, token, and span objects

											
										
										
											2015-10-21 14:11:46 +03:00
+								    def __repr__(self):
-												fixed error when printing unicode

											
										
										
											2015-11-02 21:22:18 +03:00
+								        return self.__str__()
-												added __repr__ that prints text in ipython for doc, token, and span objects

											
										
										
											2015-10-21 14:11:46 +03:00
-												Amend 8ae8b443f: Handle comparison with None tokens.

											
										
										
											2017-01-11 15:03:32 +03:00
+								    def __richcmp__(self, Token other, int op):
-												Add richcmp method to Token. Closes #631

											
										
										
											2017-01-09 21:30:31 +03:00
+								        # http://cython.readthedocs.io/en/latest/src/userguide/special_methods.html
 								        my = self.idx
-												Amend 8ae8b443f: Handle comparison with None tokens.

											
										
										
											2017-01-11 15:03:32 +03:00
+								        their = other.idx if other is not None else None
-												Add richcmp method to Token. Closes #631

											
										
										
											2017-01-09 21:30:31 +03:00
+								        if op == 0:
 								            return my < their
 								        elif op == 2:
 								            return my == their
 								        elif op == 4:
 								            return my > their
 								        elif op == 1:
 								            return my <= their
 								        elif op == 3:
 								            return my != their
 								        elif op == 5:
 								            return my >= their
 								        else:
 								            raise ValueError(op)
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								    cpdef bint check_flag(self, attr_id_t flag_id) except -1:
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        """Check the value of a boolean flag.
 								        flag_id (int): The ID of the flag attribute.
 								        RETURNS (bool): Whether the flag is set.
-												Add support for Universal Dependencies v2.0

											
										
										
											2017-02-27 00:27:11 +03:00
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        EXAMPLE:
 								            >>> from spacy.attrs import IS_TITLE
 								            >>> doc = nlp(u'Give it back! He pleaded.')
 								            >>> token = doc[0]
 								            >>> token.check_flag(IS_TITLE)
 								            True
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								        """
-												* Fix ugly py_check_flag and py_set_flag functions in Lexeme

											
										
										
											2015-09-15 06:06:18 +03:00
+								        return Lexeme.c_check_flag(self.c.lex, flag_id)
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
 								    def nbor(self, int i=1):
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        """Get a neighboring token.
-												Fix doc strings

											
										
										
											2016-11-01 14:25:36 +03:00
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        i (int): The relative position of the token to get. Defaults to 1.
 								        RETURNS (Token): The token at position `self.doc[self.i+i]`.
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								        """
-												* Go back to having token reference doc, instead of complicated gymnastics. Rename the attr 'doc', to expose it in the API

											
										
										
											2015-07-14 01:10:11 +03:00
+								        return self.doc[self.i+i]
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
-												* Extend word vectors support, with .similarity() function, vector_norm property, and rename repvec to vector. Keep repvec name as well for now for backwards compatibility.

											
										
										
											2015-09-14 10:49:58 +03:00
+								    def similarity(self, other):
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        """Make a semantic similarity estimate. The default estimate is cosine
 								        similarity using an average of word vectors.
 								        other (object): The object to compare with. By default, accepts `Doc`,
 								            `Span`, `Token` and `Lexeme` objects.
 								        RETURNS (float): A scalar similarity score. Higher is more similar.
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								        """
-												Add sentiment field to doc, rename getters_for_tokens and getters_for_spans, add user_hooks field to Doc.

											
										
										
											2016-10-19 21:54:03 +03:00
+								        if 'similarity' in self.doc.user_token_hooks:
-												Try using tensor for vector/similarity methods

											
										
										
											2017-05-31 00:35:17 +03:00
+								            return self.doc.user_token_hooks['similarity'](self)
-												* Fix vectors bugs for OOV words

											
										
										
											2015-09-22 03:10:01 +03:00
+								        if self.vector_norm == 0 or other.vector_norm == 0:
 								            return 0.0
-												* Extend word vectors support, with .similarity() function, vector_norm property, and rename repvec to vector. Keep repvec name as well for now for backwards compatibility.

											
										
										
											2015-09-14 10:49:58 +03:00
+								        return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								    property lex_id:
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        """ID of the token's lexical type.
 								        RETURNS (int): ID of the token's lexical type."""
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        def __get__(self):
 								            return self.c.lex.id
-												* Add .rank property to Token and Lexeme, for frequency rank

											
										
										
											2015-11-08 18:18:25 +03:00
+								    property rank:
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        # TODO: add docstring
-												* Add .rank property to Token and Lexeme, for frequency rank

											
										
										
											2015-11-08 18:18:25 +03:00
+								        def __get__(self):
 								            return self.c.lex.id
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								    property string:
 								        def __get__(self):
-												* Route token.string via token.txt_with_ws, to deprecate token.string in future

											
										
										
											2016-01-16 19:14:34 +03:00
+								            return self.text_with_ws
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
-												* Add text and text_with_ws attributes.

											
										
										
											2015-09-13 03:27:42 +03:00
+								    property text:
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        """A unicode representation of the token text.
 								        RETURNS (unicode): The original verbatim text of the token.
 								        """
-												* Add text and text_with_ws attributes.

											
										
										
											2015-09-13 03:27:42 +03:00
+								        def __get__(self):
 								            return self.orth_
 								    property text_with_ws:
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        """The text content of the token with a trailing whitespace character if
 								        it has one.
 								        RETURNS (unicode): The text content of the span (with trailing whitespace).
 								        """
-												* Add text and text_with_ws attributes.

											
										
										
											2015-09-13 03:27:42 +03:00
+								        def __get__(self):
-												Revert "Work on Issue #285: intern strings into document-specific pools, to address streaming data memory growth. StringStore.__getitem__ now raises KeyError when it can't find the string. Use StringStore.intern() to get the old behaviour. Still need to hunt down all uses of StringStore.__getitem__ in library and do testing, but logic looks good."

This reverts commit 8423e8627f0c9e54ab026d5c51701e9e0a9c38d2.

											
										
										
											2016-09-30 21:20:22 +03:00
+								            cdef unicode orth = self.vocab.strings[self.c.lex.orth]
-												* Add text and text_with_ws attributes.

											
										
										
											2015-09-13 03:27:42 +03:00
+								            if self.c.spacy:
-												Revert "Work on Issue #285: intern strings into document-specific pools, to address streaming data memory growth. StringStore.__getitem__ now raises KeyError when it can't find the string. Use StringStore.intern() to get the old behaviour. Still need to hunt down all uses of StringStore.__getitem__ in library and do testing, but logic looks good."

This reverts commit 8423e8627f0c9e54ab026d5c51701e9e0a9c38d2.

											
										
										
											2016-09-30 21:20:22 +03:00
+								                return orth + u' '
-												* Add text and text_with_ws attributes.

											
										
										
											2015-09-13 03:27:42 +03:00
+								            else:
-												Revert "Work on Issue #285: intern strings into document-specific pools, to address streaming data memory growth. StringStore.__getitem__ now raises KeyError when it can't find the string. Use StringStore.intern() to get the old behaviour. Still need to hunt down all uses of StringStore.__getitem__ in library and do testing, but logic looks good."

This reverts commit 8423e8627f0c9e54ab026d5c51701e9e0a9c38d2.

											
										
										
											2016-09-30 21:20:22 +03:00
+								                return orth
-												* Add text and text_with_ws attributes.

											
										
										
											2015-09-13 03:27:42 +03:00
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								    property prob:
 								        """The `prob` field of the token's lexeme.
 								        RETURNS: `self.c.lex.prob`. Presumably a (log) probability estimate
 								            for the word type -- TODO confirm against the Lexeme docs.
 								        """
 								        def __get__(self):
 								            return self.c.lex.prob
-												Add sentiment field to doc, rename getters_for_tokens and getters_for_spans, add user_hooks field to Doc.

											
										
										
											2016-10-19 21:54:03 +03:00
+								    property sentiment:
 								        """Sentiment value for the token.
 								        RETURNS: The result of the 'sentiment' hook in
 								            `doc.user_token_hooks` if one is registered, otherwise the
 								            `sentiment` field of the token's lexeme.
 								        """
 								        def __get__(self):
 								            # A user-supplied hook takes precedence over the lexeme's value.
 								            if 'sentiment' in self.doc.user_token_hooks:
 								                return self.doc.user_token_hooks['sentiment'](self)
 								            return self.c.lex.sentiment
-												introduce lang field for LexemeC to hold language id
put noun_chunk logic into iterators.py for each language separately

											
										
										
											2016-03-10 15:01:34 +03:00
+								    property lang:
 								        """Language ID stored on the token's lexeme.
 								        RETURNS: `self.c.lex.lang`, the language id held by the lexeme.
 								        """
 								        def __get__(self):
 								            return self.c.lex.lang
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								    property idx:
 								        """The `idx` field of the token's underlying C struct.
 								        RETURNS: `self.c.idx`. This is the value `__richcmp__` uses to order
 								            tokens. Presumably the character offset of the token within the
 								            parent document -- TODO confirm.
 								        """
 								        def __get__(self):
 								            return self.c.idx
 								    property cluster:
 								        """The `cluster` field of the token's lexeme.
 								        RETURNS: `self.c.lex.cluster`. Presumably a word-cluster ID (e.g. a
 								            Brown cluster) -- TODO confirm.
 								        """
 								        def __get__(self):
 								            return self.c.lex.cluster
 								    property orth:
 								        """ID of the token's verbatim text.
 								        RETURNS: `self.c.lex.orth`, the key that `text_with_ws` looks up in
 								            `self.vocab.strings` to recover the unicode text.
 								        """
 								        def __get__(self):
 								            return self.c.lex.orth
 								    property lower:
 								        """The `lower` field of the token's lexeme.
 								        RETURNS: `self.c.lex.lower`. Presumably the string-store ID of the
 								            lowercased form of the token text -- TODO confirm.
 								        """
 								        def __get__(self):
 								            return self.c.lex.lower
 								    property norm:
 								        """The `norm` field of the token's lexeme.
 								        RETURNS: `self.c.lex.norm`. Presumably the string-store ID of the
 								            normalized form of the token text -- TODO confirm.
 								        """
 								        def __get__(self):
 								            return self.c.lex.norm
 								    property shape:
 								        """The `shape` field of the token's lexeme.
 								        RETURNS: `self.c.lex.shape`. Presumably the string-store ID of the
 								            token's orthographic shape -- TODO confirm.
 								        """
 								        def __get__(self):
 								            return self.c.lex.shape
 								    property prefix:
 								        """The `prefix` field of the token's lexeme.
 								        RETURNS: `self.c.lex.prefix`. Presumably the string-store ID of a
 								            leading substring of the token text -- TODO confirm.
 								        """
 								        def __get__(self):
 								            return self.c.lex.prefix
 								    property suffix:
 								        """The `suffix` field of the token's lexeme.
 								        RETURNS: `self.c.lex.suffix`. Presumably the string-store ID of a
 								            trailing substring of the token text -- TODO confirm.
 								        """
 								        def __get__(self):
 								            return self.c.lex.suffix
 								    property lemma:
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        """Base form of the word, with no inflectional suffixes.
-												Finish stringstore change. Also xfail vectors tests

											
										
										
											2017-05-28 16:10:22 +03:00
+								        RETURNS (uint64): Token lemma.
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        """
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        def __get__(self):
 								            return self.c.lemma
-												Finish stringstore change. Also xfail vectors tests

											
										
										
											2017-05-28 16:10:22 +03:00
+								        def __set__(self, attr_t lemma):
-												Allow lemma to be set from Python. Re #973

											
										
										
											2017-04-16 19:07:53 +03:00
+								            self.c.lemma = lemma
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
 								    property pos:
 								        """The `pos` field of the token's underlying C struct (read-only here).
 								        RETURNS: `self.c.pos`. Presumably the coarse-grained part-of-speech
 								            ID -- TODO confirm.
 								        """
 								        def __get__(self):
 								            return self.c.pos
 								    property tag:
 								        def __get__(self):
 								            return self.c.tag
-												Finish stringstore change. Also xfail vectors tests

											
										
										
											2017-05-28 16:10:22 +03:00
+								        def __set__(self, attr_t tag):
-												Fix #595: Lemmatization was incorrect for base forms, because morphological analyser wasn't adding morphology properly.

											
										
										
											2016-11-04 02:29:07 +03:00
+								            self.vocab.morphology.assign_tag(self.c, tag)
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
 								    property dep:
 								        def __get__(self):
 								            return self.c.dep
-												Finish stringstore change. Also xfail vectors tests

											
										
										
											2017-05-28 16:10:22 +03:00
+								        def __set__(self, attr_t label):
-												add function for setting head and label to token
change PseudoProjectivity.deprojectivize to use these functions

											
										
										
											2016-03-11 19:31:06 +03:00
+								            self.c.dep = label
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
-												* Add has_vector attribute to Token and Lexeme

											
										
										
											2015-09-21 12:52:43 +03:00
+								    property has_vector:
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        """A boolean value indicating whether a word vector is associated with
 								        the object.
 								        RETURNS (bool): Whether a word vector is associated with the object.
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								        """
-												* Add has_vector attribute to Token and Lexeme

											
										
										
											2015-09-21 12:52:43 +03:00
+								        def __get__(self):
-												Add sentiment field to doc, rename getters_for_tokens and getters_for_spans, add user_hooks field to Doc.

											
										
										
											2016-10-19 21:54:03 +03:00
+								            if 'has_vector' in self.doc.user_token_hooks:
 								                return self.doc.user_token_hooks['has_vector'](self)
-												Fix vector linkage for token

											
										
										
											2017-06-04 22:19:58 +03:00
+								            return self.vocab.has_vector(self.c.lex.orth)
-												* Add has_vector attribute to Token and Lexeme

											
										
										
											2015-09-21 12:52:43 +03:00
-												* Extend word vectors support, with .similarity() function, vector_norm property, and rename repvec to vector. Keep repvec name as well for now for backwards compatibility.

											
										
										
											2015-09-14 10:49:58 +03:00
+								    property vector:
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        """A real-valued meaning representation.
-												Add support for Universal Dependencies v2.0

											
										
										
											2017-02-27 00:27:11 +03:00
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array
 								            representing the token's semantics.
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								        """
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        def __get__(self):
-												Add sentiment field to doc, rename getters_for_tokens and getters_for_spans, add user_hooks field to Doc.

											
										
										
											2016-10-19 21:54:03 +03:00
+								            if 'vector' in self.doc.user_token_hooks:
 								                return self.doc.user_token_hooks['vector'](self)
-												Try using tensor for vector/similarity methods

											
										
										
											2017-05-31 00:35:17 +03:00
+								            if self.has_vector:
 								                return self.vocab.get_vector(self.c.lex.orth)
 								            else:
 								                return self.doc.tensor[self.i]
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
-												* Extend word vectors support, with .similarity() function, vector_norm property, and rename repvec to vector. Keep repvec name as well for now for backwards compatibility.

											
										
										
											2015-09-14 10:49:58 +03:00
+								    property vector_norm:
-												Update docstrings and API docs for Token

											
										
										
											2017-05-20 16:13:33 +03:00
+								        """The L2 norm of the token's vector representation.
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
 								        RETURNS (float): The L2 norm of the vector representation.
 								        """
-												* Extend word vectors support, with .similarity() function, vector_norm property, and rename repvec to vector. Keep repvec name as well for now for backwards compatibility.

											
										
										
											2015-09-14 10:49:58 +03:00
+								        def __get__(self):
-												Add sentiment field to doc, rename getters_for_tokens and getters_for_spans, add user_hooks field to Doc.

											
										
										
											2016-10-19 21:54:03 +03:00
+								            if 'vector_norm' in self.doc.user_token_hooks:
 								                return self.doc.user_token_hooks['vector_norm'](self)
-												Re-delegate vectors to vocab

											
										
										
											2017-05-28 12:46:10 +03:00
+								            vector = self.vector
 								            return numpy.sqrt((vector ** 2).sum())
-												* Extend word vectors support, with .similarity() function, vector_norm property, and rename repvec to vector. Keep repvec name as well for now for backwards compatibility.

											
										
										
											2015-09-14 10:49:58 +03:00
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								    property n_lefts:
 								        def __get__(self):
-												revert init_model.py back to pre-german state (because it makes more sense)
simplify token.n_rights and token.n_lefts

											
										
										
											2016-03-21 18:10:25 +03:00
+								            return self.c.l_kids
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
 								    property n_rights:
 								        def __get__(self):
-												revert init_model.py back to pre-german state (because it makes more sense)
simplify token.n_rights and token.n_lefts

											
										
										
											2016-03-21 18:10:25 +03:00
+								            return self.c.r_kids
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
-												* Add Token.sent_start property, re Issue #235

											
										
										
											2016-05-05 12:53:20 +03:00
+								    property sent_start:
 								        """Sentence-start flag stored on the token's underlying C struct.
 								        RETURNS: `self.c.sent_start`.
 								        RAISES (ValueError): On assignment, if the parent document reports
 								            `is_parsed`.
 								        """
 								        def __get__(self):
 								            return self.c.sent_start
 								        def __set__(self, bint value):
 								            # Writes are refused once the document is parsed; the error
 								            # message below explains why and links to workarounds.
 								            if self.doc.is_parsed:
 								                raise ValueError(
 								                    'Refusing to write to token.sent_start if its document is parsed, '
 								                    'because this may cause inconsistent state. '
 								                    'See https://github.com/spacy-io/spaCy/issues/235 for workarounds.')
 								            self.c.sent_start = value
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								    property lefts:
 								        def __get__(self):
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								            """
 								            The leftward immediate children of the word, in the syntactic
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								            dependency parse.
 								            """
-												* Add loop guard to Token.lefts and Token.rights properties

											
										
										
											2016-01-16 18:18:17 +03:00
+								            cdef int nr_iter = 0
-												* Fix bug in token subtree, introduced by duplication of L/R code in Stateclass. Need to consolidate the two methods.

											
										
										
											2015-09-06 11:48:36 +03:00
+								            cdef const TokenC* ptr = self.c - (self.i - self.c.l_edge)
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								            while ptr < self.c:
-												integrated pseudo-projective parsing into parser

- nonproj.pyx holds a class PseudoProjectivity which currently holds
  all functionality to implement Nivre & Nilsson 2005's pseudo-projective
  parsing using the HEAD decoration scheme
- changed lefts/rights in Token to account for possible non-projective
  structures

											
										
										
											2016-03-01 12:09:08 +03:00
+								                if ptr + ptr.head == self.c:
-												* Go back to having token reference doc, instead of complicated gymnastics. Rename the attr 'doc', to expose it in the API

											
										
										
											2015-07-14 01:10:11 +03:00
+								                    yield self.doc[ptr - (self.c - self.i)]
-												integrated pseudo-projective parsing into parser

- nonproj.pyx holds a class PseudoProjectivity which currently holds
  all functionality to implement Nivre & Nilsson 2005's pseudo-projective
  parsing using the HEAD decoration scheme
- changed lefts/rights in Token to account for possible non-projective
  structures

											
										
										
											2016-03-01 12:09:08 +03:00
+								                ptr += 1
-												* Add loop guard to Token.lefts and Token.rights properties

											
										
										
											2016-01-16 18:18:17 +03:00
+								                nr_iter += 1
 								                # This is ugly, but it's a way to guard out infinite loops
 								                if nr_iter >= 10000000:
 								                    raise RuntimeError(
 								                        "Possibly infinite loop encountered while looking for token.lefts")
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
 								    property rights:
 								        def __get__(self):
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								            """
 								            The rightward immediate children of the word, in the syntactic
 								            dependency parse.
 								            """
-												* Fix bug in token subtree, introduced by duplication of L/R code in Stateclass. Need to consolidate the two methods.

											
										
										
											2015-09-06 11:48:36 +03:00
+								            cdef const TokenC* ptr = self.c + (self.c.r_edge - self.i)
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								            tokens = []
-												* Add loop guard to Token.lefts and Token.rights properties

											
										
										
											2016-01-16 18:18:17 +03:00
+								            cdef int nr_iter = 0
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								            while ptr > self.c:
-												integrated pseudo-projective parsing into parser

- nonproj.pyx holds a class PseudoProjectivity which currently holds
  all functionality to implement Nivre & Nilsson 2005's pseudo-projective
  parsing using the HEAD decoration scheme
- changed lefts/rights in Token to account for possible non-projective
  structures

											
										
										
											2016-03-01 12:09:08 +03:00
+								                if ptr + ptr.head == self.c:
-												* Go back to having token reference doc, instead of complicated gymnastics. Rename the attr 'doc', to expose it in the API

											
										
										
											2015-07-14 01:10:11 +03:00
+								                    tokens.append(self.doc[ptr - (self.c - self.i)])
-												integrated pseudo-projective parsing into parser

- nonproj.pyx holds a class PseudoProjectivity which currently holds
  all functionality to implement Nivre & Nilsson 2005's pseudo-projective
  parsing using the HEAD decoration scheme
- changed lefts/rights in Token to account for possible non-projective
  structures

											
										
										
											2016-03-01 12:09:08 +03:00
+								                ptr -= 1
 								                nr_iter += 1
-												* Add loop guard to Token.lefts and Token.rights properties

											
										
										
											2016-01-16 18:18:17 +03:00
+								                if nr_iter >= 10000000:
 								                    raise RuntimeError(
 								                        "Possibly infinite loop encountered while looking for token.rights")
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								            tokens.reverse()
 								            for t in tokens:
 								                yield t
 								    property children:
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								        """
 								        A sequence of the token's immediate syntactic children.
-												Fix doc strings

											
										
										
											2016-11-01 14:25:36 +03:00
 								        Yields: Token A child token such that child.head==self
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								        """
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        def __get__(self):
 								            yield from self.lefts
 								            yield from self.rights
 								    property subtree:
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								        """
 								        A sequence of all the token's syntactic descendents.
-												Fix doc strings

											
										
										
											2016-11-01 14:25:36 +03:00
 								        Yields: Token A descendent token such that self.is_ancestor(descendent)
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								        """
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        def __get__(self):
 								            for word in self.lefts:
 								                yield from word.subtree
 								            yield self
 								            for word in self.rights:
 								                yield from word.subtree
 								    property left_edge:
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        """The leftmost token of this token's syntactic descendents.
-												Fix doc strings

											
										
										
											2016-11-01 14:25:36 +03:00
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        RETURNS (Token): The first token such that `self.is_ancestor(token)`.
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								        """
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        def __get__(self):
-												* Go back to having token reference doc, instead of complicated gymnastics. Rename the attr 'doc', to expose it in the API

											
										
										
											2015-07-14 01:10:11 +03:00
+								            return self.doc[self.c.l_edge]
-												* Whitespace

											
										
										
											2015-08-09 00:37:44 +03:00
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								    property right_edge:
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        """The rightmost token of this token's syntactic descendents.
-												Fix doc strings

											
										
										
											2016-11-01 14:25:36 +03:00
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        RETURNS (Token): The last token such that `self.is_ancestor(token)`.
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								        """
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        def __get__(self):
-												* Go back to having token reference doc, instead of complicated gymnastics. Rename the attr 'doc', to expose it in the API

											
										
										
											2015-07-14 01:10:11 +03:00
+								            return self.doc[self.c.r_edge]
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
-												add function for setting head and label to token
change PseudoProjectivity.deprojectivize to use these functions

											
										
										
											2016-03-11 19:31:06 +03:00
+								    property ancestors:
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        """A sequence of this token's syntactic ancestors.
-												Fix doc strings

											
										
										
											2016-11-01 14:25:36 +03:00
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        YIELDS (Token): A sequence of ancestor tokens such that
 								            `ancestor.is_ancestor(self)`.
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								        """
-												add function for setting head and label to token
change PseudoProjectivity.deprojectivize to use these functions

											
										
										
											2016-03-11 19:31:06 +03:00
+								        def __get__(self):
 								            cdef const TokenC* head_ptr = self.c
-												Add support for Universal Dependencies v2.0

											
										
										
											2017-02-27 00:27:11 +03:00
+								            # guard against infinite loop, no token can have
-												add function for setting head and label to token
change PseudoProjectivity.deprojectivize to use these functions

											
										
										
											2016-03-11 19:31:06 +03:00
+								            # more ancestors than tokens in the tree
 								            cdef int i = 0
 								            while head_ptr.head != 0 and i < self.doc.length:
 								                head_ptr += head_ptr.head
 								                yield self.doc[head_ptr - (self.c - self.i)]
 								                i += 1
-												Fix doc strings

											
										
										
											2016-11-01 14:25:36 +03:00
+								    def is_ancestor(self, descendant):
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        """Check whether this token is a parent, grandparent, etc. of another
-												Fix doc strings

											
										
										
											2016-11-01 14:25:36 +03:00
+								        in the dependency tree.
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        descendant (Token): Another token.
 								        RETURNS (bool): Whether this token is the ancestor of the descendant.
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								        """
-												Fix variable error in token

											
										
										
											2016-11-01 15:28:00 +03:00
+								        if self.doc is not descendant.doc:
-												Fix doc strings

											
										
										
											2016-11-01 14:25:36 +03:00
+								            return False
-												add function for setting head and label to token
change PseudoProjectivity.deprojectivize to use these functions

											
										
										
											2016-03-11 19:31:06 +03:00
+								        return any( ancestor.i == self.i for ancestor in descendant.ancestors )
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								    property head:
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        """The syntactic parent, or "governor", of this token.
-												Add support for Universal Dependencies v2.0

											
										
										
											2017-02-27 00:27:11 +03:00
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        RETURNS (Token): The token head.
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								        """
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        def __get__(self):
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								            """The token predicted by the parser to be the head of the current
 								            token.
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								            """
-												* Go back to having token reference doc, instead of complicated gymnastics. Rename the attr 'doc', to expose it in the API

											
										
										
											2015-07-14 01:10:11 +03:00
+								            return self.doc[self.i + self.c.head]
-												add function for setting head and label to token
change PseudoProjectivity.deprojectivize to use these functions

											
										
										
											2016-03-11 19:31:06 +03:00
+								        def __set__(self, Token new_head):
 								            # this function sets the head of self to new_head
 								            # and updates the counters for left/right dependents
 								            # and left/right corner for the new and the old head
 								            # do nothing if old head is new head
 								            if self.i + self.c.head == new_head.i:
 								                return
 								            cdef Token old_head = self.head
 								            cdef int rel_newhead_i = new_head.i - self.i
 								            # is the new head a descendant of the old head
-												Use is_ancestor instead of deprecated is_ancestor_of

											
										
										
											2017-05-19 21:23:40 +03:00
+								            cdef bint is_desc = old_head.is_ancestor(new_head)
-												Add support for Universal Dependencies v2.0

											
										
										
											2017-02-27 00:27:11 +03:00
-												add function for setting head and label to token
change PseudoProjectivity.deprojectivize to use these functions

											
										
										
											2016-03-11 19:31:06 +03:00
+								            cdef int new_edge
-												changed head.__set__ to make it simpler

											
										
										
											2016-03-14 15:43:48 +03:00
+								            cdef Token anc, child
-												add function for setting head and label to token
change PseudoProjectivity.deprojectivize to use these functions

											
										
										
											2016-03-11 19:31:06 +03:00
 								            # update number of deps of old head
 								            if self.c.head > 0: # left dependent
 								                old_head.c.l_kids -= 1
 								                if self.c.l_edge == old_head.c.l_edge:
 								                    # the token dominates the left edge so the left edge of the head
 								                    # may change when the token is reattached
 								                    # it may not change if the new head is a descendant of the current head
 								                    new_edge = self.c.l_edge
-												changed head.__set__ to make it simpler

											
										
										
											2016-03-14 15:43:48 +03:00
+								                    # the new l_edge is the left-most l_edge on any of the other dependents
 								                    # where the l_edge is left of the head, otherwise it is the head
-												add function for setting head and label to token
change PseudoProjectivity.deprojectivize to use these functions

											
										
										
											2016-03-11 19:31:06 +03:00
+								                    if not is_desc:
-												changed head.__set__ to make it simpler

											
										
										
											2016-03-14 15:43:48 +03:00
+								                        new_edge = old_head.i
 								                        for child in old_head.children:
 								                            if child == self:
 								                                continue
 								                            if child.c.l_edge < new_edge:
 								                                new_edge = child.c.l_edge
 								                        old_head.c.l_edge = new_edge
-												add function for setting head and label to token
change PseudoProjectivity.deprojectivize to use these functions

											
										
										
											2016-03-11 19:31:06 +03:00
+								                    # walk up the tree from old_head and assign new l_edge to ancestors
 								                    # until an ancestor already has an l_edge that's further left
 								                    for anc in old_head.ancestors:
 								                        if anc.c.l_edge <= new_edge:
 								                            break
 								                        anc.c.l_edge = new_edge
-												Add support for Universal Dependencies v2.0

											
										
										
											2017-02-27 00:27:11 +03:00
-												add function for setting head and label to token
change PseudoProjectivity.deprojectivize to use these functions

											
										
										
											2016-03-11 19:31:06 +03:00
+								            elif self.c.head < 0: # right dependent
 								                old_head.c.r_kids -= 1
 								                # do the same thing as for l_edge
 								                if self.c.r_edge == old_head.c.r_edge:
 								                    new_edge = self.c.r_edge
-												changed head.__set__ to make it simpler

											
										
										
											2016-03-14 15:43:48 +03:00
-												add function for setting head and label to token
change PseudoProjectivity.deprojectivize to use these functions

											
										
										
											2016-03-11 19:31:06 +03:00
+								                    if not is_desc:
-												changed head.__set__ to make it simpler

											
										
										
											2016-03-14 15:43:48 +03:00
+								                        new_edge = old_head.i
 								                        for child in old_head.children:
 								                            if child == self:
 								                                continue
 								                            if child.c.r_edge > new_edge:
 								                                new_edge = child.c.r_edge
 								                        old_head.c.r_edge = new_edge
-												Add support for Universal Dependencies v2.0

											
										
										
											2017-02-27 00:27:11 +03:00
-												add function for setting head and label to token
change PseudoProjectivity.deprojectivize to use these functions

											
										
										
											2016-03-11 19:31:06 +03:00
+								                    for anc in old_head.ancestors:
 								                        if anc.c.r_edge >= new_edge:
 								                            break
 								                        anc.c.r_edge = new_edge
 								            # update number of deps of new head
 								            if rel_newhead_i > 0: # left dependent
 								                new_head.c.l_kids += 1
 								                # walk up the tree from new head and set l_edge to self.l_edge
 								                # until you hit a token with an l_edge further to the left
 								                if self.c.l_edge < new_head.c.l_edge:
-												changed head.__set__ to make it simpler

											
										
										
											2016-03-14 15:43:48 +03:00
+								                    new_head.c.l_edge = self.c.l_edge
-												add function for setting head and label to token
change PseudoProjectivity.deprojectivize to use these functions

											
										
										
											2016-03-11 19:31:06 +03:00
+								                    for anc in new_head.ancestors:
-												changed head.__set__ to make it simpler

											
										
										
											2016-03-14 15:43:48 +03:00
+								                        if anc.c.l_edge <= self.c.l_edge:
-												add function for setting head and label to token
change PseudoProjectivity.deprojectivize to use these functions

											
										
										
											2016-03-11 19:31:06 +03:00
+								                            break
-												changed head.__set__ to make it simpler

											
										
										
											2016-03-14 15:43:48 +03:00
+								                        anc.c.l_edge = self.c.l_edge
-												add function for setting head and label to token
change PseudoProjectivity.deprojectivize to use these functions

											
										
										
											2016-03-11 19:31:06 +03:00
 								            elif rel_newhead_i < 0: # right dependent
 								                new_head.c.r_kids += 1
 								                # do the same as for l_edge
 								                if self.c.r_edge > new_head.c.r_edge:
-												changed head.__set__ to make it simpler

											
										
										
											2016-03-14 15:43:48 +03:00
+								                    new_head.c.r_edge = self.c.r_edge
-												add function for setting head and label to token
change PseudoProjectivity.deprojectivize to use these functions

											
										
										
											2016-03-11 19:31:06 +03:00
+								                    for anc in new_head.ancestors:
-												changed head.__set__ to make it simpler

											
										
										
											2016-03-14 15:43:48 +03:00
+								                        if anc.c.r_edge >= self.c.r_edge:
-												add function for setting head and label to token
change PseudoProjectivity.deprojectivize to use these functions

											
										
										
											2016-03-11 19:31:06 +03:00
+								                            break
-												changed head.__set__ to make it simpler

											
										
										
											2016-03-14 15:43:48 +03:00
+								                        anc.c.r_edge = self.c.r_edge
-												add function for setting head and label to token
change PseudoProjectivity.deprojectivize to use these functions

											
										
										
											2016-03-11 19:31:06 +03:00
 								            # set new head
 								            self.c.head = rel_newhead_i
-												* Whitespace

											
										
										
											2015-08-09 00:37:44 +03:00
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								    property conjuncts:
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        """A sequence of coordinated tokens, including the token itself.
-												Fix doc strings

											
										
										
											2016-11-01 14:25:36 +03:00
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        YIELDS (Token): A coordinated token.
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								        """
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        def __get__(self):
-												* Fix token.conjuncts method

											
										
										
											2015-10-14 19:34:57 +03:00
+								            """Get a list of conjoined words."""
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								            cdef Token word
-												Add sentiment field to doc, rename getters_for_tokens and getters_for_spans, add user_hooks field to Doc.

											
										
										
											2016-10-19 21:54:03 +03:00
+								            if 'conjuncts' in self.doc.user_token_hooks:
 								                yield from self.doc.user_token_hooks['conjuncts'](self)
-												Defer some attributes to Doc, via getters_for_tokens attribute.

											
										
										
											2016-10-17 03:44:49 +03:00
+								            else:
 								                if self.dep_ != 'conj':
 								                    for word in self.rights:
 								                        if word.dep_ == 'conj':
 								                            yield word
 								                            yield from word.conjuncts
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
 								    property ent_type:
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        """Named entity type.
-												Finish stringstore change. Also xfail vectors tests

											
										
										
											2017-05-28 16:10:22 +03:00
+								        RETURNS (uint64): Named entity type.
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        """
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        def __get__(self):
 								            return self.c.ent_type
-												Finish stringstore change. Also xfail vectors tests

											
										
										
											2017-05-28 16:10:22 +03:00
+								        def __set__(self, ent_type):
 								            self.c.ent_type = ent_type
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
 								    property ent_iob:
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        """IOB code of named entity tag. `1="I", 2="O", 3="B"`. 0 means no tag
 								        is assigned.
-												Finish stringstore change. Also xfail vectors tests

											
										
										
											2017-05-28 16:10:22 +03:00
+								        RETURNS (uint64): IOB code of named entity tag.
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        """
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        def __get__(self):
 								            return self.c.ent_iob
 								    property ent_type_:
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        """Named entity type.
 								        RETURNS (unicode): Named entity type.
 								        """
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        def __get__(self):
-												Revert "Work on Issue #285: intern strings into document-specific pools, to address streaming data memory growth. StringStore.__getitem__ now raises KeyError when it can't find the string. Use StringStore.intern() to get the old behaviour. Still need to hunt down all uses of StringStore.__getitem__ in library and do testing, but logic looks good."

This reverts commit 8423e8627f0c9e54ab026d5c51701e9e0a9c38d2.

											
										
										
											2016-09-30 21:20:22 +03:00
+								            return self.vocab.strings[self.c.ent_type]
-												Finish stringstore change. Also xfail vectors tests

											
										
										
											2017-05-28 16:10:22 +03:00
+								        def __set__(self, ent_type):
 								            self.c.ent_type = self.vocab.strings.add(ent_type)
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
 								    property ent_iob_:
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        """IOB code of named entity tag. "B" means the token begins an entity,
 								        "I" means it is inside an entity, "O" means it is outside an entity, and
 								        "" means no entity tag is set.
 								        RETURNS (unicode): IOB code of named entity tag.
 								        """
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        def __get__(self):
 								            iob_strings = ('', 'I', 'O', 'B')
 								            return iob_strings[self.c.ent_iob]
-												Initial, limited support for quantified patterns in Matcher, and tracking of ent_id attribute in Token and Span. The quantifiers need a lot more testing, and there are some known problems. The main known problem is that the zero-plus and one-plus quantifiers won't work if a token can match both the quantified pattern expression AND the tail of the match.

											
										
										
											2016-09-21 15:54:55 +03:00
+								    property ent_id:
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        """ID of the entity the token is an instance of, if any. Usually
 								        assigned by patterns in the Matcher.
-												Finish stringstore change. Also xfail vectors tests

											
										
										
											2017-05-28 16:10:22 +03:00
+								        RETURNS (uint64): ID of the entity.
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								        """
-												Initial, limited support for quantified patterns in Matcher, and tracking of ent_id attribute in Token and Span. The quantifiers need a lot more testing, and there are some known problems. The main known problem is that the zero-plus and one-plus quantifiers won't work if a token can match both the quantified pattern expression AND the tail of the match.

											
										
										
											2016-09-21 15:54:55 +03:00
+								        def __get__(self):
-												Fix token.pyx

											
										
										
											2016-09-23 16:07:07 +03:00
+								            return self.c.ent_id
-												Initial, limited support for quantified patterns in Matcher, and tracking of ent_id attribute in Token and Span. The quantifiers need a lot more testing, and there are some known problems. The main known problem is that the zero-plus and one-plus quantifiers won't work if a token can match both the quantified pattern expression AND the tail of the match.

											
										
										
											2016-09-21 15:54:55 +03:00
 								        def __set__(self, hash_t key):
-												Allow ent_id to be set in Token

											
										
										
											2017-03-31 15:00:14 +03:00
+								            self.c.ent_id = key
-												Initial, limited support for quantified patterns in Matcher, and tracking of ent_id attribute in Token and Span. The quantifiers need a lot more testing, and there are some known problems. The main known problem is that the zero-plus and one-plus quantifiers won't work if a token can match both the quantified pattern expression AND the tail of the match.

											
										
										
											2016-09-21 15:54:55 +03:00
 								    property ent_id_:
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        """ID of the entity the token is an instance of, if any. Usually
 								        assigned by patterns in the Matcher.
 								        RETURNS (unicode): ID of the entity.
-												Tidy up and fix formatting and imports

											
										
										
											2017-04-15 14:05:15 +03:00
+								        """
-												Initial, limited support for quantified patterns in Matcher, and tracking of ent_id attribute in Token and Span. The quantifiers need a lot more testing, and there are some known problems. The main known problem is that the zero-plus and one-plus quantifiers won't work if a token can match both the quantified pattern expression AND the tail of the match.

											
										
										
											2016-09-21 15:54:55 +03:00
+								        def __get__(self):
-												Revert "Work on Issue #285: intern strings into document-specific pools, to address streaming data memory growth. StringStore.__getitem__ now raises KeyError when it can't find the string. Use StringStore.intern() to get the old behaviour. Still need to hunt down all uses of StringStore.__getitem__ in library and do testing, but logic looks good."

This reverts commit 8423e8627f0c9e54ab026d5c51701e9e0a9c38d2.

											
										
										
											2016-09-30 21:20:22 +03:00
+								            return self.vocab.strings[self.c.ent_id]
-												Initial, limited support for quantified patterns in Matcher, and tracking of ent_id attribute in Token and Span. The quantifiers need a lot more testing, and there are some known problems. The main known problem is that the zero-plus and one-plus quantifiers won't work if a token can match both the quantified pattern expression AND the tail of the match.

											
										
										
											2016-09-21 15:54:55 +03:00
-												Allow ent_id to be set in Token

											
										
										
											2017-03-31 15:00:14 +03:00
+								        def __set__(self, name):
-												Finish stringstore change. Also xfail vectors tests

											
										
										
											2017-05-28 16:10:22 +03:00
+								            self.c.ent_id = self.vocab.strings.add(name)
-												Initial, limited support for quantified patterns in Matcher, and tracking of ent_id attribute in Token and Span. The quantifiers need a lot more testing, and there are some known problems. The main known problem is that the zero-plus and one-plus quantifiers won't work if a token can match both the quantified pattern expression AND the tail of the match.

											
										
										
											2016-09-21 15:54:55 +03:00
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								    property whitespace_:
 								        # Getter only: returns ' ' (a single space) when `self.c.spacy` is
 								        # truthy, and '' (the empty string) otherwise.
 								        def __get__(self):
-												* Fix whitespace_ calculation in Token

											
										
										
											2015-10-18 09:21:11 +03:00
+								            return ' ' if self.c.spacy else ''
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
 								    property orth_:
 								        # Getter only: looks up the lexeme's `orth` ID (self.c.lex.orth) in
 								        # `self.vocab.strings` and returns the entry found there.
 								        def __get__(self):
-												Revert "Work on Issue #285: intern strings into document-specific pools, to address streaming data memory growth. StringStore.__getitem__ now raises KeyError when it can't find the string. Use StringStore.intern() to get the old behaviour. Still need to hunt down all uses of StringStore.__getitem__ in library and do testing, but logic looks good."

This reverts commit 8423e8627f0c9e54ab026d5c51701e9e0a9c38d2.

											
										
										
											2016-09-30 21:20:22 +03:00
+								            return self.vocab.strings[self.c.lex.orth]
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
 								    property lower_:
 								        # Getter only: looks up the lexeme's `lower` ID (self.c.lex.lower) in
 								        # `self.vocab.strings` and returns the entry found there.
 								        def __get__(self):
-												Revert "Work on Issue #285: intern strings into document-specific pools, to address streaming data memory growth. StringStore.__getitem__ now raises KeyError when it can't find the string. Use StringStore.intern() to get the old behaviour. Still need to hunt down all uses of StringStore.__getitem__ in library and do testing, but logic looks good."

This reverts commit 8423e8627f0c9e54ab026d5c51701e9e0a9c38d2.

											
										
										
											2016-09-30 21:20:22 +03:00
+								            return self.vocab.strings[self.c.lex.lower]
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
 								    property norm_:
 								        # Getter only: looks up the lexeme's `norm` ID (self.c.lex.norm) in
 								        # `self.vocab.strings` and returns the entry found there.
 								        def __get__(self):
-												Revert "Work on Issue #285: intern strings into document-specific pools, to address streaming data memory growth. StringStore.__getitem__ now raises KeyError when it can't find the string. Use StringStore.intern() to get the old behaviour. Still need to hunt down all uses of StringStore.__getitem__ in library and do testing, but logic looks good."

This reverts commit 8423e8627f0c9e54ab026d5c51701e9e0a9c38d2.

											
										
										
											2016-09-30 21:20:22 +03:00
+								            return self.vocab.strings[self.c.lex.norm]
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
 								    property shape_:
 								        # Getter only: looks up the lexeme's `shape` ID (self.c.lex.shape) in
 								        # `self.vocab.strings` and returns the entry found there.
 								        def __get__(self):
-												Revert "Work on Issue #285: intern strings into document-specific pools, to address streaming data memory growth. StringStore.__getitem__ now raises KeyError when it can't find the string. Use StringStore.intern() to get the old behaviour. Still need to hunt down all uses of StringStore.__getitem__ in library and do testing, but logic looks good."

This reverts commit 8423e8627f0c9e54ab026d5c51701e9e0a9c38d2.

											
										
										
											2016-09-30 21:20:22 +03:00
+								            return self.vocab.strings[self.c.lex.shape]
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
 								    property prefix_:
 								        # Getter only: looks up the lexeme's `prefix` ID (self.c.lex.prefix) in
 								        # `self.vocab.strings` and returns the entry found there.
 								        def __get__(self):
-												Revert "Work on Issue #285: intern strings into document-specific pools, to address streaming data memory growth. StringStore.__getitem__ now raises KeyError when it can't find the string. Use StringStore.intern() to get the old behaviour. Still need to hunt down all uses of StringStore.__getitem__ in library and do testing, but logic looks good."

This reverts commit 8423e8627f0c9e54ab026d5c51701e9e0a9c38d2.

											
										
										
											2016-09-30 21:20:22 +03:00
+								            return self.vocab.strings[self.c.lex.prefix]
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
 								    property suffix_:
 								        # Getter only: looks up the lexeme's `suffix` ID (self.c.lex.suffix) in
 								        # `self.vocab.strings` and returns the entry found there.
 								        def __get__(self):
-												Revert "Work on Issue #285: intern strings into document-specific pools, to address streaming data memory growth. StringStore.__getitem__ now raises KeyError when it can't find the string. Use StringStore.intern() to get the old behaviour. Still need to hunt down all uses of StringStore.__getitem__ in library and do testing, but logic looks good."

This reverts commit 8423e8627f0c9e54ab026d5c51701e9e0a9c38d2.

											
										
										
											2016-09-30 21:20:22 +03:00
+								            return self.vocab.strings[self.c.lex.suffix]
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
-												introduce lang field for LexemeC to hold language id
put noun_chunk logic into iterators.py for each language separately

											
										
										
											2016-03-10 15:01:34 +03:00
+								    property lang_:
 								        # Getter only: looks up the lexeme's `lang` ID (self.c.lex.lang) in
 								        # `self.vocab.strings` and returns the entry found there.
 								        def __get__(self):
-												Revert "Work on Issue #285: intern strings into document-specific pools, to address streaming data memory growth. StringStore.__getitem__ now raises KeyError when it can't find the string. Use StringStore.intern() to get the old behaviour. Still need to hunt down all uses of StringStore.__getitem__ in library and do testing, but logic looks good."

This reverts commit 8423e8627f0c9e54ab026d5c51701e9e0a9c38d2.

											
										
										
											2016-09-30 21:20:22 +03:00
+								            return self.vocab.strings[self.c.lex.lang]
-												introduce lang field for LexemeC to hold language id
put noun_chunk logic into iterators.py for each language separately

											
										
										
											2016-03-10 15:01:34 +03:00
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								    property lemma_:
-												Update docstrings and API docs for Token

											
										
										
											2017-05-19 19:47:56 +03:00
+								        """Base form of the word, with no inflectional suffixes.
 								        RETURNS (unicode): Token lemma.
 								        """
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
+								        def __get__(self):
 								            # Reads the token-level `lemma` ID (self.c.lemma, not the lexeme)
 								            # and resolves it through `self.vocab.strings`.
-												Revert "Work on Issue #285: intern strings into document-specific pools, to address streaming data memory growth. StringStore.__getitem__ now raises KeyError when it can't find the string. Use StringStore.intern() to get the old behaviour. Still need to hunt down all uses of StringStore.__getitem__ in library and do testing, but logic looks good."

This reverts commit 8423e8627f0c9e54ab026d5c51701e9e0a9c38d2.

											
										
										
											2016-09-30 21:20:22 +03:00
+								            return self.vocab.strings[self.c.lemma]
-												Allow lemma to be set from Python. Re #973

											
										
										
											2017-04-16 19:07:53 +03:00
+								        def __set__(self, unicode lemma_):
 								            # Passes `lemma_` (declared `unicode`) to `self.vocab.strings.add`
 								            # and stores the value that call returns in `self.c.lemma`.
-												Finish stringstore change. Also xfail vectors tests

											
										
										
											2017-05-28 16:10:22 +03:00
+								            self.c.lemma = self.vocab.strings.add(lemma_)
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
 								    property pos_:
 								        # Getter only: indexes `parts_of_speech.NAMES` with the token's `pos`
 								        # value (self.c.pos) and returns that entry. Unlike the neighbouring
 								        # properties, this lookup does not go through `self.vocab.strings`.
 								        def __get__(self):
-												* Rename ATTR_IDS to attrs.IDS. Rename ATTR_NAMES to attrs.NAMES. Rename UNIV_POS_IDS to parts_of_speech.IDS

											
										
										
											2015-10-10 09:55:55 +03:00
+								            return parts_of_speech.NAMES[self.c.pos]
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
 								    property tag_:
 								        # Getter resolves the token's `tag` ID (self.c.tag) through
 								        # `self.vocab.strings`; setter registers a new value via `strings.add`.
 								        def __get__(self):
-												Revert "Work on Issue #285: intern strings into document-specific pools, to address streaming data memory growth. StringStore.__getitem__ now raises KeyError when it can't find the string. Use StringStore.intern() to get the old behaviour. Still need to hunt down all uses of StringStore.__getitem__ in library and do testing, but logic looks good."

This reverts commit 8423e8627f0c9e54ab026d5c51701e9e0a9c38d2.

											
										
										
											2016-09-30 21:20:22 +03:00
+								            return self.vocab.strings[self.c.tag]
-												Fix Issue #600: Missing setters for Token attribute.

											
										
										
											2016-11-03 01:28:59 +03:00
+								        def __set__(self, tag):
 								            # NOTE(review): this assigns to `self.tag`, whereas the getter
 								            # reads `self.c.tag` and the lemma_/dep_ setters write `self.c.*`
 								            # directly. Presumably it relies on a `tag` property setter
 								            # defined elsewhere on the class -- confirm that it exists.
-												Finish stringstore change. Also xfail vectors tests

											
										
										
											2017-05-28 16:10:22 +03:00
+								            self.tag = self.vocab.strings.add(tag)
-												* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference

											
										
										
											2015-07-13 20:20:48 +03:00
 								    property dep_:
 								        # Getter resolves the token's `dep` ID (self.c.dep) through
 								        # `self.vocab.strings`; setter registers a new value via `strings.add`.
 								        def __get__(self):
-												Revert "Work on Issue #285: intern strings into document-specific pools, to address streaming data memory growth. StringStore.__getitem__ now raises KeyError when it can't find the string. Use StringStore.intern() to get the old behaviour. Still need to hunt down all uses of StringStore.__getitem__ in library and do testing, but logic looks good."

This reverts commit 8423e8627f0c9e54ab026d5c51701e9e0a9c38d2.

											
										
										
											2016-09-30 21:20:22 +03:00
+								            return self.vocab.strings[self.c.dep]
-												add function for setting head and label to token
change PseudoProjectivity.deprojectivize to use these functions

											
										
										
											2016-03-11 19:31:06 +03:00
+								        def __set__(self, unicode label):
 								            # Passes `label` (declared `unicode`) to `self.vocab.strings.add`
 								            # and stores the value that call returns in `self.c.dep`.
-												Finish stringstore change. Also xfail vectors tests

											
										
										
											2017-05-28 16:10:22 +03:00
+								            self.c.dep = self.vocab.strings.add(label)
-												* Break up tokens.pyx into tokens/doc.pyx, tokens/token.pyx, tokens/spans.pyx

											
										
										
											2015-07-13 21:20:58 +03:00
-												* Add is_oov property, and fix up handling of attributes

											
										
										
											2015-07-27 02:50:06 +03:00
+								    property is_oov:
 								        # Getter only: returns the result of `Lexeme.c_check_flag` for the
 								        # IS_OOV flag on this token's lexeme (self.c.lex).
-												* Fix ugly py_check_flag and py_set_flag functions in Lexeme

											
										
										
											2015-09-15 06:06:18 +03:00
+								        def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_OOV)
-												* Add is_oov property, and fix up handling of attributes

											
										
										
											2015-07-27 02:50:06 +03:00
-												* Extend word vectors support, with .similarity() function, vector_norm property, and rename repvec to vector. Keep repvec name as well for now for backwards compatibility.

											
										
										
											2015-09-14 10:49:58 +03:00
+								    property is_stop:
 								        # Getter only: returns the result of `Lexeme.c_check_flag` for the
 								        # IS_STOP flag on this token's lexeme (self.c.lex).
-												* Fix ugly py_check_flag and py_set_flag functions in Lexeme

											
										
										
											2015-09-15 06:06:18 +03:00
+								        def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_STOP)
-												* Extend word vectors support, with .similarity() function, vector_norm property, and rename repvec to vector. Keep repvec name as well for now for backwards compatibility.

											
										
										
											2015-09-14 10:49:58 +03:00
-												* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects

											
										
										
											2015-07-26 17:37:16 +03:00
+								    property is_alpha:
 								        # Getter only: returns the result of `Lexeme.c_check_flag` for the
 								        # IS_ALPHA flag on this token's lexeme (self.c.lex).
-												* Fix ugly py_check_flag and py_set_flag functions in Lexeme

											
										
										
											2015-09-15 06:06:18 +03:00
+								        def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_ALPHA)
-												* Whitespace

											
										
										
											2015-08-09 00:37:44 +03:00
-												* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects

											
										
										
											2015-07-26 17:37:16 +03:00
+								    property is_ascii:
 								        # Getter only: returns the result of `Lexeme.c_check_flag` for the
 								        # IS_ASCII flag on this token's lexeme (self.c.lex).
-												* Fix ugly py_check_flag and py_set_flag functions in Lexeme

											
										
										
											2015-09-15 06:06:18 +03:00
+								        def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_ASCII)
-												* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects

											
										
										
											2015-07-26 17:37:16 +03:00
 								    property is_digit:
 								        # Getter only: returns the result of `Lexeme.c_check_flag` for the
 								        # IS_DIGIT flag on this token's lexeme (self.c.lex).
-												* Fix ugly py_check_flag and py_set_flag functions in Lexeme

											
										
										
											2015-09-15 06:06:18 +03:00
+								        def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_DIGIT)
-												* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects

											
										
										
											2015-07-26 17:37:16 +03:00
 								    property is_lower:
 								        # Getter only: returns the result of `Lexeme.c_check_flag` for the
 								        # IS_LOWER flag on this token's lexeme (self.c.lex).
-												* Fix ugly py_check_flag and py_set_flag functions in Lexeme

											
										
										
											2015-09-15 06:06:18 +03:00
+								        def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_LOWER)
-												* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects

											
										
										
											2015-07-26 17:37:16 +03:00
 								    property is_title:
 								        # Getter only: returns the result of `Lexeme.c_check_flag` for the
 								        # IS_TITLE flag on this token's lexeme (self.c.lex).
-												* Fix ugly py_check_flag and py_set_flag functions in Lexeme

											
										
										
											2015-09-15 06:06:18 +03:00
+								        def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_TITLE)
-												* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects

											
										
										
											2015-07-26 17:37:16 +03:00
 								    property is_punct:
 								        # Getter only: returns the result of `Lexeme.c_check_flag` for the
 								        # IS_PUNCT flag on this token's lexeme (self.c.lex).
-												* Fix ugly py_check_flag and py_set_flag functions in Lexeme

											
										
										
											2015-09-15 06:06:18 +03:00
+								        def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_PUNCT)
-												* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects

											
										
										
											2015-07-26 17:37:16 +03:00
-												Add support for Universal Dependencies v2.0

											
										
										
											2017-02-27 00:27:11 +03:00
+								    property is_space:
 								        # Getter only: returns the result of `Lexeme.c_check_flag` for the
 								        # IS_SPACE flag on this token's lexeme (self.c.lex).
-												* Fix ugly py_check_flag and py_set_flag functions in Lexeme

											
										
										
											2015-09-15 06:06:18 +03:00
+								        def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_SPACE)
-												Add support for Universal Dependencies v2.0

											
										
										
											2017-02-27 00:27:11 +03:00
 								    property is_bracket:
 								        # Getter only: returns the result of `Lexeme.c_check_flag` for the
 								        # IS_BRACKET flag on this token's lexeme (self.c.lex).
-												* Add Python hooks for is_bracket/is_quote/is_left_punct/is_right_punct

											
										
										
											2016-02-04 15:04:16 +03:00
+								        def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_BRACKET)
-												Add support for Universal Dependencies v2.0

											
										
										
											2017-02-27 00:27:11 +03:00
+								    property is_quote:
 								        # Getter only: returns the result of `Lexeme.c_check_flag` for the
 								        # IS_QUOTE flag on this token's lexeme (self.c.lex).
-												* Add Python hooks for is_bracket/is_quote/is_left_punct/is_right_punct

											
										
										
											2016-02-04 15:04:16 +03:00
+								        def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_QUOTE)
-												Add support for Universal Dependencies v2.0

											
										
										
											2017-02-27 00:27:11 +03:00
+								    property is_left_punct:
 								        # Getter only: returns the result of `Lexeme.c_check_flag` for the
 								        # IS_LEFT_PUNCT flag on this token's lexeme (self.c.lex).
-												* Add Python hooks for is_bracket/is_quote/is_left_punct/is_right_punct

											
										
										
											2016-02-04 15:04:16 +03:00
+								        def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_LEFT_PUNCT)
-												Add support for Universal Dependencies v2.0

											
										
										
											2017-02-27 00:27:11 +03:00
+								    property is_right_punct:
 								        # Getter only: returns the result of `Lexeme.c_check_flag` for the
 								        # IS_RIGHT_PUNCT flag on this token's lexeme (self.c.lex).
-												* Add Python hooks for is_bracket/is_quote/is_left_punct/is_right_punct

											
										
										
											2016-02-04 15:04:16 +03:00
+								        def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_RIGHT_PUNCT)
-												* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects

											
										
										
											2015-07-26 17:37:16 +03:00
 								    property like_url:
 								        # Getter only: returns the result of `Lexeme.c_check_flag` for the
 								        # LIKE_URL flag on this token's lexeme (self.c.lex).
-												* Fix ugly py_check_flag and py_set_flag functions in Lexeme

											
										
										
											2015-09-15 06:06:18 +03:00
+								        def __get__(self): return Lexeme.c_check_flag(self.c.lex, LIKE_URL)
-												* Whitespace

											
										
										
											2015-08-09 00:37:44 +03:00
-												* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects

											
										
										
											2015-07-26 17:37:16 +03:00
+								    property like_num:
 								        # Getter only: returns the result of `Lexeme.c_check_flag` for the
 								        # LIKE_NUM flag on this token's lexeme (self.c.lex).
-												* Fix ugly py_check_flag and py_set_flag functions in Lexeme

											
										
										
											2015-09-15 06:06:18 +03:00
+								        def __get__(self): return Lexeme.c_check_flag(self.c.lex, LIKE_NUM)
-												* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects

											
										
										
											2015-07-26 17:37:16 +03:00
 								    property like_email:
 								        # Returns the result of `Lexeme.c_check_flag` for the LIKE_EMAIL flag
 								        # on this token's lexeme (self.c.lex).
-												* Fix ugly py_check_flag and py_set_flag functions in Lexeme

											
										
										
											2015-09-15 06:06:18 +03:00
+								        def __get__(self): return Lexeme.c_check_flag(self.c.lex, LIKE_EMAIL)