Tidy up and fix formatting and imports

ines 2017-04-15 13:05:15 +02:00
parent fefe6684cd
commit 0739ae7b76
15 changed files with 251 additions and 230 deletions

View File

@@ -3,7 +3,7 @@ from __future__ import unicode_literals
 import six
 import sys
-import json
+import ujson
 try:
     import cPickle as pickle
@@ -28,14 +28,14 @@ if is_python2:
     unicode_ = unicode
     basestring_ = basestring
     input_ = raw_input
-    json_dumps = lambda data: json.dumps(data, indent=2).decode('utf8')
+    json_dumps = lambda data: ujson.dumps(data, indent=2).decode('utf8')
 elif is_python3:
     bytes_ = bytes
     unicode_ = str
     basestring_ = str
     input_ = input
-    json_dumps = lambda data: json.dumps(data, indent=2)
+    json_dumps = lambda data: ujson.dumps(data, indent=2)
 def symlink_to(orig, dest):
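
The json → ujson switch above leaves the Python 2/3 shim intact: on Python 2, ujson.dumps returns a byte string that has to be decoded, while on Python 3 it already returns str. A minimal sketch of the pattern, with the version check written out (the real is_python2 flag is defined earlier in this file; the check shown here is an assumption):

    import sys
    import ujson

    is_python2 = sys.version_info[0] == 2  # assumption: mirrors the flag used above

    if is_python2:
        # ujson.dumps returns bytes on Python 2, so decode to unicode
        json_dumps = lambda data: ujson.dumps(data, indent=2).decode('utf8')
    else:
        # on Python 3, ujson.dumps already returns str
        json_dumps = lambda data: ujson.dumps(data, indent=2)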

View File

@@ -1,3 +1,6 @@
+# coding: utf8
+from __future__ import unicode_literals
+
 from pathlib import Path
 from . import about

View File

@@ -7,17 +7,17 @@ out of "context") is in features/extractor.pyx
 The atomic feature names are listed in a big enum, so that the feature tuples
 can refer to them.
 """
-from libc.string cimport memset
+# coding: utf-8
+from __future__ import unicode_literals
+
+from libc.string cimport memset
 from itertools import combinations
+from cymem.cymem cimport Pool
 from ..structs cimport TokenC
 from .stateclass cimport StateClass
 from ._state cimport StateC
-from cymem.cymem cimport Pool
 cdef inline void fill_token(atom_t* context, const TokenC* token) nogil:
     if token is NULL:

View File

@@ -1,29 +1,26 @@
 # cython: profile=True
 # cython: cdivision=True
 # cython: infer_types=True
+# coding: utf-8
 from __future__ import unicode_literals
 from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
 import ctypes
-import os
-from ..structs cimport TokenC
-from .stateclass cimport StateClass
-from ._state cimport StateC, is_space_token
-from .nonproj import PseudoProjectivity
-from .nonproj import is_nonproj_tree
+from libc.stdint cimport uint32_t
+from libc.string cimport memcpy
+from cymem.cymem cimport Pool
 from .transition_system cimport do_func_t, get_cost_func_t
 from .transition_system cimport move_cost_func_t, label_cost_func_t
 from ..gold cimport GoldParse
 from ..gold cimport GoldParseC
 from ..attrs cimport TAG, HEAD, DEP, ENT_IOB, ENT_TYPE, IS_SPACE
 from ..lexeme cimport Lexeme
-from libc.stdint cimport uint32_t
-from libc.string cimport memcpy
-from cymem.cymem cimport Pool
+from ..structs cimport TokenC
+from .stateclass cimport StateClass
+from ._state cimport StateC, is_space_token
+from .nonproj import PseudoProjectivity
+from .nonproj import is_nonproj_tree
 DEF NON_MONOTONIC = True

View File

@@ -1,50 +1,34 @@
+"""
+MALT-style dependency parser
+"""
 # cython: profile=True
 # cython: experimental_cpp_class_def=True
 # cython: cdivision=True
 # cython: infer_types=True
-"""
-MALT-style dependency parser
-"""
-from __future__ import unicode_literals
+# coding: utf-8
+from __future__ import unicode_literals, print_function
 cimport cython
 from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
 from libc.stdint cimport uint32_t, uint64_t
 from libc.string cimport memset, memcpy
 from libc.stdlib cimport rand
 from libc.math cimport log, exp, isnan, isinf
-import random
-import os.path
-from os import path
-import shutil
-import json
-import math
 from cymem.cymem cimport Pool, Address
 from murmurhash.mrmr cimport real_hash64 as hash64
 from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t
-from util import Config
 from thinc.linear.features cimport ConjunctionExtracter
 from thinc.structs cimport FeatureC, ExampleC
-from thinc.extra.search cimport Beam
-from thinc.extra.search cimport MaxViolation
+from thinc.extra.search cimport Beam, MaxViolation
 from thinc.extra.eg cimport Example
 from thinc.extra.mb cimport Minibatch
 from ..structs cimport TokenC
 from ..tokens.doc cimport Doc
 from ..strings cimport StringStore
 from .transition_system cimport TransitionSystem, Transition
 from ..gold cimport GoldParse
 from . import _parse_features
 from ._parse_features cimport CONTEXT_SIZE
 from ._parse_features cimport fill_context
@@ -266,4 +250,3 @@ def is_gold(StateClass state, GoldParse gold, StringStore strings):
         id_, word, tag, head, dep, ner = gold.orig_annot[gold.cand_to_gold[i]]
         truth.add((id_, head, dep))
     return truth == predicted

View File

@@ -1,9 +1,14 @@
-from spacy.parts_of_speech cimport NOUN, PROPN, PRON
+# coding: utf-8
+from __future__ import unicode_literals
+
+from ..parts_of_speech cimport NOUN, PROPN, PRON
 def english_noun_chunks(obj):
-    '''Detect base noun phrases from a dependency parse.
-    Works on both Doc and Span.'''
+    """
+    Detect base noun phrases from a dependency parse.
+    Works on both Doc and Span.
+    """
     labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj',
               'attr', 'ROOT', 'root']
     doc = obj.doc  # Ensure works on both Doc and Span.
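
english_noun_chunks is the chunker behind the noun_chunks iterator on Doc and Span (it is wired up through the CHUNKERS table imported in doc.pyx further down). A usage sketch, assuming an installed English model:

    import spacy

    nlp = spacy.load('en')  # assumption: an English model is installed
    doc = nlp(u'The quick brown fox jumps over the lazy dog.')
    for chunk in doc.noun_chunks:
        # each chunk is a Span whose root carries one of the labels
        # whitelisted above ('nsubj', 'dobj', 'pobj', ...)
        print(chunk.text, chunk.root.dep_)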

View File

@@ -1,17 +1,16 @@
+# coding: utf-8
 from __future__ import unicode_literals
-from .transition_system cimport Transition
-from .transition_system cimport do_func_t
-from ..structs cimport TokenC, Entity
 from thinc.typedefs cimport weight_t
-from ..gold cimport GoldParseC
-from ..gold cimport GoldParse
-from ..attrs cimport ENT_TYPE, ENT_IOB
 from .stateclass cimport StateClass
 from ._state cimport StateC
+from .transition_system cimport Transition
+from .transition_system cimport do_func_t
+from ..structs cimport TokenC, Entity
+from ..gold cimport GoldParseC
+from ..gold cimport GoldParse
+from ..attrs cimport ENT_TYPE, ENT_IOB
 cdef enum:

View File

@@ -1,8 +1,9 @@
+# coding: utf-8
 from __future__ import unicode_literals
 from copy import copy
 from ..tokens.doc cimport Doc
-from spacy.attrs import DEP, HEAD
+from ..attrs import DEP, HEAD
 def ancestors(tokenid, heads):
@@ -201,5 +202,3 @@ class PseudoProjectivity:
                 filtered_sents.append(((ids,words,tags,heads,filtered_labels,iob), ctnts))
             filtered.append((raw_text, filtered_sents))
         return filtered

View File

@@ -1,56 +1,44 @@
-# cython: infer_types=True
 """
 MALT-style dependency parser
 """
+# coding: utf-8
+# cython: infer_types=True
 from __future__ import unicode_literals
+from collections import Counter
+import ujson
 cimport cython
 cimport cython.parallel
 from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
 from cpython.exc cimport PyErr_CheckSignals
 from libc.stdint cimport uint32_t, uint64_t
 from libc.string cimport memset, memcpy
 from libc.stdlib cimport malloc, calloc, free
-import os.path
-from collections import Counter
-from os import path
-import shutil
-import json
-import sys
-from .nonproj import PseudoProjectivity
+from cymem.cymem cimport Pool, Address
+from murmurhash.mrmr cimport hash64
 from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t
 from thinc.linear.avgtron cimport AveragedPerceptron
 from thinc.linalg cimport VecVec
-from thinc.structs cimport SparseArrayC
-from cymem.cymem cimport Pool, Address
-from murmurhash.mrmr cimport hash64
+from thinc.structs cimport SparseArrayC, FeatureC, ExampleC
+from thinc.extra.eg cimport Example
 from preshed.maps cimport MapStruct
 from preshed.maps cimport map_get
-from thinc.structs cimport FeatureC
-from thinc.structs cimport ExampleC
-from thinc.extra.eg cimport Example
-from util import Config
-from ..structs cimport TokenC
-from ..tokens.doc cimport Doc
-from ..strings cimport StringStore
-from .transition_system import OracleError
-from .transition_system cimport TransitionSystem, Transition
-from ..gold cimport GoldParse
 from . import _parse_features
 from ._parse_features cimport CONTEXT_SIZE
 from ._parse_features cimport fill_context
 from .stateclass cimport StateClass
 from ._state cimport StateC
+from .nonproj import PseudoProjectivity
+from .transition_system import OracleError
+from .transition_system cimport TransitionSystem, Transition
+from ..structs cimport TokenC
+from ..tokens.doc cimport Doc
+from ..strings cimport StringStore
+from ..gold cimport GoldParse
 USE_FTRL = False
 DEBUG = False
@@ -80,7 +68,9 @@ cdef class ParserModel(AveragedPerceptron):
         return nr_feat
     def update(self, Example eg, itn=0):
-        '''Does regression on negative cost. Sort of cute?'''
+        """
+        Does regression on negative cost. Sort of cute?
+        """
         self.time += 1
         cdef int best = arg_max_if_gold(eg.c.scores, eg.c.costs, eg.c.nr_class)
         cdef int guess = eg.guess
@@ -132,10 +122,13 @@ cdef class ParserModel(AveragedPerceptron):
 cdef class Parser:
-    """Base class of the DependencyParser and EntityRecognizer."""
+    """
+    Base class of the DependencyParser and EntityRecognizer.
+    """
     @classmethod
     def load(cls, path, Vocab vocab, TransitionSystem=None, require=False, **cfg):
-        """Load the statistical model from the supplied path.
+        """
+        Load the statistical model from the supplied path.
         Arguments:
             path (Path):
@@ -148,7 +141,7 @@ cdef class Parser:
             The newly constructed object.
         """
         with (path / 'config.json').open() as file_:
-            cfg = json.load(file_)
+            cfg = ujson.load(file_)
         # TODO: remove this shim when we don't have to support older data
         if 'labels' in cfg and 'actions' not in cfg:
             cfg['actions'] = cfg.pop('labels')
@@ -168,7 +161,8 @@
         return self
     def __init__(self, Vocab vocab, TransitionSystem=None, ParserModel model=None, **cfg):
-        """Create a Parser.
+        """
+        Create a Parser.
         Arguments:
             vocab (Vocab):
@@ -198,7 +192,8 @@ cdef class Parser:
         return (Parser, (self.vocab, self.moves, self.model), None, None)
     def __call__(self, Doc tokens):
-        """Apply the entity recognizer, setting the annotations onto the Doc object.
+        """
+        Apply the entity recognizer, setting the annotations onto the Doc object.
         Arguments:
             doc (Doc): The document to be processed.
@@ -215,7 +210,8 @@
         self.moves.finalize_doc(tokens)
     def pipe(self, stream, int batch_size=1000, int n_threads=2):
-        """Process a stream of documents.
+        """
+        Process a stream of documents.
         Arguments:
             stream: The sequence of documents to process.
@@ -303,7 +299,8 @@
         return 0
     def update(self, Doc tokens, GoldParse gold, itn=0):
-        """Update the statistical model.
+        """
+        Update the statistical model.
         Arguments:
             doc (Doc):
@@ -342,7 +339,8 @@
         return loss
     def step_through(self, Doc doc, GoldParse gold=None):
-        """Set up a stepwise state, to introspect and control the transition sequence.
+        """
+        Set up a stepwise state, to introspect and control the transition sequence.
         Arguments:
             doc (Doc): The document to step through.
@@ -426,7 +424,9 @@ cdef class StepwiseState:
     @property
     def costs(self):
-        '''Find the action-costs for the current state'''
+        """
+        Find the action-costs for the current state.
+        """
        self.parser.moves.set_costs(self.eg.c.is_valid, self.eg.c.costs,
                                    self.stcls, self.gold)
        costs = {}
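
The stepwise API touched in the last two hunks can be driven roughly like this; a sketch only, using just the names visible in this diff (step_through and the costs property):

    import spacy

    nlp = spacy.load('en')                       # assumption: English model installed
    doc = nlp.tokenizer(u'This is a sentence.')  # tokenize only, leave it unparsed
    state = nlp.parser.step_through(doc)         # returns a StepwiseState
    # With a GoldParse supplied as the second argument, state.costs maps
    # each valid transition to its cost for the current state.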

View File

@@ -1,5 +1,9 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
 from libc.string cimport memcpy, memset
 from libc.stdint cimport uint32_t
+
 from ..vocab cimport EMPTY_LEXEME
 from ..structs cimport Entity
 from ..lexeme cimport Lexeme

View File

@@ -1,4 +1,8 @@
 # cython: infer_types=True
+# coding: utf-8
+from __future__ import unicode_literals
+
+from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
 from cymem.cymem cimport Pool
 from thinc.typedefs cimport weight_t
 from collections import defaultdict
@@ -6,7 +10,6 @@ from collections import defaultdict
 from ..structs cimport TokenC
 from .stateclass cimport StateClass
 from ..attrs cimport TAG, HEAD, DEP, ENT_TYPE, ENT_IOB
-from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
 cdef weight_t MIN_SCORE = -90000

View File

@@ -1,18 +0,0 @@
-from os import path
-import json
-
-class Config(object):
-    def __init__(self, **kwargs):
-        for key, value in kwargs.items():
-            setattr(self, key, value)
-
-    def get(self, attr, default=None):
-        return self.__dict__.get(attr, default)
-
-    @classmethod
-    def write(cls, model_dir, name, **kwargs):
-        open(path.join(model_dir, '%s.json' % name), 'w').write(json.dumps(kwargs))
-
-    @classmethod
-    def read(cls, model_dir, name):
-        return cls(**json.load(open(path.join(model_dir, '%s.json' % name))))
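
The deleted Config helper was a thin wrapper around a JSON file on disk; after this commit, callers read config.json directly with ujson, as the Parser.load hunk above shows. A sketch of the replacement pattern (model_path is a hypothetical pathlib.Path):

    import ujson

    with (model_path / 'config.json').open() as file_:
        cfg = ujson.load(file_)
    # back-compat shim from Parser.load: older models stored 'labels'
    if 'labels' in cfg and 'actions' not in cfg:
        cfg['actions'] = cfg.pop('labels')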

View File

@@ -1,15 +1,18 @@
+# coding: utf8
+from __future__ import unicode_literals
+
 cimport cython
+cimport numpy as np
+import numpy
+import numpy.linalg
+import struct
 from libc.string cimport memcpy, memset
 from libc.stdint cimport uint32_t
 from libc.math cimport sqrt
-import numpy
-import numpy.linalg
-import struct
-cimport numpy as np
-import six
-import warnings
+from .span cimport Span
+from .token cimport Token
 from ..lexeme cimport Lexeme
 from ..lexeme cimport EMPTY_LEXEME
 from ..typedefs cimport attr_t, flags_t
@@ -19,11 +22,10 @@ from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
 from ..parts_of_speech cimport CCONJ, PUNCT, NOUN
 from ..parts_of_speech cimport univ_pos_t
 from ..lexeme cimport Lexeme
-from .span cimport Span
-from .token cimport Token
 from ..serialize.bits cimport BitArray
 from ..util import normalize_slice
 from ..syntax.iterators import CHUNKERS
+from ..compat import is_config
 DEF PADDING = 5
@@ -76,7 +78,7 @@ cdef class Doc:
     """
     def __init__(self, Vocab vocab, words=None, spaces=None, orths_and_spaces=None):
-        '''
+        """
         Create a Doc object.
         Aside: Implementation
@@ -97,7 +99,7 @@
             A list of boolean values, of the same length as words. True
             means that the word is followed by a space, False means it is not.
             If None, defaults to [True]*len(words)
-        '''
+        """
         self.vocab = vocab
         size = 20
         self.mem = Pool()
@@ -158,7 +160,7 @@
         self.is_parsed = True
     def __getitem__(self, object i):
-        '''
+        """
        doc[i]
        Get the Token object at position i, where i is an integer.
        Negative indexing is supported, and follows the usual Python
@@ -172,7 +174,7 @@
        are not supported, as `Span` objects must be contiguous (cannot have gaps).
        You can use negative indices and open-ended ranges, which have their
        normal Python semantics.
-        '''
+        """
        if isinstance(i, slice):
            start, stop = normalize_slice(len(self), i.start, i.stop, i.step)
            return Span(self, start, stop, label=0)
@@ -186,7 +188,7 @@
            return Token.cinit(self.vocab, &self.c[i], i, self)
    def __iter__(self):
-        '''
+        """
        for token in doc
        Iterate over `Token` objects, from which the annotations can
        be easily accessed. This is the main way of accessing Token
@@ -194,7 +196,7 @@
        Python. If faster-than-Python speeds are required, you can
        instead access the annotations as a numpy array, or access the
        underlying C data directly from Cython.
-        '''
+        """
        cdef int i
        for i in range(self.length):
            if self._py_tokens[i] is not None:
@@ -203,10 +205,10 @@
            yield Token.cinit(self.vocab, &self.c[i], i, self)
    def __len__(self):
-        '''
+        """
        len(doc)
        The number of tokens in the document.
-        '''
+        """
        return self.length
    def __unicode__(self):
@@ -216,7 +218,7 @@
        return u''.join([t.text_with_ws for t in self]).encode('utf-8')
    def __str__(self):
-        if six.PY3:
+        if is_config(python3=True):
            return self.__unicode__()
        return self.__bytes__()
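
is_config(python3=True) replaces the direct six.PY3 test, so every environment check goes through spacy.compat. The helper itself is not shown in this diff; presumably it looks something like this sketch:

    # sketch only; the names mirror how is_config is called in this diff
    import sys

    is_python2 = sys.version_info[0] == 2
    is_python3 = sys.version_info[0] == 3

    def is_config(python2=None, python3=None):
        # each keyword is either None (don't care) or must match the runtime
        return (python2 in (None, is_python2) and
                python3 in (None, is_python3))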
@@ -228,7 +230,8 @@
        return self
    def similarity(self, other):
-        '''Make a semantic similarity estimate. The default estimate is cosine
+        """
+        Make a semantic similarity estimate. The default estimate is cosine
        similarity using an average of word vectors.
        Arguments:
@@ -237,7 +240,7 @@
        Return:
            score (float): A scalar similarity score. Higher is more similar.
-        '''
+        """
        if 'similarity' in self.user_hooks:
            return self.user_hooks['similarity'](self, other)
        if self.vector_norm == 0 or other.vector_norm == 0:
@@ -245,9 +248,9 @@
        return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
    property has_vector:
-        '''
+        """
        A boolean value indicating whether a word vector is associated with the object.
-        '''
+        """
        def __get__(self):
            if 'has_vector' in self.user_hooks:
                return self.user_hooks['has_vector'](self)
@@ -255,11 +258,11 @@
            return any(token.has_vector for token in self)
    property vector:
-        '''
+        """
        A real-valued meaning representation. Defaults to an average of the token vectors.
        Type: numpy.ndarray[ndim=1, dtype='float32']
-        '''
+        """
        def __get__(self):
            if 'vector' in self.user_hooks:
                return self.user_hooks['vector'](self)
@@ -294,17 +297,21 @@
            return self.text
    property text:
-        '''A unicode representation of the document text.'''
+        """
+        A unicode representation of the document text.
+        """
        def __get__(self):
            return u''.join(t.text_with_ws for t in self)
    property text_with_ws:
-        '''An alias of Doc.text, provided for duck-type compatibility with Span and Token.'''
+        """
+        An alias of Doc.text, provided for duck-type compatibility with Span and Token.
+        """
        def __get__(self):
            return self.text
    property ents:
-        '''
+        """
        Yields named-entity `Span` objects, if the entity recognizer
        has been applied to the document. Iterate over the span to get
        individual Token objects, or access the label:
@@ -318,7 +325,7 @@
            assert ents[0].label_ == 'PERSON'
            assert ents[0].orth_ == 'Best'
            assert ents[0].text == 'Mr. Best'
-        '''
+        """
        def __get__(self):
            cdef int i
            cdef const TokenC* token
@@ -382,13 +389,13 @@
            self.c[start].ent_iob = 3
    property noun_chunks:
-        '''
+        """
        Yields base noun-phrase #[code Span] objects, if the document
        has been syntactically parsed. A base noun phrase, or
        'NP chunk', is a noun phrase that does not permit other NPs to
        be nested within it so no NP-level coordination, no prepositional
-        phrases, and no relative clauses. For example:
-        '''
+        phrases, and no relative clauses.
+        """
        def __get__(self):
            if not self.is_parsed:
                raise ValueError(
@@ -496,7 +503,8 @@
        return output
    def count_by(self, attr_id_t attr_id, exclude=None, PreshCounter counts=None):
-        """Produce a dict of {attribute (int): count (ints)} frequencies, keyed
+        """
+        Produce a dict of {attribute (int): count (ints)} frequencies, keyed
        by the values of the given attribute ID.
        Example:
@@ -563,8 +571,9 @@
            self.c[i] = parsed[i]
    def from_array(self, attrs, array):
-        '''Write to a `Doc` object, from an `(M, N)` array of attributes.
-        '''
+        """
+        Write to a `Doc` object, from an `(M, N)` array of attributes.
+        """
        cdef int i, col
        cdef attr_id_t attr_id
        cdef TokenC* tokens = self.c
@@ -603,19 +612,23 @@
        return self
    def to_bytes(self):
-        '''Serialize, producing a byte string.'''
+        """
+        Serialize, producing a byte string.
+        """
        byte_string = self.vocab.serializer.pack(self)
        cdef uint32_t length = len(byte_string)
        return struct.pack('I', length) + byte_string
    def from_bytes(self, data):
-        '''Deserialize, loading from bytes.'''
+        """
+        Deserialize, loading from bytes.
+        """
        self.vocab.serializer.unpack_into(data[4:], self)
        return self
    @staticmethod
    def read_bytes(file_):
-        '''
+        """
        A static method, used to read serialized #[code Doc] objects from
        a file. For example:
@@ -630,7 +643,7 @@
            for byte_string in Doc.read_bytes(file_):
                docs.append(Doc(nlp.vocab).from_bytes(byte_string))
        assert len(docs) == 2
-        '''
+        """
        keep_reading = True
        while keep_reading:
            try:
@@ -644,7 +657,8 @@
            yield n_bytes_str + data
    def merge(self, int start_idx, int end_idx, *args, **attributes):
-        """Retokenize the document, such that the span at doc.text[start_idx : end_idx]
+        """
+        Retokenize the document, such that the span at doc.text[start_idx : end_idx]
        is merged into a single token. If start_idx and end_idx do not mark start
        and end token boundaries, the document remains unchanged.
@@ -658,7 +672,6 @@
        token (Token):
            The newly merged token, or None if the start and end indices did
            not fall at token boundaries.
-
        """
        cdef unicode tag, lemma, ent_type
        if len(args) == 3:
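
A usage sketch for Doc.merge as documented above: the character offsets must line up with token boundaries, otherwise None comes back (the offsets below are illustrative):

    import spacy

    nlp = spacy.load('en')  # assumption: an English model is installed
    doc = nlp(u'I like New York in Autumn.')
    # merge the span covering doc.text[7:15], i.e. 'New York', into one token
    token = doc.merge(7, 15, u'NNP', u'New York', u'GPE')  # tag, lemma, ent_type
    if token is not None:
        assert token.text == u'New York'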

View File

@@ -1,26 +1,31 @@
+# coding: utf8
 from __future__ import unicode_literals
 from collections import defaultdict
+cimport numpy as np
 import numpy
 import numpy.linalg
-cimport numpy as np
 from libc.math cimport sqrt
-import six
+from .doc cimport token_by_start, token_by_end
 from ..structs cimport TokenC, LexemeC
 from ..typedefs cimport flags_t, attr_t, hash_t
 from ..attrs cimport attr_id_t
 from ..parts_of_speech cimport univ_pos_t
 from ..util import normalize_slice
-from .doc cimport token_by_start, token_by_end
 from ..attrs cimport IS_PUNCT, IS_SPACE
 from ..lexeme cimport Lexeme
+from ..compat import is_config
 cdef class Span:
-    """A slice from a Doc object."""
+    """
+    A slice from a Doc object.
+    """
    def __cinit__(self, Doc doc, int start, int end, int label=0, vector=None,
                  vector_norm=None):
-        '''Create a Span object from the slice doc[start : end]
+        """
+        Create a Span object from the slice doc[start : end]
        Arguments:
            doc (Doc): The parent document.
@@ -30,7 +35,7 @@ cdef class Span:
            vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span.
        Returns:
            Span The newly constructed object.
-        '''
+        """
        if not (0 <= start <= end <= len(doc)):
            raise IndexError
@@ -68,7 +73,7 @@
        return self.end - self.start
    def __repr__(self):
-        if six.PY3:
+        if is_config(python3=True):
            return self.text
        return self.text.encode('utf-8')
@@ -89,7 +94,8 @@
            yield self.doc[i]
    def merge(self, *args, **attributes):
-        """Retokenize the document, such that the span is merged into a single token.
+        """
+        Retokenize the document, such that the span is merged into a single token.
        Arguments:
            **attributes:
@@ -102,7 +108,8 @@
        return self.doc.merge(self.start_char, self.end_char, *args, **attributes)
    def similarity(self, other):
-        '''Make a semantic similarity estimate. The default estimate is cosine
+        """
+        Make a semantic similarity estimate. The default estimate is cosine
        similarity using an average of word vectors.
        Arguments:
@@ -111,7 +118,7 @@
        Return:
            score (float): A scalar similarity score. Higher is more similar.
-        '''
+        """
        if 'similarity' in self.doc.user_span_hooks:
            self.doc.user_span_hooks['similarity'](self, other)
        if self.vector_norm == 0.0 or other.vector_norm == 0.0:
@@ -133,11 +140,12 @@
        self.end = end + 1
    property sent:
-        '''The sentence span that this span is a part of.
+        """
+        The sentence span that this span is a part of.
        Returns:
            Span The sentence this is part of.
-        '''
+        """
        def __get__(self):
            if 'sent' in self.doc.user_span_hooks:
                return self.doc.user_span_hooks['sent'](self)
@@ -198,13 +206,13 @@
        return u''.join([t.text_with_ws for t in self])
    property noun_chunks:
-        '''
+        """
        Yields base noun-phrase #[code Span] objects, if the document
        has been syntactically parsed. A base noun phrase, or
        'NP chunk', is a noun phrase that does not permit other NPs to
        be nested within it so no NP-level coordination, no prepositional
-        phrases, and no relative clauses. For example:
-        '''
+        phrases, and no relative clauses.
+        """
        def __get__(self):
            if not self.doc.is_parsed:
                raise ValueError(
@@ -223,17 +231,16 @@
            yield span
    property root:
-        """The token within the span that's highest in the parse tree. If there's a tie, the earlist is prefered.
+        """
+        The token within the span that's highest in the parse tree. If there's a
+        tie, the earlist is prefered.
        Returns:
            Token: The root token.
-            i.e. has the
-        shortest path to the root of the sentence (or is the root itself).
-        If multiple words are equally high in the tree, the first word is taken.
-        For example:
+        i.e. has the shortest path to the root of the sentence (or is the root
+        itself). If multiple words are equally high in the tree, the first word
+        is taken. For example:
        >>> toks = nlp(u'I like New York in Autumn.')
@@ -303,7 +310,8 @@
        return self.doc[root]
    property lefts:
-        """Tokens that are to the left of the span, whose head is within the Span.
+        """
+        Tokens that are to the left of the span, whose head is within the Span.
        Yields: Token A left-child of a token of the span.
        """
@@ -314,7 +322,8 @@
            yield left
    property rights:
-        """Tokens that are to the right of the Span, whose head is within the Span.
+        """
+        Tokens that are to the right of the Span, whose head is within the Span.
        Yields: Token A right-child of a token of the span.
        """
@@ -325,7 +334,8 @@
            yield right
    property subtree:
-        """Tokens that descend from tokens in the span, but fall outside it.
+        """
+        Tokens that descend from tokens in the span, but fall outside it.
        Yields: Token A descendant of a token within the span.
        """
@@ -337,7 +347,9 @@
            yield from word.subtree
    property ent_id:
-        '''An (integer) entity ID. Usually assigned by patterns in the Matcher.'''
+        """
+        An (integer) entity ID. Usually assigned by patterns in the Matcher.
+        """
        def __get__(self):
            return self.root.ent_id
@@ -345,9 +357,11 @@
            # TODO
            raise NotImplementedError(
                "Can't yet set ent_id from Span. Vote for this feature on the issue "
-                "tracker: http://github.com/spacy-io/spaCy")
+                "tracker: http://github.com/explosion/spaCy/issues")
    property ent_id_:
-        '''A (string) entity ID. Usually assigned by patterns in the Matcher.'''
+        """
+        A (string) entity ID. Usually assigned by patterns in the Matcher.
+        """
        def __get__(self):
            return self.root.ent_id_
@@ -355,7 +369,7 @@
            # TODO
            raise NotImplementedError(
                "Can't yet set ent_id_ from Span. Vote for this feature on the issue "
-                "tracker: http://github.com/spacy-io/spaCy")
+                "tracker: http://github.com/explosion/spaCy/issues")
    property orth_:
        def __get__(self):
@@ -397,5 +411,5 @@ cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1:
        raise RuntimeError(
            "Array bounds exceeded while searching for root word. This likely "
            "means the parse tree is in an invalid state. Please report this "
-            "issue here: http://github.com/honnibal/spaCy/")
+            "issue here: http://github.com/explosion/spaCy/issues")
    return n
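
Putting the Span accessors touched above together; illustrative only, assuming an English model with a parser and word vectors:

    import spacy

    nlp = spacy.load('en')
    doc = nlp(u'I like New York in Autumn.')
    span = doc[2:4]                   # the Span 'New York'
    assert span.root.text == 'York'   # highest token in the parse tree
    print(span.sent.text)             # the enclosing sentence
    print(span.similarity(doc[4:6]))  # cosine over averaged word vectors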

View File

@@ -1,5 +1,5 @@
+# coding: utf8
 # cython: infer_types=True
-# coding: utf8
 from __future__ import unicode_literals
 from libc.string cimport memcpy
@@ -8,20 +8,15 @@ from cpython.mem cimport PyMem_Malloc, PyMem_Free
 from cython.view cimport array as cvarray
 cimport numpy as np
 np.import_array()
 import numpy
-import six
 from ..typedefs cimport hash_t
 from ..lexeme cimport Lexeme
 from .. import parts_of_speech
-from ..attrs cimport LEMMA
 from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
 from ..attrs cimport POS, LEMMA, TAG, DEP
 from ..parts_of_speech cimport CCONJ, PUNCT
 from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
 from ..attrs cimport IS_BRACKET
 from ..attrs cimport IS_QUOTE
@@ -29,12 +24,13 @@ from ..attrs cimport IS_LEFT_PUNCT
 from ..attrs cimport IS_RIGHT_PUNCT
 from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
 from ..attrs cimport IS_OOV
 from ..lexeme cimport Lexeme
+from ..compat import is_config
 cdef class Token:
-    """An individual token --- i.e. a word, punctuation symbol, whitespace, etc.
+    """
+    An individual token --- i.e. a word, punctuation symbol, whitespace, etc.
     """
    def __cinit__(self, Vocab vocab, Doc doc, int offset):
        self.vocab = vocab
@@ -46,7 +42,9 @@ cdef class Token:
        return hash((self.doc, self.i))
    def __len__(self):
-        '''Number of unicode characters in token.text'''
+        """
+        Number of unicode characters in token.text.
+        """
        return self.c.lex.length
    def __unicode__(self):
@@ -56,7 +54,7 @@
        return self.text.encode('utf8')
    def __str__(self):
-        if six.PY3:
+        if is_config(python3=True):
            return self.__unicode__()
        return self.__bytes__()
@@ -83,27 +81,30 @@
        raise ValueError(op)
    cpdef bint check_flag(self, attr_id_t flag_id) except -1:
-        '''Check the value of a boolean flag.
+        """
+        Check the value of a boolean flag.
        Arguments:
            flag_id (int): The ID of the flag attribute.
        Returns:
            is_set (bool): Whether the flag is set.
-        '''
+        """
        return Lexeme.c_check_flag(self.c.lex, flag_id)
    def nbor(self, int i=1):
-        '''Get a neighboring token.
+        """
+        Get a neighboring token.
        Arguments:
            i (int): The relative position of the token to get. Defaults to 1.
        Returns:
            neighbor (Token): The token at position self.doc[self.i+i]
-        '''
+        """
        return self.doc[self.i+i]
    def similarity(self, other):
-        '''Compute a semantic similarity estimate. Defaults to cosine over vectors.
+        """
+        Compute a semantic similarity estimate. Defaults to cosine over vectors.
        Arguments:
            other:
@@ -111,7 +112,7 @@
                Token and Lexeme objects.
        Returns:
            score (float): A scalar similarity score. Higher is more similar.
-        '''
+        """
        if 'similarity' in self.doc.user_token_hooks:
            return self.doc.user_token_hooks['similarity'](self)
        if self.vector_norm == 0 or other.vector_norm == 0:
@@ -209,9 +210,9 @@
        self.c.dep = label
    property has_vector:
-        '''
+        """
        A boolean value indicating whether a word vector is associated with the object.
-        '''
+        """
        def __get__(self):
            if 'has_vector' in self.doc.user_token_hooks:
                return self.doc.user_token_hooks['has_vector'](self)
@@ -223,11 +224,11 @@
            return False
    property vector:
-        '''
+        """
        A real-valued meaning representation.
        Type: numpy.ndarray[ndim=1, dtype='float32']
-        '''
+        """
        def __get__(self):
            if 'vector' in self.doc.user_token_hooks:
                return self.doc.user_token_hooks['vector'](self)
@@ -245,6 +246,7 @@
    property repvec:
        def __get__(self):
            raise AttributeError("repvec was renamed to vector in v0.100")
+
    property has_repvec:
        def __get__(self):
            raise AttributeError("has_repvec was renamed to has_vector in v0.100")
@@ -265,7 +267,8 @@
    property lefts:
        def __get__(self):
-            """The leftward immediate children of the word, in the syntactic
+            """
+            The leftward immediate children of the word, in the syntactic
            dependency parse.
            """
            cdef int nr_iter = 0
@@ -282,8 +285,10 @@
    property rights:
        def __get__(self):
-            """The rightward immediate children of the word, in the syntactic
-            dependency parse."""
+            """
+            The rightward immediate children of the word, in the syntactic
+            dependency parse.
+            """
            cdef const TokenC* ptr = self.c + (self.c.r_edge - self.i)
            tokens = []
            cdef int nr_iter = 0
@@ -300,19 +305,21 @@
            yield t
    property children:
-        '''A sequence of the token's immediate syntactic children.
+        """
+        A sequence of the token's immediate syntactic children.
        Yields: Token A child token such that child.head==self
-        '''
+        """
        def __get__(self):
            yield from self.lefts
            yield from self.rights
    property subtree:
-        '''A sequence of all the token's syntactic descendents.
+        """
+        A sequence of all the token's syntactic descendents.
        Yields: Token A descendent token such that self.is_ancestor(descendent)
-        '''
+        """
        def __get__(self):
            for word in self.lefts:
                yield from word.subtree
@@ -321,26 +328,29 @@
                yield from word.subtree
    property left_edge:
-        '''The leftmost token of this token's syntactic descendents.
+        """
+        The leftmost token of this token's syntactic descendents.
        Returns: Token The first token such that self.is_ancestor(token)
-        '''
+        """
        def __get__(self):
            return self.doc[self.c.l_edge]
    property right_edge:
-        '''The rightmost token of this token's syntactic descendents.
+        """
+        The rightmost token of this token's syntactic descendents.
        Returns: Token The last token such that self.is_ancestor(token)
-        '''
+        """
        def __get__(self):
            return self.doc[self.c.r_edge]
    property ancestors:
-        '''A sequence of this token's syntactic ancestors.
+        """
+        A sequence of this token's syntactic ancestors.
        Yields: Token A sequence of ancestor tokens such that ancestor.is_ancestor(self)
-        '''
+        """
        def __get__(self):
            cdef const TokenC* head_ptr = self.c
            # guard against infinite loop, no token can have
@@ -356,25 +366,29 @@
        return self.is_ancestor(descendant)
    def is_ancestor(self, descendant):
-        '''Check whether this token is a parent, grandparent, etc. of another
+        """
+        Check whether this token is a parent, grandparent, etc. of another
        in the dependency tree.
        Arguments:
            descendant (Token): Another token.
        Returns:
            is_ancestor (bool): Whether this token is the ancestor of the descendant.
-        '''
+        """
        if self.doc is not descendant.doc:
            return False
        return any( ancestor.i == self.i for ancestor in descendant.ancestors )
    property head:
-        '''The syntactic parent, or "governor", of this token.
+        """
+        The syntactic parent, or "governor", of this token.
        Returns: Token
-        '''
+        """
        def __get__(self):
-            """The token predicted by the parser to be the head of the current token."""
+            """
+            The token predicted by the parser to be the head of the current token.
+            """
            return self.doc[self.i + self.c.head]
        def __set__(self, Token new_head):
            # this function sets the head of self to new_head
@@ -467,10 +481,11 @@
        self.c.head = rel_newhead_i
    property conjuncts:
-        '''A sequence of coordinated tokens, including the token itself.
+        """
+        A sequence of coordinated tokens, including the token itself.
        Yields: Token A coordinated token
-        '''
+        """
        def __get__(self):
            """Get a list of conjoined words."""
            cdef Token word
@@ -501,7 +516,9 @@
        return iob_strings[self.c.ent_iob]
    property ent_id:
-        '''An (integer) entity ID. Usually assigned by patterns in the Matcher.'''
+        """
+        An (integer) entity ID. Usually assigned by patterns in the Matcher.
+        """
        def __get__(self):
            return self.c.ent_id
@@ -509,7 +526,9 @@
            self.c.ent_id = key
    property ent_id_:
-        '''A (string) entity ID. Usually assigned by patterns in the Matcher.'''
+        """
+        A (string) entity ID. Usually assigned by patterns in the Matcher.
+        """
        def __get__(self):
            return self.vocab.strings[self.c.ent_id]
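
And a matching sketch for the Token tree-navigation properties documented above (assumes a parsed doc):

    import spacy

    nlp = spacy.load('en')  # assumption: an English model is installed
    doc = nlp(u'I like New York in Autumn.')
    like = doc[1]
    assert like.head.i == like.i                      # the root's head is itself
    print([w.text for w in like.children])            # immediate syntactic children
    print(like.left_edge.text, like.right_edge.text)  # bounds of its subtree
    assert like.is_ancestor(doc[2])                   # 'like' dominates 'New'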