Pass extensions into Underscore class

This commit is contained in:
Matthew Honnibal 2017-10-07 18:56:01 +02:00
parent 1289129fd9
commit 668a0ea640
4 changed files with 69 additions and 9 deletions

View File

@ -30,7 +30,7 @@ from ..util import normalize_slice
from ..compat import is_config
from .. import about
from .. import util
from .underscore import Underscore
DEF PADDING = 5
@ -64,6 +64,7 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil:
else:
return Lexeme.get_struct_attr(token.lex, feat_name)
def _get_chunker(lang):
try:
cls = util.get_lang_class(lang)
@ -73,6 +74,7 @@ def _get_chunker(lang):
return None
return cls.Defaults.syntax_iterators.get(u'noun_chunks')
cdef class Doc:
"""A sequence of Token objects. Access sentences and named entities, export
annotations to numpy arrays, losslessly serialize to compressed binary strings.
@ -87,6 +89,21 @@ cdef class Doc:
>>> from spacy.tokens import Doc
>>> doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'], spaces=[True, False, False])
"""
@classmethod
def set_extension(cls, name, default=None, method=None,
                  getter=None, setter=None):
    """Register a custom extension under `name` for `Doc` objects.

    Exactly one of `default`, `method`, `getter` and `setter` may be
    something other than None. Every other combination (none given, or
    more than one given, e.g. a getter together with a setter) fails the
    assertion.

    name: Key the extension is stored under in `Underscore.doc_extensions`.
    default: Optional default value.
    method: Optional method callable.
    getter: Optional getter callable.
    setter: Optional setter callable.
    """
    spec = (default, method, getter, setter)
    supplied = [value for value in spec if value is not None]
    assert len(supplied) == 1
    # The registry stores the full 4-tuple, in this fixed order.
    Underscore.doc_extensions[name] = spec
@classmethod
def get_extension(cls, name):
    """Look up the `Doc` extension registered under `name`.

    name: Key to look up in `Underscore.doc_extensions`.
    RETURNS: The stored `(default, method, getter, setter)` tuple, or None
        if nothing is registered under `name`.
    """
    registry = Underscore.doc_extensions
    return registry.get(name)
@classmethod
def has_extension(cls, name):
    """Check whether a `Doc` extension is registered under `name`.

    name: Key to test against `Underscore.doc_extensions`.
    RETURNS (bool): True if `name` is a key of the registry.
    """
    registry = Underscore.doc_extensions
    return name in registry
def __init__(self, Vocab vocab, words=None, spaces=None, orths_and_spaces=None):
"""Create a Doc object.
@ -159,6 +176,10 @@ cdef class Doc:
self.is_tagged = True
self.is_parsed = True
@property
def _(self):
    """Build an `Underscore` wrapper around this `Doc`.

    A new wrapper is created on every access. It receives the doc-level
    extension registry and this object, with no start/end arguments.
    """
    registry = Underscore.doc_extensions
    return Underscore(registry, self)
def __getitem__(self, object i):
"""Get a `Token` or `Span` object.

View File

@ -17,10 +17,24 @@ from ..attrs cimport IS_PUNCT, IS_SPACE
from ..lexeme cimport Lexeme
from ..compat import is_config
from .. import about
from .underscore import Underscore
cdef class Span:
"""A slice from a Doc object."""
@classmethod
def set_extension(cls, name, default=None, method=None,
                  getter=None, setter=None):
    """Register a custom extension under `name` for `Span` objects.

    The four values are stored as given; no check is made on how many of
    them are set.

    name: Key the extension is stored under in `Underscore.span_extensions`.
    default: Optional default value.
    method: Optional method callable.
    getter: Optional getter callable.
    setter: Optional setter callable.
    """
    spec = (default, method, getter, setter)
    Underscore.span_extensions[name] = spec
@classmethod
def get_extension(cls, name):
    """Look up the `Span` extension registered under `name`.

    name: Key to look up in `Underscore.span_extensions`.
    RETURNS: The stored `(default, method, getter, setter)` tuple, or None
        if nothing is registered under `name`.
    """
    registry = Underscore.span_extensions
    return registry.get(name)
@classmethod
def has_extension(cls, name):
    """Check whether a `Span` extension is registered under `name`.

    name: Key to test against `Underscore.span_extensions`.
    RETURNS (bool): True if `name` is a key of the registry.
    """
    registry = Underscore.span_extensions
    return name in registry
def __cinit__(self, Doc doc, int start, int end, attr_t label=0, vector=None,
vector_norm=None):
"""Create a `Span` object from the slice `doc[start : end]`.
@ -111,6 +125,11 @@ cdef class Span:
for i in range(self.start, self.end):
yield self.doc[i]
@property
def _(self):
    """Build an `Underscore` wrapper around this `Span`.

    A new wrapper is created on every access. It receives the span-level
    extension registry, this object, and the span's `start_char` and
    `end_char` as its start/end arguments.
    """
    first_char = self.start_char
    last_char = self.end_char
    return Underscore(Underscore.span_extensions, self,
                      start=first_char, end=last_char)
def merge(self, *args, **attributes):
"""Retokenize the document, such that the span is merged into a single
token.

View File

@ -20,10 +20,24 @@ from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUST
from ..attrs cimport LEMMA, POS, TAG, DEP
from ..compat import is_config
from .. import about
from .underscore import Underscore
cdef class Token:
"""An individual token i.e. a word, punctuation symbol, whitespace, etc."""
@classmethod
def set_extension(cls, name, default=None, method=None,
                  getter=None, setter=None):
    """Register a custom extension under `name` for `Token` objects.

    The four values are stored as given; no check is made on how many of
    them are set.

    name: Key the extension is stored under in `Underscore.token_extensions`.
    default: Optional default value.
    method: Optional method callable.
    getter: Optional getter callable.
    setter: Optional setter callable.
    """
    # Must be the token registry: the `_` property of Token builds its
    # Underscore wrapper from `Underscore.token_extensions`, so writing to
    # the span registry would hide the extension from tokens and leak it
    # onto spans.
    Underscore.token_extensions[name] = (default, method, getter, setter)

@classmethod
def get_extension(cls, name):
    """Look up the `Token` extension registered under `name`.

    name: Key to look up in `Underscore.token_extensions`.
    RETURNS: The stored `(default, method, getter, setter)` tuple, or None
        if nothing is registered under `name`.
    """
    return Underscore.token_extensions.get(name)

@classmethod
def has_extension(cls, name):
    """Check whether a `Token` extension is registered under `name`.

    name: Key to test against `Underscore.token_extensions`.
    RETURNS (bool): True if `name` is a key of the registry.
    """
    return name in Underscore.token_extensions
def __cinit__(self, Vocab vocab, Doc doc, int offset):
"""Construct a `Token` object.
@ -87,6 +101,11 @@ cdef class Token:
else:
raise ValueError(op)
@property
def _(self):
    """Build an `Underscore` wrapper around this `Token`.

    A new wrapper is created on every access. It receives the token-level
    extension registry and this object, with the token's `idx` as the
    start argument and None as the end argument.
    """
    registry = Underscore.token_extensions
    char_offset = self.idx
    return Underscore(registry, self, start=char_offset, end=None)
cpdef bint check_flag(self, attr_id_t flag_id) except -1:
"""Check the value of a boolean flag.

View File

@ -1,9 +1,10 @@
class Undercore(object):
class Underscore(object):
doc_extensions = {}
span_extensions = {}
token_extensions = {}
def __init__(self, obj, start=None, end=None):
def __init__(self, extensions, obj, start=None, end=None):
object.__setattr__(self, '_extensions', extensions)
object.__setattr__(self, '_obj', obj)
# Assumption is that for doc values, _start and _end will both be None
# Span will set non-None values for _start and _end
@ -12,23 +13,23 @@ class Undercore(object):
# (see _get_key), and lets us use a single Underscore class.
object.__setattr__(self, '_doc', obj.doc)
object.__setattr__(self, '_start', start)
object.__setattr__(self, '_end', start)
object.__setattr__(self, '_end', end)
def __getattr__(self, name):
if name not in self.__class__.extensions:
if name not in self._extensions:
raise AttributeError(name)
default, method, getter, setter = self.__class__.extensions[name]
default, method, getter, setter = self._extensions[name]
if getter is not None:
return getter(self._obj)
elif method is not None:
return method)
return method
else:
return self._doc.user_data.get(self._get_key(name), default)
def __setattr__(self, name, value):
if name not in self.__class__.extensions:
if name not in self._extensions:
raise AttributeError(name)
default, method, getter, setter = self.__class__.extensions[name]
default, method, getter, setter = self._extensions[name]
if setter is not None:
return setter(self._obj, value)
else: