Pass extensions into Underscore class

This commit is contained in:
Matthew Honnibal 2017-10-07 18:56:01 +02:00
parent 1289129fd9
commit 668a0ea640
4 changed files with 69 additions and 9 deletions

View File

@ -30,7 +30,7 @@ from ..util import normalize_slice
from ..compat import is_config from ..compat import is_config
from .. import about from .. import about
from .. import util from .. import util
from .underscore import Underscore
DEF PADDING = 5 DEF PADDING = 5
@ -64,6 +64,7 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil:
else: else:
return Lexeme.get_struct_attr(token.lex, feat_name) return Lexeme.get_struct_attr(token.lex, feat_name)
def _get_chunker(lang): def _get_chunker(lang):
try: try:
cls = util.get_lang_class(lang) cls = util.get_lang_class(lang)
@ -73,6 +74,7 @@ def _get_chunker(lang):
return None return None
return cls.Defaults.syntax_iterators.get(u'noun_chunks') return cls.Defaults.syntax_iterators.get(u'noun_chunks')
cdef class Doc: cdef class Doc:
"""A sequence of Token objects. Access sentences and named entities, export """A sequence of Token objects. Access sentences and named entities, export
annotations to numpy arrays, losslessly serialize to compressed binary strings. annotations to numpy arrays, losslessly serialize to compressed binary strings.
@ -87,6 +89,21 @@ cdef class Doc:
>>> from spacy.tokens import Doc >>> from spacy.tokens import Doc
>>> doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'], spaces=[True, False, False]) >>> doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'], spaces=[True, False, False])
""" """
@classmethod
def set_extension(cls, name, default=None, method=None,
                  getter=None, setter=None):
    """Register a custom attribute that is exposed through `Doc._`.

    The options are stored as the tuple `(default, method, getter, setter)`
    under `name` in `Underscore.doc_extensions`; an existing entry with the
    same name is overwritten.

    name: Name of the attribute to register.
    default: Fallback value used when nothing has been stored for `name`.
    method: Callable handed back when the attribute is read.
    getter: Callable invoked with the object to compute the value on read.
    setter: Callable invoked with the object and the new value on write.

    Exactly one option must be supplied, with one exception: a `setter` may
    accompany a `getter`, so that a computed attribute can also be assigned.

    Raises ValueError if the combination of options is not valid.
    """
    supplied = [opt for opt in (default, method, getter, setter)
                if opt is not None]
    getter_with_setter = (len(supplied) == 2 and getter is not None
                          and setter is not None)
    # Raise explicitly rather than assert: assertions can be disabled
    # (e.g. `python -O`), which would silently drop the validation.
    if len(supplied) != 1 and not getter_with_setter:
        raise ValueError(
            "set_extension() requires exactly one of default, method, "
            "getter or setter (a setter may also be combined with a "
            "getter); got {} option(s)".format(len(supplied)))
    Underscore.doc_extensions[name] = (default, method, getter, setter)
@classmethod
def get_extension(cls, name):
    """Look up the registration stored for a `Doc` extension.

    Returns the value stored under `name` in `Underscore.doc_extensions`,
    or None when `name` has not been registered.
    """
    registry = Underscore.doc_extensions
    if name in registry:
        return registry[name]
    return None
@classmethod
def has_extension(cls, name):
    """Report whether `name` is a key of `Underscore.doc_extensions`."""
    registry = Underscore.doc_extensions
    return name in registry
def __init__(self, Vocab vocab, words=None, spaces=None, orths_and_spaces=None): def __init__(self, Vocab vocab, words=None, spaces=None, orths_and_spaces=None):
"""Create a Doc object. """Create a Doc object.
@ -159,6 +176,10 @@ cdef class Doc:
self.is_tagged = True self.is_tagged = True
self.is_parsed = True self.is_parsed = True
@property
def _(self):
    """Return an `Underscore` proxy for this object's custom attributes.

    A new proxy bound to `Underscore.doc_extensions` and to this object is
    constructed on every access.
    """
    doc_registry = Underscore.doc_extensions
    return Underscore(doc_registry, self)
def __getitem__(self, object i): def __getitem__(self, object i):
"""Get a `Token` or `Span` object. """Get a `Token` or `Span` object.

View File

@ -17,10 +17,24 @@ from ..attrs cimport IS_PUNCT, IS_SPACE
from ..lexeme cimport Lexeme from ..lexeme cimport Lexeme
from ..compat import is_config from ..compat import is_config
from .. import about from .. import about
from .underscore import Underscore
cdef class Span: cdef class Span:
"""A slice from a Doc object.""" """A slice from a Doc object."""
@classmethod
def set_extension(cls, name, default=None, method=None,
                  getter=None, setter=None):
    """Register a custom attribute that is exposed through `Span._`.

    The options are stored as the tuple `(default, method, getter, setter)`
    under `name` in `Underscore.span_extensions`; an existing entry with
    the same name is overwritten.

    name: Name of the attribute to register.
    default: Fallback value used when nothing has been stored for `name`.
    method: Callable handed back when the attribute is read.
    getter: Callable invoked with the object to compute the value on read.
    setter: Callable invoked with the object and the new value on write.

    Exactly one option must be supplied, with one exception: a `setter` may
    accompany a `getter`, so that a computed attribute can also be assigned.

    Raises ValueError if the combination of options is not valid.
    """
    supplied = [opt for opt in (default, method, getter, setter)
                if opt is not None]
    getter_with_setter = (len(supplied) == 2 and getter is not None
                          and setter is not None)
    # Reject ambiguous registrations up front; an explicit raise is used
    # because assertions can be disabled (e.g. `python -O`).
    if len(supplied) != 1 and not getter_with_setter:
        raise ValueError(
            "set_extension() requires exactly one of default, method, "
            "getter or setter (a setter may also be combined with a "
            "getter); got {} option(s)".format(len(supplied)))
    Underscore.span_extensions[name] = (default, method, getter, setter)
@classmethod
def get_extension(cls, name):
    """Look up the registration stored for a `Span` extension.

    Returns the value stored under `name` in `Underscore.span_extensions`,
    or None when `name` has not been registered.
    """
    registry = Underscore.span_extensions
    if name in registry:
        return registry[name]
    return None
@classmethod
def has_extension(cls, name):
    """Report whether `name` is a key of `Underscore.span_extensions`."""
    registry = Underscore.span_extensions
    return name in registry
def __cinit__(self, Doc doc, int start, int end, attr_t label=0, vector=None, def __cinit__(self, Doc doc, int start, int end, attr_t label=0, vector=None,
vector_norm=None): vector_norm=None):
"""Create a `Span` object from the slice `doc[start : end]`. """Create a `Span` object from the slice `doc[start : end]`.
@ -111,6 +125,11 @@ cdef class Span:
for i in range(self.start, self.end): for i in range(self.start, self.end):
yield self.doc[i] yield self.doc[i]
@property
def _(self):
    """Return an `Underscore` proxy for this span's custom attributes.

    A new proxy is constructed on every access. It is bound to
    `Underscore.span_extensions` and receives the span's `start_char` and
    `end_char` as its `start` and `end`.
    """
    span_registry = Underscore.span_extensions
    first_char = self.start_char
    last_char = self.end_char
    return Underscore(span_registry, self, start=first_char, end=last_char)
def merge(self, *args, **attributes): def merge(self, *args, **attributes):
"""Retokenize the document, such that the span is merged into a single """Retokenize the document, such that the span is merged into a single
token. token.

View File

@ -20,10 +20,24 @@ from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUST
from ..attrs cimport LEMMA, POS, TAG, DEP from ..attrs cimport LEMMA, POS, TAG, DEP
from ..compat import is_config from ..compat import is_config
from .. import about from .. import about
from .underscore import Underscore
cdef class Token: cdef class Token:
"""An individual token i.e. a word, punctuation symbol, whitespace, etc.""" """An individual token i.e. a word, punctuation symbol, whitespace, etc."""
@classmethod
def set_extension(cls, name, default=None, method=None,
                  getter=None, setter=None):
    """Register a custom attribute that is exposed through `Token._`.

    The options are stored as the tuple `(default, method, getter, setter)`
    under `name` in `Underscore.token_extensions`, which is the registry
    the `Token._` property reads. An existing entry with the same name is
    overwritten.

    name: Name of the attribute to register.
    default: Fallback value used when nothing has been stored for `name`.
    method: Callable handed back when the attribute is read.
    getter: Callable invoked with the object to compute the value on read.
    setter: Callable invoked with the object and the new value on write.

    Exactly one option must be supplied, with one exception: a `setter` may
    accompany a `getter`, so that a computed attribute can also be assigned.

    Raises ValueError if the combination of options is not valid.
    """
    supplied = [opt for opt in (default, method, getter, setter)
                if opt is not None]
    getter_with_setter = (len(supplied) == 2 and getter is not None
                          and setter is not None)
    # Reject ambiguous registrations up front; an explicit raise is used
    # because assertions can be disabled (e.g. `python -O`).
    if len(supplied) != 1 and not getter_with_setter:
        raise ValueError(
            "set_extension() requires exactly one of default, method, "
            "getter or setter (a setter may also be combined with a "
            "getter); got {} option(s)".format(len(supplied)))
    # Token extensions live in their own registry; writing them to
    # span_extensions would hide them from Token._ and leak them to Span._.
    Underscore.token_extensions[name] = (default, method, getter, setter)

@classmethod
def get_extension(cls, name):
    """Look up the registration stored for a `Token` extension.

    Returns the value stored under `name` in `Underscore.token_extensions`,
    or None when `name` has not been registered.
    """
    return Underscore.token_extensions.get(name)

@classmethod
def has_extension(cls, name):
    """Report whether `name` is a key of `Underscore.token_extensions`."""
    return name in Underscore.token_extensions
def __cinit__(self, Vocab vocab, Doc doc, int offset): def __cinit__(self, Vocab vocab, Doc doc, int offset):
"""Construct a `Token` object. """Construct a `Token` object.
@ -87,6 +101,11 @@ cdef class Token:
else: else:
raise ValueError(op) raise ValueError(op)
@property
def _(self):
    """Return an `Underscore` proxy for this token's custom attributes.

    A new proxy is constructed on every access. It is bound to
    `Underscore.token_extensions`, with the token's `idx` passed as `start`
    and `end` left as None.
    """
    token_registry = Underscore.token_extensions
    char_offset = self.idx
    return Underscore(token_registry, self, start=char_offset, end=None)
cpdef bint check_flag(self, attr_id_t flag_id) except -1: cpdef bint check_flag(self, attr_id_t flag_id) except -1:
"""Check the value of a boolean flag. """Check the value of a boolean flag.

View File

@ -1,9 +1,10 @@
class Undercore(object): class Underscore(object):
doc_extensions = {} doc_extensions = {}
span_extensions = {} span_extensions = {}
token_extensions = {} token_extensions = {}
def __init__(self, obj, start=None, end=None): def __init__(self, extensions, obj, start=None, end=None):
object.__setattr__(self, '_extensions', extensions)
object.__setattr__(self, '_obj', obj) object.__setattr__(self, '_obj', obj)
# Assumption is that for doc values, _start and _end will both be None # Assumption is that for doc values, _start and _end will both be None
# Span will set non-None values for _start and _end # Span will set non-None values for _start and _end
@ -12,23 +13,23 @@ class Undercore(object):
# (see _get_key), and lets us use a single Underscore class. # (see _get_key), and lets us use a single Underscore class.
object.__setattr__(self, '_doc', obj.doc) object.__setattr__(self, '_doc', obj.doc)
object.__setattr__(self, '_start', start) object.__setattr__(self, '_start', start)
object.__setattr__(self, '_end', start) object.__setattr__(self, '_end', end)
def __getattr__(self, name): def __getattr__(self, name):
if name not in self.__class__.extensions: if name not in self._extensions:
raise AttributeError(name) raise AttributeError(name)
default, method, getter, setter = self.__class__.extensions[name] default, method, getter, setter = self._extensions[name]
if getter is not None: if getter is not None:
return getter(self._obj) return getter(self._obj)
elif method is not None: elif method is not None:
return method) return method
else: else:
return self._doc.user_data.get(self._get_key(name), default) return self._doc.user_data.get(self._get_key(name), default)
def __setattr__(self, name, value): def __setattr__(self, name, value):
if name not in self.__class__.extensions: if name not in self._extensions:
raise AttributeError(name) raise AttributeError(name)
default, method, getter, setter = self.__class__.extensions[name] default, method, getter, setter = self._extensions[name]
if setter is not None: if setter is not None:
return setter(self._obj, value) return setter(self._obj, value)
else: else: