diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index fcb5a16fa..329b1a0dd 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -30,7 +30,7 @@ from ..util import normalize_slice from ..compat import is_config from .. import about from .. import util - +from .underscore import Underscore DEF PADDING = 5 @@ -64,6 +64,7 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil: else: return Lexeme.get_struct_attr(token.lex, feat_name) + def _get_chunker(lang): try: cls = util.get_lang_class(lang) @@ -73,6 +74,7 @@ def _get_chunker(lang): return None return cls.Defaults.syntax_iterators.get(u'noun_chunks') + cdef class Doc: """A sequence of Token objects. Access sentences and named entities, export annotations to numpy arrays, losslessly serialize to compressed binary strings. @@ -87,6 +89,21 @@ cdef class Doc: >>> from spacy.tokens import Doc >>> doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'], spaces=[True, False, False]) """ + @classmethod + def set_extension(cls, name, default=None, method=None, + getter=None, setter=None): + nr_defined = sum(t is not None for t in (default, getter, setter, method)) + assert nr_defined == 1 + Underscore.doc_extensions[name] = (default, method, getter, setter) + + @classmethod + def get_extension(cls, name): + return Underscore.doc_extensions.get(name) + + @classmethod + def has_extension(cls, name): + return name in Underscore.doc_extensions + def __init__(self, Vocab vocab, words=None, spaces=None, orths_and_spaces=None): """Create a Doc object. @@ -159,6 +176,10 @@ cdef class Doc: self.is_tagged = True self.is_parsed = True + @property + def _(self): + return Underscore(Underscore.doc_extensions, self) + def __getitem__(self, object i): """Get a `Token` or `Span` object. 
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 7e29cccf4..389922518 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -17,10 +17,24 @@ from ..attrs cimport IS_PUNCT, IS_SPACE from ..lexeme cimport Lexeme from ..compat import is_config from .. import about +from .underscore import Underscore cdef class Span: """A slice from a Doc object.""" + @classmethod + def set_extension(cls, name, default=None, method=None, + getter=None, setter=None): + Underscore.span_extensions[name] = (default, method, getter, setter) + + @classmethod + def get_extension(cls, name): + return Underscore.span_extensions.get(name) + + @classmethod + def has_extension(cls, name): + return name in Underscore.span_extensions + def __cinit__(self, Doc doc, int start, int end, attr_t label=0, vector=None, vector_norm=None): """Create a `Span` object from the slice `doc[start : end]`. @@ -111,6 +125,11 @@ cdef class Span: for i in range(self.start, self.end): yield self.doc[i] + @property + def _(self): + return Underscore(Underscore.span_extensions, self, + start=self.start_char, end=self.end_char) + def merge(self, *args, **attributes): """Retokenize the document, such that the span is merged into a single token. diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 7b11d6efa..c617b382e 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -20,10 +20,24 @@ from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUST from ..attrs cimport LEMMA, POS, TAG, DEP from ..compat import is_config from .. import about +from .underscore import Underscore cdef class Token: """An individual token – i.e. 
a word, punctuation symbol, whitespace, etc.""" + @classmethod + def set_extension(cls, name, default=None, method=None, + getter=None, setter=None): + Underscore.token_extensions[name] = (default, method, getter, setter) + + @classmethod + def get_extension(cls, name): + return Underscore.token_extensions.get(name) + + @classmethod + def has_extension(cls, name): + return name in Underscore.token_extensions + def __cinit__(self, Vocab vocab, Doc doc, int offset): """Construct a `Token` object. @@ -87,6 +101,11 @@ cdef class Token: else: raise ValueError(op) + @property + def _(self): + return Underscore(Underscore.token_extensions, self, + start=self.idx, end=None) + cpdef bint check_flag(self, attr_id_t flag_id) except -1: """Check the value of a boolean flag. diff --git a/spacy/tokens/underscore.py b/spacy/tokens/underscore.py index 8374f4bda..66c54d6d6 100644 --- a/spacy/tokens/underscore.py +++ b/spacy/tokens/underscore.py @@ -1,9 +1,10 @@ -class Undercore(object): +class Underscore(object): doc_extensions = {} span_extensions = {} token_extensions = {} - def __init__(self, obj, start=None, end=None): + def __init__(self, extensions, obj, start=None, end=None): + object.__setattr__(self, '_extensions', extensions) object.__setattr__(self, '_obj', obj) # Assumption is that for doc values, _start and _end will both be None # Span will set non-None values for _start and _end @@ -12,23 +13,23 @@ class Undercore(object): # (see _get_key), and lets us use a single Underscore class. 
object.__setattr__(self, '_doc', obj.doc) object.__setattr__(self, '_start', start) - object.__setattr__(self, '_end', start) + object.__setattr__(self, '_end', end) def __getattr__(self, name): - if name not in self.__class__.extensions: + if name not in self._extensions: raise AttributeError(name) - default, method, getter, setter = self.__class__.extensions[name] + default, method, getter, setter = self._extensions[name] if getter is not None: return getter(self._obj) elif method is not None: - return method) + return lambda *args, **kwargs: method(self._obj, *args, **kwargs) else: return self._doc.user_data.get(self._get_key(name), default) def __setattr__(self, name, value): - if name not in self.__class__.extensions: + if name not in self._extensions: raise AttributeError(name) - default, method, getter, setter = self.__class__.extensions[name] + default, method, getter, setter = self._extensions[name] if setter is not None: return setter(self._obj, value) else: