2018-12-29 20:02:26 +03:00
|
|
|
|
|
2017-04-15 14:05:15 +03:00
|
|
|
|
# coding: utf8
|
2017-05-09 19:45:18 +03:00
|
|
|
|
# cython: infer_types=True
|
|
|
|
|
# cython: bounds_check=False
|
2017-11-17 20:55:56 +03:00
|
|
|
|
# cython: profile=True
|
2017-04-15 14:05:15 +03:00
|
|
|
|
from __future__ import unicode_literals
|
2015-07-13 20:58:26 +03:00
|
|
|
|
|
2017-04-15 14:05:15 +03:00
|
|
|
|
cimport cython
|
|
|
|
|
cimport numpy as np
|
2015-07-13 20:58:26 +03:00
|
|
|
|
import numpy
|
2015-09-14 10:49:58 +03:00
|
|
|
|
import numpy.linalg
|
2015-07-19 16:18:17 +03:00
|
|
|
|
import struct
|
2017-05-09 19:11:34 +03:00
|
|
|
|
import dill
|
2017-10-17 20:29:20 +03:00
|
|
|
|
import msgpack
|
2017-11-03 13:20:31 +03:00
|
|
|
|
from thinc.neural.util import get_array_module, copy_array
|
2015-07-13 20:58:26 +03:00
|
|
|
|
|
2017-04-15 14:05:15 +03:00
|
|
|
|
from libc.string cimport memcpy, memset
|
|
|
|
|
from libc.math cimport sqrt
|
|
|
|
|
|
|
|
|
|
from .span cimport Span
|
|
|
|
|
from .token cimport Token
|
2017-05-13 14:04:40 +03:00
|
|
|
|
from .span cimport Span
|
|
|
|
|
from .token cimport Token
|
|
|
|
|
from .printers import parse_tree
|
|
|
|
|
from ..lexeme cimport Lexeme, EMPTY_LEXEME
|
2015-07-16 12:21:44 +03:00
|
|
|
|
from ..typedefs cimport attr_t, flags_t
|
2017-10-19 17:07:14 +03:00
|
|
|
|
from ..attrs import intify_attrs, IDS
|
2015-07-16 12:21:44 +03:00
|
|
|
|
from ..attrs cimport attr_id_t
|
2017-10-27 16:41:45 +03:00
|
|
|
|
from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, CLUSTER
|
|
|
|
|
from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB
|
|
|
|
|
from ..attrs cimport ENT_TYPE, SENT_START
|
2017-05-13 14:04:40 +03:00
|
|
|
|
from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t
|
|
|
|
|
from ..util import normalize_slice
|
2017-11-01 15:25:44 +03:00
|
|
|
|
from ..compat import is_config, copy_reg, pickle, basestring_
|
2018-04-03 16:50:31 +03:00
|
|
|
|
from ..errors import Errors, Warnings, deprecation_warning
|
2017-05-31 00:35:17 +03:00
|
|
|
|
from .. import util
|
2018-04-03 19:30:17 +03:00
|
|
|
|
from .underscore import Underscore, get_ext_args
|
Add doc.retokenize() context manager (#2172)
This patch takes a step towards #1487 by introducing the
doc.retokenize() context manager, to handle merging spans, and soon
splitting tokens.
The idea is to do merging and splitting like this:
with doc.retokenize() as retokenizer:
for start, end, label in matches:
retokenizer.merge(doc[start : end], attrs={'ent_type': label})
The retokenizer accumulates the merge requests, and applies them
together at the end of the block. This will allow retokenization to be
more efficient, and much less error prone.
A retokenizer.split() function will then be added, to handle splitting a
single token into multiple tokens. These methods take `Span` and `Token`
objects; if the user wants to go directly from offsets, they can append
to the .merges and .splits lists on the retokenizer.
The doc.merge() method's behaviour remains unchanged, so this patch
should be 100% backwards compatible (modulo bugs). Internally,
doc.merge() fixes up the arguments (to handle the various deprecated styles),
opens the retokenizer, and makes the single merge.
We can later start making deprecation warnings on direct calls to doc.merge(),
to migrate people to use of the retokenize context manager.
2018-04-03 15:10:35 +03:00
|
|
|
|
from ._retokenize import Retokenizer
|
2015-07-13 20:58:26 +03:00
|
|
|
|
|
|
|
|
|
DEF PADDING = 5
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
cdef int bounds_check(int i, int length, int padding) except -1:
    """Raise an IndexError unless `i` lies within the padded index range.

    i (int): The index to validate.
    length (int): The number of regular (unpadded) positions.
    padding (int): How far beyond either end an index is still tolerated.
    RETURNS (int): Not meaningful to callers; -1 is reserved to signal that
        an exception was raised.
    """
    cdef bint too_low = (i + padding) < 0
    cdef bint too_high = (i - padding) >= length
    if too_low or too_high:
        raise IndexError(Errors.E026.format(i=i, length=length))
|
2015-07-13 20:58:26 +03:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil:
    # Return the value of attribute `feat_name` for `token`. Attributes that
    # live on the token struct are read from it directly; any other
    # attribute ID is delegated to the token's lexeme.
    if feat_name == LEMMA:
        return token.lemma
    if feat_name == POS:
        return token.pos
    if feat_name == TAG:
        return token.tag
    if feat_name == DEP:
        return token.dep
    if feat_name == HEAD:
        return token.head
    if feat_name == SENT_START:
        return token.sent_start
    if feat_name == SPACY:
        return token.spacy
    if feat_name == ENT_IOB:
        return token.ent_iob
    if feat_name == ENT_TYPE:
        return token.ent_type
    # Not a token-level attribute: fall back to the lexeme.
    return Lexeme.get_struct_attr(token.lex, feat_name)
|
2015-07-13 20:58:26 +03:00
|
|
|
|
|
2017-10-07 19:56:01 +03:00
|
|
|
|
|
2017-06-04 22:53:39 +03:00
|
|
|
|
def _get_chunker(lang):
    """Look up the noun-chunk syntax iterator registered for a language.

    lang: The language identifier passed on to `util.get_lang_class`.
    RETURNS: The `noun_chunks` entry of the language defaults' syntax
        iterators, or None if the language class cannot be resolved
        (ImportError / KeyError) or no such entry exists.
    """
    try:
        lang_cls = util.get_lang_class(lang)
    except (ImportError, KeyError):
        return None
    iterators = lang_cls.Defaults.syntax_iterators
    return iterators.get(u'noun_chunks')
|
2015-07-13 20:58:26 +03:00
|
|
|
|
|
2017-10-07 19:56:01 +03:00
|
|
|
|
|
2015-07-13 20:58:26 +03:00
|
|
|
|
cdef class Doc:
|
2017-05-18 23:17:09 +03:00
|
|
|
|
"""A sequence of Token objects. Access sentences and named entities, export
|
2017-10-27 16:41:45 +03:00
|
|
|
|
annotations to numpy arrays, losslessly serialize to compressed binary
|
|
|
|
|
strings. The `Doc` object holds an array of `TokenC` structs. The
|
|
|
|
|
Python-level `Token` and `Span` objects are views of this array, i.e.
|
|
|
|
|
they don't own the data themselves.
|
2017-05-18 23:17:09 +03:00
|
|
|
|
|
|
|
|
|
EXAMPLE: Construction 1
|
|
|
|
|
>>> doc = nlp(u'Some text')
|
|
|
|
|
|
|
|
|
|
Construction 2
|
|
|
|
|
>>> from spacy.tokens import Doc
|
2017-10-27 16:41:45 +03:00
|
|
|
|
>>> doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'],
|
|
|
|
|
spaces=[True, False, False])
|
2015-07-13 20:58:26 +03:00
|
|
|
|
"""
|
2017-10-07 19:56:01 +03:00
|
|
|
|
@classmethod
def set_extension(cls, name, **kwargs):
    """Register a custom extension attribute under `name`.

    name: The name to register the extension under.
    **kwargs: Forwarded to `get_ext_args`. Passing `force=True` allows an
        already registered name to be overwritten.
    RAISES (ValueError): If `name` is already registered and `force` is
        not truthy.
    """
    already_registered = cls.has_extension(name)
    if already_registered and not kwargs.get('force', False):
        raise ValueError(Errors.E090.format(name=name, obj='Doc'))
    ext_args = get_ext_args(**kwargs)
    Underscore.doc_extensions[name] = ext_args
|
2017-10-07 19:56:01 +03:00
|
|
|
|
|
|
|
|
|
@classmethod
def get_extension(cls, name):
    """Fetch the registered extension stored under `name`.

    name: The name of the extension.
    RETURNS: The stored entry, or None if `name` is not registered.
    """
    registry = Underscore.doc_extensions
    return registry.get(name)
|
|
|
|
|
|
|
|
|
|
@classmethod
def has_extension(cls, name):
    """Check whether an extension is registered under `name`.

    name: The name of the extension.
    RETURNS (bool): True if `name` is a key of the Doc extension registry.
    """
    registry = Underscore.doc_extensions
    return name in registry
|
|
|
|
|
|
2018-04-29 00:33:09 +03:00
|
|
|
|
@classmethod
def remove_extension(cls, name):
    """Remove a previously registered extension.

    name: The name of the extension to remove.
    RETURNS: The registry entry that was removed.
    RAISES (ValueError): If no extension is registered under `name`.
    """
    if cls.has_extension(name):
        return Underscore.doc_extensions.pop(name)
    raise ValueError(Errors.E046.format(name=name))
|
|
|
|
|
|
2017-10-17 17:11:13 +03:00
|
|
|
|
def __init__(self, Vocab vocab, words=None, spaces=None, user_data=None,
             orths_and_spaces=None):
    """Create a Doc object.

    vocab (Vocab): A vocabulary object, which must match any models you
        want to use (e.g. tokenizer, parser, entity recognizer).
    words (list or None): A list of unicode strings to add to the document
        as words. If `None`, no words are added.
    spaces (list or None): A list of boolean values, of the same length as
        words. True means that the word is followed by a space, False means
        it is not. If `None`, defaults to `[True]*len(words)`.
    user_data (dict or None): Optional extra data to attach to the Doc.
    orths_and_spaces (iterable or None): Alternative to `words`/`spaces`;
        when given, `words` and `spaces` are ignored. Each item is either a
        unicode string (taken to be followed by a space) or an
        `(orth, has_space)` pair. Byte strings are rejected.
    RETURNS (Doc): The newly constructed object.
    """
    cdef int i
    cdef unicode orth
    cdef bint has_space
    self.vocab = vocab
    self.mem = Pool()
    size = 20
    n_slots = size + (PADDING*2)
    # Allocate PADDING extra slots on either side, so that self.c[i-x] is
    # in bounds for any i >= 0 and x < padding. self.c points past the
    # leading padding; the true start of the allocation matters for realloc.
    data_start = <TokenC*>self.mem.alloc(n_slots, sizeof(TokenC))
    for i in range(n_slots):
        data_start[i].lex = &EMPTY_LEXEME
        data_start[i].l_edge = i
        data_start[i].r_edge = i
    self.c = data_start + PADDING
    self.max_length = size
    self.length = 0
    self.is_tagged = False
    self.is_parsed = False
    self.sentiment = 0.0
    self.cats = {}
    self.user_hooks = {}
    self.user_token_hooks = {}
    self.user_span_hooks = {}
    self.tensor = numpy.zeros((0,), dtype='float32')
    self.user_data = {} if user_data is None else user_data
    self._vector = None
    self.noun_chunks_iterator = _get_chunker(self.vocab.lang)
    # Only fall back to words/spaces when no explicit pairs were supplied.
    if orths_and_spaces is None and words is not None:
        if spaces is None:
            spaces = [True] * len(words)
        elif len(spaces) != len(words):
            raise ValueError(Errors.E027)
        orths_and_spaces = zip(words, spaces)
    if orths_and_spaces is not None:
        for orth_space in orths_and_spaces:
            if isinstance(orth_space, bytes):
                raise ValueError(Errors.E028.format(value=orth_space))
            if isinstance(orth_space, unicode):
                # A bare string is treated as a word followed by a space.
                orth = orth_space
                has_space = True
            else:
                orth, has_space = orth_space
            # self.mem is passed so that this Doc owns the LexemeC, should
            # one have to be created.
            self.push_back(
                <const LexemeC*>self.vocab.get(self.mem, orth), has_space)
    # Policy decision: an empty doc counts as tagged and parsed, since
    # there is no annotation that could still be added to it.
    if self.length == 0:
        self.is_tagged = True
        self.is_parsed = True
|
2017-02-27 00:27:11 +03:00
|
|
|
|
|
2017-10-07 19:56:01 +03:00
|
|
|
|
@property
def _(self):
    """Expose the custom extension attributes registered for `Doc`.

    RETURNS (Underscore): A new `Underscore` wrapper built from the Doc
        extension registry and this document.
    """
    registry = Underscore.doc_extensions
    return Underscore(registry, self)
|
|
|
|
|
|
Add doc.retokenize() context manager (#2172)
This patch takes a step towards #1487 by introducing the
doc.retokenize() context manager, to handle merging spans, and soon
splitting tokens.
The idea is to do merging and splitting like this:
with doc.retokenize() as retokenizer:
for start, end, label in matches:
retokenizer.merge(doc[start : end], attrs={'ent_type': label})
The retokenizer accumulates the merge requests, and applies them
together at the end of the block. This will allow retokenization to be
more efficient, and much less error prone.
A retokenizer.split() function will then be added, to handle splitting a
single token into multiple tokens. These methods take `Span` and `Token`
objects; if the user wants to go directly from offsets, they can append
to the .merges and .splits lists on the retokenizer.
The doc.merge() method's behaviour remains unchanged, so this patch
should be 100% backwards compatible (modulo bugs). Internally,
doc.merge() fixes up the arguments (to handle the various deprecated styles),
opens the retokenizer, and makes the single merge.
We can later start making deprecation warnings on direct calls to doc.merge(),
to migrate people to use of the retokenize context manager.
2018-04-03 15:10:35 +03:00
|
|
|
|
@property
def is_sentenced(self):
    """Check whether the document has sentence boundaries.

    RETURNS (bool): True if a custom `sents` hook is installed, the
        document is parsed, or at least one token has `sent_start` set to
        -1 or 1; False otherwise.
    """
    if 'sents' in self.user_hooks or self.is_parsed:
        return True
    for i in range(self.length):
        if self.c[i].sent_start in (-1, 1):
            return True
    return False
|
|
|
|
|
|
2015-07-13 20:58:26 +03:00
|
|
|
|
def __getitem__(self, object i):
    """Get a `Token` or `Span` object.

    i (int or slice): The index of the token, or the slice of the document
        to get.
    RETURNS (Token or Span): The token at `doc[i]`, or the span at
        `doc[start : end]`.

    EXAMPLE:
        >>> doc[i]
        Get the `Token` object at position `i`, where `i` is an integer.
        Negative indexing is supported, and follows the usual Python
        semantics, i.e. `doc[-2]` is `doc[len(doc) - 2]`.

        >>> doc[start : end]
        Get a `Span` object, starting at position `start` and ending at
        position `end`, where `start` and `end` are token indices. For
        instance, `doc[2:5]` produces a span consisting of tokens 2, 3 and
        4. Stepped slices (e.g. `doc[start : end : step]`) are not
        supported, as `Span` objects must be contiguous (cannot have gaps).
        You can use negative indices and open-ended ranges, which have
        their normal Python semantics.
    """
    if isinstance(i, slice):
        lo, hi = normalize_slice(len(self), i.start, i.stop, i.step)
        return Span(self, lo, hi, label=0)
    # Integer index: resolve a negative offset relative to the end.
    if i < 0:
        i += self.length
    bounds_check(i, self.length, PADDING)
    return Token.cinit(self.vocab, &self.c[i], i, self)
|
2015-07-13 20:58:26 +03:00
|
|
|
|
|
|
|
|
|
def __iter__(self):
    """Iterate over the `Token` objects of the document, in order. This is
    the main way of accessing `Token` objects from Python. If
    faster-than-Python speeds are required, the annotations can instead be
    accessed as a numpy array, or the underlying C data can be used
    directly from Cython.

    YIELDS (Token): One `Token` per position in the document.

    EXAMPLE:
        >>> for token in doc
    """
    cdef int idx
    for idx in range(self.length):
        yield Token.cinit(self.vocab, &self.c[idx], idx, self)
|
2015-07-13 20:58:26 +03:00
|
|
|
|
|
|
|
|
|
def __len__(self):
    """Get the length of the document in tokens.

    RETURNS (int): The number of tokens in the document.

    EXAMPLE:
        >>> len(doc)
    """
    n_tokens = self.length
    return n_tokens
|
|
|
|
|
|
|
|
|
|
def __unicode__(self):
    """Return the document text as unicode, by concatenating every token's
    `text_with_ws`."""
    pieces = [token.text_with_ws for token in self]
    return u''.join(pieces)
|
2015-07-13 20:58:26 +03:00
|
|
|
|
|
2015-11-02 21:22:18 +03:00
|
|
|
|
def __bytes__(self):
    """Return the document text as UTF-8 encoded bytes, by concatenating
    every token's `text_with_ws`."""
    pieces = [token.text_with_ws for token in self]
    text = u''.join(pieces)
    return text.encode('utf-8')
|
2015-11-02 21:22:18 +03:00
|
|
|
|
|
2015-07-24 04:49:30 +03:00
|
|
|
|
def __str__(self):
    """Return the document text: the `__unicode__` result when
    `is_config(python3=True)` holds, the `__bytes__` result otherwise."""
    return self.__unicode__() if is_config(python3=True) else self.__bytes__()
|
2015-07-24 04:49:30 +03:00
|
|
|
|
|
2015-10-21 14:11:46 +03:00
|
|
|
|
def __repr__(self):
    """Return the same representation as `__str__`."""
    representation = self.__str__()
    return representation
|
2015-10-21 14:11:46 +03:00
|
|
|
|
|
2016-11-24 13:47:20 +03:00
|
|
|
|
@property
def doc(self):
    """Return the document itself.

    RETURNS (Doc): This same object.
    """
    return self
|
|
|
|
|
|
2017-08-19 17:18:09 +03:00
|
|
|
|
def char_span(self, int start_idx, int end_idx, label=0, vector=None):
    """Create a `Span` object from the slice
    `doc.text[start_idx : end_idx]`.

    start_idx (int): The index of the first character of the span.
    end_idx (int): The index of the first character after the span.
    label (uint64 or string): A label to attach to the Span, e.g. for
        named entities. Anything that is not an `int` is added to the
        vocab's string store and replaced by the value returned.
    vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
        the span.
    RETURNS (Span): The newly constructed object, or None if
        `token_by_start` or `token_by_end` reports -1 for the given
        offsets (presumably: the offset is not on a token boundary).
    """
    cdef int start
    cdef int end
    cdef Span span
    if not isinstance(label, int):
        label = self.vocab.strings.add(label)
    start = token_by_start(self.c, self.length, start_idx)
    if start == -1:
        return None
    end = token_by_end(self.c, self.length, end_idx)
    if end == -1:
        return None
    # `end` is the index of the last token; a Span needs the exclusive
    # range-end index, hence the + 1.
    span = Span(self, start, end + 1, label=label, vector=vector)
    return span
|
|
|
|
|
|
2015-09-14 10:49:58 +03:00
|
|
|
|
def similarity(self, other):
    """Make a semantic similarity estimate. The default estimate is cosine
    similarity using an average of word vectors.

    other (object): The object to compare with. By default, accepts `Doc`,
        `Span`, `Token` and `Lexeme` objects.
    RETURNS (float): A scalar similarity score. Higher is more similar.
    """
    if 'similarity' in self.user_hooks:
        return self.user_hooks['similarity'](self, other)
    # Shortcut: identical token sequences are maximally similar, no
    # vectors needed.
    if isinstance(other, (Lexeme, Token)) and self.length == 1:
        if self.c[0].lex.orth == other.orth:
            return 1.0
    elif isinstance(other, (Span, Doc)):
        if len(self) == len(other) and all(
                self[i].orth == other[i].orth for i in range(self.length)):
            return 1.0
    # Avoid dividing by zero when either side has no vector magnitude.
    if self.vector_norm == 0 or other.vector_norm == 0:
        return 0.0
    dot_product = numpy.dot(self.vector, other.vector)
    return dot_product / (self.vector_norm * other.vector_norm)
|
|
|
|
|
|
2016-05-09 13:36:14 +03:00
|
|
|
|
property has_vector:
    """A boolean value indicating whether a word vector is associated with
    the object.

    RETURNS (bool): Whether a word vector is associated with the object.
    """
    def __get__(self):
        if 'has_vector' in self.user_hooks:
            return self.user_hooks['has_vector'](self)
        # Either a non-empty vocab vectors table or a non-empty tensor
        # counts as having a vector.
        return bool(self.vocab.vectors.data.size or self.tensor.size)
|
2016-05-09 13:36:14 +03:00
|
|
|
|
|
2015-09-14 10:49:58 +03:00
|
|
|
|
property vector:
    """A real-valued meaning representation. Defaults to an average of the
    token vectors.

    RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array
        representing the document's semantics.
    """
    def __get__(self):
        if 'vector' in self.user_hooks:
            return self.user_hooks['vector'](self)
        if self._vector is not None:
            return self._vector
        if not len(self):
            # Empty document: cache an all-zero vector.
            self._vector = numpy.zeros((self.vocab.vectors_length,),
                                       dtype='f')
        elif self.vocab.vectors.data.size > 0:
            # Average the vocab vectors of all tokens.
            total = numpy.zeros((self.vocab.vectors_length,), dtype='f')
            for token in self.c[:self.length]:
                total += self.vocab.get_vector(token.lex.orth)
            self._vector = total / len(self)
        elif self.tensor.size > 0:
            # No vocab vectors available: average the tensor rows.
            self._vector = self.tensor.mean(axis=0)
        else:
            # Nothing to derive a vector from; this result is not cached.
            return numpy.zeros((self.vocab.vectors_length,),
                               dtype='float32')
        return self._vector

    def __set__(self, value):
        self._vector = value
|
2015-09-14 10:49:58 +03:00
|
|
|
|
|
|
|
|
|
property vector_norm:
    """The L2 norm of the document's vector representation.

    RETURNS (float): The L2 norm of the vector representation.
    """
    def __get__(self):
        cdef float value
        cdef double norm = 0
        if 'vector_norm' in self.user_hooks:
            return self.user_hooks['vector_norm'](self)
        # The norm is computed once and then served from the cache.
        if self._vector_norm is not None:
            return self._vector_norm
        for value in self.vector:
            norm += value * value
        self._vector_norm = sqrt(norm) if norm != 0 else 0
        return self._vector_norm

    def __set__(self, value):
        self._vector_norm = value
|
2015-09-14 10:49:58 +03:00
|
|
|
|
|
2016-11-01 15:27:32 +03:00
|
|
|
|
property text:
    """A unicode representation of the document text.

    RETURNS (unicode): The original verbatim text of the document.
    """
    def __get__(self):
        pieces = (token.text_with_ws for token in self)
        return u''.join(pieces)
|
2015-07-13 20:58:26 +03:00
|
|
|
|
|
2016-11-01 14:25:36 +03:00
|
|
|
|
property text_with_ws:
    """An alias of `Doc.text`, provided for duck-type compatibility with
    `Span` and `Token`.

    RETURNS (unicode): The original verbatim text of the document.
    """
    def __get__(self):
        full_text = self.text
        return full_text
|
2015-09-13 03:27:42 +03:00
|
|
|
|
|
2015-08-06 01:35:40 +03:00
|
|
|
|
property ents:
    """The named entities in the document, as `Span` objects, if the
    entity recognizer has been applied to the document.

    Reading returns a tuple of `Span` objects built from the tokens'
    `ent_iob` / `ent_type` values. Assigning accepts any iterable
    (including one-shot iterators such as generators) of items understood
    by `get_entity_info`; all existing entity annotation is reset first.

    RETURNS (tuple): Entities in the document.
    RAISES (ValueError): On reading, if an inside (I) tag occurs while no
        entity is open; on assignment, if two entities share a token.

    EXAMPLE: Iterate over the span to get individual Token objects,
        or access the label:

        >>> tokens = nlp(u'Mr. Best flew to New York on Saturday morning.')
        >>> ents = list(tokens.ents)
        >>> assert ents[0].label == 346
        >>> assert ents[0].label_ == 'PERSON'
        >>> assert ents[0].orth_ == 'Best'
        >>> assert ents[0].text == 'Mr. Best'
    """
    def __get__(self):
        cdef int i
        cdef const TokenC* token
        cdef int start = -1
        cdef attr_t label = 0
        output = []
        for i in range(self.length):
            token = &self.c[i]
            if token.ent_iob == 1:
                # Inside tag: only valid while an entity is open.
                if start == -1:
                    # Clamp the left edge of the context window at 0. A
                    # negative slice start would be counted from the end
                    # of the doc and yield an empty or misleading context
                    # for tokens near the beginning.
                    seq = ['%s|%s' % (t.text, t.ent_iob_)
                           for t in self[max(i - 5, 0):i + 5]]
                    raise ValueError(Errors.E093.format(seq=' '.join(seq)))
            elif token.ent_iob == 2 or token.ent_iob == 0:
                # Outside / missing: close any open entity.
                if start != -1:
                    output.append(Span(self, start, i, label=label))
                start = -1
                label = 0
            elif token.ent_iob == 3:
                # Begin: close any open entity, then open a new one here.
                if start != -1:
                    output.append(Span(self, start, i, label=label))
                start = i
                label = token.ent_type
        # Flush an entity that runs up to the end of the document.
        if start != -1:
            output.append(Span(self, start, self.length, label=label))
        return tuple(output)

    def __set__(self, ents):
        # TODO:
        # 1. Allow negative matches
        # 2. Ensure pre-set NERs are not over-written during statistical
        #    prediction
        # 3. Test basic data-driven ORTH gazetteer
        # 4. Test more nuanced date and currency regex
        cdef attr_t entity_type
        cdef int ent_start, ent_end
        cdef int i
        cdef attr_t ent_type
        cdef int start, end
        tokens_in_ents = {}
        # `ents` may be a one-shot iterator (e.g. a generator), so it is
        # consumed exactly once here; the resolved triples are kept for the
        # write pass below. Iterating `ents` a second time would see
        # nothing and leave all entities cleared.
        resolved = []
        for ent_info in ents:
            entity_type, ent_start, ent_end = get_entity_info(ent_info)
            # Reject entities that overlap on any token.
            for token_index in range(ent_start, ent_end):
                if token_index in tokens_in_ents:
                    previous = tokens_in_ents[token_index]
                    raise ValueError(Errors.E098.format(
                        span1=(previous[0], previous[1],
                               self.vocab.strings[previous[2]]),
                        span2=(ent_start, ent_end,
                               self.vocab.strings[entity_type])))
                tokens_in_ents[token_index] = (ent_start, ent_end,
                                               entity_type)
            resolved.append((entity_type, ent_start, ent_end))

        # Reset the existing annotation only after validation succeeded.
        for i in range(self.length):
            self.c[i].ent_type = 0
            self.c[i].ent_iob = 0  # Means missing.

        for ent_type, start, end in resolved:
            # NOTE(review): if attr_t is an unsigned C integer this
            # condition can never be true -- confirm before removing.
            if ent_type is None or ent_type < 0:
                # Mark as O
                for i in range(start, end):
                    self.c[i].ent_type = 0
                    self.c[i].ent_iob = 2
            else:
                # Mark (inside) as I
                for i in range(start, end):
                    self.c[i].ent_type = ent_type
                    self.c[i].ent_iob = 1
                # Set start as B
                self.c[start].ent_iob = 3
|
2015-07-13 20:58:26 +03:00
|
|
|
|
|
2016-09-28 12:39:49 +03:00
|
|
|
|
property noun_chunks:
    """Iterate over the base noun phrases in the document. Yields base
    noun-phrase `Span` objects, if the document has been syntactically
    parsed. A base noun phrase, or "NP chunk", is a noun phrase that does
    not permit other NPs to be nested within it – so no NP-level
    coordination, no prepositional phrases, and no relative clauses.

    YIELDS (Span): Noun chunks in the document.
    """
    def __get__(self):
        if not self.is_parsed:
            raise ValueError(Errors.E029)
        # Build every Span up front, before yielding the first one, so the
        # tokenisation cannot be changed out from under the chunk iterator
        # mid-way. Span objects themselves cope with the tokenisation
        # changing, so holding them is safe. See Issue #375.
        chunks = []
        if self.noun_chunks_iterator is not None:
            chunks = [Span(self, start, end, label=label)
                      for start, end, label
                      in self.noun_chunks_iterator(self)]
        for chunk in chunks:
            yield chunk
|
|
|
|
|
|
|
|
|
|
property sents:
    """Iterate over the sentences in the document. Yields sentence `Span`
    objects. Sentence spans have no label. To improve accuracy on informal
    texts, spaCy calculates sentence boundaries from the syntactic
    dependency parse. If the parser is disabled, the `sents` iterator will
    be unavailable.

    A callable registered under 'sents' in `user_hooks` takes precedence:
    whatever it yields for this document is passed through unchanged.

    YIELDS (Span): Sentences in the document.

    EXAMPLE:
        >>> doc = nlp("This is a sentence. Here's another...")
        >>> assert [s.root.text for s in doc.sents] == ["is", "'s"]
    """
    def __get__(self):
        if not self.is_sentenced:
            raise ValueError(Errors.E030)
        if 'sents' in self.user_hooks:
            yield from self.user_hooks['sents'](self)
            return
        # Walk the tokens; every token flagged with sent_start == 1 closes
        # the sentence that has been accumulating and opens the next one.
        sent_begin = 0
        for i in range(1, self.length):
            if self.c[i].sent_start == 1:
                yield Span(self, sent_begin, i)
                sent_begin = i
        # Emit the trailing sentence (skipped only for an empty document).
        if sent_begin != self.length:
            yield Span(self, sent_begin, self.length)
|
2015-07-13 20:58:26 +03:00
|
|
|
|
|
2015-07-13 22:46:02 +03:00
|
|
|
|
cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1:
    # Append one token to the end of the document's token array.
    #
    # lex_or_tok: either a `const TokenC*`, whose struct is copied into the
    #     new slot, or a lexeme pointer, which is stored as the slot's `lex`.
    # has_space: stored in the new token's `spacy` field.
    # Returns `idx + lex.length + spacy` of the new token.
    # Raises ValueError(Errors.E031) if the token's lexeme has orth == 0.
    if self.length == 0:
        # Flip these to false when we see the first token.
        self.is_tagged = False
        self.is_parsed = False
    if self.length == self.max_length:
        # Buffer is full: double the capacity before writing.
        self._realloc(self.length * 2)
    cdef TokenC* tok = &self.c[self.length]
    if LexemeOrToken is const_TokenC_ptr:
        tok[0] = lex_or_tok[0]
    else:
        tok.lex = lex_or_tok
    # Character offset: right after the previous token (and the value of its
    # `spacy` field), or 0 for the very first token.
    if self.length != 0:
        tok.idx = (tok-1).idx + (tok-1).lex.length + (tok-1).spacy
    else:
        tok.idx = 0
    # A freshly pushed token starts out as its own left and right edge.
    tok.l_edge = self.length
    tok.r_edge = self.length
    if tok.lex.orth == 0:
        raise ValueError(Errors.E031.format(i=self.length))
    tok.spacy = has_space
    self.length += 1
    return tok.idx + tok.lex.length + tok.spacy
|
2015-07-13 20:58:26 +03:00
|
|
|
|
|
|
|
|
|
@cython.boundscheck(False)
cpdef np.ndarray to_array(self, object py_attr_ids):
    """Export given token attributes to a numpy `ndarray`.

    If `attr_ids` is a sequence of two or more attributes (M of them), the
    output array will be of shape `(N, M)`, where N is the length of the
    `Doc` (in tokens). If `attr_ids` is a single attribute, or a sequence
    holding just one attribute, the output shape will be (N,). You can
    specify attributes by integer ID (e.g. spacy.attrs.LEMMA) or string
    name (e.g. 'LEMMA' or 'lemma').

    attr_ids (list[]): A list of attributes (int IDs or string names), or
        a single int ID or string name.
    RETURNS (numpy.ndarray[uint64]): A feature matrix, with one row per
        word, and one column per attribute indicated in the input
        `attr_ids` (flattened to one dimension for fewer than two
        attributes).

    EXAMPLE:
        >>> from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
        >>> doc = nlp(text)
        >>> # All strings mapped to integers, for easy export to numpy
        >>> np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
    """
    cdef int i, j
    cdef np.ndarray[attr_t, ndim=2] output
    # Handle scalar/list inputs of strings/ints for py_attr_ids.
    # A bare string has to be wrapped explicitly: if it were left alone the
    # comprehension below would walk it character by character and look up
    # each letter in IDS. Non-iterable scalars (int IDs) are wrapped too.
    if isinstance(py_attr_ids, basestring_) \
            or not hasattr(py_attr_ids, '__iter__'):
        py_attr_ids = [py_attr_ids]
    # Allow strings, e.g. 'lemma' or 'LEMMA'
    py_attr_ids = [(IDS[id_.upper()] if hasattr(id_, 'upper') else id_)
                   for id_ in py_attr_ids]
    # Make an array from the attributes --- otherwise our inner loop is
    # Python dict iteration.
    cdef np.ndarray attr_ids = numpy.asarray(py_attr_ids, dtype='i')
    output = numpy.ndarray(shape=(self.length, len(attr_ids)),
                           dtype=numpy.uint64)
    # Raw pointers into the two arrays, so the fill loop is plain C.
    c_output = <attr_t*>output.data
    c_attr_ids = <attr_id_t*>attr_ids.data
    cdef TokenC* token
    cdef int nr_attr = attr_ids.shape[0]
    for i in range(self.length):
        token = &self.c[i]
        for j in range(nr_attr):
            # Row-major layout: row i, column j.
            c_output[i*nr_attr + j] = get_token_attr(token, c_attr_ids[j])
    # Handle 1d case
    return output if len(attr_ids) >= 2 else output.reshape((self.length,))
|
|
|
|
|
|
2017-10-27 18:07:26 +03:00
|
|
|
|
def count_by(self, attr_id_t attr_id, exclude=None,
             PreshCounter counts=None):
    """Count the frequencies of a given attribute. Produces a dict of
    `{attribute (int): count (ints)}` frequencies, keyed by the values of
    the given attribute ID.

    attr_id (int): The attribute ID to key the counts.
    exclude (callable): Optional predicate called with each `Token`; tokens
        for which it returns a true value are left out of the counts.
    counts (PreshCounter): Optional counter to accumulate into. When one is
        supplied it is updated in place and nothing is returned.
    RETURNS (dict): A dictionary mapping attributes to integer counts, or
        `None` if `counts` was passed in.

    EXAMPLE:
        >>> from spacy import attrs
        >>> doc = nlp(u'apple apple orange banana')
        >>> doc.count_by(attrs.ORTH)
        {12800L: 1, 11880L: 2, 7561L: 1}
        >>> doc.to_array([attrs.ORTH])
        array([[11880], [11880], [7561], [12800]])
    """
    cdef int i
    cdef attr_t attr

    # Only build (and return) a dict when the caller did not hand us a
    # counter of their own.
    output_dict = counts is None
    if output_dict:
        counts = PreshCounter()
    # Take this check out of the loop, for a bit of extra speed
    if exclude is None:
        for i in range(self.length):
            counts.inc(get_token_attr(&self.c[i], attr_id), 1)
    else:
        for i in range(self.length):
            if not exclude(self[i]):
                attr = get_token_attr(&self.c[i], attr_id)
                counts.inc(attr, 1)
    if output_dict:
        return dict(counts)
|
2015-07-13 20:58:26 +03:00
|
|
|
|
|
|
|
|
|
def _realloc(self, new_size):
    """Resize the token buffer so it can hold `new_size` tokens.

    new_size (int): New value for `max_length`; the allocation itself is
        `new_size + 2 * PADDING` token structs.
    """
    cdef int i
    self.max_length = new_size
    padded_size = new_size + (PADDING * 2)
    # What we're storing is a "padded" array: `self.c` points PADDING
    # places past the true start of the allocation, so that out-of-bounds
    # reads hit out-of-bounds markers instead. To realloc we need the
    # address of the true start, so step the pointer back PADDING places,
    # then forward again on the new allocation.
    cdef TokenC* true_start = <TokenC*>self.mem.realloc(
        self.c - PADDING, padded_size * sizeof(TokenC))
    self.c = true_start + PADDING
    # Point every slot from the current length onward at the empty lexeme.
    for i in range(self.length, self.max_length + PADDING):
        self.c[i].lex = &EMPTY_LEXEME
|
2015-07-13 20:58:26 +03:00
|
|
|
|
|
2016-01-30 22:27:52 +03:00
|
|
|
|
cdef void set_parse(self, const TokenC* parsed) nogil:
    # Overwrite the first `self.length` token structs with the ones in
    # `parsed` and mark the document as parsed.
    #
    # TODO: This method is fairly misleading atm. It's used by Parser
    # to actually apply the parse calculated. Need to rethink this.
    # Probably we should use from_array?
    cdef int idx
    for idx in range(self.length):
        self.c[idx] = parsed[idx]
    self.is_parsed = True
|
2015-07-13 20:58:26 +03:00
|
|
|
|
|
2016-05-05 13:11:57 +03:00
|
|
|
|
def from_array(self, attrs, array):
    """Load token attributes from an array into this document.

    attrs (list): Attribute IDs, one per column of `array`.
    array: Two-dimensional, indexed as `array[token, column]`.
    RETURNS (Doc): Itself.

    Raises ValueError(Errors.E032) if both SENT_START and HEAD are given.
    """
    if SENT_START in attrs and HEAD in attrs:
        raise ValueError(Errors.E032)
    cdef int i, col
    cdef attr_id_t attr_id
    cdef TokenC* tokens = self.c
    cdef int length = len(array)
    # Get set up for fast loading: copy the attribute IDs into a C array so
    # the inner loop below does not touch the Python sequence.
    cdef Pool mem = Pool()
    cdef int n_attrs = len(attrs)
    attr_ids = <attr_id_t*>mem.alloc(n_attrs, sizeof(attr_id_t))
    for i, attr_id in enumerate(attrs):
        attr_ids[i] = attr_id
    # Now load the data
    for i in range(self.length):
        token = &tokens[i]
        for j in range(n_attrs):
            Token.set_struct_attr(token, attr_ids[j], array[i, j])
    # Auxiliary loading logic: non-zero tags are additionally routed
    # through the vocab's morphology.
    for col, attr_id in enumerate(attrs):
        if attr_id != TAG:
            continue
        for i in range(length):
            if array[i, col] != 0:
                self.vocab.morphology.assign_tag(&tokens[i], array[i, col])
    # set flags
    self.is_parsed = bool(HEAD in attrs or DEP in attrs)
    self.is_tagged = bool(TAG in attrs or POS in attrs)
    # if document is parsed, set children
    if self.is_parsed:
        set_children_from_heads(self.c, self.length)
    return self
|
|
|
|
|
|
2017-10-20 21:28:00 +03:00
|
|
|
|
def get_lca_matrix(self):
    """Calculates a matrix of Lowest Common Ancestors (LCA) for a given
    `Doc`, where LCA[i, j] is the index of the lowest common ancestor among
    token i and j.

    RETURNS (np.array[ndim=2, dtype=numpy.int32]): LCA matrix with shape
        (n, n), where n = len(self).
    """
    # The helper works on any [start, end) token range; pass the whole doc.
    lca_view = _get_lca_matrix(self, 0, len(self))
    return numpy.asarray(lca_view)
|
2017-10-20 21:28:00 +03:00
|
|
|
|
|
2017-05-31 00:35:17 +03:00
|
|
|
|
def to_disk(self, path, **exclude):
    """Save the current state to disk.

    The document is serialized with `to_bytes` and written to `path`,
    which is opened as a single binary file.

    path (unicode or Path): Location to write to. Paths may be either
        strings or Path-like objects.
    **exclude: Forwarded unchanged to `to_bytes`.
    """
    target = util.ensure_path(path)
    with target.open('wb') as out_file:
        out_file.write(self.to_bytes(**exclude))
|
2017-05-24 12:58:17 +03:00
|
|
|
|
|
2017-05-31 00:35:17 +03:00
|
|
|
|
def from_disk(self, path, **exclude):
    """Loads state from disk. Modifies the object in place and returns it.

    The file at `path` is read in binary mode and its contents handed to
    `from_bytes`.

    path (unicode or Path): Location to read from. Paths may be either
        strings or `Path`-like objects.
    **exclude: Forwarded unchanged to `from_bytes`.
    RETURNS (Doc): The modified `Doc` object.
    """
    source = util.ensure_path(path)
    with source.open('rb') as in_file:
        payload = in_file.read()
    return self.from_bytes(payload, **exclude)
|
2017-05-24 12:58:17 +03:00
|
|
|
|
|
2017-05-31 00:35:17 +03:00
|
|
|
|
def to_bytes(self, **exclude):
    """Serialize, i.e. export the document contents to a binary string.

    **exclude: Names to leave out; handed to `util.to_bytes` as the
        exclusion set. If `user_data` is among them, the user data is not
        serialized.
    RETURNS (bytes): A losslessly serialized copy of the `Doc`, including
        all annotations.
    """
    array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE]
    if self.is_tagged:
        array_head.append(TAG)
    # A parsed doc stores head and dep; otherwise store sent_start instead.
    array_head += [HEAD, DEP] if self.is_parsed else [SENT_START]
    serializers = {
        'text': lambda: self.text,
        'array_head': lambda: array_head,
        'array_body': lambda: self.to_array(array_head),
        'sentiment': lambda: self.sentiment,
        'tensor': lambda: self.tensor,
    }
    # Msgpack doesn't distinguish between lists and tuples, which is
    # vexing for user data. As a best guess, we *know* that within
    # keys, we must have tuples. In values we just have to hope
    # users don't mind getting a list instead of a tuple.
    # Keys and values are therefore packed as two separate sequences.
    if 'user_data' not in exclude and self.user_data:
        user_data_keys, user_data_values = list(zip(*self.user_data.items()))
        serializers['user_data_keys'] = lambda: msgpack.dumps(user_data_keys)
        serializers['user_data_values'] = lambda: msgpack.dumps(user_data_values)

    return util.to_bytes(serializers, exclude)
|
|
|
|
|
|
|
|
|
|
def from_bytes(self, bytes_data, **exclude):
    """Deserialize, i.e. import the document contents from a binary string.

    bytes_data (bytes): The string to load from.
    **exclude: Names to skip; `user_data`, `sentiment` and `tensor` are
        checked here, and the whole set is passed to `util.from_bytes`.
    RETURNS (Doc): Itself.

    Raises ValueError(Errors.E033) if the document already holds tokens.
    """
    if self.length != 0:
        raise ValueError(Errors.E033.format(length=self.length))
    # Every field is picked out of the returned message by hand below, so
    # the per-key setters given to util.from_bytes are deliberate no-ops.
    ignore = lambda b: None
    deserializers = {
        'text': ignore,
        'array_head': ignore,
        'array_body': ignore,
        'sentiment': ignore,
        'tensor': ignore,
        'user_data_keys': ignore,
        'user_data_values': ignore,
    }

    msg = util.from_bytes(bytes_data, deserializers, exclude)
    # Msgpack doesn't distinguish between lists and tuples, which is
    # vexing for user data. As a best guess, we *know* that within
    # keys, we must have tuples. In values we just have to hope
    # users don't mind getting a list instead of a tuple.
    if 'user_data' not in exclude and 'user_data_keys' in msg:
        packed_keys = msgpack.loads(msg['user_data_keys'],
                                    use_list=False, raw=False)
        packed_values = msgpack.loads(msg['user_data_values'], raw=False)
        for data_key, data_value in zip(packed_keys, packed_values):
            self.user_data[data_key] = data_value

    cdef int i, start, end, has_space
    cdef const LexemeC* lex
    cdef unicode orth_

    if 'sentiment' not in exclude and 'sentiment' in msg:
        self.sentiment = msg['sentiment']
    if 'tensor' not in exclude and 'tensor' in msg:
        self.tensor = msg['tensor']

    # Rebuild the tokens by slicing the text: column 0 of the attribute
    # array gives each token's length in characters, column 1 is passed to
    # push_back as `has_space` and also advances the text offset.
    text = msg['text']
    attrs = msg['array_body']
    start = 0
    for i in range(attrs.shape[0]):
        end = start + attrs[i, 0]
        has_space = attrs[i, 1]
        orth_ = text[start:end]
        lex = self.vocab.get(self.mem, orth_)
        self.push_back(lex, has_space)
        start = end + has_space
    # The first two columns were consumed above; load the remaining ones.
    self.from_array(msg['array_head'][2:], attrs[:, 2:])
    return self
|
2015-07-22 05:53:01 +03:00
|
|
|
|
|
2017-11-03 13:20:31 +03:00
|
|
|
|
def extend_tensor(self, tensor):
    '''Concatenate a new tensor onto the doc.tensor object.

    The doc.tensor attribute holds dense feature vectors
    computed by the models in the pipeline. Let's say a
    document with 30 words has a tensor with 128 dimensions
    per word. doc.tensor.shape will be (30, 128). After
    calling doc.extend_tensor with an array of shape (30, 64),
    doc.tensor.shape == (30, 192).

    tensor: The array to append. If doc.tensor is currently empty, it is
        resized in place to the new array's shape and filled with a copy.
    '''
    xp = get_array_module(self.tensor)
    if self.tensor.size != 0:
        # Existing features: stack the new columns to their right.
        self.tensor = xp.hstack((self.tensor, tensor))
    else:
        self.tensor.resize(tensor.shape, refcheck=False)
        copy_array(self.tensor, tensor)
|
|
|
|
|
|
Add doc.retokenize() context manager (#2172)
This patch takes a step towards #1487 by introducing the
doc.retokenize() context manager, to handle merging spans, and soon
splitting tokens.
The idea is to do merging and splitting like this:
with doc.retokenize() as retokenizer:
for start, end, label in matches:
retokenizer.merge(doc[start : end], attrs={'ent_type': label})
The retokenizer accumulates the merge requests, and applies them
together at the end of the block. This will allow retokenization to be
more efficient, and much less error prone.
A retokenizer.split() function will then be added, to handle splitting a
single token into multiple tokens. These methods take `Span` and `Token`
objects; if the user wants to go directly from offsets, they can append
to the .merges and .splits lists on the retokenizer.
The doc.merge() method's behaviour remains unchanged, so this patch
should be 100% backwards compatible (modulo bugs). Internally,
doc.merge() fixes up the arguments (to handle the various deprecated styles),
opens the retokenizer, and makes the single merge.
We can later start making deprecation warnings on direct calls to doc.merge(),
to migrate people to use of the retokenize context manager.
2018-04-03 15:10:35 +03:00
|
|
|
|
def retokenize(self):
    '''Context manager to handle retokenization of the Doc.

    Changes to the Doc's tokenization are collected while the block is
    open and then applied together when the context manager exits. This
    is much more efficient, and less error-prone.

    All views of the Doc (Span and Token) created before the
    retokenization are invalidated, although they may accidentally
    continue to work.

    RETURNS (Retokenizer): A retokenizer bound to this document.
    '''
    retokenizer = Retokenizer(self)
    return retokenizer
|
|
|
|
|
|
2018-09-10 17:41:42 +03:00
|
|
|
|
def _bulk_merge(self, spans, attributes):
    """Retokenize the document, such that the spans given as arguments
    are merged into single tokens. The spans need to be in document
    order, and no span intersection is allowed.

    spans (Span[]): Spans to merge, in document order, with all span
        intersections empty. Cannot be empty.
    attributes (Dictionary[]): Attributes to assign to the merged tokens.
        Must be the same length as `spans`; empty dictionaries are
        allowed. By default, attributes are inherited from the syntactic
        root of the span. Each dictionary is normalised in place (a
        deprecated 'label' entry is turned into an entity type).
    RETURNS: None. All merges are applied together when the retokenizer
        context exits.
    """
    assert len(attributes) == len(spans), (
        "attribute length should be equal to span length: "
        "got {} attributes for {} spans".format(len(attributes), len(spans)))
    with self.retokenize() as retokenizer:
        for i, span in enumerate(spans):
            span_attrs = attributes[i]
            fix_attributes(self, span_attrs)
            remove_label_if_necessary(span_attrs)
            retokenizer.merge(span, span_attrs)
|
|
|
|
|
|
2016-10-17 15:02:13 +03:00
|
|
|
|
def merge(self, int start_idx, int end_idx, *args, **attributes):
    """Retokenize the document, such that the span at
    `doc.text[start_idx : end_idx]` is merged into a single token. If
    `start_idx` and `end_idx` do not mark start and end token boundaries,
    the document remains unchanged.

    start_idx (int): Character index of the start of the slice to merge.
    end_idx (int): Character index after the end of the slice to merge.
    *args: Deprecated. Exactly three positional values are read as
        `(tag, lemma, ent_type)` and trigger a deprecation warning; any
        other non-zero number raises ValueError(Errors.E034).
    **attributes: Attributes to assign to the merged token. By default,
        attributes are inherited from the syntactic root of the span.
    RETURNS (Token): The newly merged token, or `None` if the start and end
        indices did not fall at token boundaries.
    """
    cdef unicode tag, lemma, ent_type
    if not args:
        fix_attributes(self, attributes)
    elif len(args) == 3:
        deprecation_warning(Warnings.W003)
        tag, lemma, ent_type = args
        attributes[TAG] = tag
        attributes[LEMMA] = lemma
        attributes[ENT_TYPE] = ent_type
    else:
        raise ValueError(Errors.E034.format(n_args=len(args),
                                            args=repr(args),
                                            kwargs=repr(attributes)))
    remove_label_if_necessary(attributes)

    attributes = intify_attrs(attributes, strings_map=self.vocab.strings)

    # Translate the character offsets into token indices; bail out if
    # either one does not line up with a token boundary.
    cdef int start = token_by_start(self.c, self.length, start_idx)
    if start == -1:
        return None
    cdef int end = token_by_end(self.c, self.length, end_idx)
    if end == -1:
        return None
    # `end` is the index of the last token; the slice needs one past it.
    with self.retokenize() as retokenizer:
        retokenizer.merge(self[start:end + 1], attrs=attributes)
    return self[start]
|
2015-07-30 03:29:49 +03:00
|
|
|
|
|
2016-12-30 20:19:18 +03:00
|
|
|
|
def print_tree(self, light=False, flat=False):
    """Returns the parse trees in JSON (dict) format.

    light (bool): Don't include lemmas or entities.
    flat (bool): Don't include arcs or modifiers.
    RETURNS (dict): Parse tree as dict.

    EXAMPLE:
        >>> doc = nlp('Bob brought Alice the pizza. Alice ate the pizza.')
        >>> trees = doc.print_tree()
        >>> trees[1]
        {'modifiers': [
            {'modifiers': [], 'NE': 'PERSON', 'word': 'Alice',
            'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP',
            'lemma': 'Alice'},
            {'modifiers': [
                {'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det',
                'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}],
            'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN',
            'POS_fine': 'NN', 'lemma': 'pizza'},
            {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct',
            'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}],
        'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB',
        'POS_fine': 'VBD', 'lemma': 'eat'}
    """
    # All of the work is done by the printers module.
    trees = parse_tree(self, light=light, flat=flat)
    return trees
|
|
|
|
|
|
2015-07-30 03:29:49 +03:00
|
|
|
|
|
2015-11-07 00:55:34 +03:00
|
|
|
|
cdef int token_by_start(const TokenC* tokens, int length, int start_char) except -2:
    # Linear scan for the token whose `idx` equals `start_char`.
    # Returns that token's index, or -1 if no token starts there.
    cdef int pos = 0
    while pos < length:
        if tokens[pos].idx == start_char:
            return pos
        pos += 1
    return -1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
cdef int token_by_end(const TokenC* tokens, int length, int end_char) except -2:
    # Linear scan for the token whose `idx + lex.length` equals `end_char`.
    # Returns that token's index, or -1 if no token ends there.
    cdef int pos = 0
    while pos < length:
        if tokens[pos].idx + tokens[pos].lex.length == end_char:
            return pos
        pos += 1
    return -1
|
|
|
|
|
|
|
|
|
|
|
2015-07-30 03:29:49 +03:00
|
|
|
|
cdef int set_children_from_heads(TokenC* tokens, int length) except -1:
    # Recompute l_kids, r_kids, l_edge and r_edge for every token from the
    # tokens' relative `head` offsets, then flag sentence starts.
    cdef TokenC* parent
    cdef TokenC* kid
    cdef int idx
    # Set number of left/right children to 0. We'll increment it in the loops.
    # Each token also starts out as its own left and right edge.
    for idx in range(length):
        kid = &tokens[idx]
        kid.l_kids = 0
        kid.r_kids = 0
        kid.l_edge = idx
        kid.r_edge = idx
    # Set left edges
    for idx in range(length):
        kid = &tokens[idx]
        # `head` is stored as an offset relative to the token itself.
        parent = &tokens[idx + kid.head]
        if parent > kid:
            parent.l_kids += 1
        if parent.l_edge > kid.l_edge:
            parent.l_edge = kid.l_edge

    # Set right edges --- same as above, but iterate in reverse
    for idx in range(length-1, -1, -1):
        kid = &tokens[idx]
        parent = &tokens[idx + kid.head]
        if parent < kid:
            parent.r_kids += 1
        if parent.r_edge < kid.r_edge:
            parent.r_edge = kid.r_edge

    # Set sentence starts: a token that is its own head (offset 0) and has
    # a dependency label marks the token at its left edge as a start.
    for idx in range(length):
        if tokens[idx].head == 0 and tokens[idx].dep != 0:
            tokens[tokens[idx].l_edge].sent_start = True
|
2017-02-27 00:27:11 +03:00
|
|
|
|
|
2017-10-17 17:11:13 +03:00
|
|
|
|
|
2018-12-29 20:02:26 +03:00
|
|
|
|
cdef int _get_tokens_lca(Token token_j, Token token_k):
    """Given two tokens, returns the index of the lowest common ancestor
    (LCA) among the two. If they have no common ancestor, -1 is returned.

    token_j (Token): a token.
    token_k (Token): another token.
    RETURNS (int): index of lowest common ancestor, or -1 if the tokens
        have no common ancestor.
    """
    # Cheap cases first: identical tokens, or one is the other's head.
    if token_j == token_k:
        return token_j.i
    if token_j.head == token_k:
        return token_k.i
    if token_k.head == token_j:
        return token_j.i

    ancestors_of_j = set(token_j.ancestors)

    if token_k in ancestors_of_j:
        return token_k.i

    # Climb from token_k; the first ancestor that is token_j itself, or
    # that token_j also has as an ancestor, is the answer.
    for candidate in token_k.ancestors:

        if candidate == token_j:
            return token_j.i

        if candidate in ancestors_of_j:
            return candidate.i

    return -1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end):
    """Given a doc and a start and end position defining a set of contiguous
    tokens within it, returns a matrix of Lowest Common Ancestors (LCA), where
    LCA[i, j] is the index of the lowest common ancestor among token i and j.
    Rows, columns and stored indices are all relative to `start`, so for a
    whole document (start == 0) they coincide with token indices.
    If the tokens have no common ancestor within the specified span,
    LCA[i, j] will be -1.

    doc (Doc): The document the tokens are taken from.
    start (int): First token to be included in the LCA matrix.
    end (int): Index one past the last token included in the LCA matrix.
    RETURNS (int [:, :]): memoryview of numpy.array[ndim=2, dtype=numpy.int32],
        with shape (n, n), where n = end - start.
    """
    cdef int [:,:] lca_matrix
    cdef int n_tokens = end - start
    cdef int j, k, lca

    lca_matrix = numpy.empty((n_tokens, n_tokens), dtype=numpy.int32)

    # j and k are positions inside the matrix; the matching document token
    # is at `start + j` / `start + k`. Indexing the matrix with document
    # positions would write past its bounds whenever start > 0.
    for j in range(n_tokens):
        token_j = doc[start + j]
        # the common ancestor of token and itself is itself:
        lca_matrix[j, j] = j
        for k in range(j + 1, n_tokens):
            lca = _get_tokens_lca(token_j, doc[start + k])
            # if lca is outside of span, we set it to -1
            if not start <= lca < end:
                lca_matrix[j, k] = -1
                lca_matrix[k, j] = -1
            else:
                lca_matrix[j, k] = lca - start
                lca_matrix[k, j] = lca - start

    return lca_matrix
|
|
|
|
|
|
|
|
|
|
|
2017-10-17 17:11:13 +03:00
|
|
|
|
def pickle_doc(doc):
    """Reduce a doc to a (constructor, arguments) pair for pickling.

    The doc is serialized with `to_bytes`, leaving out the vocab and the
    user data. The user data and the three hook containers are packed
    separately with dill, and the vocab is passed along as-is.

    doc: The object to reduce.
    RETURNS (tuple): `(unpickle_doc, (vocab, dilled_hooks_and_data, bytes))`.
    """
    serialized = doc.to_bytes(vocab=False, user_data=False)
    extras = (
        doc.user_data,
        doc.user_hooks,
        doc.user_span_hooks,
        doc.user_token_hooks
    )
    constructor_args = (doc.vocab, dill.dumps(extras), serialized)
    return (unpickle_doc, constructor_args)
|
2017-10-17 17:11:13 +03:00
|
|
|
|
|
|
|
|
|
|
2017-10-17 20:44:09 +03:00
|
|
|
|
def unpickle_doc(vocab, hooks_and_data, bytes_data):
    """Rebuild a doc from the arguments produced by `pickle_doc`.

    vocab: The vocab to create the doc with.
    hooks_and_data: dill-serialized 4-tuple of user data, doc hooks,
        span hooks and token hooks.
    bytes_data: Serialized doc, as accepted by `Doc.from_bytes`.
    RETURNS: The restored doc, with the unpacked hooks merged into it.
    """
    # NOTE: dill.loads can execute arbitrary code, like any unpickling.
    unpacked = dill.loads(hooks_and_data)
    user_data, doc_hooks, span_hooks, token_hooks = unpacked
    # The user data is handed to the constructor directly.
    empty_doc = Doc(vocab, user_data=user_data)
    doc = empty_doc.from_bytes(bytes_data, exclude='user_data')
    doc.user_hooks.update(doc_hooks)
    doc.user_span_hooks.update(span_hooks)
    doc.user_token_hooks.update(token_hooks)
    return doc
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Register pickle_doc as the reduction function for Doc, with unpickle_doc as
# the matching constructor (presumably copy_reg is the stdlib copyreg /
# copy_reg module re-exported by ..compat -- confirm there).
copy_reg.pickle(Doc, pickle_doc, unpickle_doc)
|
2018-09-10 17:41:42 +03:00
|
|
|
|
|
|
|
|
|
def remove_label_if_necessary(attributes):
    """Rename the deprecated 'label' key to 'ent_type', in place.

    attributes (dict): Attribute mapping to update. If it has no 'label'
        key it is left untouched; otherwise the value is moved to
        'ent_type', replacing any value already stored there.
    """
    # More deprecated attribute handling =/
    if 'label' not in attributes:
        return
    legacy_value = attributes.pop('label')
    attributes['ent_type'] = legacy_value
|
|
|
|
|
|
|
|
|
|
def fix_attributes(doc, attributes):
    """Fill in the ENT_TYPE entry of an attribute mapping, in place.

    An explicit 'ent_type' entry takes precedence and is copied verbatim.
    Otherwise a 'label' entry is used: int labels are stored unchanged,
    any other label is first looked up in `doc.vocab.strings`. If neither
    key is present the mapping is left unchanged.

    doc: Object whose `vocab.strings` is used for the label lookup.
    attributes (dict): Attribute mapping to update.
    """
    if 'ent_type' in attributes:
        attributes[ENT_TYPE] = attributes['ent_type']
        return
    if 'label' not in attributes:
        return
    label = attributes['label']
    if not isinstance(label, int):
        label = doc.vocab.strings[label]
    attributes[ENT_TYPE] = label
|
2018-10-27 00:29:16 +03:00
|
|
|
|
|
|
|
|
|
def get_entity_info(ent_info):
    """Normalize an entity description to `(ent_type, start, end)`.

    ent_info: Either a Span (its label, start and end are used), a
        3-item sequence `(ent_type, start, end)`, or a 4-item sequence
        whose first item is ignored.
    RETURNS (tuple): `(ent_type, start, end)`.
    """
    if isinstance(ent_info, Span):
        return ent_info.label, ent_info.start, ent_info.end
    if len(ent_info) == 3:
        label, first, last = ent_info
        return label, first, last
    # Anything else must unpack into exactly four items; the leading
    # item is not part of the result.
    _, label, first, last = ent_info
    return label, first, last
|