mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 17:06:29 +03:00
Tidy up and fix formatting and imports
This commit is contained in:
parent
fefe6684cd
commit
0739ae7b76
|
@ -3,7 +3,7 @@ from __future__ import unicode_literals
|
||||||
|
|
||||||
import six
|
import six
|
||||||
import sys
|
import sys
|
||||||
import json
|
import ujson
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import cPickle as pickle
|
import cPickle as pickle
|
||||||
|
@ -28,14 +28,14 @@ if is_python2:
|
||||||
unicode_ = unicode
|
unicode_ = unicode
|
||||||
basestring_ = basestring
|
basestring_ = basestring
|
||||||
input_ = raw_input
|
input_ = raw_input
|
||||||
json_dumps = lambda data: json.dumps(data, indent=2).decode('utf8')
|
json_dumps = lambda data: ujson.dumps(data, indent=2).decode('utf8')
|
||||||
|
|
||||||
elif is_python3:
|
elif is_python3:
|
||||||
bytes_ = bytes
|
bytes_ = bytes
|
||||||
unicode_ = str
|
unicode_ = str
|
||||||
basestring_ = str
|
basestring_ = str
|
||||||
input_ = input
|
input_ = input
|
||||||
json_dumps = lambda data: json.dumps(data, indent=2)
|
json_dumps = lambda data: ujson.dumps(data, indent=2)
|
||||||
|
|
||||||
|
|
||||||
def symlink_to(orig, dest):
|
def symlink_to(orig, dest):
|
||||||
|
|
|
@ -1,3 +1,6 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from . import about
|
from . import about
|
||||||
|
|
|
@ -7,17 +7,17 @@ out of "context") is in features/extractor.pyx
|
||||||
The atomic feature names are listed in a big enum, so that the feature tuples
|
The atomic feature names are listed in a big enum, so that the feature tuples
|
||||||
can refer to them.
|
can refer to them.
|
||||||
"""
|
"""
|
||||||
from libc.string cimport memset
|
# coding: utf-8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from libc.string cimport memset
|
||||||
from itertools import combinations
|
from itertools import combinations
|
||||||
|
from cymem.cymem cimport Pool
|
||||||
|
|
||||||
from ..structs cimport TokenC
|
from ..structs cimport TokenC
|
||||||
|
|
||||||
from .stateclass cimport StateClass
|
from .stateclass cimport StateClass
|
||||||
from ._state cimport StateC
|
from ._state cimport StateC
|
||||||
|
|
||||||
from cymem.cymem cimport Pool
|
|
||||||
|
|
||||||
|
|
||||||
cdef inline void fill_token(atom_t* context, const TokenC* token) nogil:
|
cdef inline void fill_token(atom_t* context, const TokenC* token) nogil:
|
||||||
if token is NULL:
|
if token is NULL:
|
||||||
|
|
|
@ -1,29 +1,26 @@
|
||||||
# cython: profile=True
|
# cython: profile=True
|
||||||
# cython: cdivision=True
|
# cython: cdivision=True
|
||||||
# cython: infer_types=True
|
# cython: infer_types=True
|
||||||
|
# coding: utf-8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
|
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
|
||||||
|
|
||||||
import ctypes
|
import ctypes
|
||||||
import os
|
from libc.stdint cimport uint32_t
|
||||||
|
from libc.string cimport memcpy
|
||||||
from ..structs cimport TokenC
|
from cymem.cymem cimport Pool
|
||||||
|
|
||||||
|
from .stateclass cimport StateClass
|
||||||
|
from ._state cimport StateC, is_space_token
|
||||||
|
from .nonproj import PseudoProjectivity
|
||||||
|
from .nonproj import is_nonproj_tree
|
||||||
from .transition_system cimport do_func_t, get_cost_func_t
|
from .transition_system cimport do_func_t, get_cost_func_t
|
||||||
from .transition_system cimport move_cost_func_t, label_cost_func_t
|
from .transition_system cimport move_cost_func_t, label_cost_func_t
|
||||||
from ..gold cimport GoldParse
|
from ..gold cimport GoldParse
|
||||||
from ..gold cimport GoldParseC
|
from ..gold cimport GoldParseC
|
||||||
from ..attrs cimport TAG, HEAD, DEP, ENT_IOB, ENT_TYPE, IS_SPACE
|
from ..attrs cimport TAG, HEAD, DEP, ENT_IOB, ENT_TYPE, IS_SPACE
|
||||||
from ..lexeme cimport Lexeme
|
from ..lexeme cimport Lexeme
|
||||||
|
from ..structs cimport TokenC
|
||||||
from libc.stdint cimport uint32_t
|
|
||||||
from libc.string cimport memcpy
|
|
||||||
|
|
||||||
from cymem.cymem cimport Pool
|
|
||||||
from .stateclass cimport StateClass
|
|
||||||
from ._state cimport StateC, is_space_token
|
|
||||||
from .nonproj import PseudoProjectivity
|
|
||||||
from .nonproj import is_nonproj_tree
|
|
||||||
|
|
||||||
|
|
||||||
DEF NON_MONOTONIC = True
|
DEF NON_MONOTONIC = True
|
||||||
|
|
|
@ -1,50 +1,34 @@
|
||||||
|
"""
|
||||||
|
MALT-style dependency parser
|
||||||
|
"""
|
||||||
# cython: profile=True
|
# cython: profile=True
|
||||||
# cython: experimental_cpp_class_def=True
|
# cython: experimental_cpp_class_def=True
|
||||||
# cython: cdivision=True
|
# cython: cdivision=True
|
||||||
# cython: infer_types=True
|
# cython: infer_types=True
|
||||||
"""
|
# coding: utf-8
|
||||||
MALT-style dependency parser
|
|
||||||
"""
|
from __future__ import unicode_literals, print_function
|
||||||
from __future__ import unicode_literals
|
|
||||||
cimport cython
|
cimport cython
|
||||||
|
|
||||||
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
|
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
|
||||||
|
|
||||||
from libc.stdint cimport uint32_t, uint64_t
|
from libc.stdint cimport uint32_t, uint64_t
|
||||||
from libc.string cimport memset, memcpy
|
from libc.string cimport memset, memcpy
|
||||||
from libc.stdlib cimport rand
|
from libc.stdlib cimport rand
|
||||||
from libc.math cimport log, exp, isnan, isinf
|
from libc.math cimport log, exp, isnan, isinf
|
||||||
import random
|
|
||||||
import os.path
|
|
||||||
from os import path
|
|
||||||
import shutil
|
|
||||||
import json
|
|
||||||
import math
|
|
||||||
|
|
||||||
from cymem.cymem cimport Pool, Address
|
from cymem.cymem cimport Pool, Address
|
||||||
from murmurhash.mrmr cimport real_hash64 as hash64
|
from murmurhash.mrmr cimport real_hash64 as hash64
|
||||||
from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t
|
from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t
|
||||||
|
|
||||||
|
|
||||||
from util import Config
|
|
||||||
|
|
||||||
from thinc.linear.features cimport ConjunctionExtracter
|
from thinc.linear.features cimport ConjunctionExtracter
|
||||||
from thinc.structs cimport FeatureC, ExampleC
|
from thinc.structs cimport FeatureC, ExampleC
|
||||||
|
from thinc.extra.search cimport Beam, MaxViolation
|
||||||
from thinc.extra.search cimport Beam
|
|
||||||
from thinc.extra.search cimport MaxViolation
|
|
||||||
from thinc.extra.eg cimport Example
|
from thinc.extra.eg cimport Example
|
||||||
from thinc.extra.mb cimport Minibatch
|
from thinc.extra.mb cimport Minibatch
|
||||||
|
|
||||||
from ..structs cimport TokenC
|
from ..structs cimport TokenC
|
||||||
|
|
||||||
from ..tokens.doc cimport Doc
|
from ..tokens.doc cimport Doc
|
||||||
from ..strings cimport StringStore
|
from ..strings cimport StringStore
|
||||||
|
|
||||||
from .transition_system cimport TransitionSystem, Transition
|
from .transition_system cimport TransitionSystem, Transition
|
||||||
|
|
||||||
from ..gold cimport GoldParse
|
from ..gold cimport GoldParse
|
||||||
|
|
||||||
from . import _parse_features
|
from . import _parse_features
|
||||||
from ._parse_features cimport CONTEXT_SIZE
|
from ._parse_features cimport CONTEXT_SIZE
|
||||||
from ._parse_features cimport fill_context
|
from ._parse_features cimport fill_context
|
||||||
|
@ -266,4 +250,3 @@ def is_gold(StateClass state, GoldParse gold, StringStore strings):
|
||||||
id_, word, tag, head, dep, ner = gold.orig_annot[gold.cand_to_gold[i]]
|
id_, word, tag, head, dep, ner = gold.orig_annot[gold.cand_to_gold[i]]
|
||||||
truth.add((id_, head, dep))
|
truth.add((id_, head, dep))
|
||||||
return truth == predicted
|
return truth == predicted
|
||||||
|
|
||||||
|
|
|
@ -1,9 +1,14 @@
|
||||||
from spacy.parts_of_speech cimport NOUN, PROPN, PRON
|
# coding: utf-8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from ..parts_of_speech cimport NOUN, PROPN, PRON
|
||||||
|
|
||||||
|
|
||||||
def english_noun_chunks(obj):
|
def english_noun_chunks(obj):
|
||||||
'''Detect base noun phrases from a dependency parse.
|
"""
|
||||||
Works on both Doc and Span.'''
|
Detect base noun phrases from a dependency parse.
|
||||||
|
Works on both Doc and Span.
|
||||||
|
"""
|
||||||
labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj',
|
labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj',
|
||||||
'attr', 'ROOT', 'root']
|
'attr', 'ROOT', 'root']
|
||||||
doc = obj.doc # Ensure works on both Doc and Span.
|
doc = obj.doc # Ensure works on both Doc and Span.
|
||||||
|
|
|
@ -1,17 +1,16 @@
|
||||||
|
# coding: utf-8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from .transition_system cimport Transition
|
|
||||||
from .transition_system cimport do_func_t
|
|
||||||
|
|
||||||
from ..structs cimport TokenC, Entity
|
|
||||||
|
|
||||||
from thinc.typedefs cimport weight_t
|
from thinc.typedefs cimport weight_t
|
||||||
from ..gold cimport GoldParseC
|
|
||||||
from ..gold cimport GoldParse
|
|
||||||
from ..attrs cimport ENT_TYPE, ENT_IOB
|
|
||||||
|
|
||||||
from .stateclass cimport StateClass
|
from .stateclass cimport StateClass
|
||||||
from ._state cimport StateC
|
from ._state cimport StateC
|
||||||
|
from .transition_system cimport Transition
|
||||||
|
from .transition_system cimport do_func_t
|
||||||
|
from ..structs cimport TokenC, Entity
|
||||||
|
from ..gold cimport GoldParseC
|
||||||
|
from ..gold cimport GoldParse
|
||||||
|
from ..attrs cimport ENT_TYPE, ENT_IOB
|
||||||
|
|
||||||
|
|
||||||
cdef enum:
|
cdef enum:
|
||||||
|
|
|
@ -1,8 +1,9 @@
|
||||||
|
# coding: utf-8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
from copy import copy
|
from copy import copy
|
||||||
|
|
||||||
from ..tokens.doc cimport Doc
|
from ..tokens.doc cimport Doc
|
||||||
from spacy.attrs import DEP, HEAD
|
from ..attrs import DEP, HEAD
|
||||||
|
|
||||||
|
|
||||||
def ancestors(tokenid, heads):
|
def ancestors(tokenid, heads):
|
||||||
|
@ -201,5 +202,3 @@ class PseudoProjectivity:
|
||||||
filtered_sents.append(((ids,words,tags,heads,filtered_labels,iob), ctnts))
|
filtered_sents.append(((ids,words,tags,heads,filtered_labels,iob), ctnts))
|
||||||
filtered.append((raw_text, filtered_sents))
|
filtered.append((raw_text, filtered_sents))
|
||||||
return filtered
|
return filtered
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,56 +1,44 @@
|
||||||
# cython: infer_types=True
|
|
||||||
"""
|
"""
|
||||||
MALT-style dependency parser
|
MALT-style dependency parser
|
||||||
"""
|
"""
|
||||||
|
# coding: utf-8
|
||||||
|
# cython: infer_types=True
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from collections import Counter
|
||||||
|
import ujson
|
||||||
|
|
||||||
cimport cython
|
cimport cython
|
||||||
cimport cython.parallel
|
cimport cython.parallel
|
||||||
|
|
||||||
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
|
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
|
||||||
from cpython.exc cimport PyErr_CheckSignals
|
from cpython.exc cimport PyErr_CheckSignals
|
||||||
|
|
||||||
from libc.stdint cimport uint32_t, uint64_t
|
from libc.stdint cimport uint32_t, uint64_t
|
||||||
from libc.string cimport memset, memcpy
|
from libc.string cimport memset, memcpy
|
||||||
from libc.stdlib cimport malloc, calloc, free
|
from libc.stdlib cimport malloc, calloc, free
|
||||||
|
|
||||||
import os.path
|
|
||||||
from collections import Counter
|
|
||||||
from os import path
|
|
||||||
import shutil
|
|
||||||
import json
|
|
||||||
import sys
|
|
||||||
from .nonproj import PseudoProjectivity
|
|
||||||
|
|
||||||
from cymem.cymem cimport Pool, Address
|
|
||||||
from murmurhash.mrmr cimport hash64
|
|
||||||
from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t
|
from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t
|
||||||
from thinc.linear.avgtron cimport AveragedPerceptron
|
from thinc.linear.avgtron cimport AveragedPerceptron
|
||||||
from thinc.linalg cimport VecVec
|
from thinc.linalg cimport VecVec
|
||||||
from thinc.structs cimport SparseArrayC
|
from thinc.structs cimport SparseArrayC, FeatureC, ExampleC
|
||||||
|
from thinc.extra.eg cimport Example
|
||||||
|
from cymem.cymem cimport Pool, Address
|
||||||
|
from murmurhash.mrmr cimport hash64
|
||||||
from preshed.maps cimport MapStruct
|
from preshed.maps cimport MapStruct
|
||||||
from preshed.maps cimport map_get
|
from preshed.maps cimport map_get
|
||||||
|
|
||||||
from thinc.structs cimport FeatureC
|
|
||||||
from thinc.structs cimport ExampleC
|
|
||||||
from thinc.extra.eg cimport Example
|
|
||||||
|
|
||||||
from util import Config
|
|
||||||
|
|
||||||
from ..structs cimport TokenC
|
|
||||||
|
|
||||||
from ..tokens.doc cimport Doc
|
|
||||||
from ..strings cimport StringStore
|
|
||||||
|
|
||||||
from .transition_system import OracleError
|
|
||||||
from .transition_system cimport TransitionSystem, Transition
|
|
||||||
|
|
||||||
from ..gold cimport GoldParse
|
|
||||||
|
|
||||||
from . import _parse_features
|
from . import _parse_features
|
||||||
from ._parse_features cimport CONTEXT_SIZE
|
from ._parse_features cimport CONTEXT_SIZE
|
||||||
from ._parse_features cimport fill_context
|
from ._parse_features cimport fill_context
|
||||||
from .stateclass cimport StateClass
|
from .stateclass cimport StateClass
|
||||||
from ._state cimport StateC
|
from ._state cimport StateC
|
||||||
|
from .nonproj import PseudoProjectivity
|
||||||
|
from .transition_system import OracleError
|
||||||
|
from .transition_system cimport TransitionSystem, Transition
|
||||||
|
from ..structs cimport TokenC
|
||||||
|
from ..tokens.doc cimport Doc
|
||||||
|
from ..strings cimport StringStore
|
||||||
|
from ..gold cimport GoldParse
|
||||||
|
|
||||||
|
|
||||||
USE_FTRL = False
|
USE_FTRL = False
|
||||||
DEBUG = False
|
DEBUG = False
|
||||||
|
@ -80,7 +68,9 @@ cdef class ParserModel(AveragedPerceptron):
|
||||||
return nr_feat
|
return nr_feat
|
||||||
|
|
||||||
def update(self, Example eg, itn=0):
|
def update(self, Example eg, itn=0):
|
||||||
'''Does regression on negative cost. Sort of cute?'''
|
"""
|
||||||
|
Does regression on negative cost. Sort of cute?
|
||||||
|
"""
|
||||||
self.time += 1
|
self.time += 1
|
||||||
cdef int best = arg_max_if_gold(eg.c.scores, eg.c.costs, eg.c.nr_class)
|
cdef int best = arg_max_if_gold(eg.c.scores, eg.c.costs, eg.c.nr_class)
|
||||||
cdef int guess = eg.guess
|
cdef int guess = eg.guess
|
||||||
|
@ -132,10 +122,13 @@ cdef class ParserModel(AveragedPerceptron):
|
||||||
|
|
||||||
|
|
||||||
cdef class Parser:
|
cdef class Parser:
|
||||||
"""Base class of the DependencyParser and EntityRecognizer."""
|
"""
|
||||||
|
Base class of the DependencyParser and EntityRecognizer.
|
||||||
|
"""
|
||||||
@classmethod
|
@classmethod
|
||||||
def load(cls, path, Vocab vocab, TransitionSystem=None, require=False, **cfg):
|
def load(cls, path, Vocab vocab, TransitionSystem=None, require=False, **cfg):
|
||||||
"""Load the statistical model from the supplied path.
|
"""
|
||||||
|
Load the statistical model from the supplied path.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
path (Path):
|
path (Path):
|
||||||
|
@ -148,7 +141,7 @@ cdef class Parser:
|
||||||
The newly constructed object.
|
The newly constructed object.
|
||||||
"""
|
"""
|
||||||
with (path / 'config.json').open() as file_:
|
with (path / 'config.json').open() as file_:
|
||||||
cfg = json.load(file_)
|
cfg = ujson.load(file_)
|
||||||
# TODO: remove this shim when we don't have to support older data
|
# TODO: remove this shim when we don't have to support older data
|
||||||
if 'labels' in cfg and 'actions' not in cfg:
|
if 'labels' in cfg and 'actions' not in cfg:
|
||||||
cfg['actions'] = cfg.pop('labels')
|
cfg['actions'] = cfg.pop('labels')
|
||||||
|
@ -168,7 +161,8 @@ cdef class Parser:
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def __init__(self, Vocab vocab, TransitionSystem=None, ParserModel model=None, **cfg):
|
def __init__(self, Vocab vocab, TransitionSystem=None, ParserModel model=None, **cfg):
|
||||||
"""Create a Parser.
|
"""
|
||||||
|
Create a Parser.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
vocab (Vocab):
|
vocab (Vocab):
|
||||||
|
@ -198,7 +192,8 @@ cdef class Parser:
|
||||||
return (Parser, (self.vocab, self.moves, self.model), None, None)
|
return (Parser, (self.vocab, self.moves, self.model), None, None)
|
||||||
|
|
||||||
def __call__(self, Doc tokens):
|
def __call__(self, Doc tokens):
|
||||||
"""Apply the entity recognizer, setting the annotations onto the Doc object.
|
"""
|
||||||
|
Apply the entity recognizer, setting the annotations onto the Doc object.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
doc (Doc): The document to be processed.
|
doc (Doc): The document to be processed.
|
||||||
|
@ -215,7 +210,8 @@ cdef class Parser:
|
||||||
self.moves.finalize_doc(tokens)
|
self.moves.finalize_doc(tokens)
|
||||||
|
|
||||||
def pipe(self, stream, int batch_size=1000, int n_threads=2):
|
def pipe(self, stream, int batch_size=1000, int n_threads=2):
|
||||||
"""Process a stream of documents.
|
"""
|
||||||
|
Process a stream of documents.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
stream: The sequence of documents to process.
|
stream: The sequence of documents to process.
|
||||||
|
@ -303,7 +299,8 @@ cdef class Parser:
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
def update(self, Doc tokens, GoldParse gold, itn=0):
|
def update(self, Doc tokens, GoldParse gold, itn=0):
|
||||||
"""Update the statistical model.
|
"""
|
||||||
|
Update the statistical model.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
doc (Doc):
|
doc (Doc):
|
||||||
|
@ -342,7 +339,8 @@ cdef class Parser:
|
||||||
return loss
|
return loss
|
||||||
|
|
||||||
def step_through(self, Doc doc, GoldParse gold=None):
|
def step_through(self, Doc doc, GoldParse gold=None):
|
||||||
"""Set up a stepwise state, to introspect and control the transition sequence.
|
"""
|
||||||
|
Set up a stepwise state, to introspect and control the transition sequence.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
doc (Doc): The document to step through.
|
doc (Doc): The document to step through.
|
||||||
|
@ -426,7 +424,9 @@ cdef class StepwiseState:
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def costs(self):
|
def costs(self):
|
||||||
'''Find the action-costs for the current state'''
|
"""
|
||||||
|
Find the action-costs for the current state.
|
||||||
|
"""
|
||||||
self.parser.moves.set_costs(self.eg.c.is_valid, self.eg.c.costs,
|
self.parser.moves.set_costs(self.eg.c.is_valid, self.eg.c.costs,
|
||||||
self.stcls, self.gold)
|
self.stcls, self.gold)
|
||||||
costs = {}
|
costs = {}
|
||||||
|
|
|
@ -1,5 +1,9 @@
|
||||||
|
# coding: utf-8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from libc.string cimport memcpy, memset
|
from libc.string cimport memcpy, memset
|
||||||
from libc.stdint cimport uint32_t
|
from libc.stdint cimport uint32_t
|
||||||
|
|
||||||
from ..vocab cimport EMPTY_LEXEME
|
from ..vocab cimport EMPTY_LEXEME
|
||||||
from ..structs cimport Entity
|
from ..structs cimport Entity
|
||||||
from ..lexeme cimport Lexeme
|
from ..lexeme cimport Lexeme
|
||||||
|
@ -28,6 +32,6 @@ cdef class StateClass:
|
||||||
top = words[self.S(0)] + '_%d' % self.S_(0).head
|
top = words[self.S(0)] + '_%d' % self.S_(0).head
|
||||||
second = words[self.S(1)] + '_%d' % self.S_(1).head
|
second = words[self.S(1)] + '_%d' % self.S_(1).head
|
||||||
third = words[self.S(2)] + '_%d' % self.S_(2).head
|
third = words[self.S(2)] + '_%d' % self.S_(2).head
|
||||||
n0 = words[self.B(0)]
|
n0 = words[self.B(0)]
|
||||||
n1 = words[self.B(1)]
|
n1 = words[self.B(1)]
|
||||||
return ' '.join((third, second, top, '|', n0, n1))
|
return ' '.join((third, second, top, '|', n0, n1))
|
||||||
|
|
|
@ -1,4 +1,8 @@
|
||||||
# cython: infer_types=True
|
# cython: infer_types=True
|
||||||
|
# coding: utf-8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
from thinc.typedefs cimport weight_t
|
from thinc.typedefs cimport weight_t
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
|
@ -6,7 +10,6 @@ from collections import defaultdict
|
||||||
from ..structs cimport TokenC
|
from ..structs cimport TokenC
|
||||||
from .stateclass cimport StateClass
|
from .stateclass cimport StateClass
|
||||||
from ..attrs cimport TAG, HEAD, DEP, ENT_TYPE, ENT_IOB
|
from ..attrs cimport TAG, HEAD, DEP, ENT_TYPE, ENT_IOB
|
||||||
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
|
|
||||||
|
|
||||||
|
|
||||||
cdef weight_t MIN_SCORE = -90000
|
cdef weight_t MIN_SCORE = -90000
|
||||||
|
|
|
@ -1,18 +0,0 @@
|
||||||
from os import path
|
|
||||||
import json
|
|
||||||
|
|
||||||
class Config(object):
|
|
||||||
def __init__(self, **kwargs):
|
|
||||||
for key, value in kwargs.items():
|
|
||||||
setattr(self, key, value)
|
|
||||||
|
|
||||||
def get(self, attr, default=None):
|
|
||||||
return self.__dict__.get(attr, default)
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def write(cls, model_dir, name, **kwargs):
|
|
||||||
open(path.join(model_dir, '%s.json' % name), 'w').write(json.dumps(kwargs))
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def read(cls, model_dir, name):
|
|
||||||
return cls(**json.load(open(path.join(model_dir, '%s.json' % name))))
|
|
|
@ -1,15 +1,18 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
cimport cython
|
cimport cython
|
||||||
|
cimport numpy as np
|
||||||
|
import numpy
|
||||||
|
import numpy.linalg
|
||||||
|
import struct
|
||||||
|
|
||||||
from libc.string cimport memcpy, memset
|
from libc.string cimport memcpy, memset
|
||||||
from libc.stdint cimport uint32_t
|
from libc.stdint cimport uint32_t
|
||||||
from libc.math cimport sqrt
|
from libc.math cimport sqrt
|
||||||
|
|
||||||
import numpy
|
from .span cimport Span
|
||||||
import numpy.linalg
|
from .token cimport Token
|
||||||
import struct
|
|
||||||
cimport numpy as np
|
|
||||||
import six
|
|
||||||
import warnings
|
|
||||||
|
|
||||||
from ..lexeme cimport Lexeme
|
from ..lexeme cimport Lexeme
|
||||||
from ..lexeme cimport EMPTY_LEXEME
|
from ..lexeme cimport EMPTY_LEXEME
|
||||||
from ..typedefs cimport attr_t, flags_t
|
from ..typedefs cimport attr_t, flags_t
|
||||||
|
@ -19,11 +22,10 @@ from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
|
||||||
from ..parts_of_speech cimport CCONJ, PUNCT, NOUN
|
from ..parts_of_speech cimport CCONJ, PUNCT, NOUN
|
||||||
from ..parts_of_speech cimport univ_pos_t
|
from ..parts_of_speech cimport univ_pos_t
|
||||||
from ..lexeme cimport Lexeme
|
from ..lexeme cimport Lexeme
|
||||||
from .span cimport Span
|
|
||||||
from .token cimport Token
|
|
||||||
from ..serialize.bits cimport BitArray
|
from ..serialize.bits cimport BitArray
|
||||||
from ..util import normalize_slice
|
from ..util import normalize_slice
|
||||||
from ..syntax.iterators import CHUNKERS
|
from ..syntax.iterators import CHUNKERS
|
||||||
|
from ..compat import is_config
|
||||||
|
|
||||||
|
|
||||||
DEF PADDING = 5
|
DEF PADDING = 5
|
||||||
|
@ -76,7 +78,7 @@ cdef class Doc:
|
||||||
|
|
||||||
"""
|
"""
|
||||||
def __init__(self, Vocab vocab, words=None, spaces=None, orths_and_spaces=None):
|
def __init__(self, Vocab vocab, words=None, spaces=None, orths_and_spaces=None):
|
||||||
'''
|
"""
|
||||||
Create a Doc object.
|
Create a Doc object.
|
||||||
|
|
||||||
Aside: Implementation
|
Aside: Implementation
|
||||||
|
@ -97,7 +99,7 @@ cdef class Doc:
|
||||||
A list of boolean values, of the same length as words. True
|
A list of boolean values, of the same length as words. True
|
||||||
means that the word is followed by a space, False means it is not.
|
means that the word is followed by a space, False means it is not.
|
||||||
If None, defaults to [True]*len(words)
|
If None, defaults to [True]*len(words)
|
||||||
'''
|
"""
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
size = 20
|
size = 20
|
||||||
self.mem = Pool()
|
self.mem = Pool()
|
||||||
|
@ -158,7 +160,7 @@ cdef class Doc:
|
||||||
self.is_parsed = True
|
self.is_parsed = True
|
||||||
|
|
||||||
def __getitem__(self, object i):
|
def __getitem__(self, object i):
|
||||||
'''
|
"""
|
||||||
doc[i]
|
doc[i]
|
||||||
Get the Token object at position i, where i is an integer.
|
Get the Token object at position i, where i is an integer.
|
||||||
Negative indexing is supported, and follows the usual Python
|
Negative indexing is supported, and follows the usual Python
|
||||||
|
@ -172,7 +174,7 @@ cdef class Doc:
|
||||||
are not supported, as `Span` objects must be contiguous (cannot have gaps).
|
are not supported, as `Span` objects must be contiguous (cannot have gaps).
|
||||||
You can use negative indices and open-ended ranges, which have their
|
You can use negative indices and open-ended ranges, which have their
|
||||||
normal Python semantics.
|
normal Python semantics.
|
||||||
'''
|
"""
|
||||||
if isinstance(i, slice):
|
if isinstance(i, slice):
|
||||||
start, stop = normalize_slice(len(self), i.start, i.stop, i.step)
|
start, stop = normalize_slice(len(self), i.start, i.stop, i.step)
|
||||||
return Span(self, start, stop, label=0)
|
return Span(self, start, stop, label=0)
|
||||||
|
@ -186,7 +188,7 @@ cdef class Doc:
|
||||||
return Token.cinit(self.vocab, &self.c[i], i, self)
|
return Token.cinit(self.vocab, &self.c[i], i, self)
|
||||||
|
|
||||||
def __iter__(self):
|
def __iter__(self):
|
||||||
'''
|
"""
|
||||||
for token in doc
|
for token in doc
|
||||||
Iterate over `Token` objects, from which the annotations can
|
Iterate over `Token` objects, from which the annotations can
|
||||||
be easily accessed. This is the main way of accessing Token
|
be easily accessed. This is the main way of accessing Token
|
||||||
|
@ -194,7 +196,7 @@ cdef class Doc:
|
||||||
Python. If faster-than-Python speeds are required, you can
|
Python. If faster-than-Python speeds are required, you can
|
||||||
instead access the annotations as a numpy array, or access the
|
instead access the annotations as a numpy array, or access the
|
||||||
underlying C data directly from Cython.
|
underlying C data directly from Cython.
|
||||||
'''
|
"""
|
||||||
cdef int i
|
cdef int i
|
||||||
for i in range(self.length):
|
for i in range(self.length):
|
||||||
if self._py_tokens[i] is not None:
|
if self._py_tokens[i] is not None:
|
||||||
|
@ -203,10 +205,10 @@ cdef class Doc:
|
||||||
yield Token.cinit(self.vocab, &self.c[i], i, self)
|
yield Token.cinit(self.vocab, &self.c[i], i, self)
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
'''
|
"""
|
||||||
len(doc)
|
len(doc)
|
||||||
The number of tokens in the document.
|
The number of tokens in the document.
|
||||||
'''
|
"""
|
||||||
return self.length
|
return self.length
|
||||||
|
|
||||||
def __unicode__(self):
|
def __unicode__(self):
|
||||||
|
@ -216,7 +218,7 @@ cdef class Doc:
|
||||||
return u''.join([t.text_with_ws for t in self]).encode('utf-8')
|
return u''.join([t.text_with_ws for t in self]).encode('utf-8')
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
if six.PY3:
|
if is_config(python3=True):
|
||||||
return self.__unicode__()
|
return self.__unicode__()
|
||||||
return self.__bytes__()
|
return self.__bytes__()
|
||||||
|
|
||||||
|
@ -228,7 +230,8 @@ cdef class Doc:
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def similarity(self, other):
|
def similarity(self, other):
|
||||||
'''Make a semantic similarity estimate. The default estimate is cosine
|
"""
|
||||||
|
Make a semantic similarity estimate. The default estimate is cosine
|
||||||
similarity using an average of word vectors.
|
similarity using an average of word vectors.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
|
@ -237,7 +240,7 @@ cdef class Doc:
|
||||||
|
|
||||||
Return:
|
Return:
|
||||||
score (float): A scalar similarity score. Higher is more similar.
|
score (float): A scalar similarity score. Higher is more similar.
|
||||||
'''
|
"""
|
||||||
if 'similarity' in self.user_hooks:
|
if 'similarity' in self.user_hooks:
|
||||||
return self.user_hooks['similarity'](self, other)
|
return self.user_hooks['similarity'](self, other)
|
||||||
if self.vector_norm == 0 or other.vector_norm == 0:
|
if self.vector_norm == 0 or other.vector_norm == 0:
|
||||||
|
@ -245,9 +248,9 @@ cdef class Doc:
|
||||||
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
|
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
|
||||||
|
|
||||||
property has_vector:
|
property has_vector:
|
||||||
'''
|
"""
|
||||||
A boolean value indicating whether a word vector is associated with the object.
|
A boolean value indicating whether a word vector is associated with the object.
|
||||||
'''
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
if 'has_vector' in self.user_hooks:
|
if 'has_vector' in self.user_hooks:
|
||||||
return self.user_hooks['has_vector'](self)
|
return self.user_hooks['has_vector'](self)
|
||||||
|
@ -255,11 +258,11 @@ cdef class Doc:
|
||||||
return any(token.has_vector for token in self)
|
return any(token.has_vector for token in self)
|
||||||
|
|
||||||
property vector:
|
property vector:
|
||||||
'''
|
"""
|
||||||
A real-valued meaning representation. Defaults to an average of the token vectors.
|
A real-valued meaning representation. Defaults to an average of the token vectors.
|
||||||
|
|
||||||
Type: numpy.ndarray[ndim=1, dtype='float32']
|
Type: numpy.ndarray[ndim=1, dtype='float32']
|
||||||
'''
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
if 'vector' in self.user_hooks:
|
if 'vector' in self.user_hooks:
|
||||||
return self.user_hooks['vector'](self)
|
return self.user_hooks['vector'](self)
|
||||||
|
@ -294,17 +297,21 @@ cdef class Doc:
|
||||||
return self.text
|
return self.text
|
||||||
|
|
||||||
property text:
|
property text:
|
||||||
'''A unicode representation of the document text.'''
|
"""
|
||||||
|
A unicode representation of the document text.
|
||||||
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return u''.join(t.text_with_ws for t in self)
|
return u''.join(t.text_with_ws for t in self)
|
||||||
|
|
||||||
property text_with_ws:
|
property text_with_ws:
|
||||||
'''An alias of Doc.text, provided for duck-type compatibility with Span and Token.'''
|
"""
|
||||||
|
An alias of Doc.text, provided for duck-type compatibility with Span and Token.
|
||||||
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.text
|
return self.text
|
||||||
|
|
||||||
property ents:
|
property ents:
|
||||||
'''
|
"""
|
||||||
Yields named-entity `Span` objects, if the entity recognizer
|
Yields named-entity `Span` objects, if the entity recognizer
|
||||||
has been applied to the document. Iterate over the span to get
|
has been applied to the document. Iterate over the span to get
|
||||||
individual Token objects, or access the label:
|
individual Token objects, or access the label:
|
||||||
|
@ -318,7 +325,7 @@ cdef class Doc:
|
||||||
assert ents[0].label_ == 'PERSON'
|
assert ents[0].label_ == 'PERSON'
|
||||||
assert ents[0].orth_ == 'Best'
|
assert ents[0].orth_ == 'Best'
|
||||||
assert ents[0].text == 'Mr. Best'
|
assert ents[0].text == 'Mr. Best'
|
||||||
'''
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
cdef int i
|
cdef int i
|
||||||
cdef const TokenC* token
|
cdef const TokenC* token
|
||||||
|
@ -382,13 +389,13 @@ cdef class Doc:
|
||||||
self.c[start].ent_iob = 3
|
self.c[start].ent_iob = 3
|
||||||
|
|
||||||
property noun_chunks:
|
property noun_chunks:
|
||||||
'''
|
"""
|
||||||
Yields base noun-phrase #[code Span] objects, if the document
|
Yields base noun-phrase #[code Span] objects, if the document
|
||||||
has been syntactically parsed. A base noun phrase, or
|
has been syntactically parsed. A base noun phrase, or
|
||||||
'NP chunk', is a noun phrase that does not permit other NPs to
|
'NP chunk', is a noun phrase that does not permit other NPs to
|
||||||
be nested within it – so no NP-level coordination, no prepositional
|
be nested within it – so no NP-level coordination, no prepositional
|
||||||
phrases, and no relative clauses. For example:
|
phrases, and no relative clauses.
|
||||||
'''
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
if not self.is_parsed:
|
if not self.is_parsed:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
|
@ -496,7 +503,8 @@ cdef class Doc:
|
||||||
return output
|
return output
|
||||||
|
|
||||||
def count_by(self, attr_id_t attr_id, exclude=None, PreshCounter counts=None):
|
def count_by(self, attr_id_t attr_id, exclude=None, PreshCounter counts=None):
|
||||||
"""Produce a dict of {attribute (int): count (ints)} frequencies, keyed
|
"""
|
||||||
|
Produce a dict of {attribute (int): count (ints)} frequencies, keyed
|
||||||
by the values of the given attribute ID.
|
by the values of the given attribute ID.
|
||||||
|
|
||||||
Example:
|
Example:
|
||||||
|
@ -563,8 +571,9 @@ cdef class Doc:
|
||||||
self.c[i] = parsed[i]
|
self.c[i] = parsed[i]
|
||||||
|
|
||||||
def from_array(self, attrs, array):
|
def from_array(self, attrs, array):
|
||||||
'''Write to a `Doc` object, from an `(M, N)` array of attributes.
|
"""
|
||||||
'''
|
Write to a `Doc` object, from an `(M, N)` array of attributes.
|
||||||
|
"""
|
||||||
cdef int i, col
|
cdef int i, col
|
||||||
cdef attr_id_t attr_id
|
cdef attr_id_t attr_id
|
||||||
cdef TokenC* tokens = self.c
|
cdef TokenC* tokens = self.c
|
||||||
|
@ -603,19 +612,23 @@ cdef class Doc:
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def to_bytes(self):
|
def to_bytes(self):
|
||||||
'''Serialize, producing a byte string.'''
|
"""
|
||||||
|
Serialize, producing a byte string.
|
||||||
|
"""
|
||||||
byte_string = self.vocab.serializer.pack(self)
|
byte_string = self.vocab.serializer.pack(self)
|
||||||
cdef uint32_t length = len(byte_string)
|
cdef uint32_t length = len(byte_string)
|
||||||
return struct.pack('I', length) + byte_string
|
return struct.pack('I', length) + byte_string
|
||||||
|
|
||||||
def from_bytes(self, data):
|
def from_bytes(self, data):
|
||||||
'''Deserialize, loading from bytes.'''
|
"""
|
||||||
|
Deserialize, loading from bytes.
|
||||||
|
"""
|
||||||
self.vocab.serializer.unpack_into(data[4:], self)
|
self.vocab.serializer.unpack_into(data[4:], self)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def read_bytes(file_):
|
def read_bytes(file_):
|
||||||
'''
|
"""
|
||||||
A static method, used to read serialized #[code Doc] objects from
|
A static method, used to read serialized #[code Doc] objects from
|
||||||
a file. For example:
|
a file. For example:
|
||||||
|
|
||||||
|
@ -630,7 +643,7 @@ cdef class Doc:
|
||||||
for byte_string in Doc.read_bytes(file_):
|
for byte_string in Doc.read_bytes(file_):
|
||||||
docs.append(Doc(nlp.vocab).from_bytes(byte_string))
|
docs.append(Doc(nlp.vocab).from_bytes(byte_string))
|
||||||
assert len(docs) == 2
|
assert len(docs) == 2
|
||||||
'''
|
"""
|
||||||
keep_reading = True
|
keep_reading = True
|
||||||
while keep_reading:
|
while keep_reading:
|
||||||
try:
|
try:
|
||||||
|
@ -644,7 +657,8 @@ cdef class Doc:
|
||||||
yield n_bytes_str + data
|
yield n_bytes_str + data
|
||||||
|
|
||||||
def merge(self, int start_idx, int end_idx, *args, **attributes):
|
def merge(self, int start_idx, int end_idx, *args, **attributes):
|
||||||
"""Retokenize the document, such that the span at doc.text[start_idx : end_idx]
|
"""
|
||||||
|
Retokenize the document, such that the span at doc.text[start_idx : end_idx]
|
||||||
is merged into a single token. If start_idx and end_idx do not mark start
|
is merged into a single token. If start_idx and end_idx do not mark start
|
||||||
and end token boundaries, the document remains unchanged.
|
and end token boundaries, the document remains unchanged.
|
||||||
|
|
||||||
|
@ -658,7 +672,6 @@ cdef class Doc:
|
||||||
token (Token):
|
token (Token):
|
||||||
The newly merged token, or None if the start and end indices did
|
The newly merged token, or None if the start and end indices did
|
||||||
not fall at token boundaries.
|
not fall at token boundaries.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
cdef unicode tag, lemma, ent_type
|
cdef unicode tag, lemma, ent_type
|
||||||
if len(args) == 3:
|
if len(args) == 3:
|
||||||
|
|
|
@ -1,26 +1,31 @@
|
||||||
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
|
|
||||||
|
cimport numpy as np
|
||||||
import numpy
|
import numpy
|
||||||
import numpy.linalg
|
import numpy.linalg
|
||||||
cimport numpy as np
|
|
||||||
from libc.math cimport sqrt
|
from libc.math cimport sqrt
|
||||||
import six
|
|
||||||
|
|
||||||
|
from .doc cimport token_by_start, token_by_end
|
||||||
from ..structs cimport TokenC, LexemeC
|
from ..structs cimport TokenC, LexemeC
|
||||||
from ..typedefs cimport flags_t, attr_t, hash_t
|
from ..typedefs cimport flags_t, attr_t, hash_t
|
||||||
from ..attrs cimport attr_id_t
|
from ..attrs cimport attr_id_t
|
||||||
from ..parts_of_speech cimport univ_pos_t
|
from ..parts_of_speech cimport univ_pos_t
|
||||||
from ..util import normalize_slice
|
from ..util import normalize_slice
|
||||||
from .doc cimport token_by_start, token_by_end
|
|
||||||
from ..attrs cimport IS_PUNCT, IS_SPACE
|
from ..attrs cimport IS_PUNCT, IS_SPACE
|
||||||
from ..lexeme cimport Lexeme
|
from ..lexeme cimport Lexeme
|
||||||
|
from ..compat import is_config
|
||||||
|
|
||||||
|
|
||||||
cdef class Span:
|
cdef class Span:
|
||||||
"""A slice from a Doc object."""
|
"""
|
||||||
|
A slice from a Doc object.
|
||||||
|
"""
|
||||||
def __cinit__(self, Doc doc, int start, int end, int label=0, vector=None,
|
def __cinit__(self, Doc doc, int start, int end, int label=0, vector=None,
|
||||||
vector_norm=None):
|
vector_norm=None):
|
||||||
'''Create a Span object from the slice doc[start : end]
|
"""
|
||||||
|
Create a Span object from the slice doc[start : end]
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
doc (Doc): The parent document.
|
doc (Doc): The parent document.
|
||||||
|
@ -30,7 +35,7 @@ cdef class Span:
|
||||||
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span.
|
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span.
|
||||||
Returns:
|
Returns:
|
||||||
Span The newly constructed object.
|
Span The newly constructed object.
|
||||||
'''
|
"""
|
||||||
if not (0 <= start <= end <= len(doc)):
|
if not (0 <= start <= end <= len(doc)):
|
||||||
raise IndexError
|
raise IndexError
|
||||||
|
|
||||||
|
@ -68,7 +73,7 @@ cdef class Span:
|
||||||
return self.end - self.start
|
return self.end - self.start
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
if six.PY3:
|
if is_config(python3=True):
|
||||||
return self.text
|
return self.text
|
||||||
return self.text.encode('utf-8')
|
return self.text.encode('utf-8')
|
||||||
|
|
||||||
|
@ -89,7 +94,8 @@ cdef class Span:
|
||||||
yield self.doc[i]
|
yield self.doc[i]
|
||||||
|
|
||||||
def merge(self, *args, **attributes):
|
def merge(self, *args, **attributes):
|
||||||
"""Retokenize the document, such that the span is merged into a single token.
|
"""
|
||||||
|
Retokenize the document, such that the span is merged into a single token.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
**attributes:
|
**attributes:
|
||||||
|
@ -102,7 +108,8 @@ cdef class Span:
|
||||||
return self.doc.merge(self.start_char, self.end_char, *args, **attributes)
|
return self.doc.merge(self.start_char, self.end_char, *args, **attributes)
|
||||||
|
|
||||||
def similarity(self, other):
|
def similarity(self, other):
|
||||||
'''Make a semantic similarity estimate. The default estimate is cosine
|
"""
|
||||||
|
Make a semantic similarity estimate. The default estimate is cosine
|
||||||
similarity using an average of word vectors.
|
similarity using an average of word vectors.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
|
@ -111,7 +118,7 @@ cdef class Span:
|
||||||
|
|
||||||
Return:
|
Return:
|
||||||
score (float): A scalar similarity score. Higher is more similar.
|
score (float): A scalar similarity score. Higher is more similar.
|
||||||
'''
|
"""
|
||||||
if 'similarity' in self.doc.user_span_hooks:
|
if 'similarity' in self.doc.user_span_hooks:
|
||||||
self.doc.user_span_hooks['similarity'](self, other)
|
self.doc.user_span_hooks['similarity'](self, other)
|
||||||
if self.vector_norm == 0.0 or other.vector_norm == 0.0:
|
if self.vector_norm == 0.0 or other.vector_norm == 0.0:
|
||||||
|
@ -133,11 +140,12 @@ cdef class Span:
|
||||||
self.end = end + 1
|
self.end = end + 1
|
||||||
|
|
||||||
property sent:
|
property sent:
|
||||||
'''The sentence span that this span is a part of.
|
"""
|
||||||
|
The sentence span that this span is a part of.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Span The sentence this is part of.
|
Span The sentence this is part of.
|
||||||
'''
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
if 'sent' in self.doc.user_span_hooks:
|
if 'sent' in self.doc.user_span_hooks:
|
||||||
return self.doc.user_span_hooks['sent'](self)
|
return self.doc.user_span_hooks['sent'](self)
|
||||||
|
@ -198,13 +206,13 @@ cdef class Span:
|
||||||
return u''.join([t.text_with_ws for t in self])
|
return u''.join([t.text_with_ws for t in self])
|
||||||
|
|
||||||
property noun_chunks:
|
property noun_chunks:
|
||||||
'''
|
"""
|
||||||
Yields base noun-phrase #[code Span] objects, if the document
|
Yields base noun-phrase #[code Span] objects, if the document
|
||||||
has been syntactically parsed. A base noun phrase, or
|
has been syntactically parsed. A base noun phrase, or
|
||||||
'NP chunk', is a noun phrase that does not permit other NPs to
|
'NP chunk', is a noun phrase that does not permit other NPs to
|
||||||
be nested within it – so no NP-level coordination, no prepositional
|
be nested within it – so no NP-level coordination, no prepositional
|
||||||
phrases, and no relative clauses. For example:
|
phrases, and no relative clauses. For example:
|
||||||
'''
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
if not self.doc.is_parsed:
|
if not self.doc.is_parsed:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
|
@ -223,17 +231,16 @@ cdef class Span:
|
||||||
yield span
|
yield span
|
||||||
|
|
||||||
property root:
|
property root:
|
||||||
"""The token within the span that's highest in the parse tree. If there's a tie, the earlist is prefered.
|
"""
|
||||||
|
The token within the span that's highest in the parse tree. If there's a
|
||||||
|
tie, the earliest is preferred.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Token: The root token.
|
Token: The root token.
|
||||||
|
|
||||||
i.e. has the
|
i.e. has the shortest path to the root of the sentence (or is the root
|
||||||
shortest path to the root of the sentence (or is the root itself).
|
itself). If multiple words are equally high in the tree, the first word
|
||||||
|
is taken. For example:
|
||||||
If multiple words are equally high in the tree, the first word is taken.
|
|
||||||
|
|
||||||
For example:
|
|
||||||
|
|
||||||
>>> toks = nlp(u'I like New York in Autumn.')
|
>>> toks = nlp(u'I like New York in Autumn.')
|
||||||
|
|
||||||
|
@ -303,7 +310,8 @@ cdef class Span:
|
||||||
return self.doc[root]
|
return self.doc[root]
|
||||||
|
|
||||||
property lefts:
|
property lefts:
|
||||||
"""Tokens that are to the left of the span, whose head is within the Span.
|
"""
|
||||||
|
Tokens that are to the left of the span, whose head is within the Span.
|
||||||
|
|
||||||
Yields: Token A left-child of a token of the span.
|
Yields: Token A left-child of a token of the span.
|
||||||
"""
|
"""
|
||||||
|
@ -314,7 +322,8 @@ cdef class Span:
|
||||||
yield left
|
yield left
|
||||||
|
|
||||||
property rights:
|
property rights:
|
||||||
"""Tokens that are to the right of the Span, whose head is within the Span.
|
"""
|
||||||
|
Tokens that are to the right of the Span, whose head is within the Span.
|
||||||
|
|
||||||
Yields: Token A right-child of a token of the span.
|
Yields: Token A right-child of a token of the span.
|
||||||
"""
|
"""
|
||||||
|
@ -325,7 +334,8 @@ cdef class Span:
|
||||||
yield right
|
yield right
|
||||||
|
|
||||||
property subtree:
|
property subtree:
|
||||||
"""Tokens that descend from tokens in the span, but fall outside it.
|
"""
|
||||||
|
Tokens that descend from tokens in the span, but fall outside it.
|
||||||
|
|
||||||
Yields: Token A descendant of a token within the span.
|
Yields: Token A descendant of a token within the span.
|
||||||
"""
|
"""
|
||||||
|
@ -337,7 +347,9 @@ cdef class Span:
|
||||||
yield from word.subtree
|
yield from word.subtree
|
||||||
|
|
||||||
property ent_id:
|
property ent_id:
|
||||||
'''An (integer) entity ID. Usually assigned by patterns in the Matcher.'''
|
"""
|
||||||
|
An (integer) entity ID. Usually assigned by patterns in the Matcher.
|
||||||
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.root.ent_id
|
return self.root.ent_id
|
||||||
|
|
||||||
|
@ -345,9 +357,11 @@ cdef class Span:
|
||||||
# TODO
|
# TODO
|
||||||
raise NotImplementedError(
|
raise NotImplementedError(
|
||||||
"Can't yet set ent_id from Span. Vote for this feature on the issue "
|
"Can't yet set ent_id from Span. Vote for this feature on the issue "
|
||||||
"tracker: http://github.com/spacy-io/spaCy")
|
"tracker: http://github.com/explosion/spaCy/issues")
|
||||||
property ent_id_:
|
property ent_id_:
|
||||||
'''A (string) entity ID. Usually assigned by patterns in the Matcher.'''
|
"""
|
||||||
|
A (string) entity ID. Usually assigned by patterns in the Matcher.
|
||||||
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.root.ent_id_
|
return self.root.ent_id_
|
||||||
|
|
||||||
|
@ -355,7 +369,7 @@ cdef class Span:
|
||||||
# TODO
|
# TODO
|
||||||
raise NotImplementedError(
|
raise NotImplementedError(
|
||||||
"Can't yet set ent_id_ from Span. Vote for this feature on the issue "
|
"Can't yet set ent_id_ from Span. Vote for this feature on the issue "
|
||||||
"tracker: http://github.com/spacy-io/spaCy")
|
"tracker: http://github.com/explosion/spaCy/issues")
|
||||||
|
|
||||||
property orth_:
|
property orth_:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
|
@ -397,5 +411,5 @@ cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1:
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
"Array bounds exceeded while searching for root word. This likely "
|
"Array bounds exceeded while searching for root word. This likely "
|
||||||
"means the parse tree is in an invalid state. Please report this "
|
"means the parse tree is in an invalid state. Please report this "
|
||||||
"issue here: http://github.com/honnibal/spaCy/")
|
"issue here: http://github.com/explosion/spaCy/issues")
|
||||||
return n
|
return n
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
# coding: utf8
|
|
||||||
# cython: infer_types=True
|
# cython: infer_types=True
|
||||||
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from libc.string cimport memcpy
|
from libc.string cimport memcpy
|
||||||
|
@ -8,20 +8,15 @@ from cpython.mem cimport PyMem_Malloc, PyMem_Free
|
||||||
from cython.view cimport array as cvarray
|
from cython.view cimport array as cvarray
|
||||||
cimport numpy as np
|
cimport numpy as np
|
||||||
np.import_array()
|
np.import_array()
|
||||||
|
|
||||||
import numpy
|
import numpy
|
||||||
import six
|
|
||||||
|
|
||||||
|
|
||||||
from ..typedefs cimport hash_t
|
from ..typedefs cimport hash_t
|
||||||
from ..lexeme cimport Lexeme
|
from ..lexeme cimport Lexeme
|
||||||
from .. import parts_of_speech
|
from .. import parts_of_speech
|
||||||
|
|
||||||
from ..attrs cimport LEMMA
|
from ..attrs cimport LEMMA
|
||||||
from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
|
from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
|
||||||
from ..attrs cimport POS, LEMMA, TAG, DEP
|
from ..attrs cimport POS, LEMMA, TAG, DEP
|
||||||
from ..parts_of_speech cimport CCONJ, PUNCT
|
from ..parts_of_speech cimport CCONJ, PUNCT
|
||||||
|
|
||||||
from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
|
from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
|
||||||
from ..attrs cimport IS_BRACKET
|
from ..attrs cimport IS_BRACKET
|
||||||
from ..attrs cimport IS_QUOTE
|
from ..attrs cimport IS_QUOTE
|
||||||
|
@ -29,12 +24,13 @@ from ..attrs cimport IS_LEFT_PUNCT
|
||||||
from ..attrs cimport IS_RIGHT_PUNCT
|
from ..attrs cimport IS_RIGHT_PUNCT
|
||||||
from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
|
from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
|
||||||
from ..attrs cimport IS_OOV
|
from ..attrs cimport IS_OOV
|
||||||
|
|
||||||
from ..lexeme cimport Lexeme
|
from ..lexeme cimport Lexeme
|
||||||
|
from ..compat import is_config
|
||||||
|
|
||||||
|
|
||||||
cdef class Token:
|
cdef class Token:
|
||||||
"""An individual token --- i.e. a word, punctuation symbol, whitespace, etc.
|
"""
|
||||||
|
An individual token --- i.e. a word, punctuation symbol, whitespace, etc.
|
||||||
"""
|
"""
|
||||||
def __cinit__(self, Vocab vocab, Doc doc, int offset):
|
def __cinit__(self, Vocab vocab, Doc doc, int offset):
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
|
@ -46,7 +42,9 @@ cdef class Token:
|
||||||
return hash((self.doc, self.i))
|
return hash((self.doc, self.i))
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
'''Number of unicode characters in token.text'''
|
"""
|
||||||
|
Number of unicode characters in token.text.
|
||||||
|
"""
|
||||||
return self.c.lex.length
|
return self.c.lex.length
|
||||||
|
|
||||||
def __unicode__(self):
|
def __unicode__(self):
|
||||||
|
@ -56,7 +54,7 @@ cdef class Token:
|
||||||
return self.text.encode('utf8')
|
return self.text.encode('utf8')
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
if six.PY3:
|
if is_config(python3=True):
|
||||||
return self.__unicode__()
|
return self.__unicode__()
|
||||||
return self.__bytes__()
|
return self.__bytes__()
|
||||||
|
|
||||||
|
@ -83,27 +81,30 @@ cdef class Token:
|
||||||
raise ValueError(op)
|
raise ValueError(op)
|
||||||
|
|
||||||
cpdef bint check_flag(self, attr_id_t flag_id) except -1:
|
cpdef bint check_flag(self, attr_id_t flag_id) except -1:
|
||||||
'''Check the value of a boolean flag.
|
"""
|
||||||
|
Check the value of a boolean flag.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
flag_id (int): The ID of the flag attribute.
|
flag_id (int): The ID of the flag attribute.
|
||||||
Returns:
|
Returns:
|
||||||
is_set (bool): Whether the flag is set.
|
is_set (bool): Whether the flag is set.
|
||||||
'''
|
"""
|
||||||
return Lexeme.c_check_flag(self.c.lex, flag_id)
|
return Lexeme.c_check_flag(self.c.lex, flag_id)
|
||||||
|
|
||||||
def nbor(self, int i=1):
|
def nbor(self, int i=1):
|
||||||
'''Get a neighboring token.
|
"""
|
||||||
|
Get a neighboring token.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
i (int): The relative position of the token to get. Defaults to 1.
|
i (int): The relative position of the token to get. Defaults to 1.
|
||||||
Returns:
|
Returns:
|
||||||
neighbor (Token): The token at position self.doc[self.i+i]
|
neighbor (Token): The token at position self.doc[self.i+i]
|
||||||
'''
|
"""
|
||||||
return self.doc[self.i+i]
|
return self.doc[self.i+i]
|
||||||
|
|
||||||
def similarity(self, other):
|
def similarity(self, other):
|
||||||
'''Compute a semantic similarity estimate. Defaults to cosine over vectors.
|
"""
|
||||||
|
Compute a semantic similarity estimate. Defaults to cosine over vectors.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
other:
|
other:
|
||||||
|
@ -111,7 +112,7 @@ cdef class Token:
|
||||||
Token and Lexeme objects.
|
Token and Lexeme objects.
|
||||||
Returns:
|
Returns:
|
||||||
score (float): A scalar similarity score. Higher is more similar.
|
score (float): A scalar similarity score. Higher is more similar.
|
||||||
'''
|
"""
|
||||||
if 'similarity' in self.doc.user_token_hooks:
|
if 'similarity' in self.doc.user_token_hooks:
|
||||||
return self.doc.user_token_hooks['similarity'](self)
|
return self.doc.user_token_hooks['similarity'](self)
|
||||||
if self.vector_norm == 0 or other.vector_norm == 0:
|
if self.vector_norm == 0 or other.vector_norm == 0:
|
||||||
|
@ -209,9 +210,9 @@ cdef class Token:
|
||||||
self.c.dep = label
|
self.c.dep = label
|
||||||
|
|
||||||
property has_vector:
|
property has_vector:
|
||||||
'''
|
"""
|
||||||
A boolean value indicating whether a word vector is associated with the object.
|
A boolean value indicating whether a word vector is associated with the object.
|
||||||
'''
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
if 'has_vector' in self.doc.user_token_hooks:
|
if 'has_vector' in self.doc.user_token_hooks:
|
||||||
return self.doc.user_token_hooks['has_vector'](self)
|
return self.doc.user_token_hooks['has_vector'](self)
|
||||||
|
@ -223,11 +224,11 @@ cdef class Token:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
property vector:
|
property vector:
|
||||||
'''
|
"""
|
||||||
A real-valued meaning representation.
|
A real-valued meaning representation.
|
||||||
|
|
||||||
Type: numpy.ndarray[ndim=1, dtype='float32']
|
Type: numpy.ndarray[ndim=1, dtype='float32']
|
||||||
'''
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
if 'vector' in self.doc.user_token_hooks:
|
if 'vector' in self.doc.user_token_hooks:
|
||||||
return self.doc.user_token_hooks['vector'](self)
|
return self.doc.user_token_hooks['vector'](self)
|
||||||
|
@ -245,6 +246,7 @@ cdef class Token:
|
||||||
property repvec:
|
property repvec:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
raise AttributeError("repvec was renamed to vector in v0.100")
|
raise AttributeError("repvec was renamed to vector in v0.100")
|
||||||
|
|
||||||
property has_repvec:
|
property has_repvec:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
raise AttributeError("has_repvec was renamed to has_vector in v0.100")
|
raise AttributeError("has_repvec was renamed to has_vector in v0.100")
|
||||||
|
@ -265,7 +267,8 @@ cdef class Token:
|
||||||
|
|
||||||
property lefts:
|
property lefts:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
"""The leftward immediate children of the word, in the syntactic
|
"""
|
||||||
|
The leftward immediate children of the word, in the syntactic
|
||||||
dependency parse.
|
dependency parse.
|
||||||
"""
|
"""
|
||||||
cdef int nr_iter = 0
|
cdef int nr_iter = 0
|
||||||
|
@ -282,8 +285,10 @@ cdef class Token:
|
||||||
|
|
||||||
property rights:
|
property rights:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
"""The rightward immediate children of the word, in the syntactic
|
"""
|
||||||
dependency parse."""
|
The rightward immediate children of the word, in the syntactic
|
||||||
|
dependency parse.
|
||||||
|
"""
|
||||||
cdef const TokenC* ptr = self.c + (self.c.r_edge - self.i)
|
cdef const TokenC* ptr = self.c + (self.c.r_edge - self.i)
|
||||||
tokens = []
|
tokens = []
|
||||||
cdef int nr_iter = 0
|
cdef int nr_iter = 0
|
||||||
|
@ -300,19 +305,21 @@ cdef class Token:
|
||||||
yield t
|
yield t
|
||||||
|
|
||||||
property children:
|
property children:
|
||||||
'''A sequence of the token's immediate syntactic children.
|
"""
|
||||||
|
A sequence of the token's immediate syntactic children.
|
||||||
|
|
||||||
Yields: Token A child token such that child.head==self
|
Yields: Token A child token such that child.head==self
|
||||||
'''
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
yield from self.lefts
|
yield from self.lefts
|
||||||
yield from self.rights
|
yield from self.rights
|
||||||
|
|
||||||
property subtree:
|
property subtree:
|
||||||
'''A sequence of all the token's syntactic descendents.
|
"""
|
||||||
|
A sequence of all the token's syntactic descendents.
|
||||||
|
|
||||||
Yields: Token A descendent token such that self.is_ancestor(descendent)
|
Yields: Token A descendent token such that self.is_ancestor(descendent)
|
||||||
'''
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
for word in self.lefts:
|
for word in self.lefts:
|
||||||
yield from word.subtree
|
yield from word.subtree
|
||||||
|
@ -321,26 +328,29 @@ cdef class Token:
|
||||||
yield from word.subtree
|
yield from word.subtree
|
||||||
|
|
||||||
property left_edge:
|
property left_edge:
|
||||||
'''The leftmost token of this token's syntactic descendents.
|
"""
|
||||||
|
The leftmost token of this token's syntactic descendents.
|
||||||
|
|
||||||
Returns: Token The first token such that self.is_ancestor(token)
|
Returns: Token The first token such that self.is_ancestor(token)
|
||||||
'''
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.doc[self.c.l_edge]
|
return self.doc[self.c.l_edge]
|
||||||
|
|
||||||
property right_edge:
|
property right_edge:
|
||||||
'''The rightmost token of this token's syntactic descendents.
|
"""
|
||||||
|
The rightmost token of this token's syntactic descendents.
|
||||||
|
|
||||||
Returns: Token The last token such that self.is_ancestor(token)
|
Returns: Token The last token such that self.is_ancestor(token)
|
||||||
'''
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.doc[self.c.r_edge]
|
return self.doc[self.c.r_edge]
|
||||||
|
|
||||||
property ancestors:
|
property ancestors:
|
||||||
'''A sequence of this token's syntactic ancestors.
|
"""
|
||||||
|
A sequence of this token's syntactic ancestors.
|
||||||
|
|
||||||
Yields: Token A sequence of ancestor tokens such that ancestor.is_ancestor(self)
|
Yields: Token A sequence of ancestor tokens such that ancestor.is_ancestor(self)
|
||||||
'''
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
cdef const TokenC* head_ptr = self.c
|
cdef const TokenC* head_ptr = self.c
|
||||||
# guard against infinite loop, no token can have
|
# guard against infinite loop, no token can have
|
||||||
|
@ -356,25 +366,29 @@ cdef class Token:
|
||||||
return self.is_ancestor(descendant)
|
return self.is_ancestor(descendant)
|
||||||
|
|
||||||
def is_ancestor(self, descendant):
|
def is_ancestor(self, descendant):
|
||||||
'''Check whether this token is a parent, grandparent, etc. of another
|
"""
|
||||||
|
Check whether this token is a parent, grandparent, etc. of another
|
||||||
in the dependency tree.
|
in the dependency tree.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
descendant (Token): Another token.
|
descendant (Token): Another token.
|
||||||
Returns:
|
Returns:
|
||||||
is_ancestor (bool): Whether this token is the ancestor of the descendant.
|
is_ancestor (bool): Whether this token is the ancestor of the descendant.
|
||||||
'''
|
"""
|
||||||
if self.doc is not descendant.doc:
|
if self.doc is not descendant.doc:
|
||||||
return False
|
return False
|
||||||
return any( ancestor.i == self.i for ancestor in descendant.ancestors )
|
return any( ancestor.i == self.i for ancestor in descendant.ancestors )
|
||||||
|
|
||||||
property head:
|
property head:
|
||||||
'''The syntactic parent, or "governor", of this token.
|
"""
|
||||||
|
The syntactic parent, or "governor", of this token.
|
||||||
|
|
||||||
Returns: Token
|
Returns: Token
|
||||||
'''
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
"""The token predicted by the parser to be the head of the current token."""
|
"""
|
||||||
|
The token predicted by the parser to be the head of the current token.
|
||||||
|
"""
|
||||||
return self.doc[self.i + self.c.head]
|
return self.doc[self.i + self.c.head]
|
||||||
def __set__(self, Token new_head):
|
def __set__(self, Token new_head):
|
||||||
# this function sets the head of self to new_head
|
# this function sets the head of self to new_head
|
||||||
|
@ -467,10 +481,11 @@ cdef class Token:
|
||||||
self.c.head = rel_newhead_i
|
self.c.head = rel_newhead_i
|
||||||
|
|
||||||
property conjuncts:
|
property conjuncts:
|
||||||
'''A sequence of coordinated tokens, including the token itself.
|
"""
|
||||||
|
A sequence of coordinated tokens, including the token itself.
|
||||||
|
|
||||||
Yields: Token A coordinated token
|
Yields: Token A coordinated token
|
||||||
'''
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
"""Get a list of conjoined words."""
|
"""Get a list of conjoined words."""
|
||||||
cdef Token word
|
cdef Token word
|
||||||
|
@ -501,7 +516,9 @@ cdef class Token:
|
||||||
return iob_strings[self.c.ent_iob]
|
return iob_strings[self.c.ent_iob]
|
||||||
|
|
||||||
property ent_id:
|
property ent_id:
|
||||||
'''An (integer) entity ID. Usually assigned by patterns in the Matcher.'''
|
"""
|
||||||
|
An (integer) entity ID. Usually assigned by patterns in the Matcher.
|
||||||
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.c.ent_id
|
return self.c.ent_id
|
||||||
|
|
||||||
|
@ -509,7 +526,9 @@ cdef class Token:
|
||||||
self.c.ent_id = key
|
self.c.ent_id = key
|
||||||
|
|
||||||
property ent_id_:
|
property ent_id_:
|
||||||
'''A (string) entity ID. Usually assigned by patterns in the Matcher.'''
|
"""
|
||||||
|
A (string) entity ID. Usually assigned by patterns in the Matcher.
|
||||||
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.vocab.strings[self.c.ent_id]
|
return self.vocab.strings[self.c.ent_id]
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user