Mirror of https://github.com/explosion/spaCy.git

Commit: d96e72f656 ("Tidy up rest")
Parent: a8e10f94e4
@@ -8,11 +8,9 @@ from thinc.t2t import ExtractWindow, ParametricAttention
 from thinc.t2v import Pooling, sum_pool
 from thinc.misc import Residual
 from thinc.misc import LayerNorm as LN
-
 from thinc.api import add, layerize, chain, clone, concatenate, with_flatten
 from thinc.api import FeatureExtracter, with_getitem, flatten_add_lengths
 from thinc.api import uniqued, wrap, noop
-
 from thinc.linear.linear import LinearModel
 from thinc.neural.ops import NumpyOps, CupyOps
 from thinc.neural.util import get_array_module
@@ -101,17 +101,12 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
     """
     Normalize a dictionary of attributes, converting them to ints.
 
-    Arguments:
-        stringy_attrs (dict):
-            Dictionary keyed by attribute string names. Values can be ints or strings.
-
-        strings_map (StringStore):
-            Defaults to None. If provided, encodes string values into ints.
-
-    Returns:
-        inty_attrs (dict):
-            Attributes dictionary with keys and optionally values converted to
-            ints.
+    stringy_attrs (dict): Dictionary keyed by attribute string names. Values
+        can be ints or strings.
+    strings_map (StringStore): Defaults to None. If provided, encodes string
+        values into ints.
+    RETURNS (dict): Attributes dictionary with keys and optionally values
+        converted to ints.
     """
     inty_attrs = {}
     if _do_deprecated:
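
For context on the docstring reworded above, a minimal sketch of how intify_attrs can be called (illustrative only, not part of this commit; the example attribute values are assumptions):

    from spacy.attrs import intify_attrs
    from spacy.strings import StringStore

    strings = StringStore()
    # Keys are converted to integer attribute IDs; string values are encoded
    # through the provided StringStore.
    inty = intify_attrs({'ORTH': 'hello', 'LEMMA': 'hello'}, strings_map=strings)
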
@@ -2,7 +2,6 @@
 # coding: utf8
 from __future__ import unicode_literals, print_function
 
-import io
 import re
 import ujson
 import random
@@ -10,9 +9,8 @@ import cytoolz
 import itertools
 
 from .syntax import nonproj
-from .util import ensure_path
-from . import util
 from .tokens import Doc
+from . import util
 
 
 def tags_to_entities(tags):
@@ -310,7 +308,7 @@ def _corrupt(c, noise_level):
 
 
 def read_json_file(loc, docs_filter=None, limit=None):
-    loc = ensure_path(loc)
+    loc = util.ensure_path(loc)
     if loc.is_dir():
         for filename in loc.iterdir():
             yield from read_json_file(loc / filename, limit=limit)
@@ -1,22 +1,22 @@
 # coding: utf8
 from __future__ import absolute_import, unicode_literals
-from contextlib import contextmanager
-import copy
 
-from thinc.neural import Model
-from thinc.neural.optimizers import Adam
 import random
 import ujson
-from collections import OrderedDict
 import itertools
 import weakref
 import functools
+from collections import OrderedDict
+from contextlib import contextmanager
+from copy import copy
+from thinc.neural import Model
+from thinc.neural.optimizers import Adam
 
 from .tokenizer import Tokenizer
 from .vocab import Vocab
 from .lemmatizer import Lemmatizer
-from .pipeline import DependencyParser, Tensorizer, Tagger
-from .pipeline import EntityRecognizer, SimilarityHook, TextCategorizer
+from .pipeline import DependencyParser, Tensorizer, Tagger, EntityRecognizer
+from .pipeline import SimilarityHook, TextCategorizer
 from .compat import json_dumps, izip
 from .scorer import Scorer
 from ._ml import link_vectors_to_models
@@ -649,7 +649,7 @@ class Language(object):
         serializers = OrderedDict((
             ('vocab', lambda: self.vocab.to_bytes()),
             ('tokenizer', lambda: self.tokenizer.to_bytes(vocab=False)),
-            ('meta', lambda: ujson.dumps(self.meta))
+            ('meta', lambda: json_dumps(self.meta))
         ))
         for i, (name, proc) in enumerate(self.pipeline):
             if name in disable:
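
The serializers dict above backs Language.to_bytes(). A rough round-trip sketch, assuming a bare pipeline (illustrative only, not part of this commit):

    from spacy.language import Language
    from spacy.vocab import Vocab

    nlp = Language(Vocab())
    data = nlp.to_bytes()      # vocab, tokenizer, meta and pipeline components
    nlp2 = Language(Vocab())
    nlp2.from_bytes(data)      # restore state from the serialized form
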
@@ -689,7 +689,7 @@ class DisabledPipes(list):
         # Important! Not deep copy -- we just want the container (but we also
         # want to support people providing arbitrarily typed nlp.pipeline
         # objects.)
-        self.original_pipeline = copy.copy(nlp.pipeline)
+        self.original_pipeline = copy(nlp.pipeline)
         list.__init__(self)
         self.extend(nlp.remove_pipe(name) for name in names)
 
@@ -4,12 +4,6 @@
 from __future__ import unicode_literals
 
 import ujson
-
-from .typedefs cimport attr_t
-from .typedefs cimport hash_t
-from .attrs cimport attr_id_t
-from .structs cimport TokenC
-
 from cymem.cymem cimport Pool
 from preshed.maps cimport PreshMap
 from libcpp.vector cimport vector
@@ -17,14 +11,15 @@ from libcpp.pair cimport pair
 from murmurhash.mrmr cimport hash64
 from libc.stdint cimport int32_t
 
-from .attrs cimport ID, NULL_ATTR, ENT_TYPE
-from . import attrs
-from .tokens.doc cimport get_token_attr
-from .tokens.doc cimport Doc
+from .typedefs cimport attr_t
+from .typedefs cimport hash_t
+from .structs cimport TokenC
+from .tokens.doc cimport Doc, get_token_attr
 from .vocab cimport Vocab
 
+from .attrs import IDS
+from .attrs cimport attr_id_t, ID, NULL_ATTR
 from .attrs import FLAG61 as U_ENT
 
 from .attrs import FLAG60 as B2_ENT
 from .attrs import FLAG59 as B3_ENT
 from .attrs import FLAG58 as B4_ENT
@@ -34,7 +29,6 @@ from .attrs import FLAG55 as B7_ENT
 from .attrs import FLAG54 as B8_ENT
 from .attrs import FLAG53 as B9_ENT
 from .attrs import FLAG52 as B10_ENT
-
 from .attrs import FLAG51 as I3_ENT
 from .attrs import FLAG50 as I4_ENT
 from .attrs import FLAG49 as I5_ENT
@@ -43,7 +37,6 @@ from .attrs import FLAG47 as I7_ENT
 from .attrs import FLAG46 as I8_ENT
 from .attrs import FLAG45 as I9_ENT
 from .attrs import FLAG44 as I10_ENT
-
 from .attrs import FLAG43 as L2_ENT
 from .attrs import FLAG42 as L3_ENT
 from .attrs import FLAG41 as L4_ENT
@@ -168,10 +161,10 @@ def _convert_strings(token_specs, string_store):
                 if value in operators:
                     ops = operators[value]
                 else:
-                    raise KeyError(
-                        "Unknown operator '%s'. Options: %s" % (value, ', '.join(operators.keys())))
+                    msg = "Unknown operator '%s'. Options: %s"
+                    raise KeyError(msg % (value, ', '.join(operators.keys())))
             if isinstance(attr, basestring):
-                attr = attrs.IDS.get(attr.upper())
+                attr = IDS.get(attr.upper())
             if isinstance(value, basestring):
                 value = string_store.add(value)
             if isinstance(value, bool):
@@ -233,13 +226,13 @@ cdef class Matcher:
         return self._normalize_key(key) in self._patterns
 
     def add(self, key, on_match, *patterns):
-        """Add a match-rule to the matcher. A match-rule consists of: an ID key,
-        an on_match callback, and one or more patterns.
+        """Add a match-rule to the matcher. A match-rule consists of: an ID
+        key, an on_match callback, and one or more patterns.
 
         If the key exists, the patterns are appended to the previous ones, and
-        the previous on_match callback is replaced. The `on_match` callback will
-        receive the arguments `(matcher, doc, i, matches)`. You can also set
-        `on_match` to `None` to not perform any actions.
+        the previous on_match callback is replaced. The `on_match` callback
+        will receive the arguments `(matcher, doc, i, matches)`. You can also
+        set `on_match` to `None` to not perform any actions.
 
         A pattern consists of one or more `token_specs`, where a `token_spec`
         is a dictionary mapping attribute IDs to values, and optionally a
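
As context for the Matcher.add docstring reflowed above, a minimal usage sketch (illustrative only; assumes a loaded pipeline `nlp` and token attribute names as used in the spaCy docs):

    from spacy.matcher import Matcher

    matcher = Matcher(nlp.vocab)
    pattern = [{'LOWER': 'hello'}, {'IS_PUNCT': True}, {'LOWER': 'world'}]
    matcher.add('HelloWorld', None, pattern)   # on_match=None: no callback
    doc = nlp(u'Hello, world!')
    matches = matcher(doc)                     # list of (key, start, end) tuples
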
@@ -253,8 +246,8 @@ cdef class Matcher:
         The + and * operators are usually interpretted "greedily", i.e. longer
         matches are returned where possible. However, if you specify two '+'
         and '*' patterns in a row and their matches overlap, the first
-        operator will behave non-greedily. This quirk in the semantics
-        makes the matcher more efficient, by avoiding the need for back-tracking.
+        operator will behave non-greedily. This quirk in the semantics makes
+        the matcher more efficient, by avoiding the need for back-tracking.
 
         key (unicode): The match ID.
         on_match (callable): Callback executed on match.
@@ -268,7 +261,6 @@ cdef class Matcher:
         key = self._normalize_key(key)
         self._patterns.setdefault(key, [])
         self._callbacks[key] = on_match
-
         for pattern in patterns:
             specs = _convert_strings(pattern, self.vocab.strings)
             self.patterns.push_back(init_pattern(self.mem, key, specs))
@@ -315,9 +307,9 @@ cdef class Matcher:
         """Match a stream of documents, yielding them in turn.
 
         docs (iterable): A stream of documents.
-        batch_size (int): The number of documents to accumulate into a working set.
+        batch_size (int): Number of documents to accumulate into a working set.
         n_threads (int): The number of threads with which to work on the buffer
-            in parallel, if the `Matcher` implementation supports multi-threading.
+            in parallel, if the implementation supports multi-threading.
         YIELDS (Doc): Documents, in order.
         """
         for doc in docs:
@@ -325,7 +317,7 @@ cdef class Matcher:
             yield doc
 
     def __call__(self, Doc doc):
-        """Find all token sequences matching the supplied patterns on the `Doc`.
+        """Find all token sequences matching the supplied pattern.
 
         doc (Doc): The document to match over.
         RETURNS (list): A list of `(key, start, end)` tuples,
@@ -342,8 +334,8 @@ cdef class Matcher:
         for token_i in range(doc.length):
             token = &doc.c[token_i]
             q = 0
-            # Go over the open matches, extending or finalizing if able. Otherwise,
-            # we over-write them (q doesn't advance)
+            # Go over the open matches, extending or finalizing if able.
+            # Otherwise, we over-write them (q doesn't advance)
             for state in partials:
                 action = get_action(state.second, token)
                 if action == PANIC:
@@ -356,8 +348,8 @@ cdef class Matcher:
 
                 if action == REPEAT:
                     # Leave the state in the queue, and advance to next slot
-                    # (i.e. we don't overwrite -- we want to greedily match more
-                    # pattern.
+                    # (i.e. we don't overwrite -- we want to greedily match
+                    # more pattern.
                     q += 1
                 elif action == REJECT:
                     pass
@@ -366,8 +358,8 @@ cdef class Matcher:
                     partials[q].second += 1
                     q += 1
                 elif action in (ACCEPT, ACCEPT_PREV):
-                    # TODO: What to do about patterns starting with ZERO? Need to
-                    # adjust the start position.
+                    # TODO: What to do about patterns starting with ZERO? Need
+                    # to adjust the start position.
                     start = state.first
                     end = token_i+1 if action == ACCEPT else token_i
                     ent_id = state.second[1].attrs[0].value
@@ -388,8 +380,8 @@ cdef class Matcher:
                     state.second = pattern
                     partials.push_back(state)
                 elif action == ADVANCE:
-                    # TODO: What to do about patterns starting with ZERO? Need to
-                    # adjust the start position.
+                    # TODO: What to do about patterns starting with ZERO? Need
+                    # to adjust the start position.
                     state.first = token_i
                     state.second = pattern + 1
                     partials.push_back(state)
@@ -413,7 +405,6 @@ cdef class Matcher:
             on_match = self._callbacks.get(ent_id)
             if on_match is not None:
                 on_match(self, doc, i, matches)
-        # TODO: only return (match_id, start, end)
         return matches
 
     def _normalize_key(self, key):
@@ -441,7 +432,8 @@ def get_bilou(length):
     elif length == 8:
         return [B8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, L8_ENT]
     elif length == 9:
-        return [B9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, L9_ENT]
+        return [B9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT,
+                L9_ENT]
     elif length == 10:
         return [B10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT,
                 I10_ENT, I10_ENT, L10_ENT]
@@ -454,10 +446,8 @@ cdef class PhraseMatcher:
     cdef Vocab vocab
     cdef Matcher matcher
    cdef PreshMap phrase_ids
-
     cdef int max_length
     cdef attr_t* _phrase_key
-
     cdef public object _callbacks
     cdef public object _patterns
 
@@ -470,7 +460,8 @@ cdef class PhraseMatcher:
         self.phrase_ids = PreshMap()
         abstract_patterns = []
         for length in range(1, max_length):
-            abstract_patterns.append([{tag: True} for tag in get_bilou(length)])
+            abstract_patterns.append([{tag: True}
+                                      for tag in get_bilou(length)])
         self.matcher.add('Candidate', None, *abstract_patterns)
         self._callbacks = {}
 
@@ -496,8 +487,8 @@ cdef class PhraseMatcher:
         return (self.__class__, (self.vocab,), None, None)
 
     def add(self, key, on_match, *docs):
-        """Add a match-rule to the matcher. A match-rule consists of: an ID key,
-        an on_match callback, and one or more patterns.
+        """Add a match-rule to the matcher. A match-rule consists of: an ID
+        key, an on_match callback, and one or more patterns.
 
         key (unicode): The match ID.
         on_match (callable): Callback executed on match.
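
For the PhraseMatcher.add docstring reflowed above, a minimal usage sketch (illustrative only; assumes a loaded pipeline `nlp`; each pattern here is a Doc object):

    from spacy.matcher import PhraseMatcher

    matcher = PhraseMatcher(nlp.vocab)
    terms = [u'machine learning', u'deep learning']
    matcher.add('TECH', None, *[nlp(term) for term in terms])
    matches = matcher(nlp(u'I like machine learning.'))
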
@@ -513,7 +504,6 @@ cdef class PhraseMatcher:
             raise ValueError(msg % (len(doc), self.max_length))
         cdef hash_t ent_id = self.matcher._normalize_key(key)
         self._callbacks[ent_id] = on_match
-
         cdef int length
         cdef int i
         cdef hash_t phrase_hash
@@ -553,9 +543,9 @@ cdef class PhraseMatcher:
         """Match a stream of documents, yielding them in turn.
 
         docs (iterable): A stream of documents.
-        batch_size (int): The number of documents to accumulate into a working set.
+        batch_size (int): Number of documents to accumulate into a working set.
         n_threads (int): The number of threads with which to work on the buffer
-            in parallel, if the `Matcher` implementation supports multi-threading.
+            in parallel, if the implementation supports multi-threading.
         YIELDS (Doc): Documents, in order.
         """
         for doc in stream:
@@ -569,7 +559,8 @@ cdef class PhraseMatcher:
             self._phrase_key[i] = 0
         for i, j in enumerate(range(start, end)):
             self._phrase_key[i] = doc.c[j].lex.orth
-        cdef hash_t key = hash64(self._phrase_key, self.max_length * sizeof(attr_t), 0)
+        cdef hash_t key = hash64(self._phrase_key,
+                                 self.max_length * sizeof(attr_t), 0)
         ent_id = <hash_t>self.phrase_ids.get(key)
         if ent_id == 0:
             return None
@@ -4,17 +4,15 @@ from __future__ import unicode_literals
 
 from libc.string cimport memset
 
-from .parts_of_speech cimport ADJ, VERB, NOUN, PUNCT, SPACE
 from .attrs cimport POS, IS_SPACE
+from .attrs import LEMMA, intify_attrs
+from .parts_of_speech cimport SPACE
 from .parts_of_speech import IDS as POS_IDS
 from .lexeme cimport Lexeme
-from .attrs import LEMMA, intify_attrs
 
 
 def _normalize_props(props):
-    """
-    Transform deprecated string keys to correct names.
-    """
+    """Transform deprecated string keys to correct names."""
     out = {}
     for key, value in props.items():
         if key == POS:
@@ -77,7 +75,8 @@ cdef class Morphology:
     cdef int assign_untagged(self, TokenC* token) except -1:
         """Set morphological attributes on a token without a POS tag. Uses
         the lemmatizer's lookup() method, which looks up the string in the
-        table provided by the language data as lemma_lookup (if available)."""
+        table provided by the language data as lemma_lookup (if available).
+        """
         if token.lemma == 0:
             orth_str = self.strings[token.lex.orth]
             lemma = self.lemmatizer.lookup(orth_str)
@@ -95,11 +94,10 @@ cdef class Morphology:
     cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1:
         if tag_id > self.n_tags:
             raise ValueError("Unknown tag ID: %s" % tag_id)
-        # TODO: It's pretty arbitrary to put this logic here. I guess the justification
-        # is that this is where the specific word and the tag interact. Still,
-        # we should have a better way to enforce this rule, or figure out why
-        # the statistical model fails.
-        # Related to Issue #220
+        # TODO: It's pretty arbitrary to put this logic here. I guess the
+        # justification is that this is where the specific word and the tag
+        # interact. Still, we should have a better way to enforce this rule, or
+        # figure out why the statistical model fails. Related to Issue #220
         if Lexeme.c_check_flag(token.lex, IS_SPACE):
             tag_id = self.reverse_index[self.strings.add('_SP')]
         rich_tag = self.rich_tags[tag_id]
@@ -123,12 +121,11 @@ cdef class Morphology:
         else:
             flags[0] &= ~(one << flag_id)
 
-    def add_special_case(self, unicode tag_str, unicode orth_str, attrs, force=False):
-        """
-        Add a special-case rule to the morphological analyser. Tokens whose
+    def add_special_case(self, unicode tag_str, unicode orth_str, attrs,
+                         force=False):
+        """Add a special-case rule to the morphological analyser. Tokens whose
         tag and orth match the rule will receive the specified properties.
 
-        Arguments:
         tag (unicode): The part-of-speech tag to key the exception.
         orth (unicode): The word-form to key the exception.
         """
@@ -144,10 +141,9 @@ cdef class Morphology:
         elif force:
             memset(cached, 0, sizeof(cached[0]))
         else:
-            msg = ("Conflicting morphology exception for (%s, %s). Use force=True "
-                   "to overwrite.")
-            msg = msg % (tag_str, orth_str)
-            raise ValueError(msg)
+            raise ValueError(
+                "Conflicting morphology exception for (%s, %s). Use "
+                "force=True to overwrite." % (tag_str, orth_str))
 
         cached.tag = rich_tag
         # TODO: Refactor this to take arbitrary attributes.
@@ -85,7 +85,6 @@ class Scorer(object):
 
     def score(self, tokens, gold, verbose=False, punct_labels=('p', 'punct')):
         assert len(tokens) == len(gold)
-
         gold_deps = set()
         gold_tags = set()
         gold_ents = set(tags_to_entities([annot[-1]
@@ -4,19 +4,15 @@ from __future__ import unicode_literals, absolute_import
 
 cimport cython
 from libc.string cimport memcpy
-from libc.stdint cimport uint64_t, uint32_t
-from murmurhash.mrmr cimport hash64, hash32
-from preshed.maps cimport map_iter, key_t
 from libc.stdint cimport uint32_t
+from murmurhash.mrmr cimport hash64, hash32
 import ujson
-import dill
 
 from .symbols import IDS as SYMBOLS_BY_STR
 from .symbols import NAMES as SYMBOLS_BY_INT
 
 from .typedefs cimport hash_t
-from . import util
 from .compat import json_dumps
+from . import util
 
 
 cpdef hash_t hash_string(unicode string) except 0:
@@ -195,7 +191,7 @@ cdef class StringStore:
         """Save the current state to a directory.
 
         path (unicode or Path): A path to a directory, which will be created if
-            it doesn't exist. Paths may be either strings or `Path`-like objects.
+            it doesn't exist. Paths may be either strings or Path-like objects.
         """
         path = util.ensure_path(path)
         strings = list(self)
@@ -225,7 +221,7 @@ cdef class StringStore:
         **exclude: Named attributes to prevent from being serialized.
         RETURNS (bytes): The serialized form of the `StringStore` object.
         """
-        return ujson.dumps(list(self))
+        return json_dumps(list(self))
 
     def from_bytes(self, bytes_data, **exclude):
         """Load state from a binary string.
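
The to_bytes()/from_bytes() pair touched above serializes the StringStore contents. A brief sketch (illustrative only; the example strings are assumptions):

    from spacy.strings import StringStore

    stringstore = StringStore([u'apple', u'orange'])
    data = stringstore.to_bytes()
    new_store = StringStore()
    new_store.from_bytes(data)
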
@@ -1,8 +1,8 @@
 # coding: utf8
 #cython: optimize.unpack_method_calls=False
 
 from __future__ import unicode_literals
-
+
 IDS = {
     "": NIL,
     "IS_ALPHA": IS_ALPHA,
@@ -464,9 +464,11 @@ IDS = {
     "LAW": LAW
 }
 
+
 def sort_nums(x):
     return x[1]
 
+
 NAMES = [it[0] for it in sorted(IDS.items(), key=sort_nums)]
 # Unfortunate hack here, to work around problem with long cpdef enum
 # (which is generating an enormous amount of C++ in Cython 0.24+)
@@ -8,12 +8,11 @@ from cython.operator cimport preincrement as preinc
 from cymem.cymem cimport Pool
 from preshed.maps cimport PreshMap
 import regex as re
 
-from .strings cimport hash_string
-from . import util
 cimport cython
 
 from .tokens.doc cimport Doc
+from .strings cimport hash_string
+from . import util
 
 
 cdef class Tokenizer:
@@ -74,9 +73,8 @@ cdef class Tokenizer:
         RETURNS (Doc): A container for linguistic annotations.
         """
         if len(string) >= (2 ** 30):
-            raise ValueError(
-                "String is too long: %d characters. Max is 2**30." % len(string)
-            )
+            msg = "String is too long: %d characters. Max is 2**30."
+            raise ValueError(msg % len(string))
         cdef int length = len(string)
         cdef Doc doc = Doc(self.vocab)
         if length == 0:
@@ -122,8 +120,8 @@ cdef class Tokenizer:
         """Tokenize a stream of texts.
 
         texts: A sequence of unicode texts.
-        batch_size (int): The number of texts to accumulate in an internal buffer.
-        n_threads (int): The number of threads to use, if the implementation
+        batch_size (int): Number of texts to accumulate in an internal buffer.
+        n_threads (int): Number of threads to use, if the implementation
             supports multi-threading. The default tokenizer is single-threaded.
         YIELDS (Doc): A sequence of Doc objects, in order.
         """
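
For the Tokenizer.pipe docstring tightened above, a minimal usage sketch (illustrative only; assumes a loaded pipeline `nlp`, whose tokenizer exposes pipe()):

    texts = [u'First text.', u'Second text.']
    for doc in nlp.tokenizer.pipe(texts, batch_size=1000, n_threads=2):
        print([token.text for token in doc])
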
@@ -232,8 +230,8 @@ cdef class Tokenizer:
         if not matches:
             tokens.push_back(self.vocab.get(tokens.mem, string), False)
         else:
-            # let's say we have dyn-o-mite-dave
-            # the regex finds the start and end positions of the hyphens
+            # let's say we have dyn-o-mite-dave - the regex finds the
+            # start and end positions of the hyphens
             start = 0
             for match in matches:
                 infix_start = match.start()
@@ -293,8 +291,8 @@ cdef class Tokenizer:
         return list(self.infix_finditer(string))
 
     def find_prefix(self, unicode string):
-        """Find the length of a prefix that should be segmented from the string,
-        or None if no prefix rules match.
+        """Find the length of a prefix that should be segmented from the
+        string, or None if no prefix rules match.
 
         string (unicode): The string to segment.
         RETURNS (int): The length of the prefix if present, otherwise `None`.
@@ -305,8 +303,8 @@ cdef class Tokenizer:
         return (match.end() - match.start()) if match is not None else 0
 
     def find_suffix(self, unicode string):
-        """Find the length of a suffix that should be segmented from the string,
-        or None if no suffix rules match.
+        """Find the length of a suffix that should be segmented from the
+        string, or None if no suffix rules match.
 
         string (unicode): The string to segment.
         Returns (int): The length of the suffix if present, otherwise `None`.
@@ -326,8 +324,8 @@ cdef class Tokenizer:
 
         string (unicode): The string to specially tokenize.
         token_attrs (iterable): A sequence of dicts, where each dict describes
-            a token and its attributes. The `ORTH` fields of the attributes must
-            exactly match the string when they are concatenated.
+            a token and its attributes. The `ORTH` fields of the attributes
+            must exactly match the string when they are concatenated.
         """
         substrings = list(substrings)
         cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
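
For the add_special_case docstring reflowed above, a usage sketch along the lines of the spaCy documentation (illustrative only; assumes a loaded English pipeline `nlp`):

    from spacy.attrs import ORTH, LEMMA

    nlp.tokenizer.add_special_case(u'gimme',
        [{ORTH: u'gim', LEMMA: u'give'}, {ORTH: u'me'}])
    doc = nlp(u'gimme that')   # tokenized as [u'gim', u'me', u'that']
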
@@ -343,7 +341,7 @@ cdef class Tokenizer:
         """Save the current state to a directory.
 
         path (unicode or Path): A path to a directory, which will be created if
-            it doesn't exist. Paths may be either strings or `Path`-like objects.
+            it doesn't exist. Paths may be either strings or Path-like objects.
         """
         with path.open('wb') as file_:
             file_.write(self.to_bytes(**exclude))
@@ -476,7 +476,7 @@ cdef class Span:
         """
         # TODO: implement
         def __get__(self):
-            raise NotImplementedError()
+            raise NotImplementedError
 
     property n_rights:
         """RETURNS (int): The number of rightward immediate children of the
|
@ -484,7 +484,7 @@ cdef class Span:
|
||||||
"""
|
"""
|
||||||
# TODO: implement
|
# TODO: implement
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
raise NotImplementedError()
|
raise NotImplementedError
|
||||||
|
|
||||||
property subtree:
|
property subtree:
|
||||||
"""Tokens that descend from tokens in the span, but fall outside it.
|
"""Tokens that descend from tokens in the span, but fall outside it.
|
||||||
|
|
|
@@ -1 +0,0 @@
-
@@ -17,8 +17,8 @@ from .compat import copy_reg, basestring_
 from .lemmatizer import Lemmatizer
 from .attrs import intify_attrs
 from .vectors import Vectors
-from . import util
 from ._ml import link_vectors_to_models
+from . import util
 
 
 cdef class Vocab: