mirror of https://github.com/explosion/spaCy.git (synced 2024-12-25 01:16:28 +03:00)

Tidy up rest

parent a8e10f94e4
commit d96e72f656
@@ -8,11 +8,9 @@ from thinc.t2t import ExtractWindow, ParametricAttention
from thinc.t2v import Pooling, sum_pool
from thinc.misc import Residual
from thinc.misc import LayerNorm as LN

from thinc.api import add, layerize, chain, clone, concatenate, with_flatten
from thinc.api import FeatureExtracter, with_getitem, flatten_add_lengths
from thinc.api import uniqued, wrap, noop

from thinc.linear.linear import LinearModel
from thinc.neural.ops import NumpyOps, CupyOps
from thinc.neural.util import get_array_module

@@ -101,17 +101,12 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
"""
Normalize a dictionary of attributes, converting them to ints.

Arguments:
stringy_attrs (dict):
Dictionary keyed by attribute string names. Values can be ints or strings.

strings_map (StringStore):
Defaults to None. If provided, encodes string values into ints.

Returns:
inty_attrs (dict):
Attributes dictionary with keys and optionally values converted to
ints.
stringy_attrs (dict): Dictionary keyed by attribute string names. Values
can be ints or strings.
strings_map (StringStore): Defaults to None. If provided, encodes string
values into ints.
RETURNS (dict): Attributes dictionary with keys and optionally values
converted to ints.
"""
inty_attrs = {}
if _do_deprecated:

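For context (not part of this commit), a minimal usage sketch of `intify_attrs` as documented in the new docstring; the example attributes are illustrative only:

import spacy
from spacy.attrs import intify_attrs

nlp = spacy.blank('en')
# Keys are converted to integer attribute IDs; string values are encoded
# through the StringStore when strings_map is provided.
converted = intify_attrs({'ORTH': 'hello', 'LEMMA': 'hello'},
                         strings_map=nlp.vocab.strings)
print(converted)  # dict mapping integer attribute IDs to integer values
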
@@ -2,7 +2,6 @@
# coding: utf8
from __future__ import unicode_literals, print_function

import io
import re
import ujson
import random
@@ -10,9 +9,8 @@ import cytoolz
import itertools

from .syntax import nonproj
from .util import ensure_path
from . import util
from .tokens import Doc
from . import util


def tags_to_entities(tags):
@@ -310,7 +308,7 @@ def _corrupt(c, noise_level):


def read_json_file(loc, docs_filter=None, limit=None):
loc = ensure_path(loc)
loc = util.ensure_path(loc)
if loc.is_dir():
for filename in loc.iterdir():
yield from read_json_file(loc / filename, limit=limit)

@@ -1,22 +1,22 @@
# coding: utf8
from __future__ import absolute_import, unicode_literals
from contextlib import contextmanager
import copy

from thinc.neural import Model
from thinc.neural.optimizers import Adam
import random
import ujson
from collections import OrderedDict
import itertools
import weakref
import functools
from collections import OrderedDict
from contextlib import contextmanager
from copy import copy
from thinc.neural import Model
from thinc.neural.optimizers import Adam

from .tokenizer import Tokenizer
from .vocab import Vocab
from .lemmatizer import Lemmatizer
from .pipeline import DependencyParser, Tensorizer, Tagger
from .pipeline import EntityRecognizer, SimilarityHook, TextCategorizer
from .pipeline import DependencyParser, Tensorizer, Tagger, EntityRecognizer
from .pipeline import SimilarityHook, TextCategorizer
from .compat import json_dumps, izip
from .scorer import Scorer
from ._ml import link_vectors_to_models
@@ -649,7 +649,7 @@ class Language(object):
serializers = OrderedDict((
('vocab', lambda: self.vocab.to_bytes()),
('tokenizer', lambda: self.tokenizer.to_bytes(vocab=False)),
('meta', lambda: ujson.dumps(self.meta))
('meta', lambda: json_dumps(self.meta))
))
for i, (name, proc) in enumerate(self.pipeline):
if name in disable:
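For context (not part of this commit), a hedged sketch of the serialization round trip that the serializer table above supports, assuming spaCy 2.x and a blank English pipeline:

import spacy

nlp = spacy.blank('en')
data = nlp.to_bytes()        # vocab, tokenizer and meta are serialized
nlp2 = spacy.blank('en')
nlp2.from_bytes(data)        # restores the same state into a fresh pipeline
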
@@ -689,7 +689,7 @@ class DisabledPipes(list):
# Important! Not deep copy -- we just want the container (but we also
# want to support people providing arbitrarily typed nlp.pipeline
# objects.)
self.original_pipeline = copy.copy(nlp.pipeline)
self.original_pipeline = copy(nlp.pipeline)
list.__init__(self)
self.extend(nlp.remove_pipe(name) for name in names)

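For context, a minimal sketch of how `DisabledPipes` is typically used through `nlp.disable_pipes`, assuming a spaCy 2.x pipeline that actually contains the named components:

import spacy

nlp = spacy.load('en_core_web_sm')   # assumes this model is installed
with nlp.disable_pipes('tagger', 'parser'):
    doc = nlp(u'Only the remaining components run inside this block.')
# On exit, the removed components are restored from original_pipeline.
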
@@ -4,12 +4,6 @@
from __future__ import unicode_literals

import ujson

from .typedefs cimport attr_t
from .typedefs cimport hash_t
from .attrs cimport attr_id_t
from .structs cimport TokenC

from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
from libcpp.vector cimport vector
@@ -17,14 +11,15 @@ from libcpp.pair cimport pair
from murmurhash.mrmr cimport hash64
from libc.stdint cimport int32_t

from .attrs cimport ID, NULL_ATTR, ENT_TYPE
from . import attrs
from .tokens.doc cimport get_token_attr
from .tokens.doc cimport Doc
from .typedefs cimport attr_t
from .typedefs cimport hash_t
from .structs cimport TokenC
from .tokens.doc cimport Doc, get_token_attr
from .vocab cimport Vocab

from .attrs import IDS
from .attrs cimport attr_id_t, ID, NULL_ATTR
from .attrs import FLAG61 as U_ENT

from .attrs import FLAG60 as B2_ENT
from .attrs import FLAG59 as B3_ENT
from .attrs import FLAG58 as B4_ENT
@@ -34,7 +29,6 @@ from .attrs import FLAG55 as B7_ENT
from .attrs import FLAG54 as B8_ENT
from .attrs import FLAG53 as B9_ENT
from .attrs import FLAG52 as B10_ENT

from .attrs import FLAG51 as I3_ENT
from .attrs import FLAG50 as I4_ENT
from .attrs import FLAG49 as I5_ENT
@@ -43,7 +37,6 @@ from .attrs import FLAG47 as I7_ENT
from .attrs import FLAG46 as I8_ENT
from .attrs import FLAG45 as I9_ENT
from .attrs import FLAG44 as I10_ENT

from .attrs import FLAG43 as L2_ENT
from .attrs import FLAG42 as L3_ENT
from .attrs import FLAG41 as L4_ENT
@@ -168,10 +161,10 @@ def _convert_strings(token_specs, string_store):
if value in operators:
ops = operators[value]
else:
raise KeyError(
"Unknown operator '%s'. Options: %s" % (value, ', '.join(operators.keys())))
msg = "Unknown operator '%s'. Options: %s"
raise KeyError(msg % (value, ', '.join(operators.keys())))
if isinstance(attr, basestring):
attr = attrs.IDS.get(attr.upper())
attr = IDS.get(attr.upper())
if isinstance(value, basestring):
value = string_store.add(value)
if isinstance(value, bool):
@@ -186,7 +179,7 @@ def _convert_strings(token_specs, string_store):
def merge_phrase(matcher, doc, i, matches):
"""Callback to merge a phrase on match."""
ent_id, label, start, end = matches[i]
span = doc[start : end]
span = doc[start:end]
span.merge(ent_type=label, ent_id=ent_id)

@@ -233,13 +226,13 @@ cdef class Matcher:
return self._normalize_key(key) in self._patterns

def add(self, key, on_match, *patterns):
"""Add a match-rule to the matcher. A match-rule consists of: an ID key,
an on_match callback, and one or more patterns.
"""Add a match-rule to the matcher. A match-rule consists of: an ID
key, an on_match callback, and one or more patterns.

If the key exists, the patterns are appended to the previous ones, and
the previous on_match callback is replaced. The `on_match` callback will
receive the arguments `(matcher, doc, i, matches)`. You can also set
`on_match` to `None` to not perform any actions.
the previous on_match callback is replaced. The `on_match` callback
will receive the arguments `(matcher, doc, i, matches)`. You can also
set `on_match` to `None` to not perform any actions.

A pattern consists of one or more `token_specs`, where a `token_spec`
is a dictionary mapping attribute IDs to values, and optionally a
@@ -253,8 +246,8 @@ cdef class Matcher:
The + and * operators are usually interpretted "greedily", i.e. longer
matches are returned where possible. However, if you specify two '+'
and '*' patterns in a row and their matches overlap, the first
operator will behave non-greedily. This quirk in the semantics
makes the matcher more efficient, by avoiding the need for back-tracking.
operator will behave non-greedily. This quirk in the semantics makes
the matcher more efficient, by avoiding the need for back-tracking.

key (unicode): The match ID.
on_match (callable): Callback executed on match.
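For context, a hedged usage sketch of the `Matcher.add` API described in the docstring above, assuming spaCy 2.x and a blank English pipeline; the pattern itself is illustrative:

import spacy
from spacy.matcher import Matcher

nlp = spacy.blank('en')
matcher = Matcher(nlp.vocab)
# Each token_spec is a dict of attribute names to values; 'OP' attaches an
# operator ('?' makes the punctuation token optional).
pattern = [{'LOWER': 'hello'}, {'IS_PUNCT': True, 'OP': '?'}, {'LOWER': 'world'}]
matcher.add('HelloWorld', None, pattern)
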
@@ -268,7 +261,6 @@ cdef class Matcher:
key = self._normalize_key(key)
self._patterns.setdefault(key, [])
self._callbacks[key] = on_match

for pattern in patterns:
specs = _convert_strings(pattern, self.vocab.strings)
self.patterns.push_back(init_pattern(self.mem, key, specs))
@@ -315,9 +307,9 @@ cdef class Matcher:
"""Match a stream of documents, yielding them in turn.

docs (iterable): A stream of documents.
batch_size (int): The number of documents to accumulate into a working set.
batch_size (int): Number of documents to accumulate into a working set.
n_threads (int): The number of threads with which to work on the buffer
in parallel, if the `Matcher` implementation supports multi-threading.
in parallel, if the implementation supports multi-threading.
YIELDS (Doc): Documents, in order.
"""
for doc in docs:
@@ -325,7 +317,7 @@ cdef class Matcher:
yield doc

def __call__(self, Doc doc):
"""Find all token sequences matching the supplied patterns on the `Doc`.
"""Find all token sequences matching the supplied pattern.

doc (Doc): The document to match over.
RETURNS (list): A list of `(key, start, end)` tuples,
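Continuing the hedged sketch above, calling the matcher on a `Doc` returns the `(match_id, start, end)` tuples described in the docstring:

doc = nlp(u'Hello, world! Hello world!')
matches = matcher(doc)                  # list of (match_id, start, end)
for match_id, start, end in matches:
    # match_id is the hash of the key; look it up in the StringStore
    print(nlp.vocab.strings[match_id], doc[start:end].text)
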
@@ -342,8 +334,8 @@ cdef class Matcher:
for token_i in range(doc.length):
token = &doc.c[token_i]
q = 0
# Go over the open matches, extending or finalizing if able. Otherwise,
# we over-write them (q doesn't advance)
# Go over the open matches, extending or finalizing if able.
# Otherwise, we over-write them (q doesn't advance)
for state in partials:
action = get_action(state.second, token)
if action == PANIC:
@@ -356,8 +348,8 @@ cdef class Matcher:

if action == REPEAT:
# Leave the state in the queue, and advance to next slot
# (i.e. we don't overwrite -- we want to greedily match more
# pattern.
# (i.e. we don't overwrite -- we want to greedily match
# more pattern.
q += 1
elif action == REJECT:
pass
@@ -366,8 +358,8 @@ cdef class Matcher:
partials[q].second += 1
q += 1
elif action in (ACCEPT, ACCEPT_PREV):
# TODO: What to do about patterns starting with ZERO? Need to
# adjust the start position.
# TODO: What to do about patterns starting with ZERO? Need
# to adjust the start position.
start = state.first
end = token_i+1 if action == ACCEPT else token_i
ent_id = state.second[1].attrs[0].value
@@ -388,8 +380,8 @@ cdef class Matcher:
state.second = pattern
partials.push_back(state)
elif action == ADVANCE:
# TODO: What to do about patterns starting with ZERO? Need to
# adjust the start position.
# TODO: What to do about patterns starting with ZERO? Need
# to adjust the start position.
state.first = token_i
state.second = pattern + 1
partials.push_back(state)
@@ -413,7 +405,6 @@ cdef class Matcher:
on_match = self._callbacks.get(ent_id)
if on_match is not None:
on_match(self, doc, i, matches)
# TODO: only return (match_id, start, end)
return matches

def _normalize_key(self, key):
@@ -441,7 +432,8 @@ def get_bilou(length):
elif length == 8:
return [B8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, L8_ENT]
elif length == 9:
return [B9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, L9_ENT]
return [B9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT,
L9_ENT]
elif length == 10:
return [B10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT,
I10_ENT, I10_ENT, L10_ENT]
@@ -454,10 +446,8 @@ cdef class PhraseMatcher:
cdef Vocab vocab
cdef Matcher matcher
cdef PreshMap phrase_ids

cdef int max_length
cdef attr_t* _phrase_key

cdef public object _callbacks
cdef public object _patterns

@@ -470,7 +460,8 @@ cdef class PhraseMatcher:
self.phrase_ids = PreshMap()
abstract_patterns = []
for length in range(1, max_length):
abstract_patterns.append([{tag: True} for tag in get_bilou(length)])
abstract_patterns.append([{tag: True}
for tag in get_bilou(length)])
self.matcher.add('Candidate', None, *abstract_patterns)
self._callbacks = {}

@@ -496,8 +487,8 @@ cdef class PhraseMatcher:
return (self.__class__, (self.vocab,), None, None)

def add(self, key, on_match, *docs):
"""Add a match-rule to the matcher. A match-rule consists of: an ID key,
an on_match callback, and one or more patterns.
"""Add a match-rule to the matcher. A match-rule consists of: an ID
key, an on_match callback, and one or more patterns.

key (unicode): The match ID.
on_match (callable): Callback executed on match.
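For context, a hedged sketch of the `PhraseMatcher.add` API shown above, where the patterns are `Doc` objects rather than token-spec dicts (spaCy 2.x assumed; the place names are illustrative):

import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.blank('en')
matcher = PhraseMatcher(nlp.vocab)
# Patterns are pre-tokenized Doc objects, matched as exact token sequences.
matcher.add('GPE', None, nlp(u'New York'), nlp(u'San Francisco'))
doc = nlp(u'I flew from New York to San Francisco.')
for match_id, start, end in matcher(doc):
    print(doc[start:end].text)
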
@@ -513,7 +504,6 @@ cdef class PhraseMatcher:
raise ValueError(msg % (len(doc), self.max_length))
cdef hash_t ent_id = self.matcher._normalize_key(key)
self._callbacks[ent_id] = on_match

cdef int length
cdef int i
cdef hash_t phrase_hash
@@ -553,9 +543,9 @@ cdef class PhraseMatcher:
"""Match a stream of documents, yielding them in turn.

docs (iterable): A stream of documents.
batch_size (int): The number of documents to accumulate into a working set.
batch_size (int): Number of documents to accumulate into a working set.
n_threads (int): The number of threads with which to work on the buffer
in parallel, if the `Matcher` implementation supports multi-threading.
in parallel, if the implementation supports multi-threading.
YIELDS (Doc): Documents, in order.
"""
for doc in stream:
@@ -569,7 +559,8 @@ cdef class PhraseMatcher:
self._phrase_key[i] = 0
for i, j in enumerate(range(start, end)):
self._phrase_key[i] = doc.c[j].lex.orth
cdef hash_t key = hash64(self._phrase_key, self.max_length * sizeof(attr_t), 0)
cdef hash_t key = hash64(self._phrase_key,
self.max_length * sizeof(attr_t), 0)
ent_id = <hash_t>self.phrase_ids.get(key)
if ent_id == 0:
return None

@@ -4,17 +4,15 @@ from __future__ import unicode_literals

from libc.string cimport memset

from .parts_of_speech cimport ADJ, VERB, NOUN, PUNCT, SPACE
from .attrs cimport POS, IS_SPACE
from .attrs import LEMMA, intify_attrs
from .parts_of_speech cimport SPACE
from .parts_of_speech import IDS as POS_IDS
from .lexeme cimport Lexeme
from .attrs import LEMMA, intify_attrs


def _normalize_props(props):
"""
Transform deprecated string keys to correct names.
"""
"""Transform deprecated string keys to correct names."""
out = {}
for key, value in props.items():
if key == POS:
@@ -77,7 +75,8 @@ cdef class Morphology:
cdef int assign_untagged(self, TokenC* token) except -1:
"""Set morphological attributes on a token without a POS tag. Uses
the lemmatizer's lookup() method, which looks up the string in the
table provided by the language data as lemma_lookup (if available)."""
table provided by the language data as lemma_lookup (if available).
"""
if token.lemma == 0:
orth_str = self.strings[token.lex.orth]
lemma = self.lemmatizer.lookup(orth_str)
@@ -95,11 +94,10 @@ cdef class Morphology:
cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1:
if tag_id > self.n_tags:
raise ValueError("Unknown tag ID: %s" % tag_id)
# TODO: It's pretty arbitrary to put this logic here. I guess the justification
# is that this is where the specific word and the tag interact. Still,
# we should have a better way to enforce this rule, or figure out why
# the statistical model fails.
# Related to Issue #220
# TODO: It's pretty arbitrary to put this logic here. I guess the
# justification is that this is where the specific word and the tag
# interact. Still, we should have a better way to enforce this rule, or
# figure out why the statistical model fails. Related to Issue #220
if Lexeme.c_check_flag(token.lex, IS_SPACE):
tag_id = self.reverse_index[self.strings.add('_SP')]
rich_tag = self.rich_tags[tag_id]
@@ -123,12 +121,11 @@ cdef class Morphology:
else:
flags[0] &= ~(one << flag_id)

def add_special_case(self, unicode tag_str, unicode orth_str, attrs, force=False):
"""
Add a special-case rule to the morphological analyser. Tokens whose
def add_special_case(self, unicode tag_str, unicode orth_str, attrs,
force=False):
"""Add a special-case rule to the morphological analyser. Tokens whose
tag and orth match the rule will receive the specified properties.

Arguments:
tag (unicode): The part-of-speech tag to key the exception.
orth (unicode): The word-form to key the exception.
"""
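For context, a heavily hedged sketch of how a morphology special case might be registered through this internal API; the tag, word form and attributes are purely illustrative and assume a vocab whose tag map contains the tag:

import spacy

nlp = spacy.load('en_core_web_sm')   # assumes this model is installed
# Tokens tagged 'VBZ' with the exact form 'has' would receive LEMMA 'have';
# the attrs dict is normalized through intify_attrs internally.
nlp.vocab.morphology.add_special_case(u'VBZ', u'has', {'LEMMA': u'have'})
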
@@ -144,10 +141,9 @@ cdef class Morphology:
elif force:
memset(cached, 0, sizeof(cached[0]))
else:
msg = ("Conflicting morphology exception for (%s, %s). Use force=True "
"to overwrite.")
msg = msg % (tag_str, orth_str)
raise ValueError(msg)
raise ValueError(
"Conflicting morphology exception for (%s, %s). Use "
"force=True to overwrite." % (tag_str, orth_str))

cached.tag = rich_tag
# TODO: Refactor this to take arbitrary attributes.

@@ -85,7 +85,6 @@ class Scorer(object):

def score(self, tokens, gold, verbose=False, punct_labels=('p', 'punct')):
assert len(tokens) == len(gold)

gold_deps = set()
gold_tags = set()
gold_ents = set(tags_to_entities([annot[-1]

@@ -4,19 +4,15 @@ from __future__ import unicode_literals, absolute_import

cimport cython
from libc.string cimport memcpy
from libc.stdint cimport uint64_t, uint32_t
from murmurhash.mrmr cimport hash64, hash32
from preshed.maps cimport map_iter, key_t
from libc.stdint cimport uint32_t
from murmurhash.mrmr cimport hash64, hash32
import ujson
import dill

from .symbols import IDS as SYMBOLS_BY_STR
from .symbols import NAMES as SYMBOLS_BY_INT

from .typedefs cimport hash_t
from . import util
from .compat import json_dumps
from . import util


cpdef hash_t hash_string(unicode string) except 0:
@@ -195,7 +191,7 @@ cdef class StringStore:
"""Save the current state to a directory.

path (unicode or Path): A path to a directory, which will be created if
it doesn't exist. Paths may be either strings or `Path`-like objects.
it doesn't exist. Paths may be either strings or Path-like objects.
"""
path = util.ensure_path(path)
strings = list(self)
@@ -225,7 +221,7 @@ cdef class StringStore:
**exclude: Named attributes to prevent from being serialized.
RETURNS (bytes): The serialized form of the `StringStore` object.
"""
return ujson.dumps(list(self))
return json_dumps(list(self))

def from_bytes(self, bytes_data, **exclude):
"""Load state from a binary string.

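For context, a hedged sketch of the `StringStore` serialization round trip touched above (spaCy 2.x assumed; the strings are illustrative):

from spacy.strings import StringStore

store = StringStore([u'apple', u'orange'])
data = store.to_bytes()                  # JSON-serialized list of strings
new_store = StringStore().from_bytes(data)
assert u'apple' in new_store
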
@@ -1,8 +1,8 @@
# coding: utf8
#cython: optimize.unpack_method_calls=False

from __future__ import unicode_literals


IDS = {
"": NIL,
"IS_ALPHA": IS_ALPHA,
@@ -464,9 +464,11 @@ IDS = {
"LAW": LAW
}


def sort_nums(x):
return x[1]


NAMES = [it[0] for it in sorted(IDS.items(), key=sort_nums)]
# Unfortunate hack here, to work around problem with long cpdef enum
# (which is generating an enormous amount of C++ in Cython 0.24+)

@@ -8,12 +8,11 @@ from cython.operator cimport preincrement as preinc
from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
import regex as re

from .strings cimport hash_string
from . import util
cimport cython

from .tokens.doc cimport Doc
from .strings cimport hash_string
from . import util


cdef class Tokenizer:
@@ -74,9 +73,8 @@ cdef class Tokenizer:
RETURNS (Doc): A container for linguistic annotations.
"""
if len(string) >= (2 ** 30):
raise ValueError(
"String is too long: %d characters. Max is 2**30." % len(string)
)
msg = "String is too long: %d characters. Max is 2**30."
raise ValueError(msg % len(string))
cdef int length = len(string)
cdef Doc doc = Doc(self.vocab)
if length == 0:
@@ -122,8 +120,8 @@ cdef class Tokenizer:
"""Tokenize a stream of texts.

texts: A sequence of unicode texts.
batch_size (int): The number of texts to accumulate in an internal buffer.
n_threads (int): The number of threads to use, if the implementation
batch_size (int): Number of texts to accumulate in an internal buffer.
n_threads (int): Number of threads to use, if the implementation
supports multi-threading. The default tokenizer is single-threaded.
YIELDS (Doc): A sequence of Doc objects, in order.
"""
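For context, a hedged sketch of streaming tokenization with `Tokenizer.pipe` as documented above (spaCy 2.x, blank English pipeline; the texts are illustrative):

import spacy

nlp = spacy.blank('en')
texts = [u'One document.', u'...and another one.']
for doc in nlp.tokenizer.pipe(texts, batch_size=50):
    print([token.text for token in doc])
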
@@ -232,8 +230,8 @@ cdef class Tokenizer:
if not matches:
tokens.push_back(self.vocab.get(tokens.mem, string), False)
else:
# let's say we have dyn-o-mite-dave
# the regex finds the start and end positions of the hyphens
# let's say we have dyn-o-mite-dave - the regex finds the
# start and end positions of the hyphens
start = 0
for match in matches:
infix_start = match.start()
@@ -293,8 +291,8 @@ cdef class Tokenizer:
return list(self.infix_finditer(string))

def find_prefix(self, unicode string):
"""Find the length of a prefix that should be segmented from the string,
or None if no prefix rules match.
"""Find the length of a prefix that should be segmented from the
string, or None if no prefix rules match.

string (unicode): The string to segment.
RETURNS (int): The length of the prefix if present, otherwise `None`.
@@ -305,8 +303,8 @@ cdef class Tokenizer:
return (match.end() - match.start()) if match is not None else 0

def find_suffix(self, unicode string):
"""Find the length of a suffix that should be segmented from the string,
or None if no suffix rules match.
"""Find the length of a suffix that should be segmented from the
string, or None if no suffix rules match.

string (unicode): The string to segment.
Returns (int): The length of the suffix if present, otherwise `None`.
@@ -326,8 +324,8 @@ cdef class Tokenizer:

string (unicode): The string to specially tokenize.
token_attrs (iterable): A sequence of dicts, where each dict describes
a token and its attributes. The `ORTH` fields of the attributes must
exactly match the string when they are concatenated.
a token and its attributes. The `ORTH` fields of the attributes
must exactly match the string when they are concatenated.
"""
substrings = list(substrings)
cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
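For context, a hedged sketch of `Tokenizer.add_special_case` as described in the docstring above (spaCy 2.x; the contraction is illustrative):

import spacy
from spacy.attrs import ORTH, LEMMA

nlp = spacy.blank('en')
# The ORTH values must concatenate back to the original string 'gimme'.
nlp.tokenizer.add_special_case(u'gimme',
                               [{ORTH: u'gim', LEMMA: u'give'}, {ORTH: u'me'}])
doc = nlp(u'gimme that')   # tokenized as ['gim', 'me', 'that']
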
@@ -343,7 +341,7 @@ cdef class Tokenizer:
"""Save the current state to a directory.

path (unicode or Path): A path to a directory, which will be created if
it doesn't exist. Paths may be either strings or `Path`-like objects.
it doesn't exist. Paths may be either strings or Path-like objects.
"""
with path.open('wb') as file_:
file_.write(self.to_bytes(**exclude))

@@ -476,7 +476,7 @@ cdef class Span:
"""
# TODO: implement
def __get__(self):
raise NotImplementedError()
raise NotImplementedError

property n_rights:
"""RETURNS (int): The number of rightward immediate children of the
@@ -484,7 +484,7 @@ cdef class Span:
"""
# TODO: implement
def __get__(self):
raise NotImplementedError()
raise NotImplementedError

property subtree:
"""Tokens that descend from tokens in the span, but fall outside it.

@@ -1 +0,0 @@

@@ -17,8 +17,8 @@ from .compat import copy_reg, basestring_
from .lemmatizer import Lemmatizer
from .attrs import intify_attrs
from .vectors import Vectors
from . import util
from ._ml import link_vectors_to_models
from . import util


cdef class Vocab: