mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-10 19:57:17 +03:00
Tidy up rest
This commit is contained in:
parent
a8e10f94e4
commit
d96e72f656
|
@ -8,11 +8,9 @@ from thinc.t2t import ExtractWindow, ParametricAttention
|
|||
from thinc.t2v import Pooling, sum_pool
|
||||
from thinc.misc import Residual
|
||||
from thinc.misc import LayerNorm as LN
|
||||
|
||||
from thinc.api import add, layerize, chain, clone, concatenate, with_flatten
|
||||
from thinc.api import FeatureExtracter, with_getitem, flatten_add_lengths
|
||||
from thinc.api import uniqued, wrap, noop
|
||||
|
||||
from thinc.linear.linear import LinearModel
|
||||
from thinc.neural.ops import NumpyOps, CupyOps
|
||||
from thinc.neural.util import get_array_module
|
||||
|
|
|
@ -101,17 +101,12 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
|
|||
"""
|
||||
Normalize a dictionary of attributes, converting them to ints.
|
||||
|
||||
Arguments:
|
||||
stringy_attrs (dict):
|
||||
Dictionary keyed by attribute string names. Values can be ints or strings.
|
||||
|
||||
strings_map (StringStore):
|
||||
Defaults to None. If provided, encodes string values into ints.
|
||||
|
||||
Returns:
|
||||
inty_attrs (dict):
|
||||
Attributes dictionary with keys and optionally values converted to
|
||||
ints.
|
||||
stringy_attrs (dict): Dictionary keyed by attribute string names. Values
|
||||
can be ints or strings.
|
||||
strings_map (StringStore): Defaults to None. If provided, encodes string
|
||||
values into ints.
|
||||
RETURNS (dict): Attributes dictionary with keys and optionally values
|
||||
converted to ints.
|
||||
"""
|
||||
inty_attrs = {}
|
||||
if _do_deprecated:
|
||||
|
|
|
@ -2,7 +2,6 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals, print_function
|
||||
|
||||
import io
|
||||
import re
|
||||
import ujson
|
||||
import random
|
||||
|
@ -10,9 +9,8 @@ import cytoolz
|
|||
import itertools
|
||||
|
||||
from .syntax import nonproj
|
||||
from .util import ensure_path
|
||||
from . import util
|
||||
from .tokens import Doc
|
||||
from . import util
|
||||
|
||||
|
||||
def tags_to_entities(tags):
|
||||
|
@ -310,7 +308,7 @@ def _corrupt(c, noise_level):
|
|||
|
||||
|
||||
def read_json_file(loc, docs_filter=None, limit=None):
|
||||
loc = ensure_path(loc)
|
||||
loc = util.ensure_path(loc)
|
||||
if loc.is_dir():
|
||||
for filename in loc.iterdir():
|
||||
yield from read_json_file(loc / filename, limit=limit)
|
||||
|
|
|
@ -1,22 +1,22 @@
|
|||
# coding: utf8
|
||||
from __future__ import absolute_import, unicode_literals
|
||||
from contextlib import contextmanager
|
||||
import copy
|
||||
|
||||
from thinc.neural import Model
|
||||
from thinc.neural.optimizers import Adam
|
||||
import random
|
||||
import ujson
|
||||
from collections import OrderedDict
|
||||
import itertools
|
||||
import weakref
|
||||
import functools
|
||||
from collections import OrderedDict
|
||||
from contextlib import contextmanager
|
||||
from copy import copy
|
||||
from thinc.neural import Model
|
||||
from thinc.neural.optimizers import Adam
|
||||
|
||||
from .tokenizer import Tokenizer
|
||||
from .vocab import Vocab
|
||||
from .lemmatizer import Lemmatizer
|
||||
from .pipeline import DependencyParser, Tensorizer, Tagger
|
||||
from .pipeline import EntityRecognizer, SimilarityHook, TextCategorizer
|
||||
from .pipeline import DependencyParser, Tensorizer, Tagger, EntityRecognizer
|
||||
from .pipeline import SimilarityHook, TextCategorizer
|
||||
from .compat import json_dumps, izip
|
||||
from .scorer import Scorer
|
||||
from ._ml import link_vectors_to_models
|
||||
|
@ -649,7 +649,7 @@ class Language(object):
|
|||
serializers = OrderedDict((
|
||||
('vocab', lambda: self.vocab.to_bytes()),
|
||||
('tokenizer', lambda: self.tokenizer.to_bytes(vocab=False)),
|
||||
('meta', lambda: ujson.dumps(self.meta))
|
||||
('meta', lambda: json_dumps(self.meta))
|
||||
))
|
||||
for i, (name, proc) in enumerate(self.pipeline):
|
||||
if name in disable:
|
||||
|
@ -689,7 +689,7 @@ class DisabledPipes(list):
|
|||
# Important! Not deep copy -- we just want the container (but we also
|
||||
# want to support people providing arbitrarily typed nlp.pipeline
|
||||
# objects.)
|
||||
self.original_pipeline = copy.copy(nlp.pipeline)
|
||||
self.original_pipeline = copy(nlp.pipeline)
|
||||
list.__init__(self)
|
||||
self.extend(nlp.remove_pipe(name) for name in names)
|
||||
|
||||
|
|
|
@ -4,12 +4,6 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
import ujson
|
||||
|
||||
from .typedefs cimport attr_t
|
||||
from .typedefs cimport hash_t
|
||||
from .attrs cimport attr_id_t
|
||||
from .structs cimport TokenC
|
||||
|
||||
from cymem.cymem cimport Pool
|
||||
from preshed.maps cimport PreshMap
|
||||
from libcpp.vector cimport vector
|
||||
|
@ -17,14 +11,15 @@ from libcpp.pair cimport pair
|
|||
from murmurhash.mrmr cimport hash64
|
||||
from libc.stdint cimport int32_t
|
||||
|
||||
from .attrs cimport ID, NULL_ATTR, ENT_TYPE
|
||||
from . import attrs
|
||||
from .tokens.doc cimport get_token_attr
|
||||
from .tokens.doc cimport Doc
|
||||
from .typedefs cimport attr_t
|
||||
from .typedefs cimport hash_t
|
||||
from .structs cimport TokenC
|
||||
from .tokens.doc cimport Doc, get_token_attr
|
||||
from .vocab cimport Vocab
|
||||
|
||||
from .attrs import IDS
|
||||
from .attrs cimport attr_id_t, ID, NULL_ATTR
|
||||
from .attrs import FLAG61 as U_ENT
|
||||
|
||||
from .attrs import FLAG60 as B2_ENT
|
||||
from .attrs import FLAG59 as B3_ENT
|
||||
from .attrs import FLAG58 as B4_ENT
|
||||
|
@ -34,7 +29,6 @@ from .attrs import FLAG55 as B7_ENT
|
|||
from .attrs import FLAG54 as B8_ENT
|
||||
from .attrs import FLAG53 as B9_ENT
|
||||
from .attrs import FLAG52 as B10_ENT
|
||||
|
||||
from .attrs import FLAG51 as I3_ENT
|
||||
from .attrs import FLAG50 as I4_ENT
|
||||
from .attrs import FLAG49 as I5_ENT
|
||||
|
@ -43,7 +37,6 @@ from .attrs import FLAG47 as I7_ENT
|
|||
from .attrs import FLAG46 as I8_ENT
|
||||
from .attrs import FLAG45 as I9_ENT
|
||||
from .attrs import FLAG44 as I10_ENT
|
||||
|
||||
from .attrs import FLAG43 as L2_ENT
|
||||
from .attrs import FLAG42 as L3_ENT
|
||||
from .attrs import FLAG41 as L4_ENT
|
||||
|
@ -153,7 +146,7 @@ cdef int get_action(const TokenPatternC* pattern, const TokenC* token) nogil:
|
|||
def _convert_strings(token_specs, string_store):
|
||||
# Support 'syntactic sugar' operator '+', as combination of ONE, ZERO_PLUS
|
||||
operators = {'!': (ZERO,), '*': (ZERO_PLUS,), '+': (ONE, ZERO_PLUS),
|
||||
'?': (ZERO_ONE,), '1': (ONE,)}
|
||||
'?': (ZERO_ONE,), '1': (ONE,)}
|
||||
tokens = []
|
||||
op = ONE
|
||||
for spec in token_specs:
|
||||
|
@ -168,10 +161,10 @@ def _convert_strings(token_specs, string_store):
|
|||
if value in operators:
|
||||
ops = operators[value]
|
||||
else:
|
||||
raise KeyError(
|
||||
"Unknown operator '%s'. Options: %s" % (value, ', '.join(operators.keys())))
|
||||
msg = "Unknown operator '%s'. Options: %s"
|
||||
raise KeyError(msg % (value, ', '.join(operators.keys())))
|
||||
if isinstance(attr, basestring):
|
||||
attr = attrs.IDS.get(attr.upper())
|
||||
attr = IDS.get(attr.upper())
|
||||
if isinstance(value, basestring):
|
||||
value = string_store.add(value)
|
||||
if isinstance(value, bool):
|
||||
|
@ -186,7 +179,7 @@ def _convert_strings(token_specs, string_store):
|
|||
def merge_phrase(matcher, doc, i, matches):
|
||||
"""Callback to merge a phrase on match."""
|
||||
ent_id, label, start, end = matches[i]
|
||||
span = doc[start : end]
|
||||
span = doc[start:end]
|
||||
span.merge(ent_type=label, ent_id=ent_id)
|
||||
|
||||
|
||||
|
@ -233,13 +226,13 @@ cdef class Matcher:
|
|||
return self._normalize_key(key) in self._patterns
|
||||
|
||||
def add(self, key, on_match, *patterns):
|
||||
"""Add a match-rule to the matcher. A match-rule consists of: an ID key,
|
||||
an on_match callback, and one or more patterns.
|
||||
"""Add a match-rule to the matcher. A match-rule consists of: an ID
|
||||
key, an on_match callback, and one or more patterns.
|
||||
|
||||
If the key exists, the patterns are appended to the previous ones, and
|
||||
the previous on_match callback is replaced. The `on_match` callback will
|
||||
receive the arguments `(matcher, doc, i, matches)`. You can also set
|
||||
`on_match` to `None` to not perform any actions.
|
||||
the previous on_match callback is replaced. The `on_match` callback
|
||||
will receive the arguments `(matcher, doc, i, matches)`. You can also
|
||||
set `on_match` to `None` to not perform any actions.
|
||||
|
||||
A pattern consists of one or more `token_specs`, where a `token_spec`
|
||||
is a dictionary mapping attribute IDs to values, and optionally a
|
||||
|
@ -253,8 +246,8 @@ cdef class Matcher:
|
|||
The + and * operators are usually interpreted "greedily", i.e. longer
|
||||
matches are returned where possible. However, if you specify two '+'
|
||||
and '*' patterns in a row and their matches overlap, the first
|
||||
operator will behave non-greedily. This quirk in the semantics
|
||||
makes the matcher more efficient, by avoiding the need for back-tracking.
|
||||
operator will behave non-greedily. This quirk in the semantics makes
|
||||
the matcher more efficient, by avoiding the need for back-tracking.
|
||||
|
||||
key (unicode): The match ID.
|
||||
on_match (callable): Callback executed on match.
|
||||
|
@ -268,7 +261,6 @@ cdef class Matcher:
|
|||
key = self._normalize_key(key)
|
||||
self._patterns.setdefault(key, [])
|
||||
self._callbacks[key] = on_match
|
||||
|
||||
for pattern in patterns:
|
||||
specs = _convert_strings(pattern, self.vocab.strings)
|
||||
self.patterns.push_back(init_pattern(self.mem, key, specs))
|
||||
|
@ -315,9 +307,9 @@ cdef class Matcher:
|
|||
"""Match a stream of documents, yielding them in turn.
|
||||
|
||||
docs (iterable): A stream of documents.
|
||||
batch_size (int): The number of documents to accumulate into a working set.
|
||||
batch_size (int): Number of documents to accumulate into a working set.
|
||||
n_threads (int): The number of threads with which to work on the buffer
|
||||
in parallel, if the `Matcher` implementation supports multi-threading.
|
||||
in parallel, if the implementation supports multi-threading.
|
||||
YIELDS (Doc): Documents, in order.
|
||||
"""
|
||||
for doc in docs:
|
||||
|
@ -325,7 +317,7 @@ cdef class Matcher:
|
|||
yield doc
|
||||
|
||||
def __call__(self, Doc doc):
|
||||
"""Find all token sequences matching the supplied patterns on the `Doc`.
|
||||
"""Find all token sequences matching the supplied pattern.
|
||||
|
||||
doc (Doc): The document to match over.
|
||||
RETURNS (list): A list of `(key, start, end)` tuples,
|
||||
|
@ -342,8 +334,8 @@ cdef class Matcher:
|
|||
for token_i in range(doc.length):
|
||||
token = &doc.c[token_i]
|
||||
q = 0
|
||||
# Go over the open matches, extending or finalizing if able. Otherwise,
|
||||
# we over-write them (q doesn't advance)
|
||||
# Go over the open matches, extending or finalizing if able.
|
||||
# Otherwise, we over-write them (q doesn't advance)
|
||||
for state in partials:
|
||||
action = get_action(state.second, token)
|
||||
if action == PANIC:
|
||||
|
@ -356,8 +348,8 @@ cdef class Matcher:
|
|||
|
||||
if action == REPEAT:
|
||||
# Leave the state in the queue, and advance to next slot
|
||||
# (i.e. we don't overwrite -- we want to greedily match more
|
||||
# pattern.
|
||||
# (i.e. we don't overwrite -- we want to greedily match
|
||||
# more pattern.
|
||||
q += 1
|
||||
elif action == REJECT:
|
||||
pass
|
||||
|
@ -366,8 +358,8 @@ cdef class Matcher:
|
|||
partials[q].second += 1
|
||||
q += 1
|
||||
elif action in (ACCEPT, ACCEPT_PREV):
|
||||
# TODO: What to do about patterns starting with ZERO? Need to
|
||||
# adjust the start position.
|
||||
# TODO: What to do about patterns starting with ZERO? Need
|
||||
# to adjust the start position.
|
||||
start = state.first
|
||||
end = token_i+1 if action == ACCEPT else token_i
|
||||
ent_id = state.second[1].attrs[0].value
|
||||
|
@ -388,8 +380,8 @@ cdef class Matcher:
|
|||
state.second = pattern
|
||||
partials.push_back(state)
|
||||
elif action == ADVANCE:
|
||||
# TODO: What to do about patterns starting with ZERO? Need to
|
||||
# adjust the start position.
|
||||
# TODO: What to do about patterns starting with ZERO? Need
|
||||
# to adjust the start position.
|
||||
state.first = token_i
|
||||
state.second = pattern + 1
|
||||
partials.push_back(state)
|
||||
|
@ -413,7 +405,6 @@ cdef class Matcher:
|
|||
on_match = self._callbacks.get(ent_id)
|
||||
if on_match is not None:
|
||||
on_match(self, doc, i, matches)
|
||||
# TODO: only return (match_id, start, end)
|
||||
return matches
|
||||
|
||||
def _normalize_key(self, key):
|
||||
|
@ -441,7 +432,8 @@ def get_bilou(length):
|
|||
elif length == 8:
|
||||
return [B8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, L8_ENT]
|
||||
elif length == 9:
|
||||
return [B9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, L9_ENT]
|
||||
return [B9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT,
|
||||
L9_ENT]
|
||||
elif length == 10:
|
||||
return [B10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT,
|
||||
I10_ENT, I10_ENT, L10_ENT]
|
||||
|
@ -454,10 +446,8 @@ cdef class PhraseMatcher:
|
|||
cdef Vocab vocab
|
||||
cdef Matcher matcher
|
||||
cdef PreshMap phrase_ids
|
||||
|
||||
cdef int max_length
|
||||
cdef attr_t* _phrase_key
|
||||
|
||||
cdef public object _callbacks
|
||||
cdef public object _patterns
|
||||
|
||||
|
@ -470,7 +460,8 @@ cdef class PhraseMatcher:
|
|||
self.phrase_ids = PreshMap()
|
||||
abstract_patterns = []
|
||||
for length in range(1, max_length):
|
||||
abstract_patterns.append([{tag: True} for tag in get_bilou(length)])
|
||||
abstract_patterns.append([{tag: True}
|
||||
for tag in get_bilou(length)])
|
||||
self.matcher.add('Candidate', None, *abstract_patterns)
|
||||
self._callbacks = {}
|
||||
|
||||
|
@ -496,8 +487,8 @@ cdef class PhraseMatcher:
|
|||
return (self.__class__, (self.vocab,), None, None)
|
||||
|
||||
def add(self, key, on_match, *docs):
|
||||
"""Add a match-rule to the matcher. A match-rule consists of: an ID key,
|
||||
an on_match callback, and one or more patterns.
|
||||
"""Add a match-rule to the matcher. A match-rule consists of: an ID
|
||||
key, an on_match callback, and one or more patterns.
|
||||
|
||||
key (unicode): The match ID.
|
||||
on_match (callable): Callback executed on match.
|
||||
|
@ -513,7 +504,6 @@ cdef class PhraseMatcher:
|
|||
raise ValueError(msg % (len(doc), self.max_length))
|
||||
cdef hash_t ent_id = self.matcher._normalize_key(key)
|
||||
self._callbacks[ent_id] = on_match
|
||||
|
||||
cdef int length
|
||||
cdef int i
|
||||
cdef hash_t phrase_hash
|
||||
|
@ -553,9 +543,9 @@ cdef class PhraseMatcher:
|
|||
"""Match a stream of documents, yielding them in turn.
|
||||
|
||||
docs (iterable): A stream of documents.
|
||||
batch_size (int): The number of documents to accumulate into a working set.
|
||||
batch_size (int): Number of documents to accumulate into a working set.
|
||||
n_threads (int): The number of threads with which to work on the buffer
|
||||
in parallel, if the `Matcher` implementation supports multi-threading.
|
||||
in parallel, if the implementation supports multi-threading.
|
||||
YIELDS (Doc): Documents, in order.
|
||||
"""
|
||||
for doc in stream:
|
||||
|
@ -569,7 +559,8 @@ cdef class PhraseMatcher:
|
|||
self._phrase_key[i] = 0
|
||||
for i, j in enumerate(range(start, end)):
|
||||
self._phrase_key[i] = doc.c[j].lex.orth
|
||||
cdef hash_t key = hash64(self._phrase_key, self.max_length * sizeof(attr_t), 0)
|
||||
cdef hash_t key = hash64(self._phrase_key,
|
||||
self.max_length * sizeof(attr_t), 0)
|
||||
ent_id = <hash_t>self.phrase_ids.get(key)
|
||||
if ent_id == 0:
|
||||
return None
|
||||
|
|
|
@ -4,17 +4,15 @@ from __future__ import unicode_literals
|
|||
|
||||
from libc.string cimport memset
|
||||
|
||||
from .parts_of_speech cimport ADJ, VERB, NOUN, PUNCT, SPACE
|
||||
from .attrs cimport POS, IS_SPACE
|
||||
from .attrs import LEMMA, intify_attrs
|
||||
from .parts_of_speech cimport SPACE
|
||||
from .parts_of_speech import IDS as POS_IDS
|
||||
from .lexeme cimport Lexeme
|
||||
from .attrs import LEMMA, intify_attrs
|
||||
|
||||
|
||||
def _normalize_props(props):
|
||||
"""
|
||||
Transform deprecated string keys to correct names.
|
||||
"""
|
||||
"""Transform deprecated string keys to correct names."""
|
||||
out = {}
|
||||
for key, value in props.items():
|
||||
if key == POS:
|
||||
|
@ -77,7 +75,8 @@ cdef class Morphology:
|
|||
cdef int assign_untagged(self, TokenC* token) except -1:
|
||||
"""Set morphological attributes on a token without a POS tag. Uses
|
||||
the lemmatizer's lookup() method, which looks up the string in the
|
||||
table provided by the language data as lemma_lookup (if available)."""
|
||||
table provided by the language data as lemma_lookup (if available).
|
||||
"""
|
||||
if token.lemma == 0:
|
||||
orth_str = self.strings[token.lex.orth]
|
||||
lemma = self.lemmatizer.lookup(orth_str)
|
||||
|
@ -95,11 +94,10 @@ cdef class Morphology:
|
|||
cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1:
|
||||
if tag_id > self.n_tags:
|
||||
raise ValueError("Unknown tag ID: %s" % tag_id)
|
||||
# TODO: It's pretty arbitrary to put this logic here. I guess the justification
|
||||
# is that this is where the specific word and the tag interact. Still,
|
||||
# we should have a better way to enforce this rule, or figure out why
|
||||
# the statistical model fails.
|
||||
# Related to Issue #220
|
||||
# TODO: It's pretty arbitrary to put this logic here. I guess the
|
||||
# justification is that this is where the specific word and the tag
|
||||
# interact. Still, we should have a better way to enforce this rule, or
|
||||
# figure out why the statistical model fails. Related to Issue #220
|
||||
if Lexeme.c_check_flag(token.lex, IS_SPACE):
|
||||
tag_id = self.reverse_index[self.strings.add('_SP')]
|
||||
rich_tag = self.rich_tags[tag_id]
|
||||
|
@ -123,14 +121,13 @@ cdef class Morphology:
|
|||
else:
|
||||
flags[0] &= ~(one << flag_id)
|
||||
|
||||
def add_special_case(self, unicode tag_str, unicode orth_str, attrs, force=False):
|
||||
"""
|
||||
Add a special-case rule to the morphological analyser. Tokens whose
|
||||
def add_special_case(self, unicode tag_str, unicode orth_str, attrs,
|
||||
force=False):
|
||||
"""Add a special-case rule to the morphological analyser. Tokens whose
|
||||
tag and orth match the rule will receive the specified properties.
|
||||
|
||||
Arguments:
|
||||
tag (unicode): The part-of-speech tag to key the exception.
|
||||
orth (unicode): The word-form to key the exception.
|
||||
tag (unicode): The part-of-speech tag to key the exception.
|
||||
orth (unicode): The word-form to key the exception.
|
||||
"""
|
||||
self.exc[(tag_str, orth_str)] = dict(attrs)
|
||||
tag = self.strings.add(tag_str)
|
||||
|
@ -144,10 +141,9 @@ cdef class Morphology:
|
|||
elif force:
|
||||
memset(cached, 0, sizeof(cached[0]))
|
||||
else:
|
||||
msg = ("Conflicting morphology exception for (%s, %s). Use force=True "
|
||||
"to overwrite.")
|
||||
msg = msg % (tag_str, orth_str)
|
||||
raise ValueError(msg)
|
||||
raise ValueError(
|
||||
"Conflicting morphology exception for (%s, %s). Use "
|
||||
"force=True to overwrite." % (tag_str, orth_str))
|
||||
|
||||
cached.tag = rich_tag
|
||||
# TODO: Refactor this to take arbitrary attributes.
|
||||
|
@ -218,7 +214,7 @@ IDS = {
|
|||
"Definite_two": Definite_two,
|
||||
"Definite_def": Definite_def,
|
||||
"Definite_red": Definite_red,
|
||||
"Definite_cons": Definite_cons, # U20
|
||||
"Definite_cons": Definite_cons, # U20
|
||||
"Definite_ind": Definite_ind,
|
||||
"Degree_cmp": Degree_cmp,
|
||||
"Degree_comp": Degree_comp,
|
||||
|
@ -227,7 +223,7 @@ IDS = {
|
|||
"Degree_sup": Degree_sup,
|
||||
"Degree_abs": Degree_abs,
|
||||
"Degree_com": Degree_com,
|
||||
"Degree_dim ": Degree_dim, # du
|
||||
"Degree_dim ": Degree_dim, # du
|
||||
"Gender_com": Gender_com,
|
||||
"Gender_fem": Gender_fem,
|
||||
"Gender_masc": Gender_masc,
|
||||
|
@ -242,15 +238,15 @@ IDS = {
|
|||
"Negative_neg": Negative_neg,
|
||||
"Negative_pos": Negative_pos,
|
||||
"Negative_yes": Negative_yes,
|
||||
"Polarity_neg": Polarity_neg, # U20
|
||||
"Polarity_pos": Polarity_pos, # U20
|
||||
"Polarity_neg": Polarity_neg, # U20
|
||||
"Polarity_pos": Polarity_pos, # U20
|
||||
"Number_com": Number_com,
|
||||
"Number_dual": Number_dual,
|
||||
"Number_none": Number_none,
|
||||
"Number_plur": Number_plur,
|
||||
"Number_sing": Number_sing,
|
||||
"Number_ptan ": Number_ptan, # bg
|
||||
"Number_count ": Number_count, # bg
|
||||
"Number_ptan ": Number_ptan, # bg
|
||||
"Number_count ": Number_count, # bg
|
||||
"NumType_card": NumType_card,
|
||||
"NumType_dist": NumType_dist,
|
||||
"NumType_frac": NumType_frac,
|
||||
|
@ -276,7 +272,7 @@ IDS = {
|
|||
"PronType_rel": PronType_rel,
|
||||
"PronType_tot": PronType_tot,
|
||||
"PronType_clit": PronType_clit,
|
||||
"PronType_exc ": PronType_exc, # es, ca, it, fa,
|
||||
"PronType_exc ": PronType_exc, # es, ca, it, fa,
|
||||
"Reflex_yes": Reflex_yes,
|
||||
"Tense_fut": Tense_fut,
|
||||
"Tense_imp": Tense_imp,
|
||||
|
@ -292,19 +288,19 @@ IDS = {
|
|||
"VerbForm_partPres": VerbForm_partPres,
|
||||
"VerbForm_sup": VerbForm_sup,
|
||||
"VerbForm_trans": VerbForm_trans,
|
||||
"VerbForm_conv": VerbForm_conv, # U20
|
||||
"VerbForm_gdv ": VerbForm_gdv, # la,
|
||||
"VerbForm_conv": VerbForm_conv, # U20
|
||||
"VerbForm_gdv ": VerbForm_gdv, # la,
|
||||
"Voice_act": Voice_act,
|
||||
"Voice_cau": Voice_cau,
|
||||
"Voice_pass": Voice_pass,
|
||||
"Voice_mid ": Voice_mid, # gkc,
|
||||
"Voice_int ": Voice_int, # hb,
|
||||
"Abbr_yes ": Abbr_yes, # cz, fi, sl, U,
|
||||
"AdpType_prep ": AdpType_prep, # cz, U,
|
||||
"AdpType_post ": AdpType_post, # U,
|
||||
"AdpType_voc ": AdpType_voc, # cz,
|
||||
"AdpType_comprep ": AdpType_comprep, # cz,
|
||||
"AdpType_circ ": AdpType_circ, # U,
|
||||
"Voice_mid ": Voice_mid, # gkc,
|
||||
"Voice_int ": Voice_int, # hb,
|
||||
"Abbr_yes ": Abbr_yes, # cz, fi, sl, U,
|
||||
"AdpType_prep ": AdpType_prep, # cz, U,
|
||||
"AdpType_post ": AdpType_post, # U,
|
||||
"AdpType_voc ": AdpType_voc, # cz,
|
||||
"AdpType_comprep ": AdpType_comprep, # cz,
|
||||
"AdpType_circ ": AdpType_circ, # U,
|
||||
"AdvType_man": AdvType_man,
|
||||
"AdvType_loc": AdvType_loc,
|
||||
"AdvType_tim": AdvType_tim,
|
||||
|
@ -314,122 +310,122 @@ IDS = {
|
|||
"AdvType_sta": AdvType_sta,
|
||||
"AdvType_ex": AdvType_ex,
|
||||
"AdvType_adadj": AdvType_adadj,
|
||||
"ConjType_oper ": ConjType_oper, # cz, U,
|
||||
"ConjType_comp ": ConjType_comp, # cz, U,
|
||||
"Connegative_yes ": Connegative_yes, # fi,
|
||||
"Derivation_minen ": Derivation_minen, # fi,
|
||||
"Derivation_sti ": Derivation_sti, # fi,
|
||||
"Derivation_inen ": Derivation_inen, # fi,
|
||||
"Derivation_lainen ": Derivation_lainen, # fi,
|
||||
"Derivation_ja ": Derivation_ja, # fi,
|
||||
"Derivation_ton ": Derivation_ton, # fi,
|
||||
"Derivation_vs ": Derivation_vs, # fi,
|
||||
"Derivation_ttain ": Derivation_ttain, # fi,
|
||||
"Derivation_ttaa ": Derivation_ttaa, # fi,
|
||||
"Echo_rdp ": Echo_rdp, # U,
|
||||
"Echo_ech ": Echo_ech, # U,
|
||||
"Foreign_foreign ": Foreign_foreign, # cz, fi, U,
|
||||
"Foreign_fscript ": Foreign_fscript, # cz, fi, U,
|
||||
"Foreign_tscript ": Foreign_tscript, # cz, U,
|
||||
"Foreign_yes ": Foreign_yes, # sl,
|
||||
"Gender_dat_masc ": Gender_dat_masc, # bq, U,
|
||||
"Gender_dat_fem ": Gender_dat_fem, # bq, U,
|
||||
"Gender_erg_masc ": Gender_erg_masc, # bq,
|
||||
"Gender_erg_fem ": Gender_erg_fem, # bq,
|
||||
"Gender_psor_masc ": Gender_psor_masc, # cz, sl, U,
|
||||
"Gender_psor_fem ": Gender_psor_fem, # cz, sl, U,
|
||||
"Gender_psor_neut ": Gender_psor_neut, # sl,
|
||||
"Hyph_yes ": Hyph_yes, # cz, U,
|
||||
"InfForm_one ": InfForm_one, # fi,
|
||||
"InfForm_two ": InfForm_two, # fi,
|
||||
"InfForm_three ": InfForm_three, # fi,
|
||||
"NameType_geo ": NameType_geo, # U, cz,
|
||||
"NameType_prs ": NameType_prs, # U, cz,
|
||||
"NameType_giv ": NameType_giv, # U, cz,
|
||||
"NameType_sur ": NameType_sur, # U, cz,
|
||||
"NameType_nat ": NameType_nat, # U, cz,
|
||||
"NameType_com ": NameType_com, # U, cz,
|
||||
"NameType_pro ": NameType_pro, # U, cz,
|
||||
"NameType_oth ": NameType_oth, # U, cz,
|
||||
"NounType_com ": NounType_com, # U,
|
||||
"NounType_prop ": NounType_prop, # U,
|
||||
"NounType_class ": NounType_class, # U,
|
||||
"Number_abs_sing ": Number_abs_sing, # bq, U,
|
||||
"Number_abs_plur ": Number_abs_plur, # bq, U,
|
||||
"Number_dat_sing ": Number_dat_sing, # bq, U,
|
||||
"Number_dat_plur ": Number_dat_plur, # bq, U,
|
||||
"Number_erg_sing ": Number_erg_sing, # bq, U,
|
||||
"Number_erg_plur ": Number_erg_plur, # bq, U,
|
||||
"Number_psee_sing ": Number_psee_sing, # U,
|
||||
"Number_psee_plur ": Number_psee_plur, # U,
|
||||
"Number_psor_sing ": Number_psor_sing, # cz, fi, sl, U,
|
||||
"Number_psor_plur ": Number_psor_plur, # cz, fi, sl, U,
|
||||
"NumForm_digit ": NumForm_digit, # cz, sl, U,
|
||||
"NumForm_roman ": NumForm_roman, # cz, sl, U,
|
||||
"NumForm_word ": NumForm_word, # cz, sl, U,
|
||||
"NumValue_one ": NumValue_one, # cz, U,
|
||||
"NumValue_two ": NumValue_two, # cz, U,
|
||||
"NumValue_three ": NumValue_three, # cz, U,
|
||||
"PartForm_pres ": PartForm_pres, # fi,
|
||||
"PartForm_past ": PartForm_past, # fi,
|
||||
"PartForm_agt ": PartForm_agt, # fi,
|
||||
"PartForm_neg ": PartForm_neg, # fi,
|
||||
"PartType_mod ": PartType_mod, # U,
|
||||
"PartType_emp ": PartType_emp, # U,
|
||||
"PartType_res ": PartType_res, # U,
|
||||
"PartType_inf ": PartType_inf, # U,
|
||||
"PartType_vbp ": PartType_vbp, # U,
|
||||
"Person_abs_one ": Person_abs_one, # bq, U,
|
||||
"Person_abs_two ": Person_abs_two, # bq, U,
|
||||
"Person_abs_three ": Person_abs_three, # bq, U,
|
||||
"Person_dat_one ": Person_dat_one, # bq, U,
|
||||
"Person_dat_two ": Person_dat_two, # bq, U,
|
||||
"Person_dat_three ": Person_dat_three, # bq, U,
|
||||
"Person_erg_one ": Person_erg_one, # bq, U,
|
||||
"Person_erg_two ": Person_erg_two, # bq, U,
|
||||
"Person_erg_three ": Person_erg_three, # bq, U,
|
||||
"Person_psor_one ": Person_psor_one, # fi, U,
|
||||
"Person_psor_two ": Person_psor_two, # fi, U,
|
||||
"Person_psor_three ": Person_psor_three, # fi, U,
|
||||
"Polite_inf ": Polite_inf, # bq, U,
|
||||
"Polite_pol ": Polite_pol, # bq, U,
|
||||
"Polite_abs_inf ": Polite_abs_inf, # bq, U,
|
||||
"Polite_abs_pol ": Polite_abs_pol, # bq, U,
|
||||
"Polite_erg_inf ": Polite_erg_inf, # bq, U,
|
||||
"Polite_erg_pol ": Polite_erg_pol, # bq, U,
|
||||
"Polite_dat_inf ": Polite_dat_inf, # bq, U,
|
||||
"Polite_dat_pol ": Polite_dat_pol, # bq, U,
|
||||
"Prefix_yes ": Prefix_yes, # U,
|
||||
"PrepCase_npr ": PrepCase_npr, # cz,
|
||||
"PrepCase_pre ": PrepCase_pre, # U,
|
||||
"PunctSide_ini ": PunctSide_ini, # U,
|
||||
"PunctSide_fin ": PunctSide_fin, # U,
|
||||
"PunctType_peri ": PunctType_peri, # U,
|
||||
"PunctType_qest ": PunctType_qest, # U,
|
||||
"PunctType_excl ": PunctType_excl, # U,
|
||||
"PunctType_quot ": PunctType_quot, # U,
|
||||
"PunctType_brck ": PunctType_brck, # U,
|
||||
"PunctType_comm ": PunctType_comm, # U,
|
||||
"PunctType_colo ": PunctType_colo, # U,
|
||||
"PunctType_semi ": PunctType_semi, # U,
|
||||
"PunctType_dash ": PunctType_dash, # U,
|
||||
"Style_arch ": Style_arch, # cz, fi, U,
|
||||
"Style_rare ": Style_rare, # cz, fi, U,
|
||||
"Style_poet ": Style_poet, # cz, U,
|
||||
"Style_norm ": Style_norm, # cz, U,
|
||||
"Style_coll ": Style_coll, # cz, U,
|
||||
"Style_vrnc ": Style_vrnc, # cz, U,
|
||||
"Style_sing ": Style_sing, # cz, U,
|
||||
"Style_expr ": Style_expr, # cz, U,
|
||||
"Style_derg ": Style_derg, # cz, U,
|
||||
"Style_vulg ": Style_vulg, # cz, U,
|
||||
"Style_yes ": Style_yes, # fi, U,
|
||||
"StyleVariant_styleShort ": StyleVariant_styleShort, # cz,
|
||||
"StyleVariant_styleBound ": StyleVariant_styleBound, # cz, sl,
|
||||
"VerbType_aux ": VerbType_aux, # U,
|
||||
"VerbType_cop ": VerbType_cop, # U,
|
||||
"VerbType_mod ": VerbType_mod, # U,
|
||||
"VerbType_light ": VerbType_light, # U,
|
||||
"ConjType_oper ": ConjType_oper, # cz, U,
|
||||
"ConjType_comp ": ConjType_comp, # cz, U,
|
||||
"Connegative_yes ": Connegative_yes, # fi,
|
||||
"Derivation_minen ": Derivation_minen, # fi,
|
||||
"Derivation_sti ": Derivation_sti, # fi,
|
||||
"Derivation_inen ": Derivation_inen, # fi,
|
||||
"Derivation_lainen ": Derivation_lainen, # fi,
|
||||
"Derivation_ja ": Derivation_ja, # fi,
|
||||
"Derivation_ton ": Derivation_ton, # fi,
|
||||
"Derivation_vs ": Derivation_vs, # fi,
|
||||
"Derivation_ttain ": Derivation_ttain, # fi,
|
||||
"Derivation_ttaa ": Derivation_ttaa, # fi,
|
||||
"Echo_rdp ": Echo_rdp, # U,
|
||||
"Echo_ech ": Echo_ech, # U,
|
||||
"Foreign_foreign ": Foreign_foreign, # cz, fi, U,
|
||||
"Foreign_fscript ": Foreign_fscript, # cz, fi, U,
|
||||
"Foreign_tscript ": Foreign_tscript, # cz, U,
|
||||
"Foreign_yes ": Foreign_yes, # sl,
|
||||
"Gender_dat_masc ": Gender_dat_masc, # bq, U,
|
||||
"Gender_dat_fem ": Gender_dat_fem, # bq, U,
|
||||
"Gender_erg_masc ": Gender_erg_masc, # bq,
|
||||
"Gender_erg_fem ": Gender_erg_fem, # bq,
|
||||
"Gender_psor_masc ": Gender_psor_masc, # cz, sl, U,
|
||||
"Gender_psor_fem ": Gender_psor_fem, # cz, sl, U,
|
||||
"Gender_psor_neut ": Gender_psor_neut, # sl,
|
||||
"Hyph_yes ": Hyph_yes, # cz, U,
|
||||
"InfForm_one ": InfForm_one, # fi,
|
||||
"InfForm_two ": InfForm_two, # fi,
|
||||
"InfForm_three ": InfForm_three, # fi,
|
||||
"NameType_geo ": NameType_geo, # U, cz,
|
||||
"NameType_prs ": NameType_prs, # U, cz,
|
||||
"NameType_giv ": NameType_giv, # U, cz,
|
||||
"NameType_sur ": NameType_sur, # U, cz,
|
||||
"NameType_nat ": NameType_nat, # U, cz,
|
||||
"NameType_com ": NameType_com, # U, cz,
|
||||
"NameType_pro ": NameType_pro, # U, cz,
|
||||
"NameType_oth ": NameType_oth, # U, cz,
|
||||
"NounType_com ": NounType_com, # U,
|
||||
"NounType_prop ": NounType_prop, # U,
|
||||
"NounType_class ": NounType_class, # U,
|
||||
"Number_abs_sing ": Number_abs_sing, # bq, U,
|
||||
"Number_abs_plur ": Number_abs_plur, # bq, U,
|
||||
"Number_dat_sing ": Number_dat_sing, # bq, U,
|
||||
"Number_dat_plur ": Number_dat_plur, # bq, U,
|
||||
"Number_erg_sing ": Number_erg_sing, # bq, U,
|
||||
"Number_erg_plur ": Number_erg_plur, # bq, U,
|
||||
"Number_psee_sing ": Number_psee_sing, # U,
|
||||
"Number_psee_plur ": Number_psee_plur, # U,
|
||||
"Number_psor_sing ": Number_psor_sing, # cz, fi, sl, U,
|
||||
"Number_psor_plur ": Number_psor_plur, # cz, fi, sl, U,
|
||||
"NumForm_digit ": NumForm_digit, # cz, sl, U,
|
||||
"NumForm_roman ": NumForm_roman, # cz, sl, U,
|
||||
"NumForm_word ": NumForm_word, # cz, sl, U,
|
||||
"NumValue_one ": NumValue_one, # cz, U,
|
||||
"NumValue_two ": NumValue_two, # cz, U,
|
||||
"NumValue_three ": NumValue_three, # cz, U,
|
||||
"PartForm_pres ": PartForm_pres, # fi,
|
||||
"PartForm_past ": PartForm_past, # fi,
|
||||
"PartForm_agt ": PartForm_agt, # fi,
|
||||
"PartForm_neg ": PartForm_neg, # fi,
|
||||
"PartType_mod ": PartType_mod, # U,
|
||||
"PartType_emp ": PartType_emp, # U,
|
||||
"PartType_res ": PartType_res, # U,
|
||||
"PartType_inf ": PartType_inf, # U,
|
||||
"PartType_vbp ": PartType_vbp, # U,
|
||||
"Person_abs_one ": Person_abs_one, # bq, U,
|
||||
"Person_abs_two ": Person_abs_two, # bq, U,
|
||||
"Person_abs_three ": Person_abs_three, # bq, U,
|
||||
"Person_dat_one ": Person_dat_one, # bq, U,
|
||||
"Person_dat_two ": Person_dat_two, # bq, U,
|
||||
"Person_dat_three ": Person_dat_three, # bq, U,
|
||||
"Person_erg_one ": Person_erg_one, # bq, U,
|
||||
"Person_erg_two ": Person_erg_two, # bq, U,
|
||||
"Person_erg_three ": Person_erg_three, # bq, U,
|
||||
"Person_psor_one ": Person_psor_one, # fi, U,
|
||||
"Person_psor_two ": Person_psor_two, # fi, U,
|
||||
"Person_psor_three ": Person_psor_three, # fi, U,
|
||||
"Polite_inf ": Polite_inf, # bq, U,
|
||||
"Polite_pol ": Polite_pol, # bq, U,
|
||||
"Polite_abs_inf ": Polite_abs_inf, # bq, U,
|
||||
"Polite_abs_pol ": Polite_abs_pol, # bq, U,
|
||||
"Polite_erg_inf ": Polite_erg_inf, # bq, U,
|
||||
"Polite_erg_pol ": Polite_erg_pol, # bq, U,
|
||||
"Polite_dat_inf ": Polite_dat_inf, # bq, U,
|
||||
"Polite_dat_pol ": Polite_dat_pol, # bq, U,
|
||||
"Prefix_yes ": Prefix_yes, # U,
|
||||
"PrepCase_npr ": PrepCase_npr, # cz,
|
||||
"PrepCase_pre ": PrepCase_pre, # U,
|
||||
"PunctSide_ini ": PunctSide_ini, # U,
|
||||
"PunctSide_fin ": PunctSide_fin, # U,
|
||||
"PunctType_peri ": PunctType_peri, # U,
|
||||
"PunctType_qest ": PunctType_qest, # U,
|
||||
"PunctType_excl ": PunctType_excl, # U,
|
||||
"PunctType_quot ": PunctType_quot, # U,
|
||||
"PunctType_brck ": PunctType_brck, # U,
|
||||
"PunctType_comm ": PunctType_comm, # U,
|
||||
"PunctType_colo ": PunctType_colo, # U,
|
||||
"PunctType_semi ": PunctType_semi, # U,
|
||||
"PunctType_dash ": PunctType_dash, # U,
|
||||
"Style_arch ": Style_arch, # cz, fi, U,
|
||||
"Style_rare ": Style_rare, # cz, fi, U,
|
||||
"Style_poet ": Style_poet, # cz, U,
|
||||
"Style_norm ": Style_norm, # cz, U,
|
||||
"Style_coll ": Style_coll, # cz, U,
|
||||
"Style_vrnc ": Style_vrnc, # cz, U,
|
||||
"Style_sing ": Style_sing, # cz, U,
|
||||
"Style_expr ": Style_expr, # cz, U,
|
||||
"Style_derg ": Style_derg, # cz, U,
|
||||
"Style_vulg ": Style_vulg, # cz, U,
|
||||
"Style_yes ": Style_yes, # fi, U,
|
||||
"StyleVariant_styleShort ": StyleVariant_styleShort, # cz,
|
||||
"StyleVariant_styleBound ": StyleVariant_styleBound, # cz, sl,
|
||||
"VerbType_aux ": VerbType_aux, # U,
|
||||
"VerbType_cop ": VerbType_cop, # U,
|
||||
"VerbType_mod ": VerbType_mod, # U,
|
||||
"VerbType_light ": VerbType_light, # U,
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -8,7 +8,7 @@ IDS = {
|
|||
"ADP": ADP,
|
||||
"ADV": ADV,
|
||||
"AUX": AUX,
|
||||
"CONJ": CONJ, # U20
|
||||
"CONJ": CONJ, # U20
|
||||
"CCONJ": CCONJ,
|
||||
"DET": DET,
|
||||
"INTJ": INTJ,
|
||||
|
|
|
@ -85,7 +85,6 @@ class Scorer(object):
|
|||
|
||||
def score(self, tokens, gold, verbose=False, punct_labels=('p', 'punct')):
|
||||
assert len(tokens) == len(gold)
|
||||
|
||||
gold_deps = set()
|
||||
gold_tags = set()
|
||||
gold_ents = set(tags_to_entities([annot[-1]
|
||||
|
|
|
@ -4,19 +4,15 @@ from __future__ import unicode_literals, absolute_import
|
|||
|
||||
cimport cython
|
||||
from libc.string cimport memcpy
|
||||
from libc.stdint cimport uint64_t, uint32_t
|
||||
from murmurhash.mrmr cimport hash64, hash32
|
||||
from preshed.maps cimport map_iter, key_t
|
||||
from libc.stdint cimport uint32_t
|
||||
from murmurhash.mrmr cimport hash64, hash32
|
||||
import ujson
|
||||
import dill
|
||||
|
||||
from .symbols import IDS as SYMBOLS_BY_STR
|
||||
from .symbols import NAMES as SYMBOLS_BY_INT
|
||||
|
||||
from .typedefs cimport hash_t
|
||||
from . import util
|
||||
from .compat import json_dumps
|
||||
from . import util
|
||||
|
||||
|
||||
cpdef hash_t hash_string(unicode string) except 0:
|
||||
|
@ -195,7 +191,7 @@ cdef class StringStore:
|
|||
"""Save the current state to a directory.
|
||||
|
||||
path (unicode or Path): A path to a directory, which will be created if
|
||||
it doesn't exist. Paths may be either strings or `Path`-like objects.
|
||||
it doesn't exist. Paths may be either strings or Path-like objects.
|
||||
"""
|
||||
path = util.ensure_path(path)
|
||||
strings = list(self)
|
||||
|
@ -225,7 +221,7 @@ cdef class StringStore:
|
|||
**exclude: Named attributes to prevent from being serialized.
|
||||
RETURNS (bytes): The serialized form of the `StringStore` object.
|
||||
"""
|
||||
return ujson.dumps(list(self))
|
||||
return json_dumps(list(self))
|
||||
|
||||
def from_bytes(self, bytes_data, **exclude):
|
||||
"""Load state from a binary string.
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
# coding: utf8
|
||||
#cython: optimize.unpack_method_calls=False
|
||||
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
IDS = {
|
||||
"": NIL,
|
||||
"IS_ALPHA": IS_ALPHA,
|
||||
|
@ -464,9 +464,11 @@ IDS = {
|
|||
"LAW": LAW
|
||||
}
|
||||
|
||||
|
||||
def sort_nums(x):
|
||||
return x[1]
|
||||
|
||||
|
||||
NAMES = [it[0] for it in sorted(IDS.items(), key=sort_nums)]
|
||||
# Unfortunate hack here, to work around problem with long cpdef enum
|
||||
# (which is generating an enormous amount of C++ in Cython 0.24+)
|
||||
|
|
|
@ -8,12 +8,11 @@ from cython.operator cimport preincrement as preinc
|
|||
from cymem.cymem cimport Pool
|
||||
from preshed.maps cimport PreshMap
|
||||
import regex as re
|
||||
|
||||
from .strings cimport hash_string
|
||||
from . import util
|
||||
cimport cython
|
||||
|
||||
from .tokens.doc cimport Doc
|
||||
from .strings cimport hash_string
|
||||
from . import util
|
||||
|
||||
|
||||
cdef class Tokenizer:
|
||||
|
@ -21,7 +20,7 @@ cdef class Tokenizer:
|
|||
boundaries.
|
||||
"""
|
||||
def __init__(self, Vocab vocab, rules=None, prefix_search=None,
|
||||
suffix_search=None, infix_finditer=None, token_match=None):
|
||||
suffix_search=None, infix_finditer=None, token_match=None):
|
||||
"""Create a `Tokenizer`, to create `Doc` objects given unicode text.
|
||||
|
||||
vocab (Vocab): A storage container for lexical types.
|
||||
|
@ -74,9 +73,8 @@ cdef class Tokenizer:
|
|||
RETURNS (Doc): A container for linguistic annotations.
|
||||
"""
|
||||
if len(string) >= (2 ** 30):
|
||||
raise ValueError(
|
||||
"String is too long: %d characters. Max is 2**30." % len(string)
|
||||
)
|
||||
msg = "String is too long: %d characters. Max is 2**30."
|
||||
raise ValueError(msg % len(string))
|
||||
cdef int length = len(string)
|
||||
cdef Doc doc = Doc(self.vocab)
|
||||
if length == 0:
|
||||
|
@ -122,8 +120,8 @@ cdef class Tokenizer:
|
|||
"""Tokenize a stream of texts.
|
||||
|
||||
texts: A sequence of unicode texts.
|
||||
batch_size (int): The number of texts to accumulate in an internal buffer.
|
||||
n_threads (int): The number of threads to use, if the implementation
|
||||
batch_size (int): Number of texts to accumulate in an internal buffer.
|
||||
n_threads (int): Number of threads to use, if the implementation
|
||||
supports multi-threading. The default tokenizer is single-threaded.
|
||||
YIELDS (Doc): A sequence of Doc objects, in order.
|
||||
"""
|
||||
|
@ -232,8 +230,8 @@ cdef class Tokenizer:
|
|||
if not matches:
|
||||
tokens.push_back(self.vocab.get(tokens.mem, string), False)
|
||||
else:
|
||||
# let's say we have dyn-o-mite-dave
|
||||
# the regex finds the start and end positions of the hyphens
|
||||
# let's say we have dyn-o-mite-dave - the regex finds the
|
||||
# start and end positions of the hyphens
|
||||
start = 0
|
||||
for match in matches:
|
||||
infix_start = match.start()
|
||||
|
@ -293,8 +291,8 @@ cdef class Tokenizer:
|
|||
return list(self.infix_finditer(string))
|
||||
|
||||
def find_prefix(self, unicode string):
|
||||
"""Find the length of a prefix that should be segmented from the string,
|
||||
or None if no prefix rules match.
|
||||
"""Find the length of a prefix that should be segmented from the
|
||||
string, or None if no prefix rules match.
|
||||
|
||||
string (unicode): The string to segment.
|
||||
RETURNS (int): The length of the prefix if present, otherwise `None`.
|
||||
|
@ -305,8 +303,8 @@ cdef class Tokenizer:
|
|||
return (match.end() - match.start()) if match is not None else 0
|
||||
|
||||
def find_suffix(self, unicode string):
|
||||
"""Find the length of a suffix that should be segmented from the string,
|
||||
or None if no suffix rules match.
|
||||
"""Find the length of a suffix that should be segmented from the
|
||||
string, or None if no suffix rules match.
|
||||
|
||||
string (unicode): The string to segment.
|
||||
Returns (int): The length of the suffix if present, otherwise `None`.
|
||||
|
@ -326,8 +324,8 @@ cdef class Tokenizer:
|
|||
|
||||
string (unicode): The string to specially tokenize.
|
||||
token_attrs (iterable): A sequence of dicts, where each dict describes
|
||||
a token and its attributes. The `ORTH` fields of the attributes must
|
||||
exactly match the string when they are concatenated.
|
||||
a token and its attributes. The `ORTH` fields of the attributes
|
||||
must exactly match the string when they are concatenated.
|
||||
"""
|
||||
substrings = list(substrings)
|
||||
cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
|
||||
|
@ -343,7 +341,7 @@ cdef class Tokenizer:
|
|||
"""Save the current state to a directory.
|
||||
|
||||
path (unicode or Path): A path to a directory, which will be created if
|
||||
it doesn't exist. Paths may be either strings or `Path`-like objects.
|
||||
it doesn't exist. Paths may be either strings or Path-like objects.
|
||||
"""
|
||||
with path.open('wb') as file_:
|
||||
file_.write(self.to_bytes(**exclude))
|
||||
|
|
|
@ -476,7 +476,7 @@ cdef class Span:
|
|||
"""
|
||||
# TODO: implement
|
||||
def __get__(self):
|
||||
raise NotImplementedError()
|
||||
raise NotImplementedError
|
||||
|
||||
property n_rights:
|
||||
"""RETURNS (int): The number of rightward immediate children of the
|
||||
|
@ -484,7 +484,7 @@ cdef class Span:
|
|||
"""
|
||||
# TODO: implement
|
||||
def __get__(self):
|
||||
raise NotImplementedError()
|
||||
raise NotImplementedError
|
||||
|
||||
property subtree:
|
||||
"""Tokens that descend from tokens in the span, but fall outside it.
|
||||
|
|
|
@ -1 +0,0 @@
|
|||
|
|
@ -17,8 +17,8 @@ from .compat import copy_reg, basestring_
|
|||
from .lemmatizer import Lemmatizer
|
||||
from .attrs import intify_attrs
|
||||
from .vectors import Vectors
|
||||
from . import util
|
||||
from ._ml import link_vectors_to_models
|
||||
from . import util
|
||||
|
||||
|
||||
cdef class Vocab:
|
||||
|
|
Loading…
Reference in New Issue
Block a user