Tidy up rest

ines 2017-10-27 21:07:59 +02:00
parent a8e10f94e4
commit d96e72f656
14 changed files with 233 additions and 261 deletions

View File

@@ -8,11 +8,9 @@ from thinc.t2t import ExtractWindow, ParametricAttention
from thinc.t2v import Pooling, sum_pool
from thinc.misc import Residual
from thinc.misc import LayerNorm as LN
from thinc.api import add, layerize, chain, clone, concatenate, with_flatten
from thinc.api import FeatureExtracter, with_getitem, flatten_add_lengths
from thinc.api import uniqued, wrap, noop
from thinc.linear.linear import LinearModel
from thinc.neural.ops import NumpyOps, CupyOps
from thinc.neural.util import get_array_module

View File

@@ -101,17 +101,12 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
"""
Normalize a dictionary of attributes, converting them to ints.
- Arguments:
- stringy_attrs (dict):
- Dictionary keyed by attribute string names. Values can be ints or strings.
- strings_map (StringStore):
- Defaults to None. If provided, encodes string values into ints.
- Returns:
- inty_attrs (dict):
- Attributes dictionary with keys and optionally values converted to
- ints.
+ stringy_attrs (dict): Dictionary keyed by attribute string names. Values
+ can be ints or strings.
+ strings_map (StringStore): Defaults to None. If provided, encodes string
+ values into ints.
+ RETURNS (dict): Attributes dictionary with keys and optionally values
+ converted to ints.
"""
inty_attrs = {}
if _do_deprecated:
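A minimal sketch of the normalized signature in use (illustrative values; assumes the spaCy 2.x module layout shown in this diff):

    from spacy.attrs import intify_attrs
    from spacy.strings import StringStore

    strings = StringStore()
    # String keys are mapped to attribute IDs; string values are encoded via the store.
    inty = intify_attrs({'LEMMA': 'hello', 'IS_ALPHA': True}, strings_map=strings)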

View File

@@ -2,7 +2,6 @@
# coding: utf8
from __future__ import unicode_literals, print_function
- import io
import re
import ujson
import random
@@ -10,9 +9,8 @@ import cytoolz
import itertools
from .syntax import nonproj
- from .util import ensure_path
- from . import util
from .tokens import Doc
+ from . import util
def tags_to_entities(tags):
@@ -310,7 +308,7 @@ def _corrupt(c, noise_level):
def read_json_file(loc, docs_filter=None, limit=None):
- loc = ensure_path(loc)
+ loc = util.ensure_path(loc)
if loc.is_dir():
for filename in loc.iterdir():
yield from read_json_file(loc / filename, limit=limit)

View File

@@ -1,22 +1,22 @@
# coding: utf8
from __future__ import absolute_import, unicode_literals
- from contextlib import contextmanager
- import copy
- from thinc.neural import Model
- from thinc.neural.optimizers import Adam
import random
import ujson
- from collections import OrderedDict
import itertools
import weakref
import functools
+ from collections import OrderedDict
+ from contextlib import contextmanager
+ from copy import copy
+ from thinc.neural import Model
+ from thinc.neural.optimizers import Adam
from .tokenizer import Tokenizer
from .vocab import Vocab
from .lemmatizer import Lemmatizer
- from .pipeline import DependencyParser, Tensorizer, Tagger
- from .pipeline import EntityRecognizer, SimilarityHook, TextCategorizer
+ from .pipeline import DependencyParser, Tensorizer, Tagger, EntityRecognizer
+ from .pipeline import SimilarityHook, TextCategorizer
from .compat import json_dumps, izip
from .scorer import Scorer
from ._ml import link_vectors_to_models
@@ -649,7 +649,7 @@ class Language(object):
serializers = OrderedDict((
('vocab', lambda: self.vocab.to_bytes()),
('tokenizer', lambda: self.tokenizer.to_bytes(vocab=False)),
- ('meta', lambda: ujson.dumps(self.meta))
+ ('meta', lambda: json_dumps(self.meta))
))
for i, (name, proc) in enumerate(self.pipeline):
if name in disable:
@@ -689,7 +689,7 @@ class DisabledPipes(list):
# Important! Not deep copy -- we just want the container (but we also
# want to support people providing arbitrarily typed nlp.pipeline
# objects.)
- self.original_pipeline = copy.copy(nlp.pipeline)
+ self.original_pipeline = copy(nlp.pipeline)
list.__init__(self)
self.extend(nlp.remove_pipe(name) for name in names)
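The DisabledPipes container above backs the user-facing disable_pipes() helper; a small usage sketch (the model name is an assumption, any installed pipeline would do):

    import spacy

    nlp = spacy.load('en_core_web_sm')  # assumed model
    with nlp.disable_pipes('tagger', 'parser'):
        doc = nlp(u'Only the remaining components run inside this block.')
    # On exiting the block, the original pipeline is restored from the copied container.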

View File

@@ -4,12 +4,6 @@
from __future__ import unicode_literals
import ujson
- from .typedefs cimport attr_t
- from .typedefs cimport hash_t
- from .attrs cimport attr_id_t
- from .structs cimport TokenC
from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
from libcpp.vector cimport vector
@@ -17,14 +11,15 @@ from libcpp.pair cimport pair
from murmurhash.mrmr cimport hash64
from libc.stdint cimport int32_t
- from .attrs cimport ID, NULL_ATTR, ENT_TYPE
- from . import attrs
- from .tokens.doc cimport get_token_attr
- from .tokens.doc cimport Doc
+ from .typedefs cimport attr_t
+ from .typedefs cimport hash_t
+ from .structs cimport TokenC
+ from .tokens.doc cimport Doc, get_token_attr
from .vocab cimport Vocab
+ from .attrs import IDS
+ from .attrs cimport attr_id_t, ID, NULL_ATTR
from .attrs import FLAG61 as U_ENT
from .attrs import FLAG60 as B2_ENT
from .attrs import FLAG59 as B3_ENT
from .attrs import FLAG58 as B4_ENT
@@ -34,7 +29,6 @@ from .attrs import FLAG55 as B7_ENT
from .attrs import FLAG54 as B8_ENT
from .attrs import FLAG53 as B9_ENT
from .attrs import FLAG52 as B10_ENT
from .attrs import FLAG51 as I3_ENT
from .attrs import FLAG50 as I4_ENT
from .attrs import FLAG49 as I5_ENT
@@ -43,7 +37,6 @@ from .attrs import FLAG47 as I7_ENT
from .attrs import FLAG46 as I8_ENT
from .attrs import FLAG45 as I9_ENT
from .attrs import FLAG44 as I10_ENT
from .attrs import FLAG43 as L2_ENT
from .attrs import FLAG42 as L3_ENT
from .attrs import FLAG41 as L4_ENT
@@ -153,7 +146,7 @@ cdef int get_action(const TokenPatternC* pattern, const TokenC* token) nogil:
def _convert_strings(token_specs, string_store):
# Support 'syntactic sugar' operator '+', as combination of ONE, ZERO_PLUS
operators = {'!': (ZERO,), '*': (ZERO_PLUS,), '+': (ONE, ZERO_PLUS),
'?': (ZERO_ONE,), '1': (ONE,)}
tokens = []
op = ONE
for spec in token_specs:
@@ -168,10 +161,10 @@ def _convert_strings(token_specs, string_store):
if value in operators:
ops = operators[value]
else:
- raise KeyError(
- "Unknown operator '%s'. Options: %s" % (value, ', '.join(operators.keys())))
+ msg = "Unknown operator '%s'. Options: %s"
+ raise KeyError(msg % (value, ', '.join(operators.keys())))
if isinstance(attr, basestring):
- attr = attrs.IDS.get(attr.upper())
+ attr = IDS.get(attr.upper())
if isinstance(value, basestring):
value = string_store.add(value)
if isinstance(value, bool):
@@ -186,7 +179,7 @@ def _convert_strings(token_specs, string_store):
def merge_phrase(matcher, doc, i, matches):
"""Callback to merge a phrase on match."""
ent_id, label, start, end = matches[i]
- span = doc[start : end]
+ span = doc[start:end]
span.merge(ent_type=label, ent_id=ent_id)
@@ -233,13 +226,13 @@ cdef class Matcher:
return self._normalize_key(key) in self._patterns
def add(self, key, on_match, *patterns):
- """Add a match-rule to the matcher. A match-rule consists of: an ID key,
- an on_match callback, and one or more patterns.
+ """Add a match-rule to the matcher. A match-rule consists of: an ID
+ key, an on_match callback, and one or more patterns.
If the key exists, the patterns are appended to the previous ones, and
- the previous on_match callback is replaced. The `on_match` callback will
- receive the arguments `(matcher, doc, i, matches)`. You can also set
- `on_match` to `None` to not perform any actions.
+ the previous on_match callback is replaced. The `on_match` callback
+ will receive the arguments `(matcher, doc, i, matches)`. You can also
+ set `on_match` to `None` to not perform any actions.
A pattern consists of one or more `token_specs`, where a `token_spec`
is a dictionary mapping attribute IDs to values, and optionally a
@@ -253,8 +246,8 @@ cdef class Matcher:
The + and * operators are usually interpretted "greedily", i.e. longer
matches are returned where possible. However, if you specify two '+'
and '*' patterns in a row and their matches overlap, the first
- operator will behave non-greedily. This quirk in the semantics
- makes the matcher more efficient, by avoiding the need for back-tracking.
+ operator will behave non-greedily. This quirk in the semantics makes
+ the matcher more efficient, by avoiding the need for back-tracking.
key (unicode): The match ID.
on_match (callable): Callback executed on match.
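A minimal usage sketch of the add()/__call__ API documented above (the rule name and pattern are illustrative):

    from spacy.lang.en import English
    from spacy.matcher import Matcher

    nlp = English()
    matcher = Matcher(nlp.vocab)
    # 'OP': '+' uses the quantifier sugar handled by _convert_strings() above.
    pattern = [{'LOWER': 'hello'}, {'IS_PUNCT': True, 'OP': '+'}, {'LOWER': 'world'}]
    matcher.add('HelloWorld', None, pattern)
    doc = nlp(u'Hello, world!')
    matches = matcher(doc)  # [(match_id, start, end), ...]

matcher.pipe(docs) streams a sequence of documents through the same rules, as described further below.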
@@ -268,7 +261,6 @@ cdef class Matcher:
key = self._normalize_key(key)
self._patterns.setdefault(key, [])
self._callbacks[key] = on_match
for pattern in patterns:
specs = _convert_strings(pattern, self.vocab.strings)
self.patterns.push_back(init_pattern(self.mem, key, specs))
@@ -315,9 +307,9 @@ cdef class Matcher:
"""Match a stream of documents, yielding them in turn.
docs (iterable): A stream of documents.
- batch_size (int): The number of documents to accumulate into a working set.
+ batch_size (int): Number of documents to accumulate into a working set.
n_threads (int): The number of threads with which to work on the buffer
- in parallel, if the `Matcher` implementation supports multi-threading.
+ in parallel, if the implementation supports multi-threading.
YIELDS (Doc): Documents, in order.
"""
for doc in docs:
@@ -325,7 +317,7 @@
yield doc
def __call__(self, Doc doc):
- """Find all token sequences matching the supplied patterns on the `Doc`.
+ """Find all token sequences matching the supplied pattern.
doc (Doc): The document to match over.
RETURNS (list): A list of `(key, start, end)` tuples,
@@ -342,8 +334,8 @@
for token_i in range(doc.length):
token = &doc.c[token_i]
q = 0
- # Go over the open matches, extending or finalizing if able. Otherwise,
- # we over-write them (q doesn't advance)
+ # Go over the open matches, extending or finalizing if able.
+ # Otherwise, we over-write them (q doesn't advance)
for state in partials:
action = get_action(state.second, token)
if action == PANIC:
@@ -356,8 +348,8 @@
if action == REPEAT:
# Leave the state in the queue, and advance to next slot
- # (i.e. we don't overwrite -- we want to greedily match more
- # pattern.
+ # (i.e. we don't overwrite -- we want to greedily match
+ # more pattern.
q += 1
elif action == REJECT:
pass
@@ -366,8 +358,8 @@
partials[q].second += 1
q += 1
elif action in (ACCEPT, ACCEPT_PREV):
- # TODO: What to do about patterns starting with ZERO? Need to
- # adjust the start position.
+ # TODO: What to do about patterns starting with ZERO? Need
+ # to adjust the start position.
start = state.first
end = token_i+1 if action == ACCEPT else token_i
ent_id = state.second[1].attrs[0].value
@@ -388,8 +380,8 @@
state.second = pattern
partials.push_back(state)
elif action == ADVANCE:
- # TODO: What to do about patterns starting with ZERO? Need to
- # adjust the start position.
+ # TODO: What to do about patterns starting with ZERO? Need
+ # to adjust the start position.
state.first = token_i
state.second = pattern + 1
partials.push_back(state)
@@ -413,7 +405,6 @@
on_match = self._callbacks.get(ent_id)
if on_match is not None:
on_match(self, doc, i, matches)
- # TODO: only return (match_id, start, end)
return matches
def _normalize_key(self, key):
@@ -441,7 +432,8 @@ def get_bilou(length):
elif length == 8:
return [B8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, L8_ENT]
elif length == 9:
- return [B9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, L9_ENT]
+ return [B9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT,
+         L9_ENT]
elif length == 10:
return [B10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT,
I10_ENT, I10_ENT, L10_ENT]
@@ -454,10 +446,8 @@ cdef class PhraseMatcher:
cdef Vocab vocab
cdef Matcher matcher
cdef PreshMap phrase_ids
cdef int max_length
cdef attr_t* _phrase_key
cdef public object _callbacks
cdef public object _patterns
@@ -470,7 +460,8 @@ cdef class PhraseMatcher:
self.phrase_ids = PreshMap()
abstract_patterns = []
for length in range(1, max_length):
- abstract_patterns.append([{tag: True} for tag in get_bilou(length)])
+ abstract_patterns.append([{tag: True}
+                           for tag in get_bilou(length)])
self.matcher.add('Candidate', None, *abstract_patterns)
self._callbacks = {}
@@ -496,8 +487,8 @@ cdef class PhraseMatcher:
return (self.__class__, (self.vocab,), None, None)
def add(self, key, on_match, *docs):
- """Add a match-rule to the matcher. A match-rule consists of: an ID key,
- an on_match callback, and one or more patterns.
+ """Add a match-rule to the matcher. A match-rule consists of: an ID
+ key, an on_match callback, and one or more patterns.
key (unicode): The match ID.
on_match (callable): Callback executed on match.
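An analogous sketch for the PhraseMatcher add() API (names are illustrative; patterns are plain Doc objects):

    from spacy.lang.en import English
    from spacy.matcher import PhraseMatcher

    nlp = English()
    matcher = PhraseMatcher(nlp.vocab)
    patterns = [nlp(text) for text in (u'Barack Obama', u'Angela Merkel')]
    matcher.add('PERSON_PHRASES', None, *patterns)
    matches = matcher(nlp(u'Barack Obama met Angela Merkel.'))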
@@ -513,7 +504,6 @@ cdef class PhraseMatcher:
raise ValueError(msg % (len(doc), self.max_length))
cdef hash_t ent_id = self.matcher._normalize_key(key)
self._callbacks[ent_id] = on_match
cdef int length
cdef int i
cdef hash_t phrase_hash
@@ -553,9 +543,9 @@
"""Match a stream of documents, yielding them in turn.
docs (iterable): A stream of documents.
- batch_size (int): The number of documents to accumulate into a working set.
+ batch_size (int): Number of documents to accumulate into a working set.
n_threads (int): The number of threads with which to work on the buffer
- in parallel, if the `Matcher` implementation supports multi-threading.
+ in parallel, if the implementation supports multi-threading.
YIELDS (Doc): Documents, in order.
"""
for doc in stream:
@@ -569,7 +559,8 @@
self._phrase_key[i] = 0
for i, j in enumerate(range(start, end)):
self._phrase_key[i] = doc.c[j].lex.orth
- cdef hash_t key = hash64(self._phrase_key, self.max_length * sizeof(attr_t), 0)
+ cdef hash_t key = hash64(self._phrase_key,
+                          self.max_length * sizeof(attr_t), 0)
ent_id = <hash_t>self.phrase_ids.get(key)
if ent_id == 0:
return None

View File

@@ -4,17 +4,15 @@ from __future__ import unicode_literals
from libc.string cimport memset
- from .parts_of_speech cimport ADJ, VERB, NOUN, PUNCT, SPACE
from .attrs cimport POS, IS_SPACE
- from .attrs import LEMMA, intify_attrs
+ from .parts_of_speech cimport SPACE
from .parts_of_speech import IDS as POS_IDS
from .lexeme cimport Lexeme
+ from .attrs import LEMMA, intify_attrs
def _normalize_props(props):
- """
- Transform deprecated string keys to correct names.
- """
+ """Transform deprecated string keys to correct names."""
out = {}
for key, value in props.items():
if key == POS:
@@ -77,7 +75,8 @@ cdef class Morphology:
cdef int assign_untagged(self, TokenC* token) except -1:
"""Set morphological attributes on a token without a POS tag. Uses
the lemmatizer's lookup() method, which looks up the string in the
- table provided by the language data as lemma_lookup (if available)."""
+ table provided by the language data as lemma_lookup (if available).
+ """
if token.lemma == 0:
orth_str = self.strings[token.lex.orth]
lemma = self.lemmatizer.lookup(orth_str)
@@ -95,11 +94,10 @@ cdef class Morphology:
cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1:
if tag_id > self.n_tags:
raise ValueError("Unknown tag ID: %s" % tag_id)
- # TODO: It's pretty arbitrary to put this logic here. I guess the justification
- # is that this is where the specific word and the tag interact. Still,
- # we should have a better way to enforce this rule, or figure out why
- # the statistical model fails.
- # Related to Issue #220
+ # TODO: It's pretty arbitrary to put this logic here. I guess the
+ # justification is that this is where the specific word and the tag
+ # interact. Still, we should have a better way to enforce this rule, or
+ # figure out why the statistical model fails. Related to Issue #220
if Lexeme.c_check_flag(token.lex, IS_SPACE):
tag_id = self.reverse_index[self.strings.add('_SP')]
rich_tag = self.rich_tags[tag_id]
@@ -123,14 +121,13 @@ cdef class Morphology:
else:
flags[0] &= ~(one << flag_id)
- def add_special_case(self, unicode tag_str, unicode orth_str, attrs, force=False):
- """
- Add a special-case rule to the morphological analyser. Tokens whose
+ def add_special_case(self, unicode tag_str, unicode orth_str, attrs,
+                      force=False):
+ """Add a special-case rule to the morphological analyser. Tokens whose
tag and orth match the rule will receive the specified properties.
- Arguments:
- tag (unicode): The part-of-speech tag to key the exception.
- orth (unicode): The word-form to key the exception.
+ tag (unicode): The part-of-speech tag to key the exception.
+ orth (unicode): The word-form to key the exception.
"""
self.exc[(tag_str, orth_str)] = dict(attrs)
tag = self.strings.add(tag_str)
@@ -144,10 +141,9 @@ cdef class Morphology:
elif force:
memset(cached, 0, sizeof(cached[0]))
else:
- msg = ("Conflicting morphology exception for (%s, %s). Use force=True "
- "to overwrite.")
- msg = msg % (tag_str, orth_str)
- raise ValueError(msg)
+ raise ValueError(
+     "Conflicting morphology exception for (%s, %s). Use "
+     "force=True to overwrite." % (tag_str, orth_str))
cached.tag = rich_tag
# TODO: Refactor this to take arbitrary attributes.
@@ -218,7 +214,7 @@ IDS = {
"Definite_two": Definite_two,
"Definite_def": Definite_def,
"Definite_red": Definite_red,
"Definite_cons": Definite_cons, # U20
"Definite_ind": Definite_ind,
"Degree_cmp": Degree_cmp,
"Degree_comp": Degree_comp,
@@ -227,7 +223,7 @@ IDS = {
"Degree_sup": Degree_sup,
"Degree_abs": Degree_abs,
"Degree_com": Degree_com,
"Degree_dim ": Degree_dim, # du
"Gender_com": Gender_com,
"Gender_fem": Gender_fem,
"Gender_masc": Gender_masc,
@@ -242,15 +238,15 @@ IDS = {
"Negative_neg": Negative_neg,
"Negative_pos": Negative_pos,
"Negative_yes": Negative_yes,
"Polarity_neg": Polarity_neg, # U20
"Polarity_pos": Polarity_pos, # U20
"Number_com": Number_com,
"Number_dual": Number_dual,
"Number_none": Number_none,
"Number_plur": Number_plur,
"Number_sing": Number_sing,
"Number_ptan ": Number_ptan, # bg
"Number_count ": Number_count, # bg
"NumType_card": NumType_card,
"NumType_dist": NumType_dist,
"NumType_frac": NumType_frac,
@@ -276,7 +272,7 @@ IDS = {
"PronType_rel": PronType_rel,
"PronType_tot": PronType_tot,
"PronType_clit": PronType_clit,
"PronType_exc ": PronType_exc, # es, ca, it, fa,
"Reflex_yes": Reflex_yes,
"Tense_fut": Tense_fut,
"Tense_imp": Tense_imp,
@@ -292,19 +288,19 @@ IDS = {
"VerbForm_partPres": VerbForm_partPres,
"VerbForm_sup": VerbForm_sup,
"VerbForm_trans": VerbForm_trans,
"VerbForm_conv": VerbForm_conv, # U20
"VerbForm_gdv ": VerbForm_gdv, # la,
"Voice_act": Voice_act,
"Voice_cau": Voice_cau,
"Voice_pass": Voice_pass,
"Voice_mid ": Voice_mid, # gkc,
"Voice_int ": Voice_int, # hb,
"Abbr_yes ": Abbr_yes, # cz, fi, sl, U,
"AdpType_prep ": AdpType_prep, # cz, U,
"AdpType_post ": AdpType_post, # U,
"AdpType_voc ": AdpType_voc, # cz,
"AdpType_comprep ": AdpType_comprep, # cz,
"AdpType_circ ": AdpType_circ, # U,
"AdvType_man": AdvType_man,
"AdvType_loc": AdvType_loc,
"AdvType_tim": AdvType_tim,
@@ -314,122 +310,122 @@ IDS = {
"AdvType_sta": AdvType_sta,
"AdvType_ex": AdvType_ex,
"AdvType_adadj": AdvType_adadj,
"ConjType_oper ": ConjType_oper, # cz, U,
"ConjType_comp ": ConjType_comp, # cz, U,
"Connegative_yes ": Connegative_yes, # fi,
"Derivation_minen ": Derivation_minen, # fi,
"Derivation_sti ": Derivation_sti, # fi,
"Derivation_inen ": Derivation_inen, # fi,
"Derivation_lainen ": Derivation_lainen, # fi,
"Derivation_ja ": Derivation_ja, # fi,
"Derivation_ton ": Derivation_ton, # fi,
"Derivation_vs ": Derivation_vs, # fi,
"Derivation_ttain ": Derivation_ttain, # fi,
"Derivation_ttaa ": Derivation_ttaa, # fi,
"Echo_rdp ": Echo_rdp, # U,
"Echo_ech ": Echo_ech, # U,
"Foreign_foreign ": Foreign_foreign, # cz, fi, U,
"Foreign_fscript ": Foreign_fscript, # cz, fi, U,
"Foreign_tscript ": Foreign_tscript, # cz, U,
"Foreign_yes ": Foreign_yes, # sl,
"Gender_dat_masc ": Gender_dat_masc, # bq, U,
"Gender_dat_fem ": Gender_dat_fem, # bq, U,
"Gender_erg_masc ": Gender_erg_masc, # bq,
"Gender_erg_fem ": Gender_erg_fem, # bq,
"Gender_psor_masc ": Gender_psor_masc, # cz, sl, U,
"Gender_psor_fem ": Gender_psor_fem, # cz, sl, U,
"Gender_psor_neut ": Gender_psor_neut, # sl,
"Hyph_yes ": Hyph_yes, # cz, U,
"InfForm_one ": InfForm_one, # fi,
"InfForm_two ": InfForm_two, # fi,
"InfForm_three ": InfForm_three, # fi,
"NameType_geo ": NameType_geo, # U, cz,
"NameType_prs ": NameType_prs, # U, cz,
"NameType_giv ": NameType_giv, # U, cz,
"NameType_sur ": NameType_sur, # U, cz,
"NameType_nat ": NameType_nat, # U, cz,
"NameType_com ": NameType_com, # U, cz,
"NameType_pro ": NameType_pro, # U, cz,
"NameType_oth ": NameType_oth, # U, cz,
"NounType_com ": NounType_com, # U,
"NounType_prop ": NounType_prop, # U,
"NounType_class ": NounType_class, # U,
"Number_abs_sing ": Number_abs_sing, # bq, U,
"Number_abs_plur ": Number_abs_plur, # bq, U,
"Number_dat_sing ": Number_dat_sing, # bq, U,
"Number_dat_plur ": Number_dat_plur, # bq, U,
"Number_erg_sing ": Number_erg_sing, # bq, U,
"Number_erg_plur ": Number_erg_plur, # bq, U,
"Number_psee_sing ": Number_psee_sing, # U,
"Number_psee_plur ": Number_psee_plur, # U,
"Number_psor_sing ": Number_psor_sing, # cz, fi, sl, U,
"Number_psor_plur ": Number_psor_plur, # cz, fi, sl, U,
"NumForm_digit ": NumForm_digit, # cz, sl, U,
"NumForm_roman ": NumForm_roman, # cz, sl, U,
"NumForm_word ": NumForm_word, # cz, sl, U,
"NumValue_one ": NumValue_one, # cz, U,
"NumValue_two ": NumValue_two, # cz, U,
"NumValue_three ": NumValue_three, # cz, U,
"PartForm_pres ": PartForm_pres, # fi,
"PartForm_past ": PartForm_past, # fi,
"PartForm_agt ": PartForm_agt, # fi,
"PartForm_neg ": PartForm_neg, # fi,
"PartType_mod ": PartType_mod, # U,
"PartType_emp ": PartType_emp, # U,
"PartType_res ": PartType_res, # U,
"PartType_inf ": PartType_inf, # U,
"PartType_vbp ": PartType_vbp, # U,
"Person_abs_one ": Person_abs_one, # bq, U,
"Person_abs_two ": Person_abs_two, # bq, U,
"Person_abs_three ": Person_abs_three, # bq, U,
"Person_dat_one ": Person_dat_one, # bq, U,
"Person_dat_two ": Person_dat_two, # bq, U,
"Person_dat_three ": Person_dat_three, # bq, U,
"Person_erg_one ": Person_erg_one, # bq, U,
"Person_erg_two ": Person_erg_two, # bq, U,
"Person_erg_three ": Person_erg_three, # bq, U,
"Person_psor_one ": Person_psor_one, # fi, U,
"Person_psor_two ": Person_psor_two, # fi, U,
"Person_psor_three ": Person_psor_three, # fi, U,
"Polite_inf ": Polite_inf, # bq, U,
"Polite_pol ": Polite_pol, # bq, U,
"Polite_abs_inf ": Polite_abs_inf, # bq, U,
"Polite_abs_pol ": Polite_abs_pol, # bq, U,
"Polite_erg_inf ": Polite_erg_inf, # bq, U,
"Polite_erg_pol ": Polite_erg_pol, # bq, U,
"Polite_dat_inf ": Polite_dat_inf, # bq, U,
"Polite_dat_pol ": Polite_dat_pol, # bq, U,
"Prefix_yes ": Prefix_yes, # U,
"PrepCase_npr ": PrepCase_npr, # cz,
"PrepCase_pre ": PrepCase_pre, # U,
"PunctSide_ini ": PunctSide_ini, # U,
"PunctSide_fin ": PunctSide_fin, # U,
"PunctType_peri ": PunctType_peri, # U,
"PunctType_qest ": PunctType_qest, # U,
"PunctType_excl ": PunctType_excl, # U,
"PunctType_quot ": PunctType_quot, # U,
"PunctType_brck ": PunctType_brck, # U,
"PunctType_comm ": PunctType_comm, # U,
"PunctType_colo ": PunctType_colo, # U,
"PunctType_semi ": PunctType_semi, # U,
"PunctType_dash ": PunctType_dash, # U,
"Style_arch ": Style_arch, # cz, fi, U,
"Style_rare ": Style_rare, # cz, fi, U,
"Style_poet ": Style_poet, # cz, U,
"Style_norm ": Style_norm, # cz, U,
"Style_coll ": Style_coll, # cz, U,
"Style_vrnc ": Style_vrnc, # cz, U,
"Style_sing ": Style_sing, # cz, U,
"Style_expr ": Style_expr, # cz, U,
"Style_derg ": Style_derg, # cz, U,
"Style_vulg ": Style_vulg, # cz, U,
"Style_yes ": Style_yes, # fi, U,
"StyleVariant_styleShort ": StyleVariant_styleShort, # cz,
"StyleVariant_styleBound ": StyleVariant_styleBound, # cz, sl,
"VerbType_aux ": VerbType_aux, # U,
"VerbType_cop ": VerbType_cop, # U,
"VerbType_mod ": VerbType_mod, # U,
"VerbType_light ": VerbType_light, # U,
}

View File

@@ -8,7 +8,7 @@ IDS = {
"ADP": ADP,
"ADV": ADV,
"AUX": AUX,
"CONJ": CONJ, # U20
"CCONJ": CCONJ,
"DET": DET,
"INTJ": INTJ,

View File

@@ -85,7 +85,6 @@ class Scorer(object):
def score(self, tokens, gold, verbose=False, punct_labels=('p', 'punct')):
assert len(tokens) == len(gold)
gold_deps = set()
gold_tags = set()
gold_ents = set(tags_to_entities([annot[-1]

View File

@@ -4,19 +4,15 @@ from __future__ import unicode_literals, absolute_import
cimport cython
from libc.string cimport memcpy
- from libc.stdint cimport uint64_t, uint32_t
- from murmurhash.mrmr cimport hash64, hash32
- from preshed.maps cimport map_iter, key_t
from libc.stdint cimport uint32_t
from murmurhash.mrmr cimport hash64, hash32
import ujson
- import dill
from .symbols import IDS as SYMBOLS_BY_STR
from .symbols import NAMES as SYMBOLS_BY_INT
from .typedefs cimport hash_t
- from . import util
from .compat import json_dumps
+ from . import util
cpdef hash_t hash_string(unicode string) except 0:
@@ -195,7 +191,7 @@ cdef class StringStore:
"""Save the current state to a directory.
path (unicode or Path): A path to a directory, which will be created if
- it doesn't exist. Paths may be either strings or `Path`-like objects.
+ it doesn't exist. Paths may be either strings or Path-like objects.
"""
path = util.ensure_path(path)
strings = list(self)
@@ -225,7 +221,7 @@
**exclude: Named attributes to prevent from being serialized.
RETURNS (bytes): The serialized form of the `StringStore` object.
"""
- return ujson.dumps(list(self))
+ return json_dumps(list(self))
def from_bytes(self, bytes_data, **exclude):
"""Load state from a binary string.

View File

@@ -1,8 +1,8 @@
# coding: utf8
#cython: optimize.unpack_method_calls=False
from __future__ import unicode_literals

IDS = {
"": NIL,
"IS_ALPHA": IS_ALPHA,
@@ -464,9 +464,11 @@ IDS = {
"LAW": LAW
}

def sort_nums(x):
return x[1]

NAMES = [it[0] for it in sorted(IDS.items(), key=sort_nums)]
# Unfortunate hack here, to work around problem with long cpdef enum
# (which is generating an enormous amount of C++ in Cython 0.24+)

View File

@@ -8,12 +8,11 @@ from cython.operator cimport preincrement as preinc
from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
import regex as re
- from .strings cimport hash_string
- from . import util
cimport cython
from .tokens.doc cimport Doc
+ from .strings cimport hash_string
+ from . import util
cdef class Tokenizer:
@@ -21,7 +20,7 @@ cdef class Tokenizer:
boundaries.
"""
def __init__(self, Vocab vocab, rules=None, prefix_search=None,
suffix_search=None, infix_finditer=None, token_match=None):
"""Create a `Tokenizer`, to create `Doc` objects given unicode text.
vocab (Vocab): A storage container for lexical types.
@@ -74,9 +73,8 @@ cdef class Tokenizer:
RETURNS (Doc): A container for linguistic annotations.
"""
if len(string) >= (2 ** 30):
- raise ValueError(
-     "String is too long: %d characters. Max is 2**30." % len(string)
- )
+ msg = "String is too long: %d characters. Max is 2**30."
+ raise ValueError(msg % len(string))
cdef int length = len(string)
cdef Doc doc = Doc(self.vocab)
if length == 0:
@@ -122,8 +120,8 @@
"""Tokenize a stream of texts.
texts: A sequence of unicode texts.
- batch_size (int): The number of texts to accumulate in an internal buffer.
- n_threads (int): The number of threads to use, if the implementation
+ batch_size (int): Number of texts to accumulate in an internal buffer.
+ n_threads (int): Number of threads to use, if the implementation
supports multi-threading. The default tokenizer is single-threaded.
YIELDS (Doc): A sequence of Doc objects, in order.
"""
@@ -232,8 +230,8 @@
if not matches:
tokens.push_back(self.vocab.get(tokens.mem, string), False)
else:
- # let's say we have dyn-o-mite-dave
- # the regex finds the start and end positions of the hyphens
+ # let's say we have dyn-o-mite-dave - the regex finds the
+ # start and end positions of the hyphens
start = 0
for match in matches:
infix_start = match.start()
@@ -293,8 +291,8 @@
return list(self.infix_finditer(string))
def find_prefix(self, unicode string):
- """Find the length of a prefix that should be segmented from the string,
- or None if no prefix rules match.
+ """Find the length of a prefix that should be segmented from the
+ string, or None if no prefix rules match.
string (unicode): The string to segment.
RETURNS (int): The length of the prefix if present, otherwise `None`.
@@ -305,8 +303,8 @@
return (match.end() - match.start()) if match is not None else 0
def find_suffix(self, unicode string):
- """Find the length of a suffix that should be segmented from the string,
- or None if no suffix rules match.
+ """Find the length of a suffix that should be segmented from the
+ string, or None if no suffix rules match.
string (unicode): The string to segment.
Returns (int): The length of the suffix if present, otherwise `None`.
@@ -326,8 +324,8 @@
string (unicode): The string to specially tokenize.
token_attrs (iterable): A sequence of dicts, where each dict describes
- a token and its attributes. The `ORTH` fields of the attributes must
- exactly match the string when they are concatenated.
+ a token and its attributes. The `ORTH` fields of the attributes
+ must exactly match the string when they are concatenated.
"""
substrings = list(substrings)
cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
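A small sketch of add_special_case() as documented above; the ORTH pieces must concatenate back to the original string exactly:

    from spacy.attrs import ORTH, LEMMA
    from spacy.lang.en import English

    nlp = English()
    nlp.tokenizer.add_special_case(u'gimme', [{ORTH: u'gim', LEMMA: u'give'}, {ORTH: u'me'}])
    doc = nlp(u'gimme that')  # tokens: ['gim', 'me', 'that']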
@@ -343,7 +341,7 @@
"""Save the current state to a directory.
path (unicode or Path): A path to a directory, which will be created if
- it doesn't exist. Paths may be either strings or `Path`-like objects.
+ it doesn't exist. Paths may be either strings or Path-like objects.
"""
with path.open('wb') as file_:
file_.write(self.to_bytes(**exclude))

View File

@@ -476,7 +476,7 @@
"""
# TODO: implement
def __get__(self):
- raise NotImplementedError()
+ raise NotImplementedError
property n_rights:
"""RETURNS (int): The number of rightward immediate children of the
@@ -484,7 +484,7 @@
"""
# TODO: implement
def __get__(self):
- raise NotImplementedError()
+ raise NotImplementedError
property subtree:
"""Tokens that descend from tokens in the span, but fall outside it.

View File

@@ -1 +0,0 @@

View File

@@ -17,8 +17,8 @@ from .compat import copy_reg, basestring_
from .lemmatizer import Lemmatizer
from .attrs import intify_attrs
from .vectors import Vectors
- from . import util
from ._ml import link_vectors_to_models
+ from . import util
cdef class Vocab: