Mirror of https://github.com/explosion/spaCy.git (synced 2025-11-04 09:57:26 +03:00)

Commit: d96e72f656 ("Tidy up rest")
Parent: a8e10f94e4

@@ -8,11 +8,9 @@ from thinc.t2t import ExtractWindow, ParametricAttention
 from thinc.t2v import Pooling, sum_pool
 from thinc.misc import Residual
 from thinc.misc import LayerNorm as LN
-
 from thinc.api import add, layerize, chain, clone, concatenate, with_flatten
 from thinc.api import FeatureExtracter, with_getitem, flatten_add_lengths
 from thinc.api import uniqued, wrap, noop
-
 from thinc.linear.linear import LinearModel
 from thinc.neural.ops import NumpyOps, CupyOps
 from thinc.neural.util import get_array_module
@@ -101,17 +101,12 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
     """
     Normalize a dictionary of attributes, converting them to ints.
 
-    Arguments:
-        stringy_attrs (dict):
-            Dictionary keyed by attribute string names. Values can be ints or strings.
-
-        strings_map (StringStore):
-            Defaults to None. If provided, encodes string values into ints.
-
-    Returns:
-        inty_attrs (dict):
-            Attributes dictionary with keys and optionally values converted to
-            ints.
+    stringy_attrs (dict): Dictionary keyed by attribute string names. Values
+        can be ints or strings.
+    strings_map (StringStore): Defaults to None. If provided, encodes string
+        values into ints.
+    RETURNS (dict): Attributes dictionary with keys and optionally values
+        converted to ints.
     """
     inty_attrs = {}
     if _do_deprecated:
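
Note: the rewritten docstring above follows the repo's newer one-line-per-argument convention. As a rough illustration of the documented behaviour (a sketch based on the docstring, not code from this diff; assumes the spaCy 2.x layout, where intify_attrs lives in spacy.attrs):

    # Illustrative sketch only. Keys like 'LOWER' become integer attribute
    # IDs; when strings_map is given, string values are encoded to ints too.
    from spacy.attrs import intify_attrs
    from spacy.strings import StringStore

    strings = StringStore()
    print(intify_attrs({'LOWER': 'dog'}, strings_map=strings))
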
@@ -2,7 +2,6 @@
 # coding: utf8
 from __future__ import unicode_literals, print_function
 
-import io
 import re
 import ujson
 import random
@@ -10,9 +9,8 @@ import cytoolz
 import itertools
 
 from .syntax import nonproj
-from .util import ensure_path
-from . import util
 from .tokens import Doc
+from . import util
 
 
 def tags_to_entities(tags):
@@ -310,7 +308,7 @@ def _corrupt(c, noise_level):
 
 
 def read_json_file(loc, docs_filter=None, limit=None):
-    loc = ensure_path(loc)
+    loc = util.ensure_path(loc)
     if loc.is_dir():
         for filename in loc.iterdir():
             yield from read_json_file(loc / filename, limit=limit)
@@ -1,22 +1,22 @@
 # coding: utf8
 from __future__ import absolute_import, unicode_literals
-from contextlib import contextmanager
-import copy
 
-from thinc.neural import Model
-from thinc.neural.optimizers import Adam
 import random
 import ujson
-from collections import OrderedDict
 import itertools
 import weakref
 import functools
+from collections import OrderedDict
+from contextlib import contextmanager
+from copy import copy
+from thinc.neural import Model
+from thinc.neural.optimizers import Adam
 
 from .tokenizer import Tokenizer
 from .vocab import Vocab
 from .lemmatizer import Lemmatizer
-from .pipeline import DependencyParser, Tensorizer, Tagger
-from .pipeline import EntityRecognizer, SimilarityHook, TextCategorizer
+from .pipeline import DependencyParser, Tensorizer, Tagger, EntityRecognizer
+from .pipeline import SimilarityHook, TextCategorizer
 from .compat import json_dumps, izip
 from .scorer import Scorer
 from ._ml import link_vectors_to_models
@@ -649,7 +649,7 @@ class Language(object):
         serializers = OrderedDict((
             ('vocab', lambda: self.vocab.to_bytes()),
             ('tokenizer', lambda: self.tokenizer.to_bytes(vocab=False)),
-            ('meta', lambda: ujson.dumps(self.meta))
+            ('meta', lambda: json_dumps(self.meta))
         ))
         for i, (name, proc) in enumerate(self.pipeline):
             if name in disable:
@@ -689,7 +689,7 @@ class DisabledPipes(list):
         # Important! Not deep copy -- we just want the container (but we also
         # want to support people providing arbitrarily typed nlp.pipeline
         # objects.)
-        self.original_pipeline = copy.copy(nlp.pipeline)
+        self.original_pipeline = copy(nlp.pipeline)
         list.__init__(self)
         self.extend(nlp.remove_pipe(name) for name in names)
 
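
Note: the copy.copy -> copy change follows from the new `from copy import copy` import; it is still a shallow copy, which is the point of the comment above. DisabledPipes is what backs Language.disable_pipes, so restoring the pipeline relies on that copied container. A hedged usage sketch (assumes a spaCy 2.x install with the en_core_web_sm model; not code from this diff):

    import spacy

    nlp = spacy.load('en_core_web_sm')
    # DisabledPipes works as a context manager: components are removed on
    # entry and the shallow-copied original pipeline is restored on exit.
    with nlp.disable_pipes('tagger', 'parser'):
        doc = nlp(u'Only the remaining components run here.')
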
@@ -4,12 +4,6 @@
 from __future__ import unicode_literals
 
 import ujson
-
-from .typedefs cimport attr_t
-from .typedefs cimport hash_t
-from .attrs cimport attr_id_t
-from .structs cimport TokenC
-
 from cymem.cymem cimport Pool
 from preshed.maps cimport PreshMap
 from libcpp.vector cimport vector
@@ -17,14 +11,15 @@ from libcpp.pair cimport pair
 from murmurhash.mrmr cimport hash64
 from libc.stdint cimport int32_t
 
-from .attrs cimport ID, NULL_ATTR, ENT_TYPE
-from . import attrs
-from .tokens.doc cimport get_token_attr
-from .tokens.doc cimport Doc
+from .typedefs cimport attr_t
+from .typedefs cimport hash_t
+from .structs cimport TokenC
+from .tokens.doc cimport Doc, get_token_attr
 from .vocab cimport Vocab
 
+from .attrs import IDS
+from .attrs cimport attr_id_t, ID, NULL_ATTR
 from .attrs import FLAG61 as U_ENT
-
 from .attrs import FLAG60 as B2_ENT
 from .attrs import FLAG59 as B3_ENT
 from .attrs import FLAG58 as B4_ENT
@@ -34,7 +29,6 @@ from .attrs import FLAG55 as B7_ENT
 from .attrs import FLAG54 as B8_ENT
 from .attrs import FLAG53 as B9_ENT
 from .attrs import FLAG52 as B10_ENT
-
 from .attrs import FLAG51 as I3_ENT
 from .attrs import FLAG50 as I4_ENT
 from .attrs import FLAG49 as I5_ENT
@@ -43,7 +37,6 @@ from .attrs import FLAG47 as I7_ENT
 from .attrs import FLAG46 as I8_ENT
 from .attrs import FLAG45 as I9_ENT
 from .attrs import FLAG44 as I10_ENT
-
 from .attrs import FLAG43 as L2_ENT
 from .attrs import FLAG42 as L3_ENT
 from .attrs import FLAG41 as L4_ENT
@@ -168,10 +161,10 @@ def _convert_strings(token_specs, string_store):
                 if value in operators:
                     ops = operators[value]
                 else:
-                    raise KeyError(
-                        "Unknown operator '%s'. Options: %s" % (value, ', '.join(operators.keys())))
+                    msg = "Unknown operator '%s'. Options: %s"
+                    raise KeyError(msg % (value, ', '.join(operators.keys())))
             if isinstance(attr, basestring):
-                attr = attrs.IDS.get(attr.upper())
+                attr = IDS.get(attr.upper())
             if isinstance(value, basestring):
                 value = string_store.add(value)
             if isinstance(value, bool):
@@ -186,7 +179,7 @@ def _convert_strings(token_specs, string_store):
 def merge_phrase(matcher, doc, i, matches):
     """Callback to merge a phrase on match."""
     ent_id, label, start, end = matches[i]
-    span = doc[start : end]
+    span = doc[start:end]
     span.merge(ent_type=label, ent_id=ent_id)
 
 
@@ -233,13 +226,13 @@ cdef class Matcher:
         return self._normalize_key(key) in self._patterns
 
     def add(self, key, on_match, *patterns):
-        """Add a match-rule to the matcher. A match-rule consists of: an ID key,
-        an on_match callback, and one or more patterns.
+        """Add a match-rule to the matcher. A match-rule consists of: an ID
+        key, an on_match callback, and one or more patterns.
 
         If the key exists, the patterns are appended to the previous ones, and
-        the previous on_match callback is replaced. The `on_match` callback will
-        receive the arguments `(matcher, doc, i, matches)`. You can also set
-        `on_match` to `None` to not perform any actions.
+        the previous on_match callback is replaced. The `on_match` callback
+        will receive the arguments `(matcher, doc, i, matches)`. You can also
+        set `on_match` to `None` to not perform any actions.
 
         A pattern consists of one or more `token_specs`, where a `token_spec`
         is a dictionary mapping attribute IDs to values, and optionally a
@@ -253,8 +246,8 @@ cdef class Matcher:
         The + and * operators are usually interpretted "greedily", i.e. longer
         matches are returned where possible. However, if you specify two '+'
         and '*' patterns in a row and their matches overlap, the first
-        operator will behave non-greedily. This quirk in the semantics
-        makes the matcher more efficient, by avoiding the need for back-tracking.
+        operator will behave non-greedily. This quirk in the semantics makes
+        the matcher more efficient, by avoiding the need for back-tracking.
 
         key (unicode): The match ID.
         on_match (callable): Callback executed on match.
@@ -268,7 +261,6 @@ cdef class Matcher:
         key = self._normalize_key(key)
         self._patterns.setdefault(key, [])
         self._callbacks[key] = on_match
-
         for pattern in patterns:
             specs = _convert_strings(pattern, self.vocab.strings)
             self.patterns.push_back(init_pattern(self.mem, key, specs))
@@ -315,9 +307,9 @@ cdef class Matcher:
         """Match a stream of documents, yielding them in turn.
 
         docs (iterable): A stream of documents.
-        batch_size (int): The number of documents to accumulate into a working set.
+        batch_size (int): Number of documents to accumulate into a working set.
         n_threads (int): The number of threads with which to work on the buffer
-            in parallel, if the `Matcher` implementation supports multi-threading.
+            in parallel, if the implementation supports multi-threading.
         YIELDS (Doc): Documents, in order.
         """
         for doc in docs:
@@ -325,7 +317,7 @@ cdef class Matcher:
             yield doc
 
     def __call__(self, Doc doc):
-        """Find all token sequences matching the supplied patterns on the `Doc`.
+        """Find all token sequences matching the supplied pattern.
 
         doc (Doc): The document to match over.
         RETURNS (list): A list of `(key, start, end)` tuples,
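
Note: the docstrings above describe the whole Matcher workflow: register one or more patterns under a key, optionally with an on_match callback taking (matcher, doc, i, matches), then call the matcher on a Doc to get (key, start, end) tuples. A hedged sketch of that flow (model name, pattern and text invented; assumes spaCy 2.x):

    import spacy
    from spacy.matcher import Matcher

    nlp = spacy.load('en_core_web_sm')
    matcher = Matcher(nlp.vocab)
    # One pattern = a list of token-spec dicts; on_match=None means no action.
    matcher.add('NEW_YORK', None, [{'LOWER': 'new'}, {'LOWER': 'york'}])
    doc = nlp(u'I moved to New York last year.')
    for key, start, end in matcher(doc):
        print(nlp.vocab.strings[key], doc[start:end].text)

merge_phrase from the earlier hunk has exactly the callback signature described here, so it can be passed as on_match to merge matched spans in place.
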
@@ -342,8 +334,8 @@ cdef class Matcher:
         for token_i in range(doc.length):
             token = &doc.c[token_i]
             q = 0
-            # Go over the open matches, extending or finalizing if able. Otherwise,
-            # we over-write them (q doesn't advance)
+            # Go over the open matches, extending or finalizing if able.
+            # Otherwise, we over-write them (q doesn't advance)
             for state in partials:
                 action = get_action(state.second, token)
                 if action == PANIC:
@@ -356,8 +348,8 @@ cdef class Matcher:
 
                 if action == REPEAT:
                     # Leave the state in the queue, and advance to next slot
-                    # (i.e. we don't overwrite -- we want to greedily match more
-                    # pattern.
+                    # (i.e. we don't overwrite -- we want to greedily match
+                    # more pattern.
                     q += 1
                 elif action == REJECT:
                     pass
@@ -366,8 +358,8 @@ cdef class Matcher:
                     partials[q].second += 1
                     q += 1
                 elif action in (ACCEPT, ACCEPT_PREV):
-                    # TODO: What to do about patterns starting with ZERO? Need to
-                    # adjust the start position.
+                    # TODO: What to do about patterns starting with ZERO? Need
+                    # to adjust the start position.
                     start = state.first
                     end = token_i+1 if action == ACCEPT else token_i
                     ent_id = state.second[1].attrs[0].value
@@ -388,8 +380,8 @@ cdef class Matcher:
                     state.second = pattern
                     partials.push_back(state)
                 elif action == ADVANCE:
-                    # TODO: What to do about patterns starting with ZERO? Need to
-                    # adjust the start position.
+                    # TODO: What to do about patterns starting with ZERO? Need
+                    # to adjust the start position.
                     state.first = token_i
                     state.second = pattern + 1
                     partials.push_back(state)
@@ -413,7 +405,6 @@ cdef class Matcher:
             on_match = self._callbacks.get(ent_id)
             if on_match is not None:
                 on_match(self, doc, i, matches)
-        # TODO: only return (match_id, start, end)
         return matches
 
     def _normalize_key(self, key):
@@ -441,7 +432,8 @@ def get_bilou(length):
     elif length == 8:
         return [B8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, L8_ENT]
     elif length == 9:
-        return [B9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, L9_ENT]
+        return [B9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT,
+                L9_ENT]
     elif length == 10:
         return [B10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT,
                 I10_ENT, I10_ENT, L10_ENT]
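
Note: get_bilou maps a phrase length to per-position flag attributes: one B*_ENT begin flag, I*_ENT inside flags and a closing L*_ENT last flag, with U_ENT covering the unit-length case. Purely as an illustration of the scheme (plain Python, not code from the diff):

    # Sketch of the BILOU layout that get_bilou encodes via flag attributes.
    def bilou_scheme(length):
        if length == 1:
            return ['U']  # unit-length entity
        return ['B'] + ['I'] * (length - 2) + ['L']

    print(bilou_scheme(4))  # ['B', 'I', 'I', 'L']
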
@@ -454,10 +446,8 @@ cdef class PhraseMatcher:
     cdef Vocab vocab
     cdef Matcher matcher
     cdef PreshMap phrase_ids
-
     cdef int max_length
     cdef attr_t* _phrase_key
-
     cdef public object _callbacks
     cdef public object _patterns
 
@@ -470,7 +460,8 @@ cdef class PhraseMatcher:
         self.phrase_ids = PreshMap()
         abstract_patterns = []
         for length in range(1, max_length):
-            abstract_patterns.append([{tag: True} for tag in get_bilou(length)])
+            abstract_patterns.append([{tag: True}
+                                      for tag in get_bilou(length)])
         self.matcher.add('Candidate', None, *abstract_patterns)
         self._callbacks = {}
 
@@ -496,8 +487,8 @@ cdef class PhraseMatcher:
         return (self.__class__, (self.vocab,), None, None)
 
     def add(self, key, on_match, *docs):
-        """Add a match-rule to the matcher. A match-rule consists of: an ID key,
-        an on_match callback, and one or more patterns.
+        """Add a match-rule to the matcher. A match-rule consists of: an ID
+        key, an on_match callback, and one or more patterns.
 
         key (unicode): The match ID.
         on_match (callable): Callback executed on match.
@@ -513,7 +504,6 @@ cdef class PhraseMatcher:
                 raise ValueError(msg % (len(doc), self.max_length))
         cdef hash_t ent_id = self.matcher._normalize_key(key)
         self._callbacks[ent_id] = on_match
-
         cdef int length
         cdef int i
         cdef hash_t phrase_hash
@@ -553,9 +543,9 @@ cdef class PhraseMatcher:
         """Match a stream of documents, yielding them in turn.
 
         docs (iterable): A stream of documents.
-        batch_size (int): The number of documents to accumulate into a working set.
+        batch_size (int): Number of documents to accumulate into a working set.
         n_threads (int): The number of threads with which to work on the buffer
-            in parallel, if the `Matcher` implementation supports multi-threading.
+            in parallel, if the implementation supports multi-threading.
         YIELDS (Doc): Documents, in order.
         """
         for doc in stream:
@@ -569,7 +559,8 @@ cdef class PhraseMatcher:
             self._phrase_key[i] = 0
         for i, j in enumerate(range(start, end)):
             self._phrase_key[i] = doc.c[j].lex.orth
-        cdef hash_t key = hash64(self._phrase_key, self.max_length * sizeof(attr_t), 0)
+        cdef hash_t key = hash64(self._phrase_key,
+                                 self.max_length * sizeof(attr_t), 0)
         ent_id = <hash_t>self.phrase_ids.get(key)
         if ent_id == 0:
             return None
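
Note: unlike Matcher, PhraseMatcher.add takes pre-tokenized Doc objects; each phrase is keyed by the hash64 value computed in the hunk above. A hedged usage sketch (model name and phrases invented; assumes spaCy 2.x):

    import spacy
    from spacy.matcher import PhraseMatcher

    nlp = spacy.load('en_core_web_sm')
    matcher = PhraseMatcher(nlp.vocab)
    # Patterns are Doc objects, one per phrase.
    matcher.add('GPE', None, nlp(u'New York'), nlp(u'San Francisco'))
    doc = nlp(u'She moved from San Francisco to New York.')
    for key, start, end in matcher(doc):
        print(nlp.vocab.strings[key], doc[start:end].text)
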
@@ -4,17 +4,15 @@ from __future__ import unicode_literals
 
 from libc.string cimport memset
 
-from .parts_of_speech cimport ADJ, VERB, NOUN, PUNCT, SPACE
 from .attrs cimport POS, IS_SPACE
+from .attrs import LEMMA, intify_attrs
+from .parts_of_speech cimport SPACE
 from .parts_of_speech import IDS as POS_IDS
 from .lexeme cimport Lexeme
-from .attrs import LEMMA, intify_attrs
 
 
 def _normalize_props(props):
-    """
-    Transform deprecated string keys to correct names.
-    """
+    """Transform deprecated string keys to correct names."""
     out = {}
     for key, value in props.items():
         if key == POS:
@@ -77,7 +75,8 @@ cdef class Morphology:
     cdef int assign_untagged(self, TokenC* token) except -1:
         """Set morphological attributes on a token without a POS tag. Uses
         the lemmatizer's lookup() method, which looks up the string in the
-        table provided by the language data as lemma_lookup (if available)."""
+        table provided by the language data as lemma_lookup (if available).
+        """
         if token.lemma == 0:
             orth_str = self.strings[token.lex.orth]
             lemma = self.lemmatizer.lookup(orth_str)
@@ -95,11 +94,10 @@ cdef class Morphology:
     cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1:
         if tag_id > self.n_tags:
             raise ValueError("Unknown tag ID: %s" % tag_id)
-        # TODO: It's pretty arbitrary to put this logic here. I guess the justification
-        # is that this is where the specific word and the tag interact. Still,
-        # we should have a better way to enforce this rule, or figure out why
-        # the statistical model fails.
-        # Related to Issue #220
+        # TODO: It's pretty arbitrary to put this logic here. I guess the
+        # justification is that this is where the specific word and the tag
+        # interact. Still, we should have a better way to enforce this rule, or
+        # figure out why the statistical model fails. Related to Issue #220
         if Lexeme.c_check_flag(token.lex, IS_SPACE):
             tag_id = self.reverse_index[self.strings.add('_SP')]
         rich_tag = self.rich_tags[tag_id]
@@ -123,12 +121,11 @@ cdef class Morphology:
         else:
             flags[0] &= ~(one << flag_id)
 
-    def add_special_case(self, unicode tag_str, unicode orth_str, attrs, force=False):
-        """
-        Add a special-case rule to the morphological analyser. Tokens whose
+    def add_special_case(self, unicode tag_str, unicode orth_str, attrs,
+                         force=False):
+        """Add a special-case rule to the morphological analyser. Tokens whose
         tag and orth match the rule will receive the specified properties.
 
-        Arguments:
         tag (unicode): The part-of-speech tag to key the exception.
         orth (unicode): The word-form to key the exception.
         """
@@ -144,10 +141,9 @@ cdef class Morphology:
         elif force:
             memset(cached, 0, sizeof(cached[0]))
         else:
-            msg = ("Conflicting morphology exception for (%s, %s). Use force=True "
-                   "to overwrite.")
-            msg = msg % (tag_str, orth_str)
-            raise ValueError(msg)
+            raise ValueError(
+                "Conflicting morphology exception for (%s, %s). Use "
+                "force=True to overwrite." % (tag_str, orth_str))
 
         cached.tag = rich_tag
         # TODO: Refactor this to take arbitrary attributes.
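
Note: based only on the signature and docstring above, a hedged sketch of installing a morphology exception; the tag, orth and attrs values are invented, and the sketch assumes the spaCy 2.x layout where the Morphology object is reachable as vocab.morphology:

    import spacy

    nlp = spacy.load('en_core_web_sm')
    # force=True overwrites an existing exception rather than raising the
    # ValueError shown in the hunk above.
    nlp.vocab.morphology.add_special_case(
        u'NNS', u'wolves', {'LEMMA': u'wolf'}, force=True)
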
@@ -85,7 +85,6 @@ class Scorer(object):
 
     def score(self, tokens, gold, verbose=False, punct_labels=('p', 'punct')):
         assert len(tokens) == len(gold)
-
         gold_deps = set()
         gold_tags = set()
         gold_ents = set(tags_to_entities([annot[-1]
@@ -4,19 +4,15 @@ from __future__ import unicode_literals, absolute_import
 
 cimport cython
 from libc.string cimport memcpy
-from libc.stdint cimport uint64_t, uint32_t
-from murmurhash.mrmr cimport hash64, hash32
-from preshed.maps cimport map_iter, key_t
 from libc.stdint cimport uint32_t
+from murmurhash.mrmr cimport hash64, hash32
 import ujson
-import dill
 
 from .symbols import IDS as SYMBOLS_BY_STR
 from .symbols import NAMES as SYMBOLS_BY_INT
-
 from .typedefs cimport hash_t
-from . import util
 from .compat import json_dumps
+from . import util
 
 
 cpdef hash_t hash_string(unicode string) except 0:
@@ -195,7 +191,7 @@ cdef class StringStore:
         """Save the current state to a directory.
 
         path (unicode or Path): A path to a directory, which will be created if
-            it doesn't exist. Paths may be either strings or `Path`-like objects.
+            it doesn't exist. Paths may be either strings or Path-like objects.
         """
         path = util.ensure_path(path)
         strings = list(self)
@@ -225,7 +221,7 @@ cdef class StringStore:
         **exclude: Named attributes to prevent from being serialized.
         RETURNS (bytes): The serialized form of the `StringStore` object.
         """
-        return ujson.dumps(list(self))
+        return json_dumps(list(self))
 
     def from_bytes(self, bytes_data, **exclude):
         """Load state from a binary string.
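
Note: to_bytes now routes through the json_dumps wrapper from .compat instead of calling ujson directly, matching the Language.to_bytes change earlier in this commit; the payload is still a JSON list of strings. A hedged round-trip sketch (assumes the spaCy 2.x StringStore API):

    from spacy.strings import StringStore

    store = StringStore([u'apple', u'orange'])
    data = store.to_bytes()                # JSON list of the store's strings
    store2 = StringStore().from_bytes(data)
    assert u'apple' in store2
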
@@ -1,8 +1,8 @@
 # coding: utf8
 #cython: optimize.unpack_method_calls=False
-
 from __future__ import unicode_literals
 
+
 IDS = {
     "": NIL,
     "IS_ALPHA": IS_ALPHA,
@@ -464,9 +464,11 @@ IDS = {
     "LAW": LAW
 }
 
+
 def sort_nums(x):
     return x[1]
 
+
 NAMES = [it[0] for it in sorted(IDS.items(), key=sort_nums)]
 # Unfortunate hack here, to work around problem with long cpdef enum
 # (which is generating an enormous amount of C++ in Cython 0.24+)
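
Note: NAMES inverts the IDS table by sorting entries on their integer value, so NAMES[i] is the name of symbol i as long as the enum values are dense from zero. The same idiom in plain Python (a toy sketch, not spaCy code):

    IDS = {'NIL': 0, 'IS_ALPHA': 1, 'IS_ASCII': 2}
    NAMES = [name for name, _id in sorted(IDS.items(), key=lambda it: it[1])]
    assert NAMES[IDS['IS_ASCII']] == 'IS_ASCII'
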
@@ -8,12 +8,11 @@ from cython.operator cimport preincrement as preinc
 from cymem.cymem cimport Pool
 from preshed.maps cimport PreshMap
 import regex as re
-
-from .strings cimport hash_string
-from . import util
 cimport cython
 
 from .tokens.doc cimport Doc
+from .strings cimport hash_string
+from . import util
 
 
 cdef class Tokenizer:
@@ -74,9 +73,8 @@ cdef class Tokenizer:
         RETURNS (Doc): A container for linguistic annotations.
         """
         if len(string) >= (2 ** 30):
-            raise ValueError(
-                "String is too long: %d characters. Max is 2**30." % len(string)
-            )
+            msg = "String is too long: %d characters. Max is 2**30."
+            raise ValueError(msg % len(string))
         cdef int length = len(string)
         cdef Doc doc = Doc(self.vocab)
         if length == 0:
@@ -122,8 +120,8 @@ cdef class Tokenizer:
         """Tokenize a stream of texts.
 
         texts: A sequence of unicode texts.
-        batch_size (int): The number of texts to accumulate in an internal buffer.
-        n_threads (int): The number of threads to use, if the implementation
+        batch_size (int): Number of texts to accumulate in an internal buffer.
+        n_threads (int): Number of threads to use, if the implementation
             supports multi-threading. The default tokenizer is single-threaded.
         YIELDS (Doc): A sequence of Doc objects, in order.
         """
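
Note: a hedged sketch of the pipe API documented above (texts and model name invented; assumes spaCy 2.x, where the tokenizer hangs off the Language object):

    import spacy

    nlp = spacy.load('en_core_web_sm')
    texts = [u'First text.', u'Second text.']
    # Yields Doc objects in order; the default tokenizer is single-threaded,
    # so n_threads is effectively advisory here.
    for doc in nlp.tokenizer.pipe(texts, batch_size=1000):
        print([t.text for t in doc])
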
@@ -232,8 +230,8 @@ cdef class Tokenizer:
                 if not matches:
                     tokens.push_back(self.vocab.get(tokens.mem, string), False)
                 else:
-                    # let's say we have dyn-o-mite-dave
-                    # the regex finds the start and end positions of the hyphens
+                    # let's say we have dyn-o-mite-dave - the regex finds the
+                    # start and end positions of the hyphens
                     start = 0
                     for match in matches:
                         infix_start = match.start()
@@ -293,8 +291,8 @@ cdef class Tokenizer:
         return list(self.infix_finditer(string))
 
     def find_prefix(self, unicode string):
-        """Find the length of a prefix that should be segmented from the string,
-        or None if no prefix rules match.
+        """Find the length of a prefix that should be segmented from the
+        string, or None if no prefix rules match.
 
         string (unicode): The string to segment.
         RETURNS (int): The length of the prefix if present, otherwise `None`.
@@ -305,8 +303,8 @@ cdef class Tokenizer:
         return (match.end() - match.start()) if match is not None else 0
 
     def find_suffix(self, unicode string):
-        """Find the length of a suffix that should be segmented from the string,
-        or None if no suffix rules match.
+        """Find the length of a suffix that should be segmented from the
+        string, or None if no suffix rules match.
 
         string (unicode): The string to segment.
         Returns (int): The length of the suffix if present, otherwise `None`.
@@ -326,8 +324,8 @@ cdef class Tokenizer:
 
         string (unicode): The string to specially tokenize.
         token_attrs (iterable): A sequence of dicts, where each dict describes
-            a token and its attributes. The `ORTH` fields of the attributes must
-            exactly match the string when they are concatenated.
+            a token and its attributes. The `ORTH` fields of the attributes
+            must exactly match the string when they are concatenated.
         """
         substrings = list(substrings)
         cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
@@ -343,7 +341,7 @@ cdef class Tokenizer:
         """Save the current state to a directory.
 
         path (unicode or Path): A path to a directory, which will be created if
-            it doesn't exist. Paths may be either strings or `Path`-like objects.
+            it doesn't exist. Paths may be either strings or Path-like objects.
         """
         with path.open('wb') as file_:
             file_.write(self.to_bytes(**exclude))
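
Note: the contract in the add_special_case docstring above (the ORTH fields must concatenate back to the original string) is what makes contraction rules safe. A hedged sketch (assumes spaCy 2.x; the rule itself is illustrative):

    import spacy
    from spacy.attrs import ORTH, LEMMA

    nlp = spacy.load('en_core_web_sm')
    # 'gim' + 'me' concatenates exactly to 'gimme', as the docstring requires.
    nlp.tokenizer.add_special_case(u'gimme', [{ORTH: u'gim', LEMMA: u'give'},
                                              {ORTH: u'me'}])
    print([t.text for t in nlp(u'gimme that')])  # ['gim', 'me', 'that']
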
@@ -476,7 +476,7 @@ cdef class Span:
         """
         # TODO: implement
         def __get__(self):
-            raise NotImplementedError()
+            raise NotImplementedError
 
     property n_rights:
         """RETURNS (int): The number of rightward immediate children of the
@@ -484,7 +484,7 @@ cdef class Span:
         """
         # TODO: implement
         def __get__(self):
-            raise NotImplementedError()
+            raise NotImplementedError
 
     property subtree:
         """Tokens that descend from tokens in the span, but fall outside it.
@@ -1 +0,0 @@
-
@@ -17,8 +17,8 @@ from .compat import copy_reg, basestring_
 from .lemmatizer import Lemmatizer
 from .attrs import intify_attrs
 from .vectors import Vectors
-from . import util
 from ._ml import link_vectors_to_models
+from . import util
 
 
 cdef class Vocab: