Tidy up rest

ines 2017-10-27 21:07:59 +02:00
parent a8e10f94e4
commit d96e72f656
14 changed files with 233 additions and 261 deletions


@@ -8,11 +8,9 @@ from thinc.t2t import ExtractWindow, ParametricAttention
from thinc.t2v import Pooling, sum_pool
from thinc.misc import Residual
from thinc.misc import LayerNorm as LN
from thinc.api import add, layerize, chain, clone, concatenate, with_flatten
from thinc.api import FeatureExtracter, with_getitem, flatten_add_lengths
from thinc.api import uniqued, wrap, noop
from thinc.linear.linear import LinearModel
from thinc.neural.ops import NumpyOps, CupyOps
from thinc.neural.util import get_array_module


@@ -101,17 +101,12 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
"""
Normalize a dictionary of attributes, converting them to ints.
Arguments:
stringy_attrs (dict):
Dictionary keyed by attribute string names. Values can be ints or strings.
strings_map (StringStore):
Defaults to None. If provided, encodes string values into ints.
Returns:
inty_attrs (dict):
Attributes dictionary with keys and optionally values converted to
ints.
stringy_attrs (dict): Dictionary keyed by attribute string names. Values
can be ints or strings.
strings_map (StringStore): Defaults to None. If provided, encodes string
values into ints.
RETURNS (dict): Attributes dictionary with keys and optionally values
converted to ints.
"""
inty_attrs = {}
if _do_deprecated:

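A minimal usage sketch of intify_attrs as described by the reflowed docstring
above; the attribute values and the throwaway Vocab are illustrative, not part
of the commit:

    # Hypothetical usage of intify_attrs (illustrative values).
    from spacy.attrs import intify_attrs
    from spacy.vocab import Vocab

    vocab = Vocab()
    stringy = {'ORTH': 'running', 'LEMMA': 'run'}
    # Keys become integer attribute IDs; because strings_map is given, the
    # string values are also encoded to integer IDs via the StringStore.
    inty = intify_attrs(stringy, strings_map=vocab.strings)
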

@@ -2,7 +2,6 @@
# coding: utf8
from __future__ import unicode_literals, print_function
import io
import re
import ujson
import random
@@ -10,9 +9,8 @@ import cytoolz
import itertools
from .syntax import nonproj
from .util import ensure_path
from . import util
from .tokens import Doc
from . import util
def tags_to_entities(tags):
@@ -310,7 +308,7 @@ def _corrupt(c, noise_level):
def read_json_file(loc, docs_filter=None, limit=None):
loc = ensure_path(loc)
loc = util.ensure_path(loc)
if loc.is_dir():
for filename in loc.iterdir():
yield from read_json_file(loc / filename, limit=limit)

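The hunk above is mainly about calling the fully qualified util.ensure_path; a
short sketch of what that helper does for read_json_file, with an illustrative
path that is not taken from the commit:

    # util.ensure_path normalizes str arguments to pathlib.Path objects and
    # passes Path objects through, so loc.is_dir() / loc.iterdir() work either way.
    from spacy import util

    loc = util.ensure_path('corpus/train.json')   # str in, Path out
    loc = util.ensure_path(loc)                   # Path in, same Path out
    if loc.is_dir():
        files = sorted(loc.iterdir())
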

@@ -1,22 +1,22 @@
# coding: utf8
from __future__ import absolute_import, unicode_literals
from contextlib import contextmanager
import copy
from thinc.neural import Model
from thinc.neural.optimizers import Adam
import random
import ujson
from collections import OrderedDict
import itertools
import weakref
import functools
from collections import OrderedDict
from contextlib import contextmanager
from copy import copy
from thinc.neural import Model
from thinc.neural.optimizers import Adam
from .tokenizer import Tokenizer
from .vocab import Vocab
from .lemmatizer import Lemmatizer
from .pipeline import DependencyParser, Tensorizer, Tagger
from .pipeline import EntityRecognizer, SimilarityHook, TextCategorizer
from .pipeline import DependencyParser, Tensorizer, Tagger, EntityRecognizer
from .pipeline import SimilarityHook, TextCategorizer
from .compat import json_dumps, izip
from .scorer import Scorer
from ._ml import link_vectors_to_models
@@ -649,7 +649,7 @@ class Language(object):
serializers = OrderedDict((
('vocab', lambda: self.vocab.to_bytes()),
('tokenizer', lambda: self.tokenizer.to_bytes(vocab=False)),
('meta', lambda: ujson.dumps(self.meta))
('meta', lambda: json_dumps(self.meta))
))
for i, (name, proc) in enumerate(self.pipeline):
if name in disable:
@@ -689,7 +689,7 @@ class DisabledPipes(list):
# Important! Not deep copy -- we just want the container (but we also
# want to support people providing arbitrarily typed nlp.pipeline
# objects.)
self.original_pipeline = copy.copy(nlp.pipeline)
self.original_pipeline = copy(nlp.pipeline)
list.__init__(self)
self.extend(nlp.remove_pipe(name) for name in names)

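For readers tracking the DisabledPipes change (plain copy() of the pipeline
list instead of copy.copy()), a brief sketch of how the container is normally
obtained and restored, assuming the spaCy v2 nlp.disable_pipes() API; the model
name is illustrative:

    # Sketch only: DisabledPipes keeps a shallow copy of nlp.pipeline and puts
    # the removed components back when the block exits.
    import spacy

    nlp = spacy.load('en_core_web_sm')            # illustrative model name
    with nlp.disable_pipes('tagger', 'parser'):
        doc = nlp(u"Only the remaining pipeline components run here.")
    # nlp.pipeline is restored at this point.

    # Serialization of the meta now goes through compat.json_dumps:
    data = nlp.to_bytes()
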

@@ -4,12 +4,6 @@
from __future__ import unicode_literals
import ujson
from .typedefs cimport attr_t
from .typedefs cimport hash_t
from .attrs cimport attr_id_t
from .structs cimport TokenC
from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
from libcpp.vector cimport vector
@@ -17,14 +11,15 @@ from libcpp.pair cimport pair
from murmurhash.mrmr cimport hash64
from libc.stdint cimport int32_t
from .attrs cimport ID, NULL_ATTR, ENT_TYPE
from . import attrs
from .tokens.doc cimport get_token_attr
from .tokens.doc cimport Doc
from .typedefs cimport attr_t
from .typedefs cimport hash_t
from .structs cimport TokenC
from .tokens.doc cimport Doc, get_token_attr
from .vocab cimport Vocab
from .attrs import IDS
from .attrs cimport attr_id_t, ID, NULL_ATTR
from .attrs import FLAG61 as U_ENT
from .attrs import FLAG60 as B2_ENT
from .attrs import FLAG59 as B3_ENT
from .attrs import FLAG58 as B4_ENT
@@ -34,7 +29,6 @@ from .attrs import FLAG55 as B7_ENT
from .attrs import FLAG54 as B8_ENT
from .attrs import FLAG53 as B9_ENT
from .attrs import FLAG52 as B10_ENT
from .attrs import FLAG51 as I3_ENT
from .attrs import FLAG50 as I4_ENT
from .attrs import FLAG49 as I5_ENT
@@ -43,7 +37,6 @@ from .attrs import FLAG47 as I7_ENT
from .attrs import FLAG46 as I8_ENT
from .attrs import FLAG45 as I9_ENT
from .attrs import FLAG44 as I10_ENT
from .attrs import FLAG43 as L2_ENT
from .attrs import FLAG42 as L3_ENT
from .attrs import FLAG41 as L4_ENT
@@ -168,10 +161,10 @@ def _convert_strings(token_specs, string_store):
if value in operators:
ops = operators[value]
else:
raise KeyError(
"Unknown operator '%s'. Options: %s" % (value, ', '.join(operators.keys())))
msg = "Unknown operator '%s'. Options: %s"
raise KeyError(msg % (value, ', '.join(operators.keys())))
if isinstance(attr, basestring):
attr = attrs.IDS.get(attr.upper())
attr = IDS.get(attr.upper())
if isinstance(value, basestring):
value = string_store.add(value)
if isinstance(value, bool):
@@ -186,7 +179,7 @@ def _convert_strings(token_specs, string_store):
def merge_phrase(matcher, doc, i, matches):
"""Callback to merge a phrase on match."""
ent_id, label, start, end = matches[i]
span = doc[start : end]
span = doc[start:end]
span.merge(ent_type=label, ent_id=ent_id)
@@ -233,13 +226,13 @@ cdef class Matcher:
return self._normalize_key(key) in self._patterns
def add(self, key, on_match, *patterns):
"""Add a match-rule to the matcher. A match-rule consists of: an ID key,
an on_match callback, and one or more patterns.
"""Add a match-rule to the matcher. A match-rule consists of: an ID
key, an on_match callback, and one or more patterns.
If the key exists, the patterns are appended to the previous ones, and
the previous on_match callback is replaced. The `on_match` callback will
receive the arguments `(matcher, doc, i, matches)`. You can also set
`on_match` to `None` to not perform any actions.
the previous on_match callback is replaced. The `on_match` callback
will receive the arguments `(matcher, doc, i, matches)`. You can also
set `on_match` to `None` to not perform any actions.
A pattern consists of one or more `token_specs`, where a `token_spec`
is a dictionary mapping attribute IDs to values, and optionally a
@@ -253,8 +246,8 @@ cdef class Matcher:
The + and * operators are usually interpretted "greedily", i.e. longer
matches are returned where possible. However, if you specify two '+'
and '*' patterns in a row and their matches overlap, the first
operator will behave non-greedily. This quirk in the semantics
makes the matcher more efficient, by avoiding the need for back-tracking.
operator will behave non-greedily. This quirk in the semantics makes
the matcher more efficient, by avoiding the need for back-tracking.
key (unicode): The match ID.
on_match (callable): Callback executed on match.
@@ -268,7 +261,6 @@ cdef class Matcher:
key = self._normalize_key(key)
self._patterns.setdefault(key, [])
self._callbacks[key] = on_match
for pattern in patterns:
specs = _convert_strings(pattern, self.vocab.strings)
self.patterns.push_back(init_pattern(self.mem, key, specs))
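
The add() docstring above is easier to follow with a concrete rule; the pattern,
match key and callback below are illustrative and assume the blank English
pipeline from spacy.lang.en:

    # Illustrative Matcher rule with an on_match callback and an optional token.
    from spacy.lang.en import English
    from spacy.matcher import Matcher

    nlp = English()
    matcher = Matcher(nlp.vocab)

    def on_match(matcher, doc, i, matches):
        # Receives (matcher, doc, i, matches), as described in the docstring.
        key, start, end = matches[i]
        print('matched:', doc[start:end].text)

    # "hello", optionally followed by punctuation, then "world".
    pattern = [{'LOWER': 'hello'}, {'IS_PUNCT': True, 'OP': '?'}, {'LOWER': 'world'}]
    matcher.add('HelloWorld', on_match, pattern)

    doc = nlp(u'Hello, world! Hello world!')
    matches = matcher(doc)   # list of (key, start, end) tuples
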
@@ -315,9 +307,9 @@ cdef class Matcher:
"""Match a stream of documents, yielding them in turn.
docs (iterable): A stream of documents.
batch_size (int): The number of documents to accumulate into a working set.
batch_size (int): Number of documents to accumulate into a working set.
n_threads (int): The number of threads with which to work on the buffer
in parallel, if the `Matcher` implementation supports multi-threading.
in parallel, if the implementation supports multi-threading.
YIELDS (Doc): Documents, in order.
"""
for doc in docs:
@@ -325,7 +317,7 @@ cdef class Matcher:
yield doc
def __call__(self, Doc doc):
"""Find all token sequences matching the supplied patterns on the `Doc`.
"""Find all token sequences matching the supplied pattern.
doc (Doc): The document to match over.
RETURNS (list): A list of `(key, start, end)` tuples,
@@ -342,8 +334,8 @@ cdef class Matcher:
for token_i in range(doc.length):
token = &doc.c[token_i]
q = 0
# Go over the open matches, extending or finalizing if able. Otherwise,
# we over-write them (q doesn't advance)
# Go over the open matches, extending or finalizing if able.
# Otherwise, we over-write them (q doesn't advance)
for state in partials:
action = get_action(state.second, token)
if action == PANIC:
@@ -356,8 +348,8 @@ cdef class Matcher:
if action == REPEAT:
# Leave the state in the queue, and advance to next slot
# (i.e. we don't overwrite -- we want to greedily match more
# pattern.
# (i.e. we don't overwrite -- we want to greedily match
# more pattern.
q += 1
elif action == REJECT:
pass
@@ -366,8 +358,8 @@ cdef class Matcher:
partials[q].second += 1
q += 1
elif action in (ACCEPT, ACCEPT_PREV):
# TODO: What to do about patterns starting with ZERO? Need to
# adjust the start position.
# TODO: What to do about patterns starting with ZERO? Need
# to adjust the start position.
start = state.first
end = token_i+1 if action == ACCEPT else token_i
ent_id = state.second[1].attrs[0].value
@@ -388,8 +380,8 @@ cdef class Matcher:
state.second = pattern
partials.push_back(state)
elif action == ADVANCE:
# TODO: What to do about patterns starting with ZERO? Need to
# adjust the start position.
# TODO: What to do about patterns starting with ZERO? Need
# to adjust the start position.
state.first = token_i
state.second = pattern + 1
partials.push_back(state)
@@ -413,7 +405,6 @@ cdef class Matcher:
on_match = self._callbacks.get(ent_id)
if on_match is not None:
on_match(self, doc, i, matches)
# TODO: only return (match_id, start, end)
return matches
def _normalize_key(self, key):
@@ -441,7 +432,8 @@ def get_bilou(length):
elif length == 8:
return [B8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, L8_ENT]
elif length == 9:
return [B9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, L9_ENT]
return [B9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT,
L9_ENT]
elif length == 10:
return [B10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT,
I10_ENT, I10_ENT, L10_ENT]
@@ -454,10 +446,8 @@ cdef class PhraseMatcher:
cdef Vocab vocab
cdef Matcher matcher
cdef PreshMap phrase_ids
cdef int max_length
cdef attr_t* _phrase_key
cdef public object _callbacks
cdef public object _patterns
@@ -470,7 +460,8 @@ cdef class PhraseMatcher:
self.phrase_ids = PreshMap()
abstract_patterns = []
for length in range(1, max_length):
abstract_patterns.append([{tag: True} for tag in get_bilou(length)])
abstract_patterns.append([{tag: True}
for tag in get_bilou(length)])
self.matcher.add('Candidate', None, *abstract_patterns)
self._callbacks = {}
@@ -496,8 +487,8 @@ cdef class PhraseMatcher:
return (self.__class__, (self.vocab,), None, None)
def add(self, key, on_match, *docs):
"""Add a match-rule to the matcher. A match-rule consists of: an ID key,
an on_match callback, and one or more patterns.
"""Add a match-rule to the matcher. A match-rule consists of: an ID
key, an on_match callback, and one or more patterns.
key (unicode): The match ID.
on_match (callable): Callback executed on match.
@@ -513,7 +504,6 @@ cdef class PhraseMatcher:
raise ValueError(msg % (len(doc), self.max_length))
cdef hash_t ent_id = self.matcher._normalize_key(key)
self._callbacks[ent_id] = on_match
cdef int length
cdef int i
cdef hash_t phrase_hash
@@ -553,9 +543,9 @@ cdef class PhraseMatcher:
"""Match a stream of documents, yielding them in turn.
docs (iterable): A stream of documents.
batch_size (int): The number of documents to accumulate into a working set.
batch_size (int): Number of documents to accumulate into a working set.
n_threads (int): The number of threads with which to work on the buffer
in parallel, if the `Matcher` implementation supports multi-threading.
in parallel, if the implementation supports multi-threading.
YIELDS (Doc): Documents, in order.
"""
for doc in stream:
@@ -569,7 +559,8 @@ cdef class PhraseMatcher:
self._phrase_key[i] = 0
for i, j in enumerate(range(start, end)):
self._phrase_key[i] = doc.c[j].lex.orth
cdef hash_t key = hash64(self._phrase_key, self.max_length * sizeof(attr_t), 0)
cdef hash_t key = hash64(self._phrase_key,
self.max_length * sizeof(attr_t), 0)
ent_id = <hash_t>self.phrase_ids.get(key)
if ent_id == 0:
return None

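A matching sketch for PhraseMatcher, again with illustrative terminology and the
blank English pipeline; on_match is left as None, as the add() docstring allows:

    # Illustrative PhraseMatcher usage: patterns are Doc objects, and matching
    # is done on the ORTH values of their tokens.
    from spacy.lang.en import English
    from spacy.matcher import PhraseMatcher

    nlp = English()
    matcher = PhraseMatcher(nlp.vocab)
    terms = [u'machine learning', u'natural language processing']
    matcher.add('TECH_TERMS', None, *[nlp(term) for term in terms])

    doc = nlp(u'I like natural language processing.')
    matches = matcher(doc)   # (key, start, end) tuples, as with Matcher
    for matched_doc in matcher.pipe([doc], batch_size=50):
        pass                 # pipe() yields the documents back in order
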

@@ -4,17 +4,15 @@ from __future__ import unicode_literals
from libc.string cimport memset
from .parts_of_speech cimport ADJ, VERB, NOUN, PUNCT, SPACE
from .attrs cimport POS, IS_SPACE
from .attrs import LEMMA, intify_attrs
from .parts_of_speech cimport SPACE
from .parts_of_speech import IDS as POS_IDS
from .lexeme cimport Lexeme
from .attrs import LEMMA, intify_attrs
def _normalize_props(props):
"""
Transform deprecated string keys to correct names.
"""
"""Transform deprecated string keys to correct names."""
out = {}
for key, value in props.items():
if key == POS:
@@ -77,7 +75,8 @@ cdef class Morphology:
cdef int assign_untagged(self, TokenC* token) except -1:
"""Set morphological attributes on a token without a POS tag. Uses
the lemmatizer's lookup() method, which looks up the string in the
table provided by the language data as lemma_lookup (if available)."""
table provided by the language data as lemma_lookup (if available).
"""
if token.lemma == 0:
orth_str = self.strings[token.lex.orth]
lemma = self.lemmatizer.lookup(orth_str)
@@ -95,11 +94,10 @@ cdef class Morphology:
cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1:
if tag_id > self.n_tags:
raise ValueError("Unknown tag ID: %s" % tag_id)
# TODO: It's pretty arbitrary to put this logic here. I guess the justification
# is that this is where the specific word and the tag interact. Still,
# we should have a better way to enforce this rule, or figure out why
# the statistical model fails.
# Related to Issue #220
# TODO: It's pretty arbitrary to put this logic here. I guess the
# justification is that this is where the specific word and the tag
# interact. Still, we should have a better way to enforce this rule, or
# figure out why the statistical model fails. Related to Issue #220
if Lexeme.c_check_flag(token.lex, IS_SPACE):
tag_id = self.reverse_index[self.strings.add('_SP')]
rich_tag = self.rich_tags[tag_id]
@@ -123,12 +121,11 @@ cdef class Morphology:
else:
flags[0] &= ~(one << flag_id)
def add_special_case(self, unicode tag_str, unicode orth_str, attrs, force=False):
"""
Add a special-case rule to the morphological analyser. Tokens whose
def add_special_case(self, unicode tag_str, unicode orth_str, attrs,
force=False):
"""Add a special-case rule to the morphological analyser. Tokens whose
tag and orth match the rule will receive the specified properties.
Arguments:
tag (unicode): The part-of-speech tag to key the exception.
orth (unicode): The word-form to key the exception.
"""
@@ -144,10 +141,9 @@ cdef class Morphology:
elif force:
memset(cached, 0, sizeof(cached[0]))
else:
msg = ("Conflicting morphology exception for (%s, %s). Use force=True "
"to overwrite.")
msg = msg % (tag_str, orth_str)
raise ValueError(msg)
raise ValueError(
"Conflicting morphology exception for (%s, %s). Use "
"force=True to overwrite." % (tag_str, orth_str))
cached.tag = rich_tag
# TODO: Refactor this to take arbitrary attributes.

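A hypothetical sketch of the special-case rule described by the docstring and
the reworded ValueError above; this is internal API, and the tag, orth and
attribute values are made up for illustration:

    # Hypothetical only: register a morphology exception keyed by (tag, orth).
    from spacy.attrs import LEMMA
    from spacy.lang.en import English

    nlp = English()
    morphology = nlp.vocab.morphology
    morphology.add_special_case('VBZ', 'has', {LEMMA: 'have'})
    # Adding a conflicting rule for the same (tag, orth) pair raises the
    # ValueError above unless force=True is passed to overwrite it.
    morphology.add_special_case('VBZ', 'has', {LEMMA: 'have'}, force=True)
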

@@ -85,7 +85,6 @@ class Scorer(object):
def score(self, tokens, gold, verbose=False, punct_labels=('p', 'punct')):
assert len(tokens) == len(gold)
gold_deps = set()
gold_tags = set()
gold_ents = set(tags_to_entities([annot[-1]


@@ -4,19 +4,15 @@ from __future__ import unicode_literals, absolute_import
cimport cython
from libc.string cimport memcpy
from libc.stdint cimport uint64_t, uint32_t
from murmurhash.mrmr cimport hash64, hash32
from preshed.maps cimport map_iter, key_t
from libc.stdint cimport uint32_t
from murmurhash.mrmr cimport hash64, hash32
import ujson
import dill
from .symbols import IDS as SYMBOLS_BY_STR
from .symbols import NAMES as SYMBOLS_BY_INT
from .typedefs cimport hash_t
from . import util
from .compat import json_dumps
from . import util
cpdef hash_t hash_string(unicode string) except 0:
@@ -195,7 +191,7 @@ cdef class StringStore:
"""Save the current state to a directory.
path (unicode or Path): A path to a directory, which will be created if
it doesn't exist. Paths may be either strings or `Path`-like objects.
it doesn't exist. Paths may be either strings or Path-like objects.
"""
path = util.ensure_path(path)
strings = list(self)
@@ -225,7 +221,7 @@ cdef class StringStore:
**exclude: Named attributes to prevent from being serialized.
RETURNS (bytes): The serialized form of the `StringStore` object.
"""
return ujson.dumps(list(self))
return json_dumps(list(self))
def from_bytes(self, bytes_data, **exclude):
"""Load state from a binary string.

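A short roundtrip sketch for the serialization touched here (to_bytes() now
goes through compat.json_dumps); the strings are illustrative:

    # Illustrative StringStore serialization roundtrip.
    from spacy.strings import StringStore

    stringstore = StringStore([u'apple', u'orange'])
    data = stringstore.to_bytes()
    restored = StringStore().from_bytes(data)
    assert u'apple' in restored
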

@@ -1,8 +1,8 @@
# coding: utf8
#cython: optimize.unpack_method_calls=False
from __future__ import unicode_literals
IDS = {
"": NIL,
"IS_ALPHA": IS_ALPHA,
@@ -464,9 +464,11 @@ IDS = {
"LAW": LAW
}
def sort_nums(x):
return x[1]
NAMES = [it[0] for it in sorted(IDS.items(), key=sort_nums)]
# Unfortunate hack here, to work around problem with long cpdef enum
# (which is generating an enormous amount of C++ in Cython 0.24+)


@@ -8,12 +8,11 @@ from cython.operator cimport preincrement as preinc
from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
import regex as re
from .strings cimport hash_string
from . import util
cimport cython
from .tokens.doc cimport Doc
from .strings cimport hash_string
from . import util
cdef class Tokenizer:
@@ -74,9 +73,8 @@ cdef class Tokenizer:
RETURNS (Doc): A container for linguistic annotations.
"""
if len(string) >= (2 ** 30):
raise ValueError(
"String is too long: %d characters. Max is 2**30." % len(string)
)
msg = "String is too long: %d characters. Max is 2**30."
raise ValueError(msg % len(string))
cdef int length = len(string)
cdef Doc doc = Doc(self.vocab)
if length == 0:
@@ -122,8 +120,8 @@ cdef class Tokenizer:
"""Tokenize a stream of texts.
texts: A sequence of unicode texts.
batch_size (int): The number of texts to accumulate in an internal buffer.
n_threads (int): The number of threads to use, if the implementation
batch_size (int): Number of texts to accumulate in an internal buffer.
n_threads (int): Number of threads to use, if the implementation
supports multi-threading. The default tokenizer is single-threaded.
YIELDS (Doc): A sequence of Doc objects, in order.
"""
@@ -232,8 +230,8 @@ cdef class Tokenizer:
if not matches:
tokens.push_back(self.vocab.get(tokens.mem, string), False)
else:
# let's say we have dyn-o-mite-dave
# the regex finds the start and end positions of the hyphens
# let's say we have dyn-o-mite-dave - the regex finds the
# start and end positions of the hyphens
start = 0
for match in matches:
infix_start = match.start()
@@ -293,8 +291,8 @@ cdef class Tokenizer:
return list(self.infix_finditer(string))
def find_prefix(self, unicode string):
"""Find the length of a prefix that should be segmented from the string,
or None if no prefix rules match.
"""Find the length of a prefix that should be segmented from the
string, or None if no prefix rules match.
string (unicode): The string to segment.
RETURNS (int): The length of the prefix if present, otherwise `None`.
@@ -305,8 +303,8 @@ cdef class Tokenizer:
return (match.end() - match.start()) if match is not None else 0
def find_suffix(self, unicode string):
"""Find the length of a suffix that should be segmented from the string,
or None if no suffix rules match.
"""Find the length of a suffix that should be segmented from the
string, or None if no suffix rules match.
string (unicode): The string to segment.
Returns (int): The length of the suffix if present, otherwise `None`.
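
To make the reflowed prefix/suffix docstrings concrete, a small sketch using the
blank English tokenizer; the example strings are illustrative:

    # Illustrative: lengths of the segments the prefix/suffix rules would split off.
    from spacy.lang.en import English

    nlp = English()
    tokenizer = nlp.tokenizer
    n_prefix = tokenizer.find_prefix(u'"Hello')   # length of the leading quote
    n_suffix = tokenizer.find_suffix(u'Hello!')   # length of the trailing "!"
    # Both return 0/None when no rule matches, per the docstrings above.
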
@@ -326,8 +324,8 @@ cdef class Tokenizer:
string (unicode): The string to specially tokenize.
token_attrs (iterable): A sequence of dicts, where each dict describes
a token and its attributes. The `ORTH` fields of the attributes must
exactly match the string when they are concatenated.
a token and its attributes. The `ORTH` fields of the attributes
must exactly match the string when they are concatenated.
"""
substrings = list(substrings)
cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
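
And a usage sketch for add_special_case, whose docstring above stresses that the
ORTH values must concatenate back exactly to the special-cased string; the
contraction is just an example:

    # Illustrative special case: 'do' + "n't" concatenates exactly to "don't".
    from spacy.attrs import ORTH, LEMMA
    from spacy.lang.en import English

    nlp = English()
    nlp.tokenizer.add_special_case(u"don't",
                                   [{ORTH: u"do"}, {ORTH: u"n't", LEMMA: u"not"}])
    doc = nlp(u"I don't know.")
    print([t.text for t in doc])   # ['I', 'do', "n't", 'know', '.']
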
@@ -343,7 +341,7 @@ cdef class Tokenizer:
"""Save the current state to a directory.
path (unicode or Path): A path to a directory, which will be created if
it doesn't exist. Paths may be either strings or `Path`-like objects.
it doesn't exist. Paths may be either strings or Path-like objects.
"""
with path.open('wb') as file_:
file_.write(self.to_bytes(**exclude))


@@ -476,7 +476,7 @@ cdef class Span:
"""
# TODO: implement
def __get__(self):
raise NotImplementedError()
raise NotImplementedError
property n_rights:
"""RETURNS (int): The number of rightward immediate children of the
@@ -484,7 +484,7 @@ cdef class Span:
"""
# TODO: implement
def __get__(self):
raise NotImplementedError()
raise NotImplementedError
property subtree:
"""Tokens that descend from tokens in the span, but fall outside it.


@@ -1 +0,0 @@


@@ -17,8 +17,8 @@ from .compat import copy_reg, basestring_
from .lemmatizer import Lemmatizer
from .attrs import intify_attrs
from .vectors import Vectors
from . import util
from ._ml import link_vectors_to_models
from . import util
cdef class Vocab: