Tidy up and auto-format

Ines Montani 2020-02-18 15:38:18 +01:00
parent 1278161f47
commit e3f40a6a0f
127 changed files with 219 additions and 275 deletions
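Most hunks below follow two mechanical patterns: Thinc imports are consolidated onto the public thinc.api module instead of internal submodules (thinc.util, thinc.layers, thinc.loss, thinc.backends, thinc.model), and str.format() calls are rewritten as f-strings; the remaining changes are black-style reformatting. A minimal standalone sketch of the before/after pattern (illustrative only, not part of the commit; the output directory name is invented here):

    from pathlib import Path

    # After: imports come from Thinc's public API module rather than submodules.
    from thinc.api import prefer_gpu, require_gpu  # was: from thinc.util import ...
    from wasabi import msg

    output_dir = Path("output")  # illustrative path, not taken from the diff
    if not output_dir.exists():
        output_dir.mkdir()
    # Before: msg.good("Created output directory: {}".format(output_dir))
    msg.good(f"Created output directory: {output_dir}")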

View File

@ -5,7 +5,7 @@ warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")
# These are imported as part of the API
from thinc.util import prefer_gpu, require_gpu
from thinc.api import prefer_gpu, require_gpu
from . import pipeline
from .cli.info import info as cli_info

View File

@ -4,7 +4,7 @@ from .link import link # noqa: F401
from .package import package # noqa: F401
from .profile import profile # noqa: F401
from .train import train # noqa: F401
from .train_from_config import train_from_config_cli # noqa: F401
from .train_from_config import train_from_config_cli # noqa: F401
from .pretrain import pretrain # noqa: F401
from .debug_data import debug_data # noqa: F401
from .evaluate import evaluate # noqa: F401

View File

@ -192,11 +192,7 @@ def debug_data(
has_ws_ents_error = True
if gold_train_data["punct_ents"]:
msg.warn(
"{} entity span(s) with punctuation".format(
gold_train_data["punct_ents"]
)
)
msg.warn(f"{gold_train_data['punct_ents']} entity span(s) with punctuation")
has_punct_ents_warning = True
for label in new_labels:

View File

@ -4,14 +4,12 @@ import time
import re
from collections import Counter
from pathlib import Path
from thinc.layers import Linear, Maxout
from thinc.util import prefer_gpu
from thinc.api import Linear, Maxout, chain, list2array, prefer_gpu
from thinc.api import CosineDistance, L2Distance
from wasabi import msg
import srsly
from thinc.layers import chain, list2array
from thinc.loss import CosineDistance, L2Distance
from spacy.gold import Example
from ..gold import Example
from ..errors import Errors
from ..tokens import Doc
from ..attrs import ID, HEAD
@ -85,7 +83,7 @@ def pretrain(
)
if not output_dir.exists():
output_dir.mkdir()
msg.good("Created output directory: {}".format(output_dir))
msg.good(f"Created output directory: {output_dir}")
srsly.write_json(output_dir / "config.json", config)
msg.good("Saved settings to config.json")

View File

@ -1,7 +1,7 @@
import os
import tqdm
from pathlib import Path
from thinc.backends import use_ops
from thinc.api import use_ops
from timeit import default_timer as timer
import shutil
import srsly
@ -89,7 +89,7 @@ def train(
)
if not output_path.exists():
output_path.mkdir()
msg.good("Created output directory: {}".format(output_path))
msg.good(f"Created output directory: {output_path}")
tag_map = {}
if tag_map_path is not None:
@ -125,17 +125,17 @@ def train(
msg.text(f"Training pipeline: {pipeline}")
disabled_pipes = None
pipes_added = False
msg.text("Training pipeline: {}".format(pipeline))
msg.text(f"Training pipeline: {pipeline}")
if use_gpu >= 0:
activated_gpu = None
try:
activated_gpu = set_gpu(use_gpu)
except Exception as e:
msg.warn("Exception: {}".format(e))
msg.warn(f"Exception: {e}")
if activated_gpu is not None:
msg.text("Using GPU: {}".format(use_gpu))
msg.text(f"Using GPU: {use_gpu}")
else:
msg.warn("Unable to activate GPU: {}".format(use_gpu))
msg.warn(f"Unable to activate GPU: {use_gpu}")
msg.text("Using CPU only")
use_gpu = -1
if base_model:
@ -158,11 +158,11 @@ def train(
"positive_label": textcat_positive_label,
}
if pipe not in nlp.pipe_names:
msg.text("Adding component to base model '{}'".format(pipe))
msg.text(f"Adding component to base model '{pipe}'")
nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))
pipes_added = True
elif replace_components:
msg.text("Replacing component from base model '{}'".format(pipe))
msg.text(f"Replacing component from base model '{pipe}'")
nlp.replace_pipe(pipe, nlp.create_pipe(pipe, config=pipe_cfg))
pipes_added = True
else:
@ -180,7 +180,7 @@ def train(
f"Existing cfg: {base_cfg}, provided cfg: {pipe_cfg}",
exits=1,
)
msg.text("Extending component from base model '{}'".format(pipe))
msg.text(f"Extending component from base model '{pipe}'")
disabled_pipes = nlp.disable_pipes(
[p for p in nlp.pipe_names if p not in pipeline]
)
@ -377,7 +377,7 @@ def train(
msg.warn(
"Did you provide the same parameters during 'train' as during 'pretrain'?"
)
msg.fail("Original error message: {}".format(e), exits=1)
msg.fail(f"Original error message: {e}", exits=1)
if raw_text:
# If raw text is available, perform 'rehearsal' updates,
# which use unlabelled data to reduce overfitting.
@ -504,11 +504,7 @@ def train(
)
break
except Exception as e:
msg.warn(
"Aborting and saving the final best model. Encountered exception: {}".format(
e
)
)
msg.warn(f"Aborting and saving final best model. Encountered exception: {e}")
finally:
best_pipes = nlp.pipe_names
if disabled_pipes:

View File

@ -1,19 +1,20 @@
from typing import Optional, Dict, List, Union, Sequence
import plac
from thinc.util import require_gpu
from wasabi import msg
from pathlib import Path
import thinc
import thinc.schedules
from thinc.model import Model
from spacy.gold import GoldCorpus
import spacy
from spacy.pipeline.tok2vec import Tok2VecListener
from typing import Optional, Dict, List, Union, Sequence
from thinc.api import Model
from pydantic import BaseModel, FilePath, StrictInt
import tqdm
from ..ml import component_models
from .. import util
# TODO: relative imports?
import spacy
from spacy.gold import GoldCorpus
from spacy.pipeline.tok2vec import Tok2VecListener
from spacy.ml import component_models
from spacy import util
registry = util.registry
@ -153,10 +154,9 @@ def create_tb_parser_model(
hidden_width: StrictInt = 64,
maxout_pieces: StrictInt = 3,
):
from thinc.layers import Linear, chain, list2array
from thinc.api import Linear, chain, list2array, use_ops, zero_init
from spacy.ml._layers import PrecomputableAffine
from spacy.syntax._parser_model import ParserModel
from thinc.api import use_ops, zero_init
token_vector_width = tok2vec.get_dim("nO")
tok2vec = chain(tok2vec, list2array())
@ -221,13 +221,9 @@ def train_from_config_cli(
def train_from_config(
config_path,
data_paths,
raw_text=None,
meta_path=None,
output_path=None,
config_path, data_paths, raw_text=None, meta_path=None, output_path=None,
):
msg.info("Loading config from: {}".format(config_path))
msg.info(f"Loading config from: {config_path}")
config = util.load_from_config(config_path, create_objects=True)
use_gpu = config["training"]["use_gpu"]
if use_gpu >= 0:
@ -241,9 +237,7 @@ def train_from_config(
msg.info("Loading training corpus")
corpus = GoldCorpus(data_paths["train"], data_paths["dev"], limit=limit)
msg.info("Initializing the nlp pipeline")
nlp.begin_training(
lambda: corpus.train_examples, device=use_gpu
)
nlp.begin_training(lambda: corpus.train_examples, device=use_gpu)
train_batches = create_train_batches(nlp, corpus, config["training"])
evaluate = create_evaluation_callback(nlp, optimizer, corpus, config["training"])
@ -260,7 +254,7 @@ def train_from_config(
config["training"]["eval_frequency"],
)
msg.info("Training. Initial learn rate: {}".format(optimizer.learn_rate))
msg.info(f"Training. Initial learn rate: {optimizer.learn_rate}")
print_row = setup_printer(config)
try:
@ -414,7 +408,7 @@ def subdivide_batch(batch):
def setup_printer(config):
score_cols = config["training"]["scores"]
score_widths = [max(len(col), 6) for col in score_cols]
loss_cols = ["Loss {}".format(pipe) for pipe in config["nlp"]["pipeline"]]
loss_cols = [f"Loss {pipe}" for pipe in config["nlp"]["pipeline"]]
loss_widths = [max(len(col), 8) for col in loss_cols]
table_header = ["#"] + loss_cols + score_cols + ["Score"]
table_header = [col.upper() for col in table_header]

View File

@ -30,7 +30,7 @@ try:
except ImportError:
cupy = None
from thinc.optimizers import Optimizer # noqa: F401
from thinc.api import Optimizer # noqa: F401
pickle = pickle
copy_reg = copy_reg

View File

@ -1,4 +1,3 @@
# Setting explicit height and max-width: none on the SVG is required for
# Jupyter to render it properly in a cell

View File

@ -1,4 +1,3 @@
def explain(term):
"""Get a description for a given POS tag, dependency label or entity type.

View File

@ -1,6 +1,6 @@
from cymem.cymem cimport Pool
from spacy.tokens import Doc
from .tokens import Doc
from .typedefs cimport attr_t
from .syntax.transition_system cimport Transition
@ -65,5 +65,3 @@ cdef class Example:
cdef public TokenAnnotation token_annotation
cdef public DocAnnotation doc_annotation
cdef public object goldparse

View File

@ -6,7 +6,7 @@ from libcpp.vector cimport vector
from libc.stdint cimport int32_t, int64_t
from libc.stdio cimport FILE
from spacy.vocab cimport Vocab
from .vocab cimport Vocab
from .typedefs cimport hash_t
from .structs cimport KBEntryC, AliasC
@ -113,7 +113,7 @@ cdef class KnowledgeBase:
return new_index
cdef inline void _create_empty_vectors(self, hash_t dummy_hash) nogil:
"""
"""
Initializing the vectors and making sure the first element of each vector is a dummy,
because the PreshMap maps pointing to indices in these vectors can not contain 0 as value
cf. https://github.com/explosion/preshed/issues/17
@ -169,4 +169,3 @@ cdef class Reader:
cdef int read_alias(self, int64_t* entry_index, float* prob) except -1
cdef int _read(self, void* value, size_t size) except -1

View File

@ -1,4 +1,3 @@
# Source: https://github.com/stopwords-iso/stopwords-af
STOP_WORDS = set(

View File

@ -1,4 +1,3 @@
# Source: https://github.com/Alir3z4/stop-words
STOP_WORDS = set(

View File

@ -1,4 +1,3 @@
"""
Example sentences to test spaCy and its language models.

View File

@ -1,4 +1,3 @@
STOP_WORDS = set(
"""
অতএব অথচ অথব অন অন অন অন অনতত অবধি অবশ অর অন অন অরধভ

View File

@ -1,4 +1,3 @@
"""
Example sentences to test spaCy and its language models.

View File

@ -1,4 +1,3 @@
# Source: https://github.com/Alir3z4/stop-words
STOP_WORDS = set(

View File

@ -1,4 +1,3 @@
"""
Example sentences to test spaCy and its language models.

View File

@ -1,4 +1,3 @@
"""
Example sentences to test spaCy and its language models.

View File

@ -1,4 +1,3 @@
STOP_WORDS = set(
"""
á a ab aber ach acht achte achten achter achtes ag alle allein allem allen
@ -44,7 +43,7 @@ kleines kommen kommt können könnt konnte könnte konnten kurz
lang lange leicht leider lieber los
machen macht machte mag magst man manche manchem manchen mancher manches mehr
mein meine meinem meinen meiner meines mich mir mit mittel mochte möchte mochten
mein meine meinem meinen meiner meines mich mir mit mittel mochte möchte mochten
mögen möglich mögt morgen muss muß müssen musst müsst musste mussten
na nach nachdem nahm natürlich neben nein neue neuen neun neunte neunten neunter

View File

@ -1,4 +1,3 @@
def get_pos_from_wiktionary():
import re
from gensim.corpora.wikicorpus import extract_pages

View File

@ -1,4 +1,3 @@
# These exceptions are used to add NORM values based on a token's ORTH value.
# Norms are only set if no alternative is provided in the tokenizer exceptions.

View File

@ -1,4 +1,3 @@
# Stop words
# Link to greek stop words: https://www.translatum.gr/forum/index.php?topic=3550.0?topic=3550.0
STOP_WORDS = set(

View File

@ -1,4 +1,3 @@
"""
Example sentences to test spaCy and its language models.

View File

@ -1,4 +1,3 @@
_exc = {
# Slang and abbreviations
"cos": "because",

View File

@ -1,4 +1,3 @@
# Stop words
STOP_WORDS = set(
"""

View File

@ -1,4 +1,3 @@
"""
Example sentences to test spaCy and its language models.

View File

@ -1,4 +1,3 @@
STOP_WORDS = set(
"""
actualmente acuerdo adelante ademas además adrede afirmó agregó ahi ahora ahí

View File

@ -1,4 +1,3 @@
# Source: https://github.com/stopwords-iso/stopwords-et
STOP_WORDS = set(

View File

@ -1,4 +1,3 @@
"""
Example sentences to test spaCy and its language models.

View File

@ -1,4 +1,3 @@
verb_roots = """
#هست
آخت#آهنج

View File

@ -1,4 +1,3 @@
# Stop words from HAZM package
STOP_WORDS = set(
"""

View File

@ -1,4 +1,3 @@
# Source https://github.com/stopwords-iso/stopwords-fi/blob/master/stopwords-fi.txt
# Reformatted with some minor corrections
STOP_WORDS = set(

View File

@ -1,4 +1,3 @@
"""
Example sentences to test spaCy and its language models.

View File

@ -1,4 +1,3 @@
STOP_WORDS = set(
"""
a à â abord absolument afin ah ai aie ailleurs ainsi ait allaient allo allons

View File

@ -1,4 +1,3 @@
# fmt: off
consonants = ["b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "z"]
broad_vowels = ["a", "á", "o", "ó", "u", "ú"]

View File

@ -1,4 +1,3 @@
"""
Example sentences to test spaCy and its language models.

View File

@ -1,4 +1,3 @@
"""
Example sentences to test spaCy and its language models.

View File

@ -1,4 +1,3 @@
# Source: https://github.com/taranjeet/hindi-tokenizer/blob/master/stopwords.txt, https://data.mendeley.com/datasets/bsr3frvvjc/1#file-a21d5092-99d7-45d8-b044-3ae9edd391c6
STOP_WORDS = set(

View File

@ -1,4 +1,3 @@
"""
Example sentences to test spaCy and its language models.

View File

@ -1,4 +1,3 @@
STOP_WORDS = set(
"""
a abban ahhoz ahogy ahol aki akik akkor akár alatt amely amelyek amelyekben

View File

@ -1,4 +1,3 @@
"""
Example sentences to test spaCy and its language models.

View File

@ -1,4 +1,3 @@
# Source: https://github.com/Xangis/extra-stopwords
STOP_WORDS = set(

View File

@ -1,4 +1,3 @@
"""
Example sentences to test spaCy and its language models.

View File

@ -1,4 +1,3 @@
STOP_WORDS = set(
"""
a abbastanza abbia abbiamo abbiano abbiate accidenti ad adesso affinche agl

View File

@ -1,4 +1,3 @@
"""
Example sentences to test spaCy and its language models.

View File

@ -1,4 +1,3 @@
STOP_WORDS = set(
"""
ಹಲವ

View File

@ -1,4 +1,3 @@
"""
Example sentences to test spaCy and its language models.

View File

@ -1,4 +1,3 @@
# Source: https://github.com/stopwords-iso/stopwords-lv
STOP_WORDS = set(

View File

@ -1,4 +1,3 @@
# Source: https://github.com/stopwords-iso/stopwords-mr/blob/master/stopwords-mr.txt, https://github.com/6/stopwords-json/edit/master/dist/mr.json
STOP_WORDS = set(
"""

View File

@ -1,4 +1,3 @@
"""
Example sentences to test spaCy and its language models.

View File

@ -1,4 +1,3 @@
"""
Example sentences to test spaCy and its language models.

View File

@ -1,4 +1,3 @@
# These exceptions are used to add NORM values based on a token's ORTH value.
# Individual languages can also add their own exceptions and overwrite them -
# for example, British vs. American spelling in English.

View File

@ -1,4 +1,3 @@
"""
Example sentences to test spaCy and its language models.

View File

@ -1,4 +1,3 @@
"""
Example sentences to test spaCy and its language models.

View File

@ -1,4 +1,3 @@
STOP_WORDS = set(
"""
à às área acerca ademais adeus agora ainda algo algumas alguns ali além ambas ambos antes

View File

@ -1,4 +1,3 @@
"""
Example sentences to test spaCy and its language models.

View File

@ -1,4 +1,3 @@
"""
Example sentences to test spaCy and its language models.

View File

@ -1,4 +1,3 @@
_exc = {
# Slang
"прив": "привет",

View File

@ -1,4 +1,3 @@
"""
Example sentences to test spaCy and its language models.

View File

@ -1,4 +1,3 @@
STOP_WORDS = set(
"""
අතර

View File

@ -1,4 +1,3 @@
# Source: https://github.com/Ardevop-sk/stopwords-sk
STOP_WORDS = set(

View File

@ -1,4 +1,3 @@
# Source: https://github.com/stopwords-iso/stopwords-sl
# TODO: probably needs to be tidied up the list seems to have month names in
# it, which shouldn't be considered stop words.

View File

@ -1,4 +1,3 @@
"""
Example sentences to test spaCy and its language models.

View File

@ -1,4 +1,3 @@
# Source: https://github.com/andrixh/index-albanian
STOP_WORDS = set(

View File

@ -1,4 +1,3 @@
"""
Example sentences to test spaCy and its language models.

View File

@ -1,4 +1,3 @@
_exc = {
# Slang
"ћале": "отац",

View File

@ -1,4 +1,3 @@
STOP_WORDS = set(
"""
а

View File

@ -1,4 +1,3 @@
"""
Example sentences to test spaCy and its language models.

View File

@ -1,4 +1,3 @@
STOP_WORDS = set(
"""
aderton adertonde adjö aldrig alla allas allt alltid alltså än andra andras

View File

@ -1,4 +1,3 @@
"""
Example sentences to test spaCy and its language models.

View File

@ -1,4 +1,3 @@
# Stop words
STOP_WORDS = set(

View File

@ -1,4 +1,3 @@
"""
Example sentences to test spaCy and its language models.

View File

@ -1,4 +1,3 @@
_exc = {
# Conjugation and Diversion invalid to Tonal form (ผันอักษรและเสียงไม่ตรงกับรูปวรรณยุกต์)
"สนุ๊กเกอร์": "สนุกเกอร์",

View File

@ -34,7 +34,7 @@ URL_PATTERN = (
r"|"
# host & domain names
# mods: match is case-sensitive, so include [A-Z]
"(?:"
"(?:" # noqa: E131
"(?:"
"[A-Za-z0-9\u00a1-\uffff]"
"[A-Za-z0-9\u00a1-\uffff_-]{0,62}"

View File

@ -1,4 +1,3 @@
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.tr.examples import sentences

View File

@ -1,4 +1,3 @@
"""
Example sentences to test spaCy and its language models.

View File

@ -1,4 +1,3 @@
"""
Example sentences to test spaCy and its language models.

View File

@ -1,4 +1,3 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language

View File

@ -1,4 +1,3 @@
"""
Example sentences to test spaCy and its language models.

View File

@ -1,4 +1,3 @@
"""
Example sentences to test spaCy and its language models.

View File

@ -1,4 +1,3 @@
"""
Example sentences to test spaCy and its language models.

View File

@ -4,7 +4,7 @@ import weakref
import functools
from contextlib import contextmanager
from copy import copy, deepcopy
from thinc.backends import get_current_ops
from thinc.api import get_current_ops
import srsly
import multiprocessing as mp
from itertools import chain, cycle

View File

@ -6,7 +6,7 @@ cimport numpy as np
np.import_array()
import numpy
from thinc.util import get_array_module
from thinc.api import get_array_module
from .typedefs cimport attr_t, flags_t
from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE

View File

@ -3,18 +3,20 @@ from thinc.api import Model
def CharacterEmbed(nM, nC):
# nM: Number of dimensions per character. nC: Number of characters.
nO = nM*nC if (nM is not None and nC is not None) else None
nO = nM * nC if (nM is not None and nC is not None) else None
return Model(
"charembed",
forward,
init=init,
dims={"nM": nM, "nC": nC, "nO": nO, "nV": 256},
params={"E": None}
params={"E": None},
).initialize()
def init(model, X=None, Y=None):
vectors_table = model.ops.alloc3f(model.get_dim("nC"), model.get_dim("nV"), model.get_dim("nM"))
vectors_table = model.ops.alloc3f(
model.get_dim("nC"), model.get_dim("nV"), model.get_dim("nM")
)
model.set_param("E", vectors_table)

View File

@ -1,5 +1,4 @@
from thinc.model import Model
from thinc.api import normal_init
from thinc.api import Model, normal_init
def PrecomputableAffine(nO, nI, nF, nP):
@ -20,9 +19,7 @@ def forward(model, X, is_train):
nP = model.get_dim("nP")
nI = model.get_dim("nI")
W = model.get_param("W")
Yf = model.ops.gemm(
X, W.reshape((nF * nO * nP, nI)), trans2=True
)
Yf = model.ops.gemm(X, W.reshape((nF * nO * nP, nI)), trans2=True)
Yf = Yf.reshape((Yf.shape[0], nF, nO, nP))
Yf = model.ops.xp.vstack((model.get_param("pad"), Yf))
@ -37,14 +34,14 @@ def forward(model, X, is_train):
# for b in range(nB):
# for f in range(nF):
# dYf[b, ids[b, f]] += dY[b]
#
#
# However, we avoid building that array for efficiency -- and just pass
# in the indices.
dY, ids = dY_ids
assert dY.ndim == 3
assert dY.shape[1] == nO, dY.shape
assert dY.shape[2] == nP, dY.shape
nB = dY.shape[0]
# nB = dY.shape[0]
model.inc_grad("pad", _backprop_precomputable_affine_padding(model, dY, ids))
Xf = X[ids]
Xf = Xf.reshape((Xf.shape[0], nF * nI))
@ -83,12 +80,12 @@ def _backprop_precomputable_affine_padding(model, dY, ids):
# for f in range(nF):
# if ids[b, f] < 0:
# d_padding[0, f] += dY[b]
#
#
# Which can be rewritten as:
#
# for b in range(nB):
# d_pad[0, ids[b] < 0] += dY[b]
#
#
# I don't know how to avoid the loop without building a whole array :(.
# Cursed numpy.
d_pad = model.ops.alloc((1, nF, nO, nP))
@ -118,7 +115,7 @@ def init(model, X=None, Y=None):
pad = model.ops.alloc4f(1, nF, nO, nP)
ops = model.ops
W = normal_init(ops, W.shape, fan_in=nF*nI)
W = normal_init(ops, W.shape, fan_in=nF * nI)
model.set_param("W", W)
model.set_param("b", b)
model.set_param("pad", pad)

View File

@ -9,7 +9,7 @@ from thinc.api import Model, Maxout, Linear, residual, reduce_mean, list2ragged
from thinc.api import PyTorchLSTM, add, MultiSoftmax, HashEmbed, StaticVectors
from thinc.api import expand_window, FeatureExtractor, SparseLinear, chain
from thinc.api import clone, concatenate, with_array, Softmax, Logistic, uniqued
from thinc.api import zero_init, glorot_uniform_init
from thinc.api import zero_init
def build_text_classifier(arch, config):
@ -33,10 +33,7 @@ def build_simple_cnn_text_classifier(tok2vec, nr_class, exclusive_classes, **cfg
output_layer = Softmax(nO=nr_class, nI=tok2vec.get_dim("nO"))
else:
# TODO: experiment with init_w=zero_init
output_layer = (
Linear(nO=nr_class, nI=tok2vec.get_dim("nO"))
>> Logistic()
)
output_layer = Linear(nO=nr_class, nI=tok2vec.get_dim("nO")) >> Logistic()
model = tok2vec >> list2ragged() >> reduce_mean() >> output_layer
model.set_ref("tok2vec", tok2vec)
model.set_dim("nO", nr_class)
@ -149,13 +146,21 @@ def Tok2Vec(
with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
norm = HashEmbed(nO=width, nV=embed_size, column=cols.index(NORM), dropout=0.0)
if subword_features:
prefix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=0.0)
suffix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=0.0)
shape = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=0.0)
prefix = HashEmbed(
nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=0.0
)
suffix = HashEmbed(
nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=0.0
)
shape = HashEmbed(
nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=0.0
)
else:
prefix, suffix, shape = (None, None, None)
if pretrained_vectors is not None:
glove = StaticVectors(vectors=pretrained_vectors, nO=width, column=cols.index(ID), dropout=0.0)
glove = StaticVectors(
vectors=pretrained_vectors, nO=width, column=cols.index(ID), dropout=0.0
)
if subword_features:
embed = uniqued(

View File

@ -1,5 +1,5 @@
import numpy
from thinc.model import Model
from thinc.api import Model
from ..attrs import LOWER
@ -26,9 +26,7 @@ def forward(self, docs, is_train: bool):
# The dtype here matches what thinc is expecting -- which differs per
# platform (by int definition). This should be fixed once the problem
# is fixed on Thinc's side.
lengths = self.ops.asarray(
[arr.shape[0] for arr in batch_keys], dtype=numpy.int_
)
lengths = self.ops.asarray([arr.shape[0] for arr in batch_keys], dtype=numpy.int_)
batch_keys = self.ops.xp.concatenate(batch_keys)
batch_vals = self.ops.asarray(self.ops.xp.concatenate(batch_vals), dtype="f")
@ -36,4 +34,3 @@ def forward(self, docs, is_train: bool):
return dY
return (batch_keys, batch_vals, lengths), backprop

View File

@ -1,11 +1,8 @@
from thinc.layers import chain, clone, concatenate, with_array, uniqued
from thinc.model import Model
from thinc.layers import noop, with_padded
from thinc.layers import Maxout, expand_window
from thinc.layers import HashEmbed, StaticVectors
from thinc.layers import residual, LayerNorm, FeatureExtractor
from thinc.api import Model, chain, clone, concatenate, with_array, uniqued, noop
from thinc.api import with_padded, Maxout, expand_window, HashEmbed, StaticVectors
from thinc.api import residual, LayerNorm, FeatureExtractor
from spacy.ml import _character_embed
from ..ml import _character_embed
from ..util import make_layer, registry
@ -93,8 +90,10 @@ def MaxoutWindowEncoder(config):
nW = config["window_size"]
nP = config["pieces"]
depth = config["depth"]
cnn = expand_window(window_size=nW), Maxout(nO=nO, nI=nO * ((nW * 2) + 1), nP=nP, dropout=0.0, normalize=True)
cnn = (
expand_window(window_size=nW),
Maxout(nO=nO, nI=nO * ((nW * 2) + 1), nP=nP, dropout=0.0, normalize=True),
)
model = clone(residual(cnn), depth)
model.set_dim("nO", nO)
model.attrs["receptive_field"] = nW * depth
@ -103,13 +102,16 @@ def MaxoutWindowEncoder(config):
@registry.architectures.register("spacy.MishWindowEncoder.v1")
def MishWindowEncoder(config):
from thinc.layers import Mish
from thinc.api import Mish
nO = config["width"]
nW = config["window_size"]
depth = config["depth"]
cnn = chain(expand_window(window_size=nW), Mish(nO=nO, nI=nO * ((nW * 2) + 1)), LayerNorm(nO))
cnn = chain(
expand_window(window_size=nW),
Mish(nO=nO, nI=nO * ((nW * 2) + 1)),
LayerNorm(nO),
)
model = clone(residual(cnn), depth)
model.set_dim("nO", nO)
return model
@ -118,14 +120,20 @@ def MishWindowEncoder(config):
@registry.architectures.register("spacy.PretrainedVectors.v1")
def PretrainedVectors(config):
# TODO: actual vectors instead of name
return StaticVectors(vectors=config["vectors_name"], nO=config["width"], column=config["column"], dropout=0.0)
return StaticVectors(
vectors=config["vectors_name"],
nO=config["width"],
column=config["column"],
dropout=0.0,
)
@registry.architectures.register("spacy.TorchBiLSTMEncoder.v1")
def TorchBiLSTMEncoder(config):
import torch.nn
# TODO FIX
from thinc.layers import PyTorchRNNWrapper
# TODO: FIX
from thinc.api import PyTorchRNNWrapper
width = config["width"]
depth = config["depth"]

View File

@ -1,4 +1,4 @@
from thinc.layers import concatenate, reduce_max, reduce_mean, siamese, CauchySimilarity
from thinc.api import concatenate, reduce_max, reduce_mean, siamese, CauchySimilarity
from .pipes import Pipe
from ..language import component
@ -63,8 +63,7 @@ class SimilarityHook(Pipe):
@classmethod
def Model(cls, length):
return siamese(
concatenate(reduce_max(), reduce_mean()),
CauchySimilarity(length * 2)
concatenate(reduce_max(), reduce_mean()), CauchySimilarity(length * 2)
)
def __call__(self, doc):

View File

@ -3,8 +3,8 @@ from collections import defaultdict
import numpy
cimport numpy as np
from thinc.layers import chain, list2array
from thinc.util import to_categorical, copy_array, get_array_module
from thinc.api import chain, list2array, to_categorical, get_array_module
from thinc.util import copy_array
from .. import util
from .pipes import Pipe

View File

@ -3,11 +3,9 @@
import numpy
import srsly
import random
from thinc.layers import chain, Linear, Maxout, Softmax, LayerNorm, list2array
from thinc.initializers import zero_init
from thinc.loss import CosineDistance
from thinc.util import to_categorical, get_array_module
from thinc.model import set_dropout_rate
from thinc.api import chain, Linear, Maxout, Softmax, LayerNorm, list2array
from thinc.api import zero_init, CosineDistance, to_categorical, get_array_module
from thinc.api import set_dropout_rate
from ..tokens.doc cimport Doc
from ..syntax.nn_parser cimport Parser

View File

@ -1,3 +1,5 @@
from thinc.api import Model, set_dropout_rate
from .pipes import Pipe
from ..gold import Example
from ..tokens import Doc
@ -5,8 +7,6 @@ from ..vocab import Vocab
from ..language import component
from ..util import link_vectors_to_models, minibatch, registry, eg2doc
from thinc.model import Model, set_dropout_rate
@component("tok2vec", assigns=["doc.tensor"])
class Tok2Vec(Pipe):
@ -39,7 +39,9 @@ class Tok2Vec(Pipe):
self.listeners = []
def create_listener(self):
listener = Tok2VecListener(upstream_name="tok2vec", width=self.model.get_dim("nO"))
listener = Tok2VecListener(
upstream_name="tok2vec", width=self.model.get_dim("nO")
)
self.listeners.append(listener)
def add_listener(self, listener):
@ -112,10 +114,10 @@ class Tok2Vec(Pipe):
docs = [docs]
set_dropout_rate(self.model, drop)
tokvecs, bp_tokvecs = self.model.begin_update(docs)
def capture_losses(d_tokvecs):
"""Accumulate tok2vec loss before doing backprop."""
l2_loss = sum((d_t2v**2).sum() for d_t2v in d_tokvecs)
l2_loss = sum((d_t2v ** 2).sum() for d_t2v in d_tokvecs)
if self.name in losses:
losses[self.name] += l2_loss / len(d_tokvecs)
else:
@ -133,7 +135,9 @@ class Tok2Vec(Pipe):
def get_loss(self, docs, golds, scores):
pass
def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs):
def begin_training(
self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs
):
"""Allocate models and pre-process training data
get_examples (function): Function returning example training data.
@ -151,6 +155,7 @@ class Tok2VecListener(Model):
"""A layer that gets fed its answers from an upstream connection,
for instance from a component earlier in the pipeline.
"""
name = "tok2vec-listener"
def __init__(self, upstream_name, width):

View File

@ -11,9 +11,7 @@ from libc.string cimport memset, memcpy
from libc.stdlib cimport calloc, free, realloc
from cymem.cymem cimport Pool
from thinc.extra.search cimport Beam
from thinc.layers import Linear
from thinc.model import Model
from thinc.backends import CupyOps, NumpyOps, use_ops
from thinc.api import Linear, Model, CupyOps, NumpyOps, use_ops
from thinc.backends.linalg cimport Vec, VecVec
cimport blis.cy

View File

@ -1,11 +1,8 @@
# cython: infer_types=True
# cython: cdivision=True
# cython: boundscheck=False
import numpy
cimport cython.parallel
import numpy.random
cimport numpy as np
from itertools import islice
from cpython.ref cimport PyObject, Py_XDECREF
from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
from libc.math cimport exp
@ -14,15 +11,16 @@ from libc.string cimport memset, memcpy
from libc.stdlib cimport calloc, free
from cymem.cymem cimport Pool
from thinc.extra.search cimport Beam
from thinc.layers import chain, clone, Linear, list2array
from thinc.backends import NumpyOps, CupyOps, use_ops
from thinc.util import get_array_module
from thinc.backends.linalg cimport Vec, VecVec
from thinc.initializers import zero_init
from thinc.model import set_dropout_rate
import srsly
from spacy.gold import Example
from thinc.api import chain, clone, Linear, list2array, NumpyOps, CupyOps, use_ops
from thinc.api import get_array_module, zero_init, set_dropout_rate
from itertools import islice
import srsly
import numpy.random
import numpy
from ..gold import Example
from ..typedefs cimport weight_t, class_t, hash_t
from ._parser_model cimport alloc_activations, free_activations
from ._parser_model cimport predict_states, arg_max_if_valid

View File

@ -6,7 +6,7 @@ scheme.
"""
from copy import copy
from spacy.gold import Example
from ..gold import Example
from ..tokens.doc cimport Doc, set_children_from_heads
from ..errors import Errors

View File

@ -1,4 +1,3 @@
import pytest
import numpy
from spacy.tokens import Doc, Span
@ -274,7 +273,19 @@ def test_doc_is_nered(en_vocab):
def test_doc_from_array_sent_starts(en_vocab):
words = ["I", "live", "in", "New", "York", ".", "I", "like", "cats", "."]
heads = [0, 0, 0, 0, 0, 0, 6, 6, 6, 6]
deps = ["ROOT", "dep", "dep", "dep", "dep", "dep", "ROOT", "dep", "dep", "dep", "dep"]
deps = [
"ROOT",
"dep",
"dep",
"dep",
"dep",
"dep",
"ROOT",
"dep",
"dep",
"dep",
"dep",
]
doc = Doc(en_vocab, words=words)
for i, (dep, head) in enumerate(zip(deps, heads)):
doc[i].dep_ = dep

View File

@ -29,7 +29,9 @@ def test_morph_props(i_has):
def test_morph_iter(i_has):
assert set(i_has[0].morph) == set(["PronType=prs"])
assert set(i_has[1].morph) == set(["Number=sing", "Person=three", "Tense=pres", "VerbForm=fin"])
assert set(i_has[1].morph) == set(
["Number=sing", "Person=three", "Tense=pres", "VerbForm=fin"]
)
def test_morph_get(i_has):

View File

@ -8,7 +8,12 @@ from ..util import get_doc
def test_doc_retokenize_merge(en_tokenizer):
text = "WKRO played songs by the beach boys all night"
attrs = {"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE", "morph": "Number=Plur"}
attrs = {
"tag": "NAMED",
"lemma": "LEMMA",
"ent_type": "TYPE",
"morph": "Number=Plur",
}
doc = en_tokenizer(text)
assert len(doc) == 9
with doc.retokenize() as retokenizer:

View File

@ -1,4 +1,3 @@
def test_ar_tokenizer_handles_long_text(ar_tokenizer):
text = """نجيب محفوظ مؤلف و كاتب روائي عربي، يعد من أهم الأدباء العرب خلال القرن العشرين.
ولد نجيب محفوظ في مدينة القاهرة، حيث ترعرع و تلقى تعليمه الجامعي في جامعتها،

Some files were not shown because too many files have changed in this diff.