mirror of
https://github.com/explosion/spaCy.git
synced 2025-10-24 12:41:23 +03:00
Tidy up and auto-format
This commit is contained in:
parent
1278161f47
commit
e3f40a6a0f
|
@ -5,7 +5,7 @@ warnings.filterwarnings("ignore", message="numpy.dtype size changed")
|
|||
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")
|
||||
|
||||
# These are imported as part of the API
|
||||
from thinc.util import prefer_gpu, require_gpu
|
||||
from thinc.api import prefer_gpu, require_gpu
|
||||
|
||||
from . import pipeline
|
||||
from .cli.info import info as cli_info
|
||||
|
|
|
@ -192,11 +192,7 @@ def debug_data(
|
|||
has_ws_ents_error = True
|
||||
|
||||
if gold_train_data["punct_ents"]:
|
||||
msg.warn(
|
||||
"{} entity span(s) with punctuation".format(
|
||||
gold_train_data["punct_ents"]
|
||||
)
|
||||
)
|
||||
msg.warn(f"{gold_train_data['punct_ents']} entity span(s) with punctuation")
|
||||
has_punct_ents_warning = True
|
||||
|
||||
for label in new_labels:
|
||||
|
|
|
@ -4,14 +4,12 @@ import time
|
|||
import re
|
||||
from collections import Counter
|
||||
from pathlib import Path
|
||||
from thinc.layers import Linear, Maxout
|
||||
from thinc.util import prefer_gpu
|
||||
from thinc.api import Linear, Maxout, chain, list2array, prefer_gpu
|
||||
from thinc.api import CosineDistance, L2Distance
|
||||
from wasabi import msg
|
||||
import srsly
|
||||
from thinc.layers import chain, list2array
|
||||
from thinc.loss import CosineDistance, L2Distance
|
||||
|
||||
from spacy.gold import Example
|
||||
from ..gold import Example
|
||||
from ..errors import Errors
|
||||
from ..tokens import Doc
|
||||
from ..attrs import ID, HEAD
|
||||
|
@ -85,7 +83,7 @@ def pretrain(
|
|||
)
|
||||
if not output_dir.exists():
|
||||
output_dir.mkdir()
|
||||
msg.good("Created output directory: {}".format(output_dir))
|
||||
msg.good(f"Created output directory: {output_dir}")
|
||||
srsly.write_json(output_dir / "config.json", config)
|
||||
msg.good("Saved settings to config.json")
|
||||
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
import os
|
||||
import tqdm
|
||||
from pathlib import Path
|
||||
from thinc.backends import use_ops
|
||||
from thinc.api import use_ops
|
||||
from timeit import default_timer as timer
|
||||
import shutil
|
||||
import srsly
|
||||
|
@ -89,7 +89,7 @@ def train(
|
|||
)
|
||||
if not output_path.exists():
|
||||
output_path.mkdir()
|
||||
msg.good("Created output directory: {}".format(output_path))
|
||||
msg.good(f"Created output directory: {output_path}")
|
||||
|
||||
tag_map = {}
|
||||
if tag_map_path is not None:
|
||||
|
@ -125,17 +125,17 @@ def train(
|
|||
msg.text(f"Training pipeline: {pipeline}")
|
||||
disabled_pipes = None
|
||||
pipes_added = False
|
||||
msg.text("Training pipeline: {}".format(pipeline))
|
||||
msg.text(f"Training pipeline: {pipeline}")
|
||||
if use_gpu >= 0:
|
||||
activated_gpu = None
|
||||
try:
|
||||
activated_gpu = set_gpu(use_gpu)
|
||||
except Exception as e:
|
||||
msg.warn("Exception: {}".format(e))
|
||||
msg.warn(f"Exception: {e}")
|
||||
if activated_gpu is not None:
|
||||
msg.text("Using GPU: {}".format(use_gpu))
|
||||
msg.text(f"Using GPU: {use_gpu}")
|
||||
else:
|
||||
msg.warn("Unable to activate GPU: {}".format(use_gpu))
|
||||
msg.warn(f"Unable to activate GPU: {use_gpu}")
|
||||
msg.text("Using CPU only")
|
||||
use_gpu = -1
|
||||
if base_model:
|
||||
|
@ -158,11 +158,11 @@ def train(
|
|||
"positive_label": textcat_positive_label,
|
||||
}
|
||||
if pipe not in nlp.pipe_names:
|
||||
msg.text("Adding component to base model '{}'".format(pipe))
|
||||
msg.text(f"Adding component to base model '{pipe}'")
|
||||
nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))
|
||||
pipes_added = True
|
||||
elif replace_components:
|
||||
msg.text("Replacing component from base model '{}'".format(pipe))
|
||||
msg.text(f"Replacing component from base model '{pipe}'")
|
||||
nlp.replace_pipe(pipe, nlp.create_pipe(pipe, config=pipe_cfg))
|
||||
pipes_added = True
|
||||
else:
|
||||
|
@ -180,7 +180,7 @@ def train(
|
|||
f"Existing cfg: {base_cfg}, provided cfg: {pipe_cfg}",
|
||||
exits=1,
|
||||
)
|
||||
msg.text("Extending component from base model '{}'".format(pipe))
|
||||
msg.text(f"Extending component from base model '{pipe}'")
|
||||
disabled_pipes = nlp.disable_pipes(
|
||||
[p for p in nlp.pipe_names if p not in pipeline]
|
||||
)
|
||||
|
@ -377,7 +377,7 @@ def train(
|
|||
msg.warn(
|
||||
"Did you provide the same parameters during 'train' as during 'pretrain'?"
|
||||
)
|
||||
msg.fail("Original error message: {}".format(e), exits=1)
|
||||
msg.fail(f"Original error message: {e}", exits=1)
|
||||
if raw_text:
|
||||
# If raw text is available, perform 'rehearsal' updates,
|
||||
# which use unlabelled data to reduce overfitting.
|
||||
|
@ -504,11 +504,7 @@ def train(
|
|||
)
|
||||
break
|
||||
except Exception as e:
|
||||
msg.warn(
|
||||
"Aborting and saving the final best model. Encountered exception: {}".format(
|
||||
e
|
||||
)
|
||||
)
|
||||
msg.warn(f"Aborting and saving final best model. Encountered exception: {e}")
|
||||
finally:
|
||||
best_pipes = nlp.pipe_names
|
||||
if disabled_pipes:
|
||||
|
|
|
@ -1,19 +1,20 @@
|
|||
from typing import Optional, Dict, List, Union, Sequence
|
||||
import plac
|
||||
from thinc.util import require_gpu
|
||||
from wasabi import msg
|
||||
from pathlib import Path
|
||||
import thinc
|
||||
import thinc.schedules
|
||||
from thinc.model import Model
|
||||
from spacy.gold import GoldCorpus
|
||||
import spacy
|
||||
from spacy.pipeline.tok2vec import Tok2VecListener
|
||||
from typing import Optional, Dict, List, Union, Sequence
|
||||
from thinc.api import Model
|
||||
from pydantic import BaseModel, FilePath, StrictInt
|
||||
import tqdm
|
||||
|
||||
from ..ml import component_models
|
||||
from .. import util
|
||||
# TODO: relative imports?
|
||||
import spacy
|
||||
from spacy.gold import GoldCorpus
|
||||
from spacy.pipeline.tok2vec import Tok2VecListener
|
||||
from spacy.ml import component_models
|
||||
from spacy import util
|
||||
|
||||
|
||||
registry = util.registry
|
||||
|
||||
|
@ -153,10 +154,9 @@ def create_tb_parser_model(
|
|||
hidden_width: StrictInt = 64,
|
||||
maxout_pieces: StrictInt = 3,
|
||||
):
|
||||
from thinc.layers import Linear, chain, list2array
|
||||
from thinc.api import Linear, chain, list2array, use_ops, zero_init
|
||||
from spacy.ml._layers import PrecomputableAffine
|
||||
from spacy.syntax._parser_model import ParserModel
|
||||
from thinc.api import use_ops, zero_init
|
||||
|
||||
token_vector_width = tok2vec.get_dim("nO")
|
||||
tok2vec = chain(tok2vec, list2array())
|
||||
|
@ -221,13 +221,9 @@ def train_from_config_cli(
|
|||
|
||||
|
||||
def train_from_config(
|
||||
config_path,
|
||||
data_paths,
|
||||
raw_text=None,
|
||||
meta_path=None,
|
||||
output_path=None,
|
||||
config_path, data_paths, raw_text=None, meta_path=None, output_path=None,
|
||||
):
|
||||
msg.info("Loading config from: {}".format(config_path))
|
||||
msg.info(f"Loading config from: {config_path}")
|
||||
config = util.load_from_config(config_path, create_objects=True)
|
||||
use_gpu = config["training"]["use_gpu"]
|
||||
if use_gpu >= 0:
|
||||
|
@ -241,9 +237,7 @@ def train_from_config(
|
|||
msg.info("Loading training corpus")
|
||||
corpus = GoldCorpus(data_paths["train"], data_paths["dev"], limit=limit)
|
||||
msg.info("Initializing the nlp pipeline")
|
||||
nlp.begin_training(
|
||||
lambda: corpus.train_examples, device=use_gpu
|
||||
)
|
||||
nlp.begin_training(lambda: corpus.train_examples, device=use_gpu)
|
||||
|
||||
train_batches = create_train_batches(nlp, corpus, config["training"])
|
||||
evaluate = create_evaluation_callback(nlp, optimizer, corpus, config["training"])
|
||||
|
@ -260,7 +254,7 @@ def train_from_config(
|
|||
config["training"]["eval_frequency"],
|
||||
)
|
||||
|
||||
msg.info("Training. Initial learn rate: {}".format(optimizer.learn_rate))
|
||||
msg.info(f"Training. Initial learn rate: {optimizer.learn_rate}")
|
||||
print_row = setup_printer(config)
|
||||
|
||||
try:
|
||||
|
@ -414,7 +408,7 @@ def subdivide_batch(batch):
|
|||
def setup_printer(config):
|
||||
score_cols = config["training"]["scores"]
|
||||
score_widths = [max(len(col), 6) for col in score_cols]
|
||||
loss_cols = ["Loss {}".format(pipe) for pipe in config["nlp"]["pipeline"]]
|
||||
loss_cols = [f"Loss {pipe}" for pipe in config["nlp"]["pipeline"]]
|
||||
loss_widths = [max(len(col), 8) for col in loss_cols]
|
||||
table_header = ["#"] + loss_cols + score_cols + ["Score"]
|
||||
table_header = [col.upper() for col in table_header]
|
||||
|
|
|
@ -30,7 +30,7 @@ try:
|
|||
except ImportError:
|
||||
cupy = None
|
||||
|
||||
from thinc.optimizers import Optimizer # noqa: F401
|
||||
from thinc.api import Optimizer # noqa: F401
|
||||
|
||||
pickle = pickle
|
||||
copy_reg = copy_reg
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
# Setting explicit height and max-width: none on the SVG is required for
|
||||
# Jupyter to render it properly in a cell
|
||||
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
def explain(term):
|
||||
"""Get a description for a given POS tag, dependency label or entity type.
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
from cymem.cymem cimport Pool
|
||||
|
||||
from spacy.tokens import Doc
|
||||
from .tokens import Doc
|
||||
from .typedefs cimport attr_t
|
||||
from .syntax.transition_system cimport Transition
|
||||
|
||||
|
@ -65,5 +65,3 @@ cdef class Example:
|
|||
cdef public TokenAnnotation token_annotation
|
||||
cdef public DocAnnotation doc_annotation
|
||||
cdef public object goldparse
|
||||
|
||||
|
||||
|
|
|
@ -6,7 +6,7 @@ from libcpp.vector cimport vector
|
|||
from libc.stdint cimport int32_t, int64_t
|
||||
from libc.stdio cimport FILE
|
||||
|
||||
from spacy.vocab cimport Vocab
|
||||
from .vocab cimport Vocab
|
||||
from .typedefs cimport hash_t
|
||||
|
||||
from .structs cimport KBEntryC, AliasC
|
||||
|
@ -169,4 +169,3 @@ cdef class Reader:
|
|||
cdef int read_alias(self, int64_t* entry_index, float* prob) except -1
|
||||
|
||||
cdef int _read(self, void* value, size_t size) except -1
|
||||
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
# Source: https://github.com/stopwords-iso/stopwords-af
|
||||
|
||||
STOP_WORDS = set(
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
# Source: https://github.com/Alir3z4/stop-words
|
||||
|
||||
STOP_WORDS = set(
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
"""
|
||||
Example sentences to test spaCy and its language models.
|
||||
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
STOP_WORDS = set(
|
||||
"""
|
||||
অতএব অথচ অথবা অনুযায়ী অনেক অনেকে অনেকেই অন্তত অবধি অবশ্য অর্থাৎ অন্য অনুযায়ী অর্ধভাগে
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
"""
|
||||
Example sentences to test spaCy and its language models.
|
||||
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
# Source: https://github.com/Alir3z4/stop-words
|
||||
|
||||
STOP_WORDS = set(
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
"""
|
||||
Example sentences to test spaCy and its language models.
|
||||
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
"""
|
||||
Example sentences to test spaCy and its language models.
|
||||
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
STOP_WORDS = set(
|
||||
"""
|
||||
á a ab aber ach acht achte achten achter achtes ag alle allein allem allen
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
def get_pos_from_wiktionary():
|
||||
import re
|
||||
from gensim.corpora.wikicorpus import extract_pages
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
# These exceptions are used to add NORM values based on a token's ORTH value.
|
||||
# Norms are only set if no alternative is provided in the tokenizer exceptions.
|
||||
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
# Stop words
|
||||
# Link to greek stop words: https://www.translatum.gr/forum/index.php?topic=3550.0?topic=3550.0
|
||||
STOP_WORDS = set(
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
"""
|
||||
Example sentences to test spaCy and its language models.
|
||||
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
_exc = {
|
||||
# Slang and abbreviations
|
||||
"cos": "because",
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
# Stop words
|
||||
STOP_WORDS = set(
|
||||
"""
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
"""
|
||||
Example sentences to test spaCy and its language models.
|
||||
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
STOP_WORDS = set(
|
||||
"""
|
||||
actualmente acuerdo adelante ademas además adrede afirmó agregó ahi ahora ahí
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
# Source: https://github.com/stopwords-iso/stopwords-et
|
||||
|
||||
STOP_WORDS = set(
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
"""
|
||||
Example sentences to test spaCy and its language models.
|
||||
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
verb_roots = """
|
||||
#هست
|
||||
آخت#آهنج
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
# Stop words from HAZM package
|
||||
STOP_WORDS = set(
|
||||
"""
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
# Source https://github.com/stopwords-iso/stopwords-fi/blob/master/stopwords-fi.txt
|
||||
# Reformatted with some minor corrections
|
||||
STOP_WORDS = set(
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
"""
|
||||
Example sentences to test spaCy and its language models.
|
||||
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
STOP_WORDS = set(
|
||||
"""
|
||||
a à â abord absolument afin ah ai aie ailleurs ainsi ait allaient allo allons
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
# fmt: off
|
||||
consonants = ["b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "z"]
|
||||
broad_vowels = ["a", "á", "o", "ó", "u", "ú"]
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
"""
|
||||
Example sentences to test spaCy and its language models.
|
||||
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
"""
|
||||
Example sentences to test spaCy and its language models.
|
||||
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
# Source: https://github.com/taranjeet/hindi-tokenizer/blob/master/stopwords.txt, https://data.mendeley.com/datasets/bsr3frvvjc/1#file-a21d5092-99d7-45d8-b044-3ae9edd391c6
|
||||
|
||||
STOP_WORDS = set(
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
"""
|
||||
Example sentences to test spaCy and its language models.
|
||||
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
STOP_WORDS = set(
|
||||
"""
|
||||
a abban ahhoz ahogy ahol aki akik akkor akár alatt amely amelyek amelyekben
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
"""
|
||||
Example sentences to test spaCy and its language models.
|
||||
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
# Source: https://github.com/Xangis/extra-stopwords
|
||||
|
||||
STOP_WORDS = set(
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
"""
|
||||
Example sentences to test spaCy and its language models.
|
||||
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
STOP_WORDS = set(
|
||||
"""
|
||||
a abbastanza abbia abbiamo abbiano abbiate accidenti ad adesso affinche agl
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
"""
|
||||
Example sentences to test spaCy and its language models.
|
||||
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
STOP_WORDS = set(
|
||||
"""
|
||||
ಹಲವು
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
"""
|
||||
Example sentences to test spaCy and its language models.
|
||||
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
# Source: https://github.com/stopwords-iso/stopwords-lv
|
||||
|
||||
STOP_WORDS = set(
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
# Source: https://github.com/stopwords-iso/stopwords-mr/blob/master/stopwords-mr.txt, https://github.com/6/stopwords-json/edit/master/dist/mr.json
|
||||
STOP_WORDS = set(
|
||||
"""
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
"""
|
||||
Example sentences to test spaCy and its language models.
|
||||
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
"""
|
||||
Example sentences to test spaCy and its language models.
|
||||
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
# These exceptions are used to add NORM values based on a token's ORTH value.
|
||||
# Individual languages can also add their own exceptions and overwrite them -
|
||||
# for example, British vs. American spelling in English.
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
"""
|
||||
Example sentences to test spaCy and its language models.
|
||||
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
"""
|
||||
Example sentences to test spaCy and its language models.
|
||||
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
STOP_WORDS = set(
|
||||
"""
|
||||
à às área acerca ademais adeus agora ainda algo algumas alguns ali além ambas ambos antes
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
"""
|
||||
Example sentences to test spaCy and its language models.
|
||||
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
"""
|
||||
Example sentences to test spaCy and its language models.
|
||||
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
_exc = {
|
||||
# Slang
|
||||
"прив": "привет",
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
"""
|
||||
Example sentences to test spaCy and its language models.
|
||||
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
STOP_WORDS = set(
|
||||
"""
|
||||
අතර
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
# Source: https://github.com/Ardevop-sk/stopwords-sk
|
||||
|
||||
STOP_WORDS = set(
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
# Source: https://github.com/stopwords-iso/stopwords-sl
|
||||
# TODO: probably needs to be tidied up – the list seems to have month names in
|
||||
# it, which shouldn't be considered stop words.
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
"""
|
||||
Example sentences to test spaCy and its language models.
|
||||
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
# Source: https://github.com/andrixh/index-albanian
|
||||
|
||||
STOP_WORDS = set(
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
"""
|
||||
Example sentences to test spaCy and its language models.
|
||||
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
_exc = {
|
||||
# Slang
|
||||
"ћале": "отац",
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
STOP_WORDS = set(
|
||||
"""
|
||||
а
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
"""
|
||||
Example sentences to test spaCy and its language models.
|
||||
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
STOP_WORDS = set(
|
||||
"""
|
||||
aderton adertonde adjö aldrig alla allas allt alltid alltså än andra andras
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
"""
|
||||
Example sentences to test spaCy and its language models.
|
||||
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
# Stop words
|
||||
|
||||
STOP_WORDS = set(
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
"""
|
||||
Example sentences to test spaCy and its language models.
|
||||
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
_exc = {
|
||||
# Conjugation and Diversion invalid to Tonal form (ผันอักษรและเสียงไม่ตรงกับรูปวรรณยุกต์)
|
||||
"สนุ๊กเกอร์": "สนุกเกอร์",
|
||||
|
|
|
@ -34,7 +34,7 @@ URL_PATTERN = (
|
|||
r"|"
|
||||
# host & domain names
|
||||
# mods: match is case-sensitive, so include [A-Z]
|
||||
"(?:"
|
||||
"(?:" # noqa: E131
|
||||
"(?:"
|
||||
"[A-Za-z0-9\u00a1-\uffff]"
|
||||
"[A-Za-z0-9\u00a1-\uffff_-]{0,62}"
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
"""
|
||||
Example sentences to test spaCy and its language models.
|
||||
>>> from spacy.lang.tr.examples import sentences
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
"""
|
||||
Example sentences to test spaCy and its language models.
|
||||
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
"""
|
||||
Example sentences to test spaCy and its language models.
|
||||
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
from ...language import Language
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
"""
|
||||
Example sentences to test spaCy and its language models.
|
||||
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
"""
|
||||
Example sentences to test spaCy and its language models.
|
||||
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
"""
|
||||
Example sentences to test spaCy and its language models.
|
||||
|
||||
|
|
|
@ -4,7 +4,7 @@ import weakref
|
|||
import functools
|
||||
from contextlib import contextmanager
|
||||
from copy import copy, deepcopy
|
||||
from thinc.backends import get_current_ops
|
||||
from thinc.api import get_current_ops
|
||||
import srsly
|
||||
import multiprocessing as mp
|
||||
from itertools import chain, cycle
|
||||
|
|
|
@ -6,7 +6,7 @@ cimport numpy as np
|
|||
np.import_array()
|
||||
|
||||
import numpy
|
||||
from thinc.util import get_array_module
|
||||
from thinc.api import get_array_module
|
||||
|
||||
from .typedefs cimport attr_t, flags_t
|
||||
from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
|
||||
|
|
|
@ -3,18 +3,20 @@ from thinc.api import Model
|
|||
|
||||
def CharacterEmbed(nM, nC):
|
||||
# nM: Number of dimensions per character. nC: Number of characters.
|
||||
nO = nM*nC if (nM is not None and nC is not None) else None
|
||||
nO = nM * nC if (nM is not None and nC is not None) else None
|
||||
return Model(
|
||||
"charembed",
|
||||
forward,
|
||||
init=init,
|
||||
dims={"nM": nM, "nC": nC, "nO": nO, "nV": 256},
|
||||
params={"E": None}
|
||||
params={"E": None},
|
||||
).initialize()
|
||||
|
||||
|
||||
def init(model, X=None, Y=None):
|
||||
vectors_table = model.ops.alloc3f(model.get_dim("nC"), model.get_dim("nV"), model.get_dim("nM"))
|
||||
vectors_table = model.ops.alloc3f(
|
||||
model.get_dim("nC"), model.get_dim("nV"), model.get_dim("nM")
|
||||
)
|
||||
model.set_param("E", vectors_table)
|
||||
|
||||
|
||||
|
|
|
@ -1,5 +1,4 @@
|
|||
from thinc.model import Model
|
||||
from thinc.api import normal_init
|
||||
from thinc.api import Model, normal_init
|
||||
|
||||
|
||||
def PrecomputableAffine(nO, nI, nF, nP):
|
||||
|
@ -20,9 +19,7 @@ def forward(model, X, is_train):
|
|||
nP = model.get_dim("nP")
|
||||
nI = model.get_dim("nI")
|
||||
W = model.get_param("W")
|
||||
Yf = model.ops.gemm(
|
||||
X, W.reshape((nF * nO * nP, nI)), trans2=True
|
||||
)
|
||||
Yf = model.ops.gemm(X, W.reshape((nF * nO * nP, nI)), trans2=True)
|
||||
Yf = Yf.reshape((Yf.shape[0], nF, nO, nP))
|
||||
Yf = model.ops.xp.vstack((model.get_param("pad"), Yf))
|
||||
|
||||
|
@ -44,7 +41,7 @@ def forward(model, X, is_train):
|
|||
assert dY.ndim == 3
|
||||
assert dY.shape[1] == nO, dY.shape
|
||||
assert dY.shape[2] == nP, dY.shape
|
||||
nB = dY.shape[0]
|
||||
# nB = dY.shape[0]
|
||||
model.inc_grad("pad", _backprop_precomputable_affine_padding(model, dY, ids))
|
||||
Xf = X[ids]
|
||||
Xf = Xf.reshape((Xf.shape[0], nF * nI))
|
||||
|
@ -118,7 +115,7 @@ def init(model, X=None, Y=None):
|
|||
pad = model.ops.alloc4f(1, nF, nO, nP)
|
||||
|
||||
ops = model.ops
|
||||
W = normal_init(ops, W.shape, fan_in=nF*nI)
|
||||
W = normal_init(ops, W.shape, fan_in=nF * nI)
|
||||
model.set_param("W", W)
|
||||
model.set_param("b", b)
|
||||
model.set_param("pad", pad)
|
||||
|
|
|
@ -9,7 +9,7 @@ from thinc.api import Model, Maxout, Linear, residual, reduce_mean, list2ragged
|
|||
from thinc.api import PyTorchLSTM, add, MultiSoftmax, HashEmbed, StaticVectors
|
||||
from thinc.api import expand_window, FeatureExtractor, SparseLinear, chain
|
||||
from thinc.api import clone, concatenate, with_array, Softmax, Logistic, uniqued
|
||||
from thinc.api import zero_init, glorot_uniform_init
|
||||
from thinc.api import zero_init
|
||||
|
||||
|
||||
def build_text_classifier(arch, config):
|
||||
|
@ -33,10 +33,7 @@ def build_simple_cnn_text_classifier(tok2vec, nr_class, exclusive_classes, **cfg
|
|||
output_layer = Softmax(nO=nr_class, nI=tok2vec.get_dim("nO"))
|
||||
else:
|
||||
# TODO: experiment with init_w=zero_init
|
||||
output_layer = (
|
||||
Linear(nO=nr_class, nI=tok2vec.get_dim("nO"))
|
||||
>> Logistic()
|
||||
)
|
||||
output_layer = Linear(nO=nr_class, nI=tok2vec.get_dim("nO")) >> Logistic()
|
||||
model = tok2vec >> list2ragged() >> reduce_mean() >> output_layer
|
||||
model.set_ref("tok2vec", tok2vec)
|
||||
model.set_dim("nO", nr_class)
|
||||
|
@ -149,13 +146,21 @@ def Tok2Vec(
|
|||
with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
|
||||
norm = HashEmbed(nO=width, nV=embed_size, column=cols.index(NORM), dropout=0.0)
|
||||
if subword_features:
|
||||
prefix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=0.0)
|
||||
suffix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=0.0)
|
||||
shape = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=0.0)
|
||||
prefix = HashEmbed(
|
||||
nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=0.0
|
||||
)
|
||||
suffix = HashEmbed(
|
||||
nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=0.0
|
||||
)
|
||||
shape = HashEmbed(
|
||||
nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=0.0
|
||||
)
|
||||
else:
|
||||
prefix, suffix, shape = (None, None, None)
|
||||
if pretrained_vectors is not None:
|
||||
glove = StaticVectors(vectors=pretrained_vectors, nO=width, column=cols.index(ID), dropout=0.0)
|
||||
glove = StaticVectors(
|
||||
vectors=pretrained_vectors, nO=width, column=cols.index(ID), dropout=0.0
|
||||
)
|
||||
|
||||
if subword_features:
|
||||
embed = uniqued(
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
import numpy
|
||||
from thinc.model import Model
|
||||
from thinc.api import Model
|
||||
|
||||
from ..attrs import LOWER
|
||||
|
||||
|
@ -26,9 +26,7 @@ def forward(self, docs, is_train: bool):
|
|||
# The dtype here matches what thinc is expecting -- which differs per
|
||||
# platform (by int definition). This should be fixed once the problem
|
||||
# is fixed on Thinc's side.
|
||||
lengths = self.ops.asarray(
|
||||
[arr.shape[0] for arr in batch_keys], dtype=numpy.int_
|
||||
)
|
||||
lengths = self.ops.asarray([arr.shape[0] for arr in batch_keys], dtype=numpy.int_)
|
||||
batch_keys = self.ops.xp.concatenate(batch_keys)
|
||||
batch_vals = self.ops.asarray(self.ops.xp.concatenate(batch_vals), dtype="f")
|
||||
|
||||
|
@ -36,4 +34,3 @@ def forward(self, docs, is_train: bool):
|
|||
return dY
|
||||
|
||||
return (batch_keys, batch_vals, lengths), backprop
|
||||
|
||||
|
|
|
@ -1,11 +1,8 @@
|
|||
from thinc.layers import chain, clone, concatenate, with_array, uniqued
|
||||
from thinc.model import Model
|
||||
from thinc.layers import noop, with_padded
|
||||
from thinc.layers import Maxout, expand_window
|
||||
from thinc.layers import HashEmbed, StaticVectors
|
||||
from thinc.layers import residual, LayerNorm, FeatureExtractor
|
||||
from thinc.api import Model, chain, clone, concatenate, with_array, uniqued, noop
|
||||
from thinc.api import with_padded, Maxout, expand_window, HashEmbed, StaticVectors
|
||||
from thinc.api import residual, LayerNorm, FeatureExtractor
|
||||
|
||||
from spacy.ml import _character_embed
|
||||
from ..ml import _character_embed
|
||||
from ..util import make_layer, registry
|
||||
|
||||
|
||||
|
@ -93,8 +90,10 @@ def MaxoutWindowEncoder(config):
|
|||
nW = config["window_size"]
|
||||
nP = config["pieces"]
|
||||
depth = config["depth"]
|
||||
|
||||
cnn = expand_window(window_size=nW), Maxout(nO=nO, nI=nO * ((nW * 2) + 1), nP=nP, dropout=0.0, normalize=True)
|
||||
cnn = (
|
||||
expand_window(window_size=nW),
|
||||
Maxout(nO=nO, nI=nO * ((nW * 2) + 1), nP=nP, dropout=0.0, normalize=True),
|
||||
)
|
||||
model = clone(residual(cnn), depth)
|
||||
model.set_dim("nO", nO)
|
||||
model.attrs["receptive_field"] = nW * depth
|
||||
|
@ -103,13 +102,16 @@ def MaxoutWindowEncoder(config):
|
|||
|
||||
@registry.architectures.register("spacy.MishWindowEncoder.v1")
|
||||
def MishWindowEncoder(config):
|
||||
from thinc.layers import Mish
|
||||
from thinc.api import Mish
|
||||
|
||||
nO = config["width"]
|
||||
nW = config["window_size"]
|
||||
depth = config["depth"]
|
||||
|
||||
cnn = chain(expand_window(window_size=nW), Mish(nO=nO, nI=nO * ((nW * 2) + 1)), LayerNorm(nO))
|
||||
cnn = chain(
|
||||
expand_window(window_size=nW),
|
||||
Mish(nO=nO, nI=nO * ((nW * 2) + 1)),
|
||||
LayerNorm(nO),
|
||||
)
|
||||
model = clone(residual(cnn), depth)
|
||||
model.set_dim("nO", nO)
|
||||
return model
|
||||
|
@ -118,14 +120,20 @@ def MishWindowEncoder(config):
|
|||
@registry.architectures.register("spacy.PretrainedVectors.v1")
|
||||
def PretrainedVectors(config):
|
||||
# TODO: actual vectors instead of name
|
||||
return StaticVectors(vectors=config["vectors_name"], nO=config["width"], column=config["column"], dropout=0.0)
|
||||
return StaticVectors(
|
||||
vectors=config["vectors_name"],
|
||||
nO=config["width"],
|
||||
column=config["column"],
|
||||
dropout=0.0,
|
||||
)
|
||||
|
||||
|
||||
@registry.architectures.register("spacy.TorchBiLSTMEncoder.v1")
|
||||
def TorchBiLSTMEncoder(config):
|
||||
import torch.nn
|
||||
# TODO FIX
|
||||
from thinc.layers import PyTorchRNNWrapper
|
||||
|
||||
# TODO: FIX
|
||||
from thinc.api import PyTorchRNNWrapper
|
||||
|
||||
width = config["width"]
|
||||
depth = config["depth"]
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
from thinc.layers import concatenate, reduce_max, reduce_mean, siamese, CauchySimilarity
|
||||
from thinc.api import concatenate, reduce_max, reduce_mean, siamese, CauchySimilarity
|
||||
|
||||
from .pipes import Pipe
|
||||
from ..language import component
|
||||
|
@ -63,8 +63,7 @@ class SimilarityHook(Pipe):
|
|||
@classmethod
|
||||
def Model(cls, length):
|
||||
return siamese(
|
||||
concatenate(reduce_max(), reduce_mean()),
|
||||
CauchySimilarity(length * 2)
|
||||
concatenate(reduce_max(), reduce_mean()), CauchySimilarity(length * 2)
|
||||
)
|
||||
|
||||
def __call__(self, doc):
|
||||
|
|
|
@ -3,8 +3,8 @@ from collections import defaultdict
|
|||
import numpy
|
||||
cimport numpy as np
|
||||
|
||||
from thinc.layers import chain, list2array
|
||||
from thinc.util import to_categorical, copy_array, get_array_module
|
||||
from thinc.api import chain, list2array, to_categorical, get_array_module
|
||||
from thinc.util import copy_array
|
||||
|
||||
from .. import util
|
||||
from .pipes import Pipe
|
||||
|
|
|
@ -3,11 +3,9 @@
|
|||
import numpy
|
||||
import srsly
|
||||
import random
|
||||
from thinc.layers import chain, Linear, Maxout, Softmax, LayerNorm, list2array
|
||||
from thinc.initializers import zero_init
|
||||
from thinc.loss import CosineDistance
|
||||
from thinc.util import to_categorical, get_array_module
|
||||
from thinc.model import set_dropout_rate
|
||||
from thinc.api import chain, Linear, Maxout, Softmax, LayerNorm, list2array
|
||||
from thinc.api import zero_init, CosineDistance, to_categorical, get_array_module
|
||||
from thinc.api import set_dropout_rate
|
||||
|
||||
from ..tokens.doc cimport Doc
|
||||
from ..syntax.nn_parser cimport Parser
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
from thinc.api import Model, set_dropout_rate
|
||||
|
||||
from .pipes import Pipe
|
||||
from ..gold import Example
|
||||
from ..tokens import Doc
|
||||
|
@ -5,8 +7,6 @@ from ..vocab import Vocab
|
|||
from ..language import component
|
||||
from ..util import link_vectors_to_models, minibatch, registry, eg2doc
|
||||
|
||||
from thinc.model import Model, set_dropout_rate
|
||||
|
||||
|
||||
@component("tok2vec", assigns=["doc.tensor"])
|
||||
class Tok2Vec(Pipe):
|
||||
|
@ -39,7 +39,9 @@ class Tok2Vec(Pipe):
|
|||
self.listeners = []
|
||||
|
||||
def create_listener(self):
|
||||
listener = Tok2VecListener(upstream_name="tok2vec", width=self.model.get_dim("nO"))
|
||||
listener = Tok2VecListener(
|
||||
upstream_name="tok2vec", width=self.model.get_dim("nO")
|
||||
)
|
||||
self.listeners.append(listener)
|
||||
|
||||
def add_listener(self, listener):
|
||||
|
@ -115,7 +117,7 @@ class Tok2Vec(Pipe):
|
|||
|
||||
def capture_losses(d_tokvecs):
|
||||
"""Accumulate tok2vec loss before doing backprop."""
|
||||
l2_loss = sum((d_t2v**2).sum() for d_t2v in d_tokvecs)
|
||||
l2_loss = sum((d_t2v ** 2).sum() for d_t2v in d_tokvecs)
|
||||
if self.name in losses:
|
||||
losses[self.name] += l2_loss / len(d_tokvecs)
|
||||
else:
|
||||
|
@ -133,7 +135,9 @@ class Tok2Vec(Pipe):
|
|||
def get_loss(self, docs, golds, scores):
|
||||
pass
|
||||
|
||||
def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs):
|
||||
def begin_training(
|
||||
self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs
|
||||
):
|
||||
"""Allocate models and pre-process training data
|
||||
|
||||
get_examples (function): Function returning example training data.
|
||||
|
@ -151,6 +155,7 @@ class Tok2VecListener(Model):
|
|||
"""A layer that gets fed its answers from an upstream connection,
|
||||
for instance from a component earlier in the pipeline.
|
||||
"""
|
||||
|
||||
name = "tok2vec-listener"
|
||||
|
||||
def __init__(self, upstream_name, width):
|
||||
|
|
|
@ -11,9 +11,7 @@ from libc.string cimport memset, memcpy
|
|||
from libc.stdlib cimport calloc, free, realloc
|
||||
from cymem.cymem cimport Pool
|
||||
from thinc.extra.search cimport Beam
|
||||
from thinc.layers import Linear
|
||||
from thinc.model import Model
|
||||
from thinc.backends import CupyOps, NumpyOps, use_ops
|
||||
from thinc.api import Linear, Model, CupyOps, NumpyOps, use_ops
|
||||
from thinc.backends.linalg cimport Vec, VecVec
|
||||
cimport blis.cy
|
||||
|
||||
|
|
|
@ -1,11 +1,8 @@
|
|||
# cython: infer_types=True
|
||||
# cython: cdivision=True
|
||||
# cython: boundscheck=False
|
||||
import numpy
|
||||
cimport cython.parallel
|
||||
import numpy.random
|
||||
cimport numpy as np
|
||||
from itertools import islice
|
||||
from cpython.ref cimport PyObject, Py_XDECREF
|
||||
from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
|
||||
from libc.math cimport exp
|
||||
|
@ -14,15 +11,16 @@ from libc.string cimport memset, memcpy
|
|||
from libc.stdlib cimport calloc, free
|
||||
from cymem.cymem cimport Pool
|
||||
from thinc.extra.search cimport Beam
|
||||
from thinc.layers import chain, clone, Linear, list2array
|
||||
from thinc.backends import NumpyOps, CupyOps, use_ops
|
||||
from thinc.util import get_array_module
|
||||
from thinc.backends.linalg cimport Vec, VecVec
|
||||
from thinc.initializers import zero_init
|
||||
from thinc.model import set_dropout_rate
|
||||
import srsly
|
||||
|
||||
from spacy.gold import Example
|
||||
from thinc.api import chain, clone, Linear, list2array, NumpyOps, CupyOps, use_ops
|
||||
from thinc.api import get_array_module, zero_init, set_dropout_rate
|
||||
from itertools import islice
|
||||
import srsly
|
||||
import numpy.random
|
||||
import numpy
|
||||
|
||||
from ..gold import Example
|
||||
from ..typedefs cimport weight_t, class_t, hash_t
|
||||
from ._parser_model cimport alloc_activations, free_activations
|
||||
from ._parser_model cimport predict_states, arg_max_if_valid
|
||||
|
|
|
@ -6,7 +6,7 @@ scheme.
|
|||
"""
|
||||
from copy import copy
|
||||
|
||||
from spacy.gold import Example
|
||||
from ..gold import Example
|
||||
from ..tokens.doc cimport Doc, set_children_from_heads
|
||||
from ..errors import Errors
|
||||
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
import pytest
|
||||
import numpy
|
||||
from spacy.tokens import Doc, Span
|
||||
|
@ -274,7 +273,19 @@ def test_doc_is_nered(en_vocab):
|
|||
def test_doc_from_array_sent_starts(en_vocab):
|
||||
words = ["I", "live", "in", "New", "York", ".", "I", "like", "cats", "."]
|
||||
heads = [0, 0, 0, 0, 0, 0, 6, 6, 6, 6]
|
||||
deps = ["ROOT", "dep", "dep", "dep", "dep", "dep", "ROOT", "dep", "dep", "dep", "dep"]
|
||||
deps = [
|
||||
"ROOT",
|
||||
"dep",
|
||||
"dep",
|
||||
"dep",
|
||||
"dep",
|
||||
"dep",
|
||||
"ROOT",
|
||||
"dep",
|
||||
"dep",
|
||||
"dep",
|
||||
"dep",
|
||||
]
|
||||
doc = Doc(en_vocab, words=words)
|
||||
for i, (dep, head) in enumerate(zip(deps, heads)):
|
||||
doc[i].dep_ = dep
|
||||
|
|
|
@ -29,7 +29,9 @@ def test_morph_props(i_has):
|
|||
|
||||
def test_morph_iter(i_has):
|
||||
assert set(i_has[0].morph) == set(["PronType=prs"])
|
||||
assert set(i_has[1].morph) == set(["Number=sing", "Person=three", "Tense=pres", "VerbForm=fin"])
|
||||
assert set(i_has[1].morph) == set(
|
||||
["Number=sing", "Person=three", "Tense=pres", "VerbForm=fin"]
|
||||
)
|
||||
|
||||
|
||||
def test_morph_get(i_has):
|
||||
|
|
|
@ -8,7 +8,12 @@ from ..util import get_doc
|
|||
|
||||
def test_doc_retokenize_merge(en_tokenizer):
|
||||
text = "WKRO played songs by the beach boys all night"
|
||||
attrs = {"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE", "morph": "Number=Plur"}
|
||||
attrs = {
|
||||
"tag": "NAMED",
|
||||
"lemma": "LEMMA",
|
||||
"ent_type": "TYPE",
|
||||
"morph": "Number=Plur",
|
||||
}
|
||||
doc = en_tokenizer(text)
|
||||
assert len(doc) == 9
|
||||
with doc.retokenize() as retokenizer:
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
def test_ar_tokenizer_handles_long_text(ar_tokenizer):
|
||||
text = """نجيب محفوظ مؤلف و كاتب روائي عربي، يعد من أهم الأدباء العرب خلال القرن العشرين.
|
||||
ولد نجيب محفوظ في مدينة القاهرة، حيث ترعرع و تلقى تعليمه الجامعي في جامعتها،
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
def test_en_simple_punct(en_tokenizer):
|
||||
text = "to walk, do foo"
|
||||
tokens = en_tokenizer(text)
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user