mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 10:16:27 +03:00
Tidy up and auto-format
This commit is contained in:
parent
1278161f47
commit
e3f40a6a0f
|
@ -5,7 +5,7 @@ warnings.filterwarnings("ignore", message="numpy.dtype size changed")
|
||||||
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")
|
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")
|
||||||
|
|
||||||
# These are imported as part of the API
|
# These are imported as part of the API
|
||||||
from thinc.util import prefer_gpu, require_gpu
|
from thinc.api import prefer_gpu, require_gpu
|
||||||
|
|
||||||
from . import pipeline
|
from . import pipeline
|
||||||
from .cli.info import info as cli_info
|
from .cli.info import info as cli_info
|
||||||
|
|
|
@ -192,11 +192,7 @@ def debug_data(
|
||||||
has_ws_ents_error = True
|
has_ws_ents_error = True
|
||||||
|
|
||||||
if gold_train_data["punct_ents"]:
|
if gold_train_data["punct_ents"]:
|
||||||
msg.warn(
|
msg.warn(f"{gold_train_data['punct_ents']} entity span(s) with punctuation")
|
||||||
"{} entity span(s) with punctuation".format(
|
|
||||||
gold_train_data["punct_ents"]
|
|
||||||
)
|
|
||||||
)
|
|
||||||
has_punct_ents_warning = True
|
has_punct_ents_warning = True
|
||||||
|
|
||||||
for label in new_labels:
|
for label in new_labels:
|
||||||
|
|
|
@ -4,14 +4,12 @@ import time
|
||||||
import re
|
import re
|
||||||
from collections import Counter
|
from collections import Counter
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from thinc.layers import Linear, Maxout
|
from thinc.api import Linear, Maxout, chain, list2array, prefer_gpu
|
||||||
from thinc.util import prefer_gpu
|
from thinc.api import CosineDistance, L2Distance
|
||||||
from wasabi import msg
|
from wasabi import msg
|
||||||
import srsly
|
import srsly
|
||||||
from thinc.layers import chain, list2array
|
|
||||||
from thinc.loss import CosineDistance, L2Distance
|
|
||||||
|
|
||||||
from spacy.gold import Example
|
from ..gold import Example
|
||||||
from ..errors import Errors
|
from ..errors import Errors
|
||||||
from ..tokens import Doc
|
from ..tokens import Doc
|
||||||
from ..attrs import ID, HEAD
|
from ..attrs import ID, HEAD
|
||||||
|
@ -85,7 +83,7 @@ def pretrain(
|
||||||
)
|
)
|
||||||
if not output_dir.exists():
|
if not output_dir.exists():
|
||||||
output_dir.mkdir()
|
output_dir.mkdir()
|
||||||
msg.good("Created output directory: {}".format(output_dir))
|
msg.good(f"Created output directory: {output_dir}")
|
||||||
srsly.write_json(output_dir / "config.json", config)
|
srsly.write_json(output_dir / "config.json", config)
|
||||||
msg.good("Saved settings to config.json")
|
msg.good("Saved settings to config.json")
|
||||||
|
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
import os
|
import os
|
||||||
import tqdm
|
import tqdm
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from thinc.backends import use_ops
|
from thinc.api import use_ops
|
||||||
from timeit import default_timer as timer
|
from timeit import default_timer as timer
|
||||||
import shutil
|
import shutil
|
||||||
import srsly
|
import srsly
|
||||||
|
@ -89,7 +89,7 @@ def train(
|
||||||
)
|
)
|
||||||
if not output_path.exists():
|
if not output_path.exists():
|
||||||
output_path.mkdir()
|
output_path.mkdir()
|
||||||
msg.good("Created output directory: {}".format(output_path))
|
msg.good(f"Created output directory: {output_path}")
|
||||||
|
|
||||||
tag_map = {}
|
tag_map = {}
|
||||||
if tag_map_path is not None:
|
if tag_map_path is not None:
|
||||||
|
@ -125,17 +125,17 @@ def train(
|
||||||
msg.text(f"Training pipeline: {pipeline}")
|
msg.text(f"Training pipeline: {pipeline}")
|
||||||
disabled_pipes = None
|
disabled_pipes = None
|
||||||
pipes_added = False
|
pipes_added = False
|
||||||
msg.text("Training pipeline: {}".format(pipeline))
|
msg.text(f"Training pipeline: {pipeline}")
|
||||||
if use_gpu >= 0:
|
if use_gpu >= 0:
|
||||||
activated_gpu = None
|
activated_gpu = None
|
||||||
try:
|
try:
|
||||||
activated_gpu = set_gpu(use_gpu)
|
activated_gpu = set_gpu(use_gpu)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
msg.warn("Exception: {}".format(e))
|
msg.warn(f"Exception: {e}")
|
||||||
if activated_gpu is not None:
|
if activated_gpu is not None:
|
||||||
msg.text("Using GPU: {}".format(use_gpu))
|
msg.text(f"Using GPU: {use_gpu}")
|
||||||
else:
|
else:
|
||||||
msg.warn("Unable to activate GPU: {}".format(use_gpu))
|
msg.warn(f"Unable to activate GPU: {use_gpu}")
|
||||||
msg.text("Using CPU only")
|
msg.text("Using CPU only")
|
||||||
use_gpu = -1
|
use_gpu = -1
|
||||||
if base_model:
|
if base_model:
|
||||||
|
@ -158,11 +158,11 @@ def train(
|
||||||
"positive_label": textcat_positive_label,
|
"positive_label": textcat_positive_label,
|
||||||
}
|
}
|
||||||
if pipe not in nlp.pipe_names:
|
if pipe not in nlp.pipe_names:
|
||||||
msg.text("Adding component to base model '{}'".format(pipe))
|
msg.text(f"Adding component to base model '{pipe}'")
|
||||||
nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))
|
nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))
|
||||||
pipes_added = True
|
pipes_added = True
|
||||||
elif replace_components:
|
elif replace_components:
|
||||||
msg.text("Replacing component from base model '{}'".format(pipe))
|
msg.text(f"Replacing component from base model '{pipe}'")
|
||||||
nlp.replace_pipe(pipe, nlp.create_pipe(pipe, config=pipe_cfg))
|
nlp.replace_pipe(pipe, nlp.create_pipe(pipe, config=pipe_cfg))
|
||||||
pipes_added = True
|
pipes_added = True
|
||||||
else:
|
else:
|
||||||
|
@ -180,7 +180,7 @@ def train(
|
||||||
f"Existing cfg: {base_cfg}, provided cfg: {pipe_cfg}",
|
f"Existing cfg: {base_cfg}, provided cfg: {pipe_cfg}",
|
||||||
exits=1,
|
exits=1,
|
||||||
)
|
)
|
||||||
msg.text("Extending component from base model '{}'".format(pipe))
|
msg.text(f"Extending component from base model '{pipe}'")
|
||||||
disabled_pipes = nlp.disable_pipes(
|
disabled_pipes = nlp.disable_pipes(
|
||||||
[p for p in nlp.pipe_names if p not in pipeline]
|
[p for p in nlp.pipe_names if p not in pipeline]
|
||||||
)
|
)
|
||||||
|
@ -377,7 +377,7 @@ def train(
|
||||||
msg.warn(
|
msg.warn(
|
||||||
"Did you provide the same parameters during 'train' as during 'pretrain'?"
|
"Did you provide the same parameters during 'train' as during 'pretrain'?"
|
||||||
)
|
)
|
||||||
msg.fail("Original error message: {}".format(e), exits=1)
|
msg.fail(f"Original error message: {e}", exits=1)
|
||||||
if raw_text:
|
if raw_text:
|
||||||
# If raw text is available, perform 'rehearsal' updates,
|
# If raw text is available, perform 'rehearsal' updates,
|
||||||
# which use unlabelled data to reduce overfitting.
|
# which use unlabelled data to reduce overfitting.
|
||||||
|
@ -504,11 +504,7 @@ def train(
|
||||||
)
|
)
|
||||||
break
|
break
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
msg.warn(
|
msg.warn(f"Aborting and saving final best model. Encountered exception: {e}")
|
||||||
"Aborting and saving the final best model. Encountered exception: {}".format(
|
|
||||||
e
|
|
||||||
)
|
|
||||||
)
|
|
||||||
finally:
|
finally:
|
||||||
best_pipes = nlp.pipe_names
|
best_pipes = nlp.pipe_names
|
||||||
if disabled_pipes:
|
if disabled_pipes:
|
||||||
|
|
|
@ -1,19 +1,20 @@
|
||||||
|
from typing import Optional, Dict, List, Union, Sequence
|
||||||
import plac
|
import plac
|
||||||
from thinc.util import require_gpu
|
|
||||||
from wasabi import msg
|
from wasabi import msg
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import thinc
|
import thinc
|
||||||
import thinc.schedules
|
import thinc.schedules
|
||||||
from thinc.model import Model
|
from thinc.api import Model
|
||||||
from spacy.gold import GoldCorpus
|
|
||||||
import spacy
|
|
||||||
from spacy.pipeline.tok2vec import Tok2VecListener
|
|
||||||
from typing import Optional, Dict, List, Union, Sequence
|
|
||||||
from pydantic import BaseModel, FilePath, StrictInt
|
from pydantic import BaseModel, FilePath, StrictInt
|
||||||
import tqdm
|
import tqdm
|
||||||
|
|
||||||
from ..ml import component_models
|
# TODO: relative imports?
|
||||||
from .. import util
|
import spacy
|
||||||
|
from spacy.gold import GoldCorpus
|
||||||
|
from spacy.pipeline.tok2vec import Tok2VecListener
|
||||||
|
from spacy.ml import component_models
|
||||||
|
from spacy import util
|
||||||
|
|
||||||
|
|
||||||
registry = util.registry
|
registry = util.registry
|
||||||
|
|
||||||
|
@ -153,10 +154,9 @@ def create_tb_parser_model(
|
||||||
hidden_width: StrictInt = 64,
|
hidden_width: StrictInt = 64,
|
||||||
maxout_pieces: StrictInt = 3,
|
maxout_pieces: StrictInt = 3,
|
||||||
):
|
):
|
||||||
from thinc.layers import Linear, chain, list2array
|
from thinc.api import Linear, chain, list2array, use_ops, zero_init
|
||||||
from spacy.ml._layers import PrecomputableAffine
|
from spacy.ml._layers import PrecomputableAffine
|
||||||
from spacy.syntax._parser_model import ParserModel
|
from spacy.syntax._parser_model import ParserModel
|
||||||
from thinc.api import use_ops, zero_init
|
|
||||||
|
|
||||||
token_vector_width = tok2vec.get_dim("nO")
|
token_vector_width = tok2vec.get_dim("nO")
|
||||||
tok2vec = chain(tok2vec, list2array())
|
tok2vec = chain(tok2vec, list2array())
|
||||||
|
@ -221,13 +221,9 @@ def train_from_config_cli(
|
||||||
|
|
||||||
|
|
||||||
def train_from_config(
|
def train_from_config(
|
||||||
config_path,
|
config_path, data_paths, raw_text=None, meta_path=None, output_path=None,
|
||||||
data_paths,
|
|
||||||
raw_text=None,
|
|
||||||
meta_path=None,
|
|
||||||
output_path=None,
|
|
||||||
):
|
):
|
||||||
msg.info("Loading config from: {}".format(config_path))
|
msg.info(f"Loading config from: {config_path}")
|
||||||
config = util.load_from_config(config_path, create_objects=True)
|
config = util.load_from_config(config_path, create_objects=True)
|
||||||
use_gpu = config["training"]["use_gpu"]
|
use_gpu = config["training"]["use_gpu"]
|
||||||
if use_gpu >= 0:
|
if use_gpu >= 0:
|
||||||
|
@ -241,9 +237,7 @@ def train_from_config(
|
||||||
msg.info("Loading training corpus")
|
msg.info("Loading training corpus")
|
||||||
corpus = GoldCorpus(data_paths["train"], data_paths["dev"], limit=limit)
|
corpus = GoldCorpus(data_paths["train"], data_paths["dev"], limit=limit)
|
||||||
msg.info("Initializing the nlp pipeline")
|
msg.info("Initializing the nlp pipeline")
|
||||||
nlp.begin_training(
|
nlp.begin_training(lambda: corpus.train_examples, device=use_gpu)
|
||||||
lambda: corpus.train_examples, device=use_gpu
|
|
||||||
)
|
|
||||||
|
|
||||||
train_batches = create_train_batches(nlp, corpus, config["training"])
|
train_batches = create_train_batches(nlp, corpus, config["training"])
|
||||||
evaluate = create_evaluation_callback(nlp, optimizer, corpus, config["training"])
|
evaluate = create_evaluation_callback(nlp, optimizer, corpus, config["training"])
|
||||||
|
@ -260,7 +254,7 @@ def train_from_config(
|
||||||
config["training"]["eval_frequency"],
|
config["training"]["eval_frequency"],
|
||||||
)
|
)
|
||||||
|
|
||||||
msg.info("Training. Initial learn rate: {}".format(optimizer.learn_rate))
|
msg.info(f"Training. Initial learn rate: {optimizer.learn_rate}")
|
||||||
print_row = setup_printer(config)
|
print_row = setup_printer(config)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
@ -414,7 +408,7 @@ def subdivide_batch(batch):
|
||||||
def setup_printer(config):
|
def setup_printer(config):
|
||||||
score_cols = config["training"]["scores"]
|
score_cols = config["training"]["scores"]
|
||||||
score_widths = [max(len(col), 6) for col in score_cols]
|
score_widths = [max(len(col), 6) for col in score_cols]
|
||||||
loss_cols = ["Loss {}".format(pipe) for pipe in config["nlp"]["pipeline"]]
|
loss_cols = [f"Loss {pipe}" for pipe in config["nlp"]["pipeline"]]
|
||||||
loss_widths = [max(len(col), 8) for col in loss_cols]
|
loss_widths = [max(len(col), 8) for col in loss_cols]
|
||||||
table_header = ["#"] + loss_cols + score_cols + ["Score"]
|
table_header = ["#"] + loss_cols + score_cols + ["Score"]
|
||||||
table_header = [col.upper() for col in table_header]
|
table_header = [col.upper() for col in table_header]
|
||||||
|
|
|
@ -30,7 +30,7 @@ try:
|
||||||
except ImportError:
|
except ImportError:
|
||||||
cupy = None
|
cupy = None
|
||||||
|
|
||||||
from thinc.optimizers import Optimizer # noqa: F401
|
from thinc.api import Optimizer # noqa: F401
|
||||||
|
|
||||||
pickle = pickle
|
pickle = pickle
|
||||||
copy_reg = copy_reg
|
copy_reg = copy_reg
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
# Setting explicit height and max-width: none on the SVG is required for
|
# Setting explicit height and max-width: none on the SVG is required for
|
||||||
# Jupyter to render it properly in a cell
|
# Jupyter to render it properly in a cell
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
def explain(term):
|
def explain(term):
|
||||||
"""Get a description for a given POS tag, dependency label or entity type.
|
"""Get a description for a given POS tag, dependency label or entity type.
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
|
|
||||||
from spacy.tokens import Doc
|
from .tokens import Doc
|
||||||
from .typedefs cimport attr_t
|
from .typedefs cimport attr_t
|
||||||
from .syntax.transition_system cimport Transition
|
from .syntax.transition_system cimport Transition
|
||||||
|
|
||||||
|
@ -65,5 +65,3 @@ cdef class Example:
|
||||||
cdef public TokenAnnotation token_annotation
|
cdef public TokenAnnotation token_annotation
|
||||||
cdef public DocAnnotation doc_annotation
|
cdef public DocAnnotation doc_annotation
|
||||||
cdef public object goldparse
|
cdef public object goldparse
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -6,7 +6,7 @@ from libcpp.vector cimport vector
|
||||||
from libc.stdint cimport int32_t, int64_t
|
from libc.stdint cimport int32_t, int64_t
|
||||||
from libc.stdio cimport FILE
|
from libc.stdio cimport FILE
|
||||||
|
|
||||||
from spacy.vocab cimport Vocab
|
from .vocab cimport Vocab
|
||||||
from .typedefs cimport hash_t
|
from .typedefs cimport hash_t
|
||||||
|
|
||||||
from .structs cimport KBEntryC, AliasC
|
from .structs cimport KBEntryC, AliasC
|
||||||
|
@ -169,4 +169,3 @@ cdef class Reader:
|
||||||
cdef int read_alias(self, int64_t* entry_index, float* prob) except -1
|
cdef int read_alias(self, int64_t* entry_index, float* prob) except -1
|
||||||
|
|
||||||
cdef int _read(self, void* value, size_t size) except -1
|
cdef int _read(self, void* value, size_t size) except -1
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
# Source: https://github.com/stopwords-iso/stopwords-af
|
# Source: https://github.com/stopwords-iso/stopwords-af
|
||||||
|
|
||||||
STOP_WORDS = set(
|
STOP_WORDS = set(
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
# Source: https://github.com/Alir3z4/stop-words
|
# Source: https://github.com/Alir3z4/stop-words
|
||||||
|
|
||||||
STOP_WORDS = set(
|
STOP_WORDS = set(
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Example sentences to test spaCy and its language models.
|
Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
STOP_WORDS = set(
|
STOP_WORDS = set(
|
||||||
"""
|
"""
|
||||||
অতএব অথচ অথবা অনুযায়ী অনেক অনেকে অনেকেই অন্তত অবধি অবশ্য অর্থাৎ অন্য অনুযায়ী অর্ধভাগে
|
অতএব অথচ অথবা অনুযায়ী অনেক অনেকে অনেকেই অন্তত অবধি অবশ্য অর্থাৎ অন্য অনুযায়ী অর্ধভাগে
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Example sentences to test spaCy and its language models.
|
Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
# Source: https://github.com/Alir3z4/stop-words
|
# Source: https://github.com/Alir3z4/stop-words
|
||||||
|
|
||||||
STOP_WORDS = set(
|
STOP_WORDS = set(
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Example sentences to test spaCy and its language models.
|
Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Example sentences to test spaCy and its language models.
|
Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
STOP_WORDS = set(
|
STOP_WORDS = set(
|
||||||
"""
|
"""
|
||||||
á a ab aber ach acht achte achten achter achtes ag alle allein allem allen
|
á a ab aber ach acht achte achten achter achtes ag alle allein allem allen
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
def get_pos_from_wiktionary():
|
def get_pos_from_wiktionary():
|
||||||
import re
|
import re
|
||||||
from gensim.corpora.wikicorpus import extract_pages
|
from gensim.corpora.wikicorpus import extract_pages
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
# These exceptions are used to add NORM values based on a token's ORTH value.
|
# These exceptions are used to add NORM values based on a token's ORTH value.
|
||||||
# Norms are only set if no alternative is provided in the tokenizer exceptions.
|
# Norms are only set if no alternative is provided in the tokenizer exceptions.
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
# Stop words
|
# Stop words
|
||||||
# Link to greek stop words: https://www.translatum.gr/forum/index.php?topic=3550.0?topic=3550.0
|
# Link to greek stop words: https://www.translatum.gr/forum/index.php?topic=3550.0?topic=3550.0
|
||||||
STOP_WORDS = set(
|
STOP_WORDS = set(
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Example sentences to test spaCy and its language models.
|
Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
_exc = {
|
_exc = {
|
||||||
# Slang and abbreviations
|
# Slang and abbreviations
|
||||||
"cos": "because",
|
"cos": "because",
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
# Stop words
|
# Stop words
|
||||||
STOP_WORDS = set(
|
STOP_WORDS = set(
|
||||||
"""
|
"""
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Example sentences to test spaCy and its language models.
|
Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
STOP_WORDS = set(
|
STOP_WORDS = set(
|
||||||
"""
|
"""
|
||||||
actualmente acuerdo adelante ademas además adrede afirmó agregó ahi ahora ahí
|
actualmente acuerdo adelante ademas además adrede afirmó agregó ahi ahora ahí
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
# Source: https://github.com/stopwords-iso/stopwords-et
|
# Source: https://github.com/stopwords-iso/stopwords-et
|
||||||
|
|
||||||
STOP_WORDS = set(
|
STOP_WORDS = set(
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Example sentences to test spaCy and its language models.
|
Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
verb_roots = """
|
verb_roots = """
|
||||||
#هست
|
#هست
|
||||||
آخت#آهنج
|
آخت#آهنج
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
# Stop words from HAZM package
|
# Stop words from HAZM package
|
||||||
STOP_WORDS = set(
|
STOP_WORDS = set(
|
||||||
"""
|
"""
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
# Source https://github.com/stopwords-iso/stopwords-fi/blob/master/stopwords-fi.txt
|
# Source https://github.com/stopwords-iso/stopwords-fi/blob/master/stopwords-fi.txt
|
||||||
# Reformatted with some minor corrections
|
# Reformatted with some minor corrections
|
||||||
STOP_WORDS = set(
|
STOP_WORDS = set(
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Example sentences to test spaCy and its language models.
|
Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
STOP_WORDS = set(
|
STOP_WORDS = set(
|
||||||
"""
|
"""
|
||||||
a à â abord absolument afin ah ai aie ailleurs ainsi ait allaient allo allons
|
a à â abord absolument afin ah ai aie ailleurs ainsi ait allaient allo allons
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
# fmt: off
|
# fmt: off
|
||||||
consonants = ["b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "z"]
|
consonants = ["b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "z"]
|
||||||
broad_vowels = ["a", "á", "o", "ó", "u", "ú"]
|
broad_vowels = ["a", "á", "o", "ó", "u", "ú"]
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Example sentences to test spaCy and its language models.
|
Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Example sentences to test spaCy and its language models.
|
Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
# Source: https://github.com/taranjeet/hindi-tokenizer/blob/master/stopwords.txt, https://data.mendeley.com/datasets/bsr3frvvjc/1#file-a21d5092-99d7-45d8-b044-3ae9edd391c6
|
# Source: https://github.com/taranjeet/hindi-tokenizer/blob/master/stopwords.txt, https://data.mendeley.com/datasets/bsr3frvvjc/1#file-a21d5092-99d7-45d8-b044-3ae9edd391c6
|
||||||
|
|
||||||
STOP_WORDS = set(
|
STOP_WORDS = set(
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Example sentences to test spaCy and its language models.
|
Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
STOP_WORDS = set(
|
STOP_WORDS = set(
|
||||||
"""
|
"""
|
||||||
a abban ahhoz ahogy ahol aki akik akkor akár alatt amely amelyek amelyekben
|
a abban ahhoz ahogy ahol aki akik akkor akár alatt amely amelyek amelyekben
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Example sentences to test spaCy and its language models.
|
Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
# Source: https://github.com/Xangis/extra-stopwords
|
# Source: https://github.com/Xangis/extra-stopwords
|
||||||
|
|
||||||
STOP_WORDS = set(
|
STOP_WORDS = set(
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Example sentences to test spaCy and its language models.
|
Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
STOP_WORDS = set(
|
STOP_WORDS = set(
|
||||||
"""
|
"""
|
||||||
a abbastanza abbia abbiamo abbiano abbiate accidenti ad adesso affinche agl
|
a abbastanza abbia abbiamo abbiano abbiate accidenti ad adesso affinche agl
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Example sentences to test spaCy and its language models.
|
Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
STOP_WORDS = set(
|
STOP_WORDS = set(
|
||||||
"""
|
"""
|
||||||
ಹಲವು
|
ಹಲವು
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Example sentences to test spaCy and its language models.
|
Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
# Source: https://github.com/stopwords-iso/stopwords-lv
|
# Source: https://github.com/stopwords-iso/stopwords-lv
|
||||||
|
|
||||||
STOP_WORDS = set(
|
STOP_WORDS = set(
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
# Source: https://github.com/stopwords-iso/stopwords-mr/blob/master/stopwords-mr.txt, https://github.com/6/stopwords-json/edit/master/dist/mr.json
|
# Source: https://github.com/stopwords-iso/stopwords-mr/blob/master/stopwords-mr.txt, https://github.com/6/stopwords-json/edit/master/dist/mr.json
|
||||||
STOP_WORDS = set(
|
STOP_WORDS = set(
|
||||||
"""
|
"""
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Example sentences to test spaCy and its language models.
|
Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Example sentences to test spaCy and its language models.
|
Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
# These exceptions are used to add NORM values based on a token's ORTH value.
|
# These exceptions are used to add NORM values based on a token's ORTH value.
|
||||||
# Individual languages can also add their own exceptions and overwrite them -
|
# Individual languages can also add their own exceptions and overwrite them -
|
||||||
# for example, British vs. American spelling in English.
|
# for example, British vs. American spelling in English.
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Example sentences to test spaCy and its language models.
|
Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Example sentences to test spaCy and its language models.
|
Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
STOP_WORDS = set(
|
STOP_WORDS = set(
|
||||||
"""
|
"""
|
||||||
à às área acerca ademais adeus agora ainda algo algumas alguns ali além ambas ambos antes
|
à às área acerca ademais adeus agora ainda algo algumas alguns ali além ambas ambos antes
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Example sentences to test spaCy and its language models.
|
Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Example sentences to test spaCy and its language models.
|
Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
_exc = {
|
_exc = {
|
||||||
# Slang
|
# Slang
|
||||||
"прив": "привет",
|
"прив": "привет",
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Example sentences to test spaCy and its language models.
|
Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
STOP_WORDS = set(
|
STOP_WORDS = set(
|
||||||
"""
|
"""
|
||||||
අතර
|
අතර
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
# Source: https://github.com/Ardevop-sk/stopwords-sk
|
# Source: https://github.com/Ardevop-sk/stopwords-sk
|
||||||
|
|
||||||
STOP_WORDS = set(
|
STOP_WORDS = set(
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
# Source: https://github.com/stopwords-iso/stopwords-sl
|
# Source: https://github.com/stopwords-iso/stopwords-sl
|
||||||
# TODO: probably needs to be tidied up – the list seems to have month names in
|
# TODO: probably needs to be tidied up – the list seems to have month names in
|
||||||
# it, which shouldn't be considered stop words.
|
# it, which shouldn't be considered stop words.
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Example sentences to test spaCy and its language models.
|
Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
# Source: https://github.com/andrixh/index-albanian
|
# Source: https://github.com/andrixh/index-albanian
|
||||||
|
|
||||||
STOP_WORDS = set(
|
STOP_WORDS = set(
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Example sentences to test spaCy and its language models.
|
Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
_exc = {
|
_exc = {
|
||||||
# Slang
|
# Slang
|
||||||
"ћале": "отац",
|
"ћале": "отац",
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
STOP_WORDS = set(
|
STOP_WORDS = set(
|
||||||
"""
|
"""
|
||||||
а
|
а
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Example sentences to test spaCy and its language models.
|
Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
STOP_WORDS = set(
|
STOP_WORDS = set(
|
||||||
"""
|
"""
|
||||||
aderton adertonde adjö aldrig alla allas allt alltid alltså än andra andras
|
aderton adertonde adjö aldrig alla allas allt alltid alltså än andra andras
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Example sentences to test spaCy and its language models.
|
Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
# Stop words
|
# Stop words
|
||||||
|
|
||||||
STOP_WORDS = set(
|
STOP_WORDS = set(
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Example sentences to test spaCy and its language models.
|
Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
_exc = {
|
_exc = {
|
||||||
# Conjugation and Diversion invalid to Tonal form (ผันอักษรและเสียงไม่ตรงกับรูปวรรณยุกต์)
|
# Conjugation and Diversion invalid to Tonal form (ผันอักษรและเสียงไม่ตรงกับรูปวรรณยุกต์)
|
||||||
"สนุ๊กเกอร์": "สนุกเกอร์",
|
"สนุ๊กเกอร์": "สนุกเกอร์",
|
||||||
|
|
|
@ -34,7 +34,7 @@ URL_PATTERN = (
|
||||||
r"|"
|
r"|"
|
||||||
# host & domain names
|
# host & domain names
|
||||||
# mods: match is case-sensitive, so include [A-Z]
|
# mods: match is case-sensitive, so include [A-Z]
|
||||||
"(?:"
|
"(?:" # noqa: E131
|
||||||
"(?:"
|
"(?:"
|
||||||
"[A-Za-z0-9\u00a1-\uffff]"
|
"[A-Za-z0-9\u00a1-\uffff]"
|
||||||
"[A-Za-z0-9\u00a1-\uffff_-]{0,62}"
|
"[A-Za-z0-9\u00a1-\uffff_-]{0,62}"
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Example sentences to test spaCy and its language models.
|
Example sentences to test spaCy and its language models.
|
||||||
>>> from spacy.lang.tr.examples import sentences
|
>>> from spacy.lang.tr.examples import sentences
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Example sentences to test spaCy and its language models.
|
Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Example sentences to test spaCy and its language models.
|
Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
from ..norm_exceptions import BASE_NORMS
|
from ..norm_exceptions import BASE_NORMS
|
||||||
from ...language import Language
|
from ...language import Language
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Example sentences to test spaCy and its language models.
|
Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Example sentences to test spaCy and its language models.
|
Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Example sentences to test spaCy and its language models.
|
Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
|
|
|
@ -4,7 +4,7 @@ import weakref
|
||||||
import functools
|
import functools
|
||||||
from contextlib import contextmanager
|
from contextlib import contextmanager
|
||||||
from copy import copy, deepcopy
|
from copy import copy, deepcopy
|
||||||
from thinc.backends import get_current_ops
|
from thinc.api import get_current_ops
|
||||||
import srsly
|
import srsly
|
||||||
import multiprocessing as mp
|
import multiprocessing as mp
|
||||||
from itertools import chain, cycle
|
from itertools import chain, cycle
|
||||||
|
|
|
@ -6,7 +6,7 @@ cimport numpy as np
|
||||||
np.import_array()
|
np.import_array()
|
||||||
|
|
||||||
import numpy
|
import numpy
|
||||||
from thinc.util import get_array_module
|
from thinc.api import get_array_module
|
||||||
|
|
||||||
from .typedefs cimport attr_t, flags_t
|
from .typedefs cimport attr_t, flags_t
|
||||||
from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
|
from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
|
||||||
|
|
|
@ -3,18 +3,20 @@ from thinc.api import Model
|
||||||
|
|
||||||
def CharacterEmbed(nM, nC):
|
def CharacterEmbed(nM, nC):
|
||||||
# nM: Number of dimensions per character. nC: Number of characters.
|
# nM: Number of dimensions per character. nC: Number of characters.
|
||||||
nO = nM*nC if (nM is not None and nC is not None) else None
|
nO = nM * nC if (nM is not None and nC is not None) else None
|
||||||
return Model(
|
return Model(
|
||||||
"charembed",
|
"charembed",
|
||||||
forward,
|
forward,
|
||||||
init=init,
|
init=init,
|
||||||
dims={"nM": nM, "nC": nC, "nO": nO, "nV": 256},
|
dims={"nM": nM, "nC": nC, "nO": nO, "nV": 256},
|
||||||
params={"E": None}
|
params={"E": None},
|
||||||
).initialize()
|
).initialize()
|
||||||
|
|
||||||
|
|
||||||
def init(model, X=None, Y=None):
|
def init(model, X=None, Y=None):
|
||||||
vectors_table = model.ops.alloc3f(model.get_dim("nC"), model.get_dim("nV"), model.get_dim("nM"))
|
vectors_table = model.ops.alloc3f(
|
||||||
|
model.get_dim("nC"), model.get_dim("nV"), model.get_dim("nM")
|
||||||
|
)
|
||||||
model.set_param("E", vectors_table)
|
model.set_param("E", vectors_table)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,4 @@
|
||||||
from thinc.model import Model
|
from thinc.api import Model, normal_init
|
||||||
from thinc.api import normal_init
|
|
||||||
|
|
||||||
|
|
||||||
def PrecomputableAffine(nO, nI, nF, nP):
|
def PrecomputableAffine(nO, nI, nF, nP):
|
||||||
|
@ -20,9 +19,7 @@ def forward(model, X, is_train):
|
||||||
nP = model.get_dim("nP")
|
nP = model.get_dim("nP")
|
||||||
nI = model.get_dim("nI")
|
nI = model.get_dim("nI")
|
||||||
W = model.get_param("W")
|
W = model.get_param("W")
|
||||||
Yf = model.ops.gemm(
|
Yf = model.ops.gemm(X, W.reshape((nF * nO * nP, nI)), trans2=True)
|
||||||
X, W.reshape((nF * nO * nP, nI)), trans2=True
|
|
||||||
)
|
|
||||||
Yf = Yf.reshape((Yf.shape[0], nF, nO, nP))
|
Yf = Yf.reshape((Yf.shape[0], nF, nO, nP))
|
||||||
Yf = model.ops.xp.vstack((model.get_param("pad"), Yf))
|
Yf = model.ops.xp.vstack((model.get_param("pad"), Yf))
|
||||||
|
|
||||||
|
@ -44,7 +41,7 @@ def forward(model, X, is_train):
|
||||||
assert dY.ndim == 3
|
assert dY.ndim == 3
|
||||||
assert dY.shape[1] == nO, dY.shape
|
assert dY.shape[1] == nO, dY.shape
|
||||||
assert dY.shape[2] == nP, dY.shape
|
assert dY.shape[2] == nP, dY.shape
|
||||||
nB = dY.shape[0]
|
# nB = dY.shape[0]
|
||||||
model.inc_grad("pad", _backprop_precomputable_affine_padding(model, dY, ids))
|
model.inc_grad("pad", _backprop_precomputable_affine_padding(model, dY, ids))
|
||||||
Xf = X[ids]
|
Xf = X[ids]
|
||||||
Xf = Xf.reshape((Xf.shape[0], nF * nI))
|
Xf = Xf.reshape((Xf.shape[0], nF * nI))
|
||||||
|
@ -118,7 +115,7 @@ def init(model, X=None, Y=None):
|
||||||
pad = model.ops.alloc4f(1, nF, nO, nP)
|
pad = model.ops.alloc4f(1, nF, nO, nP)
|
||||||
|
|
||||||
ops = model.ops
|
ops = model.ops
|
||||||
W = normal_init(ops, W.shape, fan_in=nF*nI)
|
W = normal_init(ops, W.shape, fan_in=nF * nI)
|
||||||
model.set_param("W", W)
|
model.set_param("W", W)
|
||||||
model.set_param("b", b)
|
model.set_param("b", b)
|
||||||
model.set_param("pad", pad)
|
model.set_param("pad", pad)
|
||||||
|
|
|
@ -9,7 +9,7 @@ from thinc.api import Model, Maxout, Linear, residual, reduce_mean, list2ragged
|
||||||
from thinc.api import PyTorchLSTM, add, MultiSoftmax, HashEmbed, StaticVectors
|
from thinc.api import PyTorchLSTM, add, MultiSoftmax, HashEmbed, StaticVectors
|
||||||
from thinc.api import expand_window, FeatureExtractor, SparseLinear, chain
|
from thinc.api import expand_window, FeatureExtractor, SparseLinear, chain
|
||||||
from thinc.api import clone, concatenate, with_array, Softmax, Logistic, uniqued
|
from thinc.api import clone, concatenate, with_array, Softmax, Logistic, uniqued
|
||||||
from thinc.api import zero_init, glorot_uniform_init
|
from thinc.api import zero_init
|
||||||
|
|
||||||
|
|
||||||
def build_text_classifier(arch, config):
|
def build_text_classifier(arch, config):
|
||||||
|
@ -33,10 +33,7 @@ def build_simple_cnn_text_classifier(tok2vec, nr_class, exclusive_classes, **cfg
|
||||||
output_layer = Softmax(nO=nr_class, nI=tok2vec.get_dim("nO"))
|
output_layer = Softmax(nO=nr_class, nI=tok2vec.get_dim("nO"))
|
||||||
else:
|
else:
|
||||||
# TODO: experiment with init_w=zero_init
|
# TODO: experiment with init_w=zero_init
|
||||||
output_layer = (
|
output_layer = Linear(nO=nr_class, nI=tok2vec.get_dim("nO")) >> Logistic()
|
||||||
Linear(nO=nr_class, nI=tok2vec.get_dim("nO"))
|
|
||||||
>> Logistic()
|
|
||||||
)
|
|
||||||
model = tok2vec >> list2ragged() >> reduce_mean() >> output_layer
|
model = tok2vec >> list2ragged() >> reduce_mean() >> output_layer
|
||||||
model.set_ref("tok2vec", tok2vec)
|
model.set_ref("tok2vec", tok2vec)
|
||||||
model.set_dim("nO", nr_class)
|
model.set_dim("nO", nr_class)
|
||||||
|
@ -149,13 +146,21 @@ def Tok2Vec(
|
||||||
with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
|
with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
|
||||||
norm = HashEmbed(nO=width, nV=embed_size, column=cols.index(NORM), dropout=0.0)
|
norm = HashEmbed(nO=width, nV=embed_size, column=cols.index(NORM), dropout=0.0)
|
||||||
if subword_features:
|
if subword_features:
|
||||||
prefix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=0.0)
|
prefix = HashEmbed(
|
||||||
suffix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=0.0)
|
nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=0.0
|
||||||
shape = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=0.0)
|
)
|
||||||
|
suffix = HashEmbed(
|
||||||
|
nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=0.0
|
||||||
|
)
|
||||||
|
shape = HashEmbed(
|
||||||
|
nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=0.0
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
prefix, suffix, shape = (None, None, None)
|
prefix, suffix, shape = (None, None, None)
|
||||||
if pretrained_vectors is not None:
|
if pretrained_vectors is not None:
|
||||||
glove = StaticVectors(vectors=pretrained_vectors, nO=width, column=cols.index(ID), dropout=0.0)
|
glove = StaticVectors(
|
||||||
|
vectors=pretrained_vectors, nO=width, column=cols.index(ID), dropout=0.0
|
||||||
|
)
|
||||||
|
|
||||||
if subword_features:
|
if subword_features:
|
||||||
embed = uniqued(
|
embed = uniqued(
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
import numpy
|
import numpy
|
||||||
from thinc.model import Model
|
from thinc.api import Model
|
||||||
|
|
||||||
from ..attrs import LOWER
|
from ..attrs import LOWER
|
||||||
|
|
||||||
|
@ -26,9 +26,7 @@ def forward(self, docs, is_train: bool):
|
||||||
# The dtype here matches what thinc is expecting -- which differs per
|
# The dtype here matches what thinc is expecting -- which differs per
|
||||||
# platform (by int definition). This should be fixed once the problem
|
# platform (by int definition). This should be fixed once the problem
|
||||||
# is fixed on Thinc's side.
|
# is fixed on Thinc's side.
|
||||||
lengths = self.ops.asarray(
|
lengths = self.ops.asarray([arr.shape[0] for arr in batch_keys], dtype=numpy.int_)
|
||||||
[arr.shape[0] for arr in batch_keys], dtype=numpy.int_
|
|
||||||
)
|
|
||||||
batch_keys = self.ops.xp.concatenate(batch_keys)
|
batch_keys = self.ops.xp.concatenate(batch_keys)
|
||||||
batch_vals = self.ops.asarray(self.ops.xp.concatenate(batch_vals), dtype="f")
|
batch_vals = self.ops.asarray(self.ops.xp.concatenate(batch_vals), dtype="f")
|
||||||
|
|
||||||
|
@ -36,4 +34,3 @@ def forward(self, docs, is_train: bool):
|
||||||
return dY
|
return dY
|
||||||
|
|
||||||
return (batch_keys, batch_vals, lengths), backprop
|
return (batch_keys, batch_vals, lengths), backprop
|
||||||
|
|
||||||
|
|
|
@ -1,11 +1,8 @@
|
||||||
from thinc.layers import chain, clone, concatenate, with_array, uniqued
|
from thinc.api import Model, chain, clone, concatenate, with_array, uniqued, noop
|
||||||
from thinc.model import Model
|
from thinc.api import with_padded, Maxout, expand_window, HashEmbed, StaticVectors
|
||||||
from thinc.layers import noop, with_padded
|
from thinc.api import residual, LayerNorm, FeatureExtractor
|
||||||
from thinc.layers import Maxout, expand_window
|
|
||||||
from thinc.layers import HashEmbed, StaticVectors
|
|
||||||
from thinc.layers import residual, LayerNorm, FeatureExtractor
|
|
||||||
|
|
||||||
from spacy.ml import _character_embed
|
from ..ml import _character_embed
|
||||||
from ..util import make_layer, registry
|
from ..util import make_layer, registry
|
||||||
|
|
||||||
|
|
||||||
|
@ -93,8 +90,10 @@ def MaxoutWindowEncoder(config):
|
||||||
nW = config["window_size"]
|
nW = config["window_size"]
|
||||||
nP = config["pieces"]
|
nP = config["pieces"]
|
||||||
depth = config["depth"]
|
depth = config["depth"]
|
||||||
|
cnn = (
|
||||||
cnn = expand_window(window_size=nW), Maxout(nO=nO, nI=nO * ((nW * 2) + 1), nP=nP, dropout=0.0, normalize=True)
|
expand_window(window_size=nW),
|
||||||
|
Maxout(nO=nO, nI=nO * ((nW * 2) + 1), nP=nP, dropout=0.0, normalize=True),
|
||||||
|
)
|
||||||
model = clone(residual(cnn), depth)
|
model = clone(residual(cnn), depth)
|
||||||
model.set_dim("nO", nO)
|
model.set_dim("nO", nO)
|
||||||
model.attrs["receptive_field"] = nW * depth
|
model.attrs["receptive_field"] = nW * depth
|
||||||
|
@ -103,13 +102,16 @@ def MaxoutWindowEncoder(config):
|
||||||
|
|
||||||
@registry.architectures.register("spacy.MishWindowEncoder.v1")
|
@registry.architectures.register("spacy.MishWindowEncoder.v1")
|
||||||
def MishWindowEncoder(config):
|
def MishWindowEncoder(config):
|
||||||
from thinc.layers import Mish
|
from thinc.api import Mish
|
||||||
|
|
||||||
nO = config["width"]
|
nO = config["width"]
|
||||||
nW = config["window_size"]
|
nW = config["window_size"]
|
||||||
depth = config["depth"]
|
depth = config["depth"]
|
||||||
|
cnn = chain(
|
||||||
cnn = chain(expand_window(window_size=nW), Mish(nO=nO, nI=nO * ((nW * 2) + 1)), LayerNorm(nO))
|
expand_window(window_size=nW),
|
||||||
|
Mish(nO=nO, nI=nO * ((nW * 2) + 1)),
|
||||||
|
LayerNorm(nO),
|
||||||
|
)
|
||||||
model = clone(residual(cnn), depth)
|
model = clone(residual(cnn), depth)
|
||||||
model.set_dim("nO", nO)
|
model.set_dim("nO", nO)
|
||||||
return model
|
return model
|
||||||
|
@ -118,14 +120,20 @@ def MishWindowEncoder(config):
|
||||||
@registry.architectures.register("spacy.PretrainedVectors.v1")
|
@registry.architectures.register("spacy.PretrainedVectors.v1")
|
||||||
def PretrainedVectors(config):
|
def PretrainedVectors(config):
|
||||||
# TODO: actual vectors instead of name
|
# TODO: actual vectors instead of name
|
||||||
return StaticVectors(vectors=config["vectors_name"], nO=config["width"], column=config["column"], dropout=0.0)
|
return StaticVectors(
|
||||||
|
vectors=config["vectors_name"],
|
||||||
|
nO=config["width"],
|
||||||
|
column=config["column"],
|
||||||
|
dropout=0.0,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@registry.architectures.register("spacy.TorchBiLSTMEncoder.v1")
|
@registry.architectures.register("spacy.TorchBiLSTMEncoder.v1")
|
||||||
def TorchBiLSTMEncoder(config):
|
def TorchBiLSTMEncoder(config):
|
||||||
import torch.nn
|
import torch.nn
|
||||||
# TODO FIX
|
|
||||||
from thinc.layers import PyTorchRNNWrapper
|
# TODO: FIX
|
||||||
|
from thinc.api import PyTorchRNNWrapper
|
||||||
|
|
||||||
width = config["width"]
|
width = config["width"]
|
||||||
depth = config["depth"]
|
depth = config["depth"]
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
from thinc.layers import concatenate, reduce_max, reduce_mean, siamese, CauchySimilarity
|
from thinc.api import concatenate, reduce_max, reduce_mean, siamese, CauchySimilarity
|
||||||
|
|
||||||
from .pipes import Pipe
|
from .pipes import Pipe
|
||||||
from ..language import component
|
from ..language import component
|
||||||
|
@ -63,8 +63,7 @@ class SimilarityHook(Pipe):
|
||||||
@classmethod
|
@classmethod
|
||||||
def Model(cls, length):
|
def Model(cls, length):
|
||||||
return siamese(
|
return siamese(
|
||||||
concatenate(reduce_max(), reduce_mean()),
|
concatenate(reduce_max(), reduce_mean()), CauchySimilarity(length * 2)
|
||||||
CauchySimilarity(length * 2)
|
|
||||||
)
|
)
|
||||||
|
|
||||||
def __call__(self, doc):
|
def __call__(self, doc):
|
||||||
|
|
|
@ -3,8 +3,8 @@ from collections import defaultdict
|
||||||
import numpy
|
import numpy
|
||||||
cimport numpy as np
|
cimport numpy as np
|
||||||
|
|
||||||
from thinc.layers import chain, list2array
|
from thinc.api import chain, list2array, to_categorical, get_array_module
|
||||||
from thinc.util import to_categorical, copy_array, get_array_module
|
from thinc.util import copy_array
|
||||||
|
|
||||||
from .. import util
|
from .. import util
|
||||||
from .pipes import Pipe
|
from .pipes import Pipe
|
||||||
|
|
|
@ -3,11 +3,9 @@
|
||||||
import numpy
|
import numpy
|
||||||
import srsly
|
import srsly
|
||||||
import random
|
import random
|
||||||
from thinc.layers import chain, Linear, Maxout, Softmax, LayerNorm, list2array
|
from thinc.api import chain, Linear, Maxout, Softmax, LayerNorm, list2array
|
||||||
from thinc.initializers import zero_init
|
from thinc.api import zero_init, CosineDistance, to_categorical, get_array_module
|
||||||
from thinc.loss import CosineDistance
|
from thinc.api import set_dropout_rate
|
||||||
from thinc.util import to_categorical, get_array_module
|
|
||||||
from thinc.model import set_dropout_rate
|
|
||||||
|
|
||||||
from ..tokens.doc cimport Doc
|
from ..tokens.doc cimport Doc
|
||||||
from ..syntax.nn_parser cimport Parser
|
from ..syntax.nn_parser cimport Parser
|
||||||
|
|
|
@ -1,3 +1,5 @@
|
||||||
|
from thinc.api import Model, set_dropout_rate
|
||||||
|
|
||||||
from .pipes import Pipe
|
from .pipes import Pipe
|
||||||
from ..gold import Example
|
from ..gold import Example
|
||||||
from ..tokens import Doc
|
from ..tokens import Doc
|
||||||
|
@ -5,8 +7,6 @@ from ..vocab import Vocab
|
||||||
from ..language import component
|
from ..language import component
|
||||||
from ..util import link_vectors_to_models, minibatch, registry, eg2doc
|
from ..util import link_vectors_to_models, minibatch, registry, eg2doc
|
||||||
|
|
||||||
from thinc.model import Model, set_dropout_rate
|
|
||||||
|
|
||||||
|
|
||||||
@component("tok2vec", assigns=["doc.tensor"])
|
@component("tok2vec", assigns=["doc.tensor"])
|
||||||
class Tok2Vec(Pipe):
|
class Tok2Vec(Pipe):
|
||||||
|
@ -39,7 +39,9 @@ class Tok2Vec(Pipe):
|
||||||
self.listeners = []
|
self.listeners = []
|
||||||
|
|
||||||
def create_listener(self):
|
def create_listener(self):
|
||||||
listener = Tok2VecListener(upstream_name="tok2vec", width=self.model.get_dim("nO"))
|
listener = Tok2VecListener(
|
||||||
|
upstream_name="tok2vec", width=self.model.get_dim("nO")
|
||||||
|
)
|
||||||
self.listeners.append(listener)
|
self.listeners.append(listener)
|
||||||
|
|
||||||
def add_listener(self, listener):
|
def add_listener(self, listener):
|
||||||
|
@ -115,7 +117,7 @@ class Tok2Vec(Pipe):
|
||||||
|
|
||||||
def capture_losses(d_tokvecs):
|
def capture_losses(d_tokvecs):
|
||||||
"""Accumulate tok2vec loss before doing backprop."""
|
"""Accumulate tok2vec loss before doing backprop."""
|
||||||
l2_loss = sum((d_t2v**2).sum() for d_t2v in d_tokvecs)
|
l2_loss = sum((d_t2v ** 2).sum() for d_t2v in d_tokvecs)
|
||||||
if self.name in losses:
|
if self.name in losses:
|
||||||
losses[self.name] += l2_loss / len(d_tokvecs)
|
losses[self.name] += l2_loss / len(d_tokvecs)
|
||||||
else:
|
else:
|
||||||
|
@ -133,7 +135,9 @@ class Tok2Vec(Pipe):
|
||||||
def get_loss(self, docs, golds, scores):
|
def get_loss(self, docs, golds, scores):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs):
|
def begin_training(
|
||||||
|
self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs
|
||||||
|
):
|
||||||
"""Allocate models and pre-process training data
|
"""Allocate models and pre-process training data
|
||||||
|
|
||||||
get_examples (function): Function returning example training data.
|
get_examples (function): Function returning example training data.
|
||||||
|
@ -151,6 +155,7 @@ class Tok2VecListener(Model):
|
||||||
"""A layer that gets fed its answers from an upstream connection,
|
"""A layer that gets fed its answers from an upstream connection,
|
||||||
for instance from a component earlier in the pipeline.
|
for instance from a component earlier in the pipeline.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
name = "tok2vec-listener"
|
name = "tok2vec-listener"
|
||||||
|
|
||||||
def __init__(self, upstream_name, width):
|
def __init__(self, upstream_name, width):
|
||||||
|
|
|
@ -11,9 +11,7 @@ from libc.string cimport memset, memcpy
|
||||||
from libc.stdlib cimport calloc, free, realloc
|
from libc.stdlib cimport calloc, free, realloc
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
from thinc.extra.search cimport Beam
|
from thinc.extra.search cimport Beam
|
||||||
from thinc.layers import Linear
|
from thinc.api import Linear, Model, CupyOps, NumpyOps, use_ops
|
||||||
from thinc.model import Model
|
|
||||||
from thinc.backends import CupyOps, NumpyOps, use_ops
|
|
||||||
from thinc.backends.linalg cimport Vec, VecVec
|
from thinc.backends.linalg cimport Vec, VecVec
|
||||||
cimport blis.cy
|
cimport blis.cy
|
||||||
|
|
||||||
|
|
|
@ -1,11 +1,8 @@
|
||||||
# cython: infer_types=True
|
# cython: infer_types=True
|
||||||
# cython: cdivision=True
|
# cython: cdivision=True
|
||||||
# cython: boundscheck=False
|
# cython: boundscheck=False
|
||||||
import numpy
|
|
||||||
cimport cython.parallel
|
cimport cython.parallel
|
||||||
import numpy.random
|
|
||||||
cimport numpy as np
|
cimport numpy as np
|
||||||
from itertools import islice
|
|
||||||
from cpython.ref cimport PyObject, Py_XDECREF
|
from cpython.ref cimport PyObject, Py_XDECREF
|
||||||
from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
|
from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
|
||||||
from libc.math cimport exp
|
from libc.math cimport exp
|
||||||
|
@ -14,15 +11,16 @@ from libc.string cimport memset, memcpy
|
||||||
from libc.stdlib cimport calloc, free
|
from libc.stdlib cimport calloc, free
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
from thinc.extra.search cimport Beam
|
from thinc.extra.search cimport Beam
|
||||||
from thinc.layers import chain, clone, Linear, list2array
|
|
||||||
from thinc.backends import NumpyOps, CupyOps, use_ops
|
|
||||||
from thinc.util import get_array_module
|
|
||||||
from thinc.backends.linalg cimport Vec, VecVec
|
from thinc.backends.linalg cimport Vec, VecVec
|
||||||
from thinc.initializers import zero_init
|
|
||||||
from thinc.model import set_dropout_rate
|
|
||||||
import srsly
|
|
||||||
|
|
||||||
from spacy.gold import Example
|
from thinc.api import chain, clone, Linear, list2array, NumpyOps, CupyOps, use_ops
|
||||||
|
from thinc.api import get_array_module, zero_init, set_dropout_rate
|
||||||
|
from itertools import islice
|
||||||
|
import srsly
|
||||||
|
import numpy.random
|
||||||
|
import numpy
|
||||||
|
|
||||||
|
from ..gold import Example
|
||||||
from ..typedefs cimport weight_t, class_t, hash_t
|
from ..typedefs cimport weight_t, class_t, hash_t
|
||||||
from ._parser_model cimport alloc_activations, free_activations
|
from ._parser_model cimport alloc_activations, free_activations
|
||||||
from ._parser_model cimport predict_states, arg_max_if_valid
|
from ._parser_model cimport predict_states, arg_max_if_valid
|
||||||
|
|
|
@ -6,7 +6,7 @@ scheme.
|
||||||
"""
|
"""
|
||||||
from copy import copy
|
from copy import copy
|
||||||
|
|
||||||
from spacy.gold import Example
|
from ..gold import Example
|
||||||
from ..tokens.doc cimport Doc, set_children_from_heads
|
from ..tokens.doc cimport Doc, set_children_from_heads
|
||||||
from ..errors import Errors
|
from ..errors import Errors
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
import numpy
|
import numpy
|
||||||
from spacy.tokens import Doc, Span
|
from spacy.tokens import Doc, Span
|
||||||
|
@ -274,7 +273,19 @@ def test_doc_is_nered(en_vocab):
|
||||||
def test_doc_from_array_sent_starts(en_vocab):
|
def test_doc_from_array_sent_starts(en_vocab):
|
||||||
words = ["I", "live", "in", "New", "York", ".", "I", "like", "cats", "."]
|
words = ["I", "live", "in", "New", "York", ".", "I", "like", "cats", "."]
|
||||||
heads = [0, 0, 0, 0, 0, 0, 6, 6, 6, 6]
|
heads = [0, 0, 0, 0, 0, 0, 6, 6, 6, 6]
|
||||||
deps = ["ROOT", "dep", "dep", "dep", "dep", "dep", "ROOT", "dep", "dep", "dep", "dep"]
|
deps = [
|
||||||
|
"ROOT",
|
||||||
|
"dep",
|
||||||
|
"dep",
|
||||||
|
"dep",
|
||||||
|
"dep",
|
||||||
|
"dep",
|
||||||
|
"ROOT",
|
||||||
|
"dep",
|
||||||
|
"dep",
|
||||||
|
"dep",
|
||||||
|
"dep",
|
||||||
|
]
|
||||||
doc = Doc(en_vocab, words=words)
|
doc = Doc(en_vocab, words=words)
|
||||||
for i, (dep, head) in enumerate(zip(deps, heads)):
|
for i, (dep, head) in enumerate(zip(deps, heads)):
|
||||||
doc[i].dep_ = dep
|
doc[i].dep_ = dep
|
||||||
|
|
|
@ -29,7 +29,9 @@ def test_morph_props(i_has):
|
||||||
|
|
||||||
def test_morph_iter(i_has):
|
def test_morph_iter(i_has):
|
||||||
assert set(i_has[0].morph) == set(["PronType=prs"])
|
assert set(i_has[0].morph) == set(["PronType=prs"])
|
||||||
assert set(i_has[1].morph) == set(["Number=sing", "Person=three", "Tense=pres", "VerbForm=fin"])
|
assert set(i_has[1].morph) == set(
|
||||||
|
["Number=sing", "Person=three", "Tense=pres", "VerbForm=fin"]
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_morph_get(i_has):
|
def test_morph_get(i_has):
|
||||||
|
|
|
@ -8,7 +8,12 @@ from ..util import get_doc
|
||||||
|
|
||||||
def test_doc_retokenize_merge(en_tokenizer):
|
def test_doc_retokenize_merge(en_tokenizer):
|
||||||
text = "WKRO played songs by the beach boys all night"
|
text = "WKRO played songs by the beach boys all night"
|
||||||
attrs = {"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE", "morph": "Number=Plur"}
|
attrs = {
|
||||||
|
"tag": "NAMED",
|
||||||
|
"lemma": "LEMMA",
|
||||||
|
"ent_type": "TYPE",
|
||||||
|
"morph": "Number=Plur",
|
||||||
|
}
|
||||||
doc = en_tokenizer(text)
|
doc = en_tokenizer(text)
|
||||||
assert len(doc) == 9
|
assert len(doc) == 9
|
||||||
with doc.retokenize() as retokenizer:
|
with doc.retokenize() as retokenizer:
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
def test_ar_tokenizer_handles_long_text(ar_tokenizer):
|
def test_ar_tokenizer_handles_long_text(ar_tokenizer):
|
||||||
text = """نجيب محفوظ مؤلف و كاتب روائي عربي، يعد من أهم الأدباء العرب خلال القرن العشرين.
|
text = """نجيب محفوظ مؤلف و كاتب روائي عربي، يعد من أهم الأدباء العرب خلال القرن العشرين.
|
||||||
ولد نجيب محفوظ في مدينة القاهرة، حيث ترعرع و تلقى تعليمه الجامعي في جامعتها،
|
ولد نجيب محفوظ في مدينة القاهرة، حيث ترعرع و تلقى تعليمه الجامعي في جامعتها،
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
def test_en_simple_punct(en_tokenizer):
|
def test_en_simple_punct(en_tokenizer):
|
||||||
text = "to walk, do foo"
|
text = "to walk, do foo"
|
||||||
tokens = en_tokenizer(text)
|
tokens = en_tokenizer(text)
|
||||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user