mirror of https://github.com/explosion/spaCy.git
synced 2025-11-04 09:57:26 +03:00

Tidy up and auto-format

commit e3f40a6a0f
parent 1278161f47
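The hunks below apply two mechanical patterns across the codebase: imports that reached into thinc's internal submodules (thinc.util, thinc.layers, thinc.model, thinc.backends, thinc.loss, thinc.initializers) are rerouted through the public thinc.api facade, and str.format() calls become f-strings. A minimal before/after sketch of both patterns (illustrative names, not lines from this diff):

    # before
    # from thinc.util import prefer_gpu
    # msg = "Using GPU: {}".format(use_gpu)

    # after
    from thinc.api import prefer_gpu  # same function, re-exported publicly

    use_gpu = prefer_gpu()  # True if a GPU was activated, else False
    msg = f"Using GPU: {use_gpu}"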
				
			
@@ -5,7 +5,7 @@ warnings.filterwarnings("ignore", message="numpy.dtype size changed")
 warnings.filterwarnings("ignore", message="numpy.ufunc size changed")
 
 # These are imported as part of the API
-from thinc.util import prefer_gpu, require_gpu
+from thinc.api import prefer_gpu, require_gpu
 
 from . import pipeline
 from .cli.info import info as cli_info
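Both helpers are re-exported unchanged by thinc.api; the practical difference between them is failure behavior. A short usage sketch (thinc v8 semantics):

    from thinc.api import prefer_gpu, require_gpu

    if not prefer_gpu():  # tries to activate the GPU, falls back to CPU quietly
        print("No GPU found; continuing on CPU")
    # require_gpu()       # same activation, but raises if no GPU is available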
				
			
@@ -4,7 +4,7 @@ from .link import link  # noqa: F401
 from .package import package  # noqa: F401
 from .profile import profile  # noqa: F401
 from .train import train  # noqa: F401
-from .train_from_config import train_from_config_cli # noqa: F401
+from .train_from_config import train_from_config_cli  # noqa: F401
 from .pretrain import pretrain  # noqa: F401
 from .debug_data import debug_data  # noqa: F401
 from .evaluate import evaluate  # noqa: F401
@@ -192,11 +192,7 @@ def debug_data(
             has_ws_ents_error = True
 
         if gold_train_data["punct_ents"]:
-            msg.warn(
-                "{} entity span(s) with punctuation".format(
-                    gold_train_data["punct_ents"]
-                )
-            )
+            msg.warn(f"{gold_train_data['punct_ents']} entity span(s) with punctuation")
             has_punct_ents_warning = True
 
         for label in new_labels:
@@ -4,14 +4,12 @@ import time
 import re
 from collections import Counter
 from pathlib import Path
-from thinc.layers import Linear, Maxout
-from thinc.util import prefer_gpu
+from thinc.api import Linear, Maxout, chain, list2array, prefer_gpu
+from thinc.api import CosineDistance, L2Distance
 from wasabi import msg
 import srsly
-from thinc.layers import chain, list2array
-from thinc.loss import CosineDistance, L2Distance
 
-from spacy.gold import Example
+from ..gold import Example
 from ..errors import Errors
 from ..tokens import Doc
 from ..attrs import ID, HEAD
@@ -85,7 +83,7 @@ def pretrain(
         )
     if not output_dir.exists():
         output_dir.mkdir()
-        msg.good("Created output directory: {}".format(output_dir))
+        msg.good(f"Created output directory: {output_dir}")
     srsly.write_json(output_dir / "config.json", config)
     msg.good("Saved settings to config.json")
 
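The .format() to f-string conversions in these hunks are behavior-preserving. A quick check with a hypothetical value:

    from pathlib import Path

    output_dir = Path("output")
    old = "Created output directory: {}".format(output_dir)
    new = f"Created output directory: {output_dir}"
    assert old == new  # identical strings; the f-string is just shorter and faster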
				
			
@@ -1,7 +1,7 @@
 import os
 import tqdm
 from pathlib import Path
-from thinc.backends import use_ops
+from thinc.api import use_ops
 from timeit import default_timer as timer
 import shutil
 import srsly
@@ -89,7 +89,7 @@ def train(
         )
     if not output_path.exists():
         output_path.mkdir()
-        msg.good("Created output directory: {}".format(output_path))
+        msg.good(f"Created output directory: {output_path}")
 
     tag_map = {}
     if tag_map_path is not None:
@@ -125,17 +125,17 @@ def train(
     msg.text(f"Training pipeline: {pipeline}")
     disabled_pipes = None
     pipes_added = False
-    msg.text("Training pipeline: {}".format(pipeline))
+    msg.text(f"Training pipeline: {pipeline}")
     if use_gpu >= 0:
         activated_gpu = None
         try:
             activated_gpu = set_gpu(use_gpu)
         except Exception as e:
-            msg.warn("Exception: {}".format(e))
+            msg.warn(f"Exception: {e}")
         if activated_gpu is not None:
-            msg.text("Using GPU: {}".format(use_gpu))
+            msg.text(f"Using GPU: {use_gpu}")
         else:
-            msg.warn("Unable to activate GPU: {}".format(use_gpu))
+            msg.warn(f"Unable to activate GPU: {use_gpu}")
             msg.text("Using CPU only")
             use_gpu = -1
     if base_model:
@@ -158,11 +158,11 @@ def train(
                     "positive_label": textcat_positive_label,
                 }
             if pipe not in nlp.pipe_names:
-                msg.text("Adding component to base model '{}'".format(pipe))
+                msg.text(f"Adding component to base model '{pipe}'")
                 nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))
                 pipes_added = True
             elif replace_components:
-                msg.text("Replacing component from base model '{}'".format(pipe))
+                msg.text(f"Replacing component from base model '{pipe}'")
                 nlp.replace_pipe(pipe, nlp.create_pipe(pipe, config=pipe_cfg))
                 pipes_added = True
             else:
@@ -180,7 +180,7 @@ def train(
                             f"Existing cfg: {base_cfg}, provided cfg: {pipe_cfg}",
                             exits=1,
                         )
-                msg.text("Extending component from base model '{}'".format(pipe))
+                msg.text(f"Extending component from base model '{pipe}'")
         disabled_pipes = nlp.disable_pipes(
             [p for p in nlp.pipe_names if p not in pipeline]
         )
@@ -377,7 +377,7 @@ def train(
                             msg.warn(
                                 "Did you provide the same parameters during 'train' as during 'pretrain'?"
                             )
-                        msg.fail("Original error message: {}".format(e), exits=1)
+                        msg.fail(f"Original error message: {e}", exits=1)
                     if raw_text:
                         # If raw text is available, perform 'rehearsal' updates,
                         # which use unlabelled data to reduce overfitting.
@@ -504,11 +504,7 @@ def train(
                         )
                         break
     except Exception as e:
-        msg.warn(
-            "Aborting and saving the final best model. Encountered exception: {}".format(
-                e
-            )
-        )
+        msg.warn(f"Aborting and saving final best model. Encountered exception: {e}")
     finally:
         best_pipes = nlp.pipe_names
         if disabled_pipes:
@@ -1,19 +1,20 @@
-from typing import Optional, Dict, List, Union, Sequence
 import plac
-from thinc.util import require_gpu
 from wasabi import msg
 from pathlib import Path
 import thinc
 import thinc.schedules
-from thinc.model import Model
-from spacy.gold import GoldCorpus
-import spacy
-from spacy.pipeline.tok2vec import Tok2VecListener
+from typing import Optional, Dict, List, Union, Sequence
+from thinc.api import Model
 from pydantic import BaseModel, FilePath, StrictInt
 import tqdm
 
-from ..ml import component_models
-from .. import util
+# TODO: relative imports?
+import spacy
+from spacy.gold import GoldCorpus
+from spacy.pipeline.tok2vec import Tok2VecListener
+from spacy.ml import component_models
+from spacy import util
+
 
 registry = util.registry
 
@@ -153,10 +154,9 @@ def create_tb_parser_model(
     hidden_width: StrictInt = 64,
     maxout_pieces: StrictInt = 3,
 ):
-    from thinc.layers import Linear, chain, list2array
+    from thinc.api import Linear, chain, list2array, use_ops, zero_init
     from spacy.ml._layers import PrecomputableAffine
     from spacy.syntax._parser_model import ParserModel
-    from thinc.api import use_ops, zero_init
 
     token_vector_width = tok2vec.get_dim("nO")
     tok2vec = chain(tok2vec, list2array())
@@ -221,13 +221,9 @@ def train_from_config_cli(
 
 
 def train_from_config(
-    config_path,
-    data_paths,
-    raw_text=None,
-    meta_path=None,
-    output_path=None,
+    config_path, data_paths, raw_text=None, meta_path=None, output_path=None,
 ):
-    msg.info("Loading config from: {}".format(config_path))
+    msg.info(f"Loading config from: {config_path}")
     config = util.load_from_config(config_path, create_objects=True)
     use_gpu = config["training"]["use_gpu"]
     if use_gpu >= 0:
@@ -241,9 +237,7 @@ def train_from_config(
     msg.info("Loading training corpus")
     corpus = GoldCorpus(data_paths["train"], data_paths["dev"], limit=limit)
     msg.info("Initializing the nlp pipeline")
-    nlp.begin_training(
-        lambda: corpus.train_examples, device=use_gpu
-    )
+    nlp.begin_training(lambda: corpus.train_examples, device=use_gpu)
 
     train_batches = create_train_batches(nlp, corpus, config["training"])
     evaluate = create_evaluation_callback(nlp, optimizer, corpus, config["training"])
@@ -260,7 +254,7 @@ def train_from_config(
         config["training"]["eval_frequency"],
     )
 
-    msg.info("Training. Initial learn rate: {}".format(optimizer.learn_rate))
+    msg.info(f"Training. Initial learn rate: {optimizer.learn_rate}")
     print_row = setup_printer(config)
 
     try:
@@ -414,7 +408,7 @@ def subdivide_batch(batch):
 def setup_printer(config):
     score_cols = config["training"]["scores"]
     score_widths = [max(len(col), 6) for col in score_cols]
-    loss_cols = ["Loss {}".format(pipe) for pipe in config["nlp"]["pipeline"]]
+    loss_cols = [f"Loss {pipe}" for pipe in config["nlp"]["pipeline"]]
     loss_widths = [max(len(col), 8) for col in loss_cols]
     table_header = ["#"] + loss_cols + score_cols + ["Score"]
     table_header = [col.upper() for col in table_header]
@@ -30,7 +30,7 @@ try:
 except ImportError:
     cupy = None
 
-from thinc.optimizers import Optimizer  # noqa: F401
+from thinc.api import Optimizer  # noqa: F401
 
 pickle = pickle
 copy_reg = copy_reg
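The # noqa: F401 marker silences flake8's "imported but unused" warning (F401); the import exists purely so other modules can import Optimizer from here. A sketch of the re-export pattern (module names hypothetical):

    # mypkg/compat.py
    from thinc.api import Optimizer  # noqa: F401  - deliberate re-export

    # elsewhere:
    # from mypkg.compat import Optimizer  # resolves via the re-export above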
				
			
@@ -1,4 +1,3 @@
-
 # Setting explicit height and max-width: none on the SVG is required for
 # Jupyter to render it properly in a cell
 
@@ -1,4 +1,3 @@
-
 def explain(term):
     """Get a description for a given POS tag, dependency label or entity type.
 
@@ -1,6 +1,6 @@
 from cymem.cymem cimport Pool
 
-from spacy.tokens import Doc
+from .tokens import Doc
 from .typedefs cimport attr_t
 from .syntax.transition_system cimport Transition
 
@@ -65,5 +65,3 @@ cdef class Example:
     cdef public TokenAnnotation token_annotation
     cdef public DocAnnotation doc_annotation
     cdef public object goldparse
-
-
@@ -6,7 +6,7 @@ from libcpp.vector cimport vector
 from libc.stdint cimport int32_t, int64_t
 from libc.stdio cimport FILE
 
-from spacy.vocab cimport Vocab
+from .vocab cimport Vocab
 from .typedefs cimport hash_t
 
 from .structs cimport KBEntryC, AliasC
@@ -169,4 +169,3 @@ cdef class Reader:
     cdef int read_alias(self, int64_t* entry_index, float* prob) except -1
 
     cdef int _read(self, void* value, size_t size) except -1
-
@@ -1,4 +1,3 @@
-
 # Source: https://github.com/stopwords-iso/stopwords-af
 
 STOP_WORDS = set(

@@ -1,4 +1,3 @@
-
 # Source: https://github.com/Alir3z4/stop-words
 
 STOP_WORDS = set(

@@ -1,4 +1,3 @@
-
 """
 Example sentences to test spaCy and its language models.
 

@@ -1,4 +1,3 @@
-
 STOP_WORDS = set(
     """
 অতএব অথচ অথবা অনুযায়ী অনেক অনেকে অনেকেই অন্তত  অবধি অবশ্য অর্থাৎ অন্য অনুযায়ী অর্ধভাগে

@@ -1,4 +1,3 @@
-
 """
 Example sentences to test spaCy and its language models.
 

@@ -1,4 +1,3 @@
-
 # Source: https://github.com/Alir3z4/stop-words
 
 STOP_WORDS = set(

@@ -1,4 +1,3 @@
-
 """
 Example sentences to test spaCy and its language models.
 

@@ -1,4 +1,3 @@
-
 """
 Example sentences to test spaCy and its language models.
 

@@ -1,4 +1,3 @@
-
 STOP_WORDS = set(
     """
 á a ab aber ach acht achte achten achter achtes ag alle allein allem allen

@@ -1,4 +1,3 @@
-
 def get_pos_from_wiktionary():
     import re
     from gensim.corpora.wikicorpus import extract_pages

@@ -1,4 +1,3 @@
-
 # These exceptions are used to add NORM values based on a token's ORTH value.
 # Norms are only set if no alternative is provided in the tokenizer exceptions.
 

@@ -1,4 +1,3 @@
-
 # Stop words
 # Link to greek stop words: https://www.translatum.gr/forum/index.php?topic=3550.0?topic=3550.0
 STOP_WORDS = set(

@@ -1,4 +1,3 @@
-
 """
 Example sentences to test spaCy and its language models.
 

@@ -1,4 +1,3 @@
-
 _exc = {
     # Slang and abbreviations
     "cos": "because",

@@ -1,4 +1,3 @@
-
 # Stop words
 STOP_WORDS = set(
     """

@@ -1,4 +1,3 @@
-
 """
 Example sentences to test spaCy and its language models.
 

@@ -1,4 +1,3 @@
-
 STOP_WORDS = set(
     """
 actualmente acuerdo adelante ademas además adrede afirmó agregó ahi ahora ahí

@@ -1,4 +1,3 @@
-
 # Source: https://github.com/stopwords-iso/stopwords-et
 
 STOP_WORDS = set(

@@ -1,4 +1,3 @@
-
 """
 Example sentences to test spaCy and its language models.
 

@@ -1,4 +1,3 @@
-
 verb_roots = """
 #هست
 آخت#آهنج

@@ -1,4 +1,3 @@
-
 # Stop words from HAZM package
 STOP_WORDS = set(
     """

@@ -1,4 +1,3 @@
-
 # Source https://github.com/stopwords-iso/stopwords-fi/blob/master/stopwords-fi.txt
 # Reformatted with some minor corrections
 STOP_WORDS = set(

@@ -1,4 +1,3 @@
-
 """
 Example sentences to test spaCy and its language models.
 

@@ -1,4 +1,3 @@
-
 STOP_WORDS = set(
     """
 a à â abord absolument afin ah ai aie ailleurs ainsi ait allaient allo allons

@@ -1,4 +1,3 @@
-
 # fmt: off
 consonants = ["b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "z"]
 broad_vowels = ["a", "á", "o", "ó", "u", "ú"]

@@ -1,4 +1,3 @@
-
 """
 Example sentences to test spaCy and its language models.
 

@@ -1,4 +1,3 @@
-
 """
 Example sentences to test spaCy and its language models.
 

@@ -1,4 +1,3 @@
-
 # Source: https://github.com/taranjeet/hindi-tokenizer/blob/master/stopwords.txt, https://data.mendeley.com/datasets/bsr3frvvjc/1#file-a21d5092-99d7-45d8-b044-3ae9edd391c6
 
 STOP_WORDS = set(

@@ -1,4 +1,3 @@
-
 """
 Example sentences to test spaCy and its language models.
 

@@ -1,4 +1,3 @@
-
 STOP_WORDS = set(
     """
 a abban ahhoz ahogy ahol aki akik akkor akár alatt amely amelyek amelyekben

@@ -1,4 +1,3 @@
-
 """
 Example sentences to test spaCy and its language models.
 

@@ -1,4 +1,3 @@
-
 # Source: https://github.com/Xangis/extra-stopwords
 
 STOP_WORDS = set(

@@ -1,4 +1,3 @@
-
 """
 Example sentences to test spaCy and its language models.
 

@@ -1,4 +1,3 @@
-
 STOP_WORDS = set(
     """
 a abbastanza abbia abbiamo abbiano abbiate accidenti ad adesso affinche agl

@@ -1,4 +1,3 @@
-
 """
 Example sentences to test spaCy and its language models.
 

@@ -1,4 +1,3 @@
-
 STOP_WORDS = set(
     """
 ಹಲವು

@@ -1,4 +1,3 @@
-
 """
 Example sentences to test spaCy and its language models.
 

@@ -1,4 +1,3 @@
-
 # Source: https://github.com/stopwords-iso/stopwords-lv
 
 STOP_WORDS = set(

@@ -1,4 +1,3 @@
-
 # Source: https://github.com/stopwords-iso/stopwords-mr/blob/master/stopwords-mr.txt, https://github.com/6/stopwords-json/edit/master/dist/mr.json
 STOP_WORDS = set(
     """

@@ -1,4 +1,3 @@
-
 """
 Example sentences to test spaCy and its language models.
 

@@ -1,4 +1,3 @@
-
 """
 Example sentences to test spaCy and its language models.
 

@@ -1,4 +1,3 @@
-
 # These exceptions are used to add NORM values based on a token's ORTH value.
 # Individual languages can also add their own exceptions and overwrite them -
 # for example, British vs. American spelling in English.

@@ -1,4 +1,3 @@
-
 """
 Example sentences to test spaCy and its language models.
 

@@ -1,4 +1,3 @@
-
 """
 Example sentences to test spaCy and its language models.
 

@@ -1,4 +1,3 @@
-
 STOP_WORDS = set(
     """
 à às área acerca ademais adeus agora ainda algo algumas alguns ali além ambas ambos antes

@@ -1,4 +1,3 @@
-
 """
 Example sentences to test spaCy and its language models.
 

@@ -1,4 +1,3 @@
-
 """
 Example sentences to test spaCy and its language models.
 

@@ -1,4 +1,3 @@
-
 _exc = {
     # Slang
     "прив": "привет",

@@ -1,4 +1,3 @@
-
 """
 Example sentences to test spaCy and its language models.
 

@@ -1,4 +1,3 @@
-
 STOP_WORDS = set(
     """
 අතර

@@ -1,4 +1,3 @@
-
 # Source: https://github.com/Ardevop-sk/stopwords-sk
 
 STOP_WORDS = set(

@@ -1,4 +1,3 @@
-
 # Source: https://github.com/stopwords-iso/stopwords-sl
 # TODO: probably needs to be tidied up – the list seems to have month names in
 # it, which shouldn't be considered stop words.

@@ -1,4 +1,3 @@
-
 """
 Example sentences to test spaCy and its language models.
 

@@ -1,4 +1,3 @@
-
 # Source: https://github.com/andrixh/index-albanian
 
 STOP_WORDS = set(

@@ -1,4 +1,3 @@
-
 """
 Example sentences to test spaCy and its language models.
 

@@ -1,4 +1,3 @@
-
 _exc = {
     # Slang
     "ћале": "отац",

@@ -1,4 +1,3 @@
-
 STOP_WORDS = set(
     """
 а

@@ -1,4 +1,3 @@
-
 """
 Example sentences to test spaCy and its language models.
 

@@ -1,4 +1,3 @@
-
 STOP_WORDS = set(
     """
 aderton adertonde adjö aldrig alla allas allt alltid alltså än andra andras

@@ -1,4 +1,3 @@
-
 """
 Example sentences to test spaCy and its language models.
 

@@ -1,4 +1,3 @@
-
 # Stop words
 
 STOP_WORDS = set(

@@ -1,4 +1,3 @@
-
 """
 Example sentences to test spaCy and its language models.
 

@@ -1,4 +1,3 @@
-
 _exc = {
     # Conjugation and Diversion invalid to Tonal form (ผันอักษรและเสียงไม่ตรงกับรูปวรรณยุกต์)
     "สนุ๊กเกอร์": "สนุกเกอร์",
@@ -34,7 +34,7 @@ URL_PATTERN = (
     r"|"
     # host & domain names
     # mods: match is case-sensitive, so include [A-Z]
-      "(?:"
+      "(?:"  # noqa: E131
         "(?:"
           "[A-Za-z0-9\u00a1-\uffff]"
           "[A-Za-z0-9\u00a1-\uffff_-]{0,62}"
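E131 is pycodestyle's "continuation line unaligned for hanging indent". The regex fragments above are indented to mirror the pattern's nesting, so the warning is suppressed on that single line rather than reflowing the whole regex. A minimal sketch of the same line-scoped suppression (hypothetical pattern):

    PATTERN = (
        "(?:"
          "[A-Za-z0-9]+"  # noqa: E131  - indentation mirrors the regex nesting
        ")"
    )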
				
			
@@ -1,4 +1,3 @@
-
 """
 Example sentences to test spaCy and its language models.
 >>> from spacy.lang.tr.examples import sentences

@@ -1,4 +1,3 @@
-
 """
 Example sentences to test spaCy and its language models.
 

@@ -1,4 +1,3 @@
-
 """
 Example sentences to test spaCy and its language models.
 

@@ -1,4 +1,3 @@
-
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language

@@ -1,4 +1,3 @@
-
 """
 Example sentences to test spaCy and its language models.
 

@@ -1,4 +1,3 @@
-
 """
 Example sentences to test spaCy and its language models.
 

@@ -1,4 +1,3 @@
-
 """
 Example sentences to test spaCy and its language models.
 
@@ -4,7 +4,7 @@ import weakref
 import functools
 from contextlib import contextmanager
 from copy import copy, deepcopy
-from thinc.backends import get_current_ops
+from thinc.api import get_current_ops
 import srsly
 import multiprocessing as mp
 from itertools import chain, cycle
@@ -6,7 +6,7 @@ cimport numpy as np
 np.import_array()
 
 import numpy
-from thinc.util import get_array_module
+from thinc.api import get_array_module
 
 from .typedefs cimport attr_t, flags_t
 from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
@@ -3,18 +3,20 @@ from thinc.api import Model
 
 def CharacterEmbed(nM, nC):
     # nM: Number of dimensions per character. nC: Number of characters.
-    nO = nM*nC if (nM is not None and nC is not None) else None
+    nO = nM * nC if (nM is not None and nC is not None) else None
     return Model(
         "charembed",
         forward,
         init=init,
         dims={"nM": nM, "nC": nC, "nO": nO, "nV": 256},
-        params={"E": None}
+        params={"E": None},
     ).initialize()
 
 
 def init(model, X=None, Y=None):
-    vectors_table = model.ops.alloc3f(model.get_dim("nC"), model.get_dim("nV"), model.get_dim("nM"))
+    vectors_table = model.ops.alloc3f(
+        model.get_dim("nC"), model.get_dim("nV"), model.get_dim("nM")
+    )
     model.set_param("E", vectors_table)
 
 
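As the comment in CharacterEmbed notes, the output is the concatenation of nC per-character embeddings of nM dimensions each, hence nO = nM * nC. A worked value (numbers are only an example):

    nM, nC = 16, 8    # 16 dims per character, 8 characters per token
    nO = nM * nC
    assert nO == 128  # each token yields one 128-dimensional vector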
				
			
@@ -1,5 +1,4 @@
-from thinc.model import Model
-from thinc.api import normal_init
+from thinc.api import Model, normal_init
 
 
 def PrecomputableAffine(nO, nI, nF, nP):
@@ -20,9 +19,7 @@ def forward(model, X, is_train):
     nP = model.get_dim("nP")
     nI = model.get_dim("nI")
     W = model.get_param("W")
-    Yf = model.ops.gemm(
-        X, W.reshape((nF * nO * nP, nI)), trans2=True
-    )
+    Yf = model.ops.gemm(X, W.reshape((nF * nO * nP, nI)), trans2=True)
     Yf = Yf.reshape((Yf.shape[0], nF, nO, nP))
     Yf = model.ops.xp.vstack((model.get_param("pad"), Yf))
 
@@ -44,7 +41,7 @@ def forward(model, X, is_train):
         assert dY.ndim == 3
         assert dY.shape[1] == nO, dY.shape
         assert dY.shape[2] == nP, dY.shape
-        nB = dY.shape[0]
+        # nB = dY.shape[0]
         model.inc_grad("pad", _backprop_precomputable_affine_padding(model, dY, ids))
         Xf = X[ids]
         Xf = Xf.reshape((Xf.shape[0], nF * nI))
@@ -118,7 +115,7 @@ def init(model, X=None, Y=None):
     pad = model.ops.alloc4f(1, nF, nO, nP)
 
     ops = model.ops
-    W = normal_init(ops, W.shape, fan_in=nF*nI)
+    W = normal_init(ops, W.shape, fan_in=nF * nI)
     model.set_param("W", W)
     model.set_param("b", b)
     model.set_param("pad", pad)
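normal_init draws weights from a zero-mean normal whose scale shrinks with fan_in, the number of inputs feeding each output unit, which is why the call above passes fan_in=nF * nI. A small sketch with the same call shape (assumes thinc v8; the printed std is only approximately 1 / sqrt(fan_in)):

    from thinc.api import NumpyOps, normal_init

    ops = NumpyOps()
    nF, nI, nO, nP = 4, 8, 16, 3
    W = normal_init(ops, (nF * nO * nP, nI), fan_in=nF * nI)
    print(W.shape, float(W.std()))  # std near 1 / sqrt(32), about 0.18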
				
			
@@ -9,7 +9,7 @@ from thinc.api import Model, Maxout, Linear, residual, reduce_mean, list2ragged
 from thinc.api import PyTorchLSTM, add, MultiSoftmax, HashEmbed, StaticVectors
 from thinc.api import expand_window, FeatureExtractor, SparseLinear, chain
 from thinc.api import clone, concatenate, with_array, Softmax, Logistic, uniqued
-from thinc.api import zero_init, glorot_uniform_init
+from thinc.api import zero_init
 
 
 def build_text_classifier(arch, config):
@@ -33,10 +33,7 @@ def build_simple_cnn_text_classifier(tok2vec, nr_class, exclusive_classes, **cfg
             output_layer = Softmax(nO=nr_class, nI=tok2vec.get_dim("nO"))
         else:
             # TODO: experiment with init_w=zero_init
-            output_layer = (
-                Linear(nO=nr_class, nI=tok2vec.get_dim("nO"))
-                >> Logistic()
-            )
+            output_layer = Linear(nO=nr_class, nI=tok2vec.get_dim("nO")) >> Logistic()
         model = tok2vec >> list2ragged() >> reduce_mean() >> output_layer
     model.set_ref("tok2vec", tok2vec)
     model.set_dim("nO", nr_class)
@@ -149,13 +146,21 @@ def Tok2Vec(
     with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
         norm = HashEmbed(nO=width, nV=embed_size, column=cols.index(NORM), dropout=0.0)
         if subword_features:
-            prefix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=0.0)
-            suffix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=0.0)
-            shape = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=0.0)
+            prefix = HashEmbed(
+                nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=0.0
+            )
+            suffix = HashEmbed(
+                nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=0.0
+            )
+            shape = HashEmbed(
+                nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=0.0
+            )
         else:
             prefix, suffix, shape = (None, None, None)
         if pretrained_vectors is not None:
-            glove = StaticVectors(vectors=pretrained_vectors, nO=width, column=cols.index(ID), dropout=0.0)
+            glove = StaticVectors(
+                vectors=pretrained_vectors, nO=width, column=cols.index(ID), dropout=0.0
+            )
 
             if subword_features:
                 embed = uniqued(
@@ -1,5 +1,5 @@
 import numpy
-from thinc.model import Model
+from thinc.api import Model
 
 from ..attrs import LOWER
 
@@ -26,9 +26,7 @@ def forward(self, docs, is_train: bool):
     # The dtype here matches what thinc is expecting -- which differs per
     # platform (by int definition). This should be fixed once the problem
     # is fixed on Thinc's side.
-    lengths = self.ops.asarray(
-        [arr.shape[0] for arr in batch_keys], dtype=numpy.int_
-    )
+    lengths = self.ops.asarray([arr.shape[0] for arr in batch_keys], dtype=numpy.int_)
     batch_keys = self.ops.xp.concatenate(batch_keys)
     batch_vals = self.ops.asarray(self.ops.xp.concatenate(batch_vals), dtype="f")
 
@@ -36,4 +34,3 @@ def forward(self, docs, is_train: bool):
         return dY
 
     return (batch_keys, batch_vals, lengths), backprop
-
@@ -1,11 +1,8 @@
-from thinc.layers import chain, clone, concatenate, with_array, uniqued
-from thinc.model import Model
-from thinc.layers import noop, with_padded
-from thinc.layers import Maxout, expand_window
-from thinc.layers import HashEmbed, StaticVectors
-from thinc.layers import residual, LayerNorm, FeatureExtractor
+from thinc.api import Model, chain, clone, concatenate, with_array, uniqued, noop
+from thinc.api import with_padded, Maxout, expand_window, HashEmbed, StaticVectors
+from thinc.api import residual, LayerNorm, FeatureExtractor
 
-from spacy.ml import _character_embed
+from ..ml import _character_embed
 from ..util import make_layer, registry
 
 
@@ -93,8 +90,10 @@ def MaxoutWindowEncoder(config):
     nW = config["window_size"]
     nP = config["pieces"]
     depth = config["depth"]
-
-    cnn = expand_window(window_size=nW), Maxout(nO=nO, nI=nO * ((nW * 2) + 1), nP=nP, dropout=0.0, normalize=True)
+    cnn = (
+        expand_window(window_size=nW),
+        Maxout(nO=nO, nI=nO * ((nW * 2) + 1), nP=nP, dropout=0.0, normalize=True),
+    )
     model = clone(residual(cnn), depth)
     model.set_dim("nO", nO)
     model.attrs["receptive_field"] = nW * depth
@@ -103,13 +102,16 @@ def MaxoutWindowEncoder(config):
 
 @registry.architectures.register("spacy.MishWindowEncoder.v1")
 def MishWindowEncoder(config):
-    from thinc.layers import Mish
+    from thinc.api import Mish
 
     nO = config["width"]
     nW = config["window_size"]
     depth = config["depth"]
 
-    cnn = chain(expand_window(window_size=nW), Mish(nO=nO, nI=nO * ((nW * 2) + 1)), LayerNorm(nO))
+    cnn = chain(
+        expand_window(window_size=nW),
+        Mish(nO=nO, nI=nO * ((nW * 2) + 1)),
+        LayerNorm(nO),
+    )
     model = clone(residual(cnn), depth)
     model.set_dim("nO", nO)
     return model
@@ -118,14 +120,20 @@ def MishWindowEncoder(config):
 @registry.architectures.register("spacy.PretrainedVectors.v1")
 def PretrainedVectors(config):
     # TODO: actual vectors instead of name
-    return StaticVectors(vectors=config["vectors_name"], nO=config["width"], column=config["column"], dropout=0.0)
+    return StaticVectors(
+        vectors=config["vectors_name"],
+        nO=config["width"],
+        column=config["column"],
+        dropout=0.0,
+    )
 
 
 @registry.architectures.register("spacy.TorchBiLSTMEncoder.v1")
 def TorchBiLSTMEncoder(config):
     import torch.nn
-    # TODO FIX
-    from thinc.layers import PyTorchRNNWrapper
+
+    # TODO: FIX
+    from thinc.api import PyTorchRNNWrapper
 
     width = config["width"]
     depth = config["depth"]
@@ -1,4 +1,4 @@
-from thinc.layers import concatenate, reduce_max, reduce_mean, siamese, CauchySimilarity
+from thinc.api import concatenate, reduce_max, reduce_mean, siamese, CauchySimilarity
 
 from .pipes import Pipe
 from ..language import component
@@ -63,8 +63,7 @@ class SimilarityHook(Pipe):
     @classmethod
     def Model(cls, length):
         return siamese(
-            concatenate(reduce_max(), reduce_mean()),
-            CauchySimilarity(length * 2)
+            concatenate(reduce_max(), reduce_mean()), CauchySimilarity(length * 2)
         )
 
     def __call__(self, doc):
@@ -3,8 +3,8 @@ from collections import defaultdict
 import numpy
 cimport numpy as np
 
-from thinc.layers import chain, list2array
-from thinc.util import to_categorical, copy_array, get_array_module
+from thinc.api import chain, list2array, to_categorical, get_array_module
+from thinc.util import copy_array
 
 from .. import util
 from .pipes import Pipe
@@ -3,11 +3,9 @@
 import numpy
 import srsly
 import random
-from thinc.layers import chain, Linear, Maxout, Softmax, LayerNorm, list2array
-from thinc.initializers import zero_init
-from thinc.loss import CosineDistance
-from thinc.util import to_categorical, get_array_module
-from thinc.model import set_dropout_rate
+from thinc.api import chain, Linear, Maxout, Softmax, LayerNorm, list2array
+from thinc.api import zero_init, CosineDistance, to_categorical, get_array_module
+from thinc.api import set_dropout_rate
 
 from ..tokens.doc cimport Doc
 from ..syntax.nn_parser cimport Parser
@@ -1,3 +1,5 @@
+from thinc.api import Model, set_dropout_rate
+
 from .pipes import Pipe
 from ..gold import Example
 from ..tokens import Doc
@@ -5,8 +7,6 @@ from ..vocab import Vocab
 from ..language import component
 from ..util import link_vectors_to_models, minibatch, registry, eg2doc
 
-from thinc.model import Model, set_dropout_rate
-
 
 @component("tok2vec", assigns=["doc.tensor"])
 class Tok2Vec(Pipe):
@@ -39,7 +39,9 @@ class Tok2Vec(Pipe):
         self.listeners = []
 
     def create_listener(self):
-        listener = Tok2VecListener(upstream_name="tok2vec", width=self.model.get_dim("nO"))
+        listener = Tok2VecListener(
+            upstream_name="tok2vec", width=self.model.get_dim("nO")
+        )
         self.listeners.append(listener)
 
     def add_listener(self, listener):
@@ -115,7 +117,7 @@ class Tok2Vec(Pipe):
 
         def capture_losses(d_tokvecs):
             """Accumulate tok2vec loss before doing backprop."""
-            l2_loss = sum((d_t2v**2).sum() for d_t2v in d_tokvecs)
+            l2_loss = sum((d_t2v ** 2).sum() for d_t2v in d_tokvecs)
             if self.name in losses:
                 losses[self.name] += l2_loss / len(d_tokvecs)
             else:
@@ -133,7 +135,9 @@ class Tok2Vec(Pipe):
     def get_loss(self, docs, golds, scores):
         pass
 
-    def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs):
+    def begin_training(
+        self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs
+    ):
         """Allocate models and pre-process training data
 
         get_examples (function): Function returning example training data.
@@ -151,6 +155,7 @@ class Tok2VecListener(Model):
     """A layer that gets fed its answers from an upstream connection,
     for instance from a component earlier in the pipeline.
     """
+
     name = "tok2vec-listener"
 
     def __init__(self, upstream_name, width):
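Tok2VecListener implements a producer/consumer split: the Tok2Vec component runs the shared embedding model once per batch and pushes its output into listener layers, which downstream components call in place of recomputing token vectors. A toy illustration of the idea (not spaCy's actual implementation):

    class ToyListener:
        def __init__(self):
            self._outputs = None

        def receive(self, outputs):
            self._outputs = outputs  # the upstream producer pushes results here

        def __call__(self, tokens):
            return self._outputs  # consumers read the cached upstream output

    listener = ToyListener()
    listener.receive([[0.1, 0.2], [0.3, 0.4]])  # e.g. one vector per token
    print(listener(["two", "tokens"]))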
				
			
@@ -11,9 +11,7 @@ from libc.string cimport memset, memcpy
 from libc.stdlib cimport calloc, free, realloc
 from cymem.cymem cimport Pool
 from thinc.extra.search cimport Beam
-from thinc.layers import Linear
-from thinc.model import Model
-from thinc.backends import CupyOps, NumpyOps, use_ops
+from thinc.api import Linear, Model, CupyOps, NumpyOps, use_ops
 from thinc.backends.linalg cimport Vec, VecVec
 cimport blis.cy
 
@@ -1,11 +1,8 @@
 # cython: infer_types=True
 # cython: cdivision=True
 # cython: boundscheck=False
-import numpy
 cimport cython.parallel
-import numpy.random
 cimport numpy as np
-from itertools import islice
 from cpython.ref cimport PyObject, Py_XDECREF
 from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
 from libc.math cimport exp
@@ -14,15 +11,16 @@ from libc.string cimport memset, memcpy
 from libc.stdlib cimport calloc, free
 from cymem.cymem cimport Pool
 from thinc.extra.search cimport Beam
-from thinc.layers import chain, clone, Linear, list2array
-from thinc.backends import NumpyOps, CupyOps, use_ops
-from thinc.util import get_array_module
 from thinc.backends.linalg cimport Vec, VecVec
-from thinc.initializers import zero_init
-from thinc.model import set_dropout_rate
-import srsly
 
-from spacy.gold import Example
+from thinc.api import chain, clone, Linear, list2array, NumpyOps, CupyOps, use_ops
+from thinc.api import get_array_module, zero_init, set_dropout_rate
+from itertools import islice
+import srsly
+import numpy.random
+import numpy
+
+from ..gold import Example
 from ..typedefs cimport weight_t, class_t, hash_t
 from ._parser_model cimport alloc_activations, free_activations
 from ._parser_model cimport predict_states, arg_max_if_valid
@@ -6,7 +6,7 @@ scheme.
 """
 from copy import copy
 
-from spacy.gold import Example
+from ..gold import Example
 from ..tokens.doc cimport Doc, set_children_from_heads
 from ..errors import Errors
 
@@ -1,4 +1,3 @@
-
 import pytest
 import numpy
 from spacy.tokens import Doc, Span
@@ -274,7 +273,19 @@ def test_doc_is_nered(en_vocab):
 def test_doc_from_array_sent_starts(en_vocab):
     words = ["I", "live", "in", "New", "York", ".", "I", "like", "cats", "."]
     heads = [0, 0, 0, 0, 0, 0, 6, 6, 6, 6]
-    deps = ["ROOT", "dep", "dep", "dep", "dep", "dep", "ROOT", "dep", "dep", "dep", "dep"]
+    deps = [
+        "ROOT",
+        "dep",
+        "dep",
+        "dep",
+        "dep",
+        "dep",
+        "ROOT",
+        "dep",
+        "dep",
+        "dep",
+        "dep",
+    ]
     doc = Doc(en_vocab, words=words)
     for i, (dep, head) in enumerate(zip(deps, heads)):
         doc[i].dep_ = dep
@@ -29,7 +29,9 @@ def test_morph_props(i_has):
 
 def test_morph_iter(i_has):
     assert set(i_has[0].morph) == set(["PronType=prs"])
-    assert set(i_has[1].morph) == set(["Number=sing", "Person=three", "Tense=pres", "VerbForm=fin"])
+    assert set(i_has[1].morph) == set(
+        ["Number=sing", "Person=three", "Tense=pres", "VerbForm=fin"]
+    )
 
 
 def test_morph_get(i_has):
@@ -8,7 +8,12 @@ from ..util import get_doc
 
 def test_doc_retokenize_merge(en_tokenizer):
     text = "WKRO played songs by the beach boys all night"
-    attrs = {"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE", "morph": "Number=Plur"}
+    attrs = {
+        "tag": "NAMED",
+        "lemma": "LEMMA",
+        "ent_type": "TYPE",
+        "morph": "Number=Plur",
+    }
     doc = en_tokenizer(text)
     assert len(doc) == 9
     with doc.retokenize() as retokenizer:
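Wraps like the attrs dict above are what black produces once a line, including its indentation, exceeds the 88-character default: one element per line plus a trailing comma. A sketch using black's Python API (assumes the black package is installed; spaCy's contributing guide names black as its formatter):

    import black

    src = (
        "def test():\n"
        '    attrs = {"tag": "NAMED", "lemma": "LEMMA", '
        '"ent_type": "TYPE", "morph": "Number=Plur"}\n'
    )
    print(black.format_str(src, mode=black.Mode()))
    # def test():
    #     attrs = {
    #         "tag": "NAMED",
    #         ...
    #     }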
				
			
@@ -1,4 +1,3 @@
-
 def test_ar_tokenizer_handles_long_text(ar_tokenizer):
     text = """نجيب محفوظ مؤلف و كاتب روائي عربي، يعد من أهم الأدباء العرب خلال القرن العشرين.
     ولد نجيب محفوظ في مدينة القاهرة، حيث ترعرع و تلقى تعليمه الجامعي في جامعتها،
Some files were not shown because too many files have changed in this diff.