Tidy up and auto-format

This commit is contained in:
Ines Montani 2020-02-18 15:38:18 +01:00
parent 1278161f47
commit e3f40a6a0f
127 changed files with 219 additions and 275 deletions

View File

@ -5,7 +5,7 @@ warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed") warnings.filterwarnings("ignore", message="numpy.ufunc size changed")
# These are imported as part of the API # These are imported as part of the API
from thinc.util import prefer_gpu, require_gpu from thinc.api import prefer_gpu, require_gpu
from . import pipeline from . import pipeline
from .cli.info import info as cli_info from .cli.info import info as cli_info

View File

@ -192,11 +192,7 @@ def debug_data(
has_ws_ents_error = True has_ws_ents_error = True
if gold_train_data["punct_ents"]: if gold_train_data["punct_ents"]:
msg.warn( msg.warn(f"{gold_train_data['punct_ents']} entity span(s) with punctuation")
"{} entity span(s) with punctuation".format(
gold_train_data["punct_ents"]
)
)
has_punct_ents_warning = True has_punct_ents_warning = True
for label in new_labels: for label in new_labels:

View File

@ -4,14 +4,12 @@ import time
import re import re
from collections import Counter from collections import Counter
from pathlib import Path from pathlib import Path
from thinc.layers import Linear, Maxout from thinc.api import Linear, Maxout, chain, list2array, prefer_gpu
from thinc.util import prefer_gpu from thinc.api import CosineDistance, L2Distance
from wasabi import msg from wasabi import msg
import srsly import srsly
from thinc.layers import chain, list2array
from thinc.loss import CosineDistance, L2Distance
from spacy.gold import Example from ..gold import Example
from ..errors import Errors from ..errors import Errors
from ..tokens import Doc from ..tokens import Doc
from ..attrs import ID, HEAD from ..attrs import ID, HEAD
@ -85,7 +83,7 @@ def pretrain(
) )
if not output_dir.exists(): if not output_dir.exists():
output_dir.mkdir() output_dir.mkdir()
msg.good("Created output directory: {}".format(output_dir)) msg.good(f"Created output directory: {output_dir}")
srsly.write_json(output_dir / "config.json", config) srsly.write_json(output_dir / "config.json", config)
msg.good("Saved settings to config.json") msg.good("Saved settings to config.json")

View File

@ -1,7 +1,7 @@
import os import os
import tqdm import tqdm
from pathlib import Path from pathlib import Path
from thinc.backends import use_ops from thinc.api import use_ops
from timeit import default_timer as timer from timeit import default_timer as timer
import shutil import shutil
import srsly import srsly
@ -89,7 +89,7 @@ def train(
) )
if not output_path.exists(): if not output_path.exists():
output_path.mkdir() output_path.mkdir()
msg.good("Created output directory: {}".format(output_path)) msg.good(f"Created output directory: {output_path}")
tag_map = {} tag_map = {}
if tag_map_path is not None: if tag_map_path is not None:
@ -125,17 +125,17 @@ def train(
msg.text(f"Training pipeline: {pipeline}") msg.text(f"Training pipeline: {pipeline}")
disabled_pipes = None disabled_pipes = None
pipes_added = False pipes_added = False
msg.text("Training pipeline: {}".format(pipeline)) msg.text(f"Training pipeline: {pipeline}")
if use_gpu >= 0: if use_gpu >= 0:
activated_gpu = None activated_gpu = None
try: try:
activated_gpu = set_gpu(use_gpu) activated_gpu = set_gpu(use_gpu)
except Exception as e: except Exception as e:
msg.warn("Exception: {}".format(e)) msg.warn(f"Exception: {e}")
if activated_gpu is not None: if activated_gpu is not None:
msg.text("Using GPU: {}".format(use_gpu)) msg.text(f"Using GPU: {use_gpu}")
else: else:
msg.warn("Unable to activate GPU: {}".format(use_gpu)) msg.warn(f"Unable to activate GPU: {use_gpu}")
msg.text("Using CPU only") msg.text("Using CPU only")
use_gpu = -1 use_gpu = -1
if base_model: if base_model:
@ -158,11 +158,11 @@ def train(
"positive_label": textcat_positive_label, "positive_label": textcat_positive_label,
} }
if pipe not in nlp.pipe_names: if pipe not in nlp.pipe_names:
msg.text("Adding component to base model '{}'".format(pipe)) msg.text(f"Adding component to base model '{pipe}'")
nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg)) nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))
pipes_added = True pipes_added = True
elif replace_components: elif replace_components:
msg.text("Replacing component from base model '{}'".format(pipe)) msg.text(f"Replacing component from base model '{pipe}'")
nlp.replace_pipe(pipe, nlp.create_pipe(pipe, config=pipe_cfg)) nlp.replace_pipe(pipe, nlp.create_pipe(pipe, config=pipe_cfg))
pipes_added = True pipes_added = True
else: else:
@ -180,7 +180,7 @@ def train(
f"Existing cfg: {base_cfg}, provided cfg: {pipe_cfg}", f"Existing cfg: {base_cfg}, provided cfg: {pipe_cfg}",
exits=1, exits=1,
) )
msg.text("Extending component from base model '{}'".format(pipe)) msg.text(f"Extending component from base model '{pipe}'")
disabled_pipes = nlp.disable_pipes( disabled_pipes = nlp.disable_pipes(
[p for p in nlp.pipe_names if p not in pipeline] [p for p in nlp.pipe_names if p not in pipeline]
) )
@ -377,7 +377,7 @@ def train(
msg.warn( msg.warn(
"Did you provide the same parameters during 'train' as during 'pretrain'?" "Did you provide the same parameters during 'train' as during 'pretrain'?"
) )
msg.fail("Original error message: {}".format(e), exits=1) msg.fail(f"Original error message: {e}", exits=1)
if raw_text: if raw_text:
# If raw text is available, perform 'rehearsal' updates, # If raw text is available, perform 'rehearsal' updates,
# which use unlabelled data to reduce overfitting. # which use unlabelled data to reduce overfitting.
@ -504,11 +504,7 @@ def train(
) )
break break
except Exception as e: except Exception as e:
msg.warn( msg.warn(f"Aborting and saving final best model. Encountered exception: {e}")
"Aborting and saving the final best model. Encountered exception: {}".format(
e
)
)
finally: finally:
best_pipes = nlp.pipe_names best_pipes = nlp.pipe_names
if disabled_pipes: if disabled_pipes:

View File

@ -1,19 +1,20 @@
from typing import Optional, Dict, List, Union, Sequence
import plac import plac
from thinc.util import require_gpu
from wasabi import msg from wasabi import msg
from pathlib import Path from pathlib import Path
import thinc import thinc
import thinc.schedules import thinc.schedules
from thinc.model import Model from thinc.api import Model
from spacy.gold import GoldCorpus
import spacy
from spacy.pipeline.tok2vec import Tok2VecListener
from typing import Optional, Dict, List, Union, Sequence
from pydantic import BaseModel, FilePath, StrictInt from pydantic import BaseModel, FilePath, StrictInt
import tqdm import tqdm
from ..ml import component_models # TODO: relative imports?
from .. import util import spacy
from spacy.gold import GoldCorpus
from spacy.pipeline.tok2vec import Tok2VecListener
from spacy.ml import component_models
from spacy import util
registry = util.registry registry = util.registry
@ -153,10 +154,9 @@ def create_tb_parser_model(
hidden_width: StrictInt = 64, hidden_width: StrictInt = 64,
maxout_pieces: StrictInt = 3, maxout_pieces: StrictInt = 3,
): ):
from thinc.layers import Linear, chain, list2array from thinc.api import Linear, chain, list2array, use_ops, zero_init
from spacy.ml._layers import PrecomputableAffine from spacy.ml._layers import PrecomputableAffine
from spacy.syntax._parser_model import ParserModel from spacy.syntax._parser_model import ParserModel
from thinc.api import use_ops, zero_init
token_vector_width = tok2vec.get_dim("nO") token_vector_width = tok2vec.get_dim("nO")
tok2vec = chain(tok2vec, list2array()) tok2vec = chain(tok2vec, list2array())
@ -221,13 +221,9 @@ def train_from_config_cli(
def train_from_config( def train_from_config(
config_path, config_path, data_paths, raw_text=None, meta_path=None, output_path=None,
data_paths,
raw_text=None,
meta_path=None,
output_path=None,
): ):
msg.info("Loading config from: {}".format(config_path)) msg.info(f"Loading config from: {config_path}")
config = util.load_from_config(config_path, create_objects=True) config = util.load_from_config(config_path, create_objects=True)
use_gpu = config["training"]["use_gpu"] use_gpu = config["training"]["use_gpu"]
if use_gpu >= 0: if use_gpu >= 0:
@ -241,9 +237,7 @@ def train_from_config(
msg.info("Loading training corpus") msg.info("Loading training corpus")
corpus = GoldCorpus(data_paths["train"], data_paths["dev"], limit=limit) corpus = GoldCorpus(data_paths["train"], data_paths["dev"], limit=limit)
msg.info("Initializing the nlp pipeline") msg.info("Initializing the nlp pipeline")
nlp.begin_training( nlp.begin_training(lambda: corpus.train_examples, device=use_gpu)
lambda: corpus.train_examples, device=use_gpu
)
train_batches = create_train_batches(nlp, corpus, config["training"]) train_batches = create_train_batches(nlp, corpus, config["training"])
evaluate = create_evaluation_callback(nlp, optimizer, corpus, config["training"]) evaluate = create_evaluation_callback(nlp, optimizer, corpus, config["training"])
@ -260,7 +254,7 @@ def train_from_config(
config["training"]["eval_frequency"], config["training"]["eval_frequency"],
) )
msg.info("Training. Initial learn rate: {}".format(optimizer.learn_rate)) msg.info(f"Training. Initial learn rate: {optimizer.learn_rate}")
print_row = setup_printer(config) print_row = setup_printer(config)
try: try:
@ -414,7 +408,7 @@ def subdivide_batch(batch):
def setup_printer(config): def setup_printer(config):
score_cols = config["training"]["scores"] score_cols = config["training"]["scores"]
score_widths = [max(len(col), 6) for col in score_cols] score_widths = [max(len(col), 6) for col in score_cols]
loss_cols = ["Loss {}".format(pipe) for pipe in config["nlp"]["pipeline"]] loss_cols = [f"Loss {pipe}" for pipe in config["nlp"]["pipeline"]]
loss_widths = [max(len(col), 8) for col in loss_cols] loss_widths = [max(len(col), 8) for col in loss_cols]
table_header = ["#"] + loss_cols + score_cols + ["Score"] table_header = ["#"] + loss_cols + score_cols + ["Score"]
table_header = [col.upper() for col in table_header] table_header = [col.upper() for col in table_header]

View File

@ -30,7 +30,7 @@ try:
except ImportError: except ImportError:
cupy = None cupy = None
from thinc.optimizers import Optimizer # noqa: F401 from thinc.api import Optimizer # noqa: F401
pickle = pickle pickle = pickle
copy_reg = copy_reg copy_reg = copy_reg

View File

@ -1,4 +1,3 @@
# Setting explicit height and max-width: none on the SVG is required for # Setting explicit height and max-width: none on the SVG is required for
# Jupyter to render it properly in a cell # Jupyter to render it properly in a cell

View File

@ -1,4 +1,3 @@
def explain(term): def explain(term):
"""Get a description for a given POS tag, dependency label or entity type. """Get a description for a given POS tag, dependency label or entity type.

View File

@ -1,6 +1,6 @@
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
from spacy.tokens import Doc from .tokens import Doc
from .typedefs cimport attr_t from .typedefs cimport attr_t
from .syntax.transition_system cimport Transition from .syntax.transition_system cimport Transition
@ -65,5 +65,3 @@ cdef class Example:
cdef public TokenAnnotation token_annotation cdef public TokenAnnotation token_annotation
cdef public DocAnnotation doc_annotation cdef public DocAnnotation doc_annotation
cdef public object goldparse cdef public object goldparse

View File

@ -6,7 +6,7 @@ from libcpp.vector cimport vector
from libc.stdint cimport int32_t, int64_t from libc.stdint cimport int32_t, int64_t
from libc.stdio cimport FILE from libc.stdio cimport FILE
from spacy.vocab cimport Vocab from .vocab cimport Vocab
from .typedefs cimport hash_t from .typedefs cimport hash_t
from .structs cimport KBEntryC, AliasC from .structs cimport KBEntryC, AliasC
@ -169,4 +169,3 @@ cdef class Reader:
cdef int read_alias(self, int64_t* entry_index, float* prob) except -1 cdef int read_alias(self, int64_t* entry_index, float* prob) except -1
cdef int _read(self, void* value, size_t size) except -1 cdef int _read(self, void* value, size_t size) except -1

View File

@ -1,4 +1,3 @@
# Source: https://github.com/stopwords-iso/stopwords-af # Source: https://github.com/stopwords-iso/stopwords-af
STOP_WORDS = set( STOP_WORDS = set(

View File

@ -1,4 +1,3 @@
# Source: https://github.com/Alir3z4/stop-words # Source: https://github.com/Alir3z4/stop-words
STOP_WORDS = set( STOP_WORDS = set(

View File

@ -1,4 +1,3 @@
""" """
Example sentences to test spaCy and its language models. Example sentences to test spaCy and its language models.

View File

@ -1,4 +1,3 @@
STOP_WORDS = set( STOP_WORDS = set(
""" """
অতএব অথচ অথব অন অন অন অন অনতত অবধি অবশ অর অন অন অরধভ অতএব অথচ অথব অন অন অন অন অনতত অবধি অবশ অর অন অন অরধভ

View File

@ -1,4 +1,3 @@
""" """
Example sentences to test spaCy and its language models. Example sentences to test spaCy and its language models.

View File

@ -1,4 +1,3 @@
# Source: https://github.com/Alir3z4/stop-words # Source: https://github.com/Alir3z4/stop-words
STOP_WORDS = set( STOP_WORDS = set(

View File

@ -1,4 +1,3 @@
""" """
Example sentences to test spaCy and its language models. Example sentences to test spaCy and its language models.

View File

@ -1,4 +1,3 @@
""" """
Example sentences to test spaCy and its language models. Example sentences to test spaCy and its language models.

View File

@ -1,4 +1,3 @@
STOP_WORDS = set( STOP_WORDS = set(
""" """
á a ab aber ach acht achte achten achter achtes ag alle allein allem allen á a ab aber ach acht achte achten achter achtes ag alle allein allem allen

View File

@ -1,4 +1,3 @@
def get_pos_from_wiktionary(): def get_pos_from_wiktionary():
import re import re
from gensim.corpora.wikicorpus import extract_pages from gensim.corpora.wikicorpus import extract_pages

View File

@ -1,4 +1,3 @@
# These exceptions are used to add NORM values based on a token's ORTH value. # These exceptions are used to add NORM values based on a token's ORTH value.
# Norms are only set if no alternative is provided in the tokenizer exceptions. # Norms are only set if no alternative is provided in the tokenizer exceptions.

View File

@ -1,4 +1,3 @@
# Stop words # Stop words
# Link to greek stop words: https://www.translatum.gr/forum/index.php?topic=3550.0?topic=3550.0 # Link to greek stop words: https://www.translatum.gr/forum/index.php?topic=3550.0?topic=3550.0
STOP_WORDS = set( STOP_WORDS = set(

View File

@ -1,4 +1,3 @@
""" """
Example sentences to test spaCy and its language models. Example sentences to test spaCy and its language models.

View File

@ -1,4 +1,3 @@
_exc = { _exc = {
# Slang and abbreviations # Slang and abbreviations
"cos": "because", "cos": "because",

View File

@ -1,4 +1,3 @@
# Stop words # Stop words
STOP_WORDS = set( STOP_WORDS = set(
""" """

View File

@ -1,4 +1,3 @@
""" """
Example sentences to test spaCy and its language models. Example sentences to test spaCy and its language models.

View File

@ -1,4 +1,3 @@
STOP_WORDS = set( STOP_WORDS = set(
""" """
actualmente acuerdo adelante ademas además adrede afirmó agregó ahi ahora ahí actualmente acuerdo adelante ademas además adrede afirmó agregó ahi ahora ahí

View File

@ -1,4 +1,3 @@
# Source: https://github.com/stopwords-iso/stopwords-et # Source: https://github.com/stopwords-iso/stopwords-et
STOP_WORDS = set( STOP_WORDS = set(

View File

@ -1,4 +1,3 @@
""" """
Example sentences to test spaCy and its language models. Example sentences to test spaCy and its language models.

View File

@ -1,4 +1,3 @@
verb_roots = """ verb_roots = """
#هست #هست
آخت#آهنج آخت#آهنج

View File

@ -1,4 +1,3 @@
# Stop words from HAZM package # Stop words from HAZM package
STOP_WORDS = set( STOP_WORDS = set(
""" """

View File

@ -1,4 +1,3 @@
# Source https://github.com/stopwords-iso/stopwords-fi/blob/master/stopwords-fi.txt # Source https://github.com/stopwords-iso/stopwords-fi/blob/master/stopwords-fi.txt
# Reformatted with some minor corrections # Reformatted with some minor corrections
STOP_WORDS = set( STOP_WORDS = set(

View File

@ -1,4 +1,3 @@
""" """
Example sentences to test spaCy and its language models. Example sentences to test spaCy and its language models.

View File

@ -1,4 +1,3 @@
STOP_WORDS = set( STOP_WORDS = set(
""" """
a à â abord absolument afin ah ai aie ailleurs ainsi ait allaient allo allons a à â abord absolument afin ah ai aie ailleurs ainsi ait allaient allo allons

View File

@ -1,4 +1,3 @@
# fmt: off # fmt: off
consonants = ["b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "z"] consonants = ["b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "z"]
broad_vowels = ["a", "á", "o", "ó", "u", "ú"] broad_vowels = ["a", "á", "o", "ó", "u", "ú"]

View File

@ -1,4 +1,3 @@
""" """
Example sentences to test spaCy and its language models. Example sentences to test spaCy and its language models.

View File

@ -1,4 +1,3 @@
""" """
Example sentences to test spaCy and its language models. Example sentences to test spaCy and its language models.

View File

@ -1,4 +1,3 @@
# Source: https://github.com/taranjeet/hindi-tokenizer/blob/master/stopwords.txt, https://data.mendeley.com/datasets/bsr3frvvjc/1#file-a21d5092-99d7-45d8-b044-3ae9edd391c6 # Source: https://github.com/taranjeet/hindi-tokenizer/blob/master/stopwords.txt, https://data.mendeley.com/datasets/bsr3frvvjc/1#file-a21d5092-99d7-45d8-b044-3ae9edd391c6
STOP_WORDS = set( STOP_WORDS = set(

View File

@ -1,4 +1,3 @@
""" """
Example sentences to test spaCy and its language models. Example sentences to test spaCy and its language models.

View File

@ -1,4 +1,3 @@
STOP_WORDS = set( STOP_WORDS = set(
""" """
a abban ahhoz ahogy ahol aki akik akkor akár alatt amely amelyek amelyekben a abban ahhoz ahogy ahol aki akik akkor akár alatt amely amelyek amelyekben

View File

@ -1,4 +1,3 @@
""" """
Example sentences to test spaCy and its language models. Example sentences to test spaCy and its language models.

View File

@ -1,4 +1,3 @@
# Source: https://github.com/Xangis/extra-stopwords # Source: https://github.com/Xangis/extra-stopwords
STOP_WORDS = set( STOP_WORDS = set(

View File

@ -1,4 +1,3 @@
""" """
Example sentences to test spaCy and its language models. Example sentences to test spaCy and its language models.

View File

@ -1,4 +1,3 @@
STOP_WORDS = set( STOP_WORDS = set(
""" """
a abbastanza abbia abbiamo abbiano abbiate accidenti ad adesso affinche agl a abbastanza abbia abbiamo abbiano abbiate accidenti ad adesso affinche agl

View File

@ -1,4 +1,3 @@
""" """
Example sentences to test spaCy and its language models. Example sentences to test spaCy and its language models.

View File

@ -1,4 +1,3 @@
STOP_WORDS = set( STOP_WORDS = set(
""" """
ಹಲವ ಹಲವ

View File

@ -1,4 +1,3 @@
""" """
Example sentences to test spaCy and its language models. Example sentences to test spaCy and its language models.

View File

@ -1,4 +1,3 @@
# Source: https://github.com/stopwords-iso/stopwords-lv # Source: https://github.com/stopwords-iso/stopwords-lv
STOP_WORDS = set( STOP_WORDS = set(

View File

@ -1,4 +1,3 @@
# Source: https://github.com/stopwords-iso/stopwords-mr/blob/master/stopwords-mr.txt, https://github.com/6/stopwords-json/edit/master/dist/mr.json # Source: https://github.com/stopwords-iso/stopwords-mr/blob/master/stopwords-mr.txt, https://github.com/6/stopwords-json/edit/master/dist/mr.json
STOP_WORDS = set( STOP_WORDS = set(
""" """

View File

@ -1,4 +1,3 @@
""" """
Example sentences to test spaCy and its language models. Example sentences to test spaCy and its language models.

View File

@ -1,4 +1,3 @@
""" """
Example sentences to test spaCy and its language models. Example sentences to test spaCy and its language models.

View File

@ -1,4 +1,3 @@
# These exceptions are used to add NORM values based on a token's ORTH value. # These exceptions are used to add NORM values based on a token's ORTH value.
# Individual languages can also add their own exceptions and overwrite them - # Individual languages can also add their own exceptions and overwrite them -
# for example, British vs. American spelling in English. # for example, British vs. American spelling in English.

View File

@ -1,4 +1,3 @@
""" """
Example sentences to test spaCy and its language models. Example sentences to test spaCy and its language models.

View File

@ -1,4 +1,3 @@
""" """
Example sentences to test spaCy and its language models. Example sentences to test spaCy and its language models.

View File

@ -1,4 +1,3 @@
STOP_WORDS = set( STOP_WORDS = set(
""" """
à às área acerca ademais adeus agora ainda algo algumas alguns ali além ambas ambos antes à às área acerca ademais adeus agora ainda algo algumas alguns ali além ambas ambos antes

View File

@ -1,4 +1,3 @@
""" """
Example sentences to test spaCy and its language models. Example sentences to test spaCy and its language models.

View File

@ -1,4 +1,3 @@
""" """
Example sentences to test spaCy and its language models. Example sentences to test spaCy and its language models.

View File

@ -1,4 +1,3 @@
_exc = { _exc = {
# Slang # Slang
"прив": "привет", "прив": "привет",

View File

@ -1,4 +1,3 @@
""" """
Example sentences to test spaCy and its language models. Example sentences to test spaCy and its language models.

View File

@ -1,4 +1,3 @@
STOP_WORDS = set( STOP_WORDS = set(
""" """
අතර අතර

View File

@ -1,4 +1,3 @@
# Source: https://github.com/Ardevop-sk/stopwords-sk # Source: https://github.com/Ardevop-sk/stopwords-sk
STOP_WORDS = set( STOP_WORDS = set(

View File

@ -1,4 +1,3 @@
# Source: https://github.com/stopwords-iso/stopwords-sl # Source: https://github.com/stopwords-iso/stopwords-sl
# TODO: probably needs to be tidied up the list seems to have month names in # TODO: probably needs to be tidied up the list seems to have month names in
# it, which shouldn't be considered stop words. # it, which shouldn't be considered stop words.

View File

@ -1,4 +1,3 @@
""" """
Example sentences to test spaCy and its language models. Example sentences to test spaCy and its language models.

View File

@ -1,4 +1,3 @@
# Source: https://github.com/andrixh/index-albanian # Source: https://github.com/andrixh/index-albanian
STOP_WORDS = set( STOP_WORDS = set(

View File

@ -1,4 +1,3 @@
""" """
Example sentences to test spaCy and its language models. Example sentences to test spaCy and its language models.

View File

@ -1,4 +1,3 @@
_exc = { _exc = {
# Slang # Slang
"ћале": "отац", "ћале": "отац",

View File

@ -1,4 +1,3 @@
STOP_WORDS = set( STOP_WORDS = set(
""" """
а а

View File

@ -1,4 +1,3 @@
""" """
Example sentences to test spaCy and its language models. Example sentences to test spaCy and its language models.

View File

@ -1,4 +1,3 @@
STOP_WORDS = set( STOP_WORDS = set(
""" """
aderton adertonde adjö aldrig alla allas allt alltid alltså än andra andras aderton adertonde adjö aldrig alla allas allt alltid alltså än andra andras

View File

@ -1,4 +1,3 @@
""" """
Example sentences to test spaCy and its language models. Example sentences to test spaCy and its language models.

View File

@ -1,4 +1,3 @@
# Stop words # Stop words
STOP_WORDS = set( STOP_WORDS = set(

View File

@ -1,4 +1,3 @@
""" """
Example sentences to test spaCy and its language models. Example sentences to test spaCy and its language models.

View File

@ -1,4 +1,3 @@
_exc = { _exc = {
# Conjugation and Diversion invalid to Tonal form (ผันอักษรและเสียงไม่ตรงกับรูปวรรณยุกต์) # Conjugation and Diversion invalid to Tonal form (ผันอักษรและเสียงไม่ตรงกับรูปวรรณยุกต์)
"สนุ๊กเกอร์": "สนุกเกอร์", "สนุ๊กเกอร์": "สนุกเกอร์",

View File

@ -34,7 +34,7 @@ URL_PATTERN = (
r"|" r"|"
# host & domain names # host & domain names
# mods: match is case-sensitive, so include [A-Z] # mods: match is case-sensitive, so include [A-Z]
"(?:" "(?:" # noqa: E131
"(?:" "(?:"
"[A-Za-z0-9\u00a1-\uffff]" "[A-Za-z0-9\u00a1-\uffff]"
"[A-Za-z0-9\u00a1-\uffff_-]{0,62}" "[A-Za-z0-9\u00a1-\uffff_-]{0,62}"

View File

@ -1,4 +1,3 @@
""" """
Example sentences to test spaCy and its language models. Example sentences to test spaCy and its language models.
>>> from spacy.lang.tr.examples import sentences >>> from spacy.lang.tr.examples import sentences

View File

@ -1,4 +1,3 @@
""" """
Example sentences to test spaCy and its language models. Example sentences to test spaCy and its language models.

View File

@ -1,4 +1,3 @@
""" """
Example sentences to test spaCy and its language models. Example sentences to test spaCy and its language models.

View File

@ -1,4 +1,3 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS from ..norm_exceptions import BASE_NORMS
from ...language import Language from ...language import Language

View File

@ -1,4 +1,3 @@
""" """
Example sentences to test spaCy and its language models. Example sentences to test spaCy and its language models.

View File

@ -1,4 +1,3 @@
""" """
Example sentences to test spaCy and its language models. Example sentences to test spaCy and its language models.

View File

@ -1,4 +1,3 @@
""" """
Example sentences to test spaCy and its language models. Example sentences to test spaCy and its language models.

View File

@ -4,7 +4,7 @@ import weakref
import functools import functools
from contextlib import contextmanager from contextlib import contextmanager
from copy import copy, deepcopy from copy import copy, deepcopy
from thinc.backends import get_current_ops from thinc.api import get_current_ops
import srsly import srsly
import multiprocessing as mp import multiprocessing as mp
from itertools import chain, cycle from itertools import chain, cycle

View File

@ -6,7 +6,7 @@ cimport numpy as np
np.import_array() np.import_array()
import numpy import numpy
from thinc.util import get_array_module from thinc.api import get_array_module
from .typedefs cimport attr_t, flags_t from .typedefs cimport attr_t, flags_t
from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE

View File

@ -3,18 +3,20 @@ from thinc.api import Model
def CharacterEmbed(nM, nC): def CharacterEmbed(nM, nC):
# nM: Number of dimensions per character. nC: Number of characters. # nM: Number of dimensions per character. nC: Number of characters.
nO = nM*nC if (nM is not None and nC is not None) else None nO = nM * nC if (nM is not None and nC is not None) else None
return Model( return Model(
"charembed", "charembed",
forward, forward,
init=init, init=init,
dims={"nM": nM, "nC": nC, "nO": nO, "nV": 256}, dims={"nM": nM, "nC": nC, "nO": nO, "nV": 256},
params={"E": None} params={"E": None},
).initialize() ).initialize()
def init(model, X=None, Y=None): def init(model, X=None, Y=None):
vectors_table = model.ops.alloc3f(model.get_dim("nC"), model.get_dim("nV"), model.get_dim("nM")) vectors_table = model.ops.alloc3f(
model.get_dim("nC"), model.get_dim("nV"), model.get_dim("nM")
)
model.set_param("E", vectors_table) model.set_param("E", vectors_table)

View File

@ -1,5 +1,4 @@
from thinc.model import Model from thinc.api import Model, normal_init
from thinc.api import normal_init
def PrecomputableAffine(nO, nI, nF, nP): def PrecomputableAffine(nO, nI, nF, nP):
@ -20,9 +19,7 @@ def forward(model, X, is_train):
nP = model.get_dim("nP") nP = model.get_dim("nP")
nI = model.get_dim("nI") nI = model.get_dim("nI")
W = model.get_param("W") W = model.get_param("W")
Yf = model.ops.gemm( Yf = model.ops.gemm(X, W.reshape((nF * nO * nP, nI)), trans2=True)
X, W.reshape((nF * nO * nP, nI)), trans2=True
)
Yf = Yf.reshape((Yf.shape[0], nF, nO, nP)) Yf = Yf.reshape((Yf.shape[0], nF, nO, nP))
Yf = model.ops.xp.vstack((model.get_param("pad"), Yf)) Yf = model.ops.xp.vstack((model.get_param("pad"), Yf))
@ -44,7 +41,7 @@ def forward(model, X, is_train):
assert dY.ndim == 3 assert dY.ndim == 3
assert dY.shape[1] == nO, dY.shape assert dY.shape[1] == nO, dY.shape
assert dY.shape[2] == nP, dY.shape assert dY.shape[2] == nP, dY.shape
nB = dY.shape[0] # nB = dY.shape[0]
model.inc_grad("pad", _backprop_precomputable_affine_padding(model, dY, ids)) model.inc_grad("pad", _backprop_precomputable_affine_padding(model, dY, ids))
Xf = X[ids] Xf = X[ids]
Xf = Xf.reshape((Xf.shape[0], nF * nI)) Xf = Xf.reshape((Xf.shape[0], nF * nI))
@ -118,7 +115,7 @@ def init(model, X=None, Y=None):
pad = model.ops.alloc4f(1, nF, nO, nP) pad = model.ops.alloc4f(1, nF, nO, nP)
ops = model.ops ops = model.ops
W = normal_init(ops, W.shape, fan_in=nF*nI) W = normal_init(ops, W.shape, fan_in=nF * nI)
model.set_param("W", W) model.set_param("W", W)
model.set_param("b", b) model.set_param("b", b)
model.set_param("pad", pad) model.set_param("pad", pad)

View File

@ -9,7 +9,7 @@ from thinc.api import Model, Maxout, Linear, residual, reduce_mean, list2ragged
from thinc.api import PyTorchLSTM, add, MultiSoftmax, HashEmbed, StaticVectors from thinc.api import PyTorchLSTM, add, MultiSoftmax, HashEmbed, StaticVectors
from thinc.api import expand_window, FeatureExtractor, SparseLinear, chain from thinc.api import expand_window, FeatureExtractor, SparseLinear, chain
from thinc.api import clone, concatenate, with_array, Softmax, Logistic, uniqued from thinc.api import clone, concatenate, with_array, Softmax, Logistic, uniqued
from thinc.api import zero_init, glorot_uniform_init from thinc.api import zero_init
def build_text_classifier(arch, config): def build_text_classifier(arch, config):
@ -33,10 +33,7 @@ def build_simple_cnn_text_classifier(tok2vec, nr_class, exclusive_classes, **cfg
output_layer = Softmax(nO=nr_class, nI=tok2vec.get_dim("nO")) output_layer = Softmax(nO=nr_class, nI=tok2vec.get_dim("nO"))
else: else:
# TODO: experiment with init_w=zero_init # TODO: experiment with init_w=zero_init
output_layer = ( output_layer = Linear(nO=nr_class, nI=tok2vec.get_dim("nO")) >> Logistic()
Linear(nO=nr_class, nI=tok2vec.get_dim("nO"))
>> Logistic()
)
model = tok2vec >> list2ragged() >> reduce_mean() >> output_layer model = tok2vec >> list2ragged() >> reduce_mean() >> output_layer
model.set_ref("tok2vec", tok2vec) model.set_ref("tok2vec", tok2vec)
model.set_dim("nO", nr_class) model.set_dim("nO", nr_class)
@ -149,13 +146,21 @@ def Tok2Vec(
with Model.define_operators({">>": chain, "|": concatenate, "**": clone}): with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
norm = HashEmbed(nO=width, nV=embed_size, column=cols.index(NORM), dropout=0.0) norm = HashEmbed(nO=width, nV=embed_size, column=cols.index(NORM), dropout=0.0)
if subword_features: if subword_features:
prefix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=0.0) prefix = HashEmbed(
suffix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=0.0) nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=0.0
shape = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=0.0) )
suffix = HashEmbed(
nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=0.0
)
shape = HashEmbed(
nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=0.0
)
else: else:
prefix, suffix, shape = (None, None, None) prefix, suffix, shape = (None, None, None)
if pretrained_vectors is not None: if pretrained_vectors is not None:
glove = StaticVectors(vectors=pretrained_vectors, nO=width, column=cols.index(ID), dropout=0.0) glove = StaticVectors(
vectors=pretrained_vectors, nO=width, column=cols.index(ID), dropout=0.0
)
if subword_features: if subword_features:
embed = uniqued( embed = uniqued(

View File

@ -1,5 +1,5 @@
import numpy import numpy
from thinc.model import Model from thinc.api import Model
from ..attrs import LOWER from ..attrs import LOWER
@ -26,9 +26,7 @@ def forward(self, docs, is_train: bool):
# The dtype here matches what thinc is expecting -- which differs per # The dtype here matches what thinc is expecting -- which differs per
# platform (by int definition). This should be fixed once the problem # platform (by int definition). This should be fixed once the problem
# is fixed on Thinc's side. # is fixed on Thinc's side.
lengths = self.ops.asarray( lengths = self.ops.asarray([arr.shape[0] for arr in batch_keys], dtype=numpy.int_)
[arr.shape[0] for arr in batch_keys], dtype=numpy.int_
)
batch_keys = self.ops.xp.concatenate(batch_keys) batch_keys = self.ops.xp.concatenate(batch_keys)
batch_vals = self.ops.asarray(self.ops.xp.concatenate(batch_vals), dtype="f") batch_vals = self.ops.asarray(self.ops.xp.concatenate(batch_vals), dtype="f")
@ -36,4 +34,3 @@ def forward(self, docs, is_train: bool):
return dY return dY
return (batch_keys, batch_vals, lengths), backprop return (batch_keys, batch_vals, lengths), backprop

View File

@ -1,11 +1,8 @@
from thinc.layers import chain, clone, concatenate, with_array, uniqued from thinc.api import Model, chain, clone, concatenate, with_array, uniqued, noop
from thinc.model import Model from thinc.api import with_padded, Maxout, expand_window, HashEmbed, StaticVectors
from thinc.layers import noop, with_padded from thinc.api import residual, LayerNorm, FeatureExtractor
from thinc.layers import Maxout, expand_window
from thinc.layers import HashEmbed, StaticVectors
from thinc.layers import residual, LayerNorm, FeatureExtractor
from spacy.ml import _character_embed from ..ml import _character_embed
from ..util import make_layer, registry from ..util import make_layer, registry
@ -93,8 +90,10 @@ def MaxoutWindowEncoder(config):
nW = config["window_size"] nW = config["window_size"]
nP = config["pieces"] nP = config["pieces"]
depth = config["depth"] depth = config["depth"]
cnn = (
cnn = expand_window(window_size=nW), Maxout(nO=nO, nI=nO * ((nW * 2) + 1), nP=nP, dropout=0.0, normalize=True) expand_window(window_size=nW),
Maxout(nO=nO, nI=nO * ((nW * 2) + 1), nP=nP, dropout=0.0, normalize=True),
)
model = clone(residual(cnn), depth) model = clone(residual(cnn), depth)
model.set_dim("nO", nO) model.set_dim("nO", nO)
model.attrs["receptive_field"] = nW * depth model.attrs["receptive_field"] = nW * depth
@ -103,13 +102,16 @@ def MaxoutWindowEncoder(config):
@registry.architectures.register("spacy.MishWindowEncoder.v1") @registry.architectures.register("spacy.MishWindowEncoder.v1")
def MishWindowEncoder(config): def MishWindowEncoder(config):
from thinc.layers import Mish from thinc.api import Mish
nO = config["width"] nO = config["width"]
nW = config["window_size"] nW = config["window_size"]
depth = config["depth"] depth = config["depth"]
cnn = chain(
cnn = chain(expand_window(window_size=nW), Mish(nO=nO, nI=nO * ((nW * 2) + 1)), LayerNorm(nO)) expand_window(window_size=nW),
Mish(nO=nO, nI=nO * ((nW * 2) + 1)),
LayerNorm(nO),
)
model = clone(residual(cnn), depth) model = clone(residual(cnn), depth)
model.set_dim("nO", nO) model.set_dim("nO", nO)
return model return model
@ -118,14 +120,20 @@ def MishWindowEncoder(config):
@registry.architectures.register("spacy.PretrainedVectors.v1") @registry.architectures.register("spacy.PretrainedVectors.v1")
def PretrainedVectors(config): def PretrainedVectors(config):
# TODO: actual vectors instead of name # TODO: actual vectors instead of name
return StaticVectors(vectors=config["vectors_name"], nO=config["width"], column=config["column"], dropout=0.0) return StaticVectors(
vectors=config["vectors_name"],
nO=config["width"],
column=config["column"],
dropout=0.0,
)
@registry.architectures.register("spacy.TorchBiLSTMEncoder.v1") @registry.architectures.register("spacy.TorchBiLSTMEncoder.v1")
def TorchBiLSTMEncoder(config): def TorchBiLSTMEncoder(config):
import torch.nn import torch.nn
# TODO FIX
from thinc.layers import PyTorchRNNWrapper # TODO: FIX
from thinc.api import PyTorchRNNWrapper
width = config["width"] width = config["width"]
depth = config["depth"] depth = config["depth"]

View File

@ -1,4 +1,4 @@
from thinc.layers import concatenate, reduce_max, reduce_mean, siamese, CauchySimilarity from thinc.api import concatenate, reduce_max, reduce_mean, siamese, CauchySimilarity
from .pipes import Pipe from .pipes import Pipe
from ..language import component from ..language import component
@ -63,8 +63,7 @@ class SimilarityHook(Pipe):
@classmethod @classmethod
def Model(cls, length): def Model(cls, length):
return siamese( return siamese(
concatenate(reduce_max(), reduce_mean()), concatenate(reduce_max(), reduce_mean()), CauchySimilarity(length * 2)
CauchySimilarity(length * 2)
) )
def __call__(self, doc): def __call__(self, doc):

View File

@ -3,8 +3,8 @@ from collections import defaultdict
import numpy import numpy
cimport numpy as np cimport numpy as np
from thinc.layers import chain, list2array from thinc.api import chain, list2array, to_categorical, get_array_module
from thinc.util import to_categorical, copy_array, get_array_module from thinc.util import copy_array
from .. import util from .. import util
from .pipes import Pipe from .pipes import Pipe

View File

@ -3,11 +3,9 @@
import numpy import numpy
import srsly import srsly
import random import random
from thinc.layers import chain, Linear, Maxout, Softmax, LayerNorm, list2array from thinc.api import chain, Linear, Maxout, Softmax, LayerNorm, list2array
from thinc.initializers import zero_init from thinc.api import zero_init, CosineDistance, to_categorical, get_array_module
from thinc.loss import CosineDistance from thinc.api import set_dropout_rate
from thinc.util import to_categorical, get_array_module
from thinc.model import set_dropout_rate
from ..tokens.doc cimport Doc from ..tokens.doc cimport Doc
from ..syntax.nn_parser cimport Parser from ..syntax.nn_parser cimport Parser

View File

@ -1,3 +1,5 @@
from thinc.api import Model, set_dropout_rate
from .pipes import Pipe from .pipes import Pipe
from ..gold import Example from ..gold import Example
from ..tokens import Doc from ..tokens import Doc
@ -5,8 +7,6 @@ from ..vocab import Vocab
from ..language import component from ..language import component
from ..util import link_vectors_to_models, minibatch, registry, eg2doc from ..util import link_vectors_to_models, minibatch, registry, eg2doc
from thinc.model import Model, set_dropout_rate
@component("tok2vec", assigns=["doc.tensor"]) @component("tok2vec", assigns=["doc.tensor"])
class Tok2Vec(Pipe): class Tok2Vec(Pipe):
@ -39,7 +39,9 @@ class Tok2Vec(Pipe):
self.listeners = [] self.listeners = []
def create_listener(self): def create_listener(self):
listener = Tok2VecListener(upstream_name="tok2vec", width=self.model.get_dim("nO")) listener = Tok2VecListener(
upstream_name="tok2vec", width=self.model.get_dim("nO")
)
self.listeners.append(listener) self.listeners.append(listener)
def add_listener(self, listener): def add_listener(self, listener):
@ -115,7 +117,7 @@ class Tok2Vec(Pipe):
def capture_losses(d_tokvecs): def capture_losses(d_tokvecs):
"""Accumulate tok2vec loss before doing backprop.""" """Accumulate tok2vec loss before doing backprop."""
l2_loss = sum((d_t2v**2).sum() for d_t2v in d_tokvecs) l2_loss = sum((d_t2v ** 2).sum() for d_t2v in d_tokvecs)
if self.name in losses: if self.name in losses:
losses[self.name] += l2_loss / len(d_tokvecs) losses[self.name] += l2_loss / len(d_tokvecs)
else: else:
@ -133,7 +135,9 @@ class Tok2Vec(Pipe):
def get_loss(self, docs, golds, scores): def get_loss(self, docs, golds, scores):
pass pass
def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs): def begin_training(
self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs
):
"""Allocate models and pre-process training data """Allocate models and pre-process training data
get_examples (function): Function returning example training data. get_examples (function): Function returning example training data.
@ -151,6 +155,7 @@ class Tok2VecListener(Model):
"""A layer that gets fed its answers from an upstream connection, """A layer that gets fed its answers from an upstream connection,
for instance from a component earlier in the pipeline. for instance from a component earlier in the pipeline.
""" """
name = "tok2vec-listener" name = "tok2vec-listener"
def __init__(self, upstream_name, width): def __init__(self, upstream_name, width):

View File

@ -11,9 +11,7 @@ from libc.string cimport memset, memcpy
from libc.stdlib cimport calloc, free, realloc from libc.stdlib cimport calloc, free, realloc
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
from thinc.extra.search cimport Beam from thinc.extra.search cimport Beam
from thinc.layers import Linear from thinc.api import Linear, Model, CupyOps, NumpyOps, use_ops
from thinc.model import Model
from thinc.backends import CupyOps, NumpyOps, use_ops
from thinc.backends.linalg cimport Vec, VecVec from thinc.backends.linalg cimport Vec, VecVec
cimport blis.cy cimport blis.cy

View File

@ -1,11 +1,8 @@
# cython: infer_types=True # cython: infer_types=True
# cython: cdivision=True # cython: cdivision=True
# cython: boundscheck=False # cython: boundscheck=False
import numpy
cimport cython.parallel cimport cython.parallel
import numpy.random
cimport numpy as np cimport numpy as np
from itertools import islice
from cpython.ref cimport PyObject, Py_XDECREF from cpython.ref cimport PyObject, Py_XDECREF
from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
from libc.math cimport exp from libc.math cimport exp
@ -14,15 +11,16 @@ from libc.string cimport memset, memcpy
from libc.stdlib cimport calloc, free from libc.stdlib cimport calloc, free
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
from thinc.extra.search cimport Beam from thinc.extra.search cimport Beam
from thinc.layers import chain, clone, Linear, list2array
from thinc.backends import NumpyOps, CupyOps, use_ops
from thinc.util import get_array_module
from thinc.backends.linalg cimport Vec, VecVec from thinc.backends.linalg cimport Vec, VecVec
from thinc.initializers import zero_init
from thinc.model import set_dropout_rate
import srsly
from spacy.gold import Example from thinc.api import chain, clone, Linear, list2array, NumpyOps, CupyOps, use_ops
from thinc.api import get_array_module, zero_init, set_dropout_rate
from itertools import islice
import srsly
import numpy.random
import numpy
from ..gold import Example
from ..typedefs cimport weight_t, class_t, hash_t from ..typedefs cimport weight_t, class_t, hash_t
from ._parser_model cimport alloc_activations, free_activations from ._parser_model cimport alloc_activations, free_activations
from ._parser_model cimport predict_states, arg_max_if_valid from ._parser_model cimport predict_states, arg_max_if_valid

View File

@ -6,7 +6,7 @@ scheme.
""" """
from copy import copy from copy import copy
from spacy.gold import Example from ..gold import Example
from ..tokens.doc cimport Doc, set_children_from_heads from ..tokens.doc cimport Doc, set_children_from_heads
from ..errors import Errors from ..errors import Errors

View File

@ -1,4 +1,3 @@
import pytest import pytest
import numpy import numpy
from spacy.tokens import Doc, Span from spacy.tokens import Doc, Span
@ -274,7 +273,19 @@ def test_doc_is_nered(en_vocab):
def test_doc_from_array_sent_starts(en_vocab): def test_doc_from_array_sent_starts(en_vocab):
words = ["I", "live", "in", "New", "York", ".", "I", "like", "cats", "."] words = ["I", "live", "in", "New", "York", ".", "I", "like", "cats", "."]
heads = [0, 0, 0, 0, 0, 0, 6, 6, 6, 6] heads = [0, 0, 0, 0, 0, 0, 6, 6, 6, 6]
deps = ["ROOT", "dep", "dep", "dep", "dep", "dep", "ROOT", "dep", "dep", "dep", "dep"] deps = [
"ROOT",
"dep",
"dep",
"dep",
"dep",
"dep",
"ROOT",
"dep",
"dep",
"dep",
"dep",
]
doc = Doc(en_vocab, words=words) doc = Doc(en_vocab, words=words)
for i, (dep, head) in enumerate(zip(deps, heads)): for i, (dep, head) in enumerate(zip(deps, heads)):
doc[i].dep_ = dep doc[i].dep_ = dep

View File

@ -29,7 +29,9 @@ def test_morph_props(i_has):
def test_morph_iter(i_has): def test_morph_iter(i_has):
assert set(i_has[0].morph) == set(["PronType=prs"]) assert set(i_has[0].morph) == set(["PronType=prs"])
assert set(i_has[1].morph) == set(["Number=sing", "Person=three", "Tense=pres", "VerbForm=fin"]) assert set(i_has[1].morph) == set(
["Number=sing", "Person=three", "Tense=pres", "VerbForm=fin"]
)
def test_morph_get(i_has): def test_morph_get(i_has):

View File

@ -8,7 +8,12 @@ from ..util import get_doc
def test_doc_retokenize_merge(en_tokenizer): def test_doc_retokenize_merge(en_tokenizer):
text = "WKRO played songs by the beach boys all night" text = "WKRO played songs by the beach boys all night"
attrs = {"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE", "morph": "Number=Plur"} attrs = {
"tag": "NAMED",
"lemma": "LEMMA",
"ent_type": "TYPE",
"morph": "Number=Plur",
}
doc = en_tokenizer(text) doc = en_tokenizer(text)
assert len(doc) == 9 assert len(doc) == 9
with doc.retokenize() as retokenizer: with doc.retokenize() as retokenizer:

View File

@ -1,4 +1,3 @@
def test_ar_tokenizer_handles_long_text(ar_tokenizer): def test_ar_tokenizer_handles_long_text(ar_tokenizer):
text = """نجيب محفوظ مؤلف و كاتب روائي عربي، يعد من أهم الأدباء العرب خلال القرن العشرين. text = """نجيب محفوظ مؤلف و كاتب روائي عربي، يعد من أهم الأدباء العرب خلال القرن العشرين.
ولد نجيب محفوظ في مدينة القاهرة، حيث ترعرع و تلقى تعليمه الجامعي في جامعتها، ولد نجيب محفوظ في مدينة القاهرة، حيث ترعرع و تلقى تعليمه الجامعي في جامعتها،

View File

@ -1,4 +1,3 @@
def test_en_simple_punct(en_tokenizer): def test_en_simple_punct(en_tokenizer):
text = "to walk, do foo" text = "to walk, do foo"
tokens = en_tokenizer(text) tokens = en_tokenizer(text)

Some files were not shown because too many files have changed in this diff Show More