mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-26 13:41:21 +03:00 
			
		
		
		
	Merge branch 'develop' into nightly.spacy.io
This commit is contained in:
		
						commit
						bf6992c2dd
					
				|  | @ -6,7 +6,7 @@ requires = [ | |||
|     "cymem>=2.0.2,<2.1.0", | ||||
|     "preshed>=3.0.2,<3.1.0", | ||||
|     "murmurhash>=0.28.0,<1.1.0", | ||||
|     "thinc>=8.0.0rc0,<8.1.0", | ||||
|     "thinc>=8.0.0rc2,<8.1.0", | ||||
|     "blis>=0.4.0,<0.8.0", | ||||
|     "pathy" | ||||
| ] | ||||
|  |  | |||
|  | @ -1,7 +1,7 @@ | |||
| # Our libraries | ||||
| cymem>=2.0.2,<2.1.0 | ||||
| preshed>=3.0.2,<3.1.0 | ||||
| thinc>=8.0.0rc0,<8.1.0 | ||||
| thinc>=8.0.0rc2,<8.1.0 | ||||
| blis>=0.4.0,<0.8.0 | ||||
| ml_datasets==0.2.0a0 | ||||
| murmurhash>=0.28.0,<1.1.0 | ||||
|  |  | |||
|  | @ -34,13 +34,13 @@ setup_requires = | |||
|     cymem>=2.0.2,<2.1.0 | ||||
|     preshed>=3.0.2,<3.1.0 | ||||
|     murmurhash>=0.28.0,<1.1.0 | ||||
|     thinc>=8.0.0rc0,<8.1.0 | ||||
|     thinc>=8.0.0rc2,<8.1.0 | ||||
| install_requires = | ||||
|     # Our libraries | ||||
|     murmurhash>=0.28.0,<1.1.0 | ||||
|     cymem>=2.0.2,<2.1.0 | ||||
|     preshed>=3.0.2,<3.1.0 | ||||
|     thinc>=8.0.0rc0,<8.1.0 | ||||
|     thinc>=8.0.0rc2,<8.1.0 | ||||
|     blis>=0.4.0,<0.8.0 | ||||
|     wasabi>=0.8.0,<1.1.0 | ||||
|     srsly>=2.3.0,<3.0.0 | ||||
|  |  | |||
|  | @ -7,7 +7,7 @@ warnings.filterwarnings("ignore", message="numpy.dtype size changed")  # noqa | |||
| warnings.filterwarnings("ignore", message="numpy.ufunc size changed")  # noqa | ||||
| 
 | ||||
| # These are imported as part of the API | ||||
| from thinc.api import prefer_gpu, require_gpu  # noqa: F401 | ||||
| from thinc.api import prefer_gpu, require_gpu, require_cpu  # noqa: F401 | ||||
| from thinc.api import Config | ||||
| 
 | ||||
| from . import pipeline  # noqa: F401 | ||||
|  |  | |||
|  | @ -17,7 +17,9 @@ tolerance = 0.2 | |||
| get_length = null | ||||
| 
 | ||||
| [pretraining.objective] | ||||
| type = "characters" | ||||
| @architectures = "spacy.PretrainCharacters.v1" | ||||
| maxout_pieces = 3 | ||||
| hidden_size = 300 | ||||
| n_characters = 4 | ||||
| 
 | ||||
| [pretraining.optimizer] | ||||
|  |  | |||
|  | @ -484,8 +484,8 @@ class Errors: | |||
|             "has been applied.") | ||||
|     E905 = ("Cannot initialize StaticVectors layer: nM dimension unset. This " | ||||
|             "dimension refers to the width of the vectors table.") | ||||
|     E906 = ("Unexpected `loss` value in pretraining objective: {loss_type}") | ||||
|     E907 = ("Unexpected `objective_type` value in pretraining objective: {objective_type}") | ||||
|     E906 = ("Unexpected `loss` value in pretraining objective: '{found}'. Supported values " | ||||
|             "are: {supported}") | ||||
|     E908 = ("Can't set `spaces` without `words` in `Doc.__init__`.") | ||||
|     E909 = ("Expected {name} in parser internals. This is likely a bug in spaCy.") | ||||
|     E910 = ("Encountered NaN value when computing loss for component '{name}'.") | ||||
|  |  | |||
|  | @ -1,4 +1,5 @@ | |||
| from .entity_linker import *  # noqa | ||||
| from .multi_task import *  # noqa | ||||
| from .parser import *  # noqa | ||||
| from .tagger import *  # noqa | ||||
| from .textcat import *  # noqa | ||||
|  |  | |||
|  | @ -1,7 +1,14 @@ | |||
| from typing import Optional, Iterable, Tuple, List, TYPE_CHECKING | ||||
| import numpy | ||||
| from typing import Optional, Iterable, Tuple, List, Callable, TYPE_CHECKING | ||||
| from thinc.api import chain, Maxout, LayerNorm, Softmax, Linear, zero_init, Model | ||||
| from thinc.api import MultiSoftmax, list2array | ||||
| from thinc.api import to_categorical, CosineDistance, L2Distance | ||||
| 
 | ||||
| from ...util import registry | ||||
| from ...errors import Errors | ||||
| from ...attrs import ID | ||||
| 
 | ||||
| import numpy | ||||
| from functools import partial | ||||
| 
 | ||||
| if TYPE_CHECKING: | ||||
|     # This lets us add type hints for mypy etc. without causing circular imports | ||||
|  | @ -9,6 +16,74 @@ if TYPE_CHECKING: | |||
|     from ...tokens import Doc  # noqa: F401 | ||||
| 
 | ||||
| 
 | ||||
| @registry.architectures.register("spacy.PretrainVectors.v1") | ||||
| def create_pretrain_vectors( | ||||
|     maxout_pieces: int, hidden_size: int, loss: str | ||||
| ) -> Callable[["Vocab", Model], Model]: | ||||
|     def create_vectors_objective(vocab: "Vocab", tok2vec: Model) -> Model: | ||||
|         model = build_cloze_multi_task_model( | ||||
|             vocab, tok2vec, hidden_size=hidden_size, maxout_pieces=maxout_pieces | ||||
|         ) | ||||
|         model.attrs["loss"] = create_vectors_loss() | ||||
|         return model | ||||
| 
 | ||||
|     def create_vectors_loss() -> Callable: | ||||
|         if loss == "cosine": | ||||
|             distance = CosineDistance(normalize=True, ignore_zeros=True) | ||||
|             return partial(get_vectors_loss, distance=distance) | ||||
|         elif loss == "L2": | ||||
|             distance = L2Distance(normalize=True) | ||||
|             return partial(get_vectors_loss, distance=distance) | ||||
|         else: | ||||
|             raise ValueError(Errors.E906.format(found=loss, supported="'cosine', 'L2'")) | ||||
| 
 | ||||
|     return create_vectors_objective | ||||
| 
 | ||||
| 
 | ||||
| @registry.architectures.register("spacy.PretrainCharacters.v1") | ||||
| def create_pretrain_characters( | ||||
|     maxout_pieces: int, hidden_size: int, n_characters: int | ||||
| ) -> Callable[["Vocab", Model], Model]: | ||||
|     def create_characters_objective(vocab: "Vocab", tok2vec: Model) -> Model: | ||||
|         model = build_cloze_characters_multi_task_model( | ||||
|             vocab, | ||||
|             tok2vec, | ||||
|             hidden_size=hidden_size, | ||||
|             maxout_pieces=maxout_pieces, | ||||
|             nr_char=n_characters, | ||||
|         ) | ||||
|         model.attrs["loss"] = partial(get_characters_loss, nr_char=n_characters) | ||||
|         return model | ||||
| 
 | ||||
|     return create_characters_objective | ||||
| 
 | ||||
| 
 | ||||
| def get_vectors_loss(ops, docs, prediction, distance): | ||||
|     """Compute a loss based on a distance between the documents' vectors and | ||||
|     the prediction. | ||||
|     """ | ||||
|     # The simplest way to implement this would be to vstack the | ||||
|     # token.vector values, but that's a bit inefficient, especially on GPU. | ||||
|     # Instead we fetch the index into the vectors table for each of our tokens, | ||||
|     # and look them up all at once. This prevents data copying. | ||||
|     ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs]) | ||||
|     target = docs[0].vocab.vectors.data[ids] | ||||
|     d_target, loss = distance(prediction, target) | ||||
|     return loss, d_target | ||||
| 
 | ||||
| 
 | ||||
| def get_characters_loss(ops, docs, prediction, nr_char): | ||||
|     """Compute a loss based on a number of characters predicted from the docs.""" | ||||
|     target_ids = numpy.vstack([doc.to_utf8_array(nr_char=nr_char) for doc in docs]) | ||||
|     target_ids = target_ids.reshape((-1,)) | ||||
|     target = ops.asarray(to_categorical(target_ids, n_classes=256), dtype="f") | ||||
|     target = target.reshape((-1, 256 * nr_char)) | ||||
|     diff = prediction - target | ||||
|     loss = (diff ** 2).sum() | ||||
|     d_target = diff / float(prediction.shape[0]) | ||||
|     return loss, d_target | ||||
| 
 | ||||
| 
 | ||||
| def build_multi_task_model( | ||||
|     tok2vec: Model, | ||||
|     maxout_pieces: int, | ||||
|  | @ -33,23 +108,19 @@ def build_multi_task_model( | |||
| 
 | ||||
| 
 | ||||
| def build_cloze_multi_task_model( | ||||
|     vocab: "Vocab", | ||||
|     tok2vec: Model, | ||||
|     maxout_pieces: int, | ||||
|     hidden_size: int, | ||||
|     nO: Optional[int] = None, | ||||
|     vocab: "Vocab", tok2vec: Model, maxout_pieces: int, hidden_size: int | ||||
| ) -> Model: | ||||
|     # nO = vocab.vectors.data.shape[1] | ||||
|     nO = vocab.vectors.data.shape[1] | ||||
|     output_layer = chain( | ||||
|         list2array(), | ||||
|         Maxout( | ||||
|             nO=nO, | ||||
|             nO=hidden_size, | ||||
|             nI=tok2vec.get_dim("nO"), | ||||
|             nP=maxout_pieces, | ||||
|             normalize=True, | ||||
|             dropout=0.0, | ||||
|         ), | ||||
|         Linear(nO=nO, nI=nO, init_W=zero_init), | ||||
|         Linear(nO=nO, nI=hidden_size, init_W=zero_init), | ||||
|     ) | ||||
|     model = chain(tok2vec, output_layer) | ||||
|     model = build_masked_language_model(vocab, model) | ||||
|  |  | |||
|  | @ -351,9 +351,7 @@ class ConfigSchemaPretrain(BaseModel): | |||
|     batcher: Batcher = Field(..., title="Batcher for the training data") | ||||
|     component: str = Field(..., title="Component to find the layer to pretrain") | ||||
|     layer: str = Field(..., title="Layer to pretrain. Whole model if empty.") | ||||
| 
 | ||||
|     # TODO: use a more detailed schema for this? | ||||
|     objective: Dict[str, Any] = Field(..., title="Pretraining objective") | ||||
|     objective: Callable[["Vocab", "Model"], "Model"] = Field(..., title="A function that creates the pretraining objective.") | ||||
|     # fmt: on | ||||
| 
 | ||||
|     class Config: | ||||
|  |  | |||
|  | @ -3,15 +3,15 @@ from thinc.api import Config, ConfigValidationError | |||
| import spacy | ||||
| from spacy.lang.en import English | ||||
| from spacy.lang.de import German | ||||
| from spacy.language import Language, DEFAULT_CONFIG | ||||
| from spacy.util import registry, load_model_from_config | ||||
| from spacy.language import Language, DEFAULT_CONFIG, DEFAULT_CONFIG_PRETRAIN_PATH | ||||
| from spacy.util import registry, load_model_from_config, load_config | ||||
| from spacy.ml.models import build_Tok2Vec_model, build_tb_parser_model | ||||
| from spacy.ml.models import MultiHashEmbed, MaxoutWindowEncoder | ||||
| from spacy.schemas import ConfigSchema | ||||
| from spacy.schemas import ConfigSchema, ConfigSchemaPretrain | ||||
| 
 | ||||
| 
 | ||||
| from ..util import make_tempdir | ||||
| 
 | ||||
| 
 | ||||
| nlp_config_string = """ | ||||
| [paths] | ||||
| train = null | ||||
|  | @ -63,6 +63,59 @@ factory = "tagger" | |||
| width = ${components.tok2vec.model.width} | ||||
| """ | ||||
| 
 | ||||
| pretrain_config_string = """ | ||||
| [paths] | ||||
| train = null | ||||
| dev = null | ||||
| 
 | ||||
| [corpora] | ||||
| 
 | ||||
| [corpora.train] | ||||
| @readers = "spacy.Corpus.v1" | ||||
| path = ${paths.train} | ||||
| 
 | ||||
| [corpora.dev] | ||||
| @readers = "spacy.Corpus.v1" | ||||
| path = ${paths.dev} | ||||
| 
 | ||||
| [training] | ||||
| 
 | ||||
| [training.batcher] | ||||
| @batchers = "spacy.batch_by_words.v1" | ||||
| size = 666 | ||||
| 
 | ||||
| [nlp] | ||||
| lang = "en" | ||||
| pipeline = ["tok2vec", "tagger"] | ||||
| 
 | ||||
| [components] | ||||
| 
 | ||||
| [components.tok2vec] | ||||
| factory = "tok2vec" | ||||
| 
 | ||||
| [components.tok2vec.model] | ||||
| @architectures = "spacy.HashEmbedCNN.v1" | ||||
| pretrained_vectors = null | ||||
| width = 342 | ||||
| depth = 4 | ||||
| window_size = 1 | ||||
| embed_size = 2000 | ||||
| maxout_pieces = 3 | ||||
| subword_features = true | ||||
| 
 | ||||
| [components.tagger] | ||||
| factory = "tagger" | ||||
| 
 | ||||
| [components.tagger.model] | ||||
| @architectures = "spacy.Tagger.v1" | ||||
| 
 | ||||
| [components.tagger.model.tok2vec] | ||||
| @architectures = "spacy.Tok2VecListener.v1" | ||||
| width = ${components.tok2vec.model.width} | ||||
| 
 | ||||
| [pretraining] | ||||
| """ | ||||
| 
 | ||||
| 
 | ||||
| parser_config_string = """ | ||||
| [model] | ||||
|  | @ -126,6 +179,14 @@ def test_create_nlp_from_config(): | |||
|         load_model_from_config(Config(bad_cfg), auto_fill=True) | ||||
| 
 | ||||
| 
 | ||||
| def test_create_nlp_from_pretraining_config(): | ||||
|     """Test that the default pretraining config validates properly""" | ||||
|     config = Config().from_str(pretrain_config_string) | ||||
|     pretrain_config = load_config(DEFAULT_CONFIG_PRETRAIN_PATH) | ||||
|     filled = config.merge(pretrain_config) | ||||
|     resolved = registry.resolve(filled["pretraining"], schema=ConfigSchemaPretrain) | ||||
| 
 | ||||
| 
 | ||||
| def test_create_nlp_from_config_multiple_instances(): | ||||
|     """Test that the nlp object is created correctly for a config with multiple | ||||
|     instances of the same component.""" | ||||
|  |  | |||
|  | @ -4,7 +4,7 @@ import ctypes | |||
| from pathlib import Path | ||||
| from spacy.about import __version__ as spacy_version | ||||
| from spacy import util | ||||
| from spacy import prefer_gpu, require_gpu | ||||
| from spacy import prefer_gpu, require_gpu, require_cpu | ||||
| from spacy.ml._precomputable_affine import PrecomputableAffine | ||||
| from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding | ||||
| from spacy.util import dot_to_object, SimpleFrozenList | ||||
|  | @ -15,6 +15,8 @@ from spacy.lang.nl import Dutch | |||
| from spacy.language import DEFAULT_CONFIG_PATH | ||||
| from spacy.schemas import ConfigSchemaTraining | ||||
| 
 | ||||
| from thinc.api import get_current_ops, NumpyOps, CupyOps | ||||
| 
 | ||||
| from .util import get_random_doc | ||||
| 
 | ||||
| 
 | ||||
|  | @ -81,6 +83,8 @@ def test_PrecomputableAffine(nO=4, nI=5, nF=3, nP=2): | |||
| def test_prefer_gpu(): | ||||
|     try: | ||||
|         import cupy  # noqa: F401 | ||||
|         prefer_gpu() | ||||
|         assert isinstance(get_current_ops(), CupyOps) | ||||
|     except ImportError: | ||||
|         assert not prefer_gpu() | ||||
| 
 | ||||
|  | @ -88,10 +92,24 @@ def test_prefer_gpu(): | |||
| def test_require_gpu(): | ||||
|     try: | ||||
|         import cupy  # noqa: F401 | ||||
|         require_gpu() | ||||
|         assert isinstance(get_current_ops(), CupyOps) | ||||
|     except ImportError: | ||||
|         with pytest.raises(ValueError): | ||||
|             require_gpu() | ||||
| 
 | ||||
| def test_require_cpu(): | ||||
|     require_cpu() | ||||
|     assert isinstance(get_current_ops(), NumpyOps) | ||||
|     try: | ||||
|         import cupy  # noqa: F401 | ||||
|         require_gpu() | ||||
|         assert isinstance(get_current_ops(), CupyOps) | ||||
|     except ImportError: | ||||
|         pass | ||||
|     require_cpu() | ||||
|     assert isinstance(get_current_ops(), NumpyOps) | ||||
| 
 | ||||
| 
 | ||||
| def test_ascii_filenames(): | ||||
|     """Test that all filenames in the project are ASCII. | ||||
|  |  | |||
|  | @ -2,6 +2,7 @@ import pytest | |||
| from spacy.vocab import Vocab | ||||
| from spacy.tokenizer import Tokenizer | ||||
| from spacy.util import ensure_path | ||||
| from spacy.lang.en import English | ||||
| 
 | ||||
| 
 | ||||
| def test_tokenizer_handles_no_word(tokenizer): | ||||
|  | @ -150,6 +151,22 @@ def test_tokenizer_special_cases_with_affixes(tokenizer): | |||
|     ] | ||||
| 
 | ||||
| 
 | ||||
| def test_tokenizer_special_cases_with_affixes_preserve_spacy(): | ||||
|     tokenizer = English().tokenizer | ||||
|     # reset all special cases | ||||
|     tokenizer.rules = {} | ||||
| 
 | ||||
|     # in-place modification (only merges) | ||||
|     text = "''a'' " | ||||
|     tokenizer.add_special_case("''", [{"ORTH": "''"}]) | ||||
|     assert tokenizer(text).text == text | ||||
| 
 | ||||
|     # not in-place (splits and merges) | ||||
|     tokenizer.add_special_case("ab", [{"ORTH": "a"}, {"ORTH": "b"}]) | ||||
|     text = "ab ab ab ''ab ab'' ab'' ''ab" | ||||
|     assert tokenizer(text).text == text | ||||
| 
 | ||||
| 
 | ||||
| def test_tokenizer_special_cases_with_period(tokenizer): | ||||
|     text = "_SPECIAL_." | ||||
|     tokenizer.add_special_case("_SPECIAL_", [{"orth": "_SPECIAL_"}]) | ||||
|  |  | |||
|  | @ -514,6 +514,11 @@ def test_roundtrip_docs_to_docbin(doc): | |||
|             ([[0], [1], [2, 3]], [[0], [1], [2], [2]]), | ||||
|         ), | ||||
|         ([" ", "a"], ["a"], ([[], [0]], [[1]])), | ||||
|         ( | ||||
|             ["a", "''", "'", ","], | ||||
|             ["a'", "''", ","], | ||||
|             ([[0], [0, 1], [1], [2]], [[0, 1], [1, 2], [3]]), | ||||
|         ), | ||||
|     ], | ||||
| ) | ||||
| def test_align(tokens_a, tokens_b, expected):  # noqa | ||||
|  | @ -698,7 +703,7 @@ def test_alignment_spaces(en_vocab): | |||
|     align = Alignment.from_strings(other_tokens, spacy_tokens) | ||||
|     assert list(align.x2y.lengths) == [0, 3, 1, 1, 1, 1, 1] | ||||
|     assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5] | ||||
|     assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2,] | ||||
|     assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2] | ||||
|     assert list(align.y2x.dataXd) == [1, 1, 1, 2, 3, 4, 5, 6] | ||||
| 
 | ||||
|     # multiple leading whitespace tokens | ||||
|  | @ -707,7 +712,7 @@ def test_alignment_spaces(en_vocab): | |||
|     align = Alignment.from_strings(other_tokens, spacy_tokens) | ||||
|     assert list(align.x2y.lengths) == [0, 0, 3, 1, 1, 1, 1, 1] | ||||
|     assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5] | ||||
|     assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2,] | ||||
|     assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2] | ||||
|     assert list(align.y2x.dataXd) == [2, 2, 2, 3, 4, 5, 6, 7] | ||||
| 
 | ||||
|     # both with leading whitespace, not identical | ||||
|  |  | |||
|  | @ -338,7 +338,7 @@ cdef class Tokenizer: | |||
|                     # Copy special case tokens into doc and adjust token and | ||||
|                     # character offsets | ||||
|                     idx_offset = 0 | ||||
|                     orig_final_spacy = doc.c[span_end + offset - 1].spacy | ||||
|                     orig_final_spacy = doc.c[span_end - 1].spacy | ||||
|                     orig_idx = doc.c[i].idx | ||||
|                     for j in range(cached.length): | ||||
|                         tokens[i + offset + j] = cached.data.tokens[j] | ||||
|  |  | |||
|  | @ -7,8 +7,8 @@ from ..errors import Errors | |||
| 
 | ||||
| def get_alignments(A: List[str], B: List[str]) -> Tuple[List[List[int]], List[List[int]]]: | ||||
|     # Create character-to-token mappings | ||||
|     char_to_token_a = tuple(chain(*((i,) * len(x) for i, x in enumerate(A)))) | ||||
|     char_to_token_b = tuple(chain(*((i,) * len(x) for i, x in enumerate(B)))) | ||||
|     char_to_token_a = tuple(chain(*((i,) * len(x.lower()) for i, x in enumerate(A)))) | ||||
|     char_to_token_b = tuple(chain(*((i,) * len(x.lower()) for i, x in enumerate(B)))) | ||||
|     str_a = "".join(A).lower() | ||||
|     str_b = "".join(B).lower() | ||||
|     cdef int len_str_a = len(str_a) | ||||
|  | @ -36,8 +36,14 @@ def get_alignments(A: List[str], B: List[str]) -> Tuple[List[List[int]], List[Li | |||
|         if prev_token_idx_b != token_idx_b: | ||||
|             b2a.append(set()) | ||||
|         # Process the alignment at the current position | ||||
|         if A[token_idx_a] == B[token_idx_b]: | ||||
|             # Current tokens are identical | ||||
|         if A[token_idx_a] == B[token_idx_b] and \ | ||||
|                 (char_idx_a == 0 or \ | ||||
|                     char_to_token_a[char_idx_a - 1] < token_idx_a) and \ | ||||
|                 (char_idx_b == 0 or \ | ||||
|                     char_to_token_b[char_idx_b - 1] < token_idx_b): | ||||
|             # Current tokens are identical and both character offsets are the | ||||
|             # start of a token (either at the beginning of the document or the | ||||
|             # previous character belongs to a different token) | ||||
|             a2b[-1].add(token_idx_b) | ||||
|             b2a[-1].add(token_idx_a) | ||||
|             char_idx_a += len(A[token_idx_a]) | ||||
|  |  | |||
|  | @ -28,7 +28,7 @@ def train( | |||
|     use_gpu: int = -1, | ||||
|     stdout: IO = sys.stdout, | ||||
|     stderr: IO = sys.stderr, | ||||
| ) -> None: | ||||
| ) -> Tuple["Language", Optional[Path]]: | ||||
|     """Train a pipeline. | ||||
| 
 | ||||
|     nlp (Language): The initialized nlp object with the full config. | ||||
|  | @ -40,7 +40,7 @@ def train( | |||
|     stderr (file): A second file-like object to write output messages. To disable | ||||
|         printing, set to io.StringIO. | ||||
| 
 | ||||
|     RETURNS (Path / None): The path to the final exported model. | ||||
|     RETURNS (tuple): The final nlp object and the path to the exported model. | ||||
|     """ | ||||
|     # We use no_print here so we can respect the stdout/stderr options. | ||||
|     msg = Printer(no_print=True) | ||||
|  | @ -105,17 +105,18 @@ def train( | |||
|         raise e | ||||
|     finally: | ||||
|         finalize_logger() | ||||
|         if optimizer.averages: | ||||
|             nlp.use_params(optimizer.averages) | ||||
|         if output_path is not None: | ||||
|             final_model_path = output_path / DIR_MODEL_LAST | ||||
|             if optimizer.averages: | ||||
|                 with nlp.use_params(optimizer.averages): | ||||
|                     nlp.to_disk(final_model_path) | ||||
|             else: | ||||
|             nlp.to_disk(final_model_path) | ||||
|             # This will only run if we don't hit an error | ||||
|             stdout.write( | ||||
|                 msg.good("Saved pipeline to output directory", final_model_path) + "\n" | ||||
|             ) | ||||
|             return (nlp, final_model_path) | ||||
|         else: | ||||
|             return (nlp, None) | ||||
| 
 | ||||
| 
 | ||||
| def train_while_improving( | ||||
|  |  | |||
|  | @ -1,22 +1,16 @@ | |||
| from typing import Optional, Callable, Iterable, Union, List | ||||
| from thinc.api import Config, fix_random_seed, set_gpu_allocator, Model, Optimizer | ||||
| from thinc.api import set_dropout_rate, to_categorical, CosineDistance, L2Distance | ||||
| from thinc.api import set_dropout_rate | ||||
| from pathlib import Path | ||||
| from functools import partial | ||||
| from collections import Counter | ||||
| import srsly | ||||
| import numpy | ||||
| import time | ||||
| import re | ||||
| from wasabi import Printer | ||||
| 
 | ||||
| from .example import Example | ||||
| from ..tokens import Doc | ||||
| from ..attrs import ID | ||||
| from ..ml.models.multi_task import build_cloze_multi_task_model | ||||
| from ..ml.models.multi_task import build_cloze_characters_multi_task_model | ||||
| from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain | ||||
| from ..errors import Errors | ||||
| from ..util import registry, load_model_from_config, dot_to_object | ||||
| 
 | ||||
| 
 | ||||
|  | @ -49,6 +43,7 @@ def pretrain( | |||
|     else: | ||||
|         # Without '--resume-path' the '--epoch-resume' argument is ignored | ||||
|         epoch_resume = 0 | ||||
|     objective = model.attrs["loss"] | ||||
|     # TODO: move this to logger function? | ||||
|     tracker = ProgressTracker(frequency=10000) | ||||
|     msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume}") | ||||
|  | @ -69,7 +64,6 @@ def pretrain( | |||
|             with (output_dir / "log.jsonl").open("a") as file_: | ||||
|                 file_.write(srsly.json_dumps(log) + "\n") | ||||
| 
 | ||||
|     objective = create_objective(P["objective"]) | ||||
|     # TODO: I think we probably want this to look more like the | ||||
|     # 'create_train_batches' function? | ||||
|     for epoch in range(epoch_resume, P["max_epochs"]): | ||||
|  | @ -132,58 +126,6 @@ def make_update( | |||
|     return float(loss) | ||||
| 
 | ||||
| 
 | ||||
| def create_objective(config: Config): | ||||
|     """Create the objective for pretraining. | ||||
| 
 | ||||
|     We'd like to replace this with a registry function but it's tricky because | ||||
|     we're also making a model choice based on this. For now we hard-code support | ||||
|     for two types (characters, vectors). For characters you can specify | ||||
|     n_characters, for vectors you can specify the loss. | ||||
| 
 | ||||
|     Bleh. | ||||
|     """ | ||||
|     objective_type = config["type"] | ||||
|     if objective_type == "characters": | ||||
|         return partial(get_characters_loss, nr_char=config["n_characters"]) | ||||
|     elif objective_type == "vectors": | ||||
|         if config["loss"] == "cosine": | ||||
|             distance = CosineDistance(normalize=True, ignore_zeros=True) | ||||
|             return partial(get_vectors_loss, distance=distance) | ||||
|         elif config["loss"] == "L2": | ||||
|             distance = L2Distance(normalize=True, ignore_zeros=True) | ||||
|             return partial(get_vectors_loss, distance=distance) | ||||
|         else: | ||||
|             raise ValueError(Errors.E906.format(loss_type=config["loss"])) | ||||
|     else: | ||||
|         raise ValueError(Errors.E907.format(objective_type=objective_type)) | ||||
| 
 | ||||
| 
 | ||||
| def get_vectors_loss(ops, docs, prediction, distance): | ||||
|     """Compute a loss based on a distance between the documents' vectors and | ||||
|     the prediction. | ||||
|     """ | ||||
|     # The simplest way to implement this would be to vstack the | ||||
|     # token.vector values, but that's a bit inefficient, especially on GPU. | ||||
|     # Instead we fetch the index into the vectors table for each of our tokens, | ||||
|     # and look them up all at once. This prevents data copying. | ||||
|     ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs]) | ||||
|     target = docs[0].vocab.vectors.data[ids] | ||||
|     d_target, loss = distance(prediction, target) | ||||
|     return loss, d_target | ||||
| 
 | ||||
| 
 | ||||
| def get_characters_loss(ops, docs, prediction, nr_char): | ||||
|     """Compute a loss based on a number of characters predicted from the docs.""" | ||||
|     target_ids = numpy.vstack([doc.to_utf8_array(nr_char=nr_char) for doc in docs]) | ||||
|     target_ids = target_ids.reshape((-1,)) | ||||
|     target = ops.asarray(to_categorical(target_ids, n_classes=256), dtype="f") | ||||
|     target = target.reshape((-1, 256 * nr_char)) | ||||
|     diff = prediction - target | ||||
|     loss = (diff ** 2).sum() | ||||
|     d_target = diff / float(prediction.shape[0]) | ||||
|     return loss, d_target | ||||
| 
 | ||||
| 
 | ||||
| def create_pretraining_model(nlp, pretrain_config): | ||||
|     """Define a network for the pretraining. We simply add an output layer onto | ||||
|     the tok2vec input model. The tok2vec input model needs to be a model that | ||||
|  | @ -192,27 +134,15 @@ def create_pretraining_model(nlp, pretrain_config): | |||
|     The actual tok2vec layer is stored as a reference, and only this bit will be | ||||
|     serialized to file and read back in when calling the 'train' command. | ||||
|     """ | ||||
|     nlp.initialize() | ||||
|     component = nlp.get_pipe(pretrain_config["component"]) | ||||
|     if pretrain_config.get("layer"): | ||||
|         tok2vec = component.model.get_ref(pretrain_config["layer"]) | ||||
|     else: | ||||
|         tok2vec = component.model | ||||
| 
 | ||||
|     # TODO | ||||
|     maxout_pieces = 3 | ||||
|     hidden_size = 300 | ||||
|     if pretrain_config["objective"]["type"] == "vectors": | ||||
|         model = build_cloze_multi_task_model( | ||||
|             nlp.vocab, tok2vec, hidden_size=hidden_size, maxout_pieces=maxout_pieces | ||||
|         ) | ||||
|     elif pretrain_config["objective"]["type"] == "characters": | ||||
|         model = build_cloze_characters_multi_task_model( | ||||
|             nlp.vocab, | ||||
|             tok2vec, | ||||
|             hidden_size=hidden_size, | ||||
|             maxout_pieces=maxout_pieces, | ||||
|             nr_char=pretrain_config["objective"]["n_characters"], | ||||
|         ) | ||||
|     create_function = pretrain_config["objective"] | ||||
|     model = create_function(nlp.vocab, tok2vec) | ||||
|     model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")]) | ||||
|     set_dropout_rate(model, pretrain_config["dropout"]) | ||||
|     return model | ||||
|  |  | |||
|  | @ -171,6 +171,25 @@ and _before_ loading any pipelines. | |||
| | `gpu_id`    | Device index to select. Defaults to `0`. ~~int~~ | | ||||
| | **RETURNS** | `True` ~~bool~~                                  | | ||||
| 
 | ||||
| ### spacy.require_cpu {#spacy.require_cpu tag="function" new="3.0.0"} | ||||
| 
 | ||||
| Allocate data and perform operations on CPU.  | ||||
| If data has already been allocated on GPU, it will not | ||||
| be moved. Ideally, this function should be called right after importing spaCy | ||||
| and _before_ loading any pipelines. | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
| > ```python | ||||
| > import spacy | ||||
| > spacy.require_cpu() | ||||
| > nlp = spacy.load("en_core_web_sm") | ||||
| > ``` | ||||
| 
 | ||||
| | Name        | Description                                      | | ||||
| | ----------- | ------------------------------------------------ | | ||||
| | **RETURNS** | `True` ~~bool~~                                  | | ||||
| 
 | ||||
| ## displaCy {#displacy source="spacy/displacy"} | ||||
| 
 | ||||
| As of v2.0, spaCy comes with a built-in visualization suite. For more info and | ||||
|  |  | |||
|  | @ -158,29 +158,37 @@ The other way to install spaCy is to clone its | |||
| source. That is the common way if you want to make changes to the code base. | ||||
| You'll need to make sure that you have a development environment consisting of a | ||||
| Python distribution including header files, a compiler, | ||||
| [pip](https://pip.pypa.io/en/latest/installing/), | ||||
| [virtualenv](https://virtualenv.pypa.io/) and [git](https://git-scm.com) | ||||
| installed. The compiler part is the trickiest. How to do that depends on your | ||||
| system. See notes on [Ubuntu](#source-ubuntu), [macOS / OS X](#source-osx) and | ||||
| [pip](https://pip.pypa.io/en/stable/) and [git](https://git-scm.com) installed. | ||||
| The compiler part is the trickiest. How to do that depends on your system. See | ||||
| notes on [Ubuntu](#source-ubuntu), [macOS / OS X](#source-osx) and | ||||
| [Windows](#source-windows) for details. | ||||
| 
 | ||||
| ```bash | ||||
| $ python -m pip install -U pip                  # update pip | ||||
| $ python -m pip install -U pip setuptools wheel # install/update build tools | ||||
| $ git clone https://github.com/explosion/spaCy  # clone spaCy | ||||
| $ cd spaCy                                      # navigate into dir | ||||
| 
 | ||||
| $ python -m venv .env                           # create environment in .env | ||||
| $ source .env/bin/activate                      # activate virtual env | ||||
| $ export PYTHONPATH=`pwd`                       # set Python path to spaCy dir | ||||
| $ pip install -r requirements.txt               # install all requirements | ||||
| $ python setup.py build_ext --inplace           # compile spaCy | ||||
| $ python setup.py install                       # install spaCy | ||||
| $ pip install .                                 # compile and install spaCy | ||||
| ``` | ||||
| 
 | ||||
| Compared to regular install via pip, the | ||||
| [`requirements.txt`](%%GITHUB_SPACY/requirements.txt) additionally installs | ||||
| developer dependencies such as Cython. See the [quickstart widget](#quickstart) | ||||
| to get the right commands for your platform and Python version. | ||||
| To install with extras: | ||||
| 
 | ||||
| ```bash | ||||
| $ pip install .[lookups,cuda102]                # install spaCy with extras | ||||
| ``` | ||||
| 
 | ||||
| To install all dependencies required for development: | ||||
| 
 | ||||
| ```bash | ||||
| $ pip install -r requirements.txt | ||||
| ``` | ||||
| 
 | ||||
| Compared to a regular install via pip, the | ||||
| [`requirements.txt`](%%GITHUB_SPACY/requirements.txt) additionally includes | ||||
| developer dependencies such as Cython and the libraries required to run the test | ||||
| suite. See the [quickstart widget](#quickstart) to get the right commands for | ||||
| your platform and Python version. | ||||
| 
 | ||||
| <a id="source-ubuntu"></a><a id="source-osx"></a><a id="source-windows"></a> | ||||
| 
 | ||||
|  | @ -195,6 +203,32 @@ to get the right commands for your platform and Python version. | |||
|   [Visual Studio Express](https://www.visualstudio.com/vs/visual-studio-express/) | ||||
|   that matches the version that was used to compile your Python interpreter. | ||||
| 
 | ||||
| #### Additional options for developers {#source-developers} | ||||
| 
 | ||||
| Some additional options may be useful for spaCy developers who are editing the | ||||
| source code and recompiling frequently. | ||||
| 
 | ||||
| - Install in editable mode. Changes to `.py` files will be reflected as soon as | ||||
|   the files are saved, but edits to Cython files (`.pxd`, `.pyx`) will require | ||||
|   the `pip install` or `python setup.py build_ext` command below to be run | ||||
|   again. Before installing in editable mode, be sure you have removed any | ||||
|   previous installs with `pip uninstall spacy`, which you may need to run | ||||
|   multiple times to remove all traces of earlier installs. | ||||
| 
 | ||||
|   ```bash | ||||
|   $ pip install -r requirements.txt | ||||
|   $ pip install --no-build-isolation --editable . | ||||
|   ``` | ||||
| 
 | ||||
| - Build in parallel using `N` CPUs to speed up compilation and then install in | ||||
|   editable mode: | ||||
| 
 | ||||
|   ```bash | ||||
|   $ pip install -r requirements.txt | ||||
|   $ python setup.py build_ext --inplace -j N | ||||
|   $ pip install --no-build-isolation --editable . | ||||
|   ``` | ||||
| 
 | ||||
| ### Building an executable {#executable} | ||||
| 
 | ||||
| The spaCy repository includes a [`Makefile`](%%GITHUB_SPACY/Makefile) that | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user