From 56c17973aa2526966eb93e2f26fc69c351dacf05 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 5 Aug 2020 23:53:21 +0200 Subject: [PATCH 1/3] Use "raise ... from" in custom errors for better tracebacks --- spacy/cli/train.py | 6 +++--- spacy/gold/example.pyx | 6 +++--- spacy/lang/ja/__init__.py | 4 ++-- spacy/lang/ko/__init__.py | 2 +- spacy/lang/ru/lemmatizer.py | 2 +- spacy/lang/th/__init__.py | 2 +- spacy/lang/uk/lemmatizer.py | 2 +- spacy/lang/vi/__init__.py | 2 +- spacy/lang/zh/__init__.py | 14 +++++++------- spacy/language.py | 2 +- spacy/matcher/matcher.pyx | 2 +- spacy/pipeline/attributeruler.py | 2 +- spacy/pipeline/entity_linker.py | 6 +++--- spacy/pipeline/morphologizer.pyx | 4 ++-- spacy/pipeline/multitask.pyx | 2 +- spacy/pipeline/pipe.pyx | 4 ++-- spacy/pipeline/senter.pyx | 4 ++-- spacy/pipeline/tagger.pyx | 10 +++++----- spacy/pipeline/textcat.py | 6 +++--- spacy/pipeline/transition_parser.pyx | 4 ++-- spacy/tokens/doc.pyx | 2 +- spacy/util.py | 10 +++++----- 22 files changed, 49 insertions(+), 49 deletions(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index c5c6e7252..32d22d1bc 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -211,7 +211,7 @@ def create_evaluation_callback( except KeyError as e: keys = list(scores.keys()) err = Errors.E983.format(dict="score_weights", key=str(e), keys=keys) - raise KeyError(err) + raise KeyError(err) from None return weighted_score, scores return evaluate @@ -369,7 +369,7 @@ def setup_printer( Errors.E983.format( dict="scores (losses)", key=str(e), keys=list(info["losses"].keys()) ) - ) + ) from None try: scores = [ @@ -382,7 +382,7 @@ def setup_printer( key=str(e), keys=list(info["other_scores"].keys()), ) - ) + ) from None data = ( [info["epoch"], info["step"]] + losses diff --git a/spacy/gold/example.pyx b/spacy/gold/example.pyx index f90d98603..6093d2346 100644 --- a/spacy/gold/example.pyx +++ b/spacy/gold/example.pyx @@ -139,7 +139,7 @@ cdef class Example: def get_aligned_spans_y2x(self, y_spans): return self._get_aligned_spans(self.x, y_spans, self.alignment.y2x) - + def _get_aligned_spans(self, doc, spans, align): seen = set() output = [] @@ -207,7 +207,7 @@ cdef class Example: sent_starts and return a list of the new Examples""" if not self.reference.is_sentenced: return [self] - + align = self.alignment.y2x seen_indices = set() output = [] @@ -267,7 +267,7 @@ def _annot2array(vocab, tok_annot, doc_annot): values.append([vocab.strings.add(v) for v in value]) except TypeError: types= set([type(v) for v in value]) - raise TypeError(Errors.E969.format(field=key, types=types)) + raise TypeError(Errors.E969.format(field=key, types=types)) from None array = numpy.asarray(values, dtype="uint64") return attrs, array.T diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py index d435afe12..900db4e4c 100644 --- a/spacy/lang/ja/__init__.py +++ b/spacy/lang/ja/__init__.py @@ -200,7 +200,7 @@ def try_sudachi_import(split_mode="A"): "(https://github.com/WorksApplications/SudachiPy). " "Install with `pip install sudachipy sudachidict_core` or " "install spaCy with `pip install spacy[ja]`." 
- ) + ) from None def resolve_pos(orth, tag, next_tag): @@ -263,7 +263,7 @@ def get_dtokens_and_spaces(dtokens, text, gap_tag="空白"): try: word_start = text[text_pos:].index(word) except ValueError: - raise ValueError(Errors.E194.format(text=text, words=words)) + raise ValueError(Errors.E194.format(text=text, words=words)) from None # space token if word_start > 0: diff --git a/spacy/lang/ko/__init__.py b/spacy/lang/ko/__init__.py index 6197ab927..f2954f461 100644 --- a/spacy/lang/ko/__init__.py +++ b/spacy/lang/ko/__init__.py @@ -85,7 +85,7 @@ def try_mecab_import() -> None: "Korean support requires [mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), " "[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), " "and [natto-py](https://github.com/buruzaemon/natto-py)" - ) + ) from None def check_spaces(text, tokens): diff --git a/spacy/lang/ru/lemmatizer.py b/spacy/lang/ru/lemmatizer.py index a9a7ad80f..28767348d 100644 --- a/spacy/lang/ru/lemmatizer.py +++ b/spacy/lang/ru/lemmatizer.py @@ -21,7 +21,7 @@ class RussianLemmatizer(Lemmatizer): 'try to fix it with "pip install pymorphy2==0.8" ' 'or "pip install git+https://github.com/kmike/pymorphy2.git pymorphy2-dicts-uk"' "if you need Ukrainian too" - ) + ) from None if RussianLemmatizer._morph is None: RussianLemmatizer._morph = MorphAnalyzer() diff --git a/spacy/lang/th/__init__.py b/spacy/lang/th/__init__.py index 989c22a42..a35ae987f 100644 --- a/spacy/lang/th/__init__.py +++ b/spacy/lang/th/__init__.py @@ -31,7 +31,7 @@ class ThaiTokenizer(DummyTokenizer): raise ImportError( "The Thai tokenizer requires the PyThaiNLP library: " "https://github.com/PyThaiNLP/pythainlp" - ) + ) from None self.word_tokenize = word_tokenize self.vocab = nlp.vocab diff --git a/spacy/lang/uk/lemmatizer.py b/spacy/lang/uk/lemmatizer.py index de2d0c170..cf89d1a12 100644 --- a/spacy/lang/uk/lemmatizer.py +++ b/spacy/lang/uk/lemmatizer.py @@ -23,7 +23,7 @@ class UkrainianLemmatizer(Lemmatizer): "The Ukrainian lemmatizer requires the pymorphy2 library and " 'dictionaries: try to fix it with "pip uninstall pymorphy2" and' '"pip install git+https://github.com/kmike/pymorphy2.git pymorphy2-dicts-uk"' - ) + ) from None def __call__( self, string: str, univ_pos: str, morphology: Optional[dict] = None diff --git a/spacy/lang/vi/__init__.py b/spacy/lang/vi/__init__.py index 2b06d33f7..1db762adb 100644 --- a/spacy/lang/vi/__init__.py +++ b/spacy/lang/vi/__init__.py @@ -38,7 +38,7 @@ class VietnameseTokenizer(DummyTokenizer): "Pyvi not installed. Either set use_pyvi = False, " "or install it https://pypi.python.org/pypi/pyvi" ) - raise ImportError(msg) + raise ImportError(msg) from None def __call__(self, text: str) -> Doc: if self.use_pyvi: diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py index fe0613c80..5d3bd2a96 100644 --- a/spacy/lang/zh/__init__.py +++ b/spacy/lang/zh/__init__.py @@ -129,7 +129,7 @@ class ChineseTokenizer(DummyTokenizer): "pkuseg not installed: unable to reset pkuseg " "user dict. Please " + _PKUSEG_INSTALL_MSG ) - raise ImportError(msg) + raise ImportError(msg) from None for word in words: self.pkuseg_seg.preprocesser.insert(word.strip(), "") else: @@ -208,7 +208,7 @@ class ChineseTokenizer(DummyTokenizer): raise ImportError( "pkuseg not installed. 
To use this model, " + _PKUSEG_INSTALL_MSG - ) + ) from None self.pkuseg_seg = pkuseg.pkuseg(str(tempdir)) if pkuseg_data["processors_data"]: processors_data = pkuseg_data["processors_data"] @@ -258,7 +258,7 @@ class ChineseTokenizer(DummyTokenizer): raise ImportError( "pkuseg not installed. To use this model, " + _PKUSEG_INSTALL_MSG - ) + ) from None if path.exists(): self.pkuseg_seg = pkuseg.pkuseg(path) @@ -267,7 +267,7 @@ class ChineseTokenizer(DummyTokenizer): import pkuseg except ImportError: if self.segmenter == Segmenter.pkuseg: - raise ImportError(self._pkuseg_install_msg) + raise ImportError(self._pkuseg_install_msg) from None if self.segmenter == Segmenter.pkuseg: data = srsly.read_msgpack(path) (user_dict, do_process, common_words, other_words) = data @@ -311,7 +311,7 @@ def try_jieba_import(segmenter: str) -> None: "Jieba not installed. To use jieba, install it with `pip " " install jieba` or from https://github.com/fxsjy/jieba" ) - raise ImportError(msg) + raise ImportError(msg) from None def try_pkuseg_import(segmenter: str, pkuseg_model: str, pkuseg_user_dict: str) -> None: @@ -332,11 +332,11 @@ def try_pkuseg_import(segmenter: str, pkuseg_model: str, pkuseg_user_dict: str) except ImportError: if segmenter == Segmenter.pkuseg: msg = "pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG - raise ImportError(msg) + raise ImportError(msg) from None except FileNotFoundError: if segmenter == Segmenter.pkuseg: msg = "Unable to load pkuseg model from: " + pkuseg_model - raise FileNotFoundError(msg) + raise FileNotFoundError(msg) from None def _get_pkuseg_trie_data(node, path=""): diff --git a/spacy/language.py b/spacy/language.py index e9d7e9eb6..9018af73c 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -869,7 +869,7 @@ class Language: try: doc = proc(doc, **component_cfg.get(name, {})) except KeyError: - raise ValueError(Errors.E109.format(name=name)) + raise ValueError(Errors.E109.format(name=name)) from None if doc is None: raise ValueError(Errors.E005.format(name=name)) return doc diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 325c81369..a0f3f1655 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -131,7 +131,7 @@ cdef class Matcher: for attr, _ in spec[1]: self._seen_attrs.add(attr) except OverflowError, AttributeError: - raise ValueError(Errors.E154.format()) + raise ValueError(Errors.E154.format()) from None self._patterns.setdefault(key, []) self._callbacks[key] = on_match self._filter[key] = greedy diff --git a/spacy/pipeline/attributeruler.py b/spacy/pipeline/attributeruler.py index 1f1e63959..d5abf7863 100644 --- a/spacy/pipeline/attributeruler.py +++ b/spacy/pipeline/attributeruler.py @@ -85,7 +85,7 @@ class AttributeRuler(Pipe): span=[t.text for t in span], index=index, ) - ) + ) from None set_token_attrs(token, attrs) return doc diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index 923d925dc..d922db1ad 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -195,7 +195,7 @@ class EntityLinker(Pipe): types = set([type(eg) for eg in examples]) raise TypeError( Errors.E978.format(name="EntityLinker", method="update", types=types) - ) + ) from None if set_annotations: # This seems simpler than other ways to get that exact output -- but # it does run the model twice :( @@ -213,7 +213,7 @@ class EntityLinker(Pipe): sent_index = sentences.index(ent.sent) except AttributeError: # Catch the exception when ent.sent is None and provide a user-friendly 
warning - raise RuntimeError(Errors.E030) + raise RuntimeError(Errors.E030) from None # get n previous sentences, if there are any start_sentence = max(0, sent_index - self.n_sents) # get n posterior sentences, or as many < n as there are @@ -439,7 +439,7 @@ class EntityLinker(Pipe): try: self.model.from_bytes(p.open("rb").read()) except AttributeError: - raise ValueError(Errors.E149) + raise ValueError(Errors.E149) from None def load_kb(p): self.kb = KnowledgeBase(entity_vector_length=self.cfg["entity_width"]) diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index 18673f85b..06c9f9a25 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -262,7 +262,7 @@ class Morphologizer(Tagger): try: self.model.from_bytes(b) except AttributeError: - raise ValueError(Errors.E149) + raise ValueError(Errors.E149) from None deserialize = { "vocab": lambda b: self.vocab.from_bytes(b), @@ -301,7 +301,7 @@ class Morphologizer(Tagger): try: self.model.from_bytes(file_.read()) except AttributeError: - raise ValueError(Errors.E149) + raise ValueError(Errors.E149) from None deserialize = { "vocab": lambda p: self.vocab.from_disk(p), diff --git a/spacy/pipeline/multitask.pyx b/spacy/pipeline/multitask.pyx index d85030adb..4b582045d 100644 --- a/spacy/pipeline/multitask.pyx +++ b/spacy/pipeline/multitask.pyx @@ -211,7 +211,7 @@ class ClozeMultitask(Pipe): predictions, bp_predictions = self.model.begin_update([eg.predicted for eg in examples]) except AttributeError: types = set([type(eg) for eg in examples]) - raise TypeError(Errors.E978.format(name="ClozeMultitask", method="rehearse", types=types)) + raise TypeError(Errors.E978.format(name="ClozeMultitask", method="rehearse", types=types)) from None loss, d_predictions = self.get_loss(examples, self.vocab.vectors.data, predictions) bp_predictions(d_predictions) if sgd is not None: diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx index 1a94905a2..bed4cdd16 100644 --- a/spacy/pipeline/pipe.pyx +++ b/spacy/pipeline/pipe.pyx @@ -204,7 +204,7 @@ cdef class Pipe: try: self.model.from_bytes(b) except AttributeError: - raise ValueError(Errors.E149) + raise ValueError(Errors.E149) from None deserialize = {} if hasattr(self, "vocab"): @@ -242,7 +242,7 @@ cdef class Pipe: try: self.model.from_bytes(p.open("rb").read()) except AttributeError: - raise ValueError(Errors.E149) + raise ValueError(Errors.E149) from None deserialize = {} deserialize["vocab"] = lambda p: self.vocab.from_disk(p) diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx index 620a8557e..3147cc902 100644 --- a/spacy/pipeline/senter.pyx +++ b/spacy/pipeline/senter.pyx @@ -183,7 +183,7 @@ class SentenceRecognizer(Tagger): try: self.model.from_bytes(b) except AttributeError: - raise ValueError(Errors.E149) + raise ValueError(Errors.E149) from None deserialize = { "vocab": lambda b: self.vocab.from_bytes(b), @@ -222,7 +222,7 @@ class SentenceRecognizer(Tagger): try: self.model.from_bytes(file_.read()) except AttributeError: - raise ValueError(Errors.E149) + raise ValueError(Errors.E149) from None deserialize = { "vocab": lambda p: self.vocab.from_disk(p), diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index 43f5b02cb..da1b3d3aa 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -195,7 +195,7 @@ class Tagger(Pipe): return except AttributeError: types = set([type(eg) for eg in examples]) - raise TypeError(Errors.E978.format(name="Tagger", method="update", types=types)) + raise 
TypeError(Errors.E978.format(name="Tagger", method="update", types=types)) from None set_dropout_rate(self.model, drop) tag_scores, bp_tag_scores = self.model.begin_update( [eg.predicted for eg in examples]) @@ -232,7 +232,7 @@ class Tagger(Pipe): docs = [eg.predicted for eg in examples] except AttributeError: types = set([type(eg) for eg in examples]) - raise TypeError(Errors.E978.format(name="Tagger", method="rehearse", types=types)) + raise TypeError(Errors.E978.format(name="Tagger", method="rehearse", types=types)) from None if self._rehearsal_model is None: return if not any(len(doc) for doc in docs): @@ -292,7 +292,7 @@ class Tagger(Pipe): try: y = example.y except AttributeError: - raise TypeError(Errors.E978.format(name="Tagger", method="begin_training", types=type(example))) + raise TypeError(Errors.E978.format(name="Tagger", method="begin_training", types=type(example))) from None for token in y: tag = token.tag_ if tag in orig_tag_map: @@ -400,7 +400,7 @@ class Tagger(Pipe): try: self.model.from_bytes(b) except AttributeError: - raise ValueError(Errors.E149) + raise ValueError(Errors.E149) from None def load_tag_map(b): tag_map = srsly.msgpack_loads(b) @@ -456,7 +456,7 @@ class Tagger(Pipe): try: self.model.from_bytes(file_.read()) except AttributeError: - raise ValueError(Errors.E149) + raise ValueError(Errors.E149) from None def load_tag_map(p): tag_map = srsly.read_msgpack(p) diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index bc16e790f..8b46082cb 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -203,7 +203,7 @@ class TextCategorizer(Pipe): types = set([type(eg) for eg in examples]) raise TypeError( Errors.E978.format(name="TextCategorizer", method="update", types=types) - ) + ) from None set_dropout_rate(self.model, drop) scores, bp_scores = self.model.begin_update([eg.predicted for eg in examples]) loss, d_scores = self.get_loss(examples, scores) @@ -250,7 +250,7 @@ class TextCategorizer(Pipe): err = Errors.E978.format( name="TextCategorizer", method="rehearse", types=types ) - raise TypeError(err) + raise TypeError(err) from None if not any(len(doc) for doc in docs): # Handle cases where there are no tokens in any docs. 
return losses @@ -351,7 +351,7 @@ class TextCategorizer(Pipe): err = Errors.E978.format( name="TextCategorizer", method="update", types=type(example) ) - raise TypeError(err) + raise TypeError(err) from None for cat in y.cats: self.add_label(cat) self.require_labels() diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx index b14a55cb4..9829e764d 100644 --- a/spacy/pipeline/transition_parser.pyx +++ b/spacy/pipeline/transition_parser.pyx @@ -473,7 +473,7 @@ cdef class Parser(Pipe): self._resize() self.model.from_bytes(bytes_data) except AttributeError: - raise ValueError(Errors.E149) + raise ValueError(Errors.E149) from None return self def to_bytes(self, exclude=tuple()): @@ -498,7 +498,7 @@ cdef class Parser(Pipe): try: self.model.from_bytes(msg['model']) except AttributeError: - raise ValueError(Errors.E149) + raise ValueError(Errors.E149) from None return self def _init_gold_batch(self, examples, min_length=5, max_length=500): diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 2fcc0983b..935af88d1 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -699,7 +699,7 @@ cdef class Doc: for id_ in py_attr_ids] except KeyError as msg: keys = [k for k in IDS.keys() if not k.startswith("FLAG")] - raise KeyError(Errors.E983.format(dict="IDS", key=msg, keys=keys)) + raise KeyError(Errors.E983.format(dict="IDS", key=msg, keys=keys)) from None # Make an array from the attributes --- otherwise our inner loop is # Python dict iteration. cdef np.ndarray attr_ids = numpy.asarray(py_attr_ids, dtype="i") diff --git a/spacy/util.py b/spacy/util.py index 52073097e..05f8ef017 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -138,7 +138,7 @@ def get_lang_class(lang: str) -> "Language": try: module = importlib.import_module(f".lang.{lang}", "spacy") except ImportError as err: - raise ImportError(Errors.E048.format(lang=lang, err=err)) + raise ImportError(Errors.E048.format(lang=lang, err=err)) from err set_lang_class(lang, getattr(module, module.__all__[0])) return registry.languages.get(lang) @@ -502,7 +502,7 @@ def run_command(command: Union[str, List[str]]) -> None: except FileNotFoundError: raise FileNotFoundError( Errors.E970.format(str_command=" ".join(command), tool=command[0]) - ) + ) from None if status != 0: sys.exit(status) @@ -891,7 +891,7 @@ def get_words_and_spaces( try: word_start = text[text_pos:].index(word) except ValueError: - raise ValueError(Errors.E194.format(text=text, words=words)) + raise ValueError(Errors.E194.format(text=text, words=words)) from None if word_start > 0: text_words.append(text[text_pos : text_pos + word_start]) text_spaces.append(False) @@ -918,7 +918,7 @@ def copy_config(config: Union[Dict[str, Any], Config]) -> Config: try: return Config(config).copy() except ValueError: - raise ValueError(Errors.E961.format(config=config)) + raise ValueError(Errors.E961.format(config=config)) from None def deep_merge_configs( @@ -1002,7 +1002,7 @@ def dot_to_object(config: Config, section: str): try: component = component[item] except (KeyError, TypeError): - raise KeyError(Errors.E952.format(name=section)) + raise KeyError(Errors.E952.format(name=section)) from None return component From 06e80d95cdf6ae2fbfcf8f6ef20c882559190c7f Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 6 Aug 2020 00:28:14 +0200 Subject: [PATCH 2/3] Sync develop with nightly docs state (#5883) Co-authored-by: svlandeg --- spacy/cli/debug_model.py | 2 +- website/docs/api/cli.md | 170 ++++++++++++++++++++-- website/docs/api/data-formats.md | 
119 ++++++++++++++- website/docs/api/entitylinker.md | 45 ++---- website/docs/api/example.md | 58 +++++++- website/docs/api/top-level.md | 53 ------- website/docs/usage/linguistic-features.md | 41 +++--- 7 files changed, 359 insertions(+), 129 deletions(-) diff --git a/spacy/cli/debug_model.py b/spacy/cli/debug_model.py index 7c6c76a34..cc6cb98ea 100644 --- a/spacy/cli/debug_model.py +++ b/spacy/cli/debug_model.py @@ -54,7 +54,7 @@ def debug_model_cli( nlp, config = util.load_model_from_config(cfg) except ValueError as e: msg.fail(str(e), exits=1) - seed = config.get("training", {}).get("seed", None) + seed = config["pretraining"]["seed"] if seed is not None: msg.info(f"Fixing random seed: {seed}") fix_random_seed(seed) diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 0ea67747e..abe050661 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -221,17 +221,21 @@ config from being resolved. This means that you may not see all validation errors at once and some issues are only shown once previous errors have been fixed. +Instead of specifying all required settings in the config file, you can rely on +an auto-fill functionality that uses spaCy's built-in defaults. The resulting +full config can be written to file and used in downstream training tasks. + ```bash -$ python -m spacy debug config [config_path] [--code] [overrides] +$ python -m spacy debug config [config_path] [--code_path] [--output] [--auto_fill] [--diff] [overrides] ``` -> #### Example +> #### Example 1 > > ```bash > $ python -m spacy debug config ./config.cfg > ``` - + ``` ✘ Config validation error @@ -250,12 +254,30 @@ training -> width extra fields not permitted -| Argument | Type | Description | -| -------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `config_path` | positional | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. | -| `--code`, `-c` | option | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-models) for new architectures. | -| `--help`, `-h` | flag | Show help message and available arguments. | -| overrides | | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.use_gpu 1`. | +> #### Example 2 +> +> ```bash +> $ python -m spacy debug config ./minimal_config.cfg -F -o ./filled_config.cfg +> ``` + + + +``` +✔ Auto-filled config is valid +✔ Saved updated config to ./filled_config.cfg +``` + + + +| Argument | Type | Default | Description | +| --------------------- | ---------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `config_path` | positional | - | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. | +| `--code_path`, `-c` | option | `None` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-models) for new architectures. | +| `--auto_fill`, `-F` | option | `False` | Whether or not to auto-fill the config with built-in defaults if possible. If `False`, the provided config needs to be complete. 
| +| `--output_path`, `-o` | option | `None` | Output path where the filled config can be stored. Use '-' for standard output. | +| `--diff`, `-D` | option | `False` | Show a visual diff if config was auto-filled. | +| `--help`, `-h` | flag | `False` | Show help message and available arguments. | +| overrides | | `None` | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.use_gpu 1`. | ### debug data {#debug-data} @@ -433,7 +455,135 @@ will not be available. | `--help`, `-h` | flag | Show help message and available arguments. | | overrides | | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.use_gpu 1`. | - + + +### debug model {#debug-model} + +Debug a Thinc [`Model`](https://thinc.ai/docs/api-model) by running it on a +sample text and checking how it updates its internal weights and parameters. + +```bash +$ python -m spacy debug model [config_path] [component] [--layers] [-DIM] [-PAR] [-GRAD] [-ATTR] [-P0] [-P1] [-P2] [P3] [--gpu_id] +``` + +> #### Example 1 +> +> ```bash +> $ python -m spacy debug model ./config.cfg tagger -P0 +> ``` + + + +``` +ℹ Using CPU +ℹ Fixing random seed: 0 +ℹ Analysing model with ID 62 + +========================== STEP 0 - before training ========================== +ℹ Layer 0: model ID 62: +'extract_features>>list2ragged>>with_array-ints-getitem>>hashembed|ints-getitem>>hashembed|ints-getitem>>hashembed|ints-getitem>>hashembed>>with_array-maxout>>layernorm>>dropout>>ragged2list>>with_array-residual>>residual>>residual>>residual>>with_array-softmax' +ℹ Layer 1: model ID 59: +'extract_features>>list2ragged>>with_array-ints-getitem>>hashembed|ints-getitem>>hashembed|ints-getitem>>hashembed|ints-getitem>>hashembed>>with_array-maxout>>layernorm>>dropout>>ragged2list>>with_array-residual>>residual>>residual>>residual' +ℹ Layer 2: model ID 61: 'with_array-softmax' +ℹ Layer 3: model ID 24: +'extract_features>>list2ragged>>with_array-ints-getitem>>hashembed|ints-getitem>>hashembed|ints-getitem>>hashembed|ints-getitem>>hashembed>>with_array-maxout>>layernorm>>dropout>>ragged2list' +ℹ Layer 4: model ID 58: 'with_array-residual>>residual>>residual>>residual' +ℹ Layer 5: model ID 60: 'softmax' +ℹ Layer 6: model ID 13: 'extract_features' +ℹ Layer 7: model ID 14: 'list2ragged' +ℹ Layer 8: model ID 16: +'with_array-ints-getitem>>hashembed|ints-getitem>>hashembed|ints-getitem>>hashembed|ints-getitem>>hashembed' +ℹ Layer 9: model ID 22: 'with_array-maxout>>layernorm>>dropout' +ℹ Layer 10: model ID 23: 'ragged2list' +ℹ Layer 11: model ID 57: 'residual>>residual>>residual>>residual' +ℹ Layer 12: model ID 15: +'ints-getitem>>hashembed|ints-getitem>>hashembed|ints-getitem>>hashembed|ints-getitem>>hashembed' +ℹ Layer 13: model ID 21: 'maxout>>layernorm>>dropout' +ℹ Layer 14: model ID 32: 'residual' +ℹ Layer 15: model ID 40: 'residual' +ℹ Layer 16: model ID 48: 'residual' +ℹ Layer 17: model ID 56: 'residual' +ℹ Layer 18: model ID 3: 'ints-getitem>>hashembed' +ℹ Layer 19: model ID 6: 'ints-getitem>>hashembed' +ℹ Layer 20: model ID 9: 'ints-getitem>>hashembed' +... +``` + + + +In this example log, we just print the name of each layer after creation of the +model ("Step 0"), which helps us to understand the internal structure of the +Neural Network, and to focus on specific layers that we want to inspect further +(see next example). 
+ +> #### Example 2 +> +> ```bash +> $ python -m spacy debug model ./config.cfg tagger -l "5,15" -DIM -PAR -P0 -P1 -P2 +> ``` + + + +``` +ℹ Using CPU +ℹ Fixing random seed: 0 +ℹ Analysing model with ID 62 + +========================= STEP 0 - before training ========================= +ℹ Layer 5: model ID 60: 'softmax' +ℹ - dim nO: None +ℹ - dim nI: 96 +ℹ - param W: None +ℹ - param b: None +ℹ Layer 15: model ID 40: 'residual' +ℹ - dim nO: None +ℹ - dim nI: None + +======================= STEP 1 - after initialization ======================= +ℹ Layer 5: model ID 60: 'softmax' +ℹ - dim nO: 4 +ℹ - dim nI: 96 +ℹ - param W: (4, 96) - sample: [0. 0. 0. 0. 0.] +ℹ - param b: (4,) - sample: [0. 0. 0. 0.] +ℹ Layer 15: model ID 40: 'residual' +ℹ - dim nO: 96 +ℹ - dim nI: None + +========================== STEP 2 - after training ========================== +ℹ Layer 5: model ID 60: 'softmax' +ℹ - dim nO: 4 +ℹ - dim nI: 96 +ℹ - param W: (4, 96) - sample: [ 0.00283958 -0.00294119 0.00268396 -0.00296219 +-0.00297141] +ℹ - param b: (4,) - sample: [0.00300002 0.00300002 0.00300002 0.00300002] +ℹ Layer 15: model ID 40: 'residual' +ℹ - dim nO: 96 +ℹ - dim nI: None +``` + + + +In this example log, we see how initialization of the model (Step 1) propagates +the correct values for the `nI` (input) and `nO` (output) dimensions of the +various layers. In the `softmax` layer, this step also defines the `W` matrix as +an all-zero matrix determined by the `nO` and `nI` dimensions. After a first +training step (Step 2), this matrix has clearly updated its values through the +training feedback loop. + +| Argument | Type | Default | Description | +| ----------------------- | ---------- | ------- | ---------------------------------------------------------------------------------------------------- | +| `config_path` | positional | | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. | +| `component` | positional | | Name of the pipeline component of which the model should be analysed. | +| `--layers`, `-l` | option | | Comma-separated names of layer IDs to print. | +| `--dimensions`, `-DIM` | option | `False` | Show dimensions of each layer. | +| `--parameters`, `-PAR` | option | `False` | Show parameters of each layer. | +| `--gradients`, `-GRAD` | option | `False` | Show gradients of each layer. | +| `--attributes`, `-ATTR` | option | `False` | Show attributes of each layer. | +| `--print-step0`, `-P0` | option | `False` | Print model before training. | +| `--print-step1`, `-P1` | option | `False` | Print model after initialization. | +| `--print-step2`, `-P2` | option | `False` | Print model after training. | +| `--print-step3`, `-P3` | option | `False` | Print final predictions. | +| `--help`, `-h` | flag | | Show help message and available arguments. | ## Train {#train} diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md index 10fef6ba6..210e5d47d 100644 --- a/website/docs/api/data-formats.md +++ b/website/docs/api/data-formats.md @@ -28,7 +28,7 @@ spaCy's training format. To convert one or more existing `Doc` objects to spaCy's JSON format, you can use the [`gold.docs_to_json`](/api/top-level#docs_to_json) helper. -> #### Annotating entities +> #### Annotating entities {#biluo} > > Named entities are provided in the > [BILUO](/usage/linguistic-features#accessing-ner) notation. 
Tokens outside an @@ -75,6 +75,123 @@ from the English Wall Street Journal portion of the Penn Treebank: https://github.com/explosion/spaCy/tree/master/examples/training/training-data.json ``` +### Annotations in dictionary format {#dict-input} + +To create [`Example`](/api/example) objects, you can create a dictionary of the +gold-standard annotations `gold_dict`, and then call + +```python +example = Example.from_dict(doc, gold_dict) +``` + +There are currently two formats supported for this dictionary of annotations: +one with a simple, flat structure of keywords, and one with a more hierarchical +structure. + +#### Flat structure {#dict-flat} + +Here is the full overview of potential entries in a flat dictionary of +annotations. You need to only specify those keys corresponding to the task you +want to train. + +```python +### Flat dictionary +{ + "text": string, # Raw text. + "words": List[string], # List of gold tokens. + "lemmas": List[string], # List of lemmas. + "spaces": List[bool], # List of boolean values indicating whether the corresponding tokens is followed by a space or not. + "tags": List[string], # List of fine-grained [POS tags](/usage/linguistic-features#pos-tagging). + "pos": List[string], # List of coarse-grained [POS tags](/usage/linguistic-features#pos-tagging). + "morphs": List[string], # List of [morphological features](/usage/linguistic-features#rule-based-morphology). + "sent_starts": List[bool], # List of boolean values indicating whether each token is the first of a sentence or not. + "deps": List[string], # List of string values indicating the [dependency relation](/usage/linguistic-features#dependency-parse) of a token to its head. + "heads": List[int], # List of integer values indicating the dependency head of each token, referring to the absolute index of each token in the text. + "entities": List[string], # Option 1: List of [BILUO tags](#biluo) per token of the format `"{action}-{label}"`, or `None` for unannotated tokens. + "entities": List[(int, int, string)], # Option 2: List of `"(start, end, label)"` tuples defining all entities in. + "cats": Dict[str, float], # Dictionary of `label:value` pairs indicating how relevant a certain [category](/api/textcategorizer) is for the text. + "links": Dict[(int, int), Dict], # Dictionary of `offset:dict` pairs defining [named entity links](/usage/linguistic-features#entity-linking). The charachter offsets are linked to a dictionary of relevant knowledge base IDs. +} +``` + +There are a few caveats to take into account: + +- Multiple formats are possible for the "entities" entry, but you have to pick + one. +- Any values for sentence starts will be ignored if there are annotations for + dependency relations. +- If the dictionary contains values for "text" and "words", but not "spaces", + the latter are inferred automatically. If "words" is not provided either, the + values are inferred from the `doc` argument. 
+ +##### Examples + +```python +# Training data for a part-of-speech tagger +doc = Doc(vocab, words=["I", "like", "stuff"]) +example = Example.from_dict(doc, {"tags": ["NOUN", "VERB", "NOUN"]}) + +# Training data for an entity recognizer (option 1) +doc = nlp("Laura flew to Silicon Valley.") +biluo_tags = ["U-PERS", "O", "O", "B-LOC", "L-LOC"] +example = Example.from_dict(doc, {"entities": biluo_tags}) + +# Training data for an entity recognizer (option 2) +doc = nlp("Laura flew to Silicon Valley.") +entity_tuples = [ + (0, 5, "PERSON"), + (14, 28, "LOC"), + ] +example = Example.from_dict(doc, {"entities": entity_tuples}) + +# Training data for text categorization +doc = nlp("I'm pretty happy about that!") +example = Example.from_dict(doc, {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}) + +# Training data for an Entity Linking component +doc = nlp("Russ Cochran his reprints include EC Comics.") +example = Example.from_dict(doc, {"links": {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}}}) +``` + +#### Hierachical structure {#dict-hierarch} + +Internally, a more hierarchical dictionary structure is used to store +gold-standard annotations. Its format is similar to the structure described in +the previous section, but there are two main sections `token_annotation` and +`doc_annotation`, and the keys for token annotations should be uppercase +[`Token` attributes](/api/token#attributes) such as "ORTH" and "TAG". + +```python +### Hierarchical dictionary +{ + "text": string, # Raw text. + "token_annotation": { + "ORTH": List[string], # List of gold tokens. + "LEMMA": List[string], # List of lemmas. + "SPACY": List[bool], # List of boolean values indicating whether the corresponding tokens is followed by a space or not. + "TAG": List[string], # List of fine-grained [POS tags](/usage/linguistic-features#pos-tagging). + "POS": List[string], # List of coarse-grained [POS tags](/usage/linguistic-features#pos-tagging). + "MORPH": List[string], # List of [morphological features](/usage/linguistic-features#rule-based-morphology). + "SENT_START": List[bool], # List of boolean values indicating whether each token is the first of a sentence or not. + "DEP": List[string], # List of string values indicating the [dependency relation](/usage/linguistic-features#dependency-parse) of a token to its head. + "HEAD": List[int], # List of integer values indicating the dependency head of each token, referring to the absolute index of each token in the text. + }, + "doc_annotation": { + "entities": List[(int, int, string)], # List of [BILUO tags](#biluo) per token of the format `"{action}-{label}"`, or `None` for unannotated tokens. + "cats": Dict[str, float], # Dictionary of `label:value` pairs indicating how relevant a certain [category](/api/textcategorizer) is for the text. + "links": Dict[(int, int), Dict], # Dictionary of `offset:dict` pairs defining [named entity links](/usage/linguistic-features#entity-linking). The charachter offsets are linked to a dictionary of relevant knowledge base IDs. + } +} +``` + +There are a few caveats to take into account: + +- Any values for sentence starts will be ignored if there are annotations for + dependency relations. +- If the dictionary contains values for "text" and "ORTH", but not "SPACY", the + latter are inferred automatically. If "ORTH" is not provided either, the + values are inferred from the `doc` argument. 
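+
+For illustration, here is a sketch of how such a dictionary might be passed to
+[`Example.from_dict`](/api/example#from_dict). The annotation values below are
+made up, and it is assumed that the hierarchical layout is accepted in the same
+way as the flat one shown above:
+
+```python
+### Hierarchical dictionary (hypothetical values)
+doc = nlp("Jan visited Berlin")
+gold_dict = {
+    "text": "Jan visited Berlin",
+    "token_annotation": {
+        "ORTH": ["Jan", "visited", "Berlin"],   # gold tokens
+        "TAG": ["NNP", "VBD", "NNP"],           # fine-grained tags
+    },
+    "doc_annotation": {
+        "cats": {"TRAVEL": 1.0, "BAKING": 0.0}, # document-level categories
+    },
+}
+example = Example.from_dict(doc, gold_dict)
+```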
+ ## Training config {#config new="3"} Config files define the training process and model pipeline and can be passed to diff --git a/website/docs/api/entitylinker.md b/website/docs/api/entitylinker.md index 1e9beaf82..18d9c5edd 100644 --- a/website/docs/api/entitylinker.md +++ b/website/docs/api/entitylinker.md @@ -32,15 +32,13 @@ architectures and their arguments and hyperparameters. > nlp.add_pipe("entity_linker", config=config) > ``` - - -| Setting | Type | Description | Default | -| ---------------- | ------------------------------------------ | ----------------- | ----------------------------------------------- | -| `kb` | `KnowledgeBase` | | `None` | -| `labels_discard` | `Iterable[str]` | | `[]` | -| `incl_prior` | bool | |  `True` | -| `incl_context` | bool | | `True` | -| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [EntityLinker](/api/architectures#EntityLinker) | +| Setting | Type | Description | Default | +| ---------------- | ------------------------------------------ | ----------------------------------------------------------------------- | ----------------------------------------------- | +| `kb` | `KnowledgeBase` | The [`KnowledgeBase`](/api/kb) holding all entities and their aliases. | `None` | +| `labels_discard` | `Iterable[str]` | NER labels that will automatically get a "NIL" prediction. | `[]` | +| `incl_prior` | bool | Whether or not to include prior probabilities from the KB in the model. | `True` | +| `incl_context` | bool | Whether or not to include the local context in the model. | `True` | +| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [EntityLinker](/api/architectures#EntityLinker) | ```python https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/entity_linker.py @@ -75,10 +73,10 @@ shortcut for this and instantiate the component using its string name and | `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. | | `name` | str | String name of the component instance. Used to add entries to the `losses` during training. | | _keyword-only_ | | | -| `kb` | `KnowlegeBase` | | -| `labels_discard` | `Iterable[str]` | | -| `incl_prior` | bool | | -| `incl_context` | bool | | +| `kb` | `KnowlegeBase` | The [`KnowledgeBase`](/api/kb) holding all entities and their aliases. | +| `labels_discard` | `Iterable[str]` | NER labels that will automatically get a "NIL" prediction. | +| `incl_prior` | bool | Whether or not to include prior probabilities from the KB in the model. | +| `incl_context` | bool | Whether or not to include the local context in the model. | ## EntityLinker.\_\_call\_\_ {#call tag="method"} @@ -130,15 +128,12 @@ applied to the `Doc` in order. Both [`__call__`](/api/entitylinker#call) and ## EntityLinker.begin_training {#begin_training tag="method"} Initialize the pipe for training, using data examples if available. Returns an -[`Optimizer`](https://thinc.ai/docs/api-optimizers) object. Before calling this -method, a knowledge base should have been defined with -[`set_kb`](/api/entitylinker#set_kb). +[`Optimizer`](https://thinc.ai/docs/api-optimizers) object. > #### Example > > ```python > entity_linker = nlp.add_pipe("entity_linker", last=True) -> entity_linker.set_kb(kb) > optimizer = entity_linker.begin_training(pipeline=nlp.pipeline) > ``` @@ -210,22 +205,6 @@ pipe's entity linking model and context encoder. Delegates to | `losses` | `Dict[str, float]` | Optional record of the loss during training. 
Updated using the component name as the key. | | **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. | -## EntityLinker.set_kb {#set_kb tag="method"} - -Define the knowledge base (KB) used for disambiguating named entities to KB -identifiers. - -> #### Example -> -> ```python -> entity_linker = nlp.add_pipe("entity_linker") -> entity_linker.set_kb(kb) -> ``` - -| Name | Type | Description | -| ---- | --------------- | ------------------------------- | -| `kb` | `KnowledgeBase` | The [`KnowledgeBase`](/api/kb). | - ## EntityLinker.create_optimizer {#create_optimizer tag="method"} Create an optimizer for the pipeline component. diff --git a/website/docs/api/example.md b/website/docs/api/example.md index 1257fdc1e..8c117aec7 100644 --- a/website/docs/api/example.md +++ b/website/docs/api/example.md @@ -8,8 +8,9 @@ new: 3.0 An `Example` holds the information for one training instance. It stores two `Doc` objects: one for holding the gold-standard reference data, and one for -holding the predictions of the pipeline. An `Alignment` object stores the -alignment between these two documents, as they can differ in tokenization. +holding the predictions of the pipeline. An [`Alignment`](#alignment-object) +object stores the alignment between these two documents, as they can differ in +tokenization. ## Example.\_\_init\_\_ {#init tag="method"} @@ -40,9 +41,8 @@ both documents. ## Example.from_dict {#from_dict tag="classmethod"} Construct an `Example` object from the `predicted` document and the reference -annotations provided as a dictionary. - - +annotations provided as a dictionary. For more details on the required format, +see the [training format documentation](/api/data-formats#dict-input). > #### Example > @@ -244,8 +244,9 @@ accuracy of predicted entities against the original gold-standard annotation. ## Example.to_dict {#to_dict tag="method"} -Return a dictionary representation of the reference annotation contained in this -`Example`. +Return a +[hierarchical dictionary representation](/api/data-formats#dict-hierarch) of the +reference annotation contained in this `Example`. > #### Example > @@ -276,3 +277,46 @@ Split one `Example` into multiple `Example` objects, one for each sentence. | Name | Type | Description | | ----------- | --------------- | ---------------------------------------------------------- | | **RETURNS** | `List[Example]` | List of `Example` objects, one for each original sentence. | + +## Alignment {#alignment-object new="3"} + +Calculate alignment tables between two tokenizations. + +### Alignment attributes {#alignment-attributes"} + +| Name | Type | Description | +| ----- | -------------------------------------------------- | ---------------------------------------------------------- | +| `x2y` | [`Ragged`](https://thinc.ai/docs/api-types#ragged) | The `Ragged` object holding the alignment from `x` to `y`. | +| `y2x` | [`Ragged`](https://thinc.ai/docs/api-types#ragged) | The `Ragged` object holding the alignment from `y` to `x`. | + + + +The current implementation of the alignment algorithm assumes that both +tokenizations add up to the same string. For example, you'll be able to align +`["I", "'", "m"]` and `["I", "'m"]`, which both add up to `"I'm"`, but not +`["I", "'m"]` and `["I", "am"]`. 
+ + + +> #### Example +> +> ```python +> from spacy.gold import Alignment +> +> bert_tokens = ["obama", "'", "s", "podcast"] +> spacy_tokens = ["obama", "'s", "podcast"] +> alignment = Alignment.from_strings(bert_tokens, spacy_tokens) +> a2b = alignment.x2y +> assert list(a2b.dataXd) == [0, 1, 1, 2] +> ``` +> +> If `a2b.dataXd[1] == a2b.dataXd[2] == 1`, that means that `A[1]` (`"'"`) and +> `A[2]` (`"s"`) both align to `B[1]` (`"'s"`). + +### Alignment.from_strings {#classmethod tag="function"} + +| Name | Type | Description | +| ----------- | ----------- | ----------------------------------------------- | +| `A` | list | String values of candidate tokens to align. | +| `B` | list | String values of reference tokens to align. | +| **RETURNS** | `Alignment` | An `Alignment` object describing the alignment. | diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index 71b53f844..0954fb577 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -468,59 +468,6 @@ Convert a list of Doc objects into the | `id` | int | ID to assign to the JSON. Defaults to `0`. | | **RETURNS** | dict | The data in spaCy's JSON format. | -### gold.align {#align tag="function"} - -Calculate alignment tables between two tokenizations, using the Levenshtein -algorithm. The alignment is case-insensitive. - - - -The current implementation of the alignment algorithm assumes that both -tokenizations add up to the same string. For example, you'll be able to align -`["I", "'", "m"]` and `["I", "'m"]`, which both add up to `"I'm"`, but not -`["I", "'m"]` and `["I", "am"]`. - - - -> #### Example -> -> ```python -> from spacy.gold import align -> -> bert_tokens = ["obama", "'", "s", "podcast"] -> spacy_tokens = ["obama", "'s", "podcast"] -> alignment = align(bert_tokens, spacy_tokens) -> cost, a2b, b2a, a2b_multi, b2a_multi = alignment -> ``` - -| Name | Type | Description | -| ----------- | ----- | -------------------------------------------------------------------------- | -| `tokens_a` | list | String values of candidate tokens to align. | -| `tokens_b` | list | String values of reference tokens to align. | -| **RETURNS** | tuple | A `(cost, a2b, b2a, a2b_multi, b2a_multi)` tuple describing the alignment. | - -The returned tuple contains the following alignment information: - -> #### Example -> -> ```python -> a2b = array([0, -1, -1, 2]) -> b2a = array([0, 2, 3]) -> a2b_multi = {1: 1, 2: 1} -> b2a_multi = {} -> ``` -> -> If `a2b[3] == 2`, that means that `tokens_a[3]` aligns to `tokens_b[2]`. If -> there's no one-to-one alignment for a token, it has the value `-1`. - -| Name | Type | Description | -| ----------- | -------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------- | -| `cost` | int | The number of misaligned tokens. | -| `a2b` | `numpy.ndarray[ndim=1, dtype='int32']` | One-to-one mappings of indices in `tokens_a` to indices in `tokens_b`. | -| `b2a` | `numpy.ndarray[ndim=1, dtype='int32']` | One-to-one mappings of indices in `tokens_b` to indices in `tokens_a`. | -| `a2b_multi` | dict | A dictionary mapping indices in `tokens_a` to indices in `tokens_b`, where multiple tokens of `tokens_a` align to the same token of `tokens_b`. | -| `b2a_multi` | dict | A dictionary mapping indices in `tokens_b` to indices in `tokens_a`, where multiple tokens of `tokens_b` align to the same token of `tokens_a`. 
| - ### gold.biluo_tags_from_offsets {#biluo_tags_from_offsets tag="function"} Encode labelled spans into per-token tags, using the diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index 881a0e333..8d3c7e1b6 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -1089,51 +1089,44 @@ In situations like that, you often want to align the tokenization so that you can merge annotations from different sources together, or take vectors predicted by a [pretrained BERT model](https://github.com/huggingface/pytorch-transformers) and -apply them to spaCy tokens. spaCy's [`gold.align`](/api/top-level#align) helper -returns a `(cost, a2b, b2a, a2b_multi, b2a_multi)` tuple describing the number -of misaligned tokens, the one-to-one mappings of token indices in both -directions and the indices where multiple tokens align to one single token. +apply them to spaCy tokens. spaCy's [`Alignment`](/api/example#alignment-object) object +allows the one-to-one mappings of token indices in both directions as well as +taking into account indices where multiple tokens align to one single token. > #### ✏️ Things to try > > 1. Change the capitalization in one of the token lists – for example, > `"obama"` to `"Obama"`. You'll see that the alignment is case-insensitive. > 2. Change `"podcasts"` in `other_tokens` to `"pod", "casts"`. You should see -> that there are now 4 misaligned tokens and that the new many-to-one mapping -> is reflected in `a2b_multi`. -> 3. Make `other_tokens` and `spacy_tokens` identical. You'll see that the -> `cost` is `0` and all corresponding mappings are also identical. +> that there are now two tokens of length 2 in `y2x`, one corresponding to +> "'s", and one to "podcasts". +> 3. Make `other_tokens` and `spacy_tokens` identical. You'll see that all +> tokens now correspond 1-to-1. ```python ### {executable="true"} -from spacy.gold import align +from spacy.gold import Alignment other_tokens = ["i", "listened", "to", "obama", "'", "s", "podcasts", "."] spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts", "."] -cost, a2b, b2a, a2b_multi, b2a_multi = align(other_tokens, spacy_tokens) -print("Edit distance:", cost) # 3 -print("One-to-one mappings a -> b", a2b) # array([0, 1, 2, 3, -1, -1, 5, 6]) -print("One-to-one mappings b -> a", b2a) # array([0, 1, 2, 3, -1, 6, 7]) -print("Many-to-one mappings a -> b", a2b_multi) # {4: 4, 5: 4} -print("Many-to-one mappings b-> a", b2a_multi) # {} +align = Alignment.from_strings(other_tokens, spacy_tokens) +print(f"a -> b, lengths: {align.x2y.lengths}") # array([1, 1, 1, 1, 1, 1, 1, 1]) +print(f"a -> b, mapping: {align.x2y.dataXd}") # array([0, 1, 2, 3, 4, 4, 5, 6]) : two tokens both refer to "'s" +print(f"b -> a, lengths: {align.y2x.lengths}") # array([1, 1, 1, 1, 2, 1, 1]) : the token "'s" refers to two tokens +print(f"b -> a, mappings: {align.y2x.dataXd}") # array([0, 1, 2, 3, 4, 5, 6, 7]) ``` Here are some insights from the alignment information generated in the example above: -- The edit distance (cost) is `3`: two deletions and one insertion. - The one-to-one mappings for the first four tokens are identical, which means they map to each other. This makes sense because they're also identical in the input: `"i"`, `"listened"`, `"to"` and `"obama"`. 
-- The index mapped to `a2b[6]` is `5`, which means that `other_tokens[6]` +- The value of `x2y.dataXd[6]` is `5`, which means that `other_tokens[6]` (`"podcasts"`) aligns to `spacy_tokens[5]` (also `"podcasts"`). -- `a2b[4]` is `-1`, which means that there is no one-to-one alignment for the - token at `other_tokens[4]`. The token `"'"` doesn't exist on its own in - `spacy_tokens`. The same goes for `a2b[5]` and `other_tokens[5]`, i.e. `"s"`. -- The dictionary `a2b_multi` shows that both tokens 4 and 5 of `other_tokens` - (`"'"` and `"s"`) align to token 4 of `spacy_tokens` (`"'s"`). -- The dictionary `b2a_multi` shows that there are no tokens in `spacy_tokens` - that map to multiple tokens in `other_tokens`. +- `x2y.dataXd[4]` and `x2y.dataXd[5]` are both `4`, which means that both tokens + 4 and 5 of `other_tokens` (`"'"` and `"s"`) align to token 4 of `spacy_tokens` + (`"'s"`). From 30f316c688953959a42c79d8e0cec6b891348c0c Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 6 Aug 2020 00:51:55 +0200 Subject: [PATCH 3/3] Fix server-side rendering [ci skip] --- website/src/components/quickstart.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/src/components/quickstart.js b/website/src/components/quickstart.js index f1d3616a5..f886ee7b3 100644 --- a/website/src/components/quickstart.js +++ b/website/src/components/quickstart.js @@ -1,7 +1,7 @@ import React, { Fragment, useState, useEffect, useRef } from 'react' import PropTypes from 'prop-types' import classNames from 'classnames' -import { window } from 'browser-monads' +import { window, document } from 'browser-monads' import Section from './section' import Icon from './icon'