From bf92625edead3707038fbd00e0b249ebe1f04855 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Fri, 26 Apr 2019 13:19:50 +0200
Subject: [PATCH 001/496] Update from master
---
.github/contributors/bjascob.md | 106 ++++++++++++++++++++++++++++
spacy/cli/evaluate.py | 2 +-
spacy/cli/pretrain.py | 43 +++++++-----
spacy/cli/train.py | 45 +++++++-----
spacy/displacy/__init__.py | 7 +-
spacy/lang/th/__init__.py | 10 ++-
spacy/lang/th/norm_exceptions.py | 114 +++++++++++++++++++++++++++++++
website/meta/universe.json | 22 ++++++
8 files changed, 310 insertions(+), 39 deletions(-)
create mode 100644 .github/contributors/bjascob.md
create mode 100644 spacy/lang/th/norm_exceptions.py
diff --git a/.github/contributors/bjascob.md b/.github/contributors/bjascob.md
new file mode 100644
index 000000000..4870c494a
--- /dev/null
+++ b/.github/contributors/bjascob.md
@@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+ * you hereby assign to us joint ownership, and to the extent that such
+ assignment is or becomes invalid, ineffective or unenforceable, you hereby
+ grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+ royalty-free, unrestricted license to exercise all rights under those
+ copyrights. This includes, at our option, the right to sublicense these same
+ rights to third parties through multiple levels of sublicensees or other
+ licensing arrangements;
+
+ * you agree that each of us can do all things in relation to your
+ contribution as if each of us were the sole owners, and if one of us makes
+ a derivative work of your contribution, the one who makes the derivative
+ work (or has it made will be the sole owner of that derivative work;
+
+ * you agree that you will not assert any moral rights in your contribution
+ against us, our licensees or transferees;
+
+ * you agree that we may register a copyright in your contribution and
+ exercise all ownership rights associated with it; and
+
+ * you agree that neither of us has any duty to consult with, obtain the
+ consent of, pay or render an accounting to the other for any use or
+ distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+ * make, have made, use, sell, offer to sell, import, and otherwise transfer
+ your contribution in whole or in part, alone or in combination with or
+ included in any product, work or materials arising out of the project to
+ which your contribution was submitted, and
+
+ * at our option, to sublicense these same rights to third parties through
+ multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+ * Each contribution that you submit is and shall be an original work of
+ authorship and you can legally grant the rights set out in this SCA;
+
+ * to the best of your knowledge, each contribution will not violate any
+ third party's copyrights, trademarks, patents, or other intellectual
+ property rights; and
+
+ * each contribution shall be in compliance with U.S. export control laws and
+ other applicable export and import laws. You agree to notify us if you
+ become aware of any circumstance which would make any of the foregoing
+ representations inaccurate in any respect. We may publicly disclose your
+ participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statement below. Please do NOT
+mark both statements:
+
+ * [x] I am signing on behalf of myself as an individual and no other person
+ or entity, including my employer, has or will have rights with respect to my
+ contributions.
+
+ * [ ] I am signing on behalf of my employer or a legal entity and I have the
+ actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field | Entry |
+|------------------------------- | -------------------- |
+| Name | Brad Jascob |
+| Company name (if applicable) | n/a |
+| Title or role (if applicable) | Software Engineer |
+| Date | 04/25/2019 |
+| GitHub username | bjascob |
+| Website (optional) | n/a |
diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py
index df391d730..468698e2f 100644
--- a/spacy/cli/evaluate.py
+++ b/spacy/cli/evaluate.py
@@ -17,7 +17,7 @@ from .. import displacy
gpu_id=("Use GPU", "option", "g", int),
displacy_path=("Directory to output rendered parses as HTML", "option", "dp", str),
displacy_limit=("Limit of parses to render as HTML", "option", "dl", int),
- return_scores=("Return dict containing model scores", "flag", "r", bool),
+ return_scores=("Return dict containing model scores", "flag", "R", bool),
)
def evaluate(
model,
diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py
index 0b316b47c..ef91937a6 100644
--- a/spacy/cli/pretrain.py
+++ b/spacy/cli/pretrain.py
@@ -34,7 +34,8 @@ from .. import util
max_length=("Max words per example.", "option", "xw", int),
min_length=("Min words per example.", "option", "nw", int),
seed=("Seed for random number generators", "option", "s", float),
- nr_iter=("Number of iterations to pretrain", "option", "i", int),
+ n_iter=("Number of iterations to pretrain", "option", "i", int),
+ n_save_every=("Save model every X batches.", "option", "se", int),
)
def pretrain(
texts_loc,
@@ -46,11 +47,12 @@ def pretrain(
loss_func="cosine",
use_vectors=False,
dropout=0.2,
- nr_iter=1000,
+ n_iter=1000,
batch_size=3000,
max_length=500,
min_length=5,
seed=0,
+ n_save_every=None,
):
"""
Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
@@ -115,9 +117,26 @@ def pretrain(
msg.divider("Pre-training tok2vec layer")
row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)
- for epoch in range(nr_iter):
- for batch in util.minibatch_by_words(
- ((text, None) for text in texts), size=batch_size
+
+ def _save_model(epoch, is_temp=False):
+ is_temp_str = ".temp" if is_temp else ""
+ with model.use_params(optimizer.averages):
+ with (output_dir / ("model%d%s.bin" % (epoch, is_temp_str))).open(
+ "wb"
+ ) as file_:
+ file_.write(model.tok2vec.to_bytes())
+ log = {
+ "nr_word": tracker.nr_word,
+ "loss": tracker.loss,
+ "epoch_loss": tracker.epoch_loss,
+ "epoch": epoch,
+ }
+ with (output_dir / "log.jsonl").open("a") as file_:
+ file_.write(srsly.json_dumps(log) + "\n")
+
+ for epoch in range(n_iter):
+ for batch_id, batch in enumerate(
+ util.minibatch_by_words(((text, None) for text in texts), size=batch_size)
):
docs = make_docs(
nlp,
@@ -133,17 +152,9 @@ def pretrain(
msg.row(progress, **row_settings)
if texts_loc == "-" and tracker.words_per_epoch[epoch] >= 10 ** 7:
break
- with model.use_params(optimizer.averages):
- with (output_dir / ("model%d.bin" % epoch)).open("wb") as file_:
- file_.write(model.tok2vec.to_bytes())
- log = {
- "nr_word": tracker.nr_word,
- "loss": tracker.loss,
- "epoch_loss": tracker.epoch_loss,
- "epoch": epoch,
- }
- with (output_dir / "log.jsonl").open("a") as file_:
- file_.write(srsly.json_dumps(log) + "\n")
+ if n_save_every and (batch_id % n_save_every == 0):
+ _save_model(epoch, is_temp=True)
+ _save_model(epoch)
tracker.epoch_loss = 0.0
if texts_loc != "-":
# Reshuffle the texts if texts were loaded from a file
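The new `--n-save-every` option writes an intermediate `model{epoch}.temp.bin` checkpoint every N batches, in addition to the `model{epoch}.bin` written at the end of each epoch. A minimal, self-contained sketch of that checkpointing pattern (not part of the patch; the helper name is illustrative and plain strings stand in for the files `_save_model()` writes):

```python
# Sketch of the "save every N batches" pattern used in pretrain() above.
def run_epoch(n_batches, epoch, n_save_every=None):
    saved = []
    for batch_id in range(n_batches):
        # ... gradient update on the batch would happen here ...
        if n_save_every and (batch_id % n_save_every == 0):
            saved.append("model%d.temp.bin" % epoch)  # periodic temp checkpoint
    saved.append("model%d.bin" % epoch)  # end-of-epoch checkpoint
    return saved

print(run_epoch(n_batches=10, epoch=0, n_save_every=4))
# ['model0.temp.bin', 'model0.temp.bin', 'model0.temp.bin', 'model0.bin']
```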
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 5cf0f5f6f..63c6242de 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -35,7 +35,12 @@ from .. import about
pipeline=("Comma-separated names of pipeline components", "option", "p", str),
vectors=("Model to load vectors from", "option", "v", str),
n_iter=("Number of iterations", "option", "n", int),
- early_stopping_iter=("Maximum number of training epochs without dev accuracy improvement", "option", "e", int),
+ n_early_stopping=(
+ "Maximum number of training epochs without dev accuracy improvement",
+ "option",
+ "ne",
+ int,
+ ),
n_examples=("Number of examples", "option", "ns", int),
use_gpu=("Use GPU", "option", "g", int),
version=("Model version", "option", "V", str),
@@ -75,7 +80,7 @@ def train(
pipeline="tagger,parser,ner",
vectors=None,
n_iter=30,
- early_stopping_iter=None,
+ n_early_stopping=None,
n_examples=0,
use_gpu=-1,
version="0.0.0",
@@ -226,7 +231,7 @@ def train(
msg.row(["-" * width for width in row_settings["widths"]], **row_settings)
try:
iter_since_best = 0
- best_score = 0.
+ best_score = 0.0
for i in range(n_iter):
train_docs = corpus.train_docs(
nlp, noise_level=noise_level, gold_preproc=gold_preproc, max_length=0
@@ -335,17 +340,23 @@ def train(
gpu_wps=gpu_wps,
)
msg.row(progress, **row_settings)
- # early stopping
- if early_stopping_iter is not None:
+ # Early stopping
+ if n_early_stopping is not None:
current_score = _score_for_model(meta)
if current_score < best_score:
iter_since_best += 1
else:
iter_since_best = 0
best_score = current_score
- if iter_since_best >= early_stopping_iter:
- msg.text("Early stopping, best iteration is: {}".format(i-iter_since_best))
- msg.text("Best score = {}; Final iteration score = {}".format(best_score, current_score))
+ if iter_since_best >= n_early_stopping:
+ msg.text(
+ "Early stopping, best iteration "
+ "is: {}".format(i - iter_since_best)
+ )
+ msg.text(
+ "Best score = {}; Final iteration "
+ "score = {}".format(best_score, current_score)
+ )
break
finally:
with nlp.use_params(optimizer.averages):
@@ -356,19 +367,21 @@ def train(
best_model_path = _collate_best_model(meta, output_path, nlp.pipe_names)
msg.good("Created best model", best_model_path)
+
def _score_for_model(meta):
""" Returns mean score between tasks in pipeline that can be used for early stopping. """
mean_acc = list()
- pipes = meta['pipeline']
- acc = meta['accuracy']
- if 'tagger' in pipes:
- mean_acc.append(acc['tags_acc'])
- if 'parser' in pipes:
- mean_acc.append((acc['uas']+acc['las']) / 2)
- if 'ner' in pipes:
- mean_acc.append((acc['ents_p']+acc['ents_r']+acc['ents_f']) / 3)
+ pipes = meta["pipeline"]
+ acc = meta["accuracy"]
+ if "tagger" in pipes:
+ mean_acc.append(acc["tags_acc"])
+ if "parser" in pipes:
+ mean_acc.append((acc["uas"] + acc["las"]) / 2)
+ if "ner" in pipes:
+ mean_acc.append((acc["ents_p"] + acc["ents_r"] + acc["ents_f"]) / 3)
return sum(mean_acc) / len(mean_acc)
+
@contextlib.contextmanager
def _create_progress_bar(total):
if int(os.environ.get("LOG_FRIENDLY", 0)):
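The new `--n-early-stopping` option counts iterations without improvement of the mean accuracy returned by `_score_for_model()` and stops training once that count reaches the limit. A self-contained sketch of the same counter logic, using a made-up list of dev scores (helper name and scores are illustrative only):

```python
# Sketch of the early-stopping counter added to train() above: stop after
# n_early_stopping iterations without an improved dev score.
def best_iteration(scores, n_early_stopping):
    best_score, iter_since_best = 0.0, 0
    for i, current_score in enumerate(scores):
        if current_score < best_score:
            iter_since_best += 1
        else:
            iter_since_best = 0
            best_score = current_score
        if iter_since_best >= n_early_stopping:
            return i - iter_since_best  # index of the best iteration
    return len(scores) - 1

# Improvement stalls after iteration 2, so training would stop at iteration 5.
print(best_iteration([0.70, 0.75, 0.80, 0.79, 0.78, 0.77], n_early_stopping=3))  # 2
```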
diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py
index fadbaaa7e..b651c0996 100644
--- a/spacy/displacy/__init__.py
+++ b/spacy/displacy/__init__.py
@@ -19,7 +19,7 @@ RENDER_WRAPPER = None
def render(
- docs, style="dep", page=False, minify=False, jupyter=False, options={}, manual=False
+ docs, style="dep", page=False, minify=False, jupyter=None, options={}, manual=False
):
"""Render displaCy visualisation.
@@ -27,7 +27,7 @@ def render(
style (unicode): Visualisation style, 'dep' or 'ent'.
page (bool): Render markup as full HTML page.
minify (bool): Minify HTML markup.
- jupyter (bool): Experimental, use Jupyter's `display()` to output markup.
+ jupyter (bool): Override Jupyter auto-detection.
options (dict): Visualiser-specific options, e.g. colors.
manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
RETURNS (unicode): Rendered HTML markup.
@@ -53,7 +53,8 @@ def render(
html = _html["parsed"]
if RENDER_WRAPPER is not None:
html = RENDER_WRAPPER(html)
- if jupyter or is_in_jupyter(): # return HTML rendered by IPython display()
+ if jupyter or (jupyter is None and is_in_jupyter()):
+ # return HTML rendered by IPython display()
from IPython.core.display import display, HTML
return display(HTML(html))
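With the default changed to `jupyter=None`, `displacy.render` auto-detects notebook environments, and an explicit `True`/`False` now overrides the detection in either direction. For example (assuming an installed model such as `en_core_web_sm`):

```python
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")  # assumes this model is installed
doc = nlp("This is a sentence.")

# Force raw HTML markup even when running inside a Jupyter notebook
html = displacy.render(doc, style="dep", jupyter=False)

# Force rendering via IPython's display(), even if auto-detection fails
# displacy.render(doc, style="dep", jupyter=True)
```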
diff --git a/spacy/lang/th/__init__.py b/spacy/lang/th/__init__.py
index 0bd8333db..ba5b86d77 100644
--- a/spacy/lang/th/__init__.py
+++ b/spacy/lang/th/__init__.py
@@ -4,11 +4,13 @@ from __future__ import unicode_literals
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS
+from .norm_exceptions import NORM_EXCEPTIONS
-from ...attrs import LANG
+from ..norm_exceptions import BASE_NORMS
+from ...attrs import LANG, NORM
from ...language import Language
from ...tokens import Doc
-from ...util import DummyTokenizer
+from ...util import DummyTokenizer, add_lookups
class ThaiTokenizer(DummyTokenizer):
@@ -33,7 +35,9 @@ class ThaiTokenizer(DummyTokenizer):
class ThaiDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda _text: "th"
-
+ lex_attr_getters[NORM] = add_lookups(
+ Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS
+ )
tokenizer_exceptions = dict(TOKENIZER_EXCEPTIONS)
tag_map = TAG_MAP
stop_words = STOP_WORDS
diff --git a/spacy/lang/th/norm_exceptions.py b/spacy/lang/th/norm_exceptions.py
new file mode 100644
index 000000000..497779cf9
--- /dev/null
+++ b/spacy/lang/th/norm_exceptions.py
@@ -0,0 +1,114 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+_exc = {
+ # Conjugation and Diversion invalid to Tonal form (ผันอักษรและเสียงไม่ตรงกับรูปวรรณยุกต์)
+ "สนุ๊กเกอร์": "สนุกเกอร์",
+    "โน๊ต": "โน้ต",
+ # Misspelled because of being lazy or hustle (สะกดผิดเพราะขี้เกียจพิมพ์ หรือเร่งรีบ)
+ "โทสับ": "โทรศัพท์",
+ "พุ่งนี้": "พรุ่งนี้",
+ # Strange (ให้ดูแปลกตา)
+ "ชะมะ": "ใช่ไหม",
+ "ชิมิ": "ใช่ไหม",
+ "ชะ": "ใช่ไหม",
+ "ช่ายมะ": "ใช่ไหม",
+ "ป่าว": "เปล่า",
+ "ป่ะ": "เปล่า",
+ "ปล่าว": "เปล่า",
+ "คัย": "ใคร",
+ "ไค": "ใคร",
+ "คราย": "ใคร",
+ "เตง": "ตัวเอง",
+ "ตะเอง": "ตัวเอง",
+ "รึ": "หรือ",
+ "เหรอ": "หรือ",
+ "หรา": "หรือ",
+ "หรอ": "หรือ",
+ "ชั้น": "ฉัน",
+ "ชั้ล": "ฉัน",
+ "ช้าน": "ฉัน",
+ "เทอ": "เธอ",
+ "เทอร์": "เธอ",
+ "เทอว์": "เธอ",
+ "แกร": "แก",
+ "ป๋ม": "ผม",
+ "บ่องตง": "บอกตรงๆ",
+ "ถ่ามตง": "ถามตรงๆ",
+ "ต่อมตง": "ตอบตรงๆ",
+ "เพิ่ล": "เพื่อน",
+ "จอบอ": "จอบอ",
+ "ดั้ย": "ได้",
+ "ขอบคุง": "ขอบคุณ",
+ "ยังงัย": "ยังไง",
+ "Inw": "เทพ",
+ "uou": "นอน",
+ "Lกรีeu": "เกรียน",
+ # Misspelled to express emotions (คำที่สะกดผิดเพื่อแสดงอารมณ์)
+ "เปงราย": "เป็นอะไร",
+ "เปนรัย": "เป็นอะไร",
+ "เปงรัย": "เป็นอะไร",
+ "เป็นอัลไล": "เป็นอะไร",
+ "ทามมาย": "ทำไม",
+ "ทามมัย": "ทำไม",
+ "จังรุย": "จังเลย",
+ "จังเยย": "จังเลย",
+ "จุงเบย": "จังเลย",
+    "มะรุ": "ไม่รู้",
+ "เฮ่ย": "เฮ้ย",
+ "เห้ย": "เฮ้ย",
+ "น่าร็อค": "น่ารัก",
+ "น่าร๊าก": "น่ารัก",
+ "ตั้ลล๊าก": "น่ารัก",
+ "คือร๊ะ": "คืออะไร",
+ "โอป่ะ": "โอเคหรือเปล่า",
+ "น่ามคาน": "น่ารำคาญ",
+ "น่ามสาร": "น่าสงสาร",
+ "วงวาร": "สงสาร",
+ "บับว่า": "แบบว่า",
+ "อัลไล": "อะไร",
+ "อิจ": "อิจฉา",
+ # Reduce rough words or Avoid to software filter (คำที่สะกดผิดเพื่อลดความหยาบของคำ หรืออาจใช้หลีกเลี่ยงการกรองคำหยาบของซอฟต์แวร์)
+ "กรู": "กู",
+ "กุ": "กู",
+ "กรุ": "กู",
+ "ตู": "กู",
+ "ตรู": "กู",
+ "มรึง": "มึง",
+ "เมิง": "มึง",
+ "มืง": "มึง",
+ "มุง": "มึง",
+ "สาด": "สัตว์",
+ "สัส": "สัตว์",
+ "สัก": "สัตว์",
+ "แสรด": "สัตว์",
+ "โคโตะ": "โคตร",
+ "โคด": "โคตร",
+ "โครต": "โคตร",
+ "โคตะระ": "โคตร",
+ "พ่อง": "พ่อมึง",
+ "แม่เมิง": "แม่มึง",
+ "เชี่ย": "เหี้ย",
+ # Imitate words (คำเลียนเสียง โดยส่วนใหญ่จะเพิ่มทัณฑฆาต หรือซ้ำตัวอักษร)
+ "แอร๊ยย": "อ๊าย",
+ "อร๊ายยย": "อ๊าย",
+ "มันส์": "มัน",
+ "วู๊วววววววว์": "วู้",
+ # Acronym (แบบคำย่อ)
+ "หมาลัย": "มหาวิทยาลัย",
+ "วิดวะ": "วิศวะ",
+    "สินสาด": "ศิลปศาสตร์",
+    "สินกำ": "ศิลปกรรมศาสตร์",
+    "เสารีย์": "อนุเสาวรีย์ชัยสมรภูมิ",
+    "เมกา": "อเมริกา",
+    "มอไซค์": "มอเตอร์ไซค์",
+}
+
+
+NORM_EXCEPTIONS = {}
+
+for string, norm in _exc.items():
+ NORM_EXCEPTIONS[string] = norm
+ NORM_EXCEPTIONS[string.title()] = norm
+
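These exceptions are wired into the `NORM` lexical attribute via `add_lookups` in `spacy/lang/th/__init__.py` above: each lookup table is checked in the order given, and the default getter is only used when no table contains the string. A dependency-free sketch of that precedence, with toy tables instead of the Thai data (the helper below mirrors `spacy.util.add_lookups` rather than importing it):

```python
# Sketch of how layered NORM lookups resolve, mirroring spacy.util.add_lookups:
# consult each table in order, fall back to the default getter otherwise.
def add_lookups(default_getter, *tables):
    def get_norm(string):
        for table in tables:
            if string in table:
                return table[string]
        return default_getter(string)
    return get_norm

base_norms = {"’s": "'s"}               # toy stand-in for BASE_NORMS
norm_exceptions = {"gonna": "going to"} # toy stand-in for NORM_EXCEPTIONS

get_norm = add_lookups(lambda s: s.lower(), base_norms, norm_exceptions)
print(get_norm("’s"))     # 's
print(get_norm("gonna"))  # going to
print(get_norm("Hello"))  # hello
```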
diff --git a/website/meta/universe.json b/website/meta/universe.json
index 29e050964..a6a8bf247 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -1316,6 +1316,28 @@
"author_links": {
"github": "oterrier"
}
+ },
+ {
+ "id": "pyInflect",
+ "slogan": "A python module for word inflections",
+ "description": "This package uses the [spaCy 2.0 extensions](https://spacy.io/usage/processing-pipelines#extensions) to add word inflections to the system.",
+ "github": "bjascob/pyInflect",
+ "pip": "pyinflect",
+ "code_example": [
+ "import spacy",
+ "import pyinflect",
+ "",
+ "nlp = spacy.load('en_core_web_sm')",
+ "doc = nlp('This is an example.')",
+ "doc[3].tag_ # NN",
+ "doc[3]._.inflect('NNS') # examples"
+ ],
+ "author": "Brad Jascob",
+ "author_links": {
+ "github": "bjascob"
+ },
+ "category": ["pipeline"],
+ "tags": ["inflection"]
}
],
"categories": [
From 4762f5606276f77b6cd2c11d4279eaaf5b7bc463 Mon Sep 17 00:00:00 2001
From: Bram Vanroy
Date: Mon, 6 May 2019 21:08:01 +0200
Subject: [PATCH 002/496] Re-added Universe readme (#3688) (closes #3680)
---
website/UNIVERSE.md | 95 +++++++++++++++++++++++++++++++
website/src/templates/universe.js | 2 +-
2 files changed, 96 insertions(+), 1 deletion(-)
create mode 100644 website/UNIVERSE.md
diff --git a/website/UNIVERSE.md b/website/UNIVERSE.md
new file mode 100644
index 000000000..c26c0fce4
--- /dev/null
+++ b/website/UNIVERSE.md
@@ -0,0 +1,95 @@
+
+
+# spaCy Universe
+
+The [spaCy Universe](https://spacy.io/universe) collects the many great resources developed with or for spaCy. It
+includes standalone packages, plugins, extensions, educational materials,
+operational utilities and bindings for other languages.
+
+If you have a project that you want the spaCy community to make use of, you can
+suggest it by submitting a pull request to this repository. The Universe
+database is open-source and collected in a simple JSON file.
+
+Looking for inspiration for your own spaCy plugin or extension? Check out the
+[`project idea`](https://github.com/explosion/spaCy/labels/project%20idea) label
+on the issue tracker.
+
+## Checklist
+
+### Projects
+
+✅ Libraries and packages should be **open-source** (with a user-friendly license) and at least somewhat **documented** (e.g. a simple `README` with usage instructions).
+
+✅ We're happy to include work in progress and prereleases, but we'd like to keep the emphasis on projects that should be useful to the community **right away**.
+
+✅ Demos and visualizers should be available via a **public URL**.
+
+### Educational Materials
+
+✅ Books should be **available for purchase or download** (not just pre-order). Ebooks and self-published books are fine, too, if they include enough substantial content.
+
+✅ The `"url"` of book entries should either point to the publisher's website or a reseller of your choice (ideally one that ships worldwide or as close as possible).
+
+✅ If an online course is only available behind a paywall, it should at least have a **free excerpt** or chapter available, so users know what to expect.
+
+## JSON format
+
+To add a project, fork this repository, edit the [`universe.json`](universe.json)
+and add an object of the following format to the list of `"resources"`. Before
+you submit your pull request, make sure to use a linter to verify that your
+markup is correct. We'll also be adding linting for the `universe.json` to our
+automated GitHub checks soon.
+
+```json
+{
+ "id": "unique-project-id",
+ "title": "Project title",
+ "slogan": "A short summary",
+  "description": "A longer description – *Markdown allowed!*",
+ "github": "user/repo",
+ "pip": "package-name",
+ "code_example": [
+ "import spacy",
+ "import package_name",
+ "",
+ "nlp = spacy.load('en')",
+ "nlp.add_pipe(package_name)"
+ ],
+ "code_language": "python",
+ "url": "https://example.com",
+ "thumb": "https://example.com/thumb.jpg",
+ "image": "https://example.com/image.jpg",
+ "author": "Your Name",
+ "author_links": {
+ "twitter": "username",
+ "github": "username",
+ "website": "https://example.com"
+ },
+ "category": ["pipeline", "standalone"],
+ "tags": ["some-tag", "etc"]
+}
+```
+
+| Field | Type | Description |
+| --- | --- | --- |
+| `id` | string | Unique ID of the project. |
+| `title` | string | Project title. If not set, the `id` will be used as the display title. |
+| `slogan` | string | A short description of the project. Displayed in the overview and under the title. |
+| `description` | string | A longer description of the project. Markdown is allowed, but should be limited to basic formatting like bold, italics, code or links. |
+| `github` | string | Associated GitHub repo in the format `user/repo`. Will be displayed as a link and used for release, license and star badges. |
+| `pip` | string | Package name on pip. If available, the installation command will be displayed. |
+| `cran` | string | For R packages: package name on CRAN. If available, the installation command will be displayed. |
+| `code_example` | array | Short example that shows how to use the project. Formatted as an array with one string per line. |
+| `code_language` | string | Defaults to `'python'`. Optional code language used for syntax highlighting with [Prism](http://prismjs.com/). |
+| `url` | string | Optional project link to display as button. |
+| `thumb` | string | Optional URL to project thumbnail to display in overview and project header. Recommended size is 100x100px. |
+| `image` | string | Optional URL to project image to display with description. |
+| `author` | string | Name(s) of project author(s). |
+| `author_links` | object | Usernames and links to display as icons to author info. Currently supports `twitter` and `github` usernames, as well as `website` link. |
+| `category` | list | One or more categories to assign to project. Must be one of the available options. |
+| `tags` | list | Still experimental and not used for filtering: one or more tags to assign to project. |
+
+To separate them from the projects, educational materials also specify
+`"type": "education"`. Books can also set a `"cover"` field containing a URL
+to a cover image. If available, it's used in the overview and displayed on
+the individual book page.
\ No newline at end of file
diff --git a/website/src/templates/universe.js b/website/src/templates/universe.js
index 644a2de17..379b1a541 100644
--- a/website/src/templates/universe.js
+++ b/website/src/templates/universe.js
@@ -125,7 +125,7 @@ const UniverseContent = ({ content = [], categories, pageContext, location, mdxC
-
+
Read the docs
From 61829f1e79b92890c2cfa0590b0ad667bb831e25 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Thu, 9 May 2019 15:36:29 +0200
Subject: [PATCH 003/496] Fix typo
---
website/docs/api/top-level.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md
index 57af729f0..924aca283 100644
--- a/website/docs/api/top-level.md
+++ b/website/docs/api/top-level.md
@@ -351,7 +351,7 @@ the two-letter language code.
| `name` | unicode | Two-letter language code, e.g. `'en'`. |
| `cls` | `Language` | The language class, e.g. `English`. |
-### util.lang_class_is_loaded (#util.lang_class_is_loaded tag="function" new="2.1")
+### util.lang_class_is_loaded {#util.lang_class_is_loaded tag="function" new="2.1"}
Check whether a `Language` class is already loaded. `Language` classes are
loaded lazily, to avoid expensive setup code associated with the language data.
From f256bfbcc407a565897ed5483a552707386aeae1 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Fri, 10 May 2019 14:06:06 +0200
Subject: [PATCH 004/496] Add version tag to `--base-model` argument (closes
#3720)
---
website/docs/api/cli.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md
index 6d3a33c49..d9886004a 100644
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@@ -210,7 +210,7 @@ $ python -m spacy train [lang] [output_path] [train_path] [dev_path]
| `output_path` | positional | Directory to store model in. Will be created if it doesn't exist. |
| `train_path` | positional | Location of JSON-formatted training data. Can be a file or a directory of files. |
| `dev_path` | positional | Location of JSON-formatted development data for evaluation. Can be a file or a directory of files. |
-| `--base-model`, `-b` | option | Optional name of base model to update. Can be any loadable spaCy model. |
+| `--base-model`, `-b` 2.1 | option | Optional name of base model to update. Can be any loadable spaCy model. |
| `--pipeline`, `-p` 2.1 | option | Comma-separated names of pipeline components to train. Defaults to `'tagger,parser,ner'`. |
| `--vectors`, `-v` | option | Model to load vectors from. |
| `--n-iter`, `-n` | option | Number of iterations (default: `30`). |
From 914f4b2938be4eb0a7dd4d148854617325fd3f48 Mon Sep 17 00:00:00 2001
From: Aaron Kub
Date: Fri, 10 May 2019 08:23:52 -0400
Subject: [PATCH 005/496] fixing regex matcher examples (#3708) (#3719)
---
.github/contributors/aaronkub.md | 106 ++++++++++++++++++++++
website/docs/usage/rule-based-matching.md | 5 +-
2 files changed, 109 insertions(+), 2 deletions(-)
create mode 100644 .github/contributors/aaronkub.md
diff --git a/.github/contributors/aaronkub.md b/.github/contributors/aaronkub.md
new file mode 100644
index 000000000..c2a7f494e
--- /dev/null
+++ b/.github/contributors/aaronkub.md
@@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+ * you hereby assign to us joint ownership, and to the extent that such
+ assignment is or becomes invalid, ineffective or unenforceable, you hereby
+ grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+ royalty-free, unrestricted license to exercise all rights under those
+ copyrights. This includes, at our option, the right to sublicense these same
+ rights to third parties through multiple levels of sublicensees or other
+ licensing arrangements;
+
+ * you agree that each of us can do all things in relation to your
+ contribution as if each of us were the sole owners, and if one of us makes
+ a derivative work of your contribution, the one who makes the derivative
+ work (or has it made will be the sole owner of that derivative work;
+
+ * you agree that you will not assert any moral rights in your contribution
+ against us, our licensees or transferees;
+
+ * you agree that we may register a copyright in your contribution and
+ exercise all ownership rights associated with it; and
+
+ * you agree that neither of us has any duty to consult with, obtain the
+ consent of, pay or render an accounting to the other for any use or
+ distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+ * make, have made, use, sell, offer to sell, import, and otherwise transfer
+ your contribution in whole or in part, alone or in combination with or
+ included in any product, work or materials arising out of the project to
+ which your contribution was submitted, and
+
+ * at our option, to sublicense these same rights to third parties through
+ multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+ * Each contribution that you submit is and shall be an original work of
+ authorship and you can legally grant the rights set out in this SCA;
+
+ * to the best of your knowledge, each contribution will not violate any
+ third party's copyrights, trademarks, patents, or other intellectual
+ property rights; and
+
+ * each contribution shall be in compliance with U.S. export control laws and
+ other applicable export and import laws. You agree to notify us if you
+ become aware of any circumstance which would make any of the foregoing
+ representations inaccurate in any respect. We may publicly disclose your
+ participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statement below. Please do NOT
+mark both statements:
+
+ * [x] I am signing on behalf of myself as an individual and no other person
+ or entity, including my employer, has or will have rights with respect to my
+ contributions.
+
+ * [ ] I am signing on behalf of my employer or a legal entity and I have the
+ actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field | Entry |
+|------------------------------- | -------------------- |
+| Name | Aaron Kub |
+| Company name (if applicable) | |
+| Title or role (if applicable) | |
+| Date | 2019-05-09 |
+| GitHub username | aaronkub |
+| Website (optional) | |
diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md
index 37626f6a4..a0959bfbc 100644
--- a/website/docs/usage/rule-based-matching.md
+++ b/website/docs/usage/rule-based-matching.md
@@ -214,7 +214,8 @@ example, you might want to match different spellings of a word, without having
to add a new pattern for each spelling.
```python
-pattern = [{"TEXT": {"REGEX": "^([Uu](\\.?|nited) ?[Ss](\\.?|tates)"}},
+pattern = [{"TEXT": {"REGEX": "^[Uu](\\.?|nited)$"}},
+ {"TEXT": {"REGEX": "^[Ss](\\.?|tates)$"}},
{"LOWER": "president"}]
```
@@ -227,7 +228,7 @@ attributes:
pattern = [{"TAG": {"REGEX": "^V"}}]
# Match custom attribute values with regular expressions
-pattern = [{"_": {"country": {"REGEX": "^([Uu](\\.?|nited) ?[Ss](\\.?|tates)"}}}]
+pattern = [{"_": {"country": {"REGEX": "^[Uu](\\.?|nited) ?[Ss](\\.?|tates)$"}}}]
```
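The corrected patterns anchor each regular expression to a single token, so "United States" is matched as two tokens followed by "president". A short sketch of the fixed pattern in use (the match key `"US_PRESIDENT"` is just an illustrative name; `matcher.add()` is shown with the spaCy v2.x signature):

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
pattern = [{"TEXT": {"REGEX": "^[Uu](\\.?|nited)$"}},
           {"TEXT": {"REGEX": "^[Ss](\\.?|tates)$"}},
           {"LOWER": "president"}]
matcher.add("US_PRESIDENT", None, pattern)  # v2.x signature: (key, on_match, *patterns)

doc = nlp("The United States president gave a speech.")
for match_id, start, end in matcher(doc):
    print(doc[start:end].text)  # United States president
```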
From 377ab1cffb9074e173a77e6163948fb8a5aa7b89 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Sat, 11 May 2019 15:22:34 +0200
Subject: [PATCH 006/496] Improve Token.prob and Lexeme.prob docs (resolves
#3701)
---
website/docs/api/cython-structs.md | 2 +-
website/docs/api/lexeme.md | 2 +-
website/docs/api/token.md | 2 +-
3 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/website/docs/api/cython-structs.md b/website/docs/api/cython-structs.md
index 1d3139a96..0e427a8d5 100644
--- a/website/docs/api/cython-structs.md
+++ b/website/docs/api/cython-structs.md
@@ -172,7 +172,7 @@ struct.
| `prefix` | `attr_t` | Length-N substring from the start of the lexeme. Defaults to `N=1`. |
| `suffix` | `attr_t` | Length-N substring from the end of the lexeme. Defaults to `N=3`. |
| `cluster` | `attr_t` | Brown cluster ID. |
-| `prob` | `float` | Smoothed log probability estimate of the lexeme's type. |
+| `prob` | `float` | Smoothed log probability estimate of the lexeme's word type (context-independent entry in the vocabulary). |
| `sentiment` | `float` | A scalar value indicating positivity or negativity. |
### Lexeme.get_struct_attr {#lexeme_get_struct_attr tag="staticmethod, nogil" source="spacy/lexeme.pxd"}
diff --git a/website/docs/api/lexeme.md b/website/docs/api/lexeme.md
index d5e5c54b8..5ec2aaf0c 100644
--- a/website/docs/api/lexeme.md
+++ b/website/docs/api/lexeme.md
@@ -161,6 +161,6 @@ The L2 norm of the lexeme's vector representation.
| `is_stop` | bool | Is the lexeme part of a "stop list"? |
| `lang` | int | Language of the parent vocabulary. |
| `lang_` | unicode | Language of the parent vocabulary. |
-| `prob` | float | Smoothed log probability estimate of the lexeme's type. |
+| `prob` | float | Smoothed log probability estimate of the lexeme's word type (context-independent entry in the vocabulary). |
| `cluster` | int | Brown cluster ID. |
| `sentiment` | float | A scalar value indicating the positivity or negativity of the lexeme. |
diff --git a/website/docs/api/token.md b/website/docs/api/token.md
index a4607b186..2085a02c6 100644
--- a/website/docs/api/token.md
+++ b/website/docs/api/token.md
@@ -465,7 +465,7 @@ The L2 norm of the token's vector representation.
| `dep_` | unicode | Syntactic dependency relation. |
| `lang` | int | Language of the parent document's vocabulary. |
| `lang_` | unicode | Language of the parent document's vocabulary. |
-| `prob` | float | Smoothed log probability estimate of token's type. |
+| `prob` | float | Smoothed log probability estimate of token's word type (context-independent entry in the vocabulary). |
| `idx` | int | The character offset of the token within the parent document. |
| `sentiment` | float | A scalar value indicating the positivity or negativity of the token. |
| `lex_id` | int | Sequential ID of the token's lexical type. |
From 7819404127804db2d76ac2eb8e1569f22f2b1d2f Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Sat, 11 May 2019 15:37:30 +0200
Subject: [PATCH 007/496] Fix DependencyParser.predict docs (resolves #3561)
---
website/docs/api/dependencyparser.md | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/website/docs/api/dependencyparser.md b/website/docs/api/dependencyparser.md
index 329f96ead..58acc4425 100644
--- a/website/docs/api/dependencyparser.md
+++ b/website/docs/api/dependencyparser.md
@@ -102,10 +102,10 @@ Apply the pipeline's model to a batch of docs, without modifying them.
> scores = parser.predict([doc1, doc2])
> ```
-| Name | Type | Description |
-| ----------- | -------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `docs` | iterable | The documents to predict. |
-| **RETURNS** | tuple | A `(scores, tensors)` tuple where `scores` is the model's prediction for each document and `tensors` is the token representations used to predict the scores. Each tensor is an array with one row for each token in the document. |
+| Name | Type | Description |
+| ----------- | ------------------- | ---------------------------------------------- |
+| `docs` | iterable | The documents to predict. |
+| **RETURNS** | `syntax.StateClass` | A helper class for the parse state (internal). |
## DependencyParser.set_annotations {#set_annotations tag="method"}
From 0e680046ac3ced5ed1aeb065a11ef16ada1e57b3 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Fri, 2 Aug 2019 21:44:26 +0200
Subject: [PATCH 008/496] Update languages.json
---
website/meta/languages.json | 11 +++++++++--
1 file changed, 9 insertions(+), 2 deletions(-)
diff --git a/website/meta/languages.json b/website/meta/languages.json
index ef336ef5f..549bd058b 100644
--- a/website/meta/languages.json
+++ b/website/meta/languages.json
@@ -3,14 +3,21 @@
{
"code": "en",
"name": "English",
- "models": ["en_core_web_sm", "en_core_web_md", "en_core_web_lg", "en_vectors_web_lg"],
+ "models": [
+ "en_core_web_sm",
+ "en_core_web_md",
+ "en_core_web_lg",
+ "en_vectors_web_lg",
+ "en_pytt_bertbaseuncased_lg",
+ "en_pytt_xlnetbasecased_lg"
+ ],
"example": "This is a sentence.",
"has_examples": true
},
{
"code": "de",
"name": "German",
- "models": ["de_core_news_sm", "de_core_news_md"],
+ "models": ["de_core_news_sm", "de_core_news_md", "de_pytt_bertbasecased_lg"],
"example": "Dies ist ein Satz.",
"has_examples": true
},
From 95d63c74b4c007bf77fcfba84de6e6fbf7eeb98c Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Wed, 7 Aug 2019 00:47:40 +0200
Subject: [PATCH 009/496] Update site.json
---
website/meta/site.json | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/website/meta/site.json b/website/meta/site.json
index 1820ff5df..7ec146cf5 100644
--- a/website/meta/site.json
+++ b/website/meta/site.json
@@ -29,7 +29,7 @@
"spacyVersion": "2.1",
"binderUrl": "ines/spacy-io-binder",
"binderBranch": "live",
- "binderVersion": "2.1.3",
+ "binderVersion": "2.1.7",
"sections": [
{ "id": "usage", "title": "Usage Documentation", "theme": "blue" },
{ "id": "models", "title": "Models Documentation", "theme": "blue" },
From b6a509a8d159fd8ec8a4f2fca234b0545095eb27 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Thu, 26 Sep 2019 16:23:02 +0200
Subject: [PATCH 010/496] Fix tag
---
website/docs/api/vectors.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/website/docs/api/vectors.md b/website/docs/api/vectors.md
index ffc1fc083..c04085091 100644
--- a/website/docs/api/vectors.md
+++ b/website/docs/api/vectors.md
@@ -211,7 +211,7 @@ Iterate over `(key, vector)` pairs, in order.
| ---------- | ----- | -------------------------------- |
| **YIELDS** | tuple | `(key, vector)` pairs, in order. |
-## Vectors.find (#find tag="method")
+## Vectors.find {#find tag="method"}
Look up one or more keys by row, or vice versa.
From 06d8c3a20f8fd198239052efc3bc314f03f45c0f Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Mon, 30 Sep 2019 13:14:48 +0200
Subject: [PATCH 011/496] Revert "Merge branch 'master' into spacy.io"
This reverts commit c8bb08b5453da865dda4c54ad3c03ae8817d2c6a, reversing
changes made to b6a509a8d159fd8ec8a4f2fca234b0545095eb27.
---
.github/contributors/EARL_GREYT.md | 106 -
.github/contributors/Hazoom.md | 106 -
.github/contributors/jaydeepborkar.md | 106 -
.github/contributors/seanBE.md | 106 -
.github/contributors/zqianem.md | 106 -
CONTRIBUTING.md | 5 +-
Makefile | 16 +-
README.md | 13 +-
azure-pipelines.yml | 18 +-
bin/get-version.sh | 12 -
bin/ud/run_eval.py | 36 +-
bin/ud/ud_run_test.py | 8 +-
bin/ud/ud_train.py | 105 +-
examples/pipeline/dummy_entity_linking.py | 0
examples/pipeline/wikidata_entity_linking.py | 0
examples/training/pretrain_kb.py | 5 +-
.../training/textcat_example_data/CC0.txt | 121 -
.../textcat_example_data/CC_BY-SA-3.0.txt | 359 --
.../textcat_example_data/CC_BY-SA-4.0.txt | 428 --
.../training/textcat_example_data/README.md | 34 -
.../textcat_example_data/cooking.json | 3487 -----------------
.../textcat_example_data/cooking.jsonl | 10 -
.../jigsaw-toxic-comment.json | 2987 --------------
.../jigsaw-toxic-comment.jsonl | 10 -
.../textcatjsonl_to_trainjson.py | 53 -
examples/training/train_entity_linker.py | 4 +-
examples/training/training-data.json | 2 +-
fabfile.py | 122 +-
requirements.txt | 6 +-
setup.py | 11 +-
spacy/_ml.py | 161 +-
spacy/about.py | 12 +-
spacy/attrs.pyx | 6 +-
spacy/cli/debug_data.py | 66 +-
spacy/cli/download.py | 27 +-
spacy/cli/evaluate.py | 1 -
spacy/cli/init_model.py | 24 +-
spacy/cli/train.py | 259 +-
spacy/errors.py | 25 +-
spacy/glossary.py | 6 -
spacy/gold.pxd | 1 -
spacy/gold.pyx | 168 +-
spacy/kb.pyx | 2 +-
spacy/lang/char_classes.py | 4 +-
spacy/lang/de/__init__.py | 14 -
spacy/lang/de/tag_map.py | 6 +-
spacy/lang/el/lemmatizer/__init__.py | 7 +-
spacy/lang/en/__init__.py | 8 -
spacy/lang/en/lemmatizer/lemma_lookup.json | 2 +-
spacy/lang/en/morph_rules.py | 53 +-
spacy/lang/en/tag_map.py | 20 +-
spacy/lang/en/tokenizer_exceptions.py | 9 +-
spacy/lang/fr/lemmatizer/__init__.py | 6 +-
spacy/lang/hi/stop_words.py | 8 +-
spacy/lang/ja/__init__.py | 23 +-
spacy/lang/ja/tag_map.py | 4 +-
spacy/lang/ko/__init__.py | 39 +-
spacy/lang/lt/tag_map.py | 236 +-
spacy/lang/nl/lemmatizer/__init__.py | 9 +-
spacy/lang/ru/lemmatizer.py | 4 +-
spacy/lang/uk/lemmatizer.py | 4 +-
spacy/language.py | 48 +-
spacy/lemmatizer.py | 36 +-
spacy/lookups.py | 159 +-
spacy/matcher/matcher.pyx | 29 +-
spacy/matcher/phrasematcher.pxd | 26 +-
spacy/matcher/phrasematcher.pyx | 348 +-
spacy/morphology.pxd | 300 +-
spacy/morphology.pyx | 1376 ++-----
spacy/pipeline/__init__.py | 2 -
spacy/pipeline/entityruler.py | 37 +-
spacy/pipeline/morphologizer.pyx | 164 -
spacy/pipeline/pipes.pyx | 30 +-
spacy/scorer.py | 385 +-
spacy/strings.pyx | 16 +-
spacy/structs.pxd | 48 -
spacy/syntax/arc_eager.pyx | 1 -
spacy/syntax/ner.pyx | 53 +-
spacy/syntax/nn_parser.pyx | 26 +-
spacy/syntax/transition_system.pyx | 10 +-
spacy/tests/conftest.py | 6 -
spacy/tests/doc/test_add_entities.py | 25 +-
spacy/tests/doc/test_creation.py | 4 +-
spacy/tests/doc/test_morphanalysis.py | 33 -
spacy/tests/lang/ja/test_tokenizer.py | 7 -
spacy/tests/lang/lt/test_lemmatizer.py | 2 +-
spacy/tests/lang/ru/test_lemmatizer.py | 7 +
...{test_exceptions.py => test_еxceptions.py} | 0
spacy/tests/matcher/test_matcher_api.py | 8 -
spacy/tests/matcher/test_phrase_matcher.py | 89 +-
spacy/tests/morphology/__init__.py | 0
spacy/tests/morphology/test_morph_features.py | 48 -
spacy/tests/parser/test_ner.py | 202 +-
spacy/tests/regression/test_issue1-1000.py | 2 +-
spacy/tests/regression/test_issue1501-2000.py | 2 +-
spacy/tests/regression/test_issue2501-3000.py | 2 +-
spacy/tests/regression/test_issue3001-3500.py | 6 +-
spacy/tests/regression/test_issue4042.py | 82 -
spacy/tests/regression/test_issue4054.py | 4 +-
spacy/tests/regression/test_issue4267.py | 42 -
spacy/tests/regression/test_issue4278.py | 2 +-
spacy/tests/regression/test_issue4313.py | 39 -
spacy/tests/serialize/test_serialize_kb.py | 6 +-
spacy/tests/test_displacy.py | 2 +-
spacy/tests/test_gold.py | 29 -
spacy/tests/test_scorer.py | 75 +-
spacy/tests/vocab_vectors/test_lookups.py | 70 +-
spacy/tests/vocab_vectors/test_vectors.py | 4 +-
spacy/tokens/__init__.py | 3 +-
spacy/tokens/_retokenize.pyx | 3 +-
spacy/tokens/_serialize.py | 124 +-
spacy/tokens/doc.pyx | 87 +-
spacy/tokens/morphanalysis.pxd | 9 -
spacy/tokens/morphanalysis.pyx | 423 --
spacy/tokens/token.pyx | 12 +-
spacy/util.py | 2 +-
spacy/vectors.pyx | 2 +-
spacy/vocab.pyx | 10 +-
website/README.md | 13 +-
website/docs/api/annotation.md | 10 +-
website/docs/api/cli.md | 249 +-
website/docs/api/cython-classes.md | 6 +-
website/docs/api/cython-structs.md | 6 +-
website/docs/api/dependencyparser.md | 2 +-
website/docs/api/doc.md | 79 +-
website/docs/api/docbin.md | 149 -
website/docs/api/entitylinker.md | 300 --
website/docs/api/entityrecognizer.md | 21 +-
website/docs/api/entityruler.md | 4 +-
website/docs/api/goldparse.md | 30 +-
website/docs/api/kb.md | 268 --
website/docs/api/language.md | 28 +-
website/docs/api/lemmatizer.md | 25 +-
website/docs/api/lexeme.md | 18 +-
website/docs/api/lookups.md | 318 --
website/docs/api/matcher.md | 4 +-
website/docs/api/phrasematcher.md | 35 +-
website/docs/api/pipeline-functions.md | 8 +-
website/docs/api/scorer.md | 24 +-
website/docs/api/sentencizer.md | 2 +-
website/docs/api/span.md | 93 +-
website/docs/api/stringstore.md | 30 +-
website/docs/api/tagger.md | 17 +-
website/docs/api/textcategorizer.md | 21 +-
website/docs/api/token.md | 62 +-
website/docs/api/tokenizer.md | 17 +-
website/docs/api/top-level.md | 41 +-
website/docs/api/vectors.md | 23 +-
website/docs/api/vocab.md | 45 +-
website/docs/images/displacy-ent-snek.html | 18 -
website/docs/usage/101/_named-entities.md | 4 +-
website/docs/usage/101/_pipelines.md | 20 +-
website/docs/usage/101/_pos-deps.md | 6 +-
website/docs/usage/101/_serialization.md | 6 +-
website/docs/usage/101/_tokenization.md | 2 +-
website/docs/usage/101/_training.md | 2 +-
website/docs/usage/101/_vectors-similarity.md | 8 +-
website/docs/usage/adding-languages.md | 151 +-
website/docs/usage/facts-figures.md | 2 +-
website/docs/usage/index.md | 2 +-
website/docs/usage/linguistic-features.md | 149 +-
website/docs/usage/models.md | 20 +-
website/docs/usage/processing-pipelines.md | 31 +-
website/docs/usage/rule-based-matching.md | 42 +-
website/docs/usage/saving-loading.md | 137 +-
website/docs/usage/spacy-101.md | 140 +-
website/docs/usage/training.md | 105 +-
website/docs/usage/v2-1.md | 8 +-
website/docs/usage/v2-2.md | 351 --
website/docs/usage/v2.md | 42 +-
website/docs/usage/vectors-similarity.md | 15 +-
website/docs/usage/visualizers.md | 17 +-
website/meta/languages.json | 18 +-
website/meta/sidebars.json | 7 +-
website/meta/site.json | 1 +
website/meta/universe.json | 34 +-
website/src/components/table.js | 13 +-
website/src/styles/accordion.module.sass | 1 -
website/src/styles/code.module.sass | 1 -
website/src/styles/grid.module.sass | 2 +-
website/src/styles/table.module.sass | 3 -
website/src/templates/models.js | 69 +-
website/src/widgets/landing.js | 36 +-
website/src/widgets/quickstart-models.js | 2 +-
184 files changed, 2266 insertions(+), 15310 deletions(-)
delete mode 100644 .github/contributors/EARL_GREYT.md
delete mode 100644 .github/contributors/Hazoom.md
delete mode 100644 .github/contributors/jaydeepborkar.md
delete mode 100644 .github/contributors/seanBE.md
delete mode 100644 .github/contributors/zqianem.md
delete mode 100755 bin/get-version.sh
delete mode 100644 examples/pipeline/dummy_entity_linking.py
delete mode 100644 examples/pipeline/wikidata_entity_linking.py
delete mode 100644 examples/training/textcat_example_data/CC0.txt
delete mode 100644 examples/training/textcat_example_data/CC_BY-SA-3.0.txt
delete mode 100644 examples/training/textcat_example_data/CC_BY-SA-4.0.txt
delete mode 100644 examples/training/textcat_example_data/README.md
delete mode 100644 examples/training/textcat_example_data/cooking.json
delete mode 100644 examples/training/textcat_example_data/cooking.jsonl
delete mode 100644 examples/training/textcat_example_data/jigsaw-toxic-comment.json
delete mode 100644 examples/training/textcat_example_data/jigsaw-toxic-comment.jsonl
delete mode 100644 examples/training/textcat_example_data/textcatjsonl_to_trainjson.py
delete mode 100644 spacy/pipeline/morphologizer.pyx
delete mode 100644 spacy/tests/doc/test_morphanalysis.py
rename spacy/tests/lang/sr/{test_exceptions.py => test_еxceptions.py} (100%)
delete mode 100644 spacy/tests/morphology/__init__.py
delete mode 100644 spacy/tests/morphology/test_morph_features.py
delete mode 100644 spacy/tests/regression/test_issue4042.py
delete mode 100644 spacy/tests/regression/test_issue4267.py
delete mode 100644 spacy/tests/regression/test_issue4313.py
delete mode 100644 spacy/tokens/morphanalysis.pxd
delete mode 100644 spacy/tokens/morphanalysis.pyx
delete mode 100644 website/docs/api/docbin.md
delete mode 100644 website/docs/api/entitylinker.md
delete mode 100644 website/docs/api/kb.md
delete mode 100644 website/docs/api/lookups.md
delete mode 100644 website/docs/images/displacy-ent-snek.html
delete mode 100644 website/docs/usage/v2-2.md
diff --git a/.github/contributors/EARL_GREYT.md b/.github/contributors/EARL_GREYT.md
deleted file mode 100644
index 3ee7d4f41..000000000
--- a/.github/contributors/EARL_GREYT.md
+++ /dev/null
@@ -1,106 +0,0 @@
-# spaCy contributor agreement
-
-This spaCy Contributor Agreement (**"SCA"**) is based on the
-[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
-The SCA applies to any contribution that you make to any product or project
-managed by us (the **"project"**), and sets out the intellectual property rights
-you grant to us in the contributed materials. The term **"us"** shall mean
-[ExplosionAI GmbH](https://explosion.ai/legal). The term
-**"you"** shall mean the person or entity identified below.
-
-If you agree to be bound by these terms, fill in the information requested
-below and include the filled-in version with your first pull request, under the
-folder [`.github/contributors/`](/.github/contributors/). The name of the file
-should be your GitHub username, with the extension `.md`. For example, the user
-example_user would create the file `.github/contributors/example_user.md`.
-
-Read this agreement carefully before signing. These terms and conditions
-constitute a binding legal agreement.
-
-## Contributor Agreement
-
-1. The term "contribution" or "contributed materials" means any source code,
-object code, patch, tool, sample, graphic, specification, manual,
-documentation, or any other material posted or submitted by you to the project.
-
-2. With respect to any worldwide copyrights, or copyright applications and
-registrations, in your contribution:
-
- * you hereby assign to us joint ownership, and to the extent that such
- assignment is or becomes invalid, ineffective or unenforceable, you hereby
- grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
- royalty-free, unrestricted license to exercise all rights under those
- copyrights. This includes, at our option, the right to sublicense these same
- rights to third parties through multiple levels of sublicensees or other
- licensing arrangements;
-
- * you agree that each of us can do all things in relation to your
- contribution as if each of us were the sole owners, and if one of us makes
- a derivative work of your contribution, the one who makes the derivative
- work (or has it made will be the sole owner of that derivative work;
-
- * you agree that you will not assert any moral rights in your contribution
- against us, our licensees or transferees;
-
- * you agree that we may register a copyright in your contribution and
- exercise all ownership rights associated with it; and
-
- * you agree that neither of us has any duty to consult with, obtain the
- consent of, pay or render an accounting to the other for any use or
- distribution of your contribution.
-
-3. With respect to any patents you own, or that you can license without payment
-to any third party, you hereby grant to us a perpetual, irrevocable,
-non-exclusive, worldwide, no-charge, royalty-free license to:
-
- * make, have made, use, sell, offer to sell, import, and otherwise transfer
- your contribution in whole or in part, alone or in combination with or
- included in any product, work or materials arising out of the project to
- which your contribution was submitted, and
-
- * at our option, to sublicense these same rights to third parties through
- multiple levels of sublicensees or other licensing arrangements.
-
-4. Except as set out above, you keep all right, title, and interest in your
-contribution. The rights that you grant to us under these terms are effective
-on the date you first submitted a contribution to us, even if your submission
-took place before the date you sign these terms.
-
-5. You covenant, represent, warrant and agree that:
-
- * Each contribution that you submit is and shall be an original work of
- authorship and you can legally grant the rights set out in this SCA;
-
- * to the best of your knowledge, each contribution will not violate any
- third party's copyrights, trademarks, patents, or other intellectual
- property rights; and
-
- * each contribution shall be in compliance with U.S. export control laws and
- other applicable export and import laws. You agree to notify us if you
- become aware of any circumstance which would make any of the foregoing
- representations inaccurate in any respect. We may publicly disclose your
- participation in the project, including the fact that you have signed the SCA.
-
-6. This SCA is governed by the laws of the State of California and applicable
-U.S. Federal law. Any choice of law rules will not apply.
-
-7. Please place an “x” on one of the applicable statement below. Please do NOT
-mark both statements:
-
- * [x] I am signing on behalf of myself as an individual and no other person
- or entity, including my employer, has or will have rights with respect to my
- contributions.
-
- * [ ] I am signing on behalf of my employer or a legal entity and I have the
- actual authority to contractually bind that entity.
-
-## Contributor Details
-
-| Field | Entry |
-|------------------------------- | -------------------- |
-| Name | David Weßling |
-| Company name (if applicable) | |
-| Title or role (if applicable) | |
-| Date | 27.09.19 |
-| GitHub username | EarlGreyT |
-| Website (optional) | |
diff --git a/.github/contributors/Hazoom.md b/.github/contributors/Hazoom.md
deleted file mode 100644
index 762cb5bef..000000000
--- a/.github/contributors/Hazoom.md
+++ /dev/null
@@ -1,106 +0,0 @@
-# spaCy contributor agreement
-
-This spaCy Contributor Agreement (**"SCA"**) is based on the
-[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
-The SCA applies to any contribution that you make to any product or project
-managed by us (the **"project"**), and sets out the intellectual property rights
-you grant to us in the contributed materials. The term **"us"** shall mean
-[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term
-**"you"** shall mean the person or entity identified below.
-
-If you agree to be bound by these terms, fill in the information requested
-below and include the filled-in version with your first pull request, under the
-folder [`.github/contributors/`](/.github/contributors/). The name of the file
-should be your GitHub username, with the extension `.md`. For example, the user
-example_user would create the file `.github/contributors/example_user.md`.
-
-Read this agreement carefully before signing. These terms and conditions
-constitute a binding legal agreement.
-
-## Contributor Agreement
-
-1. The term "contribution" or "contributed materials" means any source code,
-object code, patch, tool, sample, graphic, specification, manual,
-documentation, or any other material posted or submitted by you to the project.
-
-2. With respect to any worldwide copyrights, or copyright applications and
-registrations, in your contribution:
-
- * you hereby assign to us joint ownership, and to the extent that such
- assignment is or becomes invalid, ineffective or unenforceable, you hereby
- grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
- royalty-free, unrestricted license to exercise all rights under those
- copyrights. This includes, at our option, the right to sublicense these same
- rights to third parties through multiple levels of sublicensees or other
- licensing arrangements;
-
- * you agree that each of us can do all things in relation to your
- contribution as if each of us were the sole owners, and if one of us makes
- a derivative work of your contribution, the one who makes the derivative
- work (or has it made will be the sole owner of that derivative work;
-
- * you agree that you will not assert any moral rights in your contribution
- against us, our licensees or transferees;
-
- * you agree that we may register a copyright in your contribution and
- exercise all ownership rights associated with it; and
-
- * you agree that neither of us has any duty to consult with, obtain the
- consent of, pay or render an accounting to the other for any use or
- distribution of your contribution.
-
-3. With respect to any patents you own, or that you can license without payment
-to any third party, you hereby grant to us a perpetual, irrevocable,
-non-exclusive, worldwide, no-charge, royalty-free license to:
-
- * make, have made, use, sell, offer to sell, import, and otherwise transfer
- your contribution in whole or in part, alone or in combination with or
- included in any product, work or materials arising out of the project to
- which your contribution was submitted, and
-
- * at our option, to sublicense these same rights to third parties through
- multiple levels of sublicensees or other licensing arrangements.
-
-4. Except as set out above, you keep all right, title, and interest in your
-contribution. The rights that you grant to us under these terms are effective
-on the date you first submitted a contribution to us, even if your submission
-took place before the date you sign these terms.
-
-5. You covenant, represent, warrant and agree that:
-
- * Each contribution that you submit is and shall be an original work of
- authorship and you can legally grant the rights set out in this SCA;
-
- * to the best of your knowledge, each contribution will not violate any
- third party's copyrights, trademarks, patents, or other intellectual
- property rights; and
-
- * each contribution shall be in compliance with U.S. export control laws and
- other applicable export and import laws. You agree to notify us if you
- become aware of any circumstance which would make any of the foregoing
- representations inaccurate in any respect. We may publicly disclose your
- participation in the project, including the fact that you have signed the SCA.
-
-6. This SCA is governed by the laws of the State of California and applicable
-U.S. Federal law. Any choice of law rules will not apply.
-
-7. Please place an “x” on one of the applicable statement below. Please do NOT
-mark both statements:
-
- * [x] I am signing on behalf of myself as an individual and no other person
- or entity, including my employer, has or will have rights with respect to my
- contributions.
-
- * [ ] I am signing on behalf of my employer or a legal entity and I have the
- actual authority to contractually bind that entity.
-
-## Contributor Details
-
-| Field | Entry |
-|------------------------------- | -------------------- |
-| Name | Moshe Hazoom |
-| Company name (if applicable) | Amenity Analytics |
-| Title or role (if applicable) | NLP Engineer |
-| Date | 2019-09-15 |
-| GitHub username | Hazoom |
-| Website (optional) | |
diff --git a/.github/contributors/jaydeepborkar.md b/.github/contributors/jaydeepborkar.md
deleted file mode 100644
index 32199d596..000000000
--- a/.github/contributors/jaydeepborkar.md
+++ /dev/null
@@ -1,106 +0,0 @@
-# spaCy contributor agreement
-
-This spaCy Contributor Agreement (**"SCA"**) is based on the
-[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
-The SCA applies to any contribution that you make to any product or project
-managed by us (the **"project"**), and sets out the intellectual property rights
-you grant to us in the contributed materials. The term **"us"** shall mean
-[ExplosionAI GmbH](https://explosion.ai/legal). The term
-**"you"** shall mean the person or entity identified below.
-
-If you agree to be bound by these terms, fill in the information requested
-below and include the filled-in version with your first pull request, under the
-folder [`.github/contributors/`](/.github/contributors/). The name of the file
-should be your GitHub username, with the extension `.md`. For example, the user
-example_user would create the file `.github/contributors/example_user.md`.
-
-Read this agreement carefully before signing. These terms and conditions
-constitute a binding legal agreement.
-
-## Contributor Agreement
-
-1. The term "contribution" or "contributed materials" means any source code,
-object code, patch, tool, sample, graphic, specification, manual,
-documentation, or any other material posted or submitted by you to the project.
-
-2. With respect to any worldwide copyrights, or copyright applications and
-registrations, in your contribution:
-
- * you hereby assign to us joint ownership, and to the extent that such
- assignment is or becomes invalid, ineffective or unenforceable, you hereby
- grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
- royalty-free, unrestricted license to exercise all rights under those
- copyrights. This includes, at our option, the right to sublicense these same
- rights to third parties through multiple levels of sublicensees or other
- licensing arrangements;
-
- * you agree that each of us can do all things in relation to your
- contribution as if each of us were the sole owners, and if one of us makes
- a derivative work of your contribution, the one who makes the derivative
- work (or has it made will be the sole owner of that derivative work;
-
- * you agree that you will not assert any moral rights in your contribution
- against us, our licensees or transferees;
-
- * you agree that we may register a copyright in your contribution and
- exercise all ownership rights associated with it; and
-
- * you agree that neither of us has any duty to consult with, obtain the
- consent of, pay or render an accounting to the other for any use or
- distribution of your contribution.
-
-3. With respect to any patents you own, or that you can license without payment
-to any third party, you hereby grant to us a perpetual, irrevocable,
-non-exclusive, worldwide, no-charge, royalty-free license to:
-
- * make, have made, use, sell, offer to sell, import, and otherwise transfer
- your contribution in whole or in part, alone or in combination with or
- included in any product, work or materials arising out of the project to
- which your contribution was submitted, and
-
- * at our option, to sublicense these same rights to third parties through
- multiple levels of sublicensees or other licensing arrangements.
-
-4. Except as set out above, you keep all right, title, and interest in your
-contribution. The rights that you grant to us under these terms are effective
-on the date you first submitted a contribution to us, even if your submission
-took place before the date you sign these terms.
-
-5. You covenant, represent, warrant and agree that:
-
- * Each contribution that you submit is and shall be an original work of
- authorship and you can legally grant the rights set out in this SCA;
-
- * to the best of your knowledge, each contribution will not violate any
- third party's copyrights, trademarks, patents, or other intellectual
- property rights; and
-
- * each contribution shall be in compliance with U.S. export control laws and
- other applicable export and import laws. You agree to notify us if you
- become aware of any circumstance which would make any of the foregoing
- representations inaccurate in any respect. We may publicly disclose your
- participation in the project, including the fact that you have signed the SCA.
-
-6. This SCA is governed by the laws of the State of California and applicable
-U.S. Federal law. Any choice of law rules will not apply.
-
-7. Please place an “x” on one of the applicable statement below. Please do NOT
-mark both statements:
-
- * [ ] I am signing on behalf of myself as an individual and no other person
- or entity, including my employer, has or will have rights with respect to my
- contributions.
-
- * [ ] I am signing on behalf of my employer or a legal entity and I have the
- actual authority to contractually bind that entity.
-
-## Contributor Details
-
-| Field | Entry |
-|------------------------------- | -------------------- |
-| Name | Jaydeep Borkar |
-| Company name (if applicable) | Pune University, India |
-| Title or role (if applicable) | CS Undergrad |
-| Date | 9/26/2019 |
-| GitHub username | jaydeepborkar |
-| Website (optional) | http://jaydeepborkar.github.io |
diff --git a/.github/contributors/seanBE.md b/.github/contributors/seanBE.md
deleted file mode 100644
index 5e4b4de0a..000000000
--- a/.github/contributors/seanBE.md
+++ /dev/null
@@ -1,106 +0,0 @@
-# spaCy contributor agreement
-
-This spaCy Contributor Agreement (**"SCA"**) is based on the
-[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
-The SCA applies to any contribution that you make to any product or project
-managed by us (the **"project"**), and sets out the intellectual property rights
-you grant to us in the contributed materials. The term **"us"** shall mean
-[ExplosionAI GmbH](https://explosion.ai/legal). The term
-**"you"** shall mean the person or entity identified below.
-
-If you agree to be bound by these terms, fill in the information requested
-below and include the filled-in version with your first pull request, under the
-folder [`.github/contributors/`](/.github/contributors/). The name of the file
-should be your GitHub username, with the extension `.md`. For example, the user
-example_user would create the file `.github/contributors/example_user.md`.
-
-Read this agreement carefully before signing. These terms and conditions
-constitute a binding legal agreement.
-
-## Contributor Agreement
-
-1. The term "contribution" or "contributed materials" means any source code,
-object code, patch, tool, sample, graphic, specification, manual,
-documentation, or any other material posted or submitted by you to the project.
-
-2. With respect to any worldwide copyrights, or copyright applications and
-registrations, in your contribution:
-
- * you hereby assign to us joint ownership, and to the extent that such
- assignment is or becomes invalid, ineffective or unenforceable, you hereby
- grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
- royalty-free, unrestricted license to exercise all rights under those
- copyrights. This includes, at our option, the right to sublicense these same
- rights to third parties through multiple levels of sublicensees or other
- licensing arrangements;
-
- * you agree that each of us can do all things in relation to your
- contribution as if each of us were the sole owners, and if one of us makes
- a derivative work of your contribution, the one who makes the derivative
- work (or has it made will be the sole owner of that derivative work;
-
- * you agree that you will not assert any moral rights in your contribution
- against us, our licensees or transferees;
-
- * you agree that we may register a copyright in your contribution and
- exercise all ownership rights associated with it; and
-
- * you agree that neither of us has any duty to consult with, obtain the
- consent of, pay or render an accounting to the other for any use or
- distribution of your contribution.
-
-3. With respect to any patents you own, or that you can license without payment
-to any third party, you hereby grant to us a perpetual, irrevocable,
-non-exclusive, worldwide, no-charge, royalty-free license to:
-
- * make, have made, use, sell, offer to sell, import, and otherwise transfer
- your contribution in whole or in part, alone or in combination with or
- included in any product, work or materials arising out of the project to
- which your contribution was submitted, and
-
- * at our option, to sublicense these same rights to third parties through
- multiple levels of sublicensees or other licensing arrangements.
-
-4. Except as set out above, you keep all right, title, and interest in your
-contribution. The rights that you grant to us under these terms are effective
-on the date you first submitted a contribution to us, even if your submission
-took place before the date you sign these terms.
-
-5. You covenant, represent, warrant and agree that:
-
- * Each contribution that you submit is and shall be an original work of
- authorship and you can legally grant the rights set out in this SCA;
-
- * to the best of your knowledge, each contribution will not violate any
- third party's copyrights, trademarks, patents, or other intellectual
- property rights; and
-
- * each contribution shall be in compliance with U.S. export control laws and
- other applicable export and import laws. You agree to notify us if you
- become aware of any circumstance which would make any of the foregoing
- representations inaccurate in any respect. We may publicly disclose your
- participation in the project, including the fact that you have signed the SCA.
-
-6. This SCA is governed by the laws of the State of California and applicable
-U.S. Federal law. Any choice of law rules will not apply.
-
-7. Please place an “x” on one of the applicable statement below. Please do NOT
-mark both statements:
-
- * [x] I am signing on behalf of myself as an individual and no other person
- or entity, including my employer, has or will have rights with respect to my
- contributions.
-
- * [ ] I am signing on behalf of my employer or a legal entity and I have the
- actual authority to contractually bind that entity.
-
-## Contributor Details
-
-| Field | Entry |
-|------------------------------- | ------------------------- |
-| Name | Sean Löfgren |
-| Company name (if applicable) | |
-| Title or role (if applicable) | |
-| Date | 2019-09-17 |
-| GitHub username | seanBE |
-| Website (optional) | http://seanbe.github.io |
diff --git a/.github/contributors/zqianem.md b/.github/contributors/zqianem.md
deleted file mode 100644
index 13f6ab214..000000000
--- a/.github/contributors/zqianem.md
+++ /dev/null
@@ -1,106 +0,0 @@
-# spaCy contributor agreement
-
-This spaCy Contributor Agreement (**"SCA"**) is based on the
-[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
-The SCA applies to any contribution that you make to any product or project
-managed by us (the **"project"**), and sets out the intellectual property rights
-you grant to us in the contributed materials. The term **"us"** shall mean
-[ExplosionAI GmbH](https://explosion.ai/legal). The term
-**"you"** shall mean the person or entity identified below.
-
-If you agree to be bound by these terms, fill in the information requested
-below and include the filled-in version with your first pull request, under the
-folder [`.github/contributors/`](/.github/contributors/). The name of the file
-should be your GitHub username, with the extension `.md`. For example, the user
-example_user would create the file `.github/contributors/example_user.md`.
-
-Read this agreement carefully before signing. These terms and conditions
-constitute a binding legal agreement.
-
-## Contributor Agreement
-
-1. The term "contribution" or "contributed materials" means any source code,
-object code, patch, tool, sample, graphic, specification, manual,
-documentation, or any other material posted or submitted by you to the project.
-
-2. With respect to any worldwide copyrights, or copyright applications and
-registrations, in your contribution:
-
- * you hereby assign to us joint ownership, and to the extent that such
- assignment is or becomes invalid, ineffective or unenforceable, you hereby
- grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
- royalty-free, unrestricted license to exercise all rights under those
- copyrights. This includes, at our option, the right to sublicense these same
- rights to third parties through multiple levels of sublicensees or other
- licensing arrangements;
-
- * you agree that each of us can do all things in relation to your
- contribution as if each of us were the sole owners, and if one of us makes
- a derivative work of your contribution, the one who makes the derivative
- work (or has it made will be the sole owner of that derivative work;
-
- * you agree that you will not assert any moral rights in your contribution
- against us, our licensees or transferees;
-
- * you agree that we may register a copyright in your contribution and
- exercise all ownership rights associated with it; and
-
- * you agree that neither of us has any duty to consult with, obtain the
- consent of, pay or render an accounting to the other for any use or
- distribution of your contribution.
-
-3. With respect to any patents you own, or that you can license without payment
-to any third party, you hereby grant to us a perpetual, irrevocable,
-non-exclusive, worldwide, no-charge, royalty-free license to:
-
- * make, have made, use, sell, offer to sell, import, and otherwise transfer
- your contribution in whole or in part, alone or in combination with or
- included in any product, work or materials arising out of the project to
- which your contribution was submitted, and
-
- * at our option, to sublicense these same rights to third parties through
- multiple levels of sublicensees or other licensing arrangements.
-
-4. Except as set out above, you keep all right, title, and interest in your
-contribution. The rights that you grant to us under these terms are effective
-on the date you first submitted a contribution to us, even if your submission
-took place before the date you sign these terms.
-
-5. You covenant, represent, warrant and agree that:
-
- * Each contribution that you submit is and shall be an original work of
- authorship and you can legally grant the rights set out in this SCA;
-
- * to the best of your knowledge, each contribution will not violate any
- third party's copyrights, trademarks, patents, or other intellectual
- property rights; and
-
- * each contribution shall be in compliance with U.S. export control laws and
- other applicable export and import laws. You agree to notify us if you
- become aware of any circumstance which would make any of the foregoing
- representations inaccurate in any respect. We may publicly disclose your
- participation in the project, including the fact that you have signed the SCA.
-
-6. This SCA is governed by the laws of the State of California and applicable
-U.S. Federal law. Any choice of law rules will not apply.
-
-7. Please place an “x” on one of the applicable statement below. Please do NOT
-mark both statements:
-
- * [x] I am signing on behalf of myself as an individual and no other person
- or entity, including my employer, has or will have rights with respect to my
- contributions.
-
- * [ ] I am signing on behalf of my employer or a legal entity and I have the
- actual authority to contractually bind that entity.
-
-## Contributor Details
-
-| Field | Entry |
-|------------------------------- | -------------------- |
-| Name | Em Zhan |
-| Company name (if applicable) | |
-| Title or role (if applicable) | |
-| Date | 2019-09-25 |
-| GitHub username | zqianem |
-| Website (optional) | |
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 3c2b56cd3..8b02b7055 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -73,8 +73,9 @@ issue body. A few more tips:
### Issue labels
-[See this page](https://github.com/explosion/spaCy/labels) for an overview of
-the system we use to tag our issues and pull requests.
+To distinguish issues that are opened by us, the maintainers, we usually add a
+💫 to the title. [See this page](https://github.com/explosion/spaCy/labels)
+for an overview of the system we use to tag our issues and pull requests.
## Contributing to the code base
diff --git a/Makefile b/Makefile
index 0f5c31ca6..2834096b7 100644
--- a/Makefile
+++ b/Makefile
@@ -1,17 +1,7 @@
SHELL := /bin/bash
sha = $(shell "git" "rev-parse" "--short" "HEAD")
-version = $(shell "bin/get-version.sh")
-wheel = spacy-$(version)-cp36-cp36m-linux_x86_64.whl
-dist/spacy.pex : dist/spacy-$(sha).pex
- cp dist/spacy-$(sha).pex dist/spacy.pex
- chmod a+rx dist/spacy.pex
-
-dist/spacy-$(sha).pex : dist/$(wheel)
- env3.6/bin/python -m pip install pex==1.5.3
- env3.6/bin/pex pytest dist/$(wheel) -e spacy -o dist/spacy-$(sha).pex
-
-dist/$(wheel) : setup.py spacy/*.py* spacy/*/*.py*
+dist/spacy.pex : spacy/*.py* spacy/*/*.py*
python3.6 -m venv env3.6
source env3.6/bin/activate
env3.6/bin/pip install wheel
@@ -19,6 +9,10 @@ dist/$(wheel) : setup.py spacy/*.py* spacy/*/*.py*
env3.6/bin/python setup.py build_ext --inplace
env3.6/bin/python setup.py sdist
env3.6/bin/python setup.py bdist_wheel
+ env3.6/bin/python -m pip install pex==1.5.3
+ env3.6/bin/pex pytest dist/*.whl -e spacy -o dist/spacy-$(sha).pex
+ cp dist/spacy-$(sha).pex dist/spacy.pex
+ chmod a+rx dist/spacy.pex
.PHONY : clean
diff --git a/README.md b/README.md
index 6bdbc7e46..27a49f465 100644
--- a/README.md
+++ b/README.md
@@ -49,12 +49,9 @@ It's commercial open-source software, released under the MIT license.
## 💬 Where to ask questions
The spaCy project is maintained by [@honnibal](https://github.com/honnibal)
-and [@ines](https://github.com/ines), along with core contributors
-[@svlandeg](https://github.com/svlandeg) and
-[@adrianeboyd](https://github.com/adrianeboyd). Please understand that we won't
-be able to provide individual support via email. We also believe that help is
-much more valuable if it's shared publicly, so that more people can benefit
-from it.
+and [@ines](https://github.com/ines). Please understand that we won't be able
+to provide individual support via email. We also believe that help is much more
+valuable if it's shared publicly, so that more people can benefit from it.
| Type | Platforms |
| ------------------------ | ------------------------------------------------------ |
@@ -175,8 +172,8 @@ python -m spacy download en_core_web_sm
python -m spacy download en
# pip install .tar.gz archive from path or URL
-pip install /Users/you/en_core_web_sm-2.2.0.tar.gz
-pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz
+pip install /Users/you/en_core_web_sm-2.1.0.tar.gz
+pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.1.0/en_core_web_sm-2.1.0.tar.gz
```
### Loading and using models
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index c23995de6..c5fa563be 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -79,24 +79,14 @@ jobs:
# Downgrading pip is necessary to prevent a wheel version incompatiblity.
# Might be fixed in the future or some other way, so investigate again.
- script: |
- python -m pip install -U pip==18.1 setuptools
+ python -m pip install --upgrade pip==18.1
pip install -r requirements.txt
displayName: 'Install dependencies'
- script: |
python setup.py build_ext --inplace
- python setup.py sdist --formats=gztar
- displayName: 'Compile and build sdist'
+ pip install -e .
+ displayName: 'Build and install'
- - task: DeleteFiles@1
- inputs:
- contents: 'spacy'
- displayName: 'Delete source directory'
-
- - bash: |
- SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
- pip install dist/$SDIST
- displayName: 'Install from sdist'
-
- - script: python -m pytest --pyargs spacy
+ - script: python -m pytest --tb=native spacy
displayName: 'Run tests'
diff --git a/bin/get-version.sh b/bin/get-version.sh
deleted file mode 100755
index 5a12ddd7a..000000000
--- a/bin/get-version.sh
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/usr/bin/env bash
-
-set -e
-
-version=$(grep "__version__ = " spacy/about.py)
-version=${version/__version__ = }
-version=${version/\'/}
-version=${version/\'/}
-version=${version/\"/}
-version=${version/\"/}
-
-echo $version
diff --git a/bin/ud/run_eval.py b/bin/ud/run_eval.py
index 2da476721..171687980 100644
--- a/bin/ud/run_eval.py
+++ b/bin/ud/run_eval.py
@@ -7,16 +7,14 @@ import datetime
from pathlib import Path
import xml.etree.ElementTree as ET
-import conll17_ud_eval
-from ud_train import write_conllu
+from spacy.cli.ud import conll17_ud_eval
+from spacy.cli.ud.ud_train import write_conllu
from spacy.lang.lex_attrs import word_shape
from spacy.util import get_lang_class
# All languages in spaCy - in UD format (note that Norwegian is 'no' instead of 'nb')
-ALL_LANGUAGES = ("af, ar, bg, bn, ca, cs, da, de, el, en, es, et, fa, fi, fr,"
- "ga, he, hi, hr, hu, id, is, it, ja, kn, ko, lt, lv, mr, no,"
- "nl, pl, pt, ro, ru, si, sk, sl, sq, sr, sv, ta, te, th, tl,"
- "tr, tt, uk, ur, vi, zh")
+ALL_LANGUAGES = "ar, ca, da, de, el, en, es, fa, fi, fr, ga, he, hi, hr, hu, id, " \
+ "it, ja, no, nl, pl, pt, ro, ru, sv, tr, ur, vi, zh"
# Non-parsing tasks that will be evaluated (works for default models)
EVAL_NO_PARSE = ['Tokens', 'Words', 'Lemmas', 'Sentences', 'Feats']
@@ -75,10 +73,10 @@ def _contains_blinded_text(stats_xml):
tree = ET.parse(stats_xml)
root = tree.getroot()
total_tokens = int(root.find('size/total/tokens').text)
- unique_forms = int(root.find('forms').get('unique'))
+ unique_lemmas = int(root.find('lemmas').get('unique'))
# assume the corpus is largely blinded when there are less than 1% unique tokens
- return (unique_forms / total_tokens) < 0.01
+ return (unique_lemmas / total_tokens) < 0.01
def fetch_all_treebanks(ud_dir, languages, corpus, best_per_language):
@@ -264,26 +262,22 @@ def main(out_path, ud_dir, check_parse=False, langs=ALL_LANGUAGES, exclude_train
if not exclude_trained_models:
if 'de' in models:
models['de'].append(load_model('de_core_news_sm'))
- models['de'].append(load_model('de_core_news_md'))
- if 'el' in models:
- models['el'].append(load_model('el_core_news_sm'))
- models['el'].append(load_model('el_core_news_md'))
- if 'en' in models:
- models['en'].append(load_model('en_core_web_sm'))
- models['en'].append(load_model('en_core_web_md'))
- models['en'].append(load_model('en_core_web_lg'))
if 'es' in models:
models['es'].append(load_model('es_core_news_sm'))
models['es'].append(load_model('es_core_news_md'))
- if 'fr' in models:
- models['fr'].append(load_model('fr_core_news_sm'))
- models['fr'].append(load_model('fr_core_news_md'))
+ if 'pt' in models:
+ models['pt'].append(load_model('pt_core_news_sm'))
if 'it' in models:
models['it'].append(load_model('it_core_news_sm'))
if 'nl' in models:
models['nl'].append(load_model('nl_core_news_sm'))
- if 'pt' in models:
- models['pt'].append(load_model('pt_core_news_sm'))
+ if 'en' in models:
+ models['en'].append(load_model('en_core_web_sm'))
+ models['en'].append(load_model('en_core_web_md'))
+ models['en'].append(load_model('en_core_web_lg'))
+ if 'fr' in models:
+ models['fr'].append(load_model('fr_core_news_sm'))
+ models['fr'].append(load_model('fr_core_news_md'))
with out_path.open(mode='w', encoding='utf-8') as out_file:
run_all_evals(models, treebanks, out_file, check_parse, print_freq_tasks)
diff --git a/bin/ud/ud_run_test.py b/bin/ud/ud_run_test.py
index de01cf350..1c529c831 100644
--- a/bin/ud/ud_run_test.py
+++ b/bin/ud/ud_run_test.py
@@ -109,13 +109,15 @@ def write_conllu(docs, file_):
merger = Matcher(docs[0].vocab)
merger.add("SUBTOK", None, [{"DEP": "subtok", "op": "+"}])
for i, doc in enumerate(docs):
- matches = []
- if doc.is_parsed:
- matches = merger(doc)
+ matches = merger(doc)
spans = [doc[start : end + 1] for _, start, end in matches]
with doc.retokenize() as retokenizer:
for span in spans:
retokenizer.merge(span)
+ # TODO: This shouldn't be necessary? Should be handled in merge
+ for word in doc:
+ if word.i == word.head.i:
+ word.dep_ = "ROOT"
file_.write("# newdoc id = {i}\n".format(i=i))
for j, sent in enumerate(doc.sents):
file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j))
diff --git a/bin/ud/ud_train.py b/bin/ud/ud_train.py
index c1a1501d9..8f699db4f 100644
--- a/bin/ud/ud_train.py
+++ b/bin/ud/ud_train.py
@@ -25,7 +25,7 @@ import itertools
import random
import numpy.random
-import conll17_ud_eval
+from . import conll17_ud_eval
from spacy import lang
from spacy.lang import zh
@@ -82,8 +82,6 @@ def read_data(
head = int(head) - 1 if head != "0" else id_
sent["words"].append(word)
sent["tags"].append(tag)
- sent["morphology"].append(_parse_morph_string(morph))
- sent["morphology"][-1].add("POS_%s" % pos)
sent["heads"].append(head)
sent["deps"].append("ROOT" if dep == "root" else dep)
sent["spaces"].append(space_after == "_")
@@ -92,12 +90,10 @@ def read_data(
if oracle_segments:
docs.append(Doc(nlp.vocab, words=sent["words"], spaces=sent["spaces"]))
golds.append(GoldParse(docs[-1], **sent))
- assert golds[-1].morphology is not None
sent_annots.append(sent)
if raw_text and max_doc_length and len(sent_annots) >= max_doc_length:
doc, gold = _make_gold(nlp, None, sent_annots)
- assert gold.morphology is not None
sent_annots = []
docs.append(doc)
golds.append(gold)
@@ -112,17 +108,6 @@ def read_data(
return docs, golds
return docs, golds
-def _parse_morph_string(morph_string):
- if morph_string == '_':
- return set()
- output = []
- replacements = {'1': 'one', '2': 'two', '3': 'three'}
- for feature in morph_string.split('|'):
- key, value = feature.split('=')
- value = replacements.get(value, value)
- value = value.split(',')[0]
- output.append('%s_%s' % (key, value.lower()))
- return set(output)
def read_conllu(file_):
docs = []
@@ -156,8 +141,8 @@ def _make_gold(nlp, text, sent_annots, drop_deps=0.0):
flat = defaultdict(list)
sent_starts = []
for sent in sent_annots:
- flat["heads"].extend(len(flat["words"])+head for head in sent["heads"])
- for field in ["words", "tags", "deps", "morphology", "entities", "spaces"]:
+ flat["heads"].extend(len(flat["words"]) + head for head in sent["heads"])
+ for field in ["words", "tags", "deps", "entities", "spaces"]:
flat[field].extend(sent[field])
sent_starts.append(True)
sent_starts.extend([False] * (len(sent["words"]) - 1))
@@ -229,18 +214,11 @@ def write_conllu(docs, file_):
merger = Matcher(docs[0].vocab)
merger.add("SUBTOK", None, [{"DEP": "subtok", "op": "+"}])
for i, doc in enumerate(docs):
- matches = []
- if doc.is_parsed:
- matches = merger(doc)
+ matches = merger(doc)
spans = [doc[start : end + 1] for _, start, end in matches]
- seen_tokens = set()
with doc.retokenize() as retokenizer:
for span in spans:
- span_tokens = set(range(span.start, span.end))
- if not span_tokens.intersection(seen_tokens):
- retokenizer.merge(span)
- seen_tokens.update(span_tokens)
-
+ retokenizer.merge(span)
file_.write("# newdoc id = {i}\n".format(i=i))
for j, sent in enumerate(doc.sents):
file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j))
@@ -263,29 +241,27 @@ def write_conllu(docs, file_):
def print_progress(itn, losses, ud_scores):
fields = {
"dep_loss": losses.get("parser", 0.0),
- "morph_loss": losses.get("morphologizer", 0.0),
"tag_loss": losses.get("tagger", 0.0),
"words": ud_scores["Words"].f1 * 100,
"sents": ud_scores["Sentences"].f1 * 100,
"tags": ud_scores["XPOS"].f1 * 100,
"uas": ud_scores["UAS"].f1 * 100,
"las": ud_scores["LAS"].f1 * 100,
- "morph": ud_scores["Feats"].f1 * 100,
}
- header = ["Epoch", "P.Loss", "M.Loss", "LAS", "UAS", "TAG", "MORPH", "SENT", "WORD"]
+ header = ["Epoch", "Loss", "LAS", "UAS", "TAG", "SENT", "WORD"]
if itn == 0:
print("\t".join(header))
- tpl = "\t".join((
- "{:d}",
- "{dep_loss:.1f}",
- "{morph_loss:.1f}",
- "{las:.1f}",
- "{uas:.1f}",
- "{tags:.1f}",
- "{morph:.1f}",
- "{sents:.1f}",
- "{words:.1f}",
- ))
+ tpl = "\t".join(
+ (
+ "{:d}",
+ "{dep_loss:.1f}",
+ "{las:.1f}",
+ "{uas:.1f}",
+ "{tags:.1f}",
+ "{sents:.1f}",
+ "{words:.1f}",
+ )
+ )
print(tpl.format(itn, **fields))
@@ -306,27 +282,25 @@ def get_token_conllu(token, i):
head = 0
else:
head = i + (token.head.i - token.i) + 1
- features = list(token.morph)
- feat_str = []
- replacements = {"one": "1", "two": "2", "three": "3"}
- for feat in features:
- if not feat.startswith("begin") and not feat.startswith("end"):
- key, value = feat.split("_", 1)
- value = replacements.get(value, value)
- feat_str.append("%s=%s" % (key, value.title()))
- if not feat_str:
- feat_str = "_"
- else:
- feat_str = "|".join(feat_str)
- fields = [str(i+1), token.text, token.lemma_, token.pos_, token.tag_, feat_str,
- str(head), token.dep_.lower(), "_", "_"]
+ fields = [
+ str(i + 1),
+ token.text,
+ token.lemma_,
+ token.pos_,
+ token.tag_,
+ "_",
+ str(head),
+ token.dep_.lower(),
+ "_",
+ "_",
+ ]
lines.append("\t".join(fields))
return "\n".join(lines)
-Token.set_extension("get_conllu_lines", method=get_token_conllu, force=True)
-Token.set_extension("begins_fused", default=False, force=True)
-Token.set_extension("inside_fused", default=False, force=True)
+Token.set_extension("get_conllu_lines", method=get_token_conllu)
+Token.set_extension("begins_fused", default=False)
+Token.set_extension("inside_fused", default=False)
##################
@@ -350,8 +324,7 @@ def load_nlp(corpus, config, vectors=None):
def initialize_pipeline(nlp, docs, golds, config, device):
- nlp.add_pipe(nlp.create_pipe("tagger", config={"set_morphology": False}))
- nlp.add_pipe(nlp.create_pipe("morphologizer"))
+ nlp.add_pipe(nlp.create_pipe("tagger"))
nlp.add_pipe(nlp.create_pipe("parser"))
if config.multitask_tag:
nlp.parser.add_multitask_objective("tag")
@@ -551,12 +524,14 @@ def main(
out_path = parses_dir / corpus / "epoch-{i}.conllu".format(i=i)
with nlp.use_params(optimizer.averages):
if use_oracle_segments:
- parsed_docs, scores = evaluate(nlp, paths.dev.conllu,
- paths.dev.conllu, out_path)
+ parsed_docs, scores = evaluate(
+ nlp, paths.dev.conllu, paths.dev.conllu, out_path
+ )
else:
- parsed_docs, scores = evaluate(nlp, paths.dev.text,
- paths.dev.conllu, out_path)
- print_progress(i, losses, scores)
+ parsed_docs, scores = evaluate(
+ nlp, paths.dev.text, paths.dev.conllu, out_path
+ )
+ print_progress(i, losses, scores)
def _render_parses(i, to_render):
diff --git a/examples/pipeline/dummy_entity_linking.py b/examples/pipeline/dummy_entity_linking.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/examples/training/pretrain_kb.py b/examples/training/pretrain_kb.py
index 2c494d5c4..d5281ad42 100644
--- a/examples/training/pretrain_kb.py
+++ b/examples/training/pretrain_kb.py
@@ -8,8 +8,8 @@ For more details, see the documentation:
* Knowledge base: https://spacy.io/api/kb
* Entity Linking: https://spacy.io/usage/linguistic-features#entity-linking
-Compatible with: spaCy v2.2
-Last tested with: v2.2
+Compatible with: spaCy vX.X
+Last tested with: vX.X
"""
from __future__ import unicode_literals, print_function
@@ -73,6 +73,7 @@ def main(vocab_path=None, model=None, output_dir=None, n_iter=50):
input_dim=INPUT_DIM,
desc_width=DESC_WIDTH,
epochs=n_iter,
+ threshold=0.001,
)
encoder.train(description_list=descriptions, to_print=True)
diff --git a/examples/training/textcat_example_data/CC0.txt b/examples/training/textcat_example_data/CC0.txt
deleted file mode 100644
index 0e259d42c..000000000
--- a/examples/training/textcat_example_data/CC0.txt
+++ /dev/null
@@ -1,121 +0,0 @@
-Creative Commons Legal Code
-
-CC0 1.0 Universal
-
- CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE
- LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN
- ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS
- INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES
- REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS
- PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM
- THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED
- HEREUNDER.
-
-Statement of Purpose
-
-The laws of most jurisdictions throughout the world automatically confer
-exclusive Copyright and Related Rights (defined below) upon the creator
-and subsequent owner(s) (each and all, an "owner") of an original work of
-authorship and/or a database (each, a "Work").
-
-Certain owners wish to permanently relinquish those rights to a Work for
-the purpose of contributing to a commons of creative, cultural and
-scientific works ("Commons") that the public can reliably and without fear
-of later claims of infringement build upon, modify, incorporate in other
-works, reuse and redistribute as freely as possible in any form whatsoever
-and for any purposes, including without limitation commercial purposes.
-These owners may contribute to the Commons to promote the ideal of a free
-culture and the further production of creative, cultural and scientific
-works, or to gain reputation or greater distribution for their Work in
-part through the use and efforts of others.
-
-For these and/or other purposes and motivations, and without any
-expectation of additional consideration or compensation, the person
-associating CC0 with a Work (the "Affirmer"), to the extent that he or she
-is an owner of Copyright and Related Rights in the Work, voluntarily
-elects to apply CC0 to the Work and publicly distribute the Work under its
-terms, with knowledge of his or her Copyright and Related Rights in the
-Work and the meaning and intended legal effect of CC0 on those rights.
-
-1. Copyright and Related Rights. A Work made available under CC0 may be
-protected by copyright and related or neighboring rights ("Copyright and
-Related Rights"). Copyright and Related Rights include, but are not
-limited to, the following:
-
- i. the right to reproduce, adapt, distribute, perform, display,
- communicate, and translate a Work;
- ii. moral rights retained by the original author(s) and/or performer(s);
-iii. publicity and privacy rights pertaining to a person's image or
- likeness depicted in a Work;
- iv. rights protecting against unfair competition in regards to a Work,
- subject to the limitations in paragraph 4(a), below;
- v. rights protecting the extraction, dissemination, use and reuse of data
- in a Work;
- vi. database rights (such as those arising under Directive 96/9/EC of the
- European Parliament and of the Council of 11 March 1996 on the legal
- protection of databases, and under any national implementation
- thereof, including any amended or successor version of such
- directive); and
-vii. other similar, equivalent or corresponding rights throughout the
- world based on applicable law or treaty, and any national
- implementations thereof.
-
-2. Waiver. To the greatest extent permitted by, but not in contravention
-of, applicable law, Affirmer hereby overtly, fully, permanently,
-irrevocably and unconditionally waives, abandons, and surrenders all of
-Affirmer's Copyright and Related Rights and associated claims and causes
-of action, whether now known or unknown (including existing as well as
-future claims and causes of action), in the Work (i) in all territories
-worldwide, (ii) for the maximum duration provided by applicable law or
-treaty (including future time extensions), (iii) in any current or future
-medium and for any number of copies, and (iv) for any purpose whatsoever,
-including without limitation commercial, advertising or promotional
-purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each
-member of the public at large and to the detriment of Affirmer's heirs and
-successors, fully intending that such Waiver shall not be subject to
-revocation, rescission, cancellation, termination, or any other legal or
-equitable action to disrupt the quiet enjoyment of the Work by the public
-as contemplated by Affirmer's express Statement of Purpose.
-
-3. Public License Fallback. Should any part of the Waiver for any reason
-be judged legally invalid or ineffective under applicable law, then the
-Waiver shall be preserved to the maximum extent permitted taking into
-account Affirmer's express Statement of Purpose. In addition, to the
-extent the Waiver is so judged Affirmer hereby grants to each affected
-person a royalty-free, non transferable, non sublicensable, non exclusive,
-irrevocable and unconditional license to exercise Affirmer's Copyright and
-Related Rights in the Work (i) in all territories worldwide, (ii) for the
-maximum duration provided by applicable law or treaty (including future
-time extensions), (iii) in any current or future medium and for any number
-of copies, and (iv) for any purpose whatsoever, including without
-limitation commercial, advertising or promotional purposes (the
-"License"). The License shall be deemed effective as of the date CC0 was
-applied by Affirmer to the Work. Should any part of the License for any
-reason be judged legally invalid or ineffective under applicable law, such
-partial invalidity or ineffectiveness shall not invalidate the remainder
-of the License, and in such case Affirmer hereby affirms that he or she
-will not (i) exercise any of his or her remaining Copyright and Related
-Rights in the Work or (ii) assert any associated claims and causes of
-action with respect to the Work, in either case contrary to Affirmer's
-express Statement of Purpose.
-
-4. Limitations and Disclaimers.
-
- a. No trademark or patent rights held by Affirmer are waived, abandoned,
- surrendered, licensed or otherwise affected by this document.
- b. Affirmer offers the Work as-is and makes no representations or
- warranties of any kind concerning the Work, express, implied,
- statutory or otherwise, including without limitation warranties of
- title, merchantability, fitness for a particular purpose, non
- infringement, or the absence of latent or other defects, accuracy, or
- the present or absence of errors, whether or not discoverable, all to
- the greatest extent permissible under applicable law.
- c. Affirmer disclaims responsibility for clearing rights of other persons
- that may apply to the Work or any use thereof, including without
- limitation any person's Copyright and Related Rights in the Work.
- Further, Affirmer disclaims responsibility for obtaining any necessary
- consents, permissions or other rights required for any use of the
- Work.
- d. Affirmer understands and acknowledges that Creative Commons is not a
- party to this document and has no duty or obligation with respect to
- this CC0 or use of the Work.
diff --git a/examples/training/textcat_example_data/CC_BY-SA-3.0.txt b/examples/training/textcat_example_data/CC_BY-SA-3.0.txt
deleted file mode 100644
index 604209a80..000000000
--- a/examples/training/textcat_example_data/CC_BY-SA-3.0.txt
+++ /dev/null
@@ -1,359 +0,0 @@
-Creative Commons Legal Code
-
-Attribution-ShareAlike 3.0 Unported
-
- CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE
- LEGAL SERVICES. DISTRIBUTION OF THIS LICENSE DOES NOT CREATE AN
- ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS
- INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES
- REGARDING THE INFORMATION PROVIDED, AND DISCLAIMS LIABILITY FOR
- DAMAGES RESULTING FROM ITS USE.
-
-License
-
-THE WORK (AS DEFINED BELOW) IS PROVIDED UNDER THE TERMS OF THIS CREATIVE
-COMMONS PUBLIC LICENSE ("CCPL" OR "LICENSE"). THE WORK IS PROTECTED BY
-COPYRIGHT AND/OR OTHER APPLICABLE LAW. ANY USE OF THE WORK OTHER THAN AS
-AUTHORIZED UNDER THIS LICENSE OR COPYRIGHT LAW IS PROHIBITED.
-
-BY EXERCISING ANY RIGHTS TO THE WORK PROVIDED HERE, YOU ACCEPT AND AGREE
-TO BE BOUND BY THE TERMS OF THIS LICENSE. TO THE EXTENT THIS LICENSE MAY
-BE CONSIDERED TO BE A CONTRACT, THE LICENSOR GRANTS YOU THE RIGHTS
-CONTAINED HERE IN CONSIDERATION OF YOUR ACCEPTANCE OF SUCH TERMS AND
-CONDITIONS.
-
-1. Definitions
-
- a. "Adaptation" means a work based upon the Work, or upon the Work and
- other pre-existing works, such as a translation, adaptation,
- derivative work, arrangement of music or other alterations of a
- literary or artistic work, or phonogram or performance and includes
- cinematographic adaptations or any other form in which the Work may be
- recast, transformed, or adapted including in any form recognizably
- derived from the original, except that a work that constitutes a
- Collection will not be considered an Adaptation for the purpose of
- this License. For the avoidance of doubt, where the Work is a musical
- work, performance or phonogram, the synchronization of the Work in
- timed-relation with a moving image ("synching") will be considered an
- Adaptation for the purpose of this License.
- b. "Collection" means a collection of literary or artistic works, such as
- encyclopedias and anthologies, or performances, phonograms or
- broadcasts, or other works or subject matter other than works listed
- in Section 1(f) below, which, by reason of the selection and
- arrangement of their contents, constitute intellectual creations, in
- which the Work is included in its entirety in unmodified form along
- with one or more other contributions, each constituting separate and
- independent works in themselves, which together are assembled into a
- collective whole. A work that constitutes a Collection will not be
- considered an Adaptation (as defined below) for the purposes of this
- License.
- c. "Creative Commons Compatible License" means a license that is listed
- at https://creativecommons.org/compatiblelicenses that has been
- approved by Creative Commons as being essentially equivalent to this
- License, including, at a minimum, because that license: (i) contains
- terms that have the same purpose, meaning and effect as the License
- Elements of this License; and, (ii) explicitly permits the relicensing
- of adaptations of works made available under that license under this
- License or a Creative Commons jurisdiction license with the same
- License Elements as this License.
- d. "Distribute" means to make available to the public the original and
- copies of the Work or Adaptation, as appropriate, through sale or
- other transfer of ownership.
- e. "License Elements" means the following high-level license attributes
- as selected by Licensor and indicated in the title of this License:
- Attribution, ShareAlike.
- f. "Licensor" means the individual, individuals, entity or entities that
- offer(s) the Work under the terms of this License.
- g. "Original Author" means, in the case of a literary or artistic work,
- the individual, individuals, entity or entities who created the Work
- or if no individual or entity can be identified, the publisher; and in
- addition (i) in the case of a performance the actors, singers,
- musicians, dancers, and other persons who act, sing, deliver, declaim,
- play in, interpret or otherwise perform literary or artistic works or
- expressions of folklore; (ii) in the case of a phonogram the producer
- being the person or legal entity who first fixes the sounds of a
- performance or other sounds; and, (iii) in the case of broadcasts, the
- organization that transmits the broadcast.
- h. "Work" means the literary and/or artistic work offered under the terms
- of this License including without limitation any production in the
- literary, scientific and artistic domain, whatever may be the mode or
- form of its expression including digital form, such as a book,
- pamphlet and other writing; a lecture, address, sermon or other work
- of the same nature; a dramatic or dramatico-musical work; a
- choreographic work or entertainment in dumb show; a musical
- composition with or without words; a cinematographic work to which are
- assimilated works expressed by a process analogous to cinematography;
- a work of drawing, painting, architecture, sculpture, engraving or
- lithography; a photographic work to which are assimilated works
- expressed by a process analogous to photography; a work of applied
- art; an illustration, map, plan, sketch or three-dimensional work
- relative to geography, topography, architecture or science; a
- performance; a broadcast; a phonogram; a compilation of data to the
- extent it is protected as a copyrightable work; or a work performed by
- a variety or circus performer to the extent it is not otherwise
- considered a literary or artistic work.
- i. "You" means an individual or entity exercising rights under this
- License who has not previously violated the terms of this License with
- respect to the Work, or who has received express permission from the
- Licensor to exercise rights under this License despite a previous
- violation.
- j. "Publicly Perform" means to perform public recitations of the Work and
- to communicate to the public those public recitations, by any means or
- process, including by wire or wireless means or public digital
- performances; to make available to the public Works in such a way that
- members of the public may access these Works from a place and at a
- place individually chosen by them; to perform the Work to the public
- by any means or process and the communication to the public of the
- performances of the Work, including by public digital performance; to
- broadcast and rebroadcast the Work by any means including signs,
- sounds or images.
- k. "Reproduce" means to make copies of the Work by any means including
- without limitation by sound or visual recordings and the right of
- fixation and reproducing fixations of the Work, including storage of a
- protected performance or phonogram in digital form or other electronic
- medium.
-
-2. Fair Dealing Rights. Nothing in this License is intended to reduce,
-limit, or restrict any uses free from copyright or rights arising from
-limitations or exceptions that are provided for in connection with the
-copyright protection under copyright law or other applicable laws.
-
-3. License Grant. Subject to the terms and conditions of this License,
-Licensor hereby grants You a worldwide, royalty-free, non-exclusive,
-perpetual (for the duration of the applicable copyright) license to
-exercise the rights in the Work as stated below:
-
- a. to Reproduce the Work, to incorporate the Work into one or more
- Collections, and to Reproduce the Work as incorporated in the
- Collections;
- b. to create and Reproduce Adaptations provided that any such Adaptation,
- including any translation in any medium, takes reasonable steps to
- clearly label, demarcate or otherwise identify that changes were made
- to the original Work. For example, a translation could be marked "The
- original work was translated from English to Spanish," or a
- modification could indicate "The original work has been modified.";
- c. to Distribute and Publicly Perform the Work including as incorporated
- in Collections; and,
- d. to Distribute and Publicly Perform Adaptations.
- e. For the avoidance of doubt:
-
- i. Non-waivable Compulsory License Schemes. In those jurisdictions in
- which the right to collect royalties through any statutory or
- compulsory licensing scheme cannot be waived, the Licensor
- reserves the exclusive right to collect such royalties for any
- exercise by You of the rights granted under this License;
- ii. Waivable Compulsory License Schemes. In those jurisdictions in
- which the right to collect royalties through any statutory or
- compulsory licensing scheme can be waived, the Licensor waives the
- exclusive right to collect such royalties for any exercise by You
- of the rights granted under this License; and,
- iii. Voluntary License Schemes. The Licensor waives the right to
- collect royalties, whether individually or, in the event that the
- Licensor is a member of a collecting society that administers
- voluntary licensing schemes, via that society, from any exercise
- by You of the rights granted under this License.
-
-The above rights may be exercised in all media and formats whether now
-known or hereafter devised. The above rights include the right to make
-such modifications as are technically necessary to exercise the rights in
-other media and formats. Subject to Section 8(f), all rights not expressly
-granted by Licensor are hereby reserved.
-
-4. Restrictions. The license granted in Section 3 above is expressly made
-subject to and limited by the following restrictions:
-
- a. You may Distribute or Publicly Perform the Work only under the terms
- of this License. You must include a copy of, or the Uniform Resource
- Identifier (URI) for, this License with every copy of the Work You
- Distribute or Publicly Perform. You may not offer or impose any terms
- on the Work that restrict the terms of this License or the ability of
- the recipient of the Work to exercise the rights granted to that
- recipient under the terms of the License. You may not sublicense the
- Work. You must keep intact all notices that refer to this License and
- to the disclaimer of warranties with every copy of the Work You
- Distribute or Publicly Perform. When You Distribute or Publicly
- Perform the Work, You may not impose any effective technological
- measures on the Work that restrict the ability of a recipient of the
- Work from You to exercise the rights granted to that recipient under
- the terms of the License. This Section 4(a) applies to the Work as
- incorporated in a Collection, but this does not require the Collection
- apart from the Work itself to be made subject to the terms of this
- License. If You create a Collection, upon notice from any Licensor You
- must, to the extent practicable, remove from the Collection any credit
- as required by Section 4(c), as requested. If You create an
- Adaptation, upon notice from any Licensor You must, to the extent
- practicable, remove from the Adaptation any credit as required by
- Section 4(c), as requested.
- b. You may Distribute or Publicly Perform an Adaptation only under the
- terms of: (i) this License; (ii) a later version of this License with
- the same License Elements as this License; (iii) a Creative Commons
- jurisdiction license (either this or a later license version) that
- contains the same License Elements as this License (e.g.,
- Attribution-ShareAlike 3.0 US)); (iv) a Creative Commons Compatible
- License. If you license the Adaptation under one of the licenses
- mentioned in (iv), you must comply with the terms of that license. If
- you license the Adaptation under the terms of any of the licenses
- mentioned in (i), (ii) or (iii) (the "Applicable License"), you must
- comply with the terms of the Applicable License generally and the
- following provisions: (I) You must include a copy of, or the URI for,
- the Applicable License with every copy of each Adaptation You
- Distribute or Publicly Perform; (II) You may not offer or impose any
- terms on the Adaptation that restrict the terms of the Applicable
- License or the ability of the recipient of the Adaptation to exercise
- the rights granted to that recipient under the terms of the Applicable
- License; (III) You must keep intact all notices that refer to the
- Applicable License and to the disclaimer of warranties with every copy
- of the Work as included in the Adaptation You Distribute or Publicly
- Perform; (IV) when You Distribute or Publicly Perform the Adaptation,
- You may not impose any effective technological measures on the
- Adaptation that restrict the ability of a recipient of the Adaptation
- from You to exercise the rights granted to that recipient under the
- terms of the Applicable License. This Section 4(b) applies to the
- Adaptation as incorporated in a Collection, but this does not require
- the Collection apart from the Adaptation itself to be made subject to
- the terms of the Applicable License.
- c. If You Distribute, or Publicly Perform the Work or any Adaptations or
- Collections, You must, unless a request has been made pursuant to
- Section 4(a), keep intact all copyright notices for the Work and
- provide, reasonable to the medium or means You are utilizing: (i) the
- name of the Original Author (or pseudonym, if applicable) if supplied,
- and/or if the Original Author and/or Licensor designate another party
- or parties (e.g., a sponsor institute, publishing entity, journal) for
- attribution ("Attribution Parties") in Licensor's copyright notice,
- terms of service or by other reasonable means, the name of such party
- or parties; (ii) the title of the Work if supplied; (iii) to the
- extent reasonably practicable, the URI, if any, that Licensor
- specifies to be associated with the Work, unless such URI does not
- refer to the copyright notice or licensing information for the Work;
- and (iv) , consistent with Ssection 3(b), in the case of an
- Adaptation, a credit identifying the use of the Work in the Adaptation
- (e.g., "French translation of the Work by Original Author," or
- "Screenplay based on original Work by Original Author"). The credit
- required by this Section 4(c) may be implemented in any reasonable
- manner; provided, however, that in the case of a Adaptation or
- Collection, at a minimum such credit will appear, if a credit for all
- contributing authors of the Adaptation or Collection appears, then as
- part of these credits and in a manner at least as prominent as the
- credits for the other contributing authors. For the avoidance of
- doubt, You may only use the credit required by this Section for the
- purpose of attribution in the manner set out above and, by exercising
- Your rights under this License, You may not implicitly or explicitly
- assert or imply any connection with, sponsorship or endorsement by the
- Original Author, Licensor and/or Attribution Parties, as appropriate,
- of You or Your use of the Work, without the separate, express prior
- written permission of the Original Author, Licensor and/or Attribution
- Parties.
- d. Except as otherwise agreed in writing by the Licensor or as may be
- otherwise permitted by applicable law, if You Reproduce, Distribute or
- Publicly Perform the Work either by itself or as part of any
- Adaptations or Collections, You must not distort, mutilate, modify or
- take other derogatory action in relation to the Work which would be
- prejudicial to the Original Author's honor or reputation. Licensor
- agrees that in those jurisdictions (e.g. Japan), in which any exercise
- of the right granted in Section 3(b) of this License (the right to
- make Adaptations) would be deemed to be a distortion, mutilation,
- modification or other derogatory action prejudicial to the Original
- Author's honor and reputation, the Licensor will waive or not assert,
- as appropriate, this Section, to the fullest extent permitted by the
- applicable national law, to enable You to reasonably exercise Your
- right under Section 3(b) of this License (right to make Adaptations)
- but not otherwise.
-
-5. Representations, Warranties and Disclaimer
-
-UNLESS OTHERWISE MUTUALLY AGREED TO BY THE PARTIES IN WRITING, LICENSOR
-OFFERS THE WORK AS-IS AND MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY
-KIND CONCERNING THE WORK, EXPRESS, IMPLIED, STATUTORY OR OTHERWISE,
-INCLUDING, WITHOUT LIMITATION, WARRANTIES OF TITLE, MERCHANTIBILITY,
-FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF
-LATENT OR OTHER DEFECTS, ACCURACY, OR THE PRESENCE OF ABSENCE OF ERRORS,
-WHETHER OR NOT DISCOVERABLE. SOME JURISDICTIONS DO NOT ALLOW THE EXCLUSION
-OF IMPLIED WARRANTIES, SO SUCH EXCLUSION MAY NOT APPLY TO YOU.
-
-6. Limitation on Liability. EXCEPT TO THE EXTENT REQUIRED BY APPLICABLE
-LAW, IN NO EVENT WILL LICENSOR BE LIABLE TO YOU ON ANY LEGAL THEORY FOR
-ANY SPECIAL, INCIDENTAL, CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES
-ARISING OUT OF THIS LICENSE OR THE USE OF THE WORK, EVEN IF LICENSOR HAS
-BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
-
-7. Termination
-
- a. This License and the rights granted hereunder will terminate
- automatically upon any breach by You of the terms of this License.
- Individuals or entities who have received Adaptations or Collections
- from You under this License, however, will not have their licenses
- terminated provided such individuals or entities remain in full
- compliance with those licenses. Sections 1, 2, 5, 6, 7, and 8 will
- survive any termination of this License.
- b. Subject to the above terms and conditions, the license granted here is
- perpetual (for the duration of the applicable copyright in the Work).
- Notwithstanding the above, Licensor reserves the right to release the
- Work under different license terms or to stop distributing the Work at
- any time; provided, however that any such election will not serve to
- withdraw this License (or any other license that has been, or is
- required to be, granted under the terms of this License), and this
- License will continue in full force and effect unless terminated as
- stated above.
-
-8. Miscellaneous
-
- a. Each time You Distribute or Publicly Perform the Work or a Collection,
- the Licensor offers to the recipient a license to the Work on the same
- terms and conditions as the license granted to You under this License.
- b. Each time You Distribute or Publicly Perform an Adaptation, Licensor
- offers to the recipient a license to the original Work on the same
- terms and conditions as the license granted to You under this License.
- c. If any provision of this License is invalid or unenforceable under
- applicable law, it shall not affect the validity or enforceability of
- the remainder of the terms of this License, and without further action
- by the parties to this agreement, such provision shall be reformed to
- the minimum extent necessary to make such provision valid and
- enforceable.
- d. No term or provision of this License shall be deemed waived and no
- breach consented to unless such waiver or consent shall be in writing
- and signed by the party to be charged with such waiver or consent.
- e. This License constitutes the entire agreement between the parties with
- respect to the Work licensed here. There are no understandings,
- agreements or representations with respect to the Work not specified
- here. Licensor shall not be bound by any additional provisions that
- may appear in any communication from You. This License may not be
- modified without the mutual written agreement of the Licensor and You.
- f. The rights granted under, and the subject matter referenced, in this
- License were drafted utilizing the terminology of the Berne Convention
- for the Protection of Literary and Artistic Works (as amended on
- September 28, 1979), the Rome Convention of 1961, the WIPO Copyright
- Treaty of 1996, the WIPO Performances and Phonograms Treaty of 1996
- and the Universal Copyright Convention (as revised on July 24, 1971).
- These rights and subject matter take effect in the relevant
- jurisdiction in which the License terms are sought to be enforced
- according to the corresponding provisions of the implementation of
- those treaty provisions in the applicable national law. If the
- standard suite of rights granted under applicable copyright law
- includes additional rights not granted under this License, such
- additional rights are deemed to be included in the License; this
- License is not intended to restrict the license of any rights under
- applicable law.
-
-
-Creative Commons Notice
-
- Creative Commons is not a party to this License, and makes no warranty
- whatsoever in connection with the Work. Creative Commons will not be
- liable to You or any party on any legal theory for any damages
- whatsoever, including without limitation any general, special,
- incidental or consequential damages arising in connection to this
- license. Notwithstanding the foregoing two (2) sentences, if Creative
- Commons has expressly identified itself as the Licensor hereunder, it
- shall have all rights and obligations of Licensor.
-
- Except for the limited purpose of indicating to the public that the
- Work is licensed under the CCPL, Creative Commons does not authorize
- the use by either party of the trademark "Creative Commons" or any
- related trademark or logo of Creative Commons without the prior
- written consent of Creative Commons. Any permitted use will be in
- compliance with Creative Commons' then-current trademark usage
- guidelines, as may be published on its website or otherwise made
- available upon request from time to time. For the avoidance of doubt,
- this trademark restriction does not form part of the License.
-
- Creative Commons may be contacted at https://creativecommons.org/.
diff --git a/examples/training/textcat_example_data/CC_BY-SA-4.0.txt b/examples/training/textcat_example_data/CC_BY-SA-4.0.txt
deleted file mode 100644
index a73481c4b..000000000
--- a/examples/training/textcat_example_data/CC_BY-SA-4.0.txt
+++ /dev/null
@@ -1,428 +0,0 @@
-Attribution-ShareAlike 4.0 International
-
-=======================================================================
-
-Creative Commons Corporation ("Creative Commons") is not a law firm and
-does not provide legal services or legal advice. Distribution of
-Creative Commons public licenses does not create a lawyer-client or
-other relationship. Creative Commons makes its licenses and related
-information available on an "as-is" basis. Creative Commons gives no
-warranties regarding its licenses, any material licensed under their
-terms and conditions, or any related information. Creative Commons
-disclaims all liability for damages resulting from their use to the
-fullest extent possible.
-
-Using Creative Commons Public Licenses
-
-Creative Commons public licenses provide a standard set of terms and
-conditions that creators and other rights holders may use to share
-original works of authorship and other material subject to copyright
-and certain other rights specified in the public license below. The
-following considerations are for informational purposes only, are not
-exhaustive, and do not form part of our licenses.
-
- Considerations for licensors: Our public licenses are
- intended for use by those authorized to give the public
- permission to use material in ways otherwise restricted by
- copyright and certain other rights. Our licenses are
- irrevocable. Licensors should read and understand the terms
- and conditions of the license they choose before applying it.
- Licensors should also secure all rights necessary before
- applying our licenses so that the public can reuse the
- material as expected. Licensors should clearly mark any
- material not subject to the license. This includes other CC-
- licensed material, or material used under an exception or
- limitation to copyright. More considerations for licensors:
- wiki.creativecommons.org/Considerations_for_licensors
-
- Considerations for the public: By using one of our public
- licenses, a licensor grants the public permission to use the
- licensed material under specified terms and conditions. If
- the licensor's permission is not necessary for any reason--for
- example, because of any applicable exception or limitation to
- copyright--then that use is not regulated by the license. Our
- licenses grant only permissions under copyright and certain
- other rights that a licensor has authority to grant. Use of
- the licensed material may still be restricted for other
- reasons, including because others have copyright or other
- rights in the material. A licensor may make special requests,
- such as asking that all changes be marked or described.
- Although not required by our licenses, you are encouraged to
- respect those requests where reasonable. More considerations
- for the public:
- wiki.creativecommons.org/Considerations_for_licensees
-
-=======================================================================
-
-Creative Commons Attribution-ShareAlike 4.0 International Public
-License
-
-By exercising the Licensed Rights (defined below), You accept and agree
-to be bound by the terms and conditions of this Creative Commons
-Attribution-ShareAlike 4.0 International Public License ("Public
-License"). To the extent this Public License may be interpreted as a
-contract, You are granted the Licensed Rights in consideration of Your
-acceptance of these terms and conditions, and the Licensor grants You
-such rights in consideration of benefits the Licensor receives from
-making the Licensed Material available under these terms and
-conditions.
-
-
-Section 1 -- Definitions.
-
- a. Adapted Material means material subject to Copyright and Similar
- Rights that is derived from or based upon the Licensed Material
- and in which the Licensed Material is translated, altered,
- arranged, transformed, or otherwise modified in a manner requiring
- permission under the Copyright and Similar Rights held by the
- Licensor. For purposes of this Public License, where the Licensed
- Material is a musical work, performance, or sound recording,
- Adapted Material is always produced where the Licensed Material is
- synched in timed relation with a moving image.
-
- b. Adapter's License means the license You apply to Your Copyright
- and Similar Rights in Your contributions to Adapted Material in
- accordance with the terms and conditions of this Public License.
-
- c. BY-SA Compatible License means a license listed at
- creativecommons.org/compatiblelicenses, approved by Creative
- Commons as essentially the equivalent of this Public License.
-
- d. Copyright and Similar Rights means copyright and/or similar rights
- closely related to copyright including, without limitation,
- performance, broadcast, sound recording, and Sui Generis Database
- Rights, without regard to how the rights are labeled or
- categorized. For purposes of this Public License, the rights
- specified in Section 2(b)(1)-(2) are not Copyright and Similar
- Rights.
-
- e. Effective Technological Measures means those measures that, in the
- absence of proper authority, may not be circumvented under laws
- fulfilling obligations under Article 11 of the WIPO Copyright
- Treaty adopted on December 20, 1996, and/or similar international
- agreements.
-
- f. Exceptions and Limitations means fair use, fair dealing, and/or
- any other exception or limitation to Copyright and Similar Rights
- that applies to Your use of the Licensed Material.
-
- g. License Elements means the license attributes listed in the name
- of a Creative Commons Public License. The License Elements of this
- Public License are Attribution and ShareAlike.
-
- h. Licensed Material means the artistic or literary work, database,
- or other material to which the Licensor applied this Public
- License.
-
- i. Licensed Rights means the rights granted to You subject to the
- terms and conditions of this Public License, which are limited to
- all Copyright and Similar Rights that apply to Your use of the
- Licensed Material and that the Licensor has authority to license.
-
- j. Licensor means the individual(s) or entity(ies) granting rights
- under this Public License.
-
- k. Share means to provide material to the public by any means or
- process that requires permission under the Licensed Rights, such
- as reproduction, public display, public performance, distribution,
- dissemination, communication, or importation, and to make material
- available to the public including in ways that members of the
- public may access the material from a place and at a time
- individually chosen by them.
-
- l. Sui Generis Database Rights means rights other than copyright
- resulting from Directive 96/9/EC of the European Parliament and of
- the Council of 11 March 1996 on the legal protection of databases,
- as amended and/or succeeded, as well as other essentially
- equivalent rights anywhere in the world.
-
- m. You means the individual or entity exercising the Licensed Rights
- under this Public License. Your has a corresponding meaning.
-
-
-Section 2 -- Scope.
-
- a. License grant.
-
- 1. Subject to the terms and conditions of this Public License,
- the Licensor hereby grants You a worldwide, royalty-free,
- non-sublicensable, non-exclusive, irrevocable license to
- exercise the Licensed Rights in the Licensed Material to:
-
- a. reproduce and Share the Licensed Material, in whole or
- in part; and
-
- b. produce, reproduce, and Share Adapted Material.
-
- 2. Exceptions and Limitations. For the avoidance of doubt, where
- Exceptions and Limitations apply to Your use, this Public
- License does not apply, and You do not need to comply with
- its terms and conditions.
-
- 3. Term. The term of this Public License is specified in Section
- 6(a).
-
- 4. Media and formats; technical modifications allowed. The
- Licensor authorizes You to exercise the Licensed Rights in
- all media and formats whether now known or hereafter created,
- and to make technical modifications necessary to do so. The
- Licensor waives and/or agrees not to assert any right or
- authority to forbid You from making technical modifications
- necessary to exercise the Licensed Rights, including
- technical modifications necessary to circumvent Effective
- Technological Measures. For purposes of this Public License,
- simply making modifications authorized by this Section 2(a)
- (4) never produces Adapted Material.
-
- 5. Downstream recipients.
-
- a. Offer from the Licensor -- Licensed Material. Every
- recipient of the Licensed Material automatically
- receives an offer from the Licensor to exercise the
- Licensed Rights under the terms and conditions of this
- Public License.
-
- b. Additional offer from the Licensor -- Adapted Material.
- Every recipient of Adapted Material from You
- automatically receives an offer from the Licensor to
- exercise the Licensed Rights in the Adapted Material
- under the conditions of the Adapter's License You apply.
-
- c. No downstream restrictions. You may not offer or impose
- any additional or different terms or conditions on, or
- apply any Effective Technological Measures to, the
- Licensed Material if doing so restricts exercise of the
- Licensed Rights by any recipient of the Licensed
- Material.
-
- 6. No endorsement. Nothing in this Public License constitutes or
- may be construed as permission to assert or imply that You
- are, or that Your use of the Licensed Material is, connected
- with, or sponsored, endorsed, or granted official status by,
- the Licensor or others designated to receive attribution as
- provided in Section 3(a)(1)(A)(i).
-
- b. Other rights.
-
- 1. Moral rights, such as the right of integrity, are not
- licensed under this Public License, nor are publicity,
- privacy, and/or other similar personality rights; however, to
- the extent possible, the Licensor waives and/or agrees not to
- assert any such rights held by the Licensor to the limited
- extent necessary to allow You to exercise the Licensed
- Rights, but not otherwise.
-
- 2. Patent and trademark rights are not licensed under this
- Public License.
-
- 3. To the extent possible, the Licensor waives any right to
- collect royalties from You for the exercise of the Licensed
- Rights, whether directly or through a collecting society
- under any voluntary or waivable statutory or compulsory
- licensing scheme. In all other cases the Licensor expressly
- reserves any right to collect such royalties.
-
-
-Section 3 -- License Conditions.
-
-Your exercise of the Licensed Rights is expressly made subject to the
-following conditions.
-
- a. Attribution.
-
- 1. If You Share the Licensed Material (including in modified
- form), You must:
-
- a. retain the following if it is supplied by the Licensor
- with the Licensed Material:
-
- i. identification of the creator(s) of the Licensed
- Material and any others designated to receive
- attribution, in any reasonable manner requested by
- the Licensor (including by pseudonym if
- designated);
-
- ii. a copyright notice;
-
- iii. a notice that refers to this Public License;
-
- iv. a notice that refers to the disclaimer of
- warranties;
-
- v. a URI or hyperlink to the Licensed Material to the
- extent reasonably practicable;
-
- b. indicate if You modified the Licensed Material and
- retain an indication of any previous modifications; and
-
- c. indicate the Licensed Material is licensed under this
- Public License, and include the text of, or the URI or
- hyperlink to, this Public License.
-
- 2. You may satisfy the conditions in Section 3(a)(1) in any
- reasonable manner based on the medium, means, and context in
- which You Share the Licensed Material. For example, it may be
- reasonable to satisfy the conditions by providing a URI or
- hyperlink to a resource that includes the required
- information.
-
- 3. If requested by the Licensor, You must remove any of the
- information required by Section 3(a)(1)(A) to the extent
- reasonably practicable.
-
- b. ShareAlike.
-
- In addition to the conditions in Section 3(a), if You Share
- Adapted Material You produce, the following conditions also apply.
-
- 1. The Adapter's License You apply must be a Creative Commons
- license with the same License Elements, this version or
- later, or a BY-SA Compatible License.
-
- 2. You must include the text of, or the URI or hyperlink to, the
- Adapter's License You apply. You may satisfy this condition
- in any reasonable manner based on the medium, means, and
- context in which You Share Adapted Material.
-
- 3. You may not offer or impose any additional or different terms
- or conditions on, or apply any Effective Technological
- Measures to, Adapted Material that restrict exercise of the
- rights granted under the Adapter's License You apply.
-
-
-Section 4 -- Sui Generis Database Rights.
-
-Where the Licensed Rights include Sui Generis Database Rights that
-apply to Your use of the Licensed Material:
-
- a. for the avoidance of doubt, Section 2(a)(1) grants You the right
- to extract, reuse, reproduce, and Share all or a substantial
- portion of the contents of the database;
-
- b. if You include all or a substantial portion of the database
- contents in a database in which You have Sui Generis Database
- Rights, then the database in which You have Sui Generis Database
- Rights (but not its individual contents) is Adapted Material,
-
- including for purposes of Section 3(b); and
- c. You must comply with the conditions in Section 3(a) if You Share
- all or a substantial portion of the contents of the database.
-
-For the avoidance of doubt, this Section 4 supplements and does not
-replace Your obligations under this Public License where the Licensed
-Rights include other Copyright and Similar Rights.
-
-
-Section 5 -- Disclaimer of Warranties and Limitation of Liability.
-
- a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
- EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
- AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
- ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
- IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
- WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
- PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
- ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
- KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
- ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
-
- b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
- TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
- NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
- INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
- COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
- USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
- ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
- DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
- IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
-
- c. The disclaimer of warranties and limitation of liability provided
- above shall be interpreted in a manner that, to the extent
- possible, most closely approximates an absolute disclaimer and
- waiver of all liability.
-
-
-Section 6 -- Term and Termination.
-
- a. This Public License applies for the term of the Copyright and
- Similar Rights licensed here. However, if You fail to comply with
- this Public License, then Your rights under this Public License
- terminate automatically.
-
- b. Where Your right to use the Licensed Material has terminated under
- Section 6(a), it reinstates:
-
- 1. automatically as of the date the violation is cured, provided
- it is cured within 30 days of Your discovery of the
- violation; or
-
- 2. upon express reinstatement by the Licensor.
-
- For the avoidance of doubt, this Section 6(b) does not affect any
- right the Licensor may have to seek remedies for Your violations
- of this Public License.
-
- c. For the avoidance of doubt, the Licensor may also offer the
- Licensed Material under separate terms or conditions or stop
- distributing the Licensed Material at any time; however, doing so
- will not terminate this Public License.
-
- d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
- License.
-
-
-Section 7 -- Other Terms and Conditions.
-
- a. The Licensor shall not be bound by any additional or different
- terms or conditions communicated by You unless expressly agreed.
-
- b. Any arrangements, understandings, or agreements regarding the
- Licensed Material not stated herein are separate from and
- independent of the terms and conditions of this Public License.
-
-
-Section 8 -- Interpretation.
-
- a. For the avoidance of doubt, this Public License does not, and
- shall not be interpreted to, reduce, limit, restrict, or impose
- conditions on any use of the Licensed Material that could lawfully
- be made without permission under this Public License.
-
- b. To the extent possible, if any provision of this Public License is
- deemed unenforceable, it shall be automatically reformed to the
- minimum extent necessary to make it enforceable. If the provision
- cannot be reformed, it shall be severed from this Public License
- without affecting the enforceability of the remaining terms and
- conditions.
-
- c. No term or condition of this Public License will be waived and no
- failure to comply consented to unless expressly agreed to by the
- Licensor.
-
- d. Nothing in this Public License constitutes or may be interpreted
- as a limitation upon, or waiver of, any privileges and immunities
- that apply to the Licensor or You, including from the legal
- processes of any jurisdiction or authority.
-
-
-=======================================================================
-
-Creative Commons is not a party to its public
-licenses. Notwithstanding, Creative Commons may elect to apply one of
-its public licenses to material it publishes and in those instances
-will be considered the “Licensor.” The text of the Creative Commons
-public licenses is dedicated to the public domain under the CC0 Public
-Domain Dedication. Except for the limited purpose of indicating that
-material is shared under a Creative Commons public license or as
-otherwise permitted by the Creative Commons policies published at
-creativecommons.org/policies, Creative Commons does not authorize the
-use of the trademark "Creative Commons" or any other trademark or logo
-of Creative Commons without its prior written consent including,
-without limitation, in connection with any unauthorized modifications
-to any of its public licenses or any other arrangements,
-understandings, or agreements concerning use of licensed material. For
-the avoidance of doubt, this paragraph does not form part of the
-public licenses.
-
-Creative Commons may be contacted at creativecommons.org.
-
diff --git a/examples/training/textcat_example_data/README.md b/examples/training/textcat_example_data/README.md
deleted file mode 100644
index 1165f0293..000000000
--- a/examples/training/textcat_example_data/README.md
+++ /dev/null
@@ -1,34 +0,0 @@
-## Examples of textcat training data
-
-spacy JSON training files were generated from JSONL with:
-
-```
-python textcatjsonl_to_trainjson.py -m en file.jsonl .
-```
-
-`cooking.json` is an example with mutually-exclusive classes with two labels:
-
-* `baking`
-* `not_baking`
-
-`jigsaw-toxic-comment.json` is an example with multiple labels per instance:
-
-* `insult`
-* `obscene`
-* `severe_toxic`
-* `toxic`
-
-### Data Sources
-
-* `cooking.jsonl`: https://cooking.stackexchange.com. The meta IDs link to the
- original question as `https://cooking.stackexchange.com/questions/ID`, e.g.,
- `https://cooking.stackexchange.com/questions/2` for the first instance.
-* `jigsaw-toxic-comment.jsonl`: [Jigsaw Toxic Comments Classification
- Challenge](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge)
-
-### Data Licenses
-
-* `cooking.jsonl`: CC BY-SA 4.0 ([`CC_BY-SA-4.0.txt`](CC_BY-SA-4.0.txt))
-* `jigsaw-toxic-comment.jsonl`:
- * text: CC BY-SA 3.0 ([`CC_BY-SA-3.0.txt`](CC_BY-SA-3.0.txt))
- * annotation: CC0 ([`CC0.txt`](CC0.txt))
diff --git a/examples/training/textcat_example_data/cooking.json b/examples/training/textcat_example_data/cooking.json
deleted file mode 100644
index 4bad4db79..000000000
--- a/examples/training/textcat_example_data/cooking.json
+++ /dev/null
@@ -1,3487 +0,0 @@
-[
- {
- "id":0,
- "paragraphs":[
- {
- "raw":"How should I cook bacon in an oven?\nI've heard of people cooking bacon in an oven by laying the strips out on a cookie sheet. When using this method, how long should I cook the bacon for, and at what temperature?\n",
- "sentences":[
- {
- "tokens":[
- {
- "id":0,
- "orth":"How",
- "ner":"O"
- },
- {
- "id":1,
- "orth":"should",
- "ner":"O"
- },
- {
- "id":2,
- "orth":"I",
- "ner":"O"
- },
- {
- "id":3,
- "orth":"cook",
- "ner":"O"
- },
- {
- "id":4,
- "orth":"bacon",
- "ner":"O"
- },
- {
- "id":5,
- "orth":"in",
- "ner":"O"
- },
- {
- "id":6,
- "orth":"an",
- "ner":"O"
- },
- {
- "id":7,
- "orth":"oven",
- "ner":"O"
- },
- {
- "id":8,
- "orth":"?",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- },
- {
- "tokens":[
- {
- "id":9,
- "orth":"\n",
- "ner":"O"
- },
- {
- "id":10,
- "orth":"I",
- "ner":"O"
- },
- {
- "id":11,
- "orth":"'ve",
- "ner":"O"
- },
- {
- "id":12,
- "orth":"heard",
- "ner":"O"
- },
- {
- "id":13,
- "orth":"of",
- "ner":"O"
- },
- {
- "id":14,
- "orth":"people",
- "ner":"O"
- },
- {
- "id":15,
- "orth":"cooking",
- "ner":"O"
- },
- {
- "id":16,
- "orth":"bacon",
- "ner":"O"
- },
- {
- "id":17,
- "orth":"in",
- "ner":"O"
- },
- {
- "id":18,
- "orth":"an",
- "ner":"O"
- },
- {
- "id":19,
- "orth":"oven",
- "ner":"O"
- },
- {
- "id":20,
- "orth":"by",
- "ner":"O"
- },
- {
- "id":21,
- "orth":"laying",
- "ner":"O"
- },
- {
- "id":22,
- "orth":"the",
- "ner":"O"
- },
- {
- "id":23,
- "orth":"strips",
- "ner":"O"
- },
- {
- "id":24,
- "orth":"out",
- "ner":"O"
- },
- {
- "id":25,
- "orth":"on",
- "ner":"O"
- },
- {
- "id":26,
- "orth":"a",
- "ner":"O"
- },
- {
- "id":27,
- "orth":"cookie",
- "ner":"O"
- },
- {
- "id":28,
- "orth":"sheet",
- "ner":"O"
- },
- {
- "id":29,
- "orth":".",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- },
- {
- "tokens":[
- {
- "id":30,
- "orth":"When",
- "ner":"O"
- },
- {
- "id":31,
- "orth":"using",
- "ner":"O"
- },
- {
- "id":32,
- "orth":"this",
- "ner":"O"
- },
- {
- "id":33,
- "orth":"method",
- "ner":"O"
- },
- {
- "id":34,
- "orth":",",
- "ner":"O"
- },
- {
- "id":35,
- "orth":"how",
- "ner":"O"
- },
- {
- "id":36,
- "orth":"long",
- "ner":"O"
- },
- {
- "id":37,
- "orth":"should",
- "ner":"O"
- },
- {
- "id":38,
- "orth":"I",
- "ner":"O"
- },
- {
- "id":39,
- "orth":"cook",
- "ner":"O"
- },
- {
- "id":40,
- "orth":"the",
- "ner":"O"
- },
- {
- "id":41,
- "orth":"bacon",
- "ner":"O"
- },
- {
- "id":42,
- "orth":"for",
- "ner":"O"
- },
- {
- "id":43,
- "orth":",",
- "ner":"O"
- },
- {
- "id":44,
- "orth":"and",
- "ner":"O"
- },
- {
- "id":45,
- "orth":"at",
- "ner":"O"
- },
- {
- "id":46,
- "orth":"what",
- "ner":"O"
- },
- {
- "id":47,
- "orth":"temperature",
- "ner":"O"
- },
- {
- "id":48,
- "orth":"?",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- },
- {
- "tokens":[
- {
- "id":49,
- "orth":"\n",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- }
- ],
- "cats":[
- {
- "label":"baking",
- "value":0.0
- },
- {
- "label":"not_baking",
- "value":1.0
- }
- ]
- },
- {
- "raw":"What is the difference between white and brown eggs?\nI always use brown extra large eggs, but I can't honestly say why I do this other than habit at this point. Are there any distinct advantages or disadvantages like flavor, shelf life, etc?\n",
- "sentences":[
- {
- "tokens":[
- {
- "id":0,
- "orth":"What",
- "ner":"O"
- },
- {
- "id":1,
- "orth":"is",
- "ner":"O"
- },
- {
- "id":2,
- "orth":"the",
- "ner":"O"
- },
- {
- "id":3,
- "orth":"difference",
- "ner":"O"
- },
- {
- "id":4,
- "orth":"between",
- "ner":"O"
- },
- {
- "id":5,
- "orth":"white",
- "ner":"O"
- },
- {
- "id":6,
- "orth":"and",
- "ner":"O"
- },
- {
- "id":7,
- "orth":"brown",
- "ner":"O"
- },
- {
- "id":8,
- "orth":"eggs",
- "ner":"O"
- },
- {
- "id":9,
- "orth":"?",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- },
- {
- "tokens":[
- {
- "id":10,
- "orth":"\n",
- "ner":"O"
- },
- {
- "id":11,
- "orth":"I",
- "ner":"O"
- },
- {
- "id":12,
- "orth":"always",
- "ner":"O"
- },
- {
- "id":13,
- "orth":"use",
- "ner":"O"
- },
- {
- "id":14,
- "orth":"brown",
- "ner":"O"
- },
- {
- "id":15,
- "orth":"extra",
- "ner":"O"
- },
- {
- "id":16,
- "orth":"large",
- "ner":"O"
- },
- {
- "id":17,
- "orth":"eggs",
- "ner":"O"
- },
- {
- "id":18,
- "orth":",",
- "ner":"O"
- },
- {
- "id":19,
- "orth":"but",
- "ner":"O"
- },
- {
- "id":20,
- "orth":"I",
- "ner":"O"
- },
- {
- "id":21,
- "orth":"ca",
- "ner":"O"
- },
- {
- "id":22,
- "orth":"n't",
- "ner":"O"
- },
- {
- "id":23,
- "orth":"honestly",
- "ner":"O"
- },
- {
- "id":24,
- "orth":"say",
- "ner":"O"
- },
- {
- "id":25,
- "orth":"why",
- "ner":"O"
- },
- {
- "id":26,
- "orth":"I",
- "ner":"O"
- },
- {
- "id":27,
- "orth":"do",
- "ner":"O"
- },
- {
- "id":28,
- "orth":"this",
- "ner":"O"
- },
- {
- "id":29,
- "orth":"other",
- "ner":"O"
- },
- {
- "id":30,
- "orth":"than",
- "ner":"O"
- },
- {
- "id":31,
- "orth":"habit",
- "ner":"O"
- },
- {
- "id":32,
- "orth":"at",
- "ner":"O"
- },
- {
- "id":33,
- "orth":"this",
- "ner":"O"
- },
- {
- "id":34,
- "orth":"point",
- "ner":"O"
- },
- {
- "id":35,
- "orth":".",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- },
- {
- "tokens":[
- {
- "id":36,
- "orth":"Are",
- "ner":"O"
- },
- {
- "id":37,
- "orth":"there",
- "ner":"O"
- },
- {
- "id":38,
- "orth":"any",
- "ner":"O"
- },
- {
- "id":39,
- "orth":"distinct",
- "ner":"O"
- },
- {
- "id":40,
- "orth":"advantages",
- "ner":"O"
- },
- {
- "id":41,
- "orth":"or",
- "ner":"O"
- },
- {
- "id":42,
- "orth":"disadvantages",
- "ner":"O"
- },
- {
- "id":43,
- "orth":"like",
- "ner":"O"
- },
- {
- "id":44,
- "orth":"flavor",
- "ner":"O"
- },
- {
- "id":45,
- "orth":",",
- "ner":"O"
- },
- {
- "id":46,
- "orth":"shelf",
- "ner":"O"
- },
- {
- "id":47,
- "orth":"life",
- "ner":"O"
- },
- {
- "id":48,
- "orth":",",
- "ner":"O"
- },
- {
- "id":49,
- "orth":"etc",
- "ner":"O"
- },
- {
- "id":50,
- "orth":"?",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- },
- {
- "tokens":[
- {
- "id":51,
- "orth":"\n",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- }
- ],
- "cats":[
- {
- "label":"baking",
- "value":0.0
- },
- {
- "label":"not_baking",
- "value":1.0
- }
- ]
- },
- {
- "raw":"What is the difference between baking soda and baking powder?\nAnd can I use one in place of the other in certain recipes?\n",
- "sentences":[
- {
- "tokens":[
- {
- "id":0,
- "orth":"What",
- "ner":"O"
- },
- {
- "id":1,
- "orth":"is",
- "ner":"O"
- },
- {
- "id":2,
- "orth":"the",
- "ner":"O"
- },
- {
- "id":3,
- "orth":"difference",
- "ner":"O"
- },
- {
- "id":4,
- "orth":"between",
- "ner":"O"
- },
- {
- "id":5,
- "orth":"baking",
- "ner":"O"
- },
- {
- "id":6,
- "orth":"soda",
- "ner":"O"
- },
- {
- "id":7,
- "orth":"and",
- "ner":"O"
- },
- {
- "id":8,
- "orth":"baking",
- "ner":"O"
- },
- {
- "id":9,
- "orth":"powder",
- "ner":"O"
- },
- {
- "id":10,
- "orth":"?",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- },
- {
- "tokens":[
- {
- "id":11,
- "orth":"\n",
- "ner":"O"
- },
- {
- "id":12,
- "orth":"And",
- "ner":"O"
- },
- {
- "id":13,
- "orth":"can",
- "ner":"O"
- },
- {
- "id":14,
- "orth":"I",
- "ner":"O"
- },
- {
- "id":15,
- "orth":"use",
- "ner":"O"
- },
- {
- "id":16,
- "orth":"one",
- "ner":"O"
- },
- {
- "id":17,
- "orth":"in",
- "ner":"O"
- },
- {
- "id":18,
- "orth":"place",
- "ner":"O"
- },
- {
- "id":19,
- "orth":"of",
- "ner":"O"
- },
- {
- "id":20,
- "orth":"the",
- "ner":"O"
- },
- {
- "id":21,
- "orth":"other",
- "ner":"O"
- },
- {
- "id":22,
- "orth":"in",
- "ner":"O"
- },
- {
- "id":23,
- "orth":"certain",
- "ner":"O"
- },
- {
- "id":24,
- "orth":"recipes",
- "ner":"O"
- },
- {
- "id":25,
- "orth":"?",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- },
- {
- "tokens":[
- {
- "id":26,
- "orth":"\n",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- }
- ],
- "cats":[
- {
- "label":"baking",
- "value":0.0
- },
- {
- "label":"not_baking",
- "value":1.0
- }
- ]
- },
- {
- "raw":"In a tomato sauce recipe, how can I cut the acidity?\nIt seems that every time I make a tomato sauce for pasta, the sauce is a little bit too acid for my taste. I've tried using sugar or sodium bicarbonate, but I'm not satisfied with the results.\n",
- "sentences":[
- {
- "tokens":[
- {
- "id":0,
- "orth":"In",
- "ner":"O"
- },
- {
- "id":1,
- "orth":"a",
- "ner":"O"
- },
- {
- "id":2,
- "orth":"tomato",
- "ner":"O"
- },
- {
- "id":3,
- "orth":"sauce",
- "ner":"O"
- },
- {
- "id":4,
- "orth":"recipe",
- "ner":"O"
- },
- {
- "id":5,
- "orth":",",
- "ner":"O"
- },
- {
- "id":6,
- "orth":"how",
- "ner":"O"
- },
- {
- "id":7,
- "orth":"can",
- "ner":"O"
- },
- {
- "id":8,
- "orth":"I",
- "ner":"O"
- },
- {
- "id":9,
- "orth":"cut",
- "ner":"O"
- },
- {
- "id":10,
- "orth":"the",
- "ner":"O"
- },
- {
- "id":11,
- "orth":"acidity",
- "ner":"O"
- },
- {
- "id":12,
- "orth":"?",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- },
- {
- "tokens":[
- {
- "id":13,
- "orth":"\n",
- "ner":"O"
- },
- {
- "id":14,
- "orth":"It",
- "ner":"O"
- },
- {
- "id":15,
- "orth":"seems",
- "ner":"O"
- },
- {
- "id":16,
- "orth":"that",
- "ner":"O"
- },
- {
- "id":17,
- "orth":"every",
- "ner":"O"
- },
- {
- "id":18,
- "orth":"time",
- "ner":"O"
- },
- {
- "id":19,
- "orth":"I",
- "ner":"O"
- },
- {
- "id":20,
- "orth":"make",
- "ner":"O"
- },
- {
- "id":21,
- "orth":"a",
- "ner":"O"
- },
- {
- "id":22,
- "orth":"tomato",
- "ner":"O"
- },
- {
- "id":23,
- "orth":"sauce",
- "ner":"O"
- },
- {
- "id":24,
- "orth":"for",
- "ner":"O"
- },
- {
- "id":25,
- "orth":"pasta",
- "ner":"O"
- },
- {
- "id":26,
- "orth":",",
- "ner":"O"
- },
- {
- "id":27,
- "orth":"the",
- "ner":"O"
- },
- {
- "id":28,
- "orth":"sauce",
- "ner":"O"
- },
- {
- "id":29,
- "orth":"is",
- "ner":"O"
- },
- {
- "id":30,
- "orth":"a",
- "ner":"O"
- },
- {
- "id":31,
- "orth":"little",
- "ner":"O"
- },
- {
- "id":32,
- "orth":"bit",
- "ner":"O"
- },
- {
- "id":33,
- "orth":"too",
- "ner":"O"
- },
- {
- "id":34,
- "orth":"acid",
- "ner":"O"
- },
- {
- "id":35,
- "orth":"for",
- "ner":"O"
- },
- {
- "id":36,
- "orth":"my",
- "ner":"O"
- },
- {
- "id":37,
- "orth":"taste",
- "ner":"O"
- },
- {
- "id":38,
- "orth":".",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- },
- {
- "tokens":[
- {
- "id":39,
- "orth":"I",
- "ner":"O"
- },
- {
- "id":40,
- "orth":"'ve",
- "ner":"O"
- },
- {
- "id":41,
- "orth":"tried",
- "ner":"O"
- },
- {
- "id":42,
- "orth":"using",
- "ner":"O"
- },
- {
- "id":43,
- "orth":"sugar",
- "ner":"O"
- },
- {
- "id":44,
- "orth":"or",
- "ner":"O"
- },
- {
- "id":45,
- "orth":"sodium",
- "ner":"O"
- },
- {
- "id":46,
- "orth":"bicarbonate",
- "ner":"O"
- },
- {
- "id":47,
- "orth":",",
- "ner":"O"
- },
- {
- "id":48,
- "orth":"but",
- "ner":"O"
- },
- {
- "id":49,
- "orth":"I",
- "ner":"O"
- },
- {
- "id":50,
- "orth":"'m",
- "ner":"O"
- },
- {
- "id":51,
- "orth":"not",
- "ner":"O"
- },
- {
- "id":52,
- "orth":"satisfied",
- "ner":"O"
- },
- {
- "id":53,
- "orth":"with",
- "ner":"O"
- },
- {
- "id":54,
- "orth":"the",
- "ner":"O"
- },
- {
- "id":55,
- "orth":"results",
- "ner":"O"
- },
- {
- "id":56,
- "orth":".",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- },
- {
- "tokens":[
- {
- "id":57,
- "orth":"\n",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- }
- ],
- "cats":[
- {
- "label":"baking",
- "value":0.0
- },
- {
- "label":"not_baking",
- "value":1.0
- }
- ]
- },
- {
- "raw":"What ingredients (available in specific regions) can I substitute for parsley?\nI have a recipe that calls for fresh parsley. I have substituted other fresh herbs for their dried equivalents but I don't have fresh or dried parsley. Is there something else (ex another dried herb) that I can use instead of parsley?\nI know it is used mainly for looks rather than taste but I have a pasta recipe that calls for 2 tablespoons of parsley in the sauce and then another 2 tablespoons on top when it is done. I know the parsley on top is more for looks but there must be something about the taste otherwise it would call for parsley within the sauce as well.\nI would especially like to hear about substitutes available in Southeast Asia and other parts of the world where the obvious answers (such as cilantro) are not widely available.\n",
- "sentences":[
- {
- "tokens":[
- {
- "id":0,
- "orth":"What",
- "ner":"O"
- },
- {
- "id":1,
- "orth":"ingredients",
- "ner":"O"
- },
- {
- "id":2,
- "orth":"(",
- "ner":"O"
- },
- {
- "id":3,
- "orth":"available",
- "ner":"O"
- },
- {
- "id":4,
- "orth":"in",
- "ner":"O"
- },
- {
- "id":5,
- "orth":"specific",
- "ner":"O"
- },
- {
- "id":6,
- "orth":"regions",
- "ner":"O"
- },
- {
- "id":7,
- "orth":")",
- "ner":"O"
- },
- {
- "id":8,
- "orth":"can",
- "ner":"O"
- },
- {
- "id":9,
- "orth":"I",
- "ner":"O"
- },
- {
- "id":10,
- "orth":"substitute",
- "ner":"O"
- },
- {
- "id":11,
- "orth":"for",
- "ner":"O"
- },
- {
- "id":12,
- "orth":"parsley",
- "ner":"O"
- },
- {
- "id":13,
- "orth":"?",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- },
- {
- "tokens":[
- {
- "id":14,
- "orth":"\n",
- "ner":"O"
- },
- {
- "id":15,
- "orth":"I",
- "ner":"O"
- },
- {
- "id":16,
- "orth":"have",
- "ner":"O"
- },
- {
- "id":17,
- "orth":"a",
- "ner":"O"
- },
- {
- "id":18,
- "orth":"recipe",
- "ner":"O"
- },
- {
- "id":19,
- "orth":"that",
- "ner":"O"
- },
- {
- "id":20,
- "orth":"calls",
- "ner":"O"
- },
- {
- "id":21,
- "orth":"for",
- "ner":"O"
- },
- {
- "id":22,
- "orth":"fresh",
- "ner":"O"
- },
- {
- "id":23,
- "orth":"parsley",
- "ner":"O"
- },
- {
- "id":24,
- "orth":".",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- },
- {
- "tokens":[
- {
- "id":25,
- "orth":"I",
- "ner":"O"
- },
- {
- "id":26,
- "orth":"have",
- "ner":"O"
- },
- {
- "id":27,
- "orth":"substituted",
- "ner":"O"
- },
- {
- "id":28,
- "orth":"other",
- "ner":"O"
- },
- {
- "id":29,
- "orth":"fresh",
- "ner":"O"
- },
- {
- "id":30,
- "orth":"herbs",
- "ner":"O"
- },
- {
- "id":31,
- "orth":"for",
- "ner":"O"
- },
- {
- "id":32,
- "orth":"their",
- "ner":"O"
- },
- {
- "id":33,
- "orth":"dried",
- "ner":"O"
- },
- {
- "id":34,
- "orth":"equivalents",
- "ner":"O"
- },
- {
- "id":35,
- "orth":"but",
- "ner":"O"
- },
- {
- "id":36,
- "orth":"I",
- "ner":"O"
- },
- {
- "id":37,
- "orth":"do",
- "ner":"O"
- },
- {
- "id":38,
- "orth":"n't",
- "ner":"O"
- },
- {
- "id":39,
- "orth":"have",
- "ner":"O"
- },
- {
- "id":40,
- "orth":"fresh",
- "ner":"O"
- },
- {
- "id":41,
- "orth":"or",
- "ner":"O"
- },
- {
- "id":42,
- "orth":"dried",
- "ner":"O"
- },
- {
- "id":43,
- "orth":"parsley",
- "ner":"O"
- },
- {
- "id":44,
- "orth":".",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- },
- {
- "tokens":[
- {
- "id":45,
- "orth":"Is",
- "ner":"O"
- },
- {
- "id":46,
- "orth":"there",
- "ner":"O"
- },
- {
- "id":47,
- "orth":"something",
- "ner":"O"
- },
- {
- "id":48,
- "orth":"else",
- "ner":"O"
- },
- {
- "id":49,
- "orth":"(",
- "ner":"O"
- },
- {
- "id":50,
- "orth":"ex",
- "ner":"O"
- },
- {
- "id":51,
- "orth":"another",
- "ner":"O"
- },
- {
- "id":52,
- "orth":"dried",
- "ner":"O"
- },
- {
- "id":53,
- "orth":"herb",
- "ner":"O"
- },
- {
- "id":54,
- "orth":")",
- "ner":"O"
- },
- {
- "id":55,
- "orth":"that",
- "ner":"O"
- },
- {
- "id":56,
- "orth":"I",
- "ner":"O"
- },
- {
- "id":57,
- "orth":"can",
- "ner":"O"
- },
- {
- "id":58,
- "orth":"use",
- "ner":"O"
- },
- {
- "id":59,
- "orth":"instead",
- "ner":"O"
- },
- {
- "id":60,
- "orth":"of",
- "ner":"O"
- },
- {
- "id":61,
- "orth":"parsley",
- "ner":"O"
- },
- {
- "id":62,
- "orth":"?",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- },
- {
- "tokens":[
- {
- "id":63,
- "orth":"\n",
- "ner":"O"
- },
- {
- "id":64,
- "orth":"I",
- "ner":"O"
- },
- {
- "id":65,
- "orth":"know",
- "ner":"O"
- },
- {
- "id":66,
- "orth":"it",
- "ner":"O"
- },
- {
- "id":67,
- "orth":"is",
- "ner":"O"
- },
- {
- "id":68,
- "orth":"used",
- "ner":"O"
- },
- {
- "id":69,
- "orth":"mainly",
- "ner":"O"
- },
- {
- "id":70,
- "orth":"for",
- "ner":"O"
- },
- {
- "id":71,
- "orth":"looks",
- "ner":"O"
- },
- {
- "id":72,
- "orth":"rather",
- "ner":"O"
- },
- {
- "id":73,
- "orth":"than",
- "ner":"O"
- },
- {
- "id":74,
- "orth":"taste",
- "ner":"O"
- },
- {
- "id":75,
- "orth":"but",
- "ner":"O"
- },
- {
- "id":76,
- "orth":"I",
- "ner":"O"
- },
- {
- "id":77,
- "orth":"have",
- "ner":"O"
- },
- {
- "id":78,
- "orth":"a",
- "ner":"O"
- },
- {
- "id":79,
- "orth":"pasta",
- "ner":"O"
- },
- {
- "id":80,
- "orth":"recipe",
- "ner":"O"
- },
- {
- "id":81,
- "orth":"that",
- "ner":"O"
- },
- {
- "id":82,
- "orth":"calls",
- "ner":"O"
- },
- {
- "id":83,
- "orth":"for",
- "ner":"O"
- },
- {
- "id":84,
- "orth":"2",
- "ner":"O"
- },
- {
- "id":85,
- "orth":"tablespoons",
- "ner":"O"
- },
- {
- "id":86,
- "orth":"of",
- "ner":"O"
- },
- {
- "id":87,
- "orth":"parsley",
- "ner":"O"
- },
- {
- "id":88,
- "orth":"in",
- "ner":"O"
- },
- {
- "id":89,
- "orth":"the",
- "ner":"O"
- },
- {
- "id":90,
- "orth":"sauce",
- "ner":"O"
- },
- {
- "id":91,
- "orth":"and",
- "ner":"O"
- },
- {
- "id":92,
- "orth":"then",
- "ner":"O"
- },
- {
- "id":93,
- "orth":"another",
- "ner":"O"
- },
- {
- "id":94,
- "orth":"2",
- "ner":"O"
- },
- {
- "id":95,
- "orth":"tablespoons",
- "ner":"O"
- },
- {
- "id":96,
- "orth":"on",
- "ner":"O"
- },
- {
- "id":97,
- "orth":"top",
- "ner":"O"
- },
- {
- "id":98,
- "orth":"when",
- "ner":"O"
- },
- {
- "id":99,
- "orth":"it",
- "ner":"O"
- },
- {
- "id":100,
- "orth":"is",
- "ner":"O"
- },
- {
- "id":101,
- "orth":"done",
- "ner":"O"
- },
- {
- "id":102,
- "orth":".",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- },
- {
- "tokens":[
- {
- "id":103,
- "orth":"I",
- "ner":"O"
- },
- {
- "id":104,
- "orth":"know",
- "ner":"O"
- },
- {
- "id":105,
- "orth":"the",
- "ner":"O"
- },
- {
- "id":106,
- "orth":"parsley",
- "ner":"O"
- },
- {
- "id":107,
- "orth":"on",
- "ner":"O"
- },
- {
- "id":108,
- "orth":"top",
- "ner":"O"
- },
- {
- "id":109,
- "orth":"is",
- "ner":"O"
- },
- {
- "id":110,
- "orth":"more",
- "ner":"O"
- },
- {
- "id":111,
- "orth":"for",
- "ner":"O"
- },
- {
- "id":112,
- "orth":"looks",
- "ner":"O"
- },
- {
- "id":113,
- "orth":"but",
- "ner":"O"
- },
- {
- "id":114,
- "orth":"there",
- "ner":"O"
- },
- {
- "id":115,
- "orth":"must",
- "ner":"O"
- },
- {
- "id":116,
- "orth":"be",
- "ner":"O"
- },
- {
- "id":117,
- "orth":"something",
- "ner":"O"
- },
- {
- "id":118,
- "orth":"about",
- "ner":"O"
- },
- {
- "id":119,
- "orth":"the",
- "ner":"O"
- },
- {
- "id":120,
- "orth":"taste",
- "ner":"O"
- },
- {
- "id":121,
- "orth":"otherwise",
- "ner":"O"
- },
- {
- "id":122,
- "orth":"it",
- "ner":"O"
- },
- {
- "id":123,
- "orth":"would",
- "ner":"O"
- },
- {
- "id":124,
- "orth":"call",
- "ner":"O"
- },
- {
- "id":125,
- "orth":"for",
- "ner":"O"
- },
- {
- "id":126,
- "orth":"parsley",
- "ner":"O"
- },
- {
- "id":127,
- "orth":"within",
- "ner":"O"
- },
- {
- "id":128,
- "orth":"the",
- "ner":"O"
- },
- {
- "id":129,
- "orth":"sauce",
- "ner":"O"
- },
- {
- "id":130,
- "orth":"as",
- "ner":"O"
- },
- {
- "id":131,
- "orth":"well",
- "ner":"O"
- },
- {
- "id":132,
- "orth":".",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- },
- {
- "tokens":[
- {
- "id":133,
- "orth":"\n",
- "ner":"O"
- },
- {
- "id":134,
- "orth":"I",
- "ner":"O"
- },
- {
- "id":135,
- "orth":"would",
- "ner":"O"
- },
- {
- "id":136,
- "orth":"especially",
- "ner":"O"
- },
- {
- "id":137,
- "orth":"like",
- "ner":"O"
- },
- {
- "id":138,
- "orth":"to",
- "ner":"O"
- },
- {
- "id":139,
- "orth":"hear",
- "ner":"O"
- },
- {
- "id":140,
- "orth":"about",
- "ner":"O"
- },
- {
- "id":141,
- "orth":"substitutes",
- "ner":"O"
- },
- {
- "id":142,
- "orth":"available",
- "ner":"O"
- },
- {
- "id":143,
- "orth":"in",
- "ner":"O"
- },
- {
- "id":144,
- "orth":"Southeast",
- "ner":"O"
- },
- {
- "id":145,
- "orth":"Asia",
- "ner":"O"
- },
- {
- "id":146,
- "orth":"and",
- "ner":"O"
- },
- {
- "id":147,
- "orth":"other",
- "ner":"O"
- },
- {
- "id":148,
- "orth":"parts",
- "ner":"O"
- },
- {
- "id":149,
- "orth":"of",
- "ner":"O"
- },
- {
- "id":150,
- "orth":"the",
- "ner":"O"
- },
- {
- "id":151,
- "orth":"world",
- "ner":"O"
- },
- {
- "id":152,
- "orth":"where",
- "ner":"O"
- },
- {
- "id":153,
- "orth":"the",
- "ner":"O"
- },
- {
- "id":154,
- "orth":"obvious",
- "ner":"O"
- },
- {
- "id":155,
- "orth":"answers",
- "ner":"O"
- },
- {
- "id":156,
- "orth":"(",
- "ner":"O"
- },
- {
- "id":157,
- "orth":"such",
- "ner":"O"
- },
- {
- "id":158,
- "orth":"as",
- "ner":"O"
- },
- {
- "id":159,
- "orth":"cilantro",
- "ner":"O"
- },
- {
- "id":160,
- "orth":")",
- "ner":"O"
- },
- {
- "id":161,
- "orth":"are",
- "ner":"O"
- },
- {
- "id":162,
- "orth":"not",
- "ner":"O"
- },
- {
- "id":163,
- "orth":"widely",
- "ner":"O"
- },
- {
- "id":164,
- "orth":"available",
- "ner":"O"
- },
- {
- "id":165,
- "orth":".",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- },
- {
- "tokens":[
- {
- "id":166,
- "orth":"\n",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- }
- ],
- "cats":[
- {
- "label":"baking",
- "value":0.0
- },
- {
- "label":"not_baking",
- "value":1.0
- }
- ]
- },
- {
- "raw":"What is the internal temperature a steak should be cooked to for Rare/Medium Rare/Medium/Well?\nI'd like to know when to take my steaks off the grill and please everybody.\n",
- "sentences":[
- {
- "tokens":[
- {
- "id":0,
- "orth":"What",
- "ner":"O"
- },
- {
- "id":1,
- "orth":"is",
- "ner":"O"
- },
- {
- "id":2,
- "orth":"the",
- "ner":"O"
- },
- {
- "id":3,
- "orth":"internal",
- "ner":"O"
- },
- {
- "id":4,
- "orth":"temperature",
- "ner":"O"
- },
- {
- "id":5,
- "orth":"a",
- "ner":"O"
- },
- {
- "id":6,
- "orth":"steak",
- "ner":"O"
- },
- {
- "id":7,
- "orth":"should",
- "ner":"O"
- },
- {
- "id":8,
- "orth":"be",
- "ner":"O"
- },
- {
- "id":9,
- "orth":"cooked",
- "ner":"O"
- },
- {
- "id":10,
- "orth":"to",
- "ner":"O"
- },
- {
- "id":11,
- "orth":"for",
- "ner":"O"
- },
- {
- "id":12,
- "orth":"Rare",
- "ner":"O"
- },
- {
- "id":13,
- "orth":"/",
- "ner":"O"
- },
- {
- "id":14,
- "orth":"Medium",
- "ner":"O"
- },
- {
- "id":15,
- "orth":"Rare",
- "ner":"O"
- },
- {
- "id":16,
- "orth":"/",
- "ner":"O"
- },
- {
- "id":17,
- "orth":"Medium",
- "ner":"O"
- },
- {
- "id":18,
- "orth":"/",
- "ner":"O"
- },
- {
- "id":19,
- "orth":"Well",
- "ner":"O"
- },
- {
- "id":20,
- "orth":"?",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- },
- {
- "tokens":[
- {
- "id":21,
- "orth":"\n",
- "ner":"O"
- },
- {
- "id":22,
- "orth":"I",
- "ner":"O"
- },
- {
- "id":23,
- "orth":"'d",
- "ner":"O"
- },
- {
- "id":24,
- "orth":"like",
- "ner":"O"
- },
- {
- "id":25,
- "orth":"to",
- "ner":"O"
- },
- {
- "id":26,
- "orth":"know",
- "ner":"O"
- },
- {
- "id":27,
- "orth":"when",
- "ner":"O"
- },
- {
- "id":28,
- "orth":"to",
- "ner":"O"
- },
- {
- "id":29,
- "orth":"take",
- "ner":"O"
- },
- {
- "id":30,
- "orth":"my",
- "ner":"O"
- },
- {
- "id":31,
- "orth":"steaks",
- "ner":"O"
- },
- {
- "id":32,
- "orth":"off",
- "ner":"O"
- },
- {
- "id":33,
- "orth":"the",
- "ner":"O"
- },
- {
- "id":34,
- "orth":"grill",
- "ner":"O"
- },
- {
- "id":35,
- "orth":"and",
- "ner":"O"
- },
- {
- "id":36,
- "orth":"please",
- "ner":"O"
- },
- {
- "id":37,
- "orth":"everybody",
- "ner":"O"
- },
- {
- "id":38,
- "orth":".",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- },
- {
- "tokens":[
- {
- "id":39,
- "orth":"\n",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- }
- ],
- "cats":[
- {
- "label":"baking",
- "value":0.0
- },
- {
- "label":"not_baking",
- "value":1.0
- }
- ]
- },
- {
- "raw":"How should I poach an egg?\nWhat's the best method to poach an egg without it turning into an eggy soupy mess?\n",
- "sentences":[
- {
- "tokens":[
- {
- "id":0,
- "orth":"How",
- "ner":"O"
- },
- {
- "id":1,
- "orth":"should",
- "ner":"O"
- },
- {
- "id":2,
- "orth":"I",
- "ner":"O"
- },
- {
- "id":3,
- "orth":"poach",
- "ner":"O"
- },
- {
- "id":4,
- "orth":"an",
- "ner":"O"
- },
- {
- "id":5,
- "orth":"egg",
- "ner":"O"
- },
- {
- "id":6,
- "orth":"?",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- },
- {
- "tokens":[
- {
- "id":7,
- "orth":"\n",
- "ner":"O"
- },
- {
- "id":8,
- "orth":"What",
- "ner":"O"
- },
- {
- "id":9,
- "orth":"'s",
- "ner":"O"
- },
- {
- "id":10,
- "orth":"the",
- "ner":"O"
- },
- {
- "id":11,
- "orth":"best",
- "ner":"O"
- },
- {
- "id":12,
- "orth":"method",
- "ner":"O"
- },
- {
- "id":13,
- "orth":"to",
- "ner":"O"
- },
- {
- "id":14,
- "orth":"poach",
- "ner":"O"
- },
- {
- "id":15,
- "orth":"an",
- "ner":"O"
- },
- {
- "id":16,
- "orth":"egg",
- "ner":"O"
- },
- {
- "id":17,
- "orth":"without",
- "ner":"O"
- },
- {
- "id":18,
- "orth":"it",
- "ner":"O"
- },
- {
- "id":19,
- "orth":"turning",
- "ner":"O"
- },
- {
- "id":20,
- "orth":"into",
- "ner":"O"
- },
- {
- "id":21,
- "orth":"an",
- "ner":"O"
- },
- {
- "id":22,
- "orth":"eggy",
- "ner":"O"
- },
- {
- "id":23,
- "orth":"soupy",
- "ner":"O"
- },
- {
- "id":24,
- "orth":"mess",
- "ner":"O"
- },
- {
- "id":25,
- "orth":"?",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- },
- {
- "tokens":[
- {
- "id":26,
- "orth":"\n",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- }
- ],
- "cats":[
- {
- "label":"baking",
- "value":0.0
- },
- {
- "label":"not_baking",
- "value":1.0
- }
- ]
- },
- {
- "raw":"How can I make my Ice Cream \"creamier\"\nMy ice cream doesn't feel creamy enough. I got the recipe from Good Eats, and I can't tell if it's just the recipe or maybe that I'm just not getting my \"batter\" cold enough before I try to make it (I let it chill overnight in the refrigerator, but it doesn't always come out of the machine looking like \"soft serve\" as he said on the show - it's usually a little thinner).\nRecipe: http://www.foodnetwork.com/recipes/alton-brown/serious-vanilla-ice-cream-recipe/index.html\nThanks!\n",
- "sentences":[
- {
- "tokens":[
- {
- "id":0,
- "orth":"How",
- "ner":"O"
- },
- {
- "id":1,
- "orth":"can",
- "ner":"O"
- },
- {
- "id":2,
- "orth":"I",
- "ner":"O"
- },
- {
- "id":3,
- "orth":"make",
- "ner":"O"
- },
- {
- "id":4,
- "orth":"my",
- "ner":"O"
- },
- {
- "id":5,
- "orth":"Ice",
- "ner":"O"
- },
- {
- "id":6,
- "orth":"Cream",
- "ner":"O"
- },
- {
- "id":7,
- "orth":"\"",
- "ner":"O"
- },
- {
- "id":8,
- "orth":"creamier",
- "ner":"O"
- },
- {
- "id":9,
- "orth":"\"",
- "ner":"O"
- },
- {
- "id":10,
- "orth":"\n",
- "ner":"O"
- },
- {
- "id":11,
- "orth":"My",
- "ner":"O"
- },
- {
- "id":12,
- "orth":"ice",
- "ner":"O"
- },
- {
- "id":13,
- "orth":"cream",
- "ner":"O"
- },
- {
- "id":14,
- "orth":"does",
- "ner":"O"
- },
- {
- "id":15,
- "orth":"n't",
- "ner":"O"
- },
- {
- "id":16,
- "orth":"feel",
- "ner":"O"
- },
- {
- "id":17,
- "orth":"creamy",
- "ner":"O"
- },
- {
- "id":18,
- "orth":"enough",
- "ner":"O"
- },
- {
- "id":19,
- "orth":".",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- },
- {
- "tokens":[
- {
- "id":20,
- "orth":" ",
- "ner":"O"
- },
- {
- "id":21,
- "orth":"I",
- "ner":"O"
- },
- {
- "id":22,
- "orth":"got",
- "ner":"O"
- },
- {
- "id":23,
- "orth":"the",
- "ner":"O"
- },
- {
- "id":24,
- "orth":"recipe",
- "ner":"O"
- },
- {
- "id":25,
- "orth":"from",
- "ner":"O"
- },
- {
- "id":26,
- "orth":"Good",
- "ner":"O"
- },
- {
- "id":27,
- "orth":"Eats",
- "ner":"O"
- },
- {
- "id":28,
- "orth":",",
- "ner":"O"
- },
- {
- "id":29,
- "orth":"and",
- "ner":"O"
- },
- {
- "id":30,
- "orth":"I",
- "ner":"O"
- },
- {
- "id":31,
- "orth":"ca",
- "ner":"O"
- },
- {
- "id":32,
- "orth":"n't",
- "ner":"O"
- },
- {
- "id":33,
- "orth":"tell",
- "ner":"O"
- },
- {
- "id":34,
- "orth":"if",
- "ner":"O"
- },
- {
- "id":35,
- "orth":"it",
- "ner":"O"
- },
- {
- "id":36,
- "orth":"'s",
- "ner":"O"
- },
- {
- "id":37,
- "orth":"just",
- "ner":"O"
- },
- {
- "id":38,
- "orth":"the",
- "ner":"O"
- },
- {
- "id":39,
- "orth":"recipe",
- "ner":"O"
- },
- {
- "id":40,
- "orth":"or",
- "ner":"O"
- },
- {
- "id":41,
- "orth":"maybe",
- "ner":"O"
- },
- {
- "id":42,
- "orth":"that",
- "ner":"O"
- },
- {
- "id":43,
- "orth":"I",
- "ner":"O"
- },
- {
- "id":44,
- "orth":"'m",
- "ner":"O"
- },
- {
- "id":45,
- "orth":"just",
- "ner":"O"
- },
- {
- "id":46,
- "orth":"not",
- "ner":"O"
- },
- {
- "id":47,
- "orth":"getting",
- "ner":"O"
- },
- {
- "id":48,
- "orth":"my",
- "ner":"O"
- },
- {
- "id":49,
- "orth":"\"",
- "ner":"O"
- },
- {
- "id":50,
- "orth":"batter",
- "ner":"O"
- },
- {
- "id":51,
- "orth":"\"",
- "ner":"O"
- },
- {
- "id":52,
- "orth":"cold",
- "ner":"O"
- },
- {
- "id":53,
- "orth":"enough",
- "ner":"O"
- },
- {
- "id":54,
- "orth":"before",
- "ner":"O"
- },
- {
- "id":55,
- "orth":"I",
- "ner":"O"
- },
- {
- "id":56,
- "orth":"try",
- "ner":"O"
- },
- {
- "id":57,
- "orth":"to",
- "ner":"O"
- },
- {
- "id":58,
- "orth":"make",
- "ner":"O"
- },
- {
- "id":59,
- "orth":"it",
- "ner":"O"
- },
- {
- "id":60,
- "orth":"(",
- "ner":"O"
- },
- {
- "id":61,
- "orth":"I",
- "ner":"O"
- },
- {
- "id":62,
- "orth":"let",
- "ner":"O"
- },
- {
- "id":63,
- "orth":"it",
- "ner":"O"
- },
- {
- "id":64,
- "orth":"chill",
- "ner":"O"
- },
- {
- "id":65,
- "orth":"overnight",
- "ner":"O"
- },
- {
- "id":66,
- "orth":"in",
- "ner":"O"
- },
- {
- "id":67,
- "orth":"the",
- "ner":"O"
- },
- {
- "id":68,
- "orth":"refrigerator",
- "ner":"O"
- },
- {
- "id":69,
- "orth":",",
- "ner":"O"
- },
- {
- "id":70,
- "orth":"but",
- "ner":"O"
- },
- {
- "id":71,
- "orth":"it",
- "ner":"O"
- },
- {
- "id":72,
- "orth":"does",
- "ner":"O"
- },
- {
- "id":73,
- "orth":"n't",
- "ner":"O"
- },
- {
- "id":74,
- "orth":"always",
- "ner":"O"
- },
- {
- "id":75,
- "orth":"come",
- "ner":"O"
- },
- {
- "id":76,
- "orth":"out",
- "ner":"O"
- },
- {
- "id":77,
- "orth":"of",
- "ner":"O"
- },
- {
- "id":78,
- "orth":"the",
- "ner":"O"
- },
- {
- "id":79,
- "orth":"machine",
- "ner":"O"
- },
- {
- "id":80,
- "orth":"looking",
- "ner":"O"
- },
- {
- "id":81,
- "orth":"like",
- "ner":"O"
- },
- {
- "id":82,
- "orth":"\"",
- "ner":"O"
- },
- {
- "id":83,
- "orth":"soft",
- "ner":"O"
- },
- {
- "id":84,
- "orth":"serve",
- "ner":"O"
- },
- {
- "id":85,
- "orth":"\"",
- "ner":"O"
- },
- {
- "id":86,
- "orth":"as",
- "ner":"O"
- },
- {
- "id":87,
- "orth":"he",
- "ner":"O"
- },
- {
- "id":88,
- "orth":"said",
- "ner":"O"
- },
- {
- "id":89,
- "orth":"on",
- "ner":"O"
- },
- {
- "id":90,
- "orth":"the",
- "ner":"O"
- },
- {
- "id":91,
- "orth":"show",
- "ner":"O"
- },
- {
- "id":92,
- "orth":"-",
- "ner":"O"
- },
- {
- "id":93,
- "orth":"it",
- "ner":"O"
- },
- {
- "id":94,
- "orth":"'s",
- "ner":"O"
- },
- {
- "id":95,
- "orth":"usually",
- "ner":"O"
- },
- {
- "id":96,
- "orth":"a",
- "ner":"O"
- },
- {
- "id":97,
- "orth":"little",
- "ner":"O"
- },
- {
- "id":98,
- "orth":"thinner",
- "ner":"O"
- },
- {
- "id":99,
- "orth":")",
- "ner":"O"
- },
- {
- "id":100,
- "orth":".",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- },
- {
- "tokens":[
- {
- "id":101,
- "orth":"\n",
- "ner":"O"
- },
- {
- "id":102,
- "orth":"Recipe",
- "ner":"O"
- },
- {
- "id":103,
- "orth":":",
- "ner":"O"
- },
- {
- "id":104,
- "orth":"http://www.foodnetwork.com/recipes/alton-brown/serious-vanilla-ice-cream-recipe/index.html",
- "ner":"O"
- },
- {
- "id":105,
- "orth":"\n",
- "ner":"O"
- },
- {
- "id":106,
- "orth":"Thanks",
- "ner":"O"
- },
- {
- "id":107,
- "orth":"!",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- },
- {
- "tokens":[
- {
- "id":108,
- "orth":"\n",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- }
- ],
- "cats":[
- {
- "label":"baking",
- "value":0.0
- },
- {
- "label":"not_baking",
- "value":1.0
- }
- ]
- },
- {
- "raw":"How long and at what temperature do the various parts of a chicken need to be cooked?\nI'm interested in baking thighs, legs, breasts and wings. How long do each of these items need to bake and at what temperature?\n",
- "sentences":[
- {
- "tokens":[
- {
- "id":0,
- "orth":"How",
- "ner":"O"
- },
- {
- "id":1,
- "orth":"long",
- "ner":"O"
- },
- {
- "id":2,
- "orth":"and",
- "ner":"O"
- },
- {
- "id":3,
- "orth":"at",
- "ner":"O"
- },
- {
- "id":4,
- "orth":"what",
- "ner":"O"
- },
- {
- "id":5,
- "orth":"temperature",
- "ner":"O"
- },
- {
- "id":6,
- "orth":"do",
- "ner":"O"
- },
- {
- "id":7,
- "orth":"the",
- "ner":"O"
- },
- {
- "id":8,
- "orth":"various",
- "ner":"O"
- },
- {
- "id":9,
- "orth":"parts",
- "ner":"O"
- },
- {
- "id":10,
- "orth":"of",
- "ner":"O"
- },
- {
- "id":11,
- "orth":"a",
- "ner":"O"
- },
- {
- "id":12,
- "orth":"chicken",
- "ner":"O"
- },
- {
- "id":13,
- "orth":"need",
- "ner":"O"
- },
- {
- "id":14,
- "orth":"to",
- "ner":"O"
- },
- {
- "id":15,
- "orth":"be",
- "ner":"O"
- },
- {
- "id":16,
- "orth":"cooked",
- "ner":"O"
- },
- {
- "id":17,
- "orth":"?",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- },
- {
- "tokens":[
- {
- "id":18,
- "orth":"\n",
- "ner":"O"
- },
- {
- "id":19,
- "orth":"I",
- "ner":"O"
- },
- {
- "id":20,
- "orth":"'m",
- "ner":"O"
- },
- {
- "id":21,
- "orth":"interested",
- "ner":"O"
- },
- {
- "id":22,
- "orth":"in",
- "ner":"O"
- },
- {
- "id":23,
- "orth":"baking",
- "ner":"O"
- },
- {
- "id":24,
- "orth":"thighs",
- "ner":"O"
- },
- {
- "id":25,
- "orth":",",
- "ner":"O"
- },
- {
- "id":26,
- "orth":"legs",
- "ner":"O"
- },
- {
- "id":27,
- "orth":",",
- "ner":"O"
- },
- {
- "id":28,
- "orth":"breasts",
- "ner":"O"
- },
- {
- "id":29,
- "orth":"and",
- "ner":"O"
- },
- {
- "id":30,
- "orth":"wings",
- "ner":"O"
- },
- {
- "id":31,
- "orth":".",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- },
- {
- "tokens":[
- {
- "id":32,
- "orth":" ",
- "ner":"O"
- },
- {
- "id":33,
- "orth":"How",
- "ner":"O"
- },
- {
- "id":34,
- "orth":"long",
- "ner":"O"
- },
- {
- "id":35,
- "orth":"do",
- "ner":"O"
- },
- {
- "id":36,
- "orth":"each",
- "ner":"O"
- },
- {
- "id":37,
- "orth":"of",
- "ner":"O"
- },
- {
- "id":38,
- "orth":"these",
- "ner":"O"
- },
- {
- "id":39,
- "orth":"items",
- "ner":"O"
- },
- {
- "id":40,
- "orth":"need",
- "ner":"O"
- },
- {
- "id":41,
- "orth":"to",
- "ner":"O"
- },
- {
- "id":42,
- "orth":"bake",
- "ner":"O"
- },
- {
- "id":43,
- "orth":"and",
- "ner":"O"
- },
- {
- "id":44,
- "orth":"at",
- "ner":"O"
- },
- {
- "id":45,
- "orth":"what",
- "ner":"O"
- },
- {
- "id":46,
- "orth":"temperature",
- "ner":"O"
- },
- {
- "id":47,
- "orth":"?",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- },
- {
- "tokens":[
- {
- "id":48,
- "orth":"\n",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- }
- ],
- "cats":[
- {
- "label":"baking",
- "value":1.0
- },
- {
- "label":"not_baking",
- "value":0.0
- }
- ]
- },
- {
- "raw":"Do I need to sift flour that is labeled sifted?\nIs there really an advantage to sifting flour that I bought that was labeled 'sifted'?\n",
- "sentences":[
- {
- "tokens":[
- {
- "id":0,
- "orth":"Do",
- "ner":"O"
- },
- {
- "id":1,
- "orth":"I",
- "ner":"O"
- },
- {
- "id":2,
- "orth":"need",
- "ner":"O"
- },
- {
- "id":3,
- "orth":"to",
- "ner":"O"
- },
- {
- "id":4,
- "orth":"sift",
- "ner":"O"
- },
- {
- "id":5,
- "orth":"flour",
- "ner":"O"
- },
- {
- "id":6,
- "orth":"that",
- "ner":"O"
- },
- {
- "id":7,
- "orth":"is",
- "ner":"O"
- },
- {
- "id":8,
- "orth":"labeled",
- "ner":"O"
- },
- {
- "id":9,
- "orth":"sifted",
- "ner":"O"
- },
- {
- "id":10,
- "orth":"?",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- },
- {
- "tokens":[
- {
- "id":11,
- "orth":"\n",
- "ner":"O"
- },
- {
- "id":12,
- "orth":"Is",
- "ner":"O"
- },
- {
- "id":13,
- "orth":"there",
- "ner":"O"
- },
- {
- "id":14,
- "orth":"really",
- "ner":"O"
- },
- {
- "id":15,
- "orth":"an",
- "ner":"O"
- },
- {
- "id":16,
- "orth":"advantage",
- "ner":"O"
- },
- {
- "id":17,
- "orth":"to",
- "ner":"O"
- },
- {
- "id":18,
- "orth":"sifting",
- "ner":"O"
- },
- {
- "id":19,
- "orth":"flour",
- "ner":"O"
- },
- {
- "id":20,
- "orth":"that",
- "ner":"O"
- },
- {
- "id":21,
- "orth":"I",
- "ner":"O"
- },
- {
- "id":22,
- "orth":"bought",
- "ner":"O"
- },
- {
- "id":23,
- "orth":"that",
- "ner":"O"
- },
- {
- "id":24,
- "orth":"was",
- "ner":"O"
- },
- {
- "id":25,
- "orth":"labeled",
- "ner":"O"
- },
- {
- "id":26,
- "orth":"'",
- "ner":"O"
- },
- {
- "id":27,
- "orth":"sifted",
- "ner":"O"
- },
- {
- "id":28,
- "orth":"'",
- "ner":"O"
- },
- {
- "id":29,
- "orth":"?",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- },
- {
- "tokens":[
- {
- "id":30,
- "orth":"\n",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- }
- ],
- "cats":[
- {
- "label":"baking",
- "value":1.0
- },
- {
- "label":"not_baking",
- "value":0.0
- }
- ]
- }
- ]
- }
-]
\ No newline at end of file
diff --git a/examples/training/textcat_example_data/cooking.jsonl b/examples/training/textcat_example_data/cooking.jsonl
deleted file mode 100644
index cfdc9be87..000000000
--- a/examples/training/textcat_example_data/cooking.jsonl
+++ /dev/null
@@ -1,10 +0,0 @@
-{"cats": {"baking": 0.0, "not_baking": 1.0}, "meta": {"id": "2"}, "text": "How should I cook bacon in an oven?\nI've heard of people cooking bacon in an oven by laying the strips out on a cookie sheet. When using this method, how long should I cook the bacon for, and at what temperature?\n"}
-{"cats": {"baking": 0.0, "not_baking": 1.0}, "meta": {"id": "3"}, "text": "What is the difference between white and brown eggs?\nI always use brown extra large eggs, but I can't honestly say why I do this other than habit at this point. Are there any distinct advantages or disadvantages like flavor, shelf life, etc?\n"}
-{"cats": {"baking": 0.0, "not_baking": 1.0}, "meta": {"id": "4"}, "text": "What is the difference between baking soda and baking powder?\nAnd can I use one in place of the other in certain recipes?\n"}
-{"cats": {"baking": 0.0, "not_baking": 1.0}, "meta": {"id": "5"}, "text": "In a tomato sauce recipe, how can I cut the acidity?\nIt seems that every time I make a tomato sauce for pasta, the sauce is a little bit too acid for my taste. I've tried using sugar or sodium bicarbonate, but I'm not satisfied with the results.\n"}
-{"cats": {"baking": 0.0, "not_baking": 1.0}, "meta": {"id": "6"}, "text": "What ingredients (available in specific regions) can I substitute for parsley?\nI have a recipe that calls for fresh parsley. I have substituted other fresh herbs for their dried equivalents but I don't have fresh or dried parsley. Is there something else (ex another dried herb) that I can use instead of parsley?\nI know it is used mainly for looks rather than taste but I have a pasta recipe that calls for 2 tablespoons of parsley in the sauce and then another 2 tablespoons on top when it is done. I know the parsley on top is more for looks but there must be something about the taste otherwise it would call for parsley within the sauce as well.\nI would especially like to hear about substitutes available in Southeast Asia and other parts of the world where the obvious answers (such as cilantro) are not widely available.\n"}
-{"cats": {"baking": 0.0, "not_baking": 1.0}, "meta": {"id": "9"}, "text": "What is the internal temperature a steak should be cooked to for Rare/Medium Rare/Medium/Well?\nI'd like to know when to take my steaks off the grill and please everybody.\n"}
-{"cats": {"baking": 0.0, "not_baking": 1.0}, "meta": {"id": "11"}, "text": "How should I poach an egg?\nWhat's the best method to poach an egg without it turning into an eggy soupy mess?\n"}
-{"cats": {"baking": 0.0, "not_baking": 1.0}, "meta": {"id": "12"}, "text": "How can I make my Ice Cream \"creamier\"\nMy ice cream doesn't feel creamy enough. I got the recipe from Good Eats, and I can't tell if it's just the recipe or maybe that I'm just not getting my \"batter\" cold enough before I try to make it (I let it chill overnight in the refrigerator, but it doesn't always come out of the machine looking like \"soft serve\" as he said on the show - it's usually a little thinner).\nRecipe: http://www.foodnetwork.com/recipes/alton-brown/serious-vanilla-ice-cream-recipe/index.html\nThanks!\n"}
-{"cats": {"baking": 1.0, "not_baking": 0.0}, "meta": {"id": "17"}, "text": "How long and at what temperature do the various parts of a chicken need to be cooked?\nI'm interested in baking thighs, legs, breasts and wings. How long do each of these items need to bake and at what temperature?\n"}
-{"cats": {"baking": 1.0, "not_baking": 0.0}, "meta": {"id": "27"}, "text": "Do I need to sift flour that is labeled sifted?\nIs there really an advantage to sifting flour that I bought that was labeled 'sifted'?\n"}
diff --git a/examples/training/textcat_example_data/jigsaw-toxic-comment.json b/examples/training/textcat_example_data/jigsaw-toxic-comment.json
deleted file mode 100644
index 0c8d8f8e0..000000000
--- a/examples/training/textcat_example_data/jigsaw-toxic-comment.json
+++ /dev/null
@@ -1,2987 +0,0 @@
-[
- {
- "id":0,
- "paragraphs":[
- {
- "raw":"Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27",
- "sentences":[
- {
- "tokens":[
- {
- "id":0,
- "orth":"Explanation",
- "ner":"O"
- },
- {
- "id":1,
- "orth":"\n",
- "ner":"O"
- },
- {
- "id":2,
- "orth":"Why",
- "ner":"O"
- },
- {
- "id":3,
- "orth":"the",
- "ner":"O"
- },
- {
- "id":4,
- "orth":"edits",
- "ner":"O"
- },
- {
- "id":5,
- "orth":"made",
- "ner":"O"
- },
- {
- "id":6,
- "orth":"under",
- "ner":"O"
- },
- {
- "id":7,
- "orth":"my",
- "ner":"O"
- },
- {
- "id":8,
- "orth":"username",
- "ner":"O"
- },
- {
- "id":9,
- "orth":"Hardcore",
- "ner":"O"
- },
- {
- "id":10,
- "orth":"Metallica",
- "ner":"O"
- },
- {
- "id":11,
- "orth":"Fan",
- "ner":"O"
- },
- {
- "id":12,
- "orth":"were",
- "ner":"O"
- },
- {
- "id":13,
- "orth":"reverted",
- "ner":"O"
- },
- {
- "id":14,
- "orth":"?",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- },
- {
- "tokens":[
- {
- "id":15,
- "orth":"They",
- "ner":"O"
- },
- {
- "id":16,
- "orth":"were",
- "ner":"O"
- },
- {
- "id":17,
- "orth":"n't",
- "ner":"O"
- },
- {
- "id":18,
- "orth":"vandalisms",
- "ner":"O"
- },
- {
- "id":19,
- "orth":",",
- "ner":"O"
- },
- {
- "id":20,
- "orth":"just",
- "ner":"O"
- },
- {
- "id":21,
- "orth":"closure",
- "ner":"O"
- },
- {
- "id":22,
- "orth":"on",
- "ner":"O"
- },
- {
- "id":23,
- "orth":"some",
- "ner":"O"
- },
- {
- "id":24,
- "orth":"GAs",
- "ner":"O"
- },
- {
- "id":25,
- "orth":"after",
- "ner":"O"
- },
- {
- "id":26,
- "orth":"I",
- "ner":"O"
- },
- {
- "id":27,
- "orth":"voted",
- "ner":"O"
- },
- {
- "id":28,
- "orth":"at",
- "ner":"O"
- },
- {
- "id":29,
- "orth":"New",
- "ner":"O"
- },
- {
- "id":30,
- "orth":"York",
- "ner":"O"
- },
- {
- "id":31,
- "orth":"Dolls",
- "ner":"O"
- },
- {
- "id":32,
- "orth":"FAC",
- "ner":"O"
- },
- {
- "id":33,
- "orth":".",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- },
- {
- "tokens":[
- {
- "id":34,
- "orth":"And",
- "ner":"O"
- },
- {
- "id":35,
- "orth":"please",
- "ner":"O"
- },
- {
- "id":36,
- "orth":"do",
- "ner":"O"
- },
- {
- "id":37,
- "orth":"n't",
- "ner":"O"
- },
- {
- "id":38,
- "orth":"remove",
- "ner":"O"
- },
- {
- "id":39,
- "orth":"the",
- "ner":"O"
- },
- {
- "id":40,
- "orth":"template",
- "ner":"O"
- },
- {
- "id":41,
- "orth":"from",
- "ner":"O"
- },
- {
- "id":42,
- "orth":"the",
- "ner":"O"
- },
- {
- "id":43,
- "orth":"talk",
- "ner":"O"
- },
- {
- "id":44,
- "orth":"page",
- "ner":"O"
- },
- {
- "id":45,
- "orth":"since",
- "ner":"O"
- },
- {
- "id":46,
- "orth":"I",
- "ner":"O"
- },
- {
- "id":47,
- "orth":"'m",
- "ner":"O"
- },
- {
- "id":48,
- "orth":"retired",
- "ner":"O"
- },
- {
- "id":49,
- "orth":"now.89.205.38.27",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- }
- ],
- "cats":[
- {
- "label":"insult",
- "value":0
- },
- {
- "label":"obscene",
- "value":0
- },
- {
- "label":"severe_toxic",
- "value":0
- },
- {
- "label":"toxic",
- "value":0
- }
- ]
- },
- {
- "raw":"I'm Sorry \n\nI'm sorry I screwed around with someones talk page. It was very bad to do. I know how having the templates on their talk page helps you assert your dominance over them. I know I should bow down to the almighty administrators. But then again, I'm going to go play outside....with your mom. 76.122.79.82",
- "sentences":[
- {
- "tokens":[
- {
- "id":0,
- "orth":"I",
- "ner":"O"
- },
- {
- "id":1,
- "orth":"'m",
- "ner":"O"
- },
- {
- "id":2,
- "orth":"Sorry",
- "ner":"O"
- },
- {
- "id":3,
- "orth":"\n\n",
- "ner":"O"
- },
- {
- "id":4,
- "orth":"I",
- "ner":"O"
- },
- {
- "id":5,
- "orth":"'m",
- "ner":"O"
- },
- {
- "id":6,
- "orth":"sorry",
- "ner":"O"
- },
- {
- "id":7,
- "orth":"I",
- "ner":"O"
- },
- {
- "id":8,
- "orth":"screwed",
- "ner":"O"
- },
- {
- "id":9,
- "orth":"around",
- "ner":"O"
- },
- {
- "id":10,
- "orth":"with",
- "ner":"O"
- },
- {
- "id":11,
- "orth":"someones",
- "ner":"O"
- },
- {
- "id":12,
- "orth":"talk",
- "ner":"O"
- },
- {
- "id":13,
- "orth":"page",
- "ner":"O"
- },
- {
- "id":14,
- "orth":".",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- },
- {
- "tokens":[
- {
- "id":15,
- "orth":" ",
- "ner":"O"
- },
- {
- "id":16,
- "orth":"It",
- "ner":"O"
- },
- {
- "id":17,
- "orth":"was",
- "ner":"O"
- },
- {
- "id":18,
- "orth":"very",
- "ner":"O"
- },
- {
- "id":19,
- "orth":"bad",
- "ner":"O"
- },
- {
- "id":20,
- "orth":"to",
- "ner":"O"
- },
- {
- "id":21,
- "orth":"do",
- "ner":"O"
- },
- {
- "id":22,
- "orth":".",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- },
- {
- "tokens":[
- {
- "id":23,
- "orth":" ",
- "ner":"O"
- },
- {
- "id":24,
- "orth":"I",
- "ner":"O"
- },
- {
- "id":25,
- "orth":"know",
- "ner":"O"
- },
- {
- "id":26,
- "orth":"how",
- "ner":"O"
- },
- {
- "id":27,
- "orth":"having",
- "ner":"O"
- },
- {
- "id":28,
- "orth":"the",
- "ner":"O"
- },
- {
- "id":29,
- "orth":"templates",
- "ner":"O"
- },
- {
- "id":30,
- "orth":"on",
- "ner":"O"
- },
- {
- "id":31,
- "orth":"their",
- "ner":"O"
- },
- {
- "id":32,
- "orth":"talk",
- "ner":"O"
- },
- {
- "id":33,
- "orth":"page",
- "ner":"O"
- },
- {
- "id":34,
- "orth":"helps",
- "ner":"O"
- },
- {
- "id":35,
- "orth":"you",
- "ner":"O"
- },
- {
- "id":36,
- "orth":"assert",
- "ner":"O"
- },
- {
- "id":37,
- "orth":"your",
- "ner":"O"
- },
- {
- "id":38,
- "orth":"dominance",
- "ner":"O"
- },
- {
- "id":39,
- "orth":"over",
- "ner":"O"
- },
- {
- "id":40,
- "orth":"them",
- "ner":"O"
- },
- {
- "id":41,
- "orth":".",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- },
- {
- "tokens":[
- {
- "id":42,
- "orth":" ",
- "ner":"O"
- },
- {
- "id":43,
- "orth":"I",
- "ner":"O"
- },
- {
- "id":44,
- "orth":"know",
- "ner":"O"
- },
- {
- "id":45,
- "orth":"I",
- "ner":"O"
- },
- {
- "id":46,
- "orth":"should",
- "ner":"O"
- },
- {
- "id":47,
- "orth":"bow",
- "ner":"O"
- },
- {
- "id":48,
- "orth":"down",
- "ner":"O"
- },
- {
- "id":49,
- "orth":"to",
- "ner":"O"
- },
- {
- "id":50,
- "orth":"the",
- "ner":"O"
- },
- {
- "id":51,
- "orth":"almighty",
- "ner":"O"
- },
- {
- "id":52,
- "orth":"administrators",
- "ner":"O"
- },
- {
- "id":53,
- "orth":".",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- },
- {
- "tokens":[
- {
- "id":54,
- "orth":" ",
- "ner":"O"
- },
- {
- "id":55,
- "orth":"But",
- "ner":"O"
- },
- {
- "id":56,
- "orth":"then",
- "ner":"O"
- },
- {
- "id":57,
- "orth":"again",
- "ner":"O"
- },
- {
- "id":58,
- "orth":",",
- "ner":"O"
- },
- {
- "id":59,
- "orth":"I",
- "ner":"O"
- },
- {
- "id":60,
- "orth":"'m",
- "ner":"O"
- },
- {
- "id":61,
- "orth":"going",
- "ner":"O"
- },
- {
- "id":62,
- "orth":"to",
- "ner":"O"
- },
- {
- "id":63,
- "orth":"go",
- "ner":"O"
- },
- {
- "id":64,
- "orth":"play",
- "ner":"O"
- },
- {
- "id":65,
- "orth":"outside",
- "ner":"O"
- },
- {
- "id":66,
- "orth":"....",
- "ner":"O"
- },
- {
- "id":67,
- "orth":"with",
- "ner":"O"
- },
- {
- "id":68,
- "orth":"your",
- "ner":"O"
- },
- {
- "id":69,
- "orth":"mom",
- "ner":"O"
- },
- {
- "id":70,
- "orth":".",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- },
- {
- "tokens":[
- {
- "id":71,
- "orth":" ",
- "ner":"O"
- },
- {
- "id":72,
- "orth":"76.122.79.82",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- }
- ],
- "cats":[
- {
- "label":"insult",
- "value":0
- },
- {
- "label":"obscene",
- "value":0
- },
- {
- "label":"severe_toxic",
- "value":0
- },
- {
- "label":"toxic",
- "value":1
- }
- ]
- },
- {
- "raw":"Stupid peace of shit stop deleting my stuff asshole go die and fall in a hole go to hell!",
- "sentences":[
- {
- "tokens":[
- {
- "id":0,
- "orth":"Stupid",
- "ner":"O"
- },
- {
- "id":1,
- "orth":"peace",
- "ner":"O"
- },
- {
- "id":2,
- "orth":"of",
- "ner":"O"
- },
- {
- "id":3,
- "orth":"shit",
- "ner":"O"
- },
- {
- "id":4,
- "orth":"stop",
- "ner":"O"
- },
- {
- "id":5,
- "orth":"deleting",
- "ner":"O"
- },
- {
- "id":6,
- "orth":"my",
- "ner":"O"
- },
- {
- "id":7,
- "orth":"stuff",
- "ner":"O"
- },
- {
- "id":8,
- "orth":"asshole",
- "ner":"O"
- },
- {
- "id":9,
- "orth":"go",
- "ner":"O"
- },
- {
- "id":10,
- "orth":"die",
- "ner":"O"
- },
- {
- "id":11,
- "orth":"and",
- "ner":"O"
- },
- {
- "id":12,
- "orth":"fall",
- "ner":"O"
- },
- {
- "id":13,
- "orth":"in",
- "ner":"O"
- },
- {
- "id":14,
- "orth":"a",
- "ner":"O"
- },
- {
- "id":15,
- "orth":"hole",
- "ner":"O"
- },
- {
- "id":16,
- "orth":"go",
- "ner":"O"
- },
- {
- "id":17,
- "orth":"to",
- "ner":"O"
- },
- {
- "id":18,
- "orth":"hell",
- "ner":"O"
- },
- {
- "id":19,
- "orth":"!",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- }
- ],
- "cats":[
- {
- "label":"insult",
- "value":1
- },
- {
- "label":"obscene",
- "value":1
- },
- {
- "label":"severe_toxic",
- "value":1
- },
- {
- "label":"toxic",
- "value":1
- }
- ]
- },
- {
- "raw":"D'aww! He matches this background colour I'm seemingly stuck with. Thanks. (talk) 21:51, January 11, 2016 (UTC)",
- "sentences":[
- {
- "tokens":[
- {
- "id":0,
- "orth":"D'aww",
- "ner":"O"
- },
- {
- "id":1,
- "orth":"!",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- },
- {
- "tokens":[
- {
- "id":2,
- "orth":"He",
- "ner":"O"
- },
- {
- "id":3,
- "orth":"matches",
- "ner":"O"
- },
- {
- "id":4,
- "orth":"this",
- "ner":"O"
- },
- {
- "id":5,
- "orth":"background",
- "ner":"O"
- },
- {
- "id":6,
- "orth":"colour",
- "ner":"O"
- },
- {
- "id":7,
- "orth":"I",
- "ner":"O"
- },
- {
- "id":8,
- "orth":"'m",
- "ner":"O"
- },
- {
- "id":9,
- "orth":"seemingly",
- "ner":"O"
- },
- {
- "id":10,
- "orth":"stuck",
- "ner":"O"
- },
- {
- "id":11,
- "orth":"with",
- "ner":"O"
- },
- {
- "id":12,
- "orth":".",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- },
- {
- "tokens":[
- {
- "id":13,
- "orth":"Thanks",
- "ner":"O"
- },
- {
- "id":14,
- "orth":".",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- },
- {
- "tokens":[
- {
- "id":15,
- "orth":" ",
- "ner":"O"
- },
- {
- "id":16,
- "orth":"(",
- "ner":"O"
- },
- {
- "id":17,
- "orth":"talk",
- "ner":"O"
- },
- {
- "id":18,
- "orth":")",
- "ner":"O"
- },
- {
- "id":19,
- "orth":"21:51",
- "ner":"O"
- },
- {
- "id":20,
- "orth":",",
- "ner":"O"
- },
- {
- "id":21,
- "orth":"January",
- "ner":"O"
- },
- {
- "id":22,
- "orth":"11",
- "ner":"O"
- },
- {
- "id":23,
- "orth":",",
- "ner":"O"
- },
- {
- "id":24,
- "orth":"2016",
- "ner":"O"
- },
- {
- "id":25,
- "orth":"(",
- "ner":"O"
- },
- {
- "id":26,
- "orth":"UTC",
- "ner":"O"
- },
- {
- "id":27,
- "orth":")",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- }
- ],
- "cats":[
- {
- "label":"insult",
- "value":0
- },
- {
- "label":"obscene",
- "value":0
- },
- {
- "label":"severe_toxic",
- "value":0
- },
- {
- "label":"toxic",
- "value":0
- }
- ]
- },
- {
- "raw":"GET FUCKED UP. GET FUCKEEED UP. GOT A DRINK THAT YOU CANT PUT DOWN???/ GET FUCK UP GET FUCKED UP. I'M FUCKED UP RIGHT NOW!",
- "sentences":[
- {
- "tokens":[
- {
- "id":0,
- "orth":"GET",
- "ner":"O"
- },
- {
- "id":1,
- "orth":"FUCKED",
- "ner":"O"
- },
- {
- "id":2,
- "orth":"UP",
- "ner":"O"
- },
- {
- "id":3,
- "orth":".",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- },
- {
- "tokens":[
- {
- "id":4,
- "orth":"GET",
- "ner":"O"
- },
- {
- "id":5,
- "orth":"FUCKEEED",
- "ner":"O"
- },
- {
- "id":6,
- "orth":"UP",
- "ner":"O"
- },
- {
- "id":7,
- "orth":".",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- },
- {
- "tokens":[
- {
- "id":8,
- "orth":" ",
- "ner":"O"
- },
- {
- "id":9,
- "orth":"GOT",
- "ner":"O"
- },
- {
- "id":10,
- "orth":"A",
- "ner":"O"
- },
- {
- "id":11,
- "orth":"DRINK",
- "ner":"O"
- },
- {
- "id":12,
- "orth":"THAT",
- "ner":"O"
- },
- {
- "id":13,
- "orth":"YOU",
- "ner":"O"
- },
- {
- "id":14,
- "orth":"CANT",
- "ner":"O"
- },
- {
- "id":15,
- "orth":"PUT",
- "ner":"O"
- },
- {
- "id":16,
- "orth":"DOWN???/",
- "ner":"O"
- },
- {
- "id":17,
- "orth":"GET",
- "ner":"O"
- },
- {
- "id":18,
- "orth":"FUCK",
- "ner":"O"
- },
- {
- "id":19,
- "orth":"UP",
- "ner":"O"
- },
- {
- "id":20,
- "orth":"GET",
- "ner":"O"
- },
- {
- "id":21,
- "orth":"FUCKED",
- "ner":"O"
- },
- {
- "id":22,
- "orth":"UP",
- "ner":"O"
- },
- {
- "id":23,
- "orth":".",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- },
- {
- "tokens":[
- {
- "id":24,
- "orth":" ",
- "ner":"O"
- },
- {
- "id":25,
- "orth":"I'M",
- "ner":"O"
- },
- {
- "id":26,
- "orth":"FUCKED",
- "ner":"O"
- },
- {
- "id":27,
- "orth":"UP",
- "ner":"O"
- },
- {
- "id":28,
- "orth":"RIGHT",
- "ner":"O"
- },
- {
- "id":29,
- "orth":"NOW",
- "ner":"O"
- },
- {
- "id":30,
- "orth":"!",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- }
- ],
- "cats":[
- {
- "label":"insult",
- "value":0
- },
- {
- "label":"obscene",
- "value":1
- },
- {
- "label":"severe_toxic",
- "value":0
- },
- {
- "label":"toxic",
- "value":1
- }
- ]
- },
- {
- "raw":"Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.",
- "sentences":[
- {
- "tokens":[
- {
- "id":0,
- "orth":"Hey",
- "ner":"O"
- },
- {
- "id":1,
- "orth":"man",
- "ner":"O"
- },
- {
- "id":2,
- "orth":",",
- "ner":"O"
- },
- {
- "id":3,
- "orth":"I",
- "ner":"O"
- },
- {
- "id":4,
- "orth":"'m",
- "ner":"O"
- },
- {
- "id":5,
- "orth":"really",
- "ner":"O"
- },
- {
- "id":6,
- "orth":"not",
- "ner":"O"
- },
- {
- "id":7,
- "orth":"trying",
- "ner":"O"
- },
- {
- "id":8,
- "orth":"to",
- "ner":"O"
- },
- {
- "id":9,
- "orth":"edit",
- "ner":"O"
- },
- {
- "id":10,
- "orth":"war",
- "ner":"O"
- },
- {
- "id":11,
- "orth":".",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- },
- {
- "tokens":[
- {
- "id":12,
- "orth":"It",
- "ner":"O"
- },
- {
- "id":13,
- "orth":"'s",
- "ner":"O"
- },
- {
- "id":14,
- "orth":"just",
- "ner":"O"
- },
- {
- "id":15,
- "orth":"that",
- "ner":"O"
- },
- {
- "id":16,
- "orth":"this",
- "ner":"O"
- },
- {
- "id":17,
- "orth":"guy",
- "ner":"O"
- },
- {
- "id":18,
- "orth":"is",
- "ner":"O"
- },
- {
- "id":19,
- "orth":"constantly",
- "ner":"O"
- },
- {
- "id":20,
- "orth":"removing",
- "ner":"O"
- },
- {
- "id":21,
- "orth":"relevant",
- "ner":"O"
- },
- {
- "id":22,
- "orth":"information",
- "ner":"O"
- },
- {
- "id":23,
- "orth":"and",
- "ner":"O"
- },
- {
- "id":24,
- "orth":"talking",
- "ner":"O"
- },
- {
- "id":25,
- "orth":"to",
- "ner":"O"
- },
- {
- "id":26,
- "orth":"me",
- "ner":"O"
- },
- {
- "id":27,
- "orth":"through",
- "ner":"O"
- },
- {
- "id":28,
- "orth":"edits",
- "ner":"O"
- },
- {
- "id":29,
- "orth":"instead",
- "ner":"O"
- },
- {
- "id":30,
- "orth":"of",
- "ner":"O"
- },
- {
- "id":31,
- "orth":"my",
- "ner":"O"
- },
- {
- "id":32,
- "orth":"talk",
- "ner":"O"
- },
- {
- "id":33,
- "orth":"page",
- "ner":"O"
- },
- {
- "id":34,
- "orth":".",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- },
- {
- "tokens":[
- {
- "id":35,
- "orth":"He",
- "ner":"O"
- },
- {
- "id":36,
- "orth":"seems",
- "ner":"O"
- },
- {
- "id":37,
- "orth":"to",
- "ner":"O"
- },
- {
- "id":38,
- "orth":"care",
- "ner":"O"
- },
- {
- "id":39,
- "orth":"more",
- "ner":"O"
- },
- {
- "id":40,
- "orth":"about",
- "ner":"O"
- },
- {
- "id":41,
- "orth":"the",
- "ner":"O"
- },
- {
- "id":42,
- "orth":"formatting",
- "ner":"O"
- },
- {
- "id":43,
- "orth":"than",
- "ner":"O"
- },
- {
- "id":44,
- "orth":"the",
- "ner":"O"
- },
- {
- "id":45,
- "orth":"actual",
- "ner":"O"
- },
- {
- "id":46,
- "orth":"info",
- "ner":"O"
- },
- {
- "id":47,
- "orth":".",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- }
- ],
- "cats":[
- {
- "label":"insult",
- "value":0
- },
- {
- "label":"obscene",
- "value":0
- },
- {
- "label":"severe_toxic",
- "value":0
- },
- {
- "label":"toxic",
- "value":0
- }
- ]
- },
- {
- "raw":"\"\nMore\nI can't make any real suggestions on improvement - I wondered if the section statistics should be later on, or a subsection of \"\"types of accidents\"\" -I think the references may need tidying so that they are all in the exact same format ie date format etc. I can do that later on, if no-one else does first - if you have any preferences for formatting style on references or want to do it yourself please let me know.\n\nThere appears to be a backlog on articles for review so I guess there may be a delay until a reviewer turns up. It's listed in the relevant form eg Wikipedia:Good_article_nominations#Transport \"",
- "sentences":[
- {
- "tokens":[
- {
- "id":0,
- "orth":"\"",
- "ner":"O"
- },
- {
- "id":1,
- "orth":"\n",
- "ner":"O"
- },
- {
- "id":2,
- "orth":"More",
- "ner":"O"
- },
- {
- "id":3,
- "orth":"\n",
- "ner":"O"
- },
- {
- "id":4,
- "orth":"I",
- "ner":"O"
- },
- {
- "id":5,
- "orth":"ca",
- "ner":"O"
- },
- {
- "id":6,
- "orth":"n't",
- "ner":"O"
- },
- {
- "id":7,
- "orth":"make",
- "ner":"O"
- },
- {
- "id":8,
- "orth":"any",
- "ner":"O"
- },
- {
- "id":9,
- "orth":"real",
- "ner":"O"
- },
- {
- "id":10,
- "orth":"suggestions",
- "ner":"O"
- },
- {
- "id":11,
- "orth":"on",
- "ner":"O"
- },
- {
- "id":12,
- "orth":"improvement",
- "ner":"O"
- },
- {
- "id":13,
- "orth":"-",
- "ner":"O"
- },
- {
- "id":14,
- "orth":"I",
- "ner":"O"
- },
- {
- "id":15,
- "orth":"wondered",
- "ner":"O"
- },
- {
- "id":16,
- "orth":"if",
- "ner":"O"
- },
- {
- "id":17,
- "orth":"the",
- "ner":"O"
- },
- {
- "id":18,
- "orth":"section",
- "ner":"O"
- },
- {
- "id":19,
- "orth":"statistics",
- "ner":"O"
- },
- {
- "id":20,
- "orth":"should",
- "ner":"O"
- },
- {
- "id":21,
- "orth":"be",
- "ner":"O"
- },
- {
- "id":22,
- "orth":"later",
- "ner":"O"
- },
- {
- "id":23,
- "orth":"on",
- "ner":"O"
- },
- {
- "id":24,
- "orth":",",
- "ner":"O"
- },
- {
- "id":25,
- "orth":"or",
- "ner":"O"
- },
- {
- "id":26,
- "orth":"a",
- "ner":"O"
- },
- {
- "id":27,
- "orth":"subsection",
- "ner":"O"
- },
- {
- "id":28,
- "orth":"of",
- "ner":"O"
- },
- {
- "id":29,
- "orth":"\"",
- "ner":"O"
- },
- {
- "id":30,
- "orth":"\"",
- "ner":"O"
- },
- {
- "id":31,
- "orth":"types",
- "ner":"O"
- },
- {
- "id":32,
- "orth":"of",
- "ner":"O"
- },
- {
- "id":33,
- "orth":"accidents",
- "ner":"O"
- },
- {
- "id":34,
- "orth":"\"",
- "ner":"O"
- },
- {
- "id":35,
- "orth":"\"",
- "ner":"O"
- },
- {
- "id":36,
- "orth":" ",
- "ner":"O"
- },
- {
- "id":37,
- "orth":"-I",
- "ner":"O"
- },
- {
- "id":38,
- "orth":"think",
- "ner":"O"
- },
- {
- "id":39,
- "orth":"the",
- "ner":"O"
- },
- {
- "id":40,
- "orth":"references",
- "ner":"O"
- },
- {
- "id":41,
- "orth":"may",
- "ner":"O"
- },
- {
- "id":42,
- "orth":"need",
- "ner":"O"
- },
- {
- "id":43,
- "orth":"tidying",
- "ner":"O"
- },
- {
- "id":44,
- "orth":"so",
- "ner":"O"
- },
- {
- "id":45,
- "orth":"that",
- "ner":"O"
- },
- {
- "id":46,
- "orth":"they",
- "ner":"O"
- },
- {
- "id":47,
- "orth":"are",
- "ner":"O"
- },
- {
- "id":48,
- "orth":"all",
- "ner":"O"
- },
- {
- "id":49,
- "orth":"in",
- "ner":"O"
- },
- {
- "id":50,
- "orth":"the",
- "ner":"O"
- },
- {
- "id":51,
- "orth":"exact",
- "ner":"O"
- },
- {
- "id":52,
- "orth":"same",
- "ner":"O"
- },
- {
- "id":53,
- "orth":"format",
- "ner":"O"
- },
- {
- "id":54,
- "orth":"ie",
- "ner":"O"
- },
- {
- "id":55,
- "orth":"date",
- "ner":"O"
- },
- {
- "id":56,
- "orth":"format",
- "ner":"O"
- },
- {
- "id":57,
- "orth":"etc",
- "ner":"O"
- },
- {
- "id":58,
- "orth":".",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- },
- {
- "tokens":[
- {
- "id":59,
- "orth":"I",
- "ner":"O"
- },
- {
- "id":60,
- "orth":"can",
- "ner":"O"
- },
- {
- "id":61,
- "orth":"do",
- "ner":"O"
- },
- {
- "id":62,
- "orth":"that",
- "ner":"O"
- },
- {
- "id":63,
- "orth":"later",
- "ner":"O"
- },
- {
- "id":64,
- "orth":"on",
- "ner":"O"
- },
- {
- "id":65,
- "orth":",",
- "ner":"O"
- },
- {
- "id":66,
- "orth":"if",
- "ner":"O"
- },
- {
- "id":67,
- "orth":"no",
- "ner":"O"
- },
- {
- "id":68,
- "orth":"-",
- "ner":"O"
- },
- {
- "id":69,
- "orth":"one",
- "ner":"O"
- },
- {
- "id":70,
- "orth":"else",
- "ner":"O"
- },
- {
- "id":71,
- "orth":"does",
- "ner":"O"
- },
- {
- "id":72,
- "orth":"first",
- "ner":"O"
- },
- {
- "id":73,
- "orth":"-",
- "ner":"O"
- },
- {
- "id":74,
- "orth":"if",
- "ner":"O"
- },
- {
- "id":75,
- "orth":"you",
- "ner":"O"
- },
- {
- "id":76,
- "orth":"have",
- "ner":"O"
- },
- {
- "id":77,
- "orth":"any",
- "ner":"O"
- },
- {
- "id":78,
- "orth":"preferences",
- "ner":"O"
- },
- {
- "id":79,
- "orth":"for",
- "ner":"O"
- },
- {
- "id":80,
- "orth":"formatting",
- "ner":"O"
- },
- {
- "id":81,
- "orth":"style",
- "ner":"O"
- },
- {
- "id":82,
- "orth":"on",
- "ner":"O"
- },
- {
- "id":83,
- "orth":"references",
- "ner":"O"
- },
- {
- "id":84,
- "orth":"or",
- "ner":"O"
- },
- {
- "id":85,
- "orth":"want",
- "ner":"O"
- },
- {
- "id":86,
- "orth":"to",
- "ner":"O"
- },
- {
- "id":87,
- "orth":"do",
- "ner":"O"
- },
- {
- "id":88,
- "orth":"it",
- "ner":"O"
- },
- {
- "id":89,
- "orth":"yourself",
- "ner":"O"
- },
- {
- "id":90,
- "orth":"please",
- "ner":"O"
- },
- {
- "id":91,
- "orth":"let",
- "ner":"O"
- },
- {
- "id":92,
- "orth":"me",
- "ner":"O"
- },
- {
- "id":93,
- "orth":"know",
- "ner":"O"
- },
- {
- "id":94,
- "orth":".",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- },
- {
- "tokens":[
- {
- "id":95,
- "orth":"\n\n",
- "ner":"O"
- },
- {
- "id":96,
- "orth":"There",
- "ner":"O"
- },
- {
- "id":97,
- "orth":"appears",
- "ner":"O"
- },
- {
- "id":98,
- "orth":"to",
- "ner":"O"
- },
- {
- "id":99,
- "orth":"be",
- "ner":"O"
- },
- {
- "id":100,
- "orth":"a",
- "ner":"O"
- },
- {
- "id":101,
- "orth":"backlog",
- "ner":"O"
- },
- {
- "id":102,
- "orth":"on",
- "ner":"O"
- },
- {
- "id":103,
- "orth":"articles",
- "ner":"O"
- },
- {
- "id":104,
- "orth":"for",
- "ner":"O"
- },
- {
- "id":105,
- "orth":"review",
- "ner":"O"
- },
- {
- "id":106,
- "orth":"so",
- "ner":"O"
- },
- {
- "id":107,
- "orth":"I",
- "ner":"O"
- },
- {
- "id":108,
- "orth":"guess",
- "ner":"O"
- },
- {
- "id":109,
- "orth":"there",
- "ner":"O"
- },
- {
- "id":110,
- "orth":"may",
- "ner":"O"
- },
- {
- "id":111,
- "orth":"be",
- "ner":"O"
- },
- {
- "id":112,
- "orth":"a",
- "ner":"O"
- },
- {
- "id":113,
- "orth":"delay",
- "ner":"O"
- },
- {
- "id":114,
- "orth":"until",
- "ner":"O"
- },
- {
- "id":115,
- "orth":"a",
- "ner":"O"
- },
- {
- "id":116,
- "orth":"reviewer",
- "ner":"O"
- },
- {
- "id":117,
- "orth":"turns",
- "ner":"O"
- },
- {
- "id":118,
- "orth":"up",
- "ner":"O"
- },
- {
- "id":119,
- "orth":".",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- },
- {
- "tokens":[
- {
- "id":120,
- "orth":"It",
- "ner":"O"
- },
- {
- "id":121,
- "orth":"'s",
- "ner":"O"
- },
- {
- "id":122,
- "orth":"listed",
- "ner":"O"
- },
- {
- "id":123,
- "orth":"in",
- "ner":"O"
- },
- {
- "id":124,
- "orth":"the",
- "ner":"O"
- },
- {
- "id":125,
- "orth":"relevant",
- "ner":"O"
- },
- {
- "id":126,
- "orth":"form",
- "ner":"O"
- },
- {
- "id":127,
- "orth":"eg",
- "ner":"O"
- },
- {
- "id":128,
- "orth":"Wikipedia",
- "ner":"O"
- },
- {
- "id":129,
- "orth":":",
- "ner":"O"
- },
- {
- "id":130,
- "orth":"Good_article_nominations#Transport",
- "ner":"O"
- },
- {
- "id":131,
- "orth":" ",
- "ner":"O"
- },
- {
- "id":132,
- "orth":"\"",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- }
- ],
- "cats":[
- {
- "label":"insult",
- "value":0
- },
- {
- "label":"obscene",
- "value":0
- },
- {
- "label":"severe_toxic",
- "value":0
- },
- {
- "label":"toxic",
- "value":0
- }
- ]
- },
- {
- "raw":"You, sir, are my hero. Any chance you remember what page that's on?",
- "sentences":[
- {
- "tokens":[
- {
- "id":0,
- "orth":"You",
- "ner":"O"
- },
- {
- "id":1,
- "orth":",",
- "ner":"O"
- },
- {
- "id":2,
- "orth":"sir",
- "ner":"O"
- },
- {
- "id":3,
- "orth":",",
- "ner":"O"
- },
- {
- "id":4,
- "orth":"are",
- "ner":"O"
- },
- {
- "id":5,
- "orth":"my",
- "ner":"O"
- },
- {
- "id":6,
- "orth":"hero",
- "ner":"O"
- },
- {
- "id":7,
- "orth":".",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- },
- {
- "tokens":[
- {
- "id":8,
- "orth":"Any",
- "ner":"O"
- },
- {
- "id":9,
- "orth":"chance",
- "ner":"O"
- },
- {
- "id":10,
- "orth":"you",
- "ner":"O"
- },
- {
- "id":11,
- "orth":"remember",
- "ner":"O"
- },
- {
- "id":12,
- "orth":"what",
- "ner":"O"
- },
- {
- "id":13,
- "orth":"page",
- "ner":"O"
- },
- {
- "id":14,
- "orth":"that",
- "ner":"O"
- },
- {
- "id":15,
- "orth":"'s",
- "ner":"O"
- },
- {
- "id":16,
- "orth":"on",
- "ner":"O"
- },
- {
- "id":17,
- "orth":"?",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- }
- ],
- "cats":[
- {
- "label":"insult",
- "value":0
- },
- {
- "label":"obscene",
- "value":0
- },
- {
- "label":"severe_toxic",
- "value":0
- },
- {
- "label":"toxic",
- "value":0
- }
- ]
- },
- {
- "raw":"\"\n\nCongratulations from me as well, use the tools well. \u00a0\u00b7 talk \"",
- "sentences":[
- {
- "tokens":[
- {
- "id":0,
- "orth":"\"",
- "ner":"O"
- },
- {
- "id":1,
- "orth":"\n\n",
- "ner":"O"
- },
- {
- "id":2,
- "orth":"Congratulations",
- "ner":"O"
- },
- {
- "id":3,
- "orth":"from",
- "ner":"O"
- },
- {
- "id":4,
- "orth":"me",
- "ner":"O"
- },
- {
- "id":5,
- "orth":"as",
- "ner":"O"
- },
- {
- "id":6,
- "orth":"well",
- "ner":"O"
- },
- {
- "id":7,
- "orth":",",
- "ner":"O"
- },
- {
- "id":8,
- "orth":"use",
- "ner":"O"
- },
- {
- "id":9,
- "orth":"the",
- "ner":"O"
- },
- {
- "id":10,
- "orth":"tools",
- "ner":"O"
- },
- {
- "id":11,
- "orth":"well",
- "ner":"O"
- },
- {
- "id":12,
- "orth":".",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- },
- {
- "tokens":[
- {
- "id":13,
- "orth":"\u00a0",
- "ner":"O"
- },
- {
- "id":14,
- "orth":"\u00b7",
- "ner":"O"
- },
- {
- "id":15,
- "orth":"talk",
- "ner":"O"
- },
- {
- "id":16,
- "orth":"\"",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- }
- ],
- "cats":[
- {
- "label":"insult",
- "value":0
- },
- {
- "label":"obscene",
- "value":0
- },
- {
- "label":"severe_toxic",
- "value":0
- },
- {
- "label":"toxic",
- "value":0
- }
- ]
- },
- {
- "raw":"Why can't you believe how fat Artie is? Did you see him on his recent appearence on the Tonight Show with Jay Leno? He looks absolutely AWFUL! If I had to put money on it, I'd say that Artie Lange is a can't miss candidate for the 2007 Dead pool! \n\n \nKindly keep your malicious fingers off of my above comment, . Everytime you remove it, I will repost it!!!",
- "sentences":[
- {
- "tokens":[
- {
- "id":0,
- "orth":"Why",
- "ner":"O"
- },
- {
- "id":1,
- "orth":"ca",
- "ner":"O"
- },
- {
- "id":2,
- "orth":"n't",
- "ner":"O"
- },
- {
- "id":3,
- "orth":"you",
- "ner":"O"
- },
- {
- "id":4,
- "orth":"believe",
- "ner":"O"
- },
- {
- "id":5,
- "orth":"how",
- "ner":"O"
- },
- {
- "id":6,
- "orth":"fat",
- "ner":"O"
- },
- {
- "id":7,
- "orth":"Artie",
- "ner":"O"
- },
- {
- "id":8,
- "orth":"is",
- "ner":"O"
- },
- {
- "id":9,
- "orth":"?",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- },
- {
- "tokens":[
- {
- "id":10,
- "orth":"Did",
- "ner":"O"
- },
- {
- "id":11,
- "orth":"you",
- "ner":"O"
- },
- {
- "id":12,
- "orth":"see",
- "ner":"O"
- },
- {
- "id":13,
- "orth":"him",
- "ner":"O"
- },
- {
- "id":14,
- "orth":"on",
- "ner":"O"
- },
- {
- "id":15,
- "orth":"his",
- "ner":"O"
- },
- {
- "id":16,
- "orth":"recent",
- "ner":"O"
- },
- {
- "id":17,
- "orth":"appearence",
- "ner":"O"
- },
- {
- "id":18,
- "orth":"on",
- "ner":"O"
- },
- {
- "id":19,
- "orth":"the",
- "ner":"O"
- },
- {
- "id":20,
- "orth":"Tonight",
- "ner":"O"
- },
- {
- "id":21,
- "orth":"Show",
- "ner":"O"
- },
- {
- "id":22,
- "orth":"with",
- "ner":"O"
- },
- {
- "id":23,
- "orth":"Jay",
- "ner":"O"
- },
- {
- "id":24,
- "orth":"Leno",
- "ner":"O"
- },
- {
- "id":25,
- "orth":"?",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- },
- {
- "tokens":[
- {
- "id":26,
- "orth":"He",
- "ner":"O"
- },
- {
- "id":27,
- "orth":"looks",
- "ner":"O"
- },
- {
- "id":28,
- "orth":"absolutely",
- "ner":"O"
- },
- {
- "id":29,
- "orth":"AWFUL",
- "ner":"O"
- },
- {
- "id":30,
- "orth":"!",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- },
- {
- "tokens":[
- {
- "id":31,
- "orth":"If",
- "ner":"O"
- },
- {
- "id":32,
- "orth":"I",
- "ner":"O"
- },
- {
- "id":33,
- "orth":"had",
- "ner":"O"
- },
- {
- "id":34,
- "orth":"to",
- "ner":"O"
- },
- {
- "id":35,
- "orth":"put",
- "ner":"O"
- },
- {
- "id":36,
- "orth":"money",
- "ner":"O"
- },
- {
- "id":37,
- "orth":"on",
- "ner":"O"
- },
- {
- "id":38,
- "orth":"it",
- "ner":"O"
- },
- {
- "id":39,
- "orth":",",
- "ner":"O"
- },
- {
- "id":40,
- "orth":"I",
- "ner":"O"
- },
- {
- "id":41,
- "orth":"'d",
- "ner":"O"
- },
- {
- "id":42,
- "orth":"say",
- "ner":"O"
- },
- {
- "id":43,
- "orth":"that",
- "ner":"O"
- },
- {
- "id":44,
- "orth":"Artie",
- "ner":"O"
- },
- {
- "id":45,
- "orth":"Lange",
- "ner":"O"
- },
- {
- "id":46,
- "orth":"is",
- "ner":"O"
- },
- {
- "id":47,
- "orth":"a",
- "ner":"O"
- },
- {
- "id":48,
- "orth":"ca",
- "ner":"O"
- },
- {
- "id":49,
- "orth":"n't",
- "ner":"O"
- },
- {
- "id":50,
- "orth":"miss",
- "ner":"O"
- },
- {
- "id":51,
- "orth":"candidate",
- "ner":"O"
- },
- {
- "id":52,
- "orth":"for",
- "ner":"O"
- },
- {
- "id":53,
- "orth":"the",
- "ner":"O"
- },
- {
- "id":54,
- "orth":"2007",
- "ner":"O"
- },
- {
- "id":55,
- "orth":"Dead",
- "ner":"O"
- },
- {
- "id":56,
- "orth":"pool",
- "ner":"O"
- },
- {
- "id":57,
- "orth":"!",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- },
- {
- "tokens":[
- {
- "id":58,
- "orth":" \n\n \n",
- "ner":"O"
- },
- {
- "id":59,
- "orth":"Kindly",
- "ner":"O"
- },
- {
- "id":60,
- "orth":"keep",
- "ner":"O"
- },
- {
- "id":61,
- "orth":"your",
- "ner":"O"
- },
- {
- "id":62,
- "orth":"malicious",
- "ner":"O"
- },
- {
- "id":63,
- "orth":"fingers",
- "ner":"O"
- },
- {
- "id":64,
- "orth":"off",
- "ner":"O"
- },
- {
- "id":65,
- "orth":"of",
- "ner":"O"
- },
- {
- "id":66,
- "orth":"my",
- "ner":"O"
- },
- {
- "id":67,
- "orth":"above",
- "ner":"O"
- },
- {
- "id":68,
- "orth":"comment",
- "ner":"O"
- },
- {
- "id":69,
- "orth":",",
- "ner":"O"
- },
- {
- "id":70,
- "orth":".",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- },
- {
- "tokens":[
- {
- "id":71,
- "orth":"Everytime",
- "ner":"O"
- },
- {
- "id":72,
- "orth":"you",
- "ner":"O"
- },
- {
- "id":73,
- "orth":"remove",
- "ner":"O"
- },
- {
- "id":74,
- "orth":"it",
- "ner":"O"
- },
- {
- "id":75,
- "orth":",",
- "ner":"O"
- },
- {
- "id":76,
- "orth":"I",
- "ner":"O"
- },
- {
- "id":77,
- "orth":"will",
- "ner":"O"
- },
- {
- "id":78,
- "orth":"repost",
- "ner":"O"
- },
- {
- "id":79,
- "orth":"it",
- "ner":"O"
- },
- {
- "id":80,
- "orth":"!",
- "ner":"O"
- },
- {
- "id":81,
- "orth":"!",
- "ner":"O"
- },
- {
- "id":82,
- "orth":"!",
- "ner":"O"
- }
- ],
- "brackets":[
-
- ]
- }
- ],
- "cats":[
- {
- "label":"insult",
- "value":0
- },
- {
- "label":"obscene",
- "value":0
- },
- {
- "label":"severe_toxic",
- "value":0
- },
- {
- "label":"toxic",
- "value":1
- }
- ]
- }
- ]
- }
-]
\ No newline at end of file
diff --git a/examples/training/textcat_example_data/jigsaw-toxic-comment.jsonl b/examples/training/textcat_example_data/jigsaw-toxic-comment.jsonl
deleted file mode 100644
index ac31b6255..000000000
--- a/examples/training/textcat_example_data/jigsaw-toxic-comment.jsonl
+++ /dev/null
@@ -1,10 +0,0 @@
-{"meta": {"id": "0000997932d777bf"}, "text": "Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27", "cats": {"insult": 0, "obscene": 0, "severe_toxic": 0, "toxic": 0}}
-{"meta": {"id": "001956c382006abd"}, "text": "I'm Sorry \n\nI'm sorry I screwed around with someones talk page. It was very bad to do. I know how having the templates on their talk page helps you assert your dominance over them. I know I should bow down to the almighty administrators. But then again, I'm going to go play outside....with your mom. 76.122.79.82", "cats": {"insult": 0, "obscene": 0, "severe_toxic": 0, "toxic": 1}}
-{"meta": {"id": "0020e7119b96eeeb"}, "text": "Stupid peace of shit stop deleting my stuff asshole go die and fall in a hole go to hell!", "cats": {"insult": 1, "obscene": 1, "severe_toxic": 1, "toxic": 1}}
-{"meta": {"id": "000103f0d9cfb60f"}, "text": "D'aww! He matches this background colour I'm seemingly stuck with. Thanks. (talk) 21:51, January 11, 2016 (UTC)", "cats": {"insult": 0, "obscene": 0, "severe_toxic": 0, "toxic": 0}}
-{"meta": {"id": "001dc38a83d420cf"}, "text": "GET FUCKED UP. GET FUCKEEED UP. GOT A DRINK THAT YOU CANT PUT DOWN???/ GET FUCK UP GET FUCKED UP. I'M FUCKED UP RIGHT NOW!", "cats": {"insult": 0, "obscene": 1, "severe_toxic": 0, "toxic": 1}}
-{"meta": {"id": "000113f07ec002fd"}, "text": "Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.", "cats": {"insult": 0, "obscene": 0, "severe_toxic": 0, "toxic": 0}}
-{"meta": {"id": "0001b41b1c6bb37e"}, "text": "\"\nMore\nI can't make any real suggestions on improvement - I wondered if the section statistics should be later on, or a subsection of \"\"types of accidents\"\" -I think the references may need tidying so that they are all in the exact same format ie date format etc. I can do that later on, if no-one else does first - if you have any preferences for formatting style on references or want to do it yourself please let me know.\n\nThere appears to be a backlog on articles for review so I guess there may be a delay until a reviewer turns up. It's listed in the relevant form eg Wikipedia:Good_article_nominations#Transport \"", "cats": {"insult": 0, "obscene": 0, "severe_toxic": 0, "toxic": 0}}
-{"meta": {"id": "0001d958c54c6e35"}, "text": "You, sir, are my hero. Any chance you remember what page that's on?", "cats": {"insult": 0, "obscene": 0, "severe_toxic": 0, "toxic": 0}}
-{"meta": {"id": "00025465d4725e87"}, "text": "\"\n\nCongratulations from me as well, use the tools well. · talk \"", "cats": {"insult": 0, "obscene": 0, "severe_toxic": 0, "toxic": 0}}
-{"meta": {"id": "002264ea4d5f2887"}, "text": "Why can't you believe how fat Artie is? Did you see him on his recent appearence on the Tonight Show with Jay Leno? He looks absolutely AWFUL! If I had to put money on it, I'd say that Artie Lange is a can't miss candidate for the 2007 Dead pool! \n\n \nKindly keep your malicious fingers off of my above comment, . Everytime you remove it, I will repost it!!!", "cats": {"insult": 0, "obscene": 0, "severe_toxic": 0, "toxic": 1}}
diff --git a/examples/training/textcat_example_data/textcatjsonl_to_trainjson.py b/examples/training/textcat_example_data/textcatjsonl_to_trainjson.py
deleted file mode 100644
index 339ce39be..000000000
--- a/examples/training/textcat_example_data/textcatjsonl_to_trainjson.py
+++ /dev/null
@@ -1,53 +0,0 @@
-from pathlib import Path
-import plac
-import spacy
-from spacy.gold import docs_to_json
-import srsly
-import sys
-
-@plac.annotations(
- model=("Model name. Defaults to 'en'.", "option", "m", str),
- input_file=("Input file (jsonl)", "positional", None, Path),
- output_dir=("Output directory", "positional", None, Path),
- n_texts=("Number of texts to convert", "option", "t", int),
-)
-def convert(model='en', input_file=None, output_dir=None, n_texts=0):
- # Load model with tokenizer + sentencizer only
- nlp = spacy.load(model)
- nlp.disable_pipes(*nlp.pipe_names)
- sentencizer = nlp.create_pipe("sentencizer")
- nlp.add_pipe(sentencizer, first=True)
-
- texts = []
- cats = []
- count = 0
-
- if not input_file.exists():
- print("Input file not found:", input_file)
- sys.exit(1)
- else:
- with open(input_file) as fileh:
- for line in fileh:
- data = srsly.json_loads(line)
- texts.append(data["text"])
- cats.append(data["cats"])
-
- if output_dir is not None:
- output_dir = Path(output_dir)
- if not output_dir.exists():
- output_dir.mkdir()
- else:
- output_dir = Path(".")
-
- docs = []
- for i, doc in enumerate(nlp.pipe(texts)):
- doc.cats = cats[i]
- docs.append(doc)
- if n_texts > 0 and count == n_texts:
- break
- count += 1
-
- srsly.write_json(output_dir / input_file.with_suffix(".json"), [docs_to_json(docs)])
-
-if __name__ == "__main__":
- plac.call(convert)
diff --git a/examples/training/train_entity_linker.py b/examples/training/train_entity_linker.py
index d2b2c2417..12ed531a6 100644
--- a/examples/training/train_entity_linker.py
+++ b/examples/training/train_entity_linker.py
@@ -8,8 +8,8 @@ For more details, see the documentation:
* Training: https://spacy.io/usage/training
* Entity Linking: https://spacy.io/usage/linguistic-features#entity-linking
-Compatible with: spaCy v2.2
-Last tested with: v2.2
+Compatible with: spaCy vX.X
+Last tested with: vX.X
"""
from __future__ import unicode_literals, print_function
diff --git a/examples/training/training-data.json b/examples/training/training-data.json
index 1f57e1fd9..2565ce149 100644
--- a/examples/training/training-data.json
+++ b/examples/training/training-data.json
@@ -8,7 +8,7 @@
{
"tokens": [
{
- "head": 4,
+ "head": 44,
"dep": "prep",
"tag": "IN",
"orth": "In",
diff --git a/fabfile.py b/fabfile.py
index 56570e8e0..0e69551c3 100644
--- a/fabfile.py
+++ b/fabfile.py
@@ -10,145 +10,113 @@ import sys
PWD = path.dirname(__file__)
-ENV = environ["VENV_DIR"] if "VENV_DIR" in environ else ".env"
+ENV = environ['VENV_DIR'] if 'VENV_DIR' in environ else '.env'
VENV_DIR = Path(PWD) / ENV
@contextlib.contextmanager
-def virtualenv(name, create=False, python="/usr/bin/python3.6"):
+def virtualenv(name, create=False, python='/usr/bin/python3.6'):
python = Path(python).resolve()
env_path = VENV_DIR
if create:
if env_path.exists():
shutil.rmtree(str(env_path))
- local("{python} -m venv {env_path}".format(python=python, env_path=VENV_DIR))
-
+ local('{python} -m venv {env_path}'.format(python=python, env_path=VENV_DIR))
def wrapped_local(cmd, env_vars=[], capture=False, direct=False):
- return local(
- "source {}/bin/activate && {}".format(env_path, cmd),
- shell="/bin/bash",
- capture=False,
- )
-
+ return local('source {}/bin/activate && {}'.format(env_path, cmd),
+ shell='/bin/bash', capture=False)
yield wrapped_local
-def env(lang="python3.6"):
+def env(lang='python3.6'):
if VENV_DIR.exists():
- local("rm -rf {env}".format(env=VENV_DIR))
- if lang.startswith("python3"):
- local("{lang} -m venv {env}".format(lang=lang, env=VENV_DIR))
+ local('rm -rf {env}'.format(env=VENV_DIR))
+ if lang.startswith('python3'):
+ local('{lang} -m venv {env}'.format(lang=lang, env=VENV_DIR))
else:
- local("{lang} -m pip install virtualenv --no-cache-dir".format(lang=lang))
- local(
- "{lang} -m virtualenv {env} --no-cache-dir".format(lang=lang, env=VENV_DIR)
- )
+ local('{lang} -m pip install virtualenv --no-cache-dir'.format(lang=lang))
+ local('{lang} -m virtualenv {env} --no-cache-dir'.format(lang=lang, env=VENV_DIR))
with virtualenv(VENV_DIR) as venv_local:
- print(venv_local("python --version", capture=True))
- venv_local("pip install --upgrade setuptools --no-cache-dir")
- venv_local("pip install pytest --no-cache-dir")
- venv_local("pip install wheel --no-cache-dir")
- venv_local("pip install -r requirements.txt --no-cache-dir")
- venv_local("pip install pex --no-cache-dir")
+ print(venv_local('python --version', capture=True))
+ venv_local('pip install --upgrade setuptools --no-cache-dir')
+ venv_local('pip install pytest --no-cache-dir')
+ venv_local('pip install wheel --no-cache-dir')
+ venv_local('pip install -r requirements.txt --no-cache-dir')
+ venv_local('pip install pex --no-cache-dir')
+
def install():
with virtualenv(VENV_DIR) as venv_local:
- venv_local("pip install dist/*.tar.gz")
+ venv_local('pip install dist/*.tar.gz')
def make():
with lcd(path.dirname(__file__)):
- local(
- "export PYTHONPATH=`pwd` && source .env/bin/activate && python setup.py build_ext --inplace",
- shell="/bin/bash",
- )
-
+ local('export PYTHONPATH=`pwd` && source .env/bin/activate && python setup.py build_ext --inplace',
+ shell='/bin/bash')
def sdist():
with virtualenv(VENV_DIR) as venv_local:
with lcd(path.dirname(__file__)):
- local("python -m pip install -U setuptools srsly")
- local("python setup.py sdist")
-
+ local('python -m pip install -U setuptools')
+ local('python setup.py sdist')
def wheel():
with virtualenv(VENV_DIR) as venv_local:
with lcd(path.dirname(__file__)):
- venv_local("python setup.py bdist_wheel")
-
+ venv_local('python setup.py bdist_wheel')
def pex():
with virtualenv(VENV_DIR) as venv_local:
with lcd(path.dirname(__file__)):
- sha = local("git rev-parse --short HEAD", capture=True)
- venv_local(
- "pex dist/*.whl -e spacy -o dist/spacy-%s.pex" % sha, direct=True
- )
+ sha = local('git rev-parse --short HEAD', capture=True)
+ venv_local('pex dist/*.whl -e spacy -o dist/spacy-%s.pex' % sha,
+ direct=True)
def clean():
with lcd(path.dirname(__file__)):
- local("rm -f dist/*.whl")
- local("rm -f dist/*.pex")
+ local('rm -f dist/*.whl')
+ local('rm -f dist/*.pex')
with virtualenv(VENV_DIR) as venv_local:
- venv_local("python setup.py clean --all")
+ venv_local('python setup.py clean --all')
def test():
with virtualenv(VENV_DIR) as venv_local:
with lcd(path.dirname(__file__)):
- venv_local("pytest -x spacy/tests")
-
+ venv_local('pytest -x spacy/tests')
def train():
- args = environ.get("SPACY_TRAIN_ARGS", "")
+ args = environ.get('SPACY_TRAIN_ARGS', '')
with virtualenv(VENV_DIR) as venv_local:
- venv_local("spacy train {args}".format(args=args))
+ venv_local('spacy train {args}'.format(args=args))
-def conll17(treebank_dir, experiment_dir, vectors_dir, config, corpus=""):
- is_not_clean = local("git status --porcelain", capture=True)
+def conll17(treebank_dir, experiment_dir, vectors_dir, config, corpus=''):
+ is_not_clean = local('git status --porcelain', capture=True)
if is_not_clean:
print("Repository is not clean")
print(is_not_clean)
sys.exit(1)
- git_sha = local("git rev-parse --short HEAD", capture=True)
- config_checksum = local("sha256sum {config}".format(config=config), capture=True)
- experiment_dir = Path(experiment_dir) / "{}--{}".format(
- config_checksum[:6], git_sha
- )
+ git_sha = local('git rev-parse --short HEAD', capture=True)
+ config_checksum = local('sha256sum {config}'.format(config=config), capture=True)
+ experiment_dir = Path(experiment_dir) / '{}--{}'.format(config_checksum[:6], git_sha)
if not experiment_dir.exists():
experiment_dir.mkdir()
- test_data_dir = Path(treebank_dir) / "ud-test-v2.0-conll2017"
+ test_data_dir = Path(treebank_dir) / 'ud-test-v2.0-conll2017'
assert test_data_dir.exists()
assert test_data_dir.is_dir()
if corpus:
corpora = [corpus]
else:
- corpora = ["UD_English", "UD_Chinese", "UD_Japanese", "UD_Vietnamese"]
+ corpora = ['UD_English', 'UD_Chinese', 'UD_Japanese', 'UD_Vietnamese']
- local(
- "cp {config} {experiment_dir}/config.json".format(
- config=config, experiment_dir=experiment_dir
- )
- )
+ local('cp {config} {experiment_dir}/config.json'.format(config=config, experiment_dir=experiment_dir))
with virtualenv(VENV_DIR) as venv_local:
for corpus in corpora:
- venv_local(
- "spacy ud-train {treebank_dir} {experiment_dir} {config} {corpus} -v {vectors_dir}".format(
- treebank_dir=treebank_dir,
- experiment_dir=experiment_dir,
- config=config,
- corpus=corpus,
- vectors_dir=vectors_dir,
- )
- )
- venv_local(
- "spacy ud-run-test {test_data_dir} {experiment_dir} {corpus}".format(
- test_data_dir=test_data_dir,
- experiment_dir=experiment_dir,
- config=config,
- corpus=corpus,
- )
- )
+ venv_local('spacy ud-train {treebank_dir} {experiment_dir} {config} {corpus} -v {vectors_dir}'.format(
+ treebank_dir=treebank_dir, experiment_dir=experiment_dir, config=config, corpus=corpus, vectors_dir=vectors_dir))
+ venv_local('spacy ud-run-test {test_data_dir} {experiment_dir} {corpus}'.format(
+ test_data_dir=test_data_dir, experiment_dir=experiment_dir, config=config, corpus=corpus))
diff --git a/requirements.txt b/requirements.txt
index ebe660b97..a6d721e96 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,8 +1,8 @@
# Our libraries
cymem>=2.0.2,<2.1.0
-preshed>=3.0.2,<3.1.0
-thinc>=7.1.1,<7.2.0
-blis>=0.4.0,<0.5.0
+preshed>=2.0.1,<2.1.0
+thinc>=7.0.8,<7.1.0
+blis>=0.2.2,<0.3.0
murmurhash>=0.28.0,<1.1.0
wasabi>=0.2.0,<1.1.0
srsly>=0.1.0,<1.1.0
diff --git a/setup.py b/setup.py
index abe3fb509..984de2250 100755
--- a/setup.py
+++ b/setup.py
@@ -27,7 +27,7 @@ def is_new_osx():
return False
-PACKAGE_DATA = {"": ["*.pyx", "*.pxd", "*.txt", "*.tokens", "*.json", "*.json.gz"]}
+PACKAGE_DATA = {"": ["*.pyx", "*.pxd", "*.txt", "*.tokens", "*.json"]}
PACKAGES = find_packages()
@@ -43,7 +43,6 @@ MOD_NAMES = [
"spacy.kb",
"spacy.morphology",
"spacy.pipeline.pipes",
- "spacy.pipeline.morphologizer",
"spacy.syntax.stateclass",
"spacy.syntax._state",
"spacy.tokenizer",
@@ -57,7 +56,6 @@ MOD_NAMES = [
"spacy.tokens.doc",
"spacy.tokens.span",
"spacy.tokens.token",
- "spacy.tokens.morphanalysis",
"spacy.tokens._retokenize",
"spacy.matcher.matcher",
"spacy.matcher.phrasematcher",
@@ -247,9 +245,9 @@ def setup_package():
"numpy>=1.15.0",
"murmurhash>=0.28.0,<1.1.0",
"cymem>=2.0.2,<2.1.0",
- "preshed>=3.0.2,<3.1.0",
- "thinc>=7.1.1,<7.2.0",
- "blis>=0.4.0,<0.5.0",
+ "preshed>=2.0.1,<2.1.0",
+ "thinc>=7.0.8,<7.1.0",
+ "blis>=0.2.2,<0.3.0",
"plac<1.0.0,>=0.9.6",
"requests>=2.13.0,<3.0.0",
"wasabi>=0.2.0,<1.1.0",
@@ -283,6 +281,7 @@ def setup_package():
"Programming Language :: Python :: 2",
"Programming Language :: Python :: 2.7",
"Programming Language :: Python :: 3",
+ "Programming Language :: Python :: 3.4",
"Programming Language :: Python :: 3.5",
"Programming Language :: Python :: 3.6",
"Programming Language :: Python :: 3.7",
diff --git a/spacy/_ml.py b/spacy/_ml.py
index 6104324ab..660d20c46 100644
--- a/spacy/_ml.py
+++ b/spacy/_ml.py
@@ -15,7 +15,7 @@ from thinc.api import uniqued, wrap, noop
from thinc.api import with_square_sequences
from thinc.linear.linear import LinearModel
from thinc.neural.ops import NumpyOps, CupyOps
-from thinc.neural.util import get_array_module, copy_array
+from thinc.neural.util import get_array_module
from thinc.neural.optimizers import Adam
from thinc import describe
@@ -286,7 +286,10 @@ def link_vectors_to_models(vocab):
if vectors.name is None:
vectors.name = VECTORS_KEY
if vectors.data.size != 0:
- user_warning(Warnings.W020.format(shape=vectors.data.shape))
+ print(
+ "Warning: Unnamed vectors -- this won't allow multiple vectors "
+ "models to be loaded. (Shape: (%d, %d))" % vectors.data.shape
+ )
ops = Model.ops
for word in vocab:
if word.orth in vectors.key2row:
@@ -320,9 +323,6 @@ def Tok2Vec(width, embed_size, **kwargs):
pretrained_vectors = kwargs.get("pretrained_vectors", None)
cnn_maxout_pieces = kwargs.get("cnn_maxout_pieces", 3)
subword_features = kwargs.get("subword_features", True)
- char_embed = kwargs.get("char_embed", False)
- if char_embed:
- subword_features = False
conv_depth = kwargs.get("conv_depth", 4)
bilstm_depth = kwargs.get("bilstm_depth", 0)
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
@@ -362,14 +362,6 @@ def Tok2Vec(width, embed_size, **kwargs):
>> LN(Maxout(width, width * 4, pieces=3)),
column=cols.index(ORTH),
)
- elif char_embed:
- embed = concatenate_lists(
- CharacterEmbed(nM=64, nC=8),
- FeatureExtracter(cols) >> with_flatten(norm),
- )
- reduce_dimensions = LN(
- Maxout(width, 64 * 8 + width, pieces=cnn_maxout_pieces)
- )
else:
embed = norm
@@ -377,15 +369,9 @@ def Tok2Vec(width, embed_size, **kwargs):
ExtractWindow(nW=1)
>> LN(Maxout(width, width * 3, pieces=cnn_maxout_pieces))
)
- if char_embed:
- tok2vec = embed >> with_flatten(
- reduce_dimensions >> convolution ** conv_depth, pad=conv_depth
- )
- else:
- tok2vec = FeatureExtracter(cols) >> with_flatten(
- embed >> convolution ** conv_depth, pad=conv_depth
- )
-
+ tok2vec = FeatureExtracter(cols) >> with_flatten(
+ embed >> convolution ** conv_depth, pad=conv_depth
+ )
if bilstm_depth >= 1:
tok2vec = tok2vec >> PyTorchBiLSTM(width, width, bilstm_depth)
# Work around thinc API limitations :(. TODO: Revise in Thinc 7
@@ -518,46 +504,6 @@ def getitem(i):
return layerize(getitem_fwd)
-@describe.attributes(
- W=Synapses("Weights matrix", lambda obj: (obj.nO, obj.nI), lambda W, ops: None)
-)
-class MultiSoftmax(Affine):
- """Neural network layer that predicts several multi-class attributes at once.
- For instance, we might predict one class with 6 variables, and another with 5.
- We predict the 11 neurons required for this, and then softmax them such
- that columns 0-6 make a probability distribution and columns 6-11 make another.
- """
-
- name = "multisoftmax"
-
- def __init__(self, out_sizes, nI=None, **kwargs):
- Model.__init__(self, **kwargs)
- self.out_sizes = out_sizes
- self.nO = sum(out_sizes)
- self.nI = nI
-
- def predict(self, input__BI):
- output__BO = self.ops.affine(self.W, self.b, input__BI)
- i = 0
- for out_size in self.out_sizes:
- self.ops.softmax(output__BO[:, i : i + out_size], inplace=True)
- i += out_size
- return output__BO
-
- def begin_update(self, input__BI, drop=0.0):
- output__BO = self.predict(input__BI)
-
- def finish_update(grad__BO, sgd=None):
- self.d_W += self.ops.gemm(grad__BO, input__BI, trans1=True)
- self.d_b += grad__BO.sum(axis=0)
- grad__BI = self.ops.gemm(grad__BO, self.W)
- if sgd is not None:
- sgd(self._mem.weights, self._mem.gradient, key=self.id)
- return grad__BI
-
- return output__BO, finish_update
-
-
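The removed MultiSoftmax docstring above describes softmaxing disjoint column blocks so that each block is its own probability distribution. A minimal NumPy sketch of just that arithmetic, assuming out_sizes such as (6, 5) (not the thinc layer itself):

import numpy as np


def multi_softmax(scores, out_sizes):
    # Softmax each contiguous block of columns independently, so that with
    # out_sizes=(6, 5) the slices [:, 0:6] and [:, 6:11] each sum to 1 per row.
    out = np.array(scores, dtype="float64")
    start = 0
    for size in out_sizes:
        block = out[:, start:start + size]      # view into `out`
        block -= block.max(axis=1, keepdims=True)  # numerical stability
        np.exp(block, out=block)
        block /= block.sum(axis=1, keepdims=True)
        start += size
    return out


probs = multi_softmax(np.random.randn(4, 11), out_sizes=(6, 5))
assert np.allclose(probs[:, :6].sum(axis=1), 1.0)
assert np.allclose(probs[:, 6:].sum(axis=1), 1.0)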
def build_tagger_model(nr_class, **cfg):
embed_size = util.env_opt("embed_size", 2000)
if "token_vector_width" in cfg:
@@ -584,33 +530,6 @@ def build_tagger_model(nr_class, **cfg):
return model
-def build_morphologizer_model(class_nums, **cfg):
- embed_size = util.env_opt("embed_size", 7000)
- if "token_vector_width" in cfg:
- token_vector_width = cfg["token_vector_width"]
- else:
- token_vector_width = util.env_opt("token_vector_width", 128)
- pretrained_vectors = cfg.get("pretrained_vectors")
- char_embed = cfg.get("char_embed", True)
- with Model.define_operators({">>": chain, "+": add, "**": clone}):
- if "tok2vec" in cfg:
- tok2vec = cfg["tok2vec"]
- else:
- tok2vec = Tok2Vec(
- token_vector_width,
- embed_size,
- char_embed=char_embed,
- pretrained_vectors=pretrained_vectors,
- )
- softmax = with_flatten(MultiSoftmax(class_nums, token_vector_width))
- softmax.out_sizes = class_nums
- model = tok2vec >> softmax
- model.nI = None
- model.tok2vec = tok2vec
- model.softmax = softmax
- return model
-
-
@layerize
def SpacyVectors(docs, drop=0.0):
batch = []
@@ -801,8 +720,7 @@ def concatenate_lists(*layers, **kwargs): # pragma: no cover
concat = concatenate(*layers)
def concatenate_lists_fwd(Xs, drop=0.0):
- if drop is not None:
- drop *= drop_factor
+ drop *= drop_factor
lengths = ops.asarray([len(X) for X in Xs], dtype="i")
flat_y, bp_flat_y = concat.begin_update(Xs, drop=drop)
ys = ops.unflatten(flat_y, lengths)
@@ -892,67 +810,6 @@ def _replace_word(word, random_words, mask="[MASK]"):
return word
-def _uniform_init(lo, hi):
- def wrapped(W, ops):
- copy_array(W, ops.xp.random.uniform(lo, hi, W.shape))
-
- return wrapped
-
-
-@describe.attributes(
- nM=Dimension("Vector dimensions"),
- nC=Dimension("Number of characters per word"),
- vectors=Synapses(
- "Embed matrix", lambda obj: (obj.nC, obj.nV, obj.nM), _uniform_init(-0.1, 0.1)
- ),
- d_vectors=Gradient("vectors"),
-)
-class CharacterEmbed(Model):
- def __init__(self, nM=None, nC=None, **kwargs):
- Model.__init__(self, **kwargs)
- self.nM = nM
- self.nC = nC
-
- @property
- def nO(self):
- return self.nM * self.nC
-
- @property
- def nV(self):
- return 256
-
- def begin_update(self, docs, drop=0.0):
- if not docs:
- return []
- ids = []
- output = []
- weights = self.vectors
- # This assists in indexing; it's like looping over this dimension.
- # Still consider this weird witch craft...But thanks to Mark Neumann
- # for the tip.
- nCv = self.ops.xp.arange(self.nC)
- for doc in docs:
- doc_ids = doc.to_utf8_array(nr_char=self.nC)
- doc_vectors = self.ops.allocate((len(doc), self.nC, self.nM))
- # Let's say I have a 2d array of indices, and a 3d table of data. What numpy
- # incantation do I chant to get
- # output[i, j, k] == data[j, ids[i, j], k]?
- doc_vectors[:, nCv] = weights[nCv, doc_ids[:, nCv]]
- output.append(doc_vectors.reshape((len(doc), self.nO)))
- ids.append(doc_ids)
-
- def backprop_character_embed(d_vectors, sgd=None):
- gradient = self.d_vectors
- for doc_ids, d_doc_vectors in zip(ids, d_vectors):
- d_doc_vectors = d_doc_vectors.reshape((len(doc_ids), self.nC, self.nM))
- gradient[nCv, doc_ids[:, nCv]] += d_doc_vectors[:, nCv]
- if sgd is not None:
- sgd(self._mem.weights, self._mem.gradient, key=self.id)
- return None
-
- return output, backprop_character_embed
-
-
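The removed CharacterEmbed code relies on the NumPy advanced-indexing trick asked about in the comment above, i.e. gathering output[i, j, k] == data[j, ids[i, j], k] via an arange over the character axis. A standalone sketch with toy shapes:

import numpy as np

nC, nV, nM = 8, 256, 64                      # chars per word, byte vocab, vector width
weights = np.random.rand(nC, nV, nM)         # stands in for self.vectors
doc_ids = np.random.randint(0, nV, (5, nC))  # 5 tokens, nC character ids each

nCv = np.arange(nC)
doc_vectors = weights[nCv, doc_ids[:, nCv]]  # shape (5, nC, nM)

assert doc_vectors.shape == (5, nC, nM)
assert np.array_equal(doc_vectors[2, 3], weights[3, doc_ids[2, 3]])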
def get_cossim_loss(yh, y):
# Add a small constant to avoid 0 vectors
yh = yh + 1e-8
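Only the head of get_cossim_loss is visible here; the 1e-8 offset guards against zero-norm prediction vectors. A generic cosine-distance loss along those lines, as a sketch only, without the backward pass the training code would also need:

import numpy as np


def cosine_loss(yh, y, eps=1e-8):
    # Shift predictions slightly so no row has an exactly zero norm.
    yh = yh + eps
    norms = np.linalg.norm(yh, axis=1, keepdims=True) * np.linalg.norm(y, axis=1, keepdims=True)
    cosine = (yh * y).sum(axis=1, keepdims=True) / norms
    return float((1.0 - cosine).mean())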
diff --git a/spacy/about.py b/spacy/about.py
index 7bb8e7ead..9587c9071 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,12 +1,16 @@
+# inspired from:
+# https://python-packaging-user-guide.readthedocs.org/en/latest/single_source_version/
+# https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py
# fmt: off
+
__title__ = "spacy"
-__version__ = "2.2.0.dev15"
-__summary__ = "Industrial-strength Natural Language Processing (NLP) in Python"
+__version__ = "2.1.8"
+__summary__ = "Industrial-strength Natural Language Processing (NLP) with Python and Cython"
__uri__ = "https://spacy.io"
-__author__ = "Explosion"
+__author__ = "Explosion AI"
__email__ = "contact@explosion.ai"
__license__ = "MIT"
-__release__ = False
+__release__ = True
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
diff --git a/spacy/attrs.pyx b/spacy/attrs.pyx
index 40236630a..8eeea363f 100644
--- a/spacy/attrs.pyx
+++ b/spacy/attrs.pyx
@@ -144,12 +144,8 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
for name, value in stringy_attrs.items():
if isinstance(name, int):
int_key = name
- elif name in IDS:
- int_key = IDS[name]
- elif name.upper() in IDS:
- int_key = IDS[name.upper()]
else:
- continue
+ int_key = IDS[name.upper()]
if strings_map is not None and isinstance(value, basestring):
if hasattr(strings_map, 'add'):
value = strings_map.add(value)
diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index b649e6666..0a9a0f7ef 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -34,6 +34,12 @@ BLANK_MODEL_THRESHOLD = 2000
str,
),
ignore_warnings=("Ignore warnings, only show stats and errors", "flag", "IW", bool),
+ ignore_validation=(
+ "Don't exit if JSON format validation fails",
+ "flag",
+ "IV",
+ bool,
+ ),
verbose=("Print additional information and explanations", "flag", "V", bool),
no_format=("Don't pretty-print the results", "flag", "NF", bool),
)
@@ -44,14 +50,10 @@ def debug_data(
base_model=None,
pipeline="tagger,parser,ner",
ignore_warnings=False,
+ ignore_validation=False,
verbose=False,
no_format=False,
):
- """
- Analyze, debug and validate your training and development data, get useful
- stats, and find problems like invalid entity annotations, cyclic
- dependencies, low data labels and more.
- """
msg = Printer(pretty=not no_format, ignore_warnings=ignore_warnings)
# Make sure all files and paths exists if they are needed
@@ -70,9 +72,21 @@ def debug_data(
msg.divider("Data format validation")
- # TODO: Validate data format using the JSON schema
+ # Validate data format using the JSON schema
# TODO: update once the new format is ready
# TODO: move validation to GoldCorpus in order to be able to load from dir
+ train_data_errors = [] # TODO: validate_json
+ dev_data_errors = [] # TODO: validate_json
+ if not train_data_errors:
+ msg.good("Training data JSON format is valid")
+ if not dev_data_errors:
+ msg.good("Development data JSON format is valid")
+ for error in train_data_errors:
+ msg.fail("Training data: {}".format(error))
+ for error in dev_data_errors:
+ msg.fail("Develoment data: {}".format(error))
+ if (train_data_errors or dev_data_errors) and not ignore_validation:
+ sys.exit(1)
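The train_data_errors and dev_data_errors placeholders above are meant to be filled by a schema check (see the validate_json TODO). A generic sketch using the third-party jsonschema package, purely as an assumption for illustration:

from jsonschema import Draft7Validator


def validate_json(data, schema):
    # Return one human-readable message per schema violation; an empty list means valid.
    validator = Draft7Validator(schema)
    return [err.message for err in validator.iter_errors(data)]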
# Create the gold corpus to be able to better analyze data
loading_train_error_message = ""
@@ -270,7 +284,7 @@ def debug_data(
if "textcat" in pipeline:
msg.divider("Text Classification")
- labels = [label for label in gold_train_data["cats"]]
+ labels = [label for label in gold_train_data["textcat"]]
model_labels = _get_labels_from_model(nlp, "textcat")
new_labels = [l for l in labels if l not in model_labels]
existing_labels = [l for l in labels if l in model_labels]
@@ -281,45 +295,13 @@ def debug_data(
)
if new_labels:
labels_with_counts = _format_labels(
- gold_train_data["cats"].most_common(), counts=True
+ gold_train_data["textcat"].most_common(), counts=True
)
msg.text("New: {}".format(labels_with_counts), show=verbose)
if existing_labels:
msg.text(
"Existing: {}".format(_format_labels(existing_labels)), show=verbose
)
- if set(gold_train_data["cats"]) != set(gold_dev_data["cats"]):
- msg.fail(
- "The train and dev labels are not the same. "
- "Train labels: {}. "
- "Dev labels: {}.".format(
- _format_labels(gold_train_data["cats"]),
- _format_labels(gold_dev_data["cats"]),
- )
- )
- if gold_train_data["n_cats_multilabel"] > 0:
- msg.info(
- "The train data contains instances without "
- "mutually-exclusive classes. Use '--textcat-multilabel' "
- "when training."
- )
- if gold_dev_data["n_cats_multilabel"] == 0:
- msg.warn(
- "Potential train/dev mismatch: the train data contains "
- "instances without mutually-exclusive classes while the "
- "dev data does not."
- )
- else:
- msg.info(
- "The train data contains only instances with "
- "mutually-exclusive classes."
- )
- if gold_dev_data["n_cats_multilabel"] > 0:
- msg.fail(
- "Train/dev mismatch: the dev data contains instances "
- "without mutually-exclusive classes while the train data "
- "contains only instances with mutually-exclusive classes."
- )
if "tagger" in pipeline:
msg.divider("Part-of-speech Tagging")
@@ -348,7 +330,6 @@ def debug_data(
)
if "parser" in pipeline:
- has_low_data_warning = False
msg.divider("Dependency Parsing")
# profile sentence length
@@ -537,7 +518,6 @@ def _compile_gold(train_docs, pipeline):
"n_sents": 0,
"n_nonproj": 0,
"n_cycles": 0,
- "n_cats_multilabel": 0,
"texts": set(),
}
for doc, gold in train_docs:
@@ -560,8 +540,6 @@ def _compile_gold(train_docs, pipeline):
data["ner"]["-"] += 1
if "textcat" in pipeline:
data["cats"].update(gold.cats)
- if list(gold.cats.values()).count(1.0) != 1:
- data["n_cats_multilabel"] += 1
if "tagger" in pipeline:
data["tags"].update([x for x in gold.tags if x is not None])
if "parser" in pipeline:
diff --git a/spacy/cli/download.py b/spacy/cli/download.py
index 64ab03a75..8a993178a 100644
--- a/spacy/cli/download.py
+++ b/spacy/cli/download.py
@@ -28,16 +28,6 @@ def download(model, direct=False, *pip_args):
can be shortcut, model name or, if --direct flag is set, full model name
with version. For direct downloads, the compatibility check will be skipped.
"""
- if not require_package("spacy") and "--no-deps" not in pip_args:
- msg.warn(
- "Skipping model package dependencies and setting `--no-deps`. "
- "You don't seem to have the spaCy package itself installed "
- "(maybe because you've built from source?), so installing the "
- "model dependencies would cause spaCy to be downloaded, which "
- "probably isn't what you want. If the model package has other "
- "dependencies, you'll have to install them manually."
- )
- pip_args = pip_args + ("--no-deps",)
dl_tpl = "{m}-{v}/{m}-{v}.tar.gz#egg={m}=={v}"
if direct:
components = model.split("-")
@@ -82,15 +72,12 @@ def download(model, direct=False, *pip_args):
# is_package check currently fails, because pkg_resources.working_set
# is not refreshed automatically (see #3923). We're trying to work
# around this here by requiring the package explicitly.
- require_package(model_name)
-
-
-def require_package(name):
- try:
- pkg_resources.working_set.require(name)
- return True
- except: # noqa: E722
- return False
+ try:
+ pkg_resources.working_set.require(model_name)
+ except: # noqa: E722
+ # Maybe it's possible to remove this – mostly worried about cross-
+ # platform and cross-Python compatibility here
+ pass
def get_json(url, desc):
@@ -130,7 +117,7 @@ def get_version(model, comp):
def download_model(filename, user_pip_args=None):
download_url = about.__download_url__ + "/" + filename
- pip_args = ["--no-cache-dir"]
+ pip_args = ["--no-cache-dir", "--no-deps"]
if user_pip_args:
pip_args.extend(user_pip_args)
cmd = [sys.executable, "-m", "pip", "install"] + pip_args + [download_url]
diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py
index 1114ada08..0a57ef2da 100644
--- a/spacy/cli/evaluate.py
+++ b/spacy/cli/evaluate.py
@@ -61,7 +61,6 @@ def evaluate(
"NER P": "%.2f" % scorer.ents_p,
"NER R": "%.2f" % scorer.ents_r,
"NER F": "%.2f" % scorer.ents_f,
- "Textcat": "%.2f" % scorer.textcat_score,
}
msg.table(results, title="Results")
diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py
index c285a12a6..955b420aa 100644
--- a/spacy/cli/init_model.py
+++ b/spacy/cli/init_model.py
@@ -35,13 +35,6 @@ msg = Printer()
clusters_loc=("Optional location of brown clusters data", "option", "c", str),
vectors_loc=("Optional vectors file in Word2Vec format", "option", "v", str),
prune_vectors=("Optional number of vectors to prune to", "option", "V", int),
- vectors_name=(
- "Optional name for the word vectors, e.g. en_core_web_lg.vectors",
- "option",
- "vn",
- str,
- ),
- model_name=("Optional name for the model meta", "option", "mn", str),
)
def init_model(
lang,
@@ -51,8 +44,6 @@ def init_model(
jsonl_loc=None,
vectors_loc=None,
prune_vectors=-1,
- vectors_name=None,
- model_name=None,
):
"""
Create a new model from raw data, like word frequencies, Brown clusters
@@ -84,10 +75,10 @@ def init_model(
lex_attrs = read_attrs_from_deprecated(freqs_loc, clusters_loc)
with msg.loading("Creating model..."):
- nlp = create_model(lang, lex_attrs, name=model_name)
+ nlp = create_model(lang, lex_attrs)
msg.good("Successfully created model")
if vectors_loc is not None:
- add_vectors(nlp, vectors_loc, prune_vectors, vectors_name)
+ add_vectors(nlp, vectors_loc, prune_vectors)
vec_added = len(nlp.vocab.vectors)
lex_added = len(nlp.vocab)
msg.good(
@@ -147,7 +138,7 @@ def read_attrs_from_deprecated(freqs_loc, clusters_loc):
return lex_attrs
-def create_model(lang, lex_attrs, name=None):
+def create_model(lang, lex_attrs):
lang_class = get_lang_class(lang)
nlp = lang_class()
for lexeme in nlp.vocab:
@@ -166,12 +157,10 @@ def create_model(lang, lex_attrs, name=None):
else:
oov_prob = DEFAULT_OOV_PROB
nlp.vocab.cfg.update({"oov_prob": oov_prob})
- if name:
- nlp.meta["name"] = name
return nlp
-def add_vectors(nlp, vectors_loc, prune_vectors, name=None):
+def add_vectors(nlp, vectors_loc, prune_vectors):
vectors_loc = ensure_path(vectors_loc)
if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):
nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb")))
@@ -192,10 +181,7 @@ def add_vectors(nlp, vectors_loc, prune_vectors, name=None):
lexeme.is_oov = False
if vectors_data is not None:
nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
- if name is None:
- nlp.vocab.vectors.name = "%s_model.vectors" % nlp.meta["lang"]
- else:
- nlp.vocab.vectors.name = name
+ nlp.vocab.vectors.name = "%s_model.vectors" % nlp.meta["lang"]
nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name
if prune_vectors >= 1:
nlp.vocab.prune_vectors(prune_vectors)
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 2588a81a2..fe30e1a3c 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -21,35 +21,54 @@ from .. import about
@plac.annotations(
- # fmt: off
lang=("Model language", "positional", None, str),
output_path=("Output directory to store model in", "positional", None, Path),
train_path=("Location of JSON-formatted training data", "positional", None, Path),
dev_path=("Location of JSON-formatted development data", "positional", None, Path),
- raw_text=("Path to jsonl file with unlabelled text documents.", "option", "rt", Path),
+ raw_text=(
+ "Path to jsonl file with unlabelled text documents.",
+ "option",
+ "rt",
+ Path,
+ ),
base_model=("Name of model to update (optional)", "option", "b", str),
pipeline=("Comma-separated names of pipeline components", "option", "p", str),
vectors=("Model to load vectors from", "option", "v", str),
n_iter=("Number of iterations", "option", "n", int),
- n_early_stopping=("Maximum number of training epochs without dev accuracy improvement", "option", "ne", int),
+ n_early_stopping=(
+ "Maximum number of training epochs without dev accuracy improvement",
+ "option",
+ "ne",
+ int,
+ ),
n_examples=("Number of examples", "option", "ns", int),
use_gpu=("Use GPU", "option", "g", int),
version=("Model version", "option", "V", str),
meta_path=("Optional path to meta.json to use as base.", "option", "m", Path),
- init_tok2vec=("Path to pretrained weights for the token-to-vector parts of the models. See 'spacy pretrain'. Experimental.", "option", "t2v", Path),
- parser_multitasks=("Side objectives for parser CNN, e.g. 'dep' or 'dep,tag'", "option", "pt", str),
- entity_multitasks=("Side objectives for NER CNN, e.g. 'dep' or 'dep,tag'", "option", "et", str),
+ init_tok2vec=(
+ "Path to pretrained weights for the token-to-vector parts of the models. See 'spacy pretrain'. Experimental.",
+ "option",
+ "t2v",
+ Path,
+ ),
+ parser_multitasks=(
+ "Side objectives for parser CNN, e.g. 'dep' or 'dep,tag'",
+ "option",
+ "pt",
+ str,
+ ),
+ entity_multitasks=(
+ "Side objectives for NER CNN, e.g. 'dep' or 'dep,tag'",
+ "option",
+ "et",
+ str,
+ ),
noise_level=("Amount of corruption for data augmentation", "option", "nl", float),
- orth_variant_level=("Amount of orthography variation for data augmentation", "option", "ovl", float),
eval_beam_widths=("Beam widths to evaluate, e.g. 4,8", "option", "bw", str),
gold_preproc=("Use gold preprocessing", "flag", "G", bool),
learn_tokens=("Make parser learn gold-standard tokenization", "flag", "T", bool),
- textcat_multilabel=("Textcat classes aren't mutually exclusive (multilabel)", "flag", "TML", bool),
- textcat_arch=("Textcat model architecture", "option", "ta", str),
- textcat_positive_label=("Textcat positive label for binary classes with two labels", "option", "tpl", str),
verbose=("Display more information for debug", "flag", "VV", bool),
debug=("Run data diagnostics before training", "flag", "D", bool),
- # fmt: on
)
def train(
lang,
@@ -70,13 +89,9 @@ def train(
parser_multitasks="",
entity_multitasks="",
noise_level=0.0,
- orth_variant_level=0.0,
eval_beam_widths="",
gold_preproc=False,
learn_tokens=False,
- textcat_multilabel=False,
- textcat_arch="bow",
- textcat_positive_label=None,
verbose=False,
debug=False,
):
@@ -162,37 +177,9 @@ def train(
if pipe not in nlp.pipe_names:
if pipe == "parser":
pipe_cfg = {"learn_tokens": learn_tokens}
- elif pipe == "textcat":
- pipe_cfg = {
- "exclusive_classes": not textcat_multilabel,
- "architecture": textcat_arch,
- "positive_label": textcat_positive_label,
- }
else:
pipe_cfg = {}
nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))
- else:
- if pipe == "textcat":
- textcat_cfg = nlp.get_pipe("textcat").cfg
- base_cfg = {
- "exclusive_classes": textcat_cfg["exclusive_classes"],
- "architecture": textcat_cfg["architecture"],
- "positive_label": textcat_cfg["positive_label"],
- }
- pipe_cfg = {
- "exclusive_classes": not textcat_multilabel,
- "architecture": textcat_arch,
- "positive_label": textcat_positive_label,
- }
- if base_cfg != pipe_cfg:
- msg.fail(
- "The base textcat model configuration does"
- "not match the provided training options. "
- "Existing cfg: {}, provided cfg: {}".format(
- base_cfg, pipe_cfg
- ),
- exits=1,
- )
else:
msg.text("Starting with blank model '{}'".format(lang))
lang_cls = util.get_lang_class(lang)
@@ -200,12 +187,6 @@ def train(
for pipe in pipeline:
if pipe == "parser":
pipe_cfg = {"learn_tokens": learn_tokens}
- elif pipe == "textcat":
- pipe_cfg = {
- "exclusive_classes": not textcat_multilabel,
- "architecture": textcat_arch,
- "positive_label": textcat_positive_label,
- }
else:
pipe_cfg = {}
nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))
@@ -246,89 +227,12 @@ def train(
components = _load_pretrained_tok2vec(nlp, init_tok2vec)
msg.text("Loaded pretrained tok2vec for: {}".format(components))
- # Verify textcat config
- if "textcat" in pipeline:
- textcat_labels = nlp.get_pipe("textcat").cfg["labels"]
- if textcat_positive_label and textcat_positive_label not in textcat_labels:
- msg.fail(
- "The textcat_positive_label (tpl) '{}' does not match any "
- "label in the training data.".format(textcat_positive_label),
- exits=1,
- )
- if textcat_positive_label and len(textcat_labels) != 2:
- msg.fail(
- "A textcat_positive_label (tpl) '{}' was provided for training "
- "data that does not appear to be a binary classification "
- "problem with two labels.".format(textcat_positive_label),
- exits=1,
- )
- train_docs = corpus.train_docs(
- nlp, noise_level=noise_level, gold_preproc=gold_preproc, max_length=0
- )
- train_labels = set()
- if textcat_multilabel:
- multilabel_found = False
- for text, gold in train_docs:
- train_labels.update(gold.cats.keys())
- if list(gold.cats.values()).count(1.0) != 1:
- multilabel_found = True
- if not multilabel_found and not base_model:
- msg.warn(
- "The textcat training instances look like they have "
- "mutually-exclusive classes. Remove the flag "
- "'--textcat-multilabel' to train a classifier with "
- "mutually-exclusive classes."
- )
- if not textcat_multilabel:
- for text, gold in train_docs:
- train_labels.update(gold.cats.keys())
- if list(gold.cats.values()).count(1.0) != 1 and not base_model:
- msg.warn(
- "Some textcat training instances do not have exactly "
- "one positive label. Modifying training options to "
- "include the flag '--textcat-multilabel' for classes "
- "that are not mutually exclusive."
- )
- nlp.get_pipe("textcat").cfg["exclusive_classes"] = False
- textcat_multilabel = True
- break
- if base_model and set(textcat_labels) != train_labels:
- msg.fail(
- "Cannot extend textcat model using data with different "
- "labels. Base model labels: {}, training data labels: "
- "{}.".format(textcat_labels, list(train_labels)),
- exits=1,
- )
- if textcat_multilabel:
- msg.text(
- "Textcat evaluation score: ROC AUC score macro-averaged across "
- "the labels '{}'".format(", ".join(textcat_labels))
- )
- elif textcat_positive_label and len(textcat_labels) == 2:
- msg.text(
- "Textcat evaluation score: F1-score for the "
- "label '{}'".format(textcat_positive_label)
- )
- elif len(textcat_labels) > 1:
- if len(textcat_labels) == 2:
- msg.warn(
- "If the textcat component is a binary classifier with "
- "exclusive classes, provide '--textcat_positive_label' for "
- "an evaluation on the positive class."
- )
- msg.text(
- "Textcat evaluation score: F1-score macro-averaged across "
- "the labels '{}'".format(", ".join(textcat_labels))
- )
- else:
- msg.fail(
- "Unsupported textcat configuration. Use `spacy debug-data` "
- "for more information."
- )
-
# fmt: off
- row_head, output_stats = _configure_training_output(pipeline, use_gpu, has_beam_widths)
- row_widths = [len(w) for w in row_head]
+ row_head = ["Itn", "Dep Loss", "NER Loss", "UAS", "NER P", "NER R", "NER F", "Tag %", "Token %", "CPU WPS", "GPU WPS"]
+ row_widths = [3, 10, 10, 7, 7, 7, 7, 7, 7, 7, 7]
+ if has_beam_widths:
+ row_head.insert(1, "Beam W.")
+ row_widths.insert(1, 7)
row_settings = {"widths": row_widths, "aligns": tuple(["r" for i in row_head]), "spacing": 2}
# fmt: on
print("")
@@ -339,11 +243,7 @@ def train(
best_score = 0.0
for i in range(n_iter):
train_docs = corpus.train_docs(
- nlp,
- noise_level=noise_level,
- orth_variant_level=orth_variant_level,
- gold_preproc=gold_preproc,
- max_length=0,
+ nlp, noise_level=noise_level, gold_preproc=gold_preproc, max_length=0
)
if raw_text:
random.shuffle(raw_text)
@@ -386,7 +286,7 @@ def train(
)
nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
start_time = timer()
- scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose)
+ scorer = nlp_loaded.evaluate(dev_docs, debug)
end_time = timer()
if use_gpu < 0:
gpu_wps = None
@@ -402,7 +302,7 @@ def train(
corpus.dev_docs(nlp_loaded, gold_preproc=gold_preproc)
)
start_time = timer()
- scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose)
+ scorer = nlp_loaded.evaluate(dev_docs)
end_time = timer()
cpu_wps = nwords / (end_time - start_time)
acc_loc = output_path / ("model%d" % i) / "accuracy.json"
@@ -436,7 +336,6 @@ def train(
}
meta.setdefault("name", "model%d" % i)
meta.setdefault("version", version)
- meta["labels"] = nlp.meta["labels"]
meta_loc = output_path / ("model%d" % i) / "meta.json"
srsly.write_json(meta_loc, meta)
util.set_env_log(verbose)
@@ -445,19 +344,10 @@ def train(
i,
losses,
scorer.scores,
- output_stats,
beam_width=beam_width if has_beam_widths else None,
cpu_wps=cpu_wps,
gpu_wps=gpu_wps,
)
- if i == 0 and "textcat" in pipeline:
- textcats_per_cat = scorer.scores.get("textcats_per_cat", {})
- for cat, cat_score in textcats_per_cat.items():
- if cat_score.get("roc_auc_score", 0) < 0:
- msg.warn(
- "Textcat ROC AUC score is undefined due to "
- "only one value in label '{}'.".format(cat)
- )
msg.row(progress, **row_settings)
# Early stopping
if n_early_stopping is not None:
@@ -498,8 +388,6 @@ def _score_for_model(meta):
mean_acc.append((acc["uas"] + acc["las"]) / 2)
if "ner" in pipes:
mean_acc.append((acc["ents_p"] + acc["ents_r"] + acc["ents_f"]) / 3)
- if "textcat" in pipes:
- mean_acc.append(acc["textcat_score"])
return sum(mean_acc) / len(mean_acc)
@@ -583,55 +471,40 @@ def _get_metrics(component):
return ("token_acc",)
-def _configure_training_output(pipeline, use_gpu, has_beam_widths):
- row_head = ["Itn"]
- output_stats = []
- for pipe in pipeline:
- if pipe == "tagger":
- row_head.extend(["Tag Loss ", " Tag % "])
- output_stats.extend(["tag_loss", "tags_acc"])
- elif pipe == "parser":
- row_head.extend(["Dep Loss ", " UAS ", " LAS "])
- output_stats.extend(["dep_loss", "uas", "las"])
- elif pipe == "ner":
- row_head.extend(["NER Loss ", "NER P ", "NER R ", "NER F "])
- output_stats.extend(["ner_loss", "ents_p", "ents_r", "ents_f"])
- elif pipe == "textcat":
- row_head.extend(["Textcat Loss", "Textcat"])
- output_stats.extend(["textcat_loss", "textcat_score"])
- row_head.extend(["Token %", "CPU WPS"])
- output_stats.extend(["token_acc", "cpu_wps"])
-
- if use_gpu >= 0:
- row_head.extend(["GPU WPS"])
- output_stats.extend(["gpu_wps"])
-
- if has_beam_widths:
- row_head.insert(1, "Beam W.")
- return row_head, output_stats
-
-
-def _get_progress(
- itn, losses, dev_scores, output_stats, beam_width=None, cpu_wps=0.0, gpu_wps=0.0
-):
+def _get_progress(itn, losses, dev_scores, beam_width=None, cpu_wps=0.0, gpu_wps=0.0):
scores = {}
- for stat in output_stats:
- scores[stat] = 0.0
+ for col in [
+ "dep_loss",
+ "tag_loss",
+ "uas",
+ "tags_acc",
+ "token_acc",
+ "ents_p",
+ "ents_r",
+ "ents_f",
+ "cpu_wps",
+ "gpu_wps",
+ ]:
+ scores[col] = 0.0
scores["dep_loss"] = losses.get("parser", 0.0)
scores["ner_loss"] = losses.get("ner", 0.0)
scores["tag_loss"] = losses.get("tagger", 0.0)
- scores["textcat_loss"] = losses.get("textcat", 0.0)
+ scores.update(dev_scores)
scores["cpu_wps"] = cpu_wps
scores["gpu_wps"] = gpu_wps or 0.0
- scores.update(dev_scores)
- formatted_scores = []
- for stat in output_stats:
- format_spec = "{:.3f}"
- if stat.endswith("_wps"):
- format_spec = "{:.0f}"
- formatted_scores.append(format_spec.format(scores[stat]))
- result = [itn + 1]
- result.extend(formatted_scores)
+ result = [
+ itn,
+ "{:.3f}".format(scores["dep_loss"]),
+ "{:.3f}".format(scores["ner_loss"]),
+ "{:.3f}".format(scores["uas"]),
+ "{:.3f}".format(scores["ents_p"]),
+ "{:.3f}".format(scores["ents_r"]),
+ "{:.3f}".format(scores["ents_f"]),
+ "{:.3f}".format(scores["tags_acc"]),
+ "{:.3f}".format(scores["token_acc"]),
+ "{:.0f}".format(scores["cpu_wps"]),
+ "{:.0f}".format(scores["gpu_wps"]),
+ ]
if beam_width is not None:
result.insert(1, beam_width)
return result
diff --git a/spacy/errors.py b/spacy/errors.py
index 30c7a5f48..587a6e700 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -84,10 +84,6 @@ class Warnings(object):
W018 = ("Entity '{entity}' already exists in the Knowledge base.")
W019 = ("Changing vectors name from {old} to {new}, to avoid clash with "
"previously loaded vectors. See Issue #3853.")
- W020 = ("Unnamed vectors. This won't allow multiple vectors models to be "
- "loaded. (Shape: {shape})")
- W021 = ("Unexpected hash collision in PhraseMatcher. Matches may be "
- "incorrect. Modify PhraseMatcher._terminal_hash to fix.")
@add_codes
@@ -122,7 +118,7 @@ class Errors(object):
E011 = ("Unknown operator: '{op}'. Options: {opts}")
E012 = ("Cannot add pattern for zero tokens to matcher.\nKey: {key}")
E013 = ("Error selecting action in matcher")
- E014 = ("Unknown tag ID: {tag}")
+ E014 = ("Uknown tag ID: {tag}")
E015 = ("Conflicting morphology exception for ({tag}, {orth}). Use "
"`force=True` to overwrite.")
E016 = ("MultitaskObjective target should be function or one of: dep, "
@@ -461,25 +457,6 @@ class Errors(object):
E160 = ("Can't find language data file: {path}")
E161 = ("Found an internal inconsistency when predicting entity links. "
"This is likely a bug in spaCy, so feel free to open an issue.")
- E162 = ("Cannot evaluate textcat model on data with different labels.\n"
- "Labels in model: {model_labels}\nLabels in evaluation "
- "data: {eval_labels}")
- E163 = ("cumsum was found to be unstable: its last element does not "
- "correspond to sum")
- E164 = ("x is neither increasing nor decreasing: {}.")
- E165 = ("Only one class present in y_true. ROC AUC score is not defined in "
- "that case.")
- E166 = ("Can only merge DocBins with the same pre-defined attributes.\n"
- "Current DocBin: {current}\nOther DocBin: {other}")
- E167 = ("Unknown morphological feature: '{feat}' ({feat_id}). This can "
- "happen if the tagger was trained with a different set of "
- "morphological features. If you're using a pre-trained model, make "
- "sure that your models are up to date:\npython -m spacy validate")
- E168 = ("Unknown field: {field}")
- E169 = ("Can't find module: {module}")
- E170 = ("Cannot apply transition {name}: invalid for the current state.")
- E171 = ("Matcher.add received invalid on_match callback argument: expected "
- "callable or None, but got: {arg_type}")
@add_codes
diff --git a/spacy/glossary.py b/spacy/glossary.py
index 52abc7bb5..ff38e7138 100644
--- a/spacy/glossary.py
+++ b/spacy/glossary.py
@@ -307,10 +307,4 @@ GLOSSARY = {
# https://pdfs.semanticscholar.org/5744/578cc243d92287f47448870bb426c66cc941.pdf
"PER": "Named person or family.",
"MISC": "Miscellaneous entities, e.g. events, nationalities, products or works of art",
- # https://github.com/ltgoslo/norne
- "EVT": "Festivals, cultural events, sports events, weather phenomena, wars, etc.",
- "PROD": "Product, i.e. artificially produced entities including speeches, radio shows, programming languages, contracts, laws and ideas",
- "DRV": "Words (and phrases?) that are dervied from a name, but not a name in themselves, e.g. 'Oslo-mannen' ('the man from Oslo')",
- "GPE_LOC": "Geo-political entity, with a locative sense, e.g. 'John lives in Spain'",
- "GPE_ORG": "Geo-political entity, with an organisation sense, e.g. 'Spain declined to meet with Belgium'",
}
diff --git a/spacy/gold.pxd b/spacy/gold.pxd
index 20a25a939..a3123f7fa 100644
--- a/spacy/gold.pxd
+++ b/spacy/gold.pxd
@@ -24,7 +24,6 @@ cdef class GoldParse:
cdef public int loss
cdef public list words
cdef public list tags
- cdef public list morphology
cdef public list heads
cdef public list labels
cdef public dict orths
diff --git a/spacy/gold.pyx b/spacy/gold.pyx
index 4cc44f757..f6ec8d3fa 100644
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -7,7 +7,6 @@ import random
import numpy
import tempfile
import shutil
-import itertools
from pathlib import Path
import srsly
@@ -57,7 +56,6 @@ def tags_to_entities(tags):
def merge_sents(sents):
m_deps = [[], [], [], [], [], []]
m_brackets = []
- m_cats = sents.pop()
i = 0
for (ids, words, tags, heads, labels, ner), brackets in sents:
m_deps[0].extend(id_ + i for id_ in ids)
@@ -69,7 +67,6 @@ def merge_sents(sents):
m_brackets.extend((b["first"] + i, b["last"] + i, b["label"])
for b in brackets)
i += len(ids)
- m_deps.append(m_cats)
return [(m_deps, m_brackets)]
@@ -201,7 +198,6 @@ class GoldCorpus(object):
n = 0
i = 0
for raw_text, paragraph_tuples in self.train_tuples:
- cats = paragraph_tuples.pop()
for sent_tuples, brackets in paragraph_tuples:
n += len(sent_tuples[1])
if self.limit and i >= self.limit:
@@ -210,14 +206,13 @@ class GoldCorpus(object):
return n
def train_docs(self, nlp, gold_preproc=False, max_length=None,
- noise_level=0.0, orth_variant_level=0.0):
+ noise_level=0.0):
locs = list((self.tmp_dir / 'train').iterdir())
random.shuffle(locs)
train_tuples = self.read_tuples(locs, limit=self.limit)
gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc,
max_length=max_length,
noise_level=noise_level,
- orth_variant_level=orth_variant_level,
make_projective=True)
yield from gold_docs
@@ -231,132 +226,43 @@ class GoldCorpus(object):
@classmethod
def iter_gold_docs(cls, nlp, tuples, gold_preproc, max_length=None,
- noise_level=0.0, orth_variant_level=0.0, make_projective=False):
+ noise_level=0.0, make_projective=False):
for raw_text, paragraph_tuples in tuples:
if gold_preproc:
raw_text = None
else:
paragraph_tuples = merge_sents(paragraph_tuples)
- docs, paragraph_tuples = cls._make_docs(nlp, raw_text,
- paragraph_tuples, gold_preproc, noise_level=noise_level,
- orth_variant_level=orth_variant_level)
+ docs = cls._make_docs(nlp, raw_text, paragraph_tuples, gold_preproc,
+ noise_level=noise_level)
golds = cls._make_golds(docs, paragraph_tuples, make_projective)
for doc, gold in zip(docs, golds):
if (not max_length) or len(doc) < max_length:
yield doc, gold
@classmethod
- def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc, noise_level=0.0, orth_variant_level=0.0):
+ def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc, noise_level=0.0):
if raw_text is not None:
- raw_text, paragraph_tuples = make_orth_variants(nlp, raw_text, paragraph_tuples, orth_variant_level=orth_variant_level)
raw_text = add_noise(raw_text, noise_level)
- return [nlp.make_doc(raw_text)], paragraph_tuples
+ return [nlp.make_doc(raw_text)]
else:
- docs = []
- raw_text, paragraph_tuples = make_orth_variants(nlp, None, paragraph_tuples, orth_variant_level=orth_variant_level)
return [Doc(nlp.vocab, words=add_noise(sent_tuples[1], noise_level))
- for (sent_tuples, brackets) in paragraph_tuples], paragraph_tuples
-
+ for (sent_tuples, brackets) in paragraph_tuples]
@classmethod
def _make_golds(cls, docs, paragraph_tuples, make_projective):
if len(docs) != len(paragraph_tuples):
n_annots = len(paragraph_tuples)
raise ValueError(Errors.E070.format(n_docs=len(docs), n_annots=n_annots))
- return [GoldParse.from_annot_tuples(doc, sent_tuples,
+ if len(docs) == 1:
+ return [GoldParse.from_annot_tuples(docs[0], paragraph_tuples[0][0],
+ make_projective=make_projective)]
+ else:
+ return [GoldParse.from_annot_tuples(doc, sent_tuples,
make_projective=make_projective)
for doc, (sent_tuples, brackets)
in zip(docs, paragraph_tuples)]
-def make_orth_variants(nlp, raw, paragraph_tuples, orth_variant_level=0.0):
- if random.random() >= orth_variant_level:
- return raw, paragraph_tuples
- if random.random() >= 0.5:
- lower = True
- if raw is not None:
- raw = raw.lower()
- ndsv = nlp.Defaults.single_orth_variants
- ndpv = nlp.Defaults.paired_orth_variants
- # modify words in paragraph_tuples
- variant_paragraph_tuples = []
- for sent_tuples, brackets in paragraph_tuples:
- ids, words, tags, heads, labels, ner, cats = sent_tuples
- if lower:
- words = [w.lower() for w in words]
- # single variants
- punct_choices = [random.choice(x["variants"]) for x in ndsv]
- for word_idx in range(len(words)):
- for punct_idx in range(len(ndsv)):
- if tags[word_idx] in ndsv[punct_idx]["tags"] \
- and words[word_idx] in ndsv[punct_idx]["variants"]:
- words[word_idx] = punct_choices[punct_idx]
- # paired variants
- punct_choices = [random.choice(x["variants"]) for x in ndpv]
- for word_idx in range(len(words)):
- for punct_idx in range(len(ndpv)):
- if tags[word_idx] in ndpv[punct_idx]["tags"] \
- and words[word_idx] in itertools.chain.from_iterable(ndpv[punct_idx]["variants"]):
- # backup option: random left vs. right from pair
- pair_idx = random.choice([0, 1])
- # best option: rely on paired POS tags like `` / ''
- if len(ndpv[punct_idx]["tags"]) == 2:
- pair_idx = ndpv[punct_idx]["tags"].index(tags[word_idx])
- # next best option: rely on position in variants
- # (may not be unambiguous, so order of variants matters)
- else:
- for pair in ndpv[punct_idx]["variants"]:
- if words[word_idx] in pair:
- pair_idx = pair.index(words[word_idx])
- words[word_idx] = punct_choices[punct_idx][pair_idx]
-
- variant_paragraph_tuples.append(((ids, words, tags, heads, labels, ner, cats), brackets))
- # modify raw to match variant_paragraph_tuples
- if raw is not None:
- variants = []
- for single_variants in ndsv:
- variants.extend(single_variants["variants"])
- for paired_variants in ndpv:
- variants.extend(list(itertools.chain.from_iterable(paired_variants["variants"])))
- # store variants in reverse length order to be able to prioritize
- # longer matches (e.g., "---" before "--")
- variants = sorted(variants, key=lambda x: len(x))
- variants.reverse()
- variant_raw = ""
- raw_idx = 0
- # add initial whitespace
- while raw_idx < len(raw) and re.match("\s", raw[raw_idx]):
- variant_raw += raw[raw_idx]
- raw_idx += 1
- for sent_tuples, brackets in variant_paragraph_tuples:
- ids, words, tags, heads, labels, ner, cats = sent_tuples
- for word in words:
- match_found = False
- # add identical word
- if word not in variants and raw[raw_idx:].startswith(word):
- variant_raw += word
- raw_idx += len(word)
- match_found = True
- # add variant word
- else:
- for variant in variants:
- if not match_found and \
- raw[raw_idx:].startswith(variant):
- raw_idx += len(variant)
- variant_raw += word
- match_found = True
- # something went wrong, abort
- # (add a warning message?)
- if not match_found:
- return raw, paragraph_tuples
- # add following whitespace
- while raw_idx < len(raw) and re.match("\s", raw[raw_idx]):
- variant_raw += raw[raw_idx]
- raw_idx += 1
- return variant_raw, variant_paragraph_tuples
- return raw, variant_paragraph_tuples
-
-
def add_noise(orig, noise_level):
if random.random() >= noise_level:
return orig
@@ -371,8 +277,12 @@ def add_noise(orig, noise_level):
def _corrupt(c, noise_level):
if random.random() >= noise_level:
return c
- elif c in [".", "'", "!", "?", ","]:
+ elif c == " ":
return "\n"
+ elif c == "\n":
+ return " "
+ elif c in [".", "'", "!", "?", ","]:
+ return ""
else:
return c.lower()
@@ -420,10 +330,6 @@ def json_to_tuple(doc):
sents.append([
[ids, words, tags, heads, labels, ner],
sent.get("brackets", [])])
- cats = {}
- for cat in paragraph.get("cats", {}):
- cats[cat["label"]] = cat["value"]
- sents.append(cats)
if sents:
yield [paragraph.get("raw", None), sents]
@@ -537,12 +443,11 @@ cdef class GoldParse:
"""
@classmethod
def from_annot_tuples(cls, doc, annot_tuples, make_projective=False):
- _, words, tags, heads, deps, entities, cats = annot_tuples
+ _, words, tags, heads, deps, entities = annot_tuples
return cls(doc, words=words, tags=tags, heads=heads, deps=deps,
- entities=entities, cats=cats,
- make_projective=make_projective)
+ entities=entities, make_projective=make_projective)
- def __init__(self, doc, annot_tuples=None, words=None, tags=None, morphology=None,
+ def __init__(self, doc, annot_tuples=None, words=None, tags=None,
heads=None, deps=None, entities=None, make_projective=False,
cats=None, links=None, **_):
"""Create a GoldParse.
@@ -577,13 +482,11 @@ cdef class GoldParse:
if words is None:
words = [token.text for token in doc]
if tags is None:
- tags = [None for _ in words]
+ tags = [None for _ in doc]
if heads is None:
- heads = [None for _ in words]
+ heads = [None for token in doc]
if deps is None:
- deps = [None for _ in words]
- if morphology is None:
- morphology = [None for _ in words]
+ deps = [None for _ in doc]
if entities is None:
entities = ["-" for _ in doc]
elif len(entities) == 0:
@@ -595,6 +498,7 @@ cdef class GoldParse:
if not isinstance(entities[0], basestring):
# Assume we have entities specified by character offset.
entities = biluo_tags_from_offsets(doc, entities)
+
self.mem = Pool()
self.loss = 0
self.length = len(doc)
@@ -614,7 +518,6 @@ cdef class GoldParse:
self.heads = [None] * len(doc)
self.labels = [None] * len(doc)
self.ner = [None] * len(doc)
- self.morphology = [None] * len(doc)
# This needs to be done before we align the words
if make_projective and heads is not None and deps is not None:
@@ -641,13 +544,11 @@ cdef class GoldParse:
self.tags[i] = "_SP"
self.heads[i] = None
self.labels[i] = None
- self.ner[i] = None
- self.morphology[i] = set()
+ self.ner[i] = "O"
if gold_i is None:
if i in i2j_multi:
self.words[i] = words[i2j_multi[i]]
self.tags[i] = tags[i2j_multi[i]]
- self.morphology[i] = morphology[i2j_multi[i]]
is_last = i2j_multi[i] != i2j_multi.get(i+1)
is_first = i2j_multi[i] != i2j_multi.get(i-1)
# Set next word in multi-token span as head, until last
@@ -684,7 +585,6 @@ cdef class GoldParse:
else:
self.words[i] = words[gold_i]
self.tags[i] = tags[gold_i]
- self.morphology[i] = morphology[gold_i]
if heads[gold_i] is None:
self.heads[i] = None
else:
@@ -692,20 +592,9 @@ cdef class GoldParse:
self.labels[i] = deps[gold_i]
self.ner[i] = entities[gold_i]
- # Prevent whitespace that isn't within entities from being tagged as
- # an entity.
- for i in range(len(self.ner)):
- if self.tags[i] == "_SP":
- prev_ner = self.ner[i-1] if i >= 1 else None
- next_ner = self.ner[i+1] if (i+1) < len(self.ner) else None
- if prev_ner == "O" or next_ner == "O":
- self.ner[i] = "O"
-
cycle = nonproj.contains_cycle(self.heads)
if cycle is not None:
- raise ValueError(Errors.E069.format(cycle=cycle,
- cycle_tokens=" ".join(["'{}'".format(self.words[tok_id]) for tok_id in cycle]),
- doc_tokens=" ".join(words[:50])))
+ raise ValueError(Errors.E069.format(cycle=cycle, cycle_tokens=" ".join(["'{}'".format(self.words[tok_id]) for tok_id in cycle]), doc_tokens=" ".join(words[:50])))
def __len__(self):
"""Get the number of gold-standard tokens.
@@ -749,10 +638,7 @@ def docs_to_json(docs, id=0):
docs = [docs]
json_doc = {"id": id, "paragraphs": []}
for i, doc in enumerate(docs):
- json_para = {'raw': doc.text, "sentences": [], "cats": []}
- for cat, val in doc.cats.items():
- json_cat = {"label": cat, "value": val}
- json_para["cats"].append(json_cat)
+ json_para = {'raw': doc.text, "sentences": []}
ent_offsets = [(e.start_char, e.end_char, e.label_) for e in doc.ents]
biluo_tags = biluo_tags_from_offsets(doc, ent_offsets)
for j, sent in enumerate(doc.sents):
diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index 6cbc06e2c..176ac17de 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -24,7 +24,7 @@ cdef class Candidate:
algorithm which will disambiguate the various candidates to the correct one.
Each candidate (alias, entity) pair is assigned to a certain prior probability.
- DOCS: https://spacy.io/api/kb/#candidate_init
+ DOCS: https://spacy.io/api/candidate
"""
def __init__(self, KnowledgeBase kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob):
diff --git a/spacy/lang/char_classes.py b/spacy/lang/char_classes.py
index cb5b50ffc..131bdcd51 100644
--- a/spacy/lang/char_classes.py
+++ b/spacy/lang/char_classes.py
@@ -201,9 +201,7 @@ _ukrainian = r"а-щюяіїєґА-ЩЮЯІЇЄҐ"
_upper = LATIN_UPPER + _russian_upper + _tatar_upper + _greek_upper + _ukrainian_upper
_lower = LATIN_LOWER + _russian_lower + _tatar_lower + _greek_lower + _ukrainian_lower
-_uncased = (
- _bengali + _hebrew + _persian + _sinhala + _hindi + _kannada + _tamil + _telugu
-)
+_uncased = _bengali + _hebrew + _persian + _sinhala + _hindi + _kannada + _tamil + _telugu
ALPHA = group_chars(LATIN + _russian + _tatar + _greek + _ukrainian + _uncased)
ALPHA_LOWER = group_chars(_lower + _uncased)
diff --git a/spacy/lang/de/__init__.py b/spacy/lang/de/__init__.py
index b96069235..1b5aee6a8 100644
--- a/spacy/lang/de/__init__.py
+++ b/spacy/lang/de/__init__.py
@@ -27,20 +27,6 @@ class GermanDefaults(Language.Defaults):
stop_words = STOP_WORDS
syntax_iterators = SYNTAX_ITERATORS
resources = {"lemma_lookup": "lemma_lookup.json"}
- single_orth_variants = [
- {"tags": ["$("], "variants": ["…", "..."]},
- {"tags": ["$("], "variants": ["-", "—", "–", "--", "---", "——"]},
- ]
- paired_orth_variants = [
- {
- "tags": ["$("],
- "variants": [("'", "'"), (",", "'"), ("‚", "‘"), ("›", "‹"), ("‹", "›")],
- },
- {
- "tags": ["$("],
- "variants": [("``", "''"), ('"', '"'), ("„", "“"), ("»", "«"), ("«", "»")],
- },
- ]
class German(Language):
diff --git a/spacy/lang/de/tag_map.py b/spacy/lang/de/tag_map.py
index 394478145..3bb6247c4 100644
--- a/spacy/lang/de/tag_map.py
+++ b/spacy/lang/de/tag_map.py
@@ -10,7 +10,7 @@ TAG_MAP = {
"$,": {POS: PUNCT, "PunctType": "comm"},
"$.": {POS: PUNCT, "PunctType": "peri"},
"ADJA": {POS: ADJ},
- "ADJD": {POS: ADJ},
+ "ADJD": {POS: ADJ, "Variant": "short"},
"ADV": {POS: ADV},
"APPO": {POS: ADP, "AdpType": "post"},
"APPR": {POS: ADP, "AdpType": "prep"},
@@ -32,7 +32,7 @@ TAG_MAP = {
"PDAT": {POS: DET, "PronType": "dem"},
"PDS": {POS: PRON, "PronType": "dem"},
"PIAT": {POS: DET, "PronType": "ind|neg|tot"},
- "PIDAT": {POS: DET, "PronType": "ind|neg|tot"},
+ "PIDAT": {POS: DET, "AdjType": "pdt", "PronType": "ind|neg|tot"},
"PIS": {POS: PRON, "PronType": "ind|neg|tot"},
"PPER": {POS: PRON, "PronType": "prs"},
"PPOSAT": {POS: DET, "Poss": "yes", "PronType": "prs"},
@@ -42,7 +42,7 @@ TAG_MAP = {
"PRF": {POS: PRON, "PronType": "prs", "Reflex": "yes"},
"PTKA": {POS: PART},
"PTKANT": {POS: PART, "PartType": "res"},
- "PTKNEG": {POS: PART, "Polarity": "neg"},
+ "PTKNEG": {POS: PART, "Polarity": "Neg"},
"PTKVZ": {POS: PART, "PartType": "vbp"},
"PTKZU": {POS: PART, "PartType": "inf"},
"PWAT": {POS: DET, "PronType": "int"},
diff --git a/spacy/lang/el/lemmatizer/__init__.py b/spacy/lang/el/lemmatizer/__init__.py
index 994bf9c16..c0ce5c2ad 100644
--- a/spacy/lang/el/lemmatizer/__init__.py
+++ b/spacy/lang/el/lemmatizer/__init__.py
@@ -46,10 +46,9 @@ class GreekLemmatizer(object):
)
return lemmas
- def lookup(self, string, orth=None):
- key = orth if orth is not None else string
- if key in self.lookup_table:
- return self.lookup_table[key]
+ def lookup(self, string):
+ if string in self.lookup_table:
+ return self.lookup_table[string]
return string
diff --git a/spacy/lang/en/__init__.py b/spacy/lang/en/__init__.py
index e4c745c83..7d00c749c 100644
--- a/spacy/lang/en/__init__.py
+++ b/spacy/lang/en/__init__.py
@@ -38,14 +38,6 @@ class EnglishDefaults(Language.Defaults):
"lemma_index": "lemmatizer/lemma_index.json",
"lemma_exc": "lemmatizer/lemma_exc.json",
}
- single_orth_variants = [
- {"tags": ["NFP"], "variants": ["…", "..."]},
- {"tags": [":"], "variants": ["-", "—", "–", "--", "---", "——"]},
- ]
- paired_orth_variants = [
- {"tags": ["``", "''"], "variants": [("'", "'"), ("‘", "’")]},
- {"tags": ["``", "''"], "variants": [('"', '"'), ("“", "”")]},
- ]
class English(Language):
diff --git a/spacy/lang/en/lemmatizer/lemma_lookup.json b/spacy/lang/en/lemmatizer/lemma_lookup.json
index 15d41e4ba..d0f92c37c 100644
--- a/spacy/lang/en/lemmatizer/lemma_lookup.json
+++ b/spacy/lang/en/lemmatizer/lemma_lookup.json
@@ -20574,7 +20574,7 @@
"lengthier": "lengthy",
"lengthiest": "lengthy",
"lengths": "length",
- "lenses": "lens",
+ "lenses": "lense",
"lent": "lend",
"lenticels": "lenticel",
"lentils": "lentil",
diff --git a/spacy/lang/en/morph_rules.py b/spacy/lang/en/morph_rules.py
index 5ed4eac59..198182ff0 100644
--- a/spacy/lang/en/morph_rules.py
+++ b/spacy/lang/en/morph_rules.py
@@ -3,59 +3,55 @@ from __future__ import unicode_literals
from ...symbols import LEMMA, PRON_LEMMA
-# Several entries here look pretty suspicious. These will get the POS SCONJ
-# given the tag IN, when an adpositional reading seems much more likely for
-# a lot of these prepositions. I'm not sure what I was running in 04395ffa4
-# when I did this? It doesn't seem right.
_subordinating_conjunctions = [
"that",
"if",
"as",
"because",
- # "of",
- # "for",
- # "before",
- # "in",
+ "of",
+ "for",
+ "before",
+ "in",
"while",
- # "after",
+ "after",
"since",
"like",
- # "with",
+ "with",
"so",
- # "to",
- # "by",
- # "on",
- # "about",
+ "to",
+ "by",
+ "on",
+ "about",
"than",
"whether",
"although",
- # "from",
+ "from",
"though",
- # "until",
+ "until",
"unless",
"once",
- # "without",
- # "at",
- # "into",
+ "without",
+ "at",
+ "into",
"cause",
- # "over",
+ "over",
"upon",
"till",
"whereas",
- # "beyond",
+ "beyond",
"whilst",
"except",
"despite",
"wether",
- # "then",
+ "then",
"but",
"becuse",
"whie",
- # "below",
- # "against",
+ "below",
+ "against",
"it",
"w/out",
- # "toward",
+ "toward",
"albeit",
"save",
"besides",
@@ -67,17 +63,16 @@ _subordinating_conjunctions = [
"out",
"near",
"seince",
- # "towards",
+ "towards",
"tho",
"sice",
"will",
]
-# This seems kind of wrong too?
-# _relative_pronouns = ["this", "that", "those", "these"]
+_relative_pronouns = ["this", "that", "those", "these"]
MORPH_RULES = {
- # "DT": {word: {"POS": "PRON"} for word in _relative_pronouns},
+ "DT": {word: {"POS": "PRON"} for word in _relative_pronouns},
"IN": {word: {"POS": "SCONJ"} for word in _subordinating_conjunctions},
"NN": {
"something": {"POS": "PRON"},
diff --git a/spacy/lang/en/tag_map.py b/spacy/lang/en/tag_map.py
index 9bd884a3a..246258f57 100644
--- a/spacy/lang/en/tag_map.py
+++ b/spacy/lang/en/tag_map.py
@@ -14,10 +14,10 @@ TAG_MAP = {
'""': {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"},
"''": {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"},
":": {POS: PUNCT},
- "$": {POS: SYM},
- "#": {POS: SYM},
- "AFX": {POS: ADJ, "Hyph": "yes"},
- "CC": {POS: CCONJ, "ConjType": "comp"},
+ "$": {POS: SYM, "Other": {"SymType": "currency"}},
+ "#": {POS: SYM, "Other": {"SymType": "numbersign"}},
+ "AFX": {POS: X, "Hyph": "yes"},
+ "CC": {POS: CCONJ, "ConjType": "coor"},
"CD": {POS: NUM, "NumType": "card"},
"DT": {POS: DET},
"EX": {POS: PRON, "AdvType": "ex"},
@@ -34,7 +34,7 @@ TAG_MAP = {
"NNP": {POS: PROPN, "NounType": "prop", "Number": "sing"},
"NNPS": {POS: PROPN, "NounType": "prop", "Number": "plur"},
"NNS": {POS: NOUN, "Number": "plur"},
- "PDT": {POS: DET},
+ "PDT": {POS: DET, "AdjType": "pdt", "PronType": "prn"},
"POS": {POS: PART, "Poss": "yes"},
"PRP": {POS: PRON, "PronType": "prs"},
"PRP$": {POS: PRON, "PronType": "prs", "Poss": "yes"},
@@ -56,12 +56,12 @@ TAG_MAP = {
"VerbForm": "fin",
"Tense": "pres",
"Number": "sing",
- "Person": "three",
+ "Person": 3,
},
- "WDT": {POS: PRON},
- "WP": {POS: PRON},
- "WP$": {POS: PRON, "Poss": "yes"},
- "WRB": {POS: ADV},
+ "WDT": {POS: PRON, "PronType": "int|rel"},
+ "WP": {POS: PRON, "PronType": "int|rel"},
+ "WP$": {POS: PRON, "Poss": "yes", "PronType": "int|rel"},
+ "WRB": {POS: ADV, "PronType": "int|rel"},
"ADD": {POS: X},
"NFP": {POS: PUNCT},
"GW": {POS: X},
diff --git a/spacy/lang/en/tokenizer_exceptions.py b/spacy/lang/en/tokenizer_exceptions.py
index c45197771..21e664f7f 100644
--- a/spacy/lang/en/tokenizer_exceptions.py
+++ b/spacy/lang/en/tokenizer_exceptions.py
@@ -30,7 +30,14 @@ for pron in ["i"]:
for orth in [pron, pron.title()]:
_exc[orth + "'m"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
- {ORTH: "'m", LEMMA: "be", NORM: "am", TAG: "VBP"},
+ {
+ ORTH: "'m",
+ LEMMA: "be",
+ NORM: "am",
+ TAG: "VBP",
+ "tenspect": 1,
+ "number": 1,
+ },
]
_exc[orth + "m"] = [
diff --git a/spacy/lang/fr/lemmatizer/__init__.py b/spacy/lang/fr/lemmatizer/__init__.py
index dfd822188..a0a0d2021 100644
--- a/spacy/lang/fr/lemmatizer/__init__.py
+++ b/spacy/lang/fr/lemmatizer/__init__.py
@@ -114,9 +114,9 @@ class FrenchLemmatizer(object):
def punct(self, string, morphology=None):
return self(string, "punct", morphology)
- def lookup(self, string, orth=None):
- if orth is not None and orth in self.lookup_table:
- return self.lookup_table[orth][0]
+ def lookup(self, string):
+ if string in self.lookup_table:
+ return self.lookup_table[string][0]
return string
diff --git a/spacy/lang/hi/stop_words.py b/spacy/lang/hi/stop_words.py
index efad18c84..430a18a22 100644
--- a/spacy/lang/hi/stop_words.py
+++ b/spacy/lang/hi/stop_words.py
@@ -2,8 +2,7 @@
from __future__ import unicode_literals
-# Source: https://github.com/taranjeet/hindi-tokenizer/blob/master/stopwords.txt, https://data.mendeley.com/datasets/bsr3frvvjc/1#file-a21d5092-99d7-45d8-b044-3ae9edd391c6
-
+# Source: https://github.com/taranjeet/hindi-tokenizer/blob/master/stopwords.txt
STOP_WORDS = set(
"""
अंदर
@@ -19,7 +18,6 @@ STOP_WORDS = set(
अंदर
आदि
आप
-अगर
इंहिं
इंहें
इंहों
@@ -173,9 +171,6 @@ STOP_WORDS = set(
मानो
मे
में
-मैं
-मुझको
-मेरा
यदि
यह
यहाँ
@@ -232,7 +227,6 @@ STOP_WORDS = set(
है
हैं
हो
-हूँ
होता
होति
होती
diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py
index 056a6893b..3a6074bba 100644
--- a/spacy/lang/ja/__init__.py
+++ b/spacy/lang/ja/__init__.py
@@ -37,11 +37,6 @@ def resolve_pos(token):
in the sentence. This function adds information to the POS tag to
resolve ambiguous mappings.
"""
-
- # this is only used for consecutive ascii spaces
- if token.pos == "空白":
- return "空白"
-
# TODO: This is a first take. The rules here are crude approximations.
# For many of these, full dependencies are needed to properly resolve
# PoS mappings.
@@ -59,7 +54,6 @@ def detailed_tokens(tokenizer, text):
node = tokenizer.parseToNode(text)
node = node.next # first node is beginning of sentence and empty, skip it
words = []
- spaces = []
while node.posid != 0:
surface = node.surface
base = surface # a default value. Updated if available later.
@@ -70,20 +64,8 @@ def detailed_tokens(tokenizer, text):
# dictionary
base = parts[7]
words.append(ShortUnitWord(surface, base, pos))
-
- # The way MeCab stores spaces is that the rlength of the next token is
- # the length of that token plus any preceding whitespace, **in bytes**.
- # also note that this is only for half-width / ascii spaces. Full width
- # spaces just become tokens.
- scount = node.next.rlength - node.next.length
- spaces.append(bool(scount))
- while scount > 1:
- words.append(ShortUnitWord(" ", " ", "空白"))
- spaces.append(False)
- scount -= 1
-
node = node.next
- return words, spaces
+ return words
class JapaneseTokenizer(DummyTokenizer):
@@ -93,8 +75,9 @@ class JapaneseTokenizer(DummyTokenizer):
self.tokenizer.parseToNode("") # see #2901
def __call__(self, text):
- dtokens, spaces = detailed_tokens(self.tokenizer, text)
+ dtokens = detailed_tokens(self.tokenizer, text)
words = [x.surface for x in dtokens]
+ spaces = [False] * len(words)
doc = Doc(self.vocab, words=words, spaces=spaces)
mecab_tags = []
for token, dtoken in zip(doc, dtokens):
diff --git a/spacy/lang/ja/tag_map.py b/spacy/lang/ja/tag_map.py
index 4ff0a35ee..6b114eb10 100644
--- a/spacy/lang/ja/tag_map.py
+++ b/spacy/lang/ja/tag_map.py
@@ -2,7 +2,7 @@
from __future__ import unicode_literals
from ...symbols import POS, PUNCT, INTJ, X, ADJ, AUX, ADP, PART, SCONJ, NOUN
-from ...symbols import SYM, PRON, VERB, ADV, PROPN, NUM, DET, SPACE
+from ...symbols import SYM, PRON, VERB, ADV, PROPN, NUM, DET
TAG_MAP = {
@@ -21,8 +21,6 @@ TAG_MAP = {
"感動詞,一般,*,*": {POS: INTJ},
# this is specifically for unicode full-width space
"空白,*,*,*": {POS: X},
- # This is used when sequential half-width spaces are present
- "空白": {POS: SPACE},
"形状詞,一般,*,*": {POS: ADJ},
"形状詞,タリ,*,*": {POS: ADJ},
"形状詞,助動詞語幹,*,*": {POS: ADJ},
diff --git a/spacy/lang/ko/__init__.py b/spacy/lang/ko/__init__.py
index ec79a95ab..c8cd9c3fd 100644
--- a/spacy/lang/ko/__init__.py
+++ b/spacy/lang/ko/__init__.py
@@ -1,6 +1,8 @@
# encoding: utf8
from __future__ import unicode_literals, print_function
+import sys
+
from .stop_words import STOP_WORDS
from .tag_map import TAG_MAP
from ...attrs import LANG
@@ -8,12 +10,35 @@ from ...language import Language
from ...tokens import Doc
from ...compat import copy_reg
from ...util import DummyTokenizer
+from ...compat import is_python3, is_python_pre_3_5
+
+is_python_post_3_7 = is_python3 and sys.version_info[1] >= 7
+
+# fmt: off
+if is_python_pre_3_5:
+ from collections import namedtuple
+ Morpheme = namedtuple("Morpheme", "surface lemma tag")
+elif is_python_post_3_7:
+ from dataclasses import dataclass
+
+ @dataclass(frozen=True)
+ class Morpheme:
+ surface: str
+ lemma: str
+ tag: str
+else:
+ from typing import NamedTuple
+
+ # Use the functional form: class-level variable annotations are not
+ # available on Python 3.5, so the class syntax would define no fields
+ Morpheme = NamedTuple("Morpheme", [("surface", str), ("lemma", str), ("tag", str)])
def try_mecab_import():
try:
from natto import MeCab
-
return MeCab
except ImportError:
raise ImportError(
@@ -21,8 +46,6 @@ def try_mecab_import():
"[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), "
"and [natto-py](https://github.com/buruzaemon/natto-py)"
)
-
-
# fmt: on
@@ -46,13 +69,13 @@ class KoreanTokenizer(DummyTokenizer):
def __call__(self, text):
dtokens = list(self.detailed_tokens(text))
- surfaces = [dt["surface"] for dt in dtokens]
+ surfaces = [dt.surface for dt in dtokens]
doc = Doc(self.vocab, words=surfaces, spaces=list(check_spaces(text, surfaces)))
for token, dtoken in zip(doc, dtokens):
- first_tag, sep, eomi_tags = dtoken["tag"].partition("+")
+ first_tag, sep, eomi_tags = dtoken.tag.partition("+")
token.tag_ = first_tag # stem(어간) or pre-final(선어말 어미)
- token.lemma_ = dtoken["lemma"]
- doc.user_data["full_tags"] = [dt["tag"] for dt in dtokens]
+ token.lemma_ = dtoken.lemma
+ doc.user_data["full_tags"] = [dt.tag for dt in dtokens]
return doc
def detailed_tokens(self, text):
@@ -68,7 +91,7 @@ class KoreanTokenizer(DummyTokenizer):
lemma, _, remainder = expr.partition("/")
if lemma == "*":
lemma = surface
- yield {"surface": surface, "lemma": lemma, "tag": tag}
+ yield Morpheme(surface, lemma, tag)
class KoreanDefaults(Language.Defaults):
diff --git a/spacy/lang/lt/tag_map.py b/spacy/lang/lt/tag_map.py
index 6ea4f8ae0..eab231b2c 100644
--- a/spacy/lang/lt/tag_map.py
+++ b/spacy/lang/lt/tag_map.py
@@ -1605,7 +1605,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Imp",
"Number": "Plur",
- "Person": "one",
+ "Person": "1",
"Polarity": "Pos",
"VerbForm": "Fin",
},
@@ -1613,7 +1613,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Cnd",
"Number": "Plur",
- "Person": "one",
+ "Person": "1",
"Polarity": "Pos",
"VerbForm": "Fin",
},
@@ -1621,7 +1621,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Imp",
"Number": "Plur",
- "Person": "one",
+ "Person": "1",
"Polarity": "Pos",
"Reflex": "Yes",
"VerbForm": "Fin",
@@ -1630,7 +1630,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Imp",
"Number": "Plur",
- "Person": "one",
+ "Person": "1",
"Polarity": "Neg",
"VerbForm": "Fin",
},
@@ -1638,7 +1638,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Cnd",
"Number": "Plur",
- "Person": "one",
+ "Person": "1",
"Polarity": "Neg",
"Reflex": "Yes",
"VerbForm": "Fin",
@@ -1647,7 +1647,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Cnd",
"Number": "Sing",
- "Person": "one",
+ "Person": "1",
"Polarity": "Pos",
"VerbForm": "Fin",
},
@@ -1655,7 +1655,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Cnd",
"Number": "Sing",
- "Person": "one",
+ "Person": "1",
"Polarity": "Pos",
"Reflex": "Yes",
"VerbForm": "Fin",
@@ -1664,7 +1664,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Cnd",
"Number": "Sing",
- "Person": "one",
+ "Person": "1",
"Polarity": "Neg",
"VerbForm": "Fin",
},
@@ -1672,7 +1672,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Cnd",
"Number": "Sing",
- "Person": "one",
+ "Person": "1",
"Polarity": "Neg",
"Reflex": "Yes",
"VerbForm": "Fin",
@@ -1681,7 +1681,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Imp",
"Number": "Plur",
- "Person": "two",
+ "Person": "2",
"Polarity": "Pos",
"VerbForm": "Fin",
},
@@ -1689,7 +1689,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Cnd",
"Number": "Plur",
- "Person": "two",
+ "Person": "2",
"Polarity": "Pos",
"VerbForm": "Fin",
},
@@ -1697,7 +1697,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Imp",
"Number": "Plur",
- "Person": "two",
+ "Person": "2",
"Polarity": "Pos",
"Reflex": "Yes",
"VerbForm": "Fin",
@@ -1706,7 +1706,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Imp",
"Number": "Plur",
- "Person": "two",
+ "Person": "2",
"Polarity": "Neg",
"VerbForm": "Fin",
},
@@ -1714,7 +1714,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Imp",
"Number": "Plur",
- "Person": "two",
+ "Person": "2",
"Polarity": "Neg",
"Reflex": "Yes",
"VerbForm": "Fin",
@@ -1723,7 +1723,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Imp",
"Number": "Sing",
- "Person": "two",
+ "Person": "2",
"Polarity": "Pos",
"VerbForm": "Fin",
},
@@ -1731,7 +1731,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Cnd",
"Number": "Sing",
- "Person": "two",
+ "Person": "2",
"Polarity": "Pos",
"VerbForm": "Fin",
},
@@ -1739,7 +1739,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Imp",
"Number": "Sing",
- "Person": "two",
+ "Person": "2",
"Polarity": "Pos",
"Reflex": "Yes",
"VerbForm": "Fin",
@@ -1748,7 +1748,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Imp",
"Number": "Sing",
- "Person": "two",
+ "Person": "2",
"Polarity": "Neg",
"VerbForm": "Fin",
},
@@ -1756,21 +1756,21 @@ TAG_MAP = {
POS: VERB,
"Mood": "Cnd",
"Number": "Sing",
- "Person": "two",
+ "Person": "2",
"Polarity": "Neg",
"VerbForm": "Fin",
},
"Vgm-3---n--ns-": {
POS: VERB,
"Mood": "Cnd",
- "Person": "three",
+ "Person": "3",
"Polarity": "Pos",
"VerbForm": "Fin",
},
"Vgm-3---n--ys-": {
POS: VERB,
"Mood": "Cnd",
- "Person": "three",
+ "Person": "3",
"Polarity": "Pos",
"Reflex": "Yes",
"VerbForm": "Fin",
@@ -1778,14 +1778,14 @@ TAG_MAP = {
"Vgm-3---y--ns-": {
POS: VERB,
"Mood": "Cnd",
- "Person": "three",
+ "Person": "3",
"Polarity": "Neg",
"VerbForm": "Fin",
},
"Vgm-3---y--ys-": {
POS: VERB,
"Mood": "Cnd",
- "Person": "three",
+ "Person": "3",
"Polarity": "Neg",
"Reflex": "Yes",
"VerbForm": "Fin",
@@ -1794,7 +1794,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Cnd",
"Number": "Plur",
- "Person": "three",
+ "Person": "3",
"Polarity": "Pos",
"VerbForm": "Fin",
},
@@ -1802,7 +1802,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Cnd",
"Number": "Plur",
- "Person": "three",
+ "Person": "3",
"Polarity": "Pos",
"Reflex": "Yes",
"VerbForm": "Fin",
@@ -1811,7 +1811,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Cnd",
"Number": "Plur",
- "Person": "three",
+ "Person": "3",
"Polarity": "Neg",
"VerbForm": "Fin",
},
@@ -1819,7 +1819,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Cnd",
"Number": "Sing",
- "Person": "three",
+ "Person": "3",
"Polarity": "Pos",
"VerbForm": "Fin",
},
@@ -1827,7 +1827,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Cnd",
"Number": "Sing",
- "Person": "three",
+ "Person": "3",
"Polarity": "Pos",
"Reflex": "Yes",
"VerbForm": "Fin",
@@ -1836,7 +1836,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Cnd",
"Number": "Sing",
- "Person": "three",
+ "Person": "3",
"Polarity": "Neg",
"VerbForm": "Fin",
},
@@ -1844,7 +1844,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Cnd",
"Number": "Sing",
- "Person": "three",
+ "Person": "3",
"Polarity": "Neg",
"Reflex": "Yes",
"VerbForm": "Fin",
@@ -1853,7 +1853,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Ind",
"Number": "Plur",
- "Person": "one",
+ "Person": "1",
"Polarity": "Pos",
"Tense": "Past",
"VerbForm": "Fin",
@@ -1862,7 +1862,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Ind",
"Number": "Plur",
- "Person": "one",
+ "Person": "1",
"Polarity": "Pos",
"Reflex": "Yes",
"Tense": "Past",
@@ -1872,7 +1872,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Ind",
"Number": "Plur",
- "Person": "one",
+ "Person": "1",
"Polarity": "Neg",
"Tense": "Past",
"VerbForm": "Fin",
@@ -1881,7 +1881,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Ind",
"Number": "Plur",
- "Person": "one",
+ "Person": "1",
"Polarity": "Neg",
"Reflex": "Yes",
"Tense": "Past",
@@ -1891,7 +1891,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Ind",
"Number": "Sing",
- "Person": "one",
+ "Person": "1",
"Polarity": "Pos",
"Tense": "Past",
"VerbForm": "Fin",
@@ -1900,7 +1900,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Ind",
"Number": "Sing",
- "Person": "one",
+ "Person": "1",
"Polarity": "Pos",
"Reflex": "Yes",
"Tense": "Past",
@@ -1910,7 +1910,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Ind",
"Number": "Sing",
- "Person": "one",
+ "Person": "1",
"Polarity": "Neg",
"Tense": "Past",
"VerbForm": "Fin",
@@ -1919,7 +1919,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Ind",
"Number": "Sing",
- "Person": "one",
+ "Person": "1",
"Polarity": "Neg",
"Reflex": "Yes",
"Tense": "Past",
@@ -1929,7 +1929,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Ind",
"Number": "Plur",
- "Person": "two",
+ "Person": "2",
"Polarity": "Pos",
"Tense": "Past",
"VerbForm": "Fin",
@@ -1938,7 +1938,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Ind",
"Number": "Plur",
- "Person": "two",
+ "Person": "2",
"Polarity": "Pos",
"Reflex": "Yes",
"Tense": "Past",
@@ -1948,7 +1948,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Ind",
"Number": "Plur",
- "Person": "two",
+ "Person": "2",
"Polarity": "Neg",
"Tense": "Past",
"VerbForm": "Fin",
@@ -1957,7 +1957,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Ind",
"Number": "Sing",
- "Person": "two",
+ "Person": "2",
"Polarity": "Pos",
"Tense": "Past",
"VerbForm": "Fin",
@@ -1966,7 +1966,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Ind",
"Number": "Sing",
- "Person": "two",
+ "Person": "2",
"Polarity": "Neg",
"Tense": "Past",
"VerbForm": "Fin",
@@ -1974,7 +1974,7 @@ TAG_MAP = {
"Vgma3---n--ni-": {
POS: VERB,
"Mood": "Ind",
- "Person": "three",
+ "Person": "3",
"Polarity": "Pos",
"Tense": "Past",
"VerbForm": "Fin",
@@ -1982,7 +1982,7 @@ TAG_MAP = {
"Vgma3---n--yi-": {
POS: VERB,
"Mood": "Ind",
- "Person": "three",
+ "Person": "3",
"Polarity": "Pos",
"Reflex": "Yes",
"Tense": "Past",
@@ -1991,7 +1991,7 @@ TAG_MAP = {
"Vgma3---y--ni-": {
POS: VERB,
"Mood": "Ind",
- "Person": "three",
+ "Person": "3",
"Polarity": "Neg",
"Tense": "Past",
"VerbForm": "Fin",
@@ -1999,7 +1999,7 @@ TAG_MAP = {
"Vgma3--y--ni-": {
POS: VERB,
"Case": "Nom",
- "Person": "three",
+ "Person": "3",
"Tense": "Past",
"VerbForm": "Fin",
},
@@ -2007,7 +2007,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Ind",
"Number": "Plur",
- "Person": "three",
+ "Person": "3",
"Polarity": "Pos",
"Tense": "Past",
"VerbForm": "Fin",
@@ -2016,7 +2016,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Ind",
"Number": "Plur",
- "Person": "three",
+ "Person": "3",
"Polarity": "Pos",
"Reflex": "Yes",
"Tense": "Past",
@@ -2026,7 +2026,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Ind",
"Number": "Plur",
- "Person": "three",
+ "Person": "3",
"Polarity": "Neg",
"Tense": "Past",
"VerbForm": "Fin",
@@ -2035,7 +2035,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Ind",
"Number": "Plur",
- "Person": "three",
+ "Person": "3",
"Polarity": "Neg",
"Reflex": "Yes",
"Tense": "Past",
@@ -2045,7 +2045,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Ind",
"Number": "Sing",
- "Person": "three",
+ "Person": "3",
"Polarity": "Pos",
"Tense": "Past",
"VerbForm": "Fin",
@@ -2054,7 +2054,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Ind",
"Number": "Sing",
- "Person": "three",
+ "Person": "3",
"Polarity": "Pos",
"Reflex": "Yes",
"Tense": "Past",
@@ -2064,7 +2064,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Ind",
"Number": "Sing",
- "Person": "three",
+ "Person": "3",
"Polarity": "Pos",
"Reflex": "Yes",
"Tense": "Past",
@@ -2074,7 +2074,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Ind",
"Number": "Sing",
- "Person": "three",
+ "Person": "3",
"Polarity": "Neg",
"Tense": "Past",
"VerbForm": "Fin",
@@ -2083,7 +2083,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Ind",
"Number": "Sing",
- "Person": "three",
+ "Person": "3",
"Polarity": "Neg",
"Reflex": "Yes",
"Tense": "Past",
@@ -2093,7 +2093,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Ind",
"Number": "Plur",
- "Person": "one",
+ "Person": "1",
"Polarity": "Pos",
"Tense": "Fut",
"VerbForm": "Fin",
@@ -2102,7 +2102,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Ind",
"Number": "Plur",
- "Person": "one",
+ "Person": "1",
"Polarity": "Pos",
"Reflex": "Yes",
"Tense": "Fut",
@@ -2112,7 +2112,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Ind",
"Number": "Plur",
- "Person": "one",
+ "Person": "1",
"Polarity": "Neg",
"Tense": "Fut",
"VerbForm": "Fin",
@@ -2121,7 +2121,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Ind",
"Number": "Sing",
- "Person": "one",
+ "Person": "1",
"Polarity": "Pos",
"Tense": "Fut",
"VerbForm": "Fin",
@@ -2130,7 +2130,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Ind",
"Number": "Sing",
- "Person": "one",
+ "Person": "1",
"Polarity": "Pos",
"Reflex": "Yes",
"Tense": "Fut",
@@ -2140,7 +2140,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Ind",
"Number": "Sing",
- "Person": "one",
+ "Person": "1",
"Polarity": "Neg",
"Tense": "Fut",
"VerbForm": "Fin",
@@ -2149,7 +2149,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Ind",
"Number": "Plur",
- "Person": "two",
+ "Person": "2",
"Polarity": "Pos",
"Tense": "Fut",
"VerbForm": "Fin",
@@ -2158,7 +2158,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Ind",
"Number": "Plur",
- "Person": "two",
+ "Person": "2",
"Polarity": "Pos",
"Reflex": "Yes",
"Tense": "Fut",
@@ -2168,7 +2168,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Ind",
"Number": "Sing",
- "Person": "two",
+ "Person": "2",
"Polarity": "Pos",
"Tense": "Fut",
"VerbForm": "Fin",
@@ -2177,7 +2177,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Ind",
"Number": "Sing",
- "Person": "two",
+ "Person": "2",
"Polarity": "Pos",
"Reflex": "Yes",
"Tense": "Fut",
@@ -2187,7 +2187,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Ind",
"Number": "Sing",
- "Person": "two",
+ "Person": "2",
"Polarity": "Neg",
"Tense": "Fut",
"VerbForm": "Fin",
@@ -2196,7 +2196,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Ind",
"Number": "Sing",
- "Person": "two",
+ "Person": "2",
"Polarity": "Neg",
"Reflex": "Yes",
"Tense": "Fut",
@@ -2205,7 +2205,7 @@ TAG_MAP = {
"Vgmf3---n--ni-": {
POS: VERB,
"Mood": "Ind",
- "Person": "three",
+ "Person": "3",
"Polarity": "Pos",
"Tense": "Fut",
"VerbForm": "Fin",
@@ -2213,7 +2213,7 @@ TAG_MAP = {
"Vgmf3---y--ni-": {
POS: VERB,
"Mood": "Ind",
- "Person": "three",
+ "Person": "3",
"Polarity": "Neg",
"Tense": "Fut",
"VerbForm": "Fin",
@@ -2222,7 +2222,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Ind",
"Number": "Plur",
- "Person": "three",
+ "Person": "3",
"Polarity": "Pos",
"Tense": "Fut",
"VerbForm": "Fin",
@@ -2231,7 +2231,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Ind",
"Number": "Plur",
- "Person": "three",
+ "Person": "3",
"Polarity": "Pos",
"Reflex": "Yes",
"Tense": "Fut",
@@ -2241,7 +2241,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Ind",
"Number": "Plur",
- "Person": "three",
+ "Person": "3",
"Polarity": "Neg",
"Tense": "Fut",
"VerbForm": "Fin",
@@ -2250,7 +2250,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Ind",
"Number": "Sing",
- "Person": "three",
+ "Person": "3",
"Polarity": "Pos",
"Tense": "Fut",
"VerbForm": "Fin",
@@ -2259,7 +2259,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Ind",
"Number": "Sing",
- "Person": "three",
+ "Person": "3",
"Polarity": "Pos",
"Reflex": "Yes",
"Tense": "Fut",
@@ -2269,7 +2269,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Ind",
"Number": "Sing",
- "Person": "three",
+ "Person": "3",
"Polarity": "Neg",
"Tense": "Fut",
"VerbForm": "Fin",
@@ -2278,7 +2278,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Ind",
"Number": "Sing",
- "Person": "three",
+ "Person": "3",
"Polarity": "Neg",
"Reflex": "Yes",
"Tense": "Fut",
@@ -2288,7 +2288,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Ind",
"Number": "Plur",
- "Person": "one",
+ "Person": "1",
"Polarity": "Pos",
"Tense": "Pres",
"VerbForm": "Fin",
@@ -2297,7 +2297,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Ind",
"Number": "Plur",
- "Person": "one",
+ "Person": "1",
"Polarity": "Pos",
"Reflex": "Yes",
"Tense": "Pres",
@@ -2307,7 +2307,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Ind",
"Number": "Plur",
- "Person": "one",
+ "Person": "1",
"Polarity": "Neg",
"Tense": "Pres",
"VerbForm": "Fin",
@@ -2316,7 +2316,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Ind",
"Number": "Plur",
- "Person": "one",
+ "Person": "1",
"Polarity": "Neg",
"Reflex": "Yes",
"Tense": "Pres",
@@ -2326,7 +2326,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Ind",
"Number": "Sing",
- "Person": "one",
+ "Person": "1",
"Polarity": "Pos",
"Tense": "Pres",
"VerbForm": "Fin",
@@ -2335,7 +2335,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Ind",
"Number": "Sing",
- "Person": "one",
+ "Person": "1",
"Polarity": "Pos",
"Tense": "Pres",
"VerbForm": "Fin",
@@ -2344,7 +2344,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Ind",
"Number": "Sing",
- "Person": "one",
+ "Person": "1",
"Polarity": "Pos",
"Reflex": "Yes",
"Tense": "Pres",
@@ -2354,7 +2354,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Ind",
"Number": "Sing",
- "Person": "one",
+ "Person": "1",
"Polarity": "Neg",
"Tense": "Pres",
"VerbForm": "Fin",
@@ -2363,7 +2363,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Ind",
"Number": "Sing",
- "Person": "one",
+ "Person": "1",
"Polarity": "Neg",
"Reflex": "Yes",
"Tense": "Pres",
@@ -2373,7 +2373,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Ind",
"Number": "Plur",
- "Person": "two",
+ "Person": "2",
"Polarity": "Pos",
"Tense": "Pres",
"VerbForm": "Fin",
@@ -2382,7 +2382,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Ind",
"Number": "Plur",
- "Person": "two",
+ "Person": "2",
"Polarity": "Pos",
"Reflex": "Yes",
"Tense": "Pres",
@@ -2392,7 +2392,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Ind",
"Number": "Plur",
- "Person": "two",
+ "Person": "2",
"Polarity": "Neg",
"Tense": "Pres",
"VerbForm": "Fin",
@@ -2401,7 +2401,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Ind",
"Number": "Plur",
- "Person": "two",
+ "Person": "2",
"Polarity": "Neg",
"Reflex": "Yes",
"Tense": "Pres",
@@ -2411,7 +2411,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Ind",
"Number": "Sing",
- "Person": "two",
+ "Person": "2",
"Polarity": "Pos",
"Tense": "Pres",
"VerbForm": "Fin",
@@ -2420,7 +2420,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Ind",
"Number": "Sing",
- "Person": "two",
+ "Person": "2",
"Polarity": "Pos",
"Reflex": "Yes",
"Tense": "Pres",
@@ -2430,7 +2430,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Ind",
"Number": "Sing",
- "Person": "two",
+ "Person": "2",
"Polarity": "Neg",
"Tense": "Pres",
"VerbForm": "Fin",
@@ -2438,7 +2438,7 @@ TAG_MAP = {
"Vgmp3---n--ni-": {
POS: VERB,
"Mood": "Ind",
- "Person": "three",
+ "Person": "3",
"Polarity": "Pos",
"Tense": "Pres",
"VerbForm": "Fin",
@@ -2446,7 +2446,7 @@ TAG_MAP = {
"Vgmp3---n--yi-": {
POS: VERB,
"Mood": "Ind",
- "Person": "three",
+ "Person": "3",
"Polarity": "Pos",
"Reflex": "Yes",
"Tense": "Pres",
@@ -2455,7 +2455,7 @@ TAG_MAP = {
"Vgmp3---y--ni-": {
POS: VERB,
"Mood": "Ind",
- "Person": "three",
+ "Person": "3",
"Polarity": "Neg",
"Tense": "Pres",
"VerbForm": "Fin",
@@ -2463,7 +2463,7 @@ TAG_MAP = {
"Vgmp3---y--yi-": {
POS: VERB,
"Mood": "Ind",
- "Person": "three",
+ "Person": "3",
"Polarity": "Neg",
"Reflex": "Yes",
"Tense": "Pres",
@@ -2473,7 +2473,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Ind",
"Number": "Plur",
- "Person": "three",
+ "Person": "3",
"Polarity": "Pos",
"Tense": "Pres",
"VerbForm": "Fin",
@@ -2482,7 +2482,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Ind",
"Number": "Plur",
- "Person": "three",
+ "Person": "3",
"Polarity": "Pos",
"Reflex": "Yes",
"Tense": "Pres",
@@ -2492,7 +2492,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Ind",
"Number": "Plur",
- "Person": "three",
+ "Person": "3",
"Polarity": "Neg",
"Tense": "Pres",
"VerbForm": "Fin",
@@ -2501,7 +2501,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Ind",
"Number": "Plur",
- "Person": "three",
+ "Person": "3",
"Polarity": "Neg",
"Reflex": "Yes",
"Tense": "Pres",
@@ -2511,7 +2511,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Ind",
"Number": "Sing",
- "Person": "three",
+ "Person": "3",
"Polarity": "Pos",
"Tense": "Pres",
"VerbForm": "Fin",
@@ -2520,7 +2520,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Ind",
"Number": "Sing",
- "Person": "three",
+ "Person": "3",
"Polarity": "Pos",
"Tense": "Pres",
"VerbForm": "Fin",
@@ -2529,7 +2529,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Ind",
"Number": "Sing",
- "Person": "three",
+ "Person": "3",
"Polarity": "Pos",
"Tense": "Pres",
"VerbForm": "Fin",
@@ -2538,7 +2538,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Ind",
"Number": "Sing",
- "Person": "three",
+ "Person": "3",
"Polarity": "Pos",
"Reflex": "Yes",
"Tense": "Pres",
@@ -2548,7 +2548,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Ind",
"Number": "Sing",
- "Person": "three",
+ "Person": "3",
"Polarity": "Neg",
"Tense": "Pres",
"VerbForm": "Fin",
@@ -2557,7 +2557,7 @@ TAG_MAP = {
POS: VERB,
"Mood": "Ind",
"Number": "Sing",
- "Person": "three",
+ "Person": "3",
"Polarity": "Neg",
"Reflex": "Yes",
"Tense": "Pres",
@@ -2568,7 +2568,7 @@ TAG_MAP = {
"Aspect": "Hab",
"Mood": "Ind",
"Number": "Sing",
- "Person": "one",
+ "Person": "1",
"Polarity": "Pos",
"Tense": "Past",
"VerbForm": "Fin",
@@ -2578,7 +2578,7 @@ TAG_MAP = {
"Aspect": "Hab",
"Mood": "Ind",
"Number": "Sing",
- "Person": "one",
+ "Person": "1",
"Polarity": "Pos",
"Reflex": "Yes",
"Tense": "Past",
@@ -2589,7 +2589,7 @@ TAG_MAP = {
"Aspect": "Hab",
"Mood": "Ind",
"Number": "Sing",
- "Person": "one",
+ "Person": "1",
"Polarity": "Neg",
"Tense": "Past",
"VerbForm": "Fin",
@@ -2599,7 +2599,7 @@ TAG_MAP = {
"Aspect": "Hab",
"Mood": "Ind",
"Number": "Sing",
- "Person": "two",
+ "Person": "2",
"Polarity": "Pos",
"Tense": "Past",
"VerbForm": "Fin",
@@ -2608,7 +2608,7 @@ TAG_MAP = {
POS: VERB,
"Aspect": "Hab",
"Mood": "Ind",
- "Person": "three",
+ "Person": "3",
"Polarity": "Pos",
"Tense": "Past",
"VerbForm": "Fin",
@@ -2618,7 +2618,7 @@ TAG_MAP = {
"Aspect": "Hab",
"Mood": "Ind",
"Number": "Plur",
- "Person": "three",
+ "Person": "3",
"Polarity": "Pos",
"Tense": "Past",
"VerbForm": "Fin",
@@ -2628,7 +2628,7 @@ TAG_MAP = {
"Aspect": "Hab",
"Mood": "Ind",
"Number": "Plur",
- "Person": "three",
+ "Person": "3",
"Polarity": "Pos",
"Reflex": "Yes",
"Tense": "Past",
@@ -2639,7 +2639,7 @@ TAG_MAP = {
"Aspect": "Hab",
"Mood": "Ind",
"Number": "Sing",
- "Person": "three",
+ "Person": "3",
"Polarity": "Pos",
"Tense": "Past",
"VerbForm": "Fin",
@@ -2649,7 +2649,7 @@ TAG_MAP = {
"Aspect": "Hab",
"Mood": "Ind",
"Number": "Sing",
- "Person": "three",
+ "Person": "3",
"Polarity": "Pos",
"Reflex": "Yes",
"Tense": "Past",
@@ -2660,7 +2660,7 @@ TAG_MAP = {
"Aspect": "Hab",
"Mood": "Ind",
"Number": "Sing",
- "Person": "three",
+ "Person": "3",
"Polarity": "Neg",
"Tense": "Past",
"VerbForm": "Fin",
@@ -2670,7 +2670,7 @@ TAG_MAP = {
"Aspect": "Perf",
"Mood": "Ind",
"Number": "Sing",
- "Person": "three",
+ "Person": "3",
"Polarity": "Pos",
"Tense": "Past",
"VerbForm": "Fin",
diff --git a/spacy/lang/nl/lemmatizer/__init__.py b/spacy/lang/nl/lemmatizer/__init__.py
index ee4eaabb3..1e5d9aa1f 100644
--- a/spacy/lang/nl/lemmatizer/__init__.py
+++ b/spacy/lang/nl/lemmatizer/__init__.py
@@ -73,7 +73,7 @@ class DutchLemmatizer(object):
return [lemma[0]]
except KeyError:
pass
- # string corresponds to key in lookup table
+ # string corresponds to key in lookup table
lookup_table = self.lookup_table
looked_up_lemma = lookup_table.get(string)
if looked_up_lemma and looked_up_lemma in lemma_index:
@@ -103,12 +103,9 @@ class DutchLemmatizer(object):
# Overrides parent method so that a lowercased version of the string is
# used to search the lookup table. This is necessary because our lookup
# table consists entirely of lowercase keys.
- def lookup(self, string, orth=None):
+ def lookup(self, string):
string = string.lower()
- if orth is not None:
- return self.lookup_table.get(orth, string)
- else:
- return self.lookup_table.get(string, string)
+ return self.lookup_table.get(string, string)
def noun(self, string, morphology=None):
return self(string, "noun", morphology)
diff --git a/spacy/lang/ru/lemmatizer.py b/spacy/lang/ru/lemmatizer.py
index 70120566b..300d61c52 100644
--- a/spacy/lang/ru/lemmatizer.py
+++ b/spacy/lang/ru/lemmatizer.py
@@ -73,7 +73,7 @@ class RussianLemmatizer(Lemmatizer):
if (
feature in morphology
and feature in analysis_morph
- and morphology[feature].lower() != analysis_morph[feature].lower()
+ and morphology[feature] != analysis_morph[feature]
):
break
else:
@@ -115,7 +115,7 @@ class RussianLemmatizer(Lemmatizer):
def pron(self, string, morphology=None):
return self(string, "pron", morphology)
- def lookup(self, string, orth=None):
+ def lookup(self, string):
analyses = self._morph.parse(string)
if len(analyses) == 1:
return analyses[0].normal_form
diff --git a/spacy/lang/uk/lemmatizer.py b/spacy/lang/uk/lemmatizer.py
index d40bdf2df..ab56c824d 100644
--- a/spacy/lang/uk/lemmatizer.py
+++ b/spacy/lang/uk/lemmatizer.py
@@ -70,7 +70,7 @@ class UkrainianLemmatizer(Lemmatizer):
if (
feature in morphology
and feature in analysis_morph
- and morphology[feature].lower() != analysis_morph[feature].lower()
+ and morphology[feature] != analysis_morph[feature]
):
break
else:
@@ -112,7 +112,7 @@ class UkrainianLemmatizer(Lemmatizer):
def pron(self, string, morphology=None):
return self(string, "pron", morphology)
- def lookup(self, string, orth=None):
+ def lookup(self, string):
analyses = self._morph.parse(string)
if len(analyses) == 1:
return analyses[0].normal_form
diff --git a/spacy/language.py b/spacy/language.py
index a28f2a84e..09dd22cf2 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -20,7 +20,6 @@ from .pipeline import Tensorizer, EntityRecognizer, EntityLinker
from .pipeline import SimilarityHook, TextCategorizer, Sentencizer
from .pipeline import merge_noun_chunks, merge_entities, merge_subtokens
from .pipeline import EntityRuler
-from .pipeline import Morphologizer
from .compat import izip, basestring_
from .gold import GoldParse
from .scorer import Scorer
@@ -39,8 +38,6 @@ from . import about
class BaseDefaults(object):
@classmethod
def create_lemmatizer(cls, nlp=None, lookups=None):
- if lookups is None:
- lookups = cls.create_lookups(nlp=nlp)
rules, index, exc, lookup = util.get_lemma_tables(lookups)
return Lemmatizer(index, exc, rules, lookup)
@@ -111,8 +108,6 @@ class BaseDefaults(object):
syntax_iterators = {}
resources = {}
writing_system = {"direction": "ltr", "has_case": True, "has_letters": True}
- single_orth_variants = []
- paired_orth_variants = []
class Language(object):
@@ -133,7 +128,6 @@ class Language(object):
"tokenizer": lambda nlp: nlp.Defaults.create_tokenizer(nlp),
"tensorizer": lambda nlp, **cfg: Tensorizer(nlp.vocab, **cfg),
"tagger": lambda nlp, **cfg: Tagger(nlp.vocab, **cfg),
- "morphologizer": lambda nlp, **cfg: Morphologizer(nlp.vocab, **cfg),
"parser": lambda nlp, **cfg: DependencyParser(nlp.vocab, **cfg),
"ner": lambda nlp, **cfg: EntityRecognizer(nlp.vocab, **cfg),
"entity_linker": lambda nlp, **cfg: EntityLinker(nlp.vocab, **cfg),
@@ -257,8 +251,7 @@ class Language(object):
@property
def pipe_labels(self):
- """Get the labels set by the pipeline components, if available (if
- the component exposes a labels property).
+ """Get the labels set by the pipeline components, if available.
RETURNS (dict): Labels keyed by component name.
"""
@@ -449,25 +442,6 @@ class Language(object):
def make_doc(self, text):
return self.tokenizer(text)
- def _format_docs_and_golds(self, docs, golds):
- """Format golds and docs before update models."""
- expected_keys = ("words", "tags", "heads", "deps", "entities", "cats", "links")
- gold_objs = []
- doc_objs = []
- for doc, gold in zip(docs, golds):
- if isinstance(doc, basestring_):
- doc = self.make_doc(doc)
- if not isinstance(gold, GoldParse):
- unexpected = [k for k in gold if k not in expected_keys]
- if unexpected:
- err = Errors.E151.format(unexp=unexpected, exp=expected_keys)
- raise ValueError(err)
- gold = GoldParse(doc, **gold)
- doc_objs.append(doc)
- gold_objs.append(gold)
-
- return doc_objs, gold_objs
-
def update(self, docs, golds, drop=0.0, sgd=None, losses=None, component_cfg=None):
"""Update the models in the pipeline.
@@ -481,6 +455,7 @@ class Language(object):
DOCS: https://spacy.io/api/language#update
"""
+ expected_keys = ("words", "tags", "heads", "deps", "entities", "cats", "links")
if len(docs) != len(golds):
raise IndexError(Errors.E009.format(n_docs=len(docs), n_golds=len(golds)))
if len(docs) == 0:
@@ -490,7 +465,21 @@ class Language(object):
self._optimizer = create_default_optimizer(Model.ops)
sgd = self._optimizer
# Allow dict of args to GoldParse, instead of GoldParse objects.
- docs, golds = self._format_docs_and_golds(docs, golds)
+ gold_objs = []
+ doc_objs = []
+ for doc, gold in zip(docs, golds):
+ if isinstance(doc, basestring_):
+ doc = self.make_doc(doc)
+ if not isinstance(gold, GoldParse):
+ unexpected = [k for k in gold if k not in expected_keys]
+ if unexpected:
+ err = Errors.E151.format(unexp=unexpected, exp=expected_keys)
+ raise ValueError(err)
+ gold = GoldParse(doc, **gold)
+ doc_objs.append(doc)
+ gold_objs.append(gold)
+ golds = gold_objs
+ docs = doc_objs
grads = {}
def get_grads(W, dW, key=None):
@@ -594,7 +583,6 @@ class Language(object):
# Populate vocab
else:
for _, annots_brackets in get_gold_tuples():
- _ = annots_brackets.pop()
for annots, _ in annots_brackets:
for word in annots[1]:
_ = self.vocab[word] # noqa: F841
@@ -663,7 +651,7 @@ class Language(object):
DOCS: https://spacy.io/api/language#evaluate
"""
if scorer is None:
- scorer = Scorer(pipeline=self.pipeline)
+ scorer = Scorer()
if component_cfg is None:
component_cfg = {}
docs, golds = zip(*docs_golds)
diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py
index 26c2227a0..f9e35f44a 100644
--- a/spacy/lemmatizer.py
+++ b/spacy/lemmatizer.py
@@ -2,7 +2,8 @@
from __future__ import unicode_literals
from collections import OrderedDict
-from .symbols import NOUN, VERB, ADJ, PUNCT, PROPN
+from .symbols import POS, NOUN, VERB, ADJ, PUNCT, PROPN
+from .symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos
class Lemmatizer(object):
@@ -54,8 +55,12 @@ class Lemmatizer(object):
Check whether we're dealing with an uninflected paradigm, so we can
avoid lemmatization entirely.
"""
- if morphology is None:
- morphology = {}
+ morphology = {} if morphology is None else morphology
+ others = [
+ key
+ for key in morphology
+ if key not in (POS, "Number", "POS", "VerbForm", "Tense")
+ ]
if univ_pos == "noun" and morphology.get("Number") == "sing":
return True
elif univ_pos == "verb" and morphology.get("VerbForm") == "inf":
@@ -66,17 +71,18 @@ class Lemmatizer(object):
morphology.get("VerbForm") == "fin"
and morphology.get("Tense") == "pres"
and morphology.get("Number") is None
+ and not others
):
return True
elif univ_pos == "adj" and morphology.get("Degree") == "pos":
return True
- elif morphology.get("VerbForm") == "inf":
+ elif VerbForm_inf in morphology:
return True
- elif morphology.get("VerbForm") == "none":
+ elif VerbForm_none in morphology:
return True
- elif morphology.get("VerbForm") == "inf":
+ elif Number_sing in morphology:
return True
- elif morphology.get("Degree") == "pos":
+ elif Degree_pos in morphology:
return True
else:
return False
@@ -93,19 +99,9 @@ class Lemmatizer(object):
def punct(self, string, morphology=None):
return self(string, "punct", morphology)
- def lookup(self, string, orth=None):
- """Look up a lemma in the table, if available. If no lemma is found,
- the original string is returned.
-
- string (unicode): The original string.
- orth (int): Optional hash of the string to look up. If not set, the
- string will be used and hashed.
- RETURNS (unicode): The lemma if the string was found, otherwise the
- original string.
- """
- key = orth if orth is not None else string
- if key in self.lookup_table:
- return self.lookup_table[key]
+ def lookup(self, string):
+ if string in self.lookup_table:
+ return self.lookup_table[string]
return string
diff --git a/spacy/lookups.py b/spacy/lookups.py
index 05a60f289..801b4d00d 100644
--- a/spacy/lookups.py
+++ b/spacy/lookups.py
@@ -1,13 +1,11 @@
-# coding: utf-8
+# coding: utf8
from __future__ import unicode_literals
import srsly
from collections import OrderedDict
-from preshed.bloom import BloomFilter
from .errors import Errors
from .util import SimpleFrozenDict, ensure_path
-from .strings import get_string_id
class Lookups(object):
@@ -16,14 +14,16 @@ class Lookups(object):
so they can be accessed before the pipeline components are applied (e.g.
in the tokenizer and lemmatizer), as well as within the pipeline components
via doc.vocab.lookups.
+
+ Important note: At the moment, this class only performs a very basic
+ dictionary lookup. We're planning to replace this with a more efficient
+ implementation. See #3971 for details.
"""
def __init__(self):
"""Initialize the Lookups object.
RETURNS (Lookups): The newly created object.
-
- DOCS: https://spacy.io/api/lookups#init
"""
self._tables = OrderedDict()
@@ -32,7 +32,7 @@ class Lookups(object):
Lookups.has_table.
name (unicode): Name of the table.
- RETURNS (bool): Whether a table of that name is in the lookups.
+ RETURNS (bool): Whether a table of that name exists.
"""
return self.has_table(name)
@@ -51,12 +51,11 @@ class Lookups(object):
name (unicode): Unique name of table.
data (dict): Optional data to add to the table.
RETURNS (Table): The newly added table.
-
- DOCS: https://spacy.io/api/lookups#add_table
"""
if name in self.tables:
raise ValueError(Errors.E158.format(name=name))
- table = Table(name=name, data=data)
+ table = Table(name=name)
+ table.update(data)
self._tables[name] = table
return table
@@ -65,8 +64,6 @@ class Lookups(object):
name (unicode): Name of the table.
RETURNS (Table): The table.
-
- DOCS: https://spacy.io/api/lookups#get_table
"""
if name not in self._tables:
raise KeyError(Errors.E159.format(name=name, tables=self.tables))
@@ -75,10 +72,8 @@ class Lookups(object):
def remove_table(self, name):
"""Remove a table. Raises an error if the table doesn't exist.
- name (unicode): Name of the table to remove.
+ name (unicode): The name to remove.
RETURNS (Table): The removed table.
-
- DOCS: https://spacy.io/api/lookups#remove_table
"""
if name not in self._tables:
raise KeyError(Errors.E159.format(name=name, tables=self.tables))
@@ -89,57 +84,45 @@ class Lookups(object):
name (unicode): Name of the table.
RETURNS (bool): Whether a table of that name exists.
-
- DOCS: https://spacy.io/api/lookups#has_table
"""
return name in self._tables
- def to_bytes(self, **kwargs):
+ def to_bytes(self, exclude=tuple(), **kwargs):
"""Serialize the lookups to a bytestring.
+ exclude (list): String names of serialization fields to exclude.
RETURNS (bytes): The serialized Lookups.
-
- DOCS: https://spacy.io/api/lookups#to_bytes
"""
return srsly.msgpack_dumps(self._tables)
- def from_bytes(self, bytes_data, **kwargs):
+ def from_bytes(self, bytes_data, exclude=tuple(), **kwargs):
"""Load the lookups from a bytestring.
- bytes_data (bytes): The data to load.
- RETURNS (Lookups): The loaded Lookups.
-
- DOCS: https://spacy.io/api/lookups#from_bytes
+ exclude (list): String names of serialization fields to exclude.
+ RETURNS (Lookups): The loaded Lookups.
"""
- for key, value in srsly.msgpack_loads(bytes_data).items():
- self._tables[key] = Table(key)
- self._tables[key].update(value)
+ self._tables = OrderedDict()
+ msg = srsly.msgpack_loads(bytes_data)
+ for key, value in msg.items():
+ self._tables[key] = Table.from_dict(value)
return self
def to_disk(self, path, **kwargs):
- """Save the lookups to a directory as lookups.bin. Expects a path to a
- directory, which will be created if it doesn't exist.
+ """Save the lookups to a directory as lookups.bin.
path (unicode / Path): The file path.
-
- DOCS: https://spacy.io/api/lookups#to_disk
"""
if len(self._tables):
path = ensure_path(path)
- if not path.exists():
- path.mkdir()
filepath = path / "lookups.bin"
with filepath.open("wb") as file_:
file_.write(self.to_bytes())
def from_disk(self, path, **kwargs):
- """Load lookups from a directory containing a lookups.bin. Will skip
- loading if the file doesn't exist.
+ """Load lookups from a directory containing a lookups.bin.
- path (unicode / Path): The directory path.
+ path (unicode / Path): The file path.
RETURNS (Lookups): The loaded lookups.
-
- DOCS: https://spacy.io/api/lookups#from_disk
"""
path = ensure_path(path)
filepath = path / "lookups.bin"
@@ -153,118 +136,22 @@ class Lookups(object):
class Table(OrderedDict):
"""A table in the lookups. Subclass of builtin dict that implements a
slightly more consistent and unified API.
-
- Includes a Bloom filter to speed up missed lookups.
"""
-
@classmethod
def from_dict(cls, data, name=None):
- """Initialize a new table from a dict.
-
- data (dict): The dictionary.
- name (unicode): Optional table name for reference.
- RETURNS (Table): The newly created object.
-
- DOCS: https://spacy.io/api/lookups#table.from_dict
- """
self = cls(name=name)
self.update(data)
return self
- def __init__(self, name=None, data=None):
+ def __init__(self, name=None):
"""Initialize a new table.
name (unicode): Optional table name for reference.
- data (dict): Initial data, used to hint Bloom Filter.
RETURNS (Table): The newly created object.
-
- DOCS: https://spacy.io/api/lookups#table.init
"""
OrderedDict.__init__(self)
self.name = name
- # Assume a default size of 1M items
- self.default_size = 1e6
- size = len(data) if data and len(data) > 0 else self.default_size
- self.bloom = BloomFilter.from_error_rate(size)
- if data:
- self.update(data)
-
- def __setitem__(self, key, value):
- """Set new key/value pair. String keys will be hashed.
-
- key (unicode / int): The key to set.
- value: The value to set.
- """
- key = get_string_id(key)
- OrderedDict.__setitem__(self, key, value)
- self.bloom.add(key)
def set(self, key, value):
- """Set new key/value pair. String keys will be hashed.
- Same as table[key] = value.
-
- key (unicode / int): The key to set.
- value: The value to set.
- """
+ """Set new key/value pair. Same as table[key] = value."""
self[key] = value
-
- def __getitem__(self, key):
- """Get the value for a given key. String keys will be hashed.
-
- key (unicode / int): The key to get.
- RETURNS: The value.
- """
- key = get_string_id(key)
- return OrderedDict.__getitem__(self, key)
-
- def get(self, key, default=None):
- """Get the value for a given key. String keys will be hashed.
-
- key (unicode / int): The key to get.
- default: The default value to return.
- RETURNS: The value.
- """
- key = get_string_id(key)
- return OrderedDict.get(self, key, default)
-
- def __contains__(self, key):
- """Check whether a key is in the table. String keys will be hashed.
-
- key (unicode / int): The key to check.
- RETURNS (bool): Whether the key is in the table.
- """
- key = get_string_id(key)
- # This can give a false positive, so we need to check it after
- if key not in self.bloom:
- return False
- return OrderedDict.__contains__(self, key)
-
- def to_bytes(self):
- """Serialize table to a bytestring.
-
- RETURNS (bytes): The serialized table.
-
- DOCS: https://spacy.io/api/lookups#table.to_bytes
- """
- data = [
- ("name", self.name),
- ("dict", dict(self.items())),
- ("bloom", self.bloom.to_bytes()),
- ]
- return srsly.msgpack_dumps(OrderedDict(data))
-
- def from_bytes(self, bytes_data):
- """Load a table from a bytestring.
-
- bytes_data (bytes): The data to load.
- RETURNS (Table): The loaded table.
-
- DOCS: https://spacy.io/api/lookups#table.from_bytes
- """
- loaded = srsly.msgpack_loads(bytes_data)
- data = loaded.get("dict", {})
- self.name = loaded["name"]
- self.bloom = BloomFilter().from_bytes(loaded["bloom"])
- self.clear()
- self.update(data)
- return self
diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx
index 950a7b977..c698c8024 100644
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@@ -103,8 +103,6 @@ cdef class Matcher:
*patterns (list): List of token descriptions.
"""
errors = {}
- if on_match is not None and not hasattr(on_match, "__call__"):
- raise ValueError(Errors.E171.format(arg_type=type(on_match)))
for i, pattern in enumerate(patterns):
if len(pattern) == 0:
raise ValueError(Errors.E012.format(key=key))
@@ -164,37 +162,18 @@ cdef class Matcher:
return default
return (self._callbacks[key], self._patterns[key])
- def pipe(self, docs, batch_size=1000, n_threads=-1, return_matches=False,
- as_tuples=False):
+ def pipe(self, docs, batch_size=1000, n_threads=-1):
"""Match a stream of documents, yielding them in turn.
docs (iterable): A stream of documents.
batch_size (int): Number of documents to accumulate into a working set.
- return_matches (bool): Yield the match lists along with the docs, making
- results (doc, matches) tuples.
- as_tuples (bool): Interpret the input stream as (doc, context) tuples,
- and yield (result, context) tuples out.
- If both return_matches and as_tuples are True, the output will
- be a sequence of ((doc, matches), context) tuples.
YIELDS (Doc): Documents, in order.
"""
if n_threads != -1:
deprecation_warning(Warnings.W016)
-
- if as_tuples:
- for doc, context in docs:
- matches = self(doc)
- if return_matches:
- yield ((doc, matches), context)
- else:
- yield (doc, context)
- else:
- for doc in docs:
- matches = self(doc)
- if return_matches:
- yield (doc, matches)
- else:
- yield doc
+ for doc in docs:
+ self(doc)
+ yield doc
def __call__(self, Doc doc):
"""Find all token sequences matching the supplied pattern.
diff --git a/spacy/matcher/phrasematcher.pxd b/spacy/matcher/phrasematcher.pxd
index 753b2da74..3aba1686f 100644
--- a/spacy/matcher/phrasematcher.pxd
+++ b/spacy/matcher/phrasematcher.pxd
@@ -1,27 +1,5 @@
from libcpp.vector cimport vector
-from cymem.cymem cimport Pool
-from preshed.maps cimport key_t, MapStruct
+from ..typedefs cimport hash_t
-from ..attrs cimport attr_id_t
-from ..tokens.doc cimport Doc
-from ..vocab cimport Vocab
-
-
-cdef class PhraseMatcher:
- cdef Vocab vocab
- cdef attr_id_t attr
- cdef object _callbacks
- cdef object _docs
- cdef bint _validate
- cdef MapStruct* c_map
- cdef Pool mem
- cdef key_t _terminal_hash
-
- cdef void find_matches(self, Doc doc, vector[MatchStruct] *matches) nogil
-
-
-cdef struct MatchStruct:
- key_t match_id
- int start
- int end
+ctypedef vector[hash_t] hash_vec
diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx
index b6c9e01d2..9e8801cc1 100644
--- a/spacy/matcher/phrasematcher.pyx
+++ b/spacy/matcher/phrasematcher.pyx
@@ -2,16 +2,28 @@
# cython: profile=True
from __future__ import unicode_literals
-from libc.stdint cimport uintptr_t
+from libcpp.vector cimport vector
+from cymem.cymem cimport Pool
+from murmurhash.mrmr cimport hash64
+from preshed.maps cimport PreshMap
-from preshed.maps cimport map_init, map_set, map_get, map_clear, map_iter
-
-from ..attrs cimport ORTH, POS, TAG, DEP, LEMMA
-from ..structs cimport TokenC
-from ..tokens.token cimport Token
+from .matcher cimport Matcher
+from ..attrs cimport ORTH, POS, TAG, DEP, LEMMA, attr_id_t
+from ..vocab cimport Vocab
+from ..tokens.doc cimport Doc, get_token_attr
+from ..typedefs cimport attr_t, hash_t
from ._schemas import TOKEN_PATTERN_SCHEMA
from ..errors import Errors, Warnings, deprecation_warning, user_warning
+from ..attrs import FLAG61 as U_ENT
+from ..attrs import FLAG60 as B2_ENT
+from ..attrs import FLAG59 as B3_ENT
+from ..attrs import FLAG58 as B4_ENT
+from ..attrs import FLAG43 as L2_ENT
+from ..attrs import FLAG42 as L3_ENT
+from ..attrs import FLAG41 as L4_ENT
+from ..attrs import FLAG42 as I3_ENT
+from ..attrs import FLAG41 as I4_ENT
cdef class PhraseMatcher:
@@ -21,11 +33,18 @@ cdef class PhraseMatcher:
DOCS: https://spacy.io/api/phrasematcher
USAGE: https://spacy.io/usage/rule-based-matching#phrasematcher
-
- Adapted from FlashText: https://github.com/vi3k6i5/flashtext
- MIT License (see `LICENSE`)
- Copyright (c) 2017 Vikash Singh (vikash.duliajan@gmail.com)
"""
+ cdef Pool mem
+ cdef Vocab vocab
+ cdef Matcher matcher
+ cdef PreshMap phrase_ids
+ cdef vector[hash_vec] ent_id_matrix
+ cdef int max_length
+ cdef attr_id_t attr
+ cdef public object _callbacks
+ cdef public object _patterns
+ cdef public object _docs
+ cdef public object _validate
def __init__(self, Vocab vocab, max_length=0, attr="ORTH", validate=False):
"""Initialize the PhraseMatcher.
@@ -39,17 +58,11 @@ cdef class PhraseMatcher:
"""
if max_length != 0:
deprecation_warning(Warnings.W010)
- self.vocab = vocab
- self._callbacks = {}
- self._docs = {}
- self._validate = validate
-
self.mem = Pool()
- self.c_map = self.mem.alloc(1, sizeof(MapStruct))
- self._terminal_hash = 826361138722620965
- map_init(self.mem, self.c_map, 8)
-
- if isinstance(attr, (int, long)):
+ self.max_length = max_length
+ self.vocab = vocab
+ self.matcher = Matcher(self.vocab, validate=False)
+ if isinstance(attr, long):
self.attr = attr
else:
attr = attr.upper()
@@ -58,15 +71,28 @@ cdef class PhraseMatcher:
if attr not in TOKEN_PATTERN_SCHEMA["items"]["properties"]:
raise ValueError(Errors.E152.format(attr=attr))
self.attr = self.vocab.strings[attr]
+ self.phrase_ids = PreshMap()
+ abstract_patterns = [
+ [{U_ENT: True}],
+ [{B2_ENT: True}, {L2_ENT: True}],
+ [{B3_ENT: True}, {I3_ENT: True}, {L3_ENT: True}],
+ [{B4_ENT: True}, {I4_ENT: True}, {I4_ENT: True, "OP": "+"}, {L4_ENT: True}],
+ ]
+ self.matcher.add("Candidate", None, *abstract_patterns)
+ self._callbacks = {}
+ self._docs = {}
+ self._validate = validate
def __len__(self):
- """Get the number of match IDs added to the matcher.
+ """Get the number of rules added to the matcher. Note that this only
+ returns the number of rules (identical with the number of IDs), not the
+ number of individual patterns.
RETURNS (int): The number of rules.
DOCS: https://spacy.io/api/phrasematcher#len
"""
- return len(self._callbacks)
+ return len(self._docs)
def __contains__(self, key):
"""Check whether the matcher contains rules for a match ID.
@@ -76,79 +102,13 @@ cdef class PhraseMatcher:
DOCS: https://spacy.io/api/phrasematcher#contains
"""
- return key in self._callbacks
+ cdef hash_t ent_id = self.matcher._normalize_key(key)
+ return ent_id in self._callbacks
def __reduce__(self):
- data = (self.vocab, self._docs, self._callbacks, self.attr)
+ data = (self.vocab, self._docs, self._callbacks)
return (unpickle_matcher, data, None, None)
- def remove(self, key):
- """Remove a rule from the matcher by match ID. A KeyError is raised if
- the key does not exist.
-
- key (unicode): The match ID.
-
- DOCS: https://spacy.io/api/phrasematcher#remove
- """
- if key not in self._docs:
- raise KeyError(key)
- cdef MapStruct* current_node
- cdef MapStruct* terminal_map
- cdef MapStruct* node_pointer
- cdef void* result
- cdef key_t terminal_key
- cdef void* value
- cdef int c_i = 0
- cdef vector[MapStruct*] path_nodes
- cdef vector[key_t] path_keys
- cdef key_t key_to_remove
- for keyword in self._docs[key]:
- current_node = self.c_map
- for token in keyword:
- result = map_get(current_node, token)
- if result:
- path_nodes.push_back(current_node)
- path_keys.push_back(token)
- current_node = result
- else:
- # if token is not found, break out of the loop
- current_node = NULL
- break
- # remove the tokens from trie node if there are no other
- # keywords with them
- result = map_get(current_node, self._terminal_hash)
- if current_node != NULL and result:
- terminal_map = result
- terminal_keys = []
- c_i = 0
- while map_iter(terminal_map, &c_i, &terminal_key, &value):
- terminal_keys.append(self.vocab.strings[terminal_key])
- # if this is the only remaining key, remove unnecessary paths
- if terminal_keys == [key]:
- while not path_nodes.empty():
- node_pointer = path_nodes.back()
- path_nodes.pop_back()
- key_to_remove = path_keys.back()
- path_keys.pop_back()
- result = map_get(node_pointer, key_to_remove)
- if node_pointer.filled == 1:
- map_clear(node_pointer, key_to_remove)
- self.mem.free(result)
- else:
- # more than one key means more than 1 path,
- # delete not required path and keep the others
- map_clear(node_pointer, key_to_remove)
- self.mem.free(result)
- break
- # otherwise simply remove the key
- else:
- result = map_get(current_node, self._terminal_hash)
- if result:
- map_clear(result, self.vocab.strings[key])
-
- del self._callbacks[key]
- del self._docs[key]
-
def add(self, key, on_match, *docs):
"""Add a match-rule to the phrase-matcher. A match-rule consists of: an ID
key, an on_match callback, and one or more patterns.
@@ -159,53 +119,53 @@ cdef class PhraseMatcher:
DOCS: https://spacy.io/api/phrasematcher#add
"""
-
- _ = self.vocab[key]
- self._callbacks[key] = on_match
- self._docs.setdefault(key, set())
-
- cdef MapStruct* current_node
- cdef MapStruct* internal_node
- cdef void* result
-
+ cdef Doc doc
+ cdef hash_t ent_id = self.matcher._normalize_key(key)
+ self._callbacks[ent_id] = on_match
+ self._docs[ent_id] = docs
+ cdef int length
+ cdef int i
+ cdef hash_t phrase_hash
+ cdef Pool mem = Pool()
for doc in docs:
- if len(doc) == 0:
+ length = doc.length
+ if length == 0:
continue
- if isinstance(doc, Doc):
- if self.attr in (POS, TAG, LEMMA) and not doc.is_tagged:
- raise ValueError(Errors.E155.format())
- if self.attr == DEP and not doc.is_parsed:
- raise ValueError(Errors.E156.format())
- if self._validate and (doc.is_tagged or doc.is_parsed) \
- and self.attr not in (DEP, POS, TAG, LEMMA):
- string_attr = self.vocab.strings[self.attr]
- user_warning(Warnings.W012.format(key=key, attr=string_attr))
- keyword = self._convert_to_array(doc)
+ if self.attr in (POS, TAG, LEMMA) and not doc.is_tagged:
+ raise ValueError(Errors.E155.format())
+ if self.attr == DEP and not doc.is_parsed:
+ raise ValueError(Errors.E156.format())
+ if self._validate and (doc.is_tagged or doc.is_parsed) \
+ and self.attr not in (DEP, POS, TAG, LEMMA):
+ string_attr = self.vocab.strings[self.attr]
+ user_warning(Warnings.W012.format(key=key, attr=string_attr))
+ tags = get_biluo(length)
+ phrase_key = mem.alloc(length, sizeof(attr_t))
+ for i, tag in enumerate(tags):
+ attr_value = self.get_lex_value(doc, i)
+ lexeme = self.vocab[attr_value]
+ lexeme.set_flag(tag, True)
+ phrase_key[i] = lexeme.orth
+ phrase_hash = hash64(phrase_key, length * sizeof(attr_t), 0)
+
+ if phrase_hash in self.phrase_ids:
+ phrase_index = self.phrase_ids[phrase_hash]
+ ent_id_list = self.ent_id_matrix[phrase_index]
+ ent_id_list.append(ent_id)
+ self.ent_id_matrix[phrase_index] = ent_id_list
+
else:
- keyword = doc
- self._docs[key].add(tuple(keyword))
+ ent_id_list = hash_vec(1)
+ ent_id_list[0] = ent_id
+ new_index = self.ent_id_matrix.size()
+ if new_index == 0:
+ # PreshMaps can not contain 0 as value, so storing a dummy at 0
+ self.ent_id_matrix.push_back(hash_vec(0))
+ new_index = 1
+ self.ent_id_matrix.push_back(ent_id_list)
+ self.phrase_ids.set(phrase_hash, new_index)
- current_node = self.c_map
- for token in keyword:
- if token == self._terminal_hash:
- user_warning(Warnings.W021)
- break
- result = map_get(current_node, token)
- if not result:
- internal_node = self.mem.alloc(1, sizeof(MapStruct))
- map_init(self.mem, internal_node, 8)
- map_set(self.mem, current_node, token, internal_node)
- result = internal_node
- current_node = result
- result = map_get(current_node, self._terminal_hash)
- if not result:
- internal_node = self.mem.alloc(1, sizeof(MapStruct))
- map_init(self.mem, internal_node, 8)
- map_set(self.mem, current_node, self._terminal_hash, internal_node)
- result = internal_node
- map_set(self.mem, result, self.vocab.strings[key], NULL)
-
- def __call__(self, doc):
+ def __call__(self, Doc doc):
"""Find all sequences matching the supplied patterns on the `Doc`.
doc (Doc): The document to match over.
@@ -216,63 +176,25 @@ cdef class PhraseMatcher:
DOCS: https://spacy.io/api/phrasematcher#call
"""
matches = []
- if doc is None or len(doc) == 0:
- # if doc is empty or None just return empty list
- return matches
-
- cdef vector[MatchStruct] c_matches
- self.find_matches(doc, &c_matches)
- for i in range(c_matches.size()):
- matches.append((c_matches[i].match_id, c_matches[i].start, c_matches[i].end))
+ if self.attr == ORTH:
+ match_doc = doc
+ else:
+ # If we're not matching on the ORTH, match_doc will be a Doc whose
+ # token.orth values are the attribute values we're matching on,
+ # e.g. Doc(nlp.vocab, words=[token.pos_ for token in doc])
+ words = [self.get_lex_value(doc, i) for i in range(len(doc))]
+ match_doc = Doc(self.vocab, words=words)
+ for _, start, end in self.matcher(match_doc):
+ ent_ids = self.accept_match(match_doc, start, end)
+ if ent_ids is not None:
+ for ent_id in ent_ids:
+ matches.append((ent_id, start, end))
for i, (ent_id, start, end) in enumerate(matches):
on_match = self._callbacks.get(ent_id)
if on_match is not None:
on_match(self, doc, i, matches)
return matches
- cdef void find_matches(self, Doc doc, vector[MatchStruct] *matches) nogil:
- cdef MapStruct* current_node = self.c_map
- cdef int start = 0
- cdef int idx = 0
- cdef int idy = 0
- cdef key_t key
- cdef void* value
- cdef int i = 0
- cdef MatchStruct ms
- cdef void* result
- while idx < doc.length:
- start = idx
- token = Token.get_struct_attr(&doc.c[idx], self.attr)
- # look for sequences from this position
- result = map_get(current_node, token)
- if result:
- current_node = result
- idy = idx + 1
- while idy < doc.length:
- result = map_get(current_node, self._terminal_hash)
- if result:
- i = 0
- while map_iter(result, &i, &key, &value):
- ms = make_matchstruct(key, start, idy)
- matches.push_back(ms)
- inner_token = Token.get_struct_attr(&doc.c[idy], self.attr)
- result = map_get(current_node, inner_token)
- if result:
- current_node = result
- idy += 1
- else:
- break
- else:
- # end of doc reached
- result = map_get(current_node, self._terminal_hash)
- if result:
- i = 0
- while map_iter(result, &i, &key, &value):
- ms = make_matchstruct(key, start, idy)
- matches.push_back(ms)
- current_node = self.c_map
- idx += 1
-
def pipe(self, stream, batch_size=1000, n_threads=-1, return_matches=False,
as_tuples=False):
"""Match a stream of documents, yielding them in turn.
@@ -306,21 +228,53 @@ cdef class PhraseMatcher:
else:
yield doc
- def _convert_to_array(self, Doc doc):
- return [Token.get_struct_attr(&doc.c[i], self.attr) for i in range(len(doc))]
+ def accept_match(self, Doc doc, int start, int end):
+ cdef int i, j
+ cdef Pool mem = Pool()
+ phrase_key = mem.alloc(end-start, sizeof(attr_t))
+ for i, j in enumerate(range(start, end)):
+ phrase_key[i] = doc.c[j].lex.orth
+ cdef hash_t key = hash64(phrase_key, (end-start) * sizeof(attr_t), 0)
+
+ ent_index = self.phrase_ids.get(key)
+ if ent_index == 0:
+ return None
+ return self.ent_id_matrix[ent_index]
+
+ def get_lex_value(self, Doc doc, int i):
+ if self.attr == ORTH:
+ # Return the regular orth value of the lexeme
+ return doc.c[i].lex.orth
+ # Get the attribute value instead, e.g. token.pos
+ attr_value = get_token_attr(&doc.c[i], self.attr)
+ if attr_value in (0, 1):
+ # Value is boolean, convert to string
+ string_attr_value = str(attr_value)
+ else:
+ string_attr_value = self.vocab.strings[attr_value]
+ string_attr_name = self.vocab.strings[self.attr]
+ # Concatenate the attr name and value to not pollute lexeme space
+ # e.g. 'POS-VERB' instead of just 'VERB', which could otherwise
+ # create false positive matches
+ return "matcher:{}-{}".format(string_attr_name, string_attr_value)
-def unpickle_matcher(vocab, docs, callbacks, attr):
- matcher = PhraseMatcher(vocab, attr=attr)
+def get_biluo(length):
+ if length == 0:
+ raise ValueError(Errors.E127)
+ elif length == 1:
+ return [U_ENT]
+ elif length == 2:
+ return [B2_ENT, L2_ENT]
+ elif length == 3:
+ return [B3_ENT, I3_ENT, L3_ENT]
+ else:
+ return [B4_ENT, I4_ENT] + [I4_ENT] * (length-3) + [L4_ENT]
+
+
+def unpickle_matcher(vocab, docs, callbacks):
+ matcher = PhraseMatcher(vocab)
for key, specs in docs.items():
callback = callbacks.get(key, None)
matcher.add(key, callback, *specs)
return matcher
-
-
-cdef MatchStruct make_matchstruct(key_t match_id, int start, int end) nogil:
- cdef MatchStruct ms
- ms.match_id = match_id
- ms.start = start
- ms.end = end
- return ms
diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd
index 1a3cedf97..d0110b300 100644
--- a/spacy/morphology.pxd
+++ b/spacy/morphology.pxd
@@ -1,41 +1,301 @@
from cymem.cymem cimport Pool
-from preshed.maps cimport PreshMap, PreshMapArray
+from preshed.maps cimport PreshMapArray
from libc.stdint cimport uint64_t
-from murmurhash cimport mrmr
-from .structs cimport TokenC, MorphAnalysisC
+from .structs cimport TokenC
from .strings cimport StringStore
-from .typedefs cimport hash_t, attr_t, flags_t
+from .typedefs cimport attr_t, flags_t
from .parts_of_speech cimport univ_pos_t
from . cimport symbols
+
+cdef struct RichTagC:
+ uint64_t morph
+ int id
+ univ_pos_t pos
+ attr_t name
+
+
+cdef struct MorphAnalysisC:
+ RichTagC tag
+ attr_t lemma
+
+
cdef class Morphology:
cdef readonly Pool mem
cdef readonly StringStore strings
- cdef PreshMap tags # Keyed by hash, value is pointer to tag
-
cdef public object lemmatizer
cdef readonly object tag_map
- cdef readonly object tag_names
- cdef readonly object reverse_index
- cdef readonly object exc
- cdef readonly object _feat_map
- cdef readonly PreshMapArray _cache
- cdef readonly int n_tags
+ cdef public object n_tags
+ cdef public object reverse_index
+ cdef public object tag_names
+ cdef public object exc
+
+ cdef RichTagC* rich_tags
+ cdef PreshMapArray _cache
- cpdef update(self, hash_t morph, features)
- cdef hash_t insert(self, MorphAnalysisC tag) except 0
-
cdef int assign_untagged(self, TokenC* token) except -1
+
cdef int assign_tag(self, TokenC* token, tag) except -1
+
cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1
- cdef int _assign_tag_from_exceptions(self, TokenC* token, int tag_id) except -1
+ cdef int assign_feature(self, uint64_t* morph, univ_morph_t feat_id, bint value) except -1
-cdef int check_feature(const MorphAnalysisC* tag, attr_t feature) nogil
-cdef attr_t get_field(const MorphAnalysisC* tag, int field) nogil
-cdef list list_features(const MorphAnalysisC* tag)
+cdef enum univ_morph_t:
+ NIL = 0
+ Animacy_anim = symbols.Animacy_anim
+ Animacy_inan
+ Animacy_hum
+ Animacy_nhum
+ Aspect_freq
+ Aspect_imp
+ Aspect_mod
+ Aspect_none
+ Aspect_perf
+ Case_abe
+ Case_abl
+ Case_abs
+ Case_acc
+ Case_ade
+ Case_all
+ Case_cau
+ Case_com
+ Case_dat
+ Case_del
+ Case_dis
+ Case_ela
+ Case_ess
+ Case_gen
+ Case_ill
+ Case_ine
+ Case_ins
+ Case_loc
+ Case_lat
+ Case_nom
+ Case_par
+ Case_sub
+ Case_sup
+ Case_tem
+ Case_ter
+ Case_tra
+ Case_voc
+ Definite_two
+ Definite_def
+ Definite_red
+ Definite_cons # U20
+ Definite_ind
+ Degree_cmp
+ Degree_comp
+ Degree_none
+ Degree_pos
+ Degree_sup
+ Degree_abs
+ Degree_com
+ Degree_dim # du
+ Gender_com
+ Gender_fem
+ Gender_masc
+ Gender_neut
+ Mood_cnd
+ Mood_imp
+ Mood_ind
+ Mood_n
+ Mood_pot
+ Mood_sub
+ Mood_opt
+ Negative_neg
+ Negative_pos
+ Negative_yes
+ Polarity_neg # U20
+ Polarity_pos # U20
+ Number_com
+ Number_dual
+ Number_none
+ Number_plur
+ Number_sing
+ Number_ptan # bg
+ Number_count # bg
+ NumType_card
+ NumType_dist
+ NumType_frac
+ NumType_gen
+ NumType_mult
+ NumType_none
+ NumType_ord
+ NumType_sets
+ Person_one
+ Person_two
+ Person_three
+ Person_none
+ Poss_yes
+ PronType_advPart
+ PronType_art
+ PronType_default
+ PronType_dem
+ PronType_ind
+ PronType_int
+ PronType_neg
+ PronType_prs
+ PronType_rcp
+ PronType_rel
+ PronType_tot
+ PronType_clit
+ PronType_exc # es, ca, it, fa
+ Reflex_yes
+ Tense_fut
+ Tense_imp
+ Tense_past
+ Tense_pres
+ VerbForm_fin
+ VerbForm_ger
+ VerbForm_inf
+ VerbForm_none
+ VerbForm_part
+ VerbForm_partFut
+ VerbForm_partPast
+ VerbForm_partPres
+ VerbForm_sup
+ VerbForm_trans
+ VerbForm_conv # U20
+ VerbForm_gdv # la
+ Voice_act
+ Voice_cau
+ Voice_pass
+ Voice_mid # gkc
+ Voice_int # hb
+ Abbr_yes # cz, fi, sl, U
+ AdpType_prep # cz, U
+ AdpType_post # U
+ AdpType_voc # cz
+ AdpType_comprep # cz
+ AdpType_circ # U
+ AdvType_man
+ AdvType_loc
+ AdvType_tim
+ AdvType_deg
+ AdvType_cau
+ AdvType_mod
+ AdvType_sta
+ AdvType_ex
+ AdvType_adadj
+ ConjType_oper # cz, U
+ ConjType_comp # cz, U
+ Connegative_yes # fi
+ Derivation_minen # fi
+ Derivation_sti # fi
+ Derivation_inen # fi
+ Derivation_lainen # fi
+ Derivation_ja # fi
+ Derivation_ton # fi
+ Derivation_vs # fi
+ Derivation_ttain # fi
+ Derivation_ttaa # fi
+ Echo_rdp # U
+ Echo_ech # U
+ Foreign_foreign # cz, fi, U
+ Foreign_fscript # cz, fi, U
+ Foreign_tscript # cz, U
+ Foreign_yes # sl
+ Gender_dat_masc # bq, U
+ Gender_dat_fem # bq, U
+ Gender_erg_masc # bq
+ Gender_erg_fem # bq
+ Gender_psor_masc # cz, sl, U
+ Gender_psor_fem # cz, sl, U
+ Gender_psor_neut # sl
+ Hyph_yes # cz, U
+ InfForm_one # fi
+ InfForm_two # fi
+ InfForm_three # fi
+ NameType_geo # U, cz
+ NameType_prs # U, cz
+ NameType_giv # U, cz
+ NameType_sur # U, cz
+ NameType_nat # U, cz
+ NameType_com # U, cz
+ NameType_pro # U, cz
+ NameType_oth # U, cz
+ NounType_com # U
+ NounType_prop # U
+ NounType_class # U
+ Number_abs_sing # bq, U
+ Number_abs_plur # bq, U
+ Number_dat_sing # bq, U
+ Number_dat_plur # bq, U
+ Number_erg_sing # bq, U
+ Number_erg_plur # bq, U
+ Number_psee_sing # U
+ Number_psee_plur # U
+ Number_psor_sing # cz, fi, sl, U
+ Number_psor_plur # cz, fi, sl, U
+ NumForm_digit # cz, sl, U
+ NumForm_roman # cz, sl, U
+ NumForm_word # cz, sl, U
+ NumValue_one # cz, U
+ NumValue_two # cz, U
+ NumValue_three # cz, U
+ PartForm_pres # fi
+ PartForm_past # fi
+ PartForm_agt # fi
+ PartForm_neg # fi
+ PartType_mod # U
+ PartType_emp # U
+ PartType_res # U
+ PartType_inf # U
+ PartType_vbp # U
+ Person_abs_one # bq, U
+ Person_abs_two # bq, U
+ Person_abs_three # bq, U
+ Person_dat_one # bq, U
+ Person_dat_two # bq, U
+ Person_dat_three # bq, U
+ Person_erg_one # bq, U
+ Person_erg_two # bq, U
+ Person_erg_three # bq, U
+ Person_psor_one # fi, U
+ Person_psor_two # fi, U
+ Person_psor_three # fi, U
+ Polite_inf # bq, U
+ Polite_pol # bq, U
+ Polite_abs_inf # bq, U
+ Polite_abs_pol # bq, U
+ Polite_erg_inf # bq, U
+ Polite_erg_pol # bq, U
+ Polite_dat_inf # bq, U
+ Polite_dat_pol # bq, U
+ Prefix_yes # U
+ PrepCase_npr # cz
+ PrepCase_pre # U
+ PunctSide_ini # U
+ PunctSide_fin # U
+ PunctType_peri # U
+ PunctType_qest # U
+ PunctType_excl # U
+ PunctType_quot # U
+ PunctType_brck # U
+ PunctType_comm # U
+ PunctType_colo # U
+ PunctType_semi # U
+ PunctType_dash # U
+ Style_arch # cz, fi, U
+ Style_rare # cz, fi, U
+ Style_poet # cz, U
+ Style_norm # cz, U
+ Style_coll # cz, U
+ Style_vrnc # cz, U
+ Style_sing # cz, U
+ Style_expr # cz, U
+ Style_derg # cz, U
+ Style_vulg # cz, U
+ Style_yes # fi, U
+ StyleVariant_styleShort # cz
+ StyleVariant_styleBound # cz, sl
+ VerbType_aux # U
+ VerbType_cop # U
+ VerbType_mod # U
+ VerbType_light # U
+
-cdef tag_to_json(const MorphAnalysisC* tag)
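
RichTagC above stores the morphological analysis as a single uint64_t bit field, and the assign_feature method declared in the class body sets or clears one bit per feature. A rough Python sketch of that set/clear/check pattern, using small illustrative bit positions rather than the real enum values, which are derived from the symbols table:

    Animacy_anim, Animacy_inan = 1, 2   # stand-in feature ids, not the cdef enum

    def set_feature(morph, feat_id, value):
        # Mirror assign_feature: set the bit when value is true, clear it otherwise.
        if value:
            return morph | (1 << feat_id)
        return morph & ~(1 << feat_id)

    def has_feature(morph, feat_id):
        return bool(morph & (1 << feat_id))

    morph = set_feature(0, Animacy_anim, True)
    assert has_feature(morph, Animacy_anim)
    assert not has_feature(morph, Animacy_inan)
    assert set_feature(morph, Animacy_anim, False) == 0
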
diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
index c146094a9..e9de621c8 100644
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -3,83 +3,18 @@
from __future__ import unicode_literals
from libc.string cimport memset
-import srsly
-from collections import Counter
-from .compat import basestring_
-from .strings import get_string_id
-from . import symbols
from .attrs cimport POS, IS_SPACE
from .attrs import LEMMA, intify_attrs
from .parts_of_speech cimport SPACE
from .parts_of_speech import IDS as POS_IDS
from .lexeme cimport Lexeme
from .errors import Errors
-from .util import ensure_path
-
-
-cdef enum univ_field_t:
- Field_POS
- Field_Abbr
- Field_AdpType
- Field_AdvType
- Field_Animacy
- Field_Aspect
- Field_Case
- Field_ConjType
- Field_Connegative
- Field_Definite
- Field_Degree
- Field_Derivation
- Field_Echo
- Field_Foreign
- Field_Gender
- Field_Hyph
- Field_InfForm
- Field_Mood
- Field_NameType
- Field_Negative
- Field_NounType
- Field_Number
- Field_NumForm
- Field_NumType
- Field_NumValue
- Field_PartForm
- Field_PartType
- Field_Person
- Field_Polarity
- Field_Polite
- Field_Poss
- Field_Prefix
- Field_PrepCase
- Field_PronType
- Field_PunctSide
- Field_PunctType
- Field_Reflex
- Field_Style
- Field_StyleVariant
- Field_Tense
- Field_Typo
- Field_VerbForm
- Field_VerbType
- Field_Voice
def _normalize_props(props):
"""Transform deprecated string keys to correct names."""
out = {}
- props = dict(props)
- for key in FIELDS:
- if key in props:
- value = str(props[key]).lower()
- # We don't have support for disjunctive int|rel features, so
- # just take the first one :(
- if "|" in value:
- value = value.split("|")[0]
- attr = '%s_%s' % (key, value)
- if attr in FEATURES:
- props.pop(key)
- props[attr] = True
for key, value in props.items():
if key == POS:
if hasattr(value, 'upper'):
@@ -89,67 +24,17 @@ def _normalize_props(props):
out[key] = value
elif isinstance(key, int):
out[key] = value
- elif value is True:
- out[key] = value
elif key.lower() == 'pos':
out[POS] = POS_IDS[value.upper()]
- elif key.lower() != 'morph':
+ else:
out[key] = value
return out
-class MorphologyClassMap(object):
- def __init__(self, features):
- self.features = tuple(features)
- self.fields = []
- self.feat2field = {}
- seen_fields = set()
- for feature in features:
- field = feature.split("_", 1)[0]
- if field not in seen_fields:
- self.fields.append(field)
- seen_fields.add(field)
- self.feat2field[feature] = FIELDS[field]
- self.id2feat = {get_string_id(name): name for name in features}
- self.field2feats = {"POS": []}
- self.col2info = []
- self.attr2field = dict(LOWER_FIELDS.items())
- self.feat2offset = {}
- self.field2col = {}
- self.field2id = dict(FIELDS.items())
- self.fieldid2field = {field_id: field for field, field_id in FIELDS.items()}
- for feature in features:
- field = self.fields[self.feat2field[feature]]
- if field not in self.field2col:
- self.field2col[field] = len(self.col2info)
- if field != "POS" and field not in self.field2feats:
- self.col2info.append((field, 0, "NIL"))
- self.field2feats.setdefault(field, ["NIL"])
- offset = len(self.field2feats[field])
- self.field2feats[field].append(feature)
- self.col2info.append((field, offset, feature))
- self.feat2offset[feature] = offset
-
- @property
- def field_sizes(self):
- return [len(self.field2feats[field]) for field in self.fields]
-
- def get_field_offset(self, field):
- return self.field2col[field]
-
-
cdef class Morphology:
- '''Store the possible morphological analyses for a language, and index them
- by hash.
-
- To save space on each token, tokens only know the hash of their morphological
- analysis, so queries of morphological attributes are delegated
- to this class.
- '''
def __init__(self, StringStore string_store, tag_map, lemmatizer, exc=None):
self.mem = Pool()
self.strings = string_store
- self.tags = PreshMap()
# Add special space symbol. We prefix with underscore, to make sure it
# always sorts to the end.
space_attrs = tag_map.get('SP', {POS: SPACE})
@@ -162,109 +47,31 @@ cdef class Morphology:
self.lemmatizer = lemmatizer
self.n_tags = len(tag_map)
self.reverse_index = {}
- self._feat_map = MorphologyClassMap(FEATURES)
- self._load_from_tag_map(tag_map)
+
+ self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags+1, sizeof(RichTagC))
+ for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
+ self.strings.add(tag_str)
+ self.tag_map[tag_str] = dict(attrs)
+ attrs = _normalize_props(attrs)
+ attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
+ self.rich_tags[i].id = i
+ self.rich_tags[i].name = self.strings.add(tag_str)
+ self.rich_tags[i].morph = 0
+ self.rich_tags[i].pos = attrs[POS]
+ self.reverse_index[self.rich_tags[i].name] = i
+ # Add a 'null' tag, which we can reference when assigning morphology to
+ # untagged tokens.
+ self.rich_tags[self.n_tags].id = self.n_tags
self._cache = PreshMapArray(self.n_tags)
self.exc = {}
if exc is not None:
- for (tag, orth), attrs in exc.items():
- attrs = _normalize_props(attrs)
- self.add_special_case(
- self.strings.as_string(tag), self.strings.as_string(orth), attrs)
-
- def _load_from_tag_map(self, tag_map):
- for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
- attrs = _normalize_props(attrs)
- self.add({self._feat_map.id2feat[feat] for feat in attrs
- if feat in self._feat_map.id2feat})
- self.tag_map[tag_str] = dict(attrs)
- self.reverse_index[self.strings.add(tag_str)] = i
+ for (tag_str, orth_str), attrs in exc.items():
+ self.add_special_case(tag_str, orth_str, attrs)
def __reduce__(self):
return (Morphology, (self.strings, self.tag_map, self.lemmatizer,
- self.exc), None, None)
-
- def add(self, features):
- """Insert a morphological analysis in the morphology table, if not already
- present. Returns the hash of the new analysis.
- """
- for f in features:
- if isinstance(f, basestring_):
- self.strings.add(f)
- string_features = features
- features = intify_features(features)
- cdef attr_t feature
- for feature in features:
- if feature != 0 and feature not in self._feat_map.id2feat:
- raise ValueError(Errors.E167.format(feat=self.strings[feature], feat_id=feature))
- cdef MorphAnalysisC tag
- tag = create_rich_tag(features)
- cdef hash_t key = self.insert(tag)
- return key
-
- def get(self, hash_t morph):
- tag = self.tags.get(morph)
- if tag == NULL:
- return []
- else:
- return tag_to_json(tag)
-
- cpdef update(self, hash_t morph, features):
- """Update a morphological analysis with new feature values."""
- tag = (<MorphAnalysisC*>self.tags.get(morph))[0]
- features = intify_features(features)
- cdef attr_t feature
- for feature in features:
- field = FEATURE_FIELDS[FEATURE_NAMES[feature]]
- set_feature(&tag, field, feature, 1)
- morph = self.insert(tag)
- return morph
-
- def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, morphology):
- if orth not in self.strings:
- return orth
- cdef unicode py_string = self.strings[orth]
- if self.lemmatizer is None:
- return self.strings.add(py_string.lower())
- cdef list lemma_strings
- cdef unicode lemma_string
- # Normalize features into a dict keyed by the field, to make life easier
- # for the lemmatizer. Handles string-to-int conversion too.
- string_feats = {}
- for key, value in morphology.items():
- if value is True:
- name, value = self.strings.as_string(key).split('_', 1)
- string_feats[name] = value
- else:
- string_feats[self.strings.as_string(key)] = self.strings.as_string(value)
- lemma_strings = self.lemmatizer(py_string, univ_pos, string_feats)
- lemma_string = lemma_strings[0]
- lemma = self.strings.add(lemma_string)
- return lemma
-
- def add_special_case(self, unicode tag_str, unicode orth_str, attrs,
- force=False):
- """Add a special-case rule to the morphological analyser. Tokens whose
- tag and orth match the rule will receive the specified properties.
-
- tag (unicode): The part-of-speech tag to key the exception.
- orth (unicode): The word-form to key the exception.
- """
- attrs = dict(attrs)
- attrs = _normalize_props(attrs)
- self.add({self._feat_map.id2feat[feat] for feat in attrs
- if feat in self._feat_map.id2feat})
- attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
- self.exc[(tag_str, self.strings.add(orth_str))] = attrs
-
- cdef hash_t insert(self, MorphAnalysisC tag) except 0:
- cdef hash_t key = hash_tag(tag)
- if self.tags.get(key) == NULL:
- tag_ptr = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
- tag_ptr[0] = tag
- self.tags.set(key, tag_ptr)
- return key
+ self.exc), None, None)
cdef int assign_untagged(self, TokenC* token) except -1:
"""Set morphological attributes on a token without a POS tag. Uses
@@ -273,11 +80,12 @@ cdef class Morphology:
"""
if token.lemma == 0:
orth_str = self.strings[token.lex.orth]
- lemma = self.lemmatizer.lookup(orth_str, orth=token.lex.orth)
+ lemma = self.lemmatizer.lookup(orth_str)
token.lemma = self.strings.add(lemma)
- cdef int assign_tag(self, TokenC* token, tag_str) except -1:
- cdef attr_t tag = self.strings.as_int(tag_str)
+ cdef int assign_tag(self, TokenC* token, tag) except -1:
+ if isinstance(tag, basestring):
+ tag = self.strings.add(tag)
if tag in self.reverse_index:
tag_id = self.reverse_index[tag]
self.assign_tag_id(token, tag_id)
@@ -287,821 +95,351 @@ cdef class Morphology:
cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1:
if tag_id > self.n_tags:
raise ValueError(Errors.E014.format(tag=tag_id))
- # Ensure spaces get tagged as space.
- # It seems pretty arbitrary to put this logic here, but there's really
- # nowhere better. I guess the justification is that this is where the
- # specific word and the tag interact. Still, we should have a better
- # way to enforce this rule, or figure out why the statistical model fails.
- # Related to Issue #220
+ # TODO: It's pretty arbitrary to put this logic here. I guess the
+ # justification is that this is where the specific word and the tag
+ # interact. Still, we should have a better way to enforce this rule, or
+ # figure out why the statistical model fails. Related to Issue #220
if Lexeme.c_check_flag(token.lex, IS_SPACE):
tag_id = self.reverse_index[self.strings.add('_SP')]
- tag_str = self.tag_names[tag_id]
- features = dict(self.tag_map.get(tag_str, {}))
- if features:
- pos = self.strings.as_int(features.pop(POS))
- else:
- pos = 0
- cdef attr_t lemma = <attr_t>self._cache.get(tag_id, token.lex.orth)
- if lemma == 0:
- # Ugh, self.lemmatize has opposite arg order from self.lemmatizer :(
- lemma = self.lemmatize(pos, token.lex.orth, features)
- self._cache.set(tag_id, token.lex.orth, lemma)
- token.lemma = lemma
- token.pos = pos
- token.tag = self.strings[tag_str]
- token.morph = self.add(features)
- if (self.tag_names[tag_id], token.lex.orth) in self.exc:
- self._assign_tag_from_exceptions(token, tag_id)
+ rich_tag = self.rich_tags[tag_id]
+ analysis = <MorphAnalysisC*>self._cache.get(tag_id, token.lex.orth)
+ if analysis is NULL:
+ analysis = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
+ tag_str = self.strings[self.rich_tags[tag_id].name]
+ analysis.tag = rich_tag
+ analysis.lemma = self.lemmatize(analysis.tag.pos, token.lex.orth,
+ self.tag_map.get(tag_str, {}))
- cdef int _assign_tag_from_exceptions(self, TokenC* token, int tag_id) except -1:
- key = (self.tag_names[tag_id], token.lex.orth)
- cdef dict attrs
- attrs = self.exc[key]
- token.pos = attrs.get(POS, token.pos)
- token.lemma = attrs.get(LEMMA, token.lemma)
+ self._cache.set(tag_id, token.lex.orth, analysis)
+ if token.lemma == 0:
+ token.lemma = analysis.lemma
+ token.pos = analysis.tag.pos
+ token.tag = analysis.tag.name
+ token.morph = analysis.tag.morph
+
+ cdef int assign_feature(self, uint64_t* flags, univ_morph_t flag_id, bint value) except -1:
+ cdef flags_t one = 1
+ if value:
+ flags[0] |= one << flag_id
+ else:
+ flags[0] &= ~(one << flag_id)
+
+ def add_special_case(self, unicode tag_str, unicode orth_str, attrs,
+ force=False):
+ """Add a special-case rule to the morphological analyser. Tokens whose
+ tag and orth match the rule will receive the specified properties.
+
+ tag (unicode): The part-of-speech tag to key the exception.
+ orth (unicode): The word-form to key the exception.
+ """
+ # TODO: Currently we've assumed that we know the number of tags --
+ # RichTagC is an array, and _cache is a PreshMapArray
+ # This is really bad: it makes the morphology typed to the tagger
+ # classes, which is all wrong.
+ self.exc[(tag_str, orth_str)] = dict(attrs)
+ tag = self.strings.add(tag_str)
+ if tag not in self.reverse_index:
+ return
+ tag_id = self.reverse_index[tag]
+ orth = self.strings.add(orth_str)
+ cdef RichTagC rich_tag = self.rich_tags[tag_id]
+ attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
+ cached = <MorphAnalysisC*>self._cache.get(tag_id, orth)
+ if cached is NULL:
+ cached = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
+ elif force:
+ memset(cached, 0, sizeof(cached[0]))
+ else:
+ raise ValueError(Errors.E015.format(tag=tag_str, orth=orth_str))
+
+ cached.tag = rich_tag
+ # TODO: Refactor this to take arbitrary attributes.
+ for name_id, value_id in attrs.items():
+ if name_id == LEMMA:
+ cached.lemma = value_id
+ else:
+ self.assign_feature(&cached.tag.morph, name_id, value_id)
+ if cached.lemma == 0:
+ cached.lemma = self.lemmatize(rich_tag.pos, orth, attrs)
+ self._cache.set(tag_id, orth, cached)
def load_morph_exceptions(self, dict exc):
- # Map (form, pos) to attributes
+ # Map (form, pos) to (lemma, rich tag)
for tag_str, entries in exc.items():
for form_str, attrs in entries.items():
self.add_special_case(tag_str, form_str, attrs)
- @classmethod
- def create_class_map(cls):
- return MorphologyClassMap(FEATURES)
+ def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, morphology):
+ if orth not in self.strings:
+ return orth
+ cdef unicode py_string = self.strings[orth]
+ if self.lemmatizer is None:
+ return self.strings.add(py_string.lower())
+ cdef list lemma_strings
+ cdef unicode lemma_string
+ lemma_strings = self.lemmatizer(py_string, univ_pos, morphology)
+ lemma_string = lemma_strings[0]
+ lemma = self.strings.add(lemma_string)
+ return lemma
-cpdef univ_pos_t get_int_tag(pos_):
- return 0
-
-cpdef intify_features(features):
- return {get_string_id(feature) for feature in features}
-
-cdef hash_t hash_tag(MorphAnalysisC tag) nogil:
- return mrmr.hash64(&tag, sizeof(tag), 0)
-
-
-cdef MorphAnalysisC create_rich_tag(features) except *:
- cdef MorphAnalysisC tag
- cdef attr_t feature
- memset(&tag, 0, sizeof(tag))
- for feature in features:
- field = FEATURE_FIELDS[FEATURE_NAMES[feature]]
- set_feature(&tag, field, feature, 1)
- return tag
-
-
-cdef tag_to_json(const MorphAnalysisC* tag):
- return [FEATURE_NAMES[f] for f in list_features(tag)]
-
-
-cdef MorphAnalysisC tag_from_json(json_tag):
- raise NotImplementedError
-
-
-cdef list list_features(const MorphAnalysisC* tag):
- output = []
- if tag.abbr != 0:
- output.append(tag.abbr)
- if tag.adp_type != 0:
- output.append(tag.adp_type)
- if tag.adv_type != 0:
- output.append(tag.adv_type)
- if tag.animacy != 0:
- output.append(tag.animacy)
- if tag.aspect != 0:
- output.append(tag.aspect)
- if tag.case != 0:
- output.append(tag.case)
- if tag.conj_type != 0:
- output.append(tag.conj_type)
- if tag.connegative != 0:
- output.append(tag.connegative)
- if tag.definite != 0:
- output.append(tag.definite)
- if tag.degree != 0:
- output.append(tag.degree)
- if tag.derivation != 0:
- output.append(tag.derivation)
- if tag.echo != 0:
- output.append(tag.echo)
- if tag.foreign != 0:
- output.append(tag.foreign)
- if tag.gender != 0:
- output.append(tag.gender)
- if tag.hyph != 0:
- output.append(tag.hyph)
- if tag.inf_form != 0:
- output.append(tag.inf_form)
- if tag.mood != 0:
- output.append(tag.mood)
- if tag.negative != 0:
- output.append(tag.negative)
- if tag.number != 0:
- output.append(tag.number)
- if tag.name_type != 0:
- output.append(tag.name_type)
- if tag.noun_type != 0:
- output.append(tag.noun_type)
- if tag.part_form != 0:
- output.append(tag.part_form)
- if tag.part_type != 0:
- output.append(tag.part_type)
- if tag.person != 0:
- output.append(tag.person)
- if tag.polite != 0:
- output.append(tag.polite)
- if tag.polarity != 0:
- output.append(tag.polarity)
- if tag.poss != 0:
- output.append(tag.poss)
- if tag.prefix != 0:
- output.append(tag.prefix)
- if tag.prep_case != 0:
- output.append(tag.prep_case)
- if tag.pron_type != 0:
- output.append(tag.pron_type)
- if tag.punct_type != 0:
- output.append(tag.punct_type)
- if tag.reflex != 0:
- output.append(tag.reflex)
- if tag.style != 0:
- output.append(tag.style)
- if tag.style_variant != 0:
- output.append(tag.style_variant)
- if tag.typo != 0:
- output.append(tag.typo)
- if tag.verb_form != 0:
- output.append(tag.verb_form)
- if tag.voice != 0:
- output.append(tag.voice)
- if tag.verb_type != 0:
- output.append(tag.verb_type)
- return output
-
-
-cdef attr_t get_field(const MorphAnalysisC* tag, int field_id) nogil:
- field = field_id
- if field == Field_POS:
- return tag.pos
- if field == Field_Abbr:
- return tag.abbr
- elif field == Field_AdpType:
- return tag.adp_type
- elif field == Field_AdvType:
- return tag.adv_type
- elif field == Field_Animacy:
- return tag.animacy
- elif field == Field_Aspect:
- return tag.aspect
- elif field == Field_Case:
- return tag.case
- elif field == Field_ConjType:
- return tag.conj_type
- elif field == Field_Connegative:
- return tag.connegative
- elif field == Field_Definite:
- return tag.definite
- elif field == Field_Degree:
- return tag.degree
- elif field == Field_Derivation:
- return tag.derivation
- elif field == Field_Echo:
- return tag.echo
- elif field == Field_Foreign:
- return tag.foreign
- elif field == Field_Gender:
- return tag.gender
- elif field == Field_Hyph:
- return tag.hyph
- elif field == Field_InfForm:
- return tag.inf_form
- elif field == Field_Mood:
- return tag.mood
- elif field == Field_Negative:
- return tag.negative
- elif field == Field_Number:
- return tag.number
- elif field == Field_NameType:
- return tag.name_type
- elif field == Field_NounType:
- return tag.noun_type
- elif field == Field_NumForm:
- return tag.num_form
- elif field == Field_NumType:
- return tag.num_type
- elif field == Field_NumValue:
- return tag.num_value
- elif field == Field_PartForm:
- return tag.part_form
- elif field == Field_PartType:
- return tag.part_type
- elif field == Field_Person:
- return tag.person
- elif field == Field_Polite:
- return tag.polite
- elif field == Field_Polarity:
- return tag.polarity
- elif field == Field_Poss:
- return tag.poss
- elif field == Field_Prefix:
- return tag.prefix
- elif field == Field_PrepCase:
- return tag.prep_case
- elif field == Field_PronType:
- return tag.pron_type
- elif field == Field_PunctSide:
- return tag.punct_side
- elif field == Field_PunctType:
- return tag.punct_type
- elif field == Field_Reflex:
- return tag.reflex
- elif field == Field_Style:
- return tag.style
- elif field == Field_StyleVariant:
- return tag.style_variant
- elif field == Field_Tense:
- return tag.tense
- elif field == Field_Typo:
- return tag.typo
- elif field == Field_VerbForm:
- return tag.verb_form
- elif field == Field_Voice:
- return tag.voice
- elif field == Field_VerbType:
- return tag.verb_type
- else:
- raise ValueError(Errors.E168.format(field=field_id))
-
-
-cdef int check_feature(const MorphAnalysisC* tag, attr_t feature) nogil:
- if tag.abbr == feature:
- return 1
- elif tag.adp_type == feature:
- return 1
- elif tag.adv_type == feature:
- return 1
- elif tag.animacy == feature:
- return 1
- elif tag.aspect == feature:
- return 1
- elif tag.case == feature:
- return 1
- elif tag.conj_type == feature:
- return 1
- elif tag.connegative == feature:
- return 1
- elif tag.definite == feature:
- return 1
- elif tag.degree == feature:
- return 1
- elif tag.derivation == feature:
- return 1
- elif tag.echo == feature:
- return 1
- elif tag.foreign == feature:
- return 1
- elif tag.gender == feature:
- return 1
- elif tag.hyph == feature:
- return 1
- elif tag.inf_form == feature:
- return 1
- elif tag.mood == feature:
- return 1
- elif tag.negative == feature:
- return 1
- elif tag.number == feature:
- return 1
- elif tag.name_type == feature:
- return 1
- elif tag.noun_type == feature:
- return 1
- elif tag.num_form == feature:
- return 1
- elif tag.num_type == feature:
- return 1
- elif tag.num_value == feature:
- return 1
- elif tag.part_form == feature:
- return 1
- elif tag.part_type == feature:
- return 1
- elif tag.person == feature:
- return 1
- elif tag.polite == feature:
- return 1
- elif tag.polarity == feature:
- return 1
- elif tag.poss == feature:
- return 1
- elif tag.prefix == feature:
- return 1
- elif tag.prep_case == feature:
- return 1
- elif tag.pron_type == feature:
- return 1
- elif tag.punct_side == feature:
- return 1
- elif tag.punct_type == feature:
- return 1
- elif tag.reflex == feature:
- return 1
- elif tag.style == feature:
- return 1
- elif tag.style_variant == feature:
- return 1
- elif tag.tense == feature:
- return 1
- elif tag.typo == feature:
- return 1
- elif tag.verb_form == feature:
- return 1
- elif tag.voice == feature:
- return 1
- elif tag.verb_type == feature:
- return 1
- else:
- return 0
-
-cdef int set_feature(MorphAnalysisC* tag,
- univ_field_t field, attr_t feature, int value) except -1:
- if value == True:
- value_ = feature
- else:
- value_ = 0
- prev_value = get_field(tag, field)
- if prev_value != 0 and value_ == 0 and field != Field_POS:
- tag.length -= 1
- elif prev_value == 0 and value_ != 0 and field != Field_POS:
- tag.length += 1
- if feature == 0:
- pass
- elif field == Field_POS:
- tag.pos = get_string_id(FEATURE_NAMES[value_].split('_')[1])
- elif field == Field_Abbr:
- tag.abbr = value_
- elif field == Field_AdpType:
- tag.adp_type = value_
- elif field == Field_AdvType:
- tag.adv_type = value_
- elif field == Field_Animacy:
- tag.animacy = value_
- elif field == Field_Aspect:
- tag.aspect = value_
- elif field == Field_Case:
- tag.case = value_
- elif field == Field_ConjType:
- tag.conj_type = value_
- elif field == Field_Connegative:
- tag.connegative = value_
- elif field == Field_Definite:
- tag.definite = value_
- elif field == Field_Degree:
- tag.degree = value_
- elif field == Field_Derivation:
- tag.derivation = value_
- elif field == Field_Echo:
- tag.echo = value_
- elif field == Field_Foreign:
- tag.foreign = value_
- elif field == Field_Gender:
- tag.gender = value_
- elif field == Field_Hyph:
- tag.hyph = value_
- elif field == Field_InfForm:
- tag.inf_form = value_
- elif field == Field_Mood:
- tag.mood = value_
- elif field == Field_Negative:
- tag.negative = value_
- elif field == Field_Number:
- tag.number = value_
- elif field == Field_NameType:
- tag.name_type = value_
- elif field == Field_NounType:
- tag.noun_type = value_
- elif field == Field_NumForm:
- tag.num_form = value_
- elif field == Field_NumType:
- tag.num_type = value_
- elif field == Field_NumValue:
- tag.num_value = value_
- elif field == Field_PartForm:
- tag.part_form = value_
- elif field == Field_PartType:
- tag.part_type = value_
- elif field == Field_Person:
- tag.person = value_
- elif field == Field_Polite:
- tag.polite = value_
- elif field == Field_Polarity:
- tag.polarity = value_
- elif field == Field_Poss:
- tag.poss = value_
- elif field == Field_Prefix:
- tag.prefix = value_
- elif field == Field_PrepCase:
- tag.prep_case = value_
- elif field == Field_PronType:
- tag.pron_type = value_
- elif field == Field_PunctSide:
- tag.punct_side = value_
- elif field == Field_PunctType:
- tag.punct_type = value_
- elif field == Field_Reflex:
- tag.reflex = value_
- elif field == Field_Style:
- tag.style = value_
- elif field == Field_StyleVariant:
- tag.style_variant = value_
- elif field == Field_Tense:
- tag.tense = value_
- elif field == Field_Typo:
- tag.typo = value_
- elif field == Field_VerbForm:
- tag.verb_form = value_
- elif field == Field_Voice:
- tag.voice = value_
- elif field == Field_VerbType:
- tag.verb_type = value_
- else:
- raise ValueError(Errors.E167.format(field=FEATURE_NAMES.get(feature), field_id=feature))
-
-
-FIELDS = {
- 'POS': Field_POS,
- 'Abbr': Field_Abbr,
- 'AdpType': Field_AdpType,
- 'AdvType': Field_AdvType,
- 'Animacy': Field_Animacy,
- 'Aspect': Field_Aspect,
- 'Case': Field_Case,
- 'ConjType': Field_ConjType,
- 'Connegative': Field_Connegative,
- 'Definite': Field_Definite,
- 'Degree': Field_Degree,
- 'Derivation': Field_Derivation,
- 'Echo': Field_Echo,
- 'Foreign': Field_Foreign,
- 'Gender': Field_Gender,
- 'Hyph': Field_Hyph,
- 'InfForm': Field_InfForm,
- 'Mood': Field_Mood,
- 'NameType': Field_NameType,
- 'Negative': Field_Negative,
- 'NounType': Field_NounType,
- 'Number': Field_Number,
- 'NumForm': Field_NumForm,
- 'NumType': Field_NumType,
- 'NumValue': Field_NumValue,
- 'PartForm': Field_PartForm,
- 'PartType': Field_PartType,
- 'Person': Field_Person,
- 'Polite': Field_Polite,
- 'Polarity': Field_Polarity,
- 'Poss': Field_Poss,
- 'Prefix': Field_Prefix,
- 'PrepCase': Field_PrepCase,
- 'PronType': Field_PronType,
- 'PunctSide': Field_PunctSide,
- 'PunctType': Field_PunctType,
- 'Reflex': Field_Reflex,
- 'Style': Field_Style,
- 'StyleVariant': Field_StyleVariant,
- 'Tense': Field_Tense,
- 'Typo': Field_Typo,
- 'VerbForm': Field_VerbForm,
- 'VerbType': Field_VerbType,
- 'Voice': Field_Voice,
-}
-
-LOWER_FIELDS = {
- 'pos': Field_POS,
- 'abbr': Field_Abbr,
- 'adp_type': Field_AdpType,
- 'adv_type': Field_AdvType,
- 'animacy': Field_Animacy,
- 'aspect': Field_Aspect,
- 'case': Field_Case,
- 'conj_type': Field_ConjType,
- 'connegative': Field_Connegative,
- 'definite': Field_Definite,
- 'degree': Field_Degree,
- 'derivation': Field_Derivation,
- 'echo': Field_Echo,
- 'foreign': Field_Foreign,
- 'gender': Field_Gender,
- 'hyph': Field_Hyph,
- 'inf_form': Field_InfForm,
- 'mood': Field_Mood,
- 'name_type': Field_NameType,
- 'negative': Field_Negative,
- 'noun_type': Field_NounType,
- 'number': Field_Number,
- 'num_form': Field_NumForm,
- 'num_type': Field_NumType,
- 'num_value': Field_NumValue,
- 'part_form': Field_PartForm,
- 'part_type': Field_PartType,
- 'person': Field_Person,
- 'polarity': Field_Polarity,
- 'polite': Field_Polite,
- 'poss': Field_Poss,
- 'prefix': Field_Prefix,
- 'prep_case': Field_PrepCase,
- 'pron_type': Field_PronType,
- 'punct_side': Field_PunctSide,
- 'punct_type': Field_PunctType,
- 'reflex': Field_Reflex,
- 'style': Field_Style,
- 'style_variant': Field_StyleVariant,
- 'tense': Field_Tense,
- 'typo': Field_Typo,
- 'verb_form': Field_VerbForm,
- 'verb_type': Field_VerbType,
- 'voice': Field_Voice,
+IDS = {
+ "Animacy_anim": Animacy_anim,
+ "Animacy_inan": Animacy_inan,
+ "Animacy_hum": Animacy_hum, # U20
+ "Animacy_nhum": Animacy_nhum,
+ "Aspect_freq": Aspect_freq,
+ "Aspect_imp": Aspect_imp,
+ "Aspect_mod": Aspect_mod,
+ "Aspect_none": Aspect_none,
+ "Aspect_perf": Aspect_perf,
+ "Case_abe": Case_abe,
+ "Case_abl": Case_abl,
+ "Case_abs": Case_abs,
+ "Case_acc": Case_acc,
+ "Case_ade": Case_ade,
+ "Case_all": Case_all,
+ "Case_cau": Case_cau,
+ "Case_com": Case_com,
+ "Case_dat": Case_dat,
+ "Case_del": Case_del,
+ "Case_dis": Case_dis,
+ "Case_ela": Case_ela,
+ "Case_ess": Case_ess,
+ "Case_gen": Case_gen,
+ "Case_ill": Case_ill,
+ "Case_ine": Case_ine,
+ "Case_ins": Case_ins,
+ "Case_loc": Case_loc,
+ "Case_lat": Case_lat,
+ "Case_nom": Case_nom,
+ "Case_par": Case_par,
+ "Case_sub": Case_sub,
+ "Case_sup": Case_sup,
+ "Case_tem": Case_tem,
+ "Case_ter": Case_ter,
+ "Case_tra": Case_tra,
+ "Case_voc": Case_voc,
+ "Definite_two": Definite_two,
+ "Definite_def": Definite_def,
+ "Definite_red": Definite_red,
+ "Definite_cons": Definite_cons, # U20
+ "Definite_ind": Definite_ind,
+ "Degree_cmp": Degree_cmp,
+ "Degree_comp": Degree_comp,
+ "Degree_none": Degree_none,
+ "Degree_pos": Degree_pos,
+ "Degree_sup": Degree_sup,
+ "Degree_abs": Degree_abs,
+ "Degree_com": Degree_com,
+ "Degree_dim ": Degree_dim, # du
+ "Gender_com": Gender_com,
+ "Gender_fem": Gender_fem,
+ "Gender_masc": Gender_masc,
+ "Gender_neut": Gender_neut,
+ "Mood_cnd": Mood_cnd,
+ "Mood_imp": Mood_imp,
+ "Mood_ind": Mood_ind,
+ "Mood_n": Mood_n,
+ "Mood_pot": Mood_pot,
+ "Mood_sub": Mood_sub,
+ "Mood_opt": Mood_opt,
+ "Negative_neg": Negative_neg,
+ "Negative_pos": Negative_pos,
+ "Negative_yes": Negative_yes,
+ "Polarity_neg": Polarity_neg, # U20
+ "Polarity_pos": Polarity_pos, # U20
+ "Number_com": Number_com,
+ "Number_dual": Number_dual,
+ "Number_none": Number_none,
+ "Number_plur": Number_plur,
+ "Number_sing": Number_sing,
+ "Number_ptan ": Number_ptan, # bg
+ "Number_count ": Number_count, # bg
+ "NumType_card": NumType_card,
+ "NumType_dist": NumType_dist,
+ "NumType_frac": NumType_frac,
+ "NumType_gen": NumType_gen,
+ "NumType_mult": NumType_mult,
+ "NumType_none": NumType_none,
+ "NumType_ord": NumType_ord,
+ "NumType_sets": NumType_sets,
+ "Person_one": Person_one,
+ "Person_two": Person_two,
+ "Person_three": Person_three,
+ "Person_none": Person_none,
+ "Poss_yes": Poss_yes,
+ "PronType_advPart": PronType_advPart,
+ "PronType_art": PronType_art,
+ "PronType_default": PronType_default,
+ "PronType_dem": PronType_dem,
+ "PronType_ind": PronType_ind,
+ "PronType_int": PronType_int,
+ "PronType_neg": PronType_neg,
+ "PronType_prs": PronType_prs,
+ "PronType_rcp": PronType_rcp,
+ "PronType_rel": PronType_rel,
+ "PronType_tot": PronType_tot,
+ "PronType_clit": PronType_clit,
+ "PronType_exc ": PronType_exc, # es, ca, it, fa,
+ "Reflex_yes": Reflex_yes,
+ "Tense_fut": Tense_fut,
+ "Tense_imp": Tense_imp,
+ "Tense_past": Tense_past,
+ "Tense_pres": Tense_pres,
+ "VerbForm_fin": VerbForm_fin,
+ "VerbForm_ger": VerbForm_ger,
+ "VerbForm_inf": VerbForm_inf,
+ "VerbForm_none": VerbForm_none,
+ "VerbForm_part": VerbForm_part,
+ "VerbForm_partFut": VerbForm_partFut,
+ "VerbForm_partPast": VerbForm_partPast,
+ "VerbForm_partPres": VerbForm_partPres,
+ "VerbForm_sup": VerbForm_sup,
+ "VerbForm_trans": VerbForm_trans,
+ "VerbForm_conv": VerbForm_conv, # U20
+ "VerbForm_gdv ": VerbForm_gdv, # la,
+ "Voice_act": Voice_act,
+ "Voice_cau": Voice_cau,
+ "Voice_pass": Voice_pass,
+ "Voice_mid ": Voice_mid, # gkc,
+ "Voice_int ": Voice_int, # hb,
+ "Abbr_yes ": Abbr_yes, # cz, fi, sl, U,
+ "AdpType_prep ": AdpType_prep, # cz, U,
+ "AdpType_post ": AdpType_post, # U,
+ "AdpType_voc ": AdpType_voc, # cz,
+ "AdpType_comprep ": AdpType_comprep, # cz,
+ "AdpType_circ ": AdpType_circ, # U,
+ "AdvType_man": AdvType_man,
+ "AdvType_loc": AdvType_loc,
+ "AdvType_tim": AdvType_tim,
+ "AdvType_deg": AdvType_deg,
+ "AdvType_cau": AdvType_cau,
+ "AdvType_mod": AdvType_mod,
+ "AdvType_sta": AdvType_sta,
+ "AdvType_ex": AdvType_ex,
+ "AdvType_adadj": AdvType_adadj,
+ "ConjType_oper ": ConjType_oper, # cz, U,
+ "ConjType_comp ": ConjType_comp, # cz, U,
+ "Connegative_yes ": Connegative_yes, # fi,
+ "Derivation_minen ": Derivation_minen, # fi,
+ "Derivation_sti ": Derivation_sti, # fi,
+ "Derivation_inen ": Derivation_inen, # fi,
+ "Derivation_lainen ": Derivation_lainen, # fi,
+ "Derivation_ja ": Derivation_ja, # fi,
+ "Derivation_ton ": Derivation_ton, # fi,
+ "Derivation_vs ": Derivation_vs, # fi,
+ "Derivation_ttain ": Derivation_ttain, # fi,
+ "Derivation_ttaa ": Derivation_ttaa, # fi,
+ "Echo_rdp ": Echo_rdp, # U,
+ "Echo_ech ": Echo_ech, # U,
+ "Foreign_foreign ": Foreign_foreign, # cz, fi, U,
+ "Foreign_fscript ": Foreign_fscript, # cz, fi, U,
+ "Foreign_tscript ": Foreign_tscript, # cz, U,
+ "Foreign_yes ": Foreign_yes, # sl,
+ "Gender_dat_masc ": Gender_dat_masc, # bq, U,
+ "Gender_dat_fem ": Gender_dat_fem, # bq, U,
+ "Gender_erg_masc ": Gender_erg_masc, # bq,
+ "Gender_erg_fem ": Gender_erg_fem, # bq,
+ "Gender_psor_masc ": Gender_psor_masc, # cz, sl, U,
+ "Gender_psor_fem ": Gender_psor_fem, # cz, sl, U,
+ "Gender_psor_neut ": Gender_psor_neut, # sl,
+ "Hyph_yes ": Hyph_yes, # cz, U,
+ "InfForm_one ": InfForm_one, # fi,
+ "InfForm_two ": InfForm_two, # fi,
+ "InfForm_three ": InfForm_three, # fi,
+ "NameType_geo ": NameType_geo, # U, cz,
+ "NameType_prs ": NameType_prs, # U, cz,
+ "NameType_giv ": NameType_giv, # U, cz,
+ "NameType_sur ": NameType_sur, # U, cz,
+ "NameType_nat ": NameType_nat, # U, cz,
+ "NameType_com ": NameType_com, # U, cz,
+ "NameType_pro ": NameType_pro, # U, cz,
+ "NameType_oth ": NameType_oth, # U, cz,
+ "NounType_com ": NounType_com, # U,
+ "NounType_prop ": NounType_prop, # U,
+ "NounType_class ": NounType_class, # U,
+ "Number_abs_sing ": Number_abs_sing, # bq, U,
+ "Number_abs_plur ": Number_abs_plur, # bq, U,
+ "Number_dat_sing ": Number_dat_sing, # bq, U,
+ "Number_dat_plur ": Number_dat_plur, # bq, U,
+ "Number_erg_sing ": Number_erg_sing, # bq, U,
+ "Number_erg_plur ": Number_erg_plur, # bq, U,
+ "Number_psee_sing ": Number_psee_sing, # U,
+ "Number_psee_plur ": Number_psee_plur, # U,
+ "Number_psor_sing ": Number_psor_sing, # cz, fi, sl, U,
+ "Number_psor_plur ": Number_psor_plur, # cz, fi, sl, U,
+ "NumForm_digit ": NumForm_digit, # cz, sl, U,
+ "NumForm_roman ": NumForm_roman, # cz, sl, U,
+ "NumForm_word ": NumForm_word, # cz, sl, U,
+ "NumValue_one ": NumValue_one, # cz, U,
+ "NumValue_two ": NumValue_two, # cz, U,
+ "NumValue_three ": NumValue_three, # cz, U,
+ "PartForm_pres ": PartForm_pres, # fi,
+ "PartForm_past ": PartForm_past, # fi,
+ "PartForm_agt ": PartForm_agt, # fi,
+ "PartForm_neg ": PartForm_neg, # fi,
+ "PartType_mod ": PartType_mod, # U,
+ "PartType_emp ": PartType_emp, # U,
+ "PartType_res ": PartType_res, # U,
+ "PartType_inf ": PartType_inf, # U,
+ "PartType_vbp ": PartType_vbp, # U,
+ "Person_abs_one ": Person_abs_one, # bq, U,
+ "Person_abs_two ": Person_abs_two, # bq, U,
+ "Person_abs_three ": Person_abs_three, # bq, U,
+ "Person_dat_one ": Person_dat_one, # bq, U,
+ "Person_dat_two ": Person_dat_two, # bq, U,
+ "Person_dat_three ": Person_dat_three, # bq, U,
+ "Person_erg_one ": Person_erg_one, # bq, U,
+ "Person_erg_two ": Person_erg_two, # bq, U,
+ "Person_erg_three ": Person_erg_three, # bq, U,
+ "Person_psor_one ": Person_psor_one, # fi, U,
+ "Person_psor_two ": Person_psor_two, # fi, U,
+ "Person_psor_three ": Person_psor_three, # fi, U,
+ "Polite_inf ": Polite_inf, # bq, U,
+ "Polite_pol ": Polite_pol, # bq, U,
+ "Polite_abs_inf ": Polite_abs_inf, # bq, U,
+ "Polite_abs_pol ": Polite_abs_pol, # bq, U,
+ "Polite_erg_inf ": Polite_erg_inf, # bq, U,
+ "Polite_erg_pol ": Polite_erg_pol, # bq, U,
+ "Polite_dat_inf ": Polite_dat_inf, # bq, U,
+ "Polite_dat_pol ": Polite_dat_pol, # bq, U,
+ "Prefix_yes ": Prefix_yes, # U,
+ "PrepCase_npr ": PrepCase_npr, # cz,
+ "PrepCase_pre ": PrepCase_pre, # U,
+ "PunctSide_ini ": PunctSide_ini, # U,
+ "PunctSide_fin ": PunctSide_fin, # U,
+ "PunctType_peri ": PunctType_peri, # U,
+ "PunctType_qest ": PunctType_qest, # U,
+ "PunctType_excl ": PunctType_excl, # U,
+ "PunctType_quot ": PunctType_quot, # U,
+ "PunctType_brck ": PunctType_brck, # U,
+ "PunctType_comm ": PunctType_comm, # U,
+ "PunctType_colo ": PunctType_colo, # U,
+ "PunctType_semi ": PunctType_semi, # U,
+ "PunctType_dash ": PunctType_dash, # U,
+ "Style_arch ": Style_arch, # cz, fi, U,
+ "Style_rare ": Style_rare, # cz, fi, U,
+ "Style_poet ": Style_poet, # cz, U,
+ "Style_norm ": Style_norm, # cz, U,
+ "Style_coll ": Style_coll, # cz, U,
+ "Style_vrnc ": Style_vrnc, # cz, U,
+ "Style_sing ": Style_sing, # cz, U,
+ "Style_expr ": Style_expr, # cz, U,
+ "Style_derg ": Style_derg, # cz, U,
+ "Style_vulg ": Style_vulg, # cz, U,
+ "Style_yes ": Style_yes, # fi, U,
+ "StyleVariant_styleShort ": StyleVariant_styleShort, # cz,
+ "StyleVariant_styleBound ": StyleVariant_styleBound, # cz, sl,
+ "VerbType_aux ": VerbType_aux, # U,
+ "VerbType_cop ": VerbType_cop, # U,
+ "VerbType_mod ": VerbType_mod, # U,
+ "VerbType_light ": VerbType_light, # U,
}
-FEATURES = [
- "POS_ADJ",
- "POS_ADP",
- "POS_ADV",
- "POS_AUX",
- "POS_CONJ",
- "POS_CCONJ",
- "POS_DET",
- "POS_INTJ",
- "POS_NOUN",
- "POS_NUM",
- "POS_PART",
- "POS_PRON",
- "POS_PROPN",
- "POS_PUNCT",
- "POS_SCONJ",
- "POS_SYM",
- "POS_VERB",
- "POS_X",
- "POS_EOL",
- "POS_SPACE",
- "Abbr_yes",
- "AdpType_circ",
- "AdpType_comprep",
- "AdpType_prep",
- "AdpType_post",
- "AdpType_voc",
- "AdvType_adadj",
- "AdvType_cau",
- "AdvType_deg",
- "AdvType_ex",
- "AdvType_loc",
- "AdvType_man",
- "AdvType_mod",
- "AdvType_sta",
- "AdvType_tim",
- "Animacy_anim",
- "Animacy_hum",
- "Animacy_inan",
- "Animacy_nhum",
- "Aspect_hab",
- "Aspect_imp",
- "Aspect_iter",
- "Aspect_perf",
- "Aspect_prog",
- "Aspect_prosp",
- "Aspect_none",
- "Case_abe",
- "Case_abl",
- "Case_abs",
- "Case_acc",
- "Case_ade",
- "Case_all",
- "Case_cau",
- "Case_com",
- "Case_dat",
- "Case_del",
- "Case_dis",
- "Case_ela",
- "Case_ess",
- "Case_gen",
- "Case_ill",
- "Case_ine",
- "Case_ins",
- "Case_loc",
- "Case_lat",
- "Case_nom",
- "Case_par",
- "Case_sub",
- "Case_sup",
- "Case_tem",
- "Case_ter",
- "Case_tra",
- "Case_voc",
- "ConjType_comp",
- "ConjType_oper",
- "Connegative_yes",
- "Definite_cons",
- "Definite_def",
- "Definite_ind",
- "Definite_red",
- "Definite_two",
- "Degree_abs",
- "Degree_cmp",
- "Degree_comp",
- "Degree_none",
- "Degree_pos",
- "Degree_sup",
- "Degree_com",
- "Degree_dim",
- "Derivation_minen",
- "Derivation_sti",
- "Derivation_inen",
- "Derivation_lainen",
- "Derivation_ja",
- "Derivation_ton",
- "Derivation_vs",
- "Derivation_ttain",
- "Derivation_ttaa",
- "Echo_rdp",
- "Echo_ech",
- "Foreign_foreign",
- "Foreign_fscript",
- "Foreign_tscript",
- "Foreign_yes",
- "Gender_com",
- "Gender_fem",
- "Gender_masc",
- "Gender_neut",
- "Gender_dat_masc",
- "Gender_dat_fem",
- "Gender_erg_masc",
- "Gender_erg_fem",
- "Gender_psor_masc",
- "Gender_psor_fem",
- "Gender_psor_neut",
- "Hyph_yes",
- "InfForm_one",
- "InfForm_two",
- "InfForm_three",
- "Mood_cnd",
- "Mood_imp",
- "Mood_ind",
- "Mood_n",
- "Mood_pot",
- "Mood_sub",
- "Mood_opt",
- "NameType_geo",
- "NameType_prs",
- "NameType_giv",
- "NameType_sur",
- "NameType_nat",
- "NameType_com",
- "NameType_pro",
- "NameType_oth",
- "Negative_neg",
- "Negative_pos",
- "Negative_yes",
- "NounType_com",
- "NounType_prop",
- "NounType_class",
- "Number_com",
- "Number_dual",
- "Number_none",
- "Number_plur",
- "Number_sing",
- "Number_ptan",
- "Number_count",
- "Number_abs_sing",
- "Number_abs_plur",
- "Number_dat_sing",
- "Number_dat_plur",
- "Number_erg_sing",
- "Number_erg_plur",
- "Number_psee_sing",
- "Number_psee_plur",
- "Number_psor_sing",
- "Number_psor_plur",
- "NumForm_digit",
- "NumForm_roman",
- "NumForm_word",
- "NumForm_combi",
- "NumType_card",
- "NumType_dist",
- "NumType_frac",
- "NumType_gen",
- "NumType_mult",
- "NumType_none",
- "NumType_ord",
- "NumType_sets",
- "NumType_dual",
- "NumValue_one",
- "NumValue_two",
- "NumValue_three",
- "PartForm_pres",
- "PartForm_past",
- "PartForm_agt",
- "PartForm_neg",
- "PartType_mod",
- "PartType_emp",
- "PartType_res",
- "PartType_inf",
- "PartType_vbp",
- "Person_one",
- "Person_two",
- "Person_three",
- "Person_none",
- "Person_abs_one",
- "Person_abs_two",
- "Person_abs_three",
- "Person_dat_one",
- "Person_dat_two",
- "Person_dat_three",
- "Person_erg_one",
- "Person_erg_two",
- "Person_erg_three",
- "Person_psor_one",
- "Person_psor_two",
- "Person_psor_three",
- "Polarity_neg",
- "Polarity_pos",
- "Polite_inf",
- "Polite_pol",
- "Polite_abs_inf",
- "Polite_abs_pol",
- "Polite_erg_inf",
- "Polite_erg_pol",
- "Polite_dat_inf",
- "Polite_dat_pol",
- "Poss_yes",
- "Prefix_yes",
- "PrepCase_npr",
- "PrepCase_pre",
- "PronType_advPart",
- "PronType_art",
- "PronType_default",
- "PronType_dem",
- "PronType_ind",
- "PronType_int",
- "PronType_neg",
- "PronType_prs",
- "PronType_rcp",
- "PronType_rel",
- "PronType_tot",
- "PronType_clit",
- "PronType_exc",
- "PunctSide_ini",
- "PunctSide_fin",
- "PunctType_peri",
- "PunctType_qest",
- "PunctType_excl",
- "PunctType_quot",
- "PunctType_brck",
- "PunctType_comm",
- "PunctType_colo",
- "PunctType_semi",
- "PunctType_dash",
- "Reflex_yes",
- "Style_arch",
- "Style_rare",
- "Style_poet",
- "Style_norm",
- "Style_coll",
- "Style_vrnc",
- "Style_sing",
- "Style_expr",
- "Style_derg",
- "Style_vulg",
- "Style_yes",
- "StyleVariant_styleShort",
- "StyleVariant_styleBound",
- "Tense_fut",
- "Tense_imp",
- "Tense_past",
- "Tense_pres",
- "Typo_yes",
- "VerbForm_fin",
- "VerbForm_ger",
- "VerbForm_inf",
- "VerbForm_none",
- "VerbForm_part",
- "VerbForm_partFut",
- "VerbForm_partPast",
- "VerbForm_partPres",
- "VerbForm_sup",
- "VerbForm_trans",
- "VerbForm_conv",
- "VerbForm_gdv",
- "VerbType_aux",
- "VerbType_cop",
- "VerbType_mod",
- "VerbType_light",
- "Voice_act",
- "Voice_cau",
- "Voice_pass",
- "Voice_mid",
- "Voice_int",
-]
-
-FEATURE_NAMES = {get_string_id(f): f for f in FEATURES}
-FEATURE_FIELDS = {f: FIELDS[f.split('_', 1)[0]] for f in FEATURES}
+NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]
+# Unfortunate hack here, to work around a problem with long cpdef enum
+# (which is generating an enormous amount of C++ in Cython 0.24+)
+# We keep the enum cdef, and just make sure the names are available to Python
+locals().update(IDS)
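
NAMES above inverts the IDS mapping by sorting on the enum value, giving a feature-name list in enum order, and locals().update(IDS) then exposes each enum member to Python under its own name. A toy sketch of the inversion, with contiguous dummy ids rather than the real symbol-derived values:

    IDS = {"Animacy_anim": 0, "Animacy_inan": 1, "Animacy_hum": 2}
    NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]
    assert NAMES == ["Animacy_anim", "Animacy_inan", "Animacy_hum"]
    assert NAMES[IDS["Animacy_hum"]] == "Animacy_hum"   # id -> name lookup
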
diff --git a/spacy/pipeline/__init__.py b/spacy/pipeline/__init__.py
index 2f30fbbee..5d7b079d9 100644
--- a/spacy/pipeline/__init__.py
+++ b/spacy/pipeline/__init__.py
@@ -3,7 +3,6 @@ from __future__ import unicode_literals
from .pipes import Tagger, DependencyParser, EntityRecognizer, EntityLinker
from .pipes import TextCategorizer, Tensorizer, Pipe, Sentencizer
-from .morphologizer import Morphologizer
from .entityruler import EntityRuler
from .hooks import SentenceSegmenter, SimilarityHook
from .functions import merge_entities, merge_noun_chunks, merge_subtokens
@@ -16,7 +15,6 @@ __all__ = [
"TextCategorizer",
"Tensorizer",
"Pipe",
- "Morphologizer",
"EntityRuler",
"Sentencizer",
"SentenceSegmenter",
diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py
index 956d67291..a1d3f922e 100644
--- a/spacy/pipeline/entityruler.py
+++ b/spacy/pipeline/entityruler.py
@@ -180,28 +180,21 @@ class EntityRuler(object):
DOCS: https://spacy.io/api/entityruler#add_patterns
"""
- # disable the nlp components after this one in case they hadn't been initialized / deserialised yet
- try:
- current_index = self.nlp.pipe_names.index(self.name)
- subsequent_pipes = [pipe for pipe in self.nlp.pipe_names[current_index + 1:]]
- except ValueError:
- subsequent_pipes = []
- with self.nlp.disable_pipes(*subsequent_pipes):
- for entry in patterns:
- label = entry["label"]
- if "id" in entry:
- label = self._create_label(label, entry["id"])
- pattern = entry["pattern"]
- if isinstance(pattern, basestring_):
- self.phrase_patterns[label].append(self.nlp(pattern))
- elif isinstance(pattern, list):
- self.token_patterns[label].append(pattern)
- else:
- raise ValueError(Errors.E097.format(pattern=pattern))
- for label, patterns in self.token_patterns.items():
- self.matcher.add(label, None, *patterns)
- for label, patterns in self.phrase_patterns.items():
- self.phrase_matcher.add(label, None, *patterns)
+ for entry in patterns:
+ label = entry["label"]
+ if "id" in entry:
+ label = self._create_label(label, entry["id"])
+ pattern = entry["pattern"]
+ if isinstance(pattern, basestring_):
+ self.phrase_patterns[label].append(self.nlp(pattern))
+ elif isinstance(pattern, list):
+ self.token_patterns[label].append(pattern)
+ else:
+ raise ValueError(Errors.E097.format(pattern=pattern))
+ for label, patterns in self.token_patterns.items():
+ self.matcher.add(label, None, *patterns)
+ for label, patterns in self.phrase_patterns.items():
+ self.phrase_matcher.add(label, None, *patterns)
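
The simplified add_patterns body above dispatches on the pattern type: a plain string becomes a phrase pattern (processed with self.nlp and handed to the PhraseMatcher), while a list of token-attribute dicts goes to the token Matcher. For reference, the two pattern shapes in ordinary spaCy v2.x usage (labels and texts below are arbitrary examples):

    import spacy
    from spacy.pipeline import EntityRuler

    nlp = spacy.blank("en")
    ruler = EntityRuler(nlp)
    ruler.add_patterns([
        {"label": "ORG", "pattern": "Apple"},  # string -> phrase pattern
        {"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}]},  # list -> token pattern
    ])
    nlp.add_pipe(ruler)
    doc = nlp("Apple opened an office in San Francisco")
    print([(ent.text, ent.label_) for ent in doc.ents])
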
def _split_label(self, label):
"""Split Entity label into ent_label and ent_id if it contains self.ent_id_sep
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
deleted file mode 100644
index b14e2bec7..000000000
--- a/spacy/pipeline/morphologizer.pyx
+++ /dev/null
@@ -1,164 +0,0 @@
-from __future__ import unicode_literals
-from collections import OrderedDict, defaultdict
-
-import numpy
-cimport numpy as np
-
-from thinc.api import chain
-from thinc.neural.util import to_categorical, copy_array, get_array_module
-from .. import util
-from .pipes import Pipe
-from .._ml import Tok2Vec, build_morphologizer_model
-from .._ml import link_vectors_to_models, zero_init, flatten
-from .._ml import create_default_optimizer
-from ..errors import Errors, TempErrors
-from ..compat import basestring_
-from ..tokens.doc cimport Doc
-from ..vocab cimport Vocab
-from ..morphology cimport Morphology
-
-
-class Morphologizer(Pipe):
- name = 'morphologizer'
-
- @classmethod
- def Model(cls, **cfg):
- if cfg.get('pretrained_dims') and not cfg.get('pretrained_vectors'):
- raise ValueError(TempErrors.T008)
- class_map = Morphology.create_class_map()
- return build_morphologizer_model(class_map.field_sizes, **cfg)
-
- def __init__(self, vocab, model=True, **cfg):
- self.vocab = vocab
- self.model = model
- self.cfg = OrderedDict(sorted(cfg.items()))
- self.cfg.setdefault('cnn_maxout_pieces', 2)
- self._class_map = self.vocab.morphology.create_class_map()
-
- @property
- def labels(self):
- return self.vocab.morphology.tag_names
-
- @property
- def tok2vec(self):
- if self.model in (None, True, False):
- return None
- else:
- return chain(self.model.tok2vec, flatten)
-
- def __call__(self, doc):
- features, tokvecs = self.predict([doc])
- self.set_annotations([doc], features, tensors=tokvecs)
- return doc
-
- def pipe(self, stream, batch_size=128, n_threads=-1):
- for docs in util.minibatch(stream, size=batch_size):
- docs = list(docs)
- features, tokvecs = self.predict(docs)
- self.set_annotations(docs, features, tensors=tokvecs)
- yield from docs
-
- def predict(self, docs):
- if not any(len(doc) for doc in docs):
- # Handle case where there are no tokens in any docs.
- n_labels = self.model.nO
- guesses = [self.model.ops.allocate((0, n_labels)) for doc in docs]
- tokvecs = self.model.ops.allocate((0, self.model.tok2vec.nO))
- return guesses, tokvecs
- tokvecs = self.model.tok2vec(docs)
- scores = self.model.softmax(tokvecs)
- return scores, tokvecs
-
- def set_annotations(self, docs, batch_scores, tensors=None):
- if isinstance(docs, Doc):
- docs = [docs]
- cdef Doc doc
- cdef Vocab vocab = self.vocab
- offsets = [self._class_map.get_field_offset(field)
- for field in self._class_map.fields]
- for i, doc in enumerate(docs):
- doc_scores = batch_scores[i]
- doc_guesses = scores_to_guesses(doc_scores, self.model.softmax.out_sizes)
- # Convert the neuron indices into feature IDs.
- doc_feat_ids = numpy.zeros((len(doc), len(self._class_map.fields)), dtype='i')
- for j in range(len(doc)):
- for k, offset in enumerate(offsets):
- if doc_guesses[j, k] == 0:
- doc_feat_ids[j, k] = 0
- else:
- doc_feat_ids[j, k] = offset + doc_guesses[j, k]
- # Get the set of feature names.
- feats = {self._class_map.col2info[f][2] for f in doc_feat_ids[j]}
- if "NIL" in feats:
- feats.remove("NIL")
- # Now add the analysis, and set the hash.
- doc.c[j].morph = self.vocab.morphology.add(feats)
- if doc[j].morph.pos != 0:
- doc.c[j].pos = doc[j].morph.pos
-
- def update(self, docs, golds, drop=0., sgd=None, losses=None):
- if losses is not None and self.name not in losses:
- losses[self.name] = 0.
-
- tag_scores, bp_tag_scores = self.model.begin_update(docs, drop=drop)
- loss, d_tag_scores = self.get_loss(docs, golds, tag_scores)
- bp_tag_scores(d_tag_scores, sgd=sgd)
-
- if losses is not None:
- losses[self.name] += loss
-
- def get_loss(self, docs, golds, scores):
- guesses = []
- for doc_scores in scores:
- guesses.append(scores_to_guesses(doc_scores, self.model.softmax.out_sizes))
- guesses = self.model.ops.xp.vstack(guesses)
- scores = self.model.ops.xp.vstack(scores)
- if not isinstance(scores, numpy.ndarray):
- scores = scores.get()
- if not isinstance(guesses, numpy.ndarray):
- guesses = guesses.get()
- cdef int idx = 0
- # Do this on CPU, as we can't vectorize easily.
- target = numpy.zeros(scores.shape, dtype='f')
- field_sizes = self.model.softmax.out_sizes
- for doc, gold in zip(docs, golds):
- for t, features in enumerate(gold.morphology):
- if features is None:
- target[idx] = scores[idx]
- else:
- gold_fields = {}
- for feature in features:
- field = self._class_map.feat2field[feature]
- gold_fields[field] = self._class_map.feat2offset[feature]
- for field in self._class_map.fields:
- field_id = self._class_map.field2id[field]
- col_offset = self._class_map.field2col[field]
- if field_id in gold_fields:
- target[idx, col_offset + gold_fields[field_id]] = 1.
- else:
- target[idx, col_offset] = 1.
- #print(doc[t])
- #for col, info in enumerate(self._class_map.col2info):
- # print(col, info, scores[idx, col], target[idx, col])
- idx += 1
- target = self.model.ops.asarray(target, dtype='f')
- scores = self.model.ops.asarray(scores, dtype='f')
- d_scores = scores - target
- loss = (d_scores**2).sum()
- d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
- return float(loss), d_scores
-
- def use_params(self, params):
- with self.model.use_params(params):
- yield
-
-def scores_to_guesses(scores, out_sizes):
- xp = get_array_module(scores)
- guesses = xp.zeros((scores.shape[0], len(out_sizes)), dtype='i')
- offset = 0
- for i, size in enumerate(out_sizes):
- slice_ = scores[:, offset : offset + size]
- col_guesses = slice_.argmax(axis=1)
- guesses[:, i] = col_guesses
- offset += size
- return guesses
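
The deleted scores_to_guesses helper above splits one wide score matrix into per-field slices (one softmax block per morphological field) and takes the argmax of each slice. The same logic in plain numpy, with a toy two-field example:

    import numpy

    def scores_to_guesses(scores, out_sizes):
        # One column of guesses per field; each field argmaxes its own slice.
        guesses = numpy.zeros((scores.shape[0], len(out_sizes)), dtype="i")
        offset = 0
        for i, size in enumerate(out_sizes):
            guesses[:, i] = scores[:, offset:offset + size].argmax(axis=1)
            offset += size
        return guesses

    scores = numpy.array([[0.1, 0.9, 0.2, 0.5, 0.3]])  # field sizes 2 and 3
    print(scores_to_guesses(scores, [2, 3]))           # -> [[1 1]]
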
diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index 9ac3affc9..190116a2e 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -69,7 +69,7 @@ class Pipe(object):
predictions = self.predict([doc])
if isinstance(predictions, tuple) and len(predictions) == 2:
scores, tensors = predictions
- self.set_annotations([doc], scores, tensors=tensors)
+ self.set_annotations([doc], scores, tensor=tensors)
else:
self.set_annotations([doc], predictions)
return doc
@@ -90,7 +90,7 @@ class Pipe(object):
predictions = self.predict(docs)
if isinstance(predictions, tuple) and len(predictions) == 2:
scores, tensors = predictions
- self.set_annotations(docs, scores, tensors=tensors)
+ self.set_annotations(docs, scores, tensor=tensors)
else:
self.set_annotations(docs, predictions)
yield from docs
@@ -424,22 +424,18 @@ class Tagger(Pipe):
cdef Doc doc
cdef int idx = 0
cdef Vocab vocab = self.vocab
- assign_morphology = self.cfg.get("set_morphology", True)
for i, doc in enumerate(docs):
doc_tag_ids = batch_tag_ids[i]
if hasattr(doc_tag_ids, "get"):
doc_tag_ids = doc_tag_ids.get()
for j, tag_id in enumerate(doc_tag_ids):
# Don't clobber preset POS tags
- if doc.c[j].tag == 0:
- if doc.c[j].pos == 0 and assign_morphology:
- # Don't clobber preset lemmas
- lemma = doc.c[j].lemma
- vocab.morphology.assign_tag_id(&doc.c[j], tag_id)
- if lemma != 0 and lemma != doc.c[j].lex.orth:
- doc.c[j].lemma = lemma
- else:
- doc.c[j].tag = self.vocab.strings[self.labels[tag_id]]
+ if doc.c[j].tag == 0 and doc.c[j].pos == 0:
+ # Don't clobber preset lemmas
+ lemma = doc.c[j].lemma
+ vocab.morphology.assign_tag_id(&doc.c[j], tag_id)
+ if lemma != 0 and lemma != doc.c[j].lex.orth:
+ doc.c[j].lemma = lemma
idx += 1
if tensors is not None and len(tensors):
if isinstance(doc.tensor, numpy.ndarray) \
@@ -504,7 +500,6 @@ class Tagger(Pipe):
orig_tag_map = dict(self.vocab.morphology.tag_map)
new_tag_map = OrderedDict()
for raw_text, annots_brackets in get_gold_tuples():
- _ = annots_brackets.pop()
for annots, brackets in annots_brackets:
ids, words, tags, heads, deps, ents = annots
for tag in tags:
@@ -937,6 +932,11 @@ class TextCategorizer(Pipe):
def labels(self, value):
self.cfg["labels"] = tuple(value)
+ def __call__(self, doc):
+ scores, tensors = self.predict([doc])
+ self.set_annotations([doc], scores, tensors=tensors)
+ return doc
+
def pipe(self, stream, batch_size=128, n_threads=-1):
for docs in util.minibatch(stream, size=batch_size):
docs = list(docs)
@@ -1017,10 +1017,6 @@ class TextCategorizer(Pipe):
return 1
def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None, **kwargs):
- for raw_text, annots_brackets in get_gold_tuples():
- cats = annots_brackets.pop()
- for cat in cats:
- self.add_label(cat)
if self.model is True:
self.cfg["pretrained_vectors"] = kwargs.get("pretrained_vectors")
self.require_labels()
diff --git a/spacy/scorer.py b/spacy/scorer.py
index 9c057d0a3..4032cc4dd 100644
--- a/spacy/scorer.py
+++ b/spacy/scorer.py
@@ -1,10 +1,7 @@
# coding: utf8
from __future__ import division, print_function, unicode_literals
-import numpy as np
-
from .gold import tags_to_entities, GoldParse
-from .errors import Errors
class PRFScore(object):
@@ -37,39 +34,10 @@ class PRFScore(object):
return 2 * ((p * r) / (p + r + 1e-100))
-class ROCAUCScore(object):
- """
- An AUC ROC score.
- """
-
- def __init__(self):
- self.golds = []
- self.cands = []
- self.saved_score = 0.0
- self.saved_score_at_len = 0
-
- def score_set(self, cand, gold):
- self.cands.append(cand)
- self.golds.append(gold)
-
- @property
- def score(self):
- if len(self.golds) == self.saved_score_at_len:
- return self.saved_score
- try:
- self.saved_score = _roc_auc_score(self.golds, self.cands)
- # catch ValueError: Only one class present in y_true.
- # ROC AUC score is not defined in that case.
- except ValueError:
- self.saved_score = -float("inf")
- self.saved_score_at_len = len(self.golds)
- return self.saved_score
-
-
class Scorer(object):
"""Compute evaluation scores."""
- def __init__(self, eval_punct=False, pipeline=None):
+ def __init__(self, eval_punct=False):
"""Initialize the Scorer.
eval_punct (bool): Evaluate the dependency attachments to and from
@@ -86,24 +54,6 @@ class Scorer(object):
self.ner = PRFScore()
self.ner_per_ents = dict()
self.eval_punct = eval_punct
- self.textcat = None
- self.textcat_per_cat = dict()
- self.textcat_positive_label = None
- self.textcat_multilabel = False
-
- if pipeline:
- for name, model in pipeline:
- if name == "textcat":
- self.textcat_positive_label = model.cfg.get("positive_label", None)
- if self.textcat_positive_label:
- self.textcat = PRFScore()
- if not model.cfg.get("exclusive_classes", False):
- self.textcat_multilabel = True
- for label in model.cfg.get("labels", []):
- self.textcat_per_cat[label] = ROCAUCScore()
- else:
- for label in model.cfg.get("labels", []):
- self.textcat_per_cat[label] = PRFScore()
@property
def tags_acc(self):
@@ -151,47 +101,10 @@ class Scorer(object):
for k, v in self.ner_per_ents.items()
}
- @property
- def textcat_score(self):
- """RETURNS (float): f-score on positive label for binary exclusive,
- macro-averaged f-score for 3+ exclusive,
- macro-averaged AUC ROC score for multilabel (-1 if undefined)
- """
- if not self.textcat_multilabel:
- # binary multiclass
- if self.textcat_positive_label:
- return self.textcat.fscore * 100
- # other multiclass
- return (
- sum([score.fscore for label, score in self.textcat_per_cat.items()])
- / (len(self.textcat_per_cat) + 1e-100)
- * 100
- )
- # multilabel
- return max(
- sum([score.score for label, score in self.textcat_per_cat.items()])
- / (len(self.textcat_per_cat) + 1e-100),
- -1,
- )
-
- @property
- def textcats_per_cat(self):
- """RETURNS (dict): Scores per textcat label.
- """
- if not self.textcat_multilabel:
- return {
- k: {"p": v.precision * 100, "r": v.recall * 100, "f": v.fscore * 100}
- for k, v in self.textcat_per_cat.items()
- }
- return {
- k: {"roc_auc_score": max(v.score, -1)}
- for k, v in self.textcat_per_cat.items()
- }
-
@property
def scores(self):
"""RETURNS (dict): All scores with keys `uas`, `las`, `ents_p`,
- `ents_r`, `ents_f`, `tags_acc`, `token_acc`, and `textcat_score`.
+ `ents_r`, `ents_f`, `tags_acc` and `token_acc`.
"""
return {
"uas": self.uas,
@@ -202,8 +115,6 @@ class Scorer(object):
"ents_per_type": self.ents_per_type,
"tags_acc": self.tags_acc,
"token_acc": self.token_acc,
- "textcat_score": self.textcat_score,
- "textcats_per_cat": self.textcats_per_cat,
}
def score(self, doc, gold, verbose=False, punct_labels=("p", "punct")):
@@ -281,301 +192,9 @@ class Scorer(object):
self.unlabelled.score_set(
set(item[:2] for item in cand_deps), set(item[:2] for item in gold_deps)
)
- if (
- len(gold.cats) > 0
- and set(self.textcat_per_cat) == set(gold.cats)
- and set(gold.cats) == set(doc.cats)
- ):
- goldcat = max(gold.cats, key=gold.cats.get)
- candcat = max(doc.cats, key=doc.cats.get)
- if self.textcat_positive_label:
- self.textcat.score_set(
- set([self.textcat_positive_label]) & set([candcat]),
- set([self.textcat_positive_label]) & set([goldcat]),
- )
- for label in self.textcat_per_cat:
- if self.textcat_multilabel:
- self.textcat_per_cat[label].score_set(
- doc.cats[label], gold.cats[label]
- )
- else:
- self.textcat_per_cat[label].score_set(
- set([label]) & set([candcat]), set([label]) & set([goldcat])
- )
- elif len(self.textcat_per_cat) > 0:
- model_labels = set(self.textcat_per_cat)
- eval_labels = set(gold.cats)
- raise ValueError(
- Errors.E162.format(model_labels=model_labels, eval_labels=eval_labels)
- )
if verbose:
gold_words = [item[1] for item in gold.orig_annot]
for w_id, h_id, dep in cand_deps - gold_deps:
print("F", gold_words[w_id], dep, gold_words[h_id])
for w_id, h_id, dep in gold_deps - cand_deps:
print("M", gold_words[w_id], dep, gold_words[h_id])
-
-
-#############################################################################
-#
-# The following implementation of roc_auc_score() is adapted from
-# scikit-learn, which is distributed under the following license:
-#
-# New BSD License
-#
-# Copyright (c) 2007–2019 The scikit-learn developers.
-# All rights reserved.
-#
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# a. Redistributions of source code must retain the above copyright notice,
-# this list of conditions and the following disclaimer.
-# b. Redistributions in binary form must reproduce the above copyright
-# notice, this list of conditions and the following disclaimer in the
-# documentation and/or other materials provided with the distribution.
-# c. Neither the name of the Scikit-learn Developers nor the names of
-# its contributors may be used to endorse or promote products
-# derived from this software without specific prior written
-# permission.
-#
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-# ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR
-# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
-# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
-# DAMAGE.
-
-
-def _roc_auc_score(y_true, y_score):
- """Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC)
- from prediction scores.
-
- Note: this implementation is restricted to the binary classification task
-
- Parameters
- ----------
- y_true : array, shape = [n_samples] or [n_samples, n_classes]
- True binary labels or binary label indicators.
- The multiclass case expects shape = [n_samples] and labels
- with values in ``range(n_classes)``.
-
- y_score : array, shape = [n_samples] or [n_samples, n_classes]
- Target scores, can either be probability estimates of the positive
- class, confidence values, or non-thresholded measure of decisions
- (as returned by "decision_function" on some classifiers). For binary
- y_true, y_score is supposed to be the score of the class with greater
- label. The multiclass case expects shape = [n_samples, n_classes]
- where the scores correspond to probability estimates.
-
- Returns
- -------
- auc : float
-
- References
- ----------
- .. [1] `Wikipedia entry for the Receiver operating characteristic
- `_
-
- .. [2] Fawcett T. An introduction to ROC analysis[J]. Pattern Recognition
- Letters, 2006, 27(8):861-874.
-
- .. [3] `Analyzing a portion of the ROC curve. McClish, 1989
- `_
- """
- if len(np.unique(y_true)) != 2:
- raise ValueError(Errors.E165)
- fpr, tpr, _ = _roc_curve(y_true, y_score)
- return _auc(fpr, tpr)
-
-
-def _roc_curve(y_true, y_score):
- """Compute Receiver operating characteristic (ROC)
-
- Note: this implementation is restricted to the binary classification task.
-
- Parameters
- ----------
-
- y_true : array, shape = [n_samples]
- True binary labels. If labels are not either {-1, 1} or {0, 1}, then
- pos_label should be explicitly given.
-
- y_score : array, shape = [n_samples]
- Target scores, can either be probability estimates of the positive
- class, confidence values, or non-thresholded measure of decisions
- (as returned by "decision_function" on some classifiers).
-
- Returns
- -------
- fpr : array, shape = [>2]
- Increasing false positive rates such that element i is the false
- positive rate of predictions with score >= thresholds[i].
-
- tpr : array, shape = [>2]
- Increasing true positive rates such that element i is the true
- positive rate of predictions with score >= thresholds[i].
-
- thresholds : array, shape = [n_thresholds]
- Decreasing thresholds on the decision function used to compute
- fpr and tpr. `thresholds[0]` represents no instances being predicted
- and is arbitrarily set to `max(y_score) + 1`.
-
- Notes
- -----
- Since the thresholds are sorted from low to high values, they
- are reversed upon returning them to ensure they correspond to both ``fpr``
- and ``tpr``, which are sorted in reversed order during their calculation.
-
- References
- ----------
- .. [1] `Wikipedia entry for the Receiver operating characteristic
- `_
-
- .. [2] Fawcett T. An introduction to ROC analysis[J]. Pattern Recognition
- Letters, 2006, 27(8):861-874.
- """
- fps, tps, thresholds = _binary_clf_curve(y_true, y_score)
-
- # Add an extra threshold position
- # to make sure that the curve starts at (0, 0)
- tps = np.r_[0, tps]
- fps = np.r_[0, fps]
- thresholds = np.r_[thresholds[0] + 1, thresholds]
-
- if fps[-1] <= 0:
- fpr = np.repeat(np.nan, fps.shape)
- else:
- fpr = fps / fps[-1]
-
- if tps[-1] <= 0:
- tpr = np.repeat(np.nan, tps.shape)
- else:
- tpr = tps / tps[-1]
-
- return fpr, tpr, thresholds
-
-
-def _binary_clf_curve(y_true, y_score):
- """Calculate true and false positives per binary classification threshold.
-
- Parameters
- ----------
- y_true : array, shape = [n_samples]
- True targets of binary classification
-
- y_score : array, shape = [n_samples]
- Estimated probabilities or decision function
-
- Returns
- -------
- fps : array, shape = [n_thresholds]
- A count of false positives, at index i being the number of negative
- samples assigned a score >= thresholds[i]. The total number of
- negative samples is equal to fps[-1] (thus true negatives are given by
- fps[-1] - fps).
-
- tps : array, shape = [n_thresholds <= len(np.unique(y_score))]
- An increasing count of true positives, at index i being the number
- of positive samples assigned a score >= thresholds[i]. The total
- number of positive samples is equal to tps[-1] (thus false negatives
- are given by tps[-1] - tps).
-
- thresholds : array, shape = [n_thresholds]
- Decreasing score values.
- """
- pos_label = 1.0
-
- y_true = np.ravel(y_true)
- y_score = np.ravel(y_score)
-
- # make y_true a boolean vector
- y_true = y_true == pos_label
-
- # sort scores and corresponding truth values
- desc_score_indices = np.argsort(y_score, kind="mergesort")[::-1]
- y_score = y_score[desc_score_indices]
- y_true = y_true[desc_score_indices]
- weight = 1.0
-
- # y_score typically has many tied values. Here we extract
- # the indices associated with the distinct values. We also
- # concatenate a value for the end of the curve.
- distinct_value_indices = np.where(np.diff(y_score))[0]
- threshold_idxs = np.r_[distinct_value_indices, y_true.size - 1]
-
- # accumulate the true positives with decreasing threshold
- tps = _stable_cumsum(y_true * weight)[threshold_idxs]
- fps = 1 + threshold_idxs - tps
- return fps, tps, y_score[threshold_idxs]
-
-
-def _stable_cumsum(arr, axis=None, rtol=1e-05, atol=1e-08):
- """Use high precision for cumsum and check that final value matches sum
-
- Parameters
- ----------
- arr : array-like
- To be cumulatively summed as flat
- axis : int, optional
- Axis along which the cumulative sum is computed.
- The default (None) is to compute the cumsum over the flattened array.
- rtol : float
- Relative tolerance, see ``np.allclose``
- atol : float
- Absolute tolerance, see ``np.allclose``
- """
- out = np.cumsum(arr, axis=axis, dtype=np.float64)
- expected = np.sum(arr, axis=axis, dtype=np.float64)
- if not np.all(
- np.isclose(
- out.take(-1, axis=axis), expected, rtol=rtol, atol=atol, equal_nan=True
- )
- ):
- raise ValueError(Errors.E163)
- return out
-
-
-def _auc(x, y):
- """Compute Area Under the Curve (AUC) using the trapezoidal rule
-
- This is a general function, given points on a curve. For computing the
- area under the ROC-curve, see :func:`roc_auc_score`.
-
- Parameters
- ----------
- x : array, shape = [n]
- x coordinates. These must be either monotonic increasing or monotonic
- decreasing.
- y : array, shape = [n]
- y coordinates.
-
- Returns
- -------
- auc : float
- """
- x = np.ravel(x)
- y = np.ravel(y)
-
- direction = 1
- dx = np.diff(x)
- if np.any(dx < 0):
- if np.all(dx <= 0):
- direction = -1
- else:
- raise ValueError(Errors.E164.format(x))
-
- area = direction * np.trapz(y, x)
- if isinstance(area, np.memmap):
- # Reductions such as .sum used internally in np.trapz do not return a
- # scalar by default for numpy.memmap instances contrary to
- # regular numpy.ndarray instances.
- area = area.dtype.type(area)
- return area
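
The block deleted above carried a trimmed-down copy of scikit-learn's ROC AUC helpers. As a rough illustration of what they compute, here is a compact NumPy sketch of binary ROC AUC using the same steps: sort by decreasing score, keep one point per distinct threshold, accumulate TP/FP counts, and integrate with the trapezoidal rule. It is a simplified stand-in, not the deleted code, which additionally guards cumsum precision and numpy.memmap reductions:

```python
import numpy as np

def roc_auc(y_true, y_score):
    """Binary ROC AUC via the trapezoidal rule (simplified sketch)."""
    y_true = np.asarray(y_true) == 1
    y_score = np.asarray(y_score, dtype=float)
    if y_true.all() or not y_true.any():
        # Only one class present: undefined (the removed ROCAUCScore
        # wrapper reported -inf in this case).
        raise ValueError("ROC AUC is undefined with a single class")
    order = np.argsort(y_score, kind="mergesort")[::-1]
    y_true, y_score = y_true[order], y_score[order]
    # One threshold per distinct score value, plus the end of the curve.
    thresholds = np.r_[np.where(np.diff(y_score))[0], y_true.size - 1]
    tps = np.cumsum(y_true)[thresholds]   # true positives with score >= t
    fps = 1 + thresholds - tps            # false positives with score >= t
    tpr = np.r_[0, tps] / tps[-1]
    fpr = np.r_[0, fps] / fps[-1]
    return np.trapz(tpr, fpr)

print(roc_auc([0, 0, 1, 1], [0.1, 0.4, 0.35, 0.8]))  # 0.75
print(roc_auc([1, 0], [0.5, 0.5]))                   # 0.5 (tied scores)
```
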
diff --git a/spacy/strings.pyx b/spacy/strings.pyx
index f3457e1a5..df86f8ac7 100644
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@@ -119,7 +119,9 @@ cdef class StringStore:
return ""
elif string_or_id in SYMBOLS_BY_STR:
return SYMBOLS_BY_STR[string_or_id]
+
cdef hash_t key
+
if isinstance(string_or_id, unicode):
key = hash_string(string_or_id)
return key
@@ -137,20 +139,6 @@ cdef class StringStore:
else:
return decode_Utf8Str(utf8str)
- def as_int(self, key):
- """If key is an int, return it; otherwise, get the int value."""
- if not isinstance(key, basestring):
- return key
- else:
- return self[key]
-
- def as_string(self, key):
- """If key is a string, return it; otherwise, get the string value."""
- if isinstance(key, basestring):
- return key
- else:
- return self[key]
-
def add(self, string):
"""Add a string to the StringStore.
diff --git a/spacy/structs.pxd b/spacy/structs.pxd
index 468277f6b..6c643b4cd 100644
--- a/spacy/structs.pxd
+++ b/spacy/structs.pxd
@@ -78,54 +78,6 @@ cdef struct TokenC:
hash_t ent_id
-cdef struct MorphAnalysisC:
- univ_pos_t pos
- int length
-
- attr_t abbr
- attr_t adp_type
- attr_t adv_type
- attr_t animacy
- attr_t aspect
- attr_t case
- attr_t conj_type
- attr_t connegative
- attr_t definite
- attr_t degree
- attr_t derivation
- attr_t echo
- attr_t foreign
- attr_t gender
- attr_t hyph
- attr_t inf_form
- attr_t mood
- attr_t negative
- attr_t number
- attr_t name_type
- attr_t noun_type
- attr_t num_form
- attr_t num_type
- attr_t num_value
- attr_t part_form
- attr_t part_type
- attr_t person
- attr_t polite
- attr_t polarity
- attr_t poss
- attr_t prefix
- attr_t prep_case
- attr_t pron_type
- attr_t punct_side
- attr_t punct_type
- attr_t reflex
- attr_t style
- attr_t style_variant
- attr_t tense
- attr_t typo
- attr_t verb_form
- attr_t voice
- attr_t verb_type
-
# Internal struct, for storage and disambiguation of entities.
cdef struct KBEntryC:
diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx
index 5a7355061..eb39124ce 100644
--- a/spacy/syntax/arc_eager.pyx
+++ b/spacy/syntax/arc_eager.pyx
@@ -342,7 +342,6 @@ cdef class ArcEager(TransitionSystem):
actions[RIGHT][label] = 1
actions[REDUCE][label] = 1
for raw_text, sents in kwargs.get('gold_parses', []):
- _ = sents.pop()
for (ids, words, tags, heads, labels, iob), ctnts in sents:
heads, labels = nonproj.projectivize(heads, labels)
for child, head, label in zip(ids, heads, labels):
diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx
index 3bd096463..767e4c2e0 100644
--- a/spacy/syntax/ner.pyx
+++ b/spacy/syntax/ner.pyx
@@ -66,14 +66,12 @@ cdef class BiluoPushDown(TransitionSystem):
UNIT: Counter(),
OUT: Counter()
}
- actions[OUT][''] = 1 # Represents a token predicted to be outside of any entity
- actions[UNIT][''] = 1 # Represents a token prohibited to be in an entity
+ actions[OUT][''] = 1
for entity_type in kwargs.get('entity_types', []):
for action in (BEGIN, IN, LAST, UNIT):
actions[action][entity_type] = 1
moves = ('M', 'B', 'I', 'L', 'U')
for raw_text, sents in kwargs.get('gold_parses', []):
- _ = sents.pop()
for (ids, words, tags, heads, labels, biluo), _ in sents:
for i, ner_tag in enumerate(biluo):
if ner_tag != 'O' and ner_tag != '-':
@@ -163,7 +161,8 @@ cdef class BiluoPushDown(TransitionSystem):
for i in range(self.n_moves):
if self.c[i].move == move and self.c[i].label == label:
return self.c[i]
- raise KeyError(Errors.E022.format(name=name))
+ else:
+ raise KeyError(Errors.E022.format(name=name))
cdef Transition init_transition(self, int clas, int move, attr_t label) except *:
# TODO: Apparent Cython bug here when we try to use the Transition()
@@ -267,7 +266,7 @@ cdef class Begin:
return False
elif label == 0:
return False
- elif preset_ent_iob == 1:
+ elif preset_ent_iob == 1 or preset_ent_iob == 2:
# Ensure we don't clobber preset entities. If no entity preset,
# ent_iob is 0
return False
@@ -283,8 +282,8 @@ cdef class Begin:
# Otherwise, force acceptance, even if we're across a sentence
# boundary or the token is whitespace.
return True
- elif st.B_(1).ent_iob == 3:
- # If the next word is B, we can't B now
+ elif st.B_(1).ent_iob == 2 or st.B_(1).ent_iob == 3:
+ # If the next word is B or O, we can't B now
return False
elif st.B_(1).sent_start == 1:
# Don't allow entities to extend across sentence boundaries
@@ -327,7 +326,6 @@ cdef class In:
@staticmethod
cdef bint is_valid(const StateC* st, attr_t label) nogil:
cdef int preset_ent_iob = st.B_(0).ent_iob
- cdef attr_t preset_ent_label = st.B_(0).ent_type
if label == 0:
return False
elif st.E_(0).ent_type != label:
@@ -337,22 +335,13 @@ cdef class In:
elif st.B(1) == -1:
# If we're at the end, we can't I.
return False
+ elif preset_ent_iob == 2:
+ return False
elif preset_ent_iob == 3:
return False
- elif st.B_(1).ent_iob == 3:
- # If we know the next word is B, we can't be I (must be L)
+ elif st.B_(1).ent_iob == 2 or st.B_(1).ent_iob == 3:
+ # If we know the next word is B or O, we can't be I (must be L)
return False
- elif preset_ent_iob == 1:
- if st.B_(1).ent_iob in (0, 2):
- # if next preset is missing or O, this can't be I (must be L)
- return False
- elif label != preset_ent_label:
- # If label isn't right, reject
- return False
- else:
- # Otherwise, force acceptance, even if we're across a sentence
- # boundary or the token is whitespace.
- return True
elif st.B(1) != -1 and st.B_(1).sent_start == 1:
# Don't allow entities to extend across sentence boundaries
return False
@@ -398,24 +387,17 @@ cdef class In:
else:
return 1
+
cdef class Last:
@staticmethod
cdef bint is_valid(const StateC* st, attr_t label) nogil:
- cdef int preset_ent_iob = st.B_(0).ent_iob
- cdef attr_t preset_ent_label = st.B_(0).ent_type
if label == 0:
return False
elif not st.entity_is_open():
return False
- elif preset_ent_iob == 1 and st.B_(1).ent_iob != 1:
+ elif st.B_(0).ent_iob == 1 and st.B_(1).ent_iob != 1:
# If a preset entity has I followed by not-I, is L
- if label != preset_ent_label:
- # If label isn't right, reject
- return False
- else:
- # Otherwise, force acceptance, even if we're across a sentence
- # boundary or the token is whitespace.
- return True
+ return True
elif st.E_(0).ent_type != label:
return False
elif st.B_(1).ent_iob == 1:
@@ -468,13 +450,12 @@ cdef class Unit:
cdef int preset_ent_iob = st.B_(0).ent_iob
cdef attr_t preset_ent_label = st.B_(0).ent_type
if label == 0:
- # this is only allowed if it's a preset blocked annotation
- if preset_ent_label == 0 and preset_ent_iob == 3:
- return True
- else:
- return False
+ return False
elif st.entity_is_open():
return False
+ elif preset_ent_iob == 2:
+ # Don't clobber preset O
+ return False
elif st.B_(1).ent_iob == 1:
# If next token is In, we can't be Unit -- must be Begin
return False
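
The `ner.pyx` hunks above tighten when the Begin/In/Last/Unit moves are valid with respect to preset `ent_iob` values on the tokens. As an illustrative caricature only, not the Cython transition system, the kind of constraint being enforced looks roughly like this (spaCy's `ent_iob` codes: 0 = unset, 1 = I, 2 = O, 3 = B):

```python
def begin_is_valid(label, preset_ent_iob, next_ent_iob):
    """Rough sketch of the Begin validity check after this patch."""
    if label == 0:
        return False
    if preset_ent_iob in (1, 2):
        # Don't clobber a preset I, or (with this change) a preset O.
        return False
    if next_ent_iob in (2, 3):
        # If the next token is preset B or O, an entity can't start here.
        return False
    return True

assert begin_is_valid(label=1, preset_ent_iob=0, next_ent_iob=0)
assert not begin_is_valid(label=1, preset_ent_iob=2, next_ent_iob=0)
```
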
diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index aeb4a5306..85f7b5bb9 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -135,9 +135,7 @@ cdef class Parser:
names = []
for i in range(self.moves.n_moves):
name = self.moves.move_name(self.moves.c[i].move, self.moves.c[i].label)
- # Explicitly removing the internal "U-" token used for blocking entities
- if name != "U-":
- names.append(name)
+ names.append(name)
return names
nr_feature = 8
@@ -163,16 +161,10 @@ cdef class Parser:
added = self.moves.add_action(action, label)
if added:
resized = True
- if resized:
- self._resize()
-
- def _resize(self):
- if "nr_class" in self.cfg:
+ if resized and "nr_class" in self.cfg:
self.cfg["nr_class"] = self.moves.n_moves
- if self.model not in (True, False, None):
+ if self.model not in (True, False, None) and resized:
self.model.resize_output(self.moves.n_moves)
- if self._rehearsal_model not in (True, False, None):
- self._rehearsal_model.resize_output(self.moves.n_moves)
def add_multitask_objective(self, target):
# Defined in subclasses, to avoid circular import
@@ -243,9 +235,7 @@ cdef class Parser:
if isinstance(docs, Doc):
docs = [docs]
if not any(len(doc) for doc in docs):
- result = self.moves.init_batch(docs)
- self._resize()
- return result
+ return self.moves.init_batch(docs)
if beam_width < 2:
return self.greedy_parse(docs, drop=drop)
else:
@@ -259,7 +249,7 @@ cdef class Parser:
# This is pretty dirty, but the NER can resize itself in init_batch,
# if labels are missing. We therefore have to check whether we need to
# expand our model output.
- self._resize()
+ self.model.resize_output(self.moves.n_moves)
model = self.model(docs)
weights = get_c_weights(model)
for state in batch:
@@ -279,7 +269,7 @@ cdef class Parser:
# This is pretty dirty, but the NER can resize itself in init_batch,
# if labels are missing. We therefore have to check whether we need to
# expand our model output.
- self._resize()
+ self.model.resize_output(self.moves.n_moves)
model = self.model(docs)
token_ids = numpy.zeros((len(docs) * beam_width, self.nr_feature),
dtype='i', order='C')
@@ -453,7 +443,8 @@ cdef class Parser:
# This is pretty dirty, but the NER can resize itself in init_batch,
# if labels are missing. We therefore have to check whether we need to
# expand our model output.
- self._resize()
+ self.model.resize_output(self.moves.n_moves)
+ self._rehearsal_model.resize_output(self.moves.n_moves)
# Prepare the stepwise model, and get the callback for finishing the batch
tutor, _ = self._rehearsal_model.begin_update(docs, drop=0.0)
model, finish_update = self.model.begin_update(docs, drop=0.0)
@@ -594,7 +585,6 @@ cdef class Parser:
doc_sample = []
gold_sample = []
for raw_text, annots_brackets in islice(get_gold_tuples(), 1000):
- _ = annots_brackets.pop()
for annots, brackets in annots_brackets:
ids, words, tags, heads, deps, ents = annots
doc_sample.append(Doc(self.vocab, words=words))
diff --git a/spacy/syntax/transition_system.pyx b/spacy/syntax/transition_system.pyx
index 58b3a6993..523cd6699 100644
--- a/spacy/syntax/transition_system.pyx
+++ b/spacy/syntax/transition_system.pyx
@@ -63,13 +63,6 @@ cdef class TransitionSystem:
cdef Doc doc
beams = []
cdef int offset = 0
-
- # Doc objects might contain labels that we need to register actions for. We need to check for that
- # *before* we create any Beam objects, because the Beam object needs the correct number of
- # actions. It's sort of dumb, but the best way is to just call init_batch() -- that triggers the additions,
- # and it doesn't matter that we create and discard the state objects.
- self.init_batch(docs)
-
for doc in docs:
beam = Beam(self.n_moves, beam_width, min_density=beam_density)
beam.initialize(self.init_beam_state, doc.length, doc.c)
@@ -103,7 +96,8 @@ cdef class TransitionSystem:
def apply_transition(self, StateClass state, name):
if not self.is_valid(state, name):
- raise ValueError(Errors.E170.format(name=name))
+ raise ValueError(
+ "Cannot apply transition {name}: invalid for the current state.".format(name=name))
action = self.lookup_transition(name)
action.do(state.c, action.label)
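
For context, `apply_transition` is exercised directly in the NER tests removed further down in this patch: validity is checked by name before a move is applied to a parser state. A usage sketch mirroring those tests (behaviour assumes the spaCy version on this branch):

```python
from spacy.lang.en import English
from spacy.pipeline import EntityRecognizer

nlp = English()
doc = nlp("I live in New York")
ner = EntityRecognizer(doc.vocab)
ner.moves.add_action(5, "")   # register the O (outside) action
ner.add_label("GPE")

state = ner.moves.init_batch([doc])[0]
if ner.moves.is_valid(state, "O"):
    ner.moves.apply_transition(state, "O")  # raises ValueError when invalid
```
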
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index 0763af32b..c88f3314e 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -185,12 +185,6 @@ def ru_tokenizer():
return get_lang_class("ru").Defaults.create_tokenizer()
-@pytest.fixture
-def ru_lemmatizer():
- pytest.importorskip("pymorphy2")
- return get_lang_class("ru").Defaults.create_lemmatizer()
-
-
@pytest.fixture(scope="session")
def sr_tokenizer():
return get_lang_class("sr").Defaults.create_tokenizer()
diff --git a/spacy/tests/doc/test_add_entities.py b/spacy/tests/doc/test_add_entities.py
index 6c69e699a..433541c48 100644
--- a/spacy/tests/doc/test_add_entities.py
+++ b/spacy/tests/doc/test_add_entities.py
@@ -1,11 +1,11 @@
# coding: utf-8
from __future__ import unicode_literals
-from spacy.pipeline import EntityRecognizer
-from spacy.tokens import Span
-import pytest
-
+from ...pipeline import EntityRecognizer
from ..util import get_doc
+from ...tokens import Span
+
+import pytest
def test_doc_add_entities_set_ents_iob(en_vocab):
@@ -16,23 +16,10 @@ def test_doc_add_entities_set_ents_iob(en_vocab):
ner(doc)
assert len(list(doc.ents)) == 0
assert [w.ent_iob_ for w in doc] == (["O"] * len(doc))
-
doc.ents = [(doc.vocab.strings["ANIMAL"], 3, 4)]
- assert [w.ent_iob_ for w in doc] == ["O", "O", "O", "B"]
-
+ assert [w.ent_iob_ for w in doc] == ["", "", "", "B"]
doc.ents = [(doc.vocab.strings["WORD"], 0, 2)]
- assert [w.ent_iob_ for w in doc] == ["B", "I", "O", "O"]
-
-
-def test_ents_reset(en_vocab):
- text = ["This", "is", "a", "lion"]
- doc = get_doc(en_vocab, text)
- ner = EntityRecognizer(en_vocab)
- ner.begin_training([])
- ner(doc)
- assert [t.ent_iob_ for t in doc] == (["O"] * len(doc))
- doc.ents = list(doc.ents)
- assert [t.ent_iob_ for t in doc] == (["O"] * len(doc))
+ assert [w.ent_iob_ for w in doc] == ["B", "I", "", ""]
def test_add_overlapping_entities(en_vocab):
diff --git a/spacy/tests/doc/test_creation.py b/spacy/tests/doc/test_creation.py
index b222f6bf0..ce42b39b9 100644
--- a/spacy/tests/doc/test_creation.py
+++ b/spacy/tests/doc/test_creation.py
@@ -5,13 +5,11 @@ import pytest
from spacy.vocab import Vocab
from spacy.tokens import Doc
from spacy.lemmatizer import Lemmatizer
-from spacy.lookups import Table
@pytest.fixture
def lemmatizer():
- lookup = Table(data={"dogs": "dog", "boxen": "box", "mice": "mouse"})
- return Lemmatizer(lookup=lookup)
+ return Lemmatizer(lookup={"dogs": "dog", "boxen": "box", "mice": "mouse"})
@pytest.fixture
diff --git a/spacy/tests/doc/test_morphanalysis.py b/spacy/tests/doc/test_morphanalysis.py
deleted file mode 100644
index 5d570af53..000000000
--- a/spacy/tests/doc/test_morphanalysis.py
+++ /dev/null
@@ -1,33 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import pytest
-
-
-@pytest.fixture
-def i_has(en_tokenizer):
- doc = en_tokenizer("I has")
- doc[0].tag_ = "PRP"
- doc[1].tag_ = "VBZ"
- return doc
-
-
-def test_token_morph_id(i_has):
- assert i_has[0].morph.id
- assert i_has[1].morph.id != 0
- assert i_has[0].morph.id != i_has[1].morph.id
-
-
-def test_morph_props(i_has):
- assert i_has[0].morph.pron_type == i_has.vocab.strings["PronType_prs"]
- assert i_has[0].morph.pron_type_ == "PronType_prs"
- assert i_has[1].morph.pron_type == 0
-
-
-def test_morph_iter(i_has):
- assert list(i_has[0].morph) == ["PronType_prs"]
- assert list(i_has[1].morph) == ["Number_sing", "Person_three", "VerbForm_fin"]
-
-
-def test_morph_get(i_has):
- assert i_has[0].morph.get("pron_type") == "PronType_prs"
diff --git a/spacy/tests/lang/ja/test_tokenizer.py b/spacy/tests/lang/ja/test_tokenizer.py
index ad8bfaa00..c95e7bc40 100644
--- a/spacy/tests/lang/ja/test_tokenizer.py
+++ b/spacy/tests/lang/ja/test_tokenizer.py
@@ -47,10 +47,3 @@ def test_ja_tokenizer_tags(ja_tokenizer, text, expected_tags):
def test_ja_tokenizer_pos(ja_tokenizer, text, expected_pos):
pos = [token.pos_ for token in ja_tokenizer(text)]
assert pos == expected_pos
-
-
-def test_extra_spaces(ja_tokenizer):
- # note: three spaces after "I"
-    tokens = ja_tokenizer("I   like cheese.")
- assert tokens[1].orth_ == " "
- assert tokens[2].orth_ == " "
diff --git a/spacy/tests/lang/lt/test_lemmatizer.py b/spacy/tests/lang/lt/test_lemmatizer.py
index f7408fc16..9b2969849 100644
--- a/spacy/tests/lang/lt/test_lemmatizer.py
+++ b/spacy/tests/lang/lt/test_lemmatizer.py
@@ -17,4 +17,4 @@ TEST_CASES = [
@pytest.mark.parametrize("tokens,lemmas", TEST_CASES)
def test_lt_lemmatizer(lt_lemmatizer, tokens, lemmas):
- assert lemmas == [lt_lemmatizer.lookup_table.get(token, token) for token in tokens]
+ assert lemmas == [lt_lemmatizer.lookup(token) for token in tokens]
diff --git a/spacy/tests/lang/ru/test_lemmatizer.py b/spacy/tests/lang/ru/test_lemmatizer.py
index b228fded8..b92dfa29c 100644
--- a/spacy/tests/lang/ru/test_lemmatizer.py
+++ b/spacy/tests/lang/ru/test_lemmatizer.py
@@ -2,10 +2,17 @@
from __future__ import unicode_literals
import pytest
+from spacy.lang.ru import Russian
from ...util import get_doc
+@pytest.fixture
+def ru_lemmatizer():
+ pytest.importorskip("pymorphy2")
+ return Russian.Defaults.create_lemmatizer()
+
+
def test_ru_doc_lemmatization(ru_tokenizer):
words = ["мама", "мыла", "раму"]
tags = [
diff --git a/spacy/tests/lang/sr/test_exceptions.py b/spacy/tests/lang/sr/test_еxceptions.py
similarity index 100%
rename from spacy/tests/lang/sr/test_exceptions.py
rename to spacy/tests/lang/sr/test_еxceptions.py
diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py
index 0d640e1a2..df35a1be2 100644
--- a/spacy/tests/matcher/test_matcher_api.py
+++ b/spacy/tests/matcher/test_matcher_api.py
@@ -410,11 +410,3 @@ def test_matcher_schema_token_attributes(en_vocab, pattern, text):
assert len(matcher) == 1
matches = matcher(doc)
assert len(matches) == 1
-
-
-def test_matcher_valid_callback(en_vocab):
- """Test that on_match can only be None or callable."""
- matcher = Matcher(en_vocab)
- with pytest.raises(ValueError):
- matcher.add("TEST", [], [{"TEXT": "test"}])
- matcher(Doc(en_vocab, words=["test"]))
diff --git a/spacy/tests/matcher/test_phrase_matcher.py b/spacy/tests/matcher/test_phrase_matcher.py
index 486cbb984..b82f9a058 100644
--- a/spacy/tests/matcher/test_phrase_matcher.py
+++ b/spacy/tests/matcher/test_phrase_matcher.py
@@ -8,31 +8,10 @@ from ..util import get_doc
def test_matcher_phrase_matcher(en_vocab):
+ doc = Doc(en_vocab, words=["Google", "Now"])
+ matcher = PhraseMatcher(en_vocab)
+ matcher.add("COMPANY", None, doc)
doc = Doc(en_vocab, words=["I", "like", "Google", "Now", "best"])
- # intermediate phrase
- pattern = Doc(en_vocab, words=["Google", "Now"])
- matcher = PhraseMatcher(en_vocab)
- matcher.add("COMPANY", None, pattern)
- assert len(matcher(doc)) == 1
- # initial token
- pattern = Doc(en_vocab, words=["I"])
- matcher = PhraseMatcher(en_vocab)
- matcher.add("I", None, pattern)
- assert len(matcher(doc)) == 1
- # initial phrase
- pattern = Doc(en_vocab, words=["I", "like"])
- matcher = PhraseMatcher(en_vocab)
- matcher.add("ILIKE", None, pattern)
- assert len(matcher(doc)) == 1
- # final token
- pattern = Doc(en_vocab, words=["best"])
- matcher = PhraseMatcher(en_vocab)
- matcher.add("BEST", None, pattern)
- assert len(matcher(doc)) == 1
- # final phrase
- pattern = Doc(en_vocab, words=["Now", "best"])
- matcher = PhraseMatcher(en_vocab)
- matcher.add("NOWBEST", None, pattern)
assert len(matcher(doc)) == 1
@@ -52,68 +31,6 @@ def test_phrase_matcher_contains(en_vocab):
assert "TEST2" not in matcher
-def test_phrase_matcher_repeated_add(en_vocab):
- matcher = PhraseMatcher(en_vocab)
- # match ID only gets added once
- matcher.add("TEST", None, Doc(en_vocab, words=["like"]))
- matcher.add("TEST", None, Doc(en_vocab, words=["like"]))
- matcher.add("TEST", None, Doc(en_vocab, words=["like"]))
- matcher.add("TEST", None, Doc(en_vocab, words=["like"]))
- doc = Doc(en_vocab, words=["I", "like", "Google", "Now", "best"])
- assert "TEST" in matcher
- assert "TEST2" not in matcher
- assert len(matcher(doc)) == 1
-
-
-def test_phrase_matcher_remove(en_vocab):
- matcher = PhraseMatcher(en_vocab)
- matcher.add("TEST1", None, Doc(en_vocab, words=["like"]))
- matcher.add("TEST2", None, Doc(en_vocab, words=["best"]))
- doc = Doc(en_vocab, words=["I", "like", "Google", "Now", "best"])
- assert "TEST1" in matcher
- assert "TEST2" in matcher
- assert "TEST3" not in matcher
- assert len(matcher(doc)) == 2
- matcher.remove("TEST1")
- assert "TEST1" not in matcher
- assert "TEST2" in matcher
- assert "TEST3" not in matcher
- assert len(matcher(doc)) == 1
- matcher.remove("TEST2")
- assert "TEST1" not in matcher
- assert "TEST2" not in matcher
- assert "TEST3" not in matcher
- assert len(matcher(doc)) == 0
- with pytest.raises(KeyError):
- matcher.remove("TEST3")
- assert "TEST1" not in matcher
- assert "TEST2" not in matcher
- assert "TEST3" not in matcher
- assert len(matcher(doc)) == 0
-
-
-def test_phrase_matcher_overlapping_with_remove(en_vocab):
- matcher = PhraseMatcher(en_vocab)
- matcher.add("TEST", None, Doc(en_vocab, words=["like"]))
- # TEST2 is added alongside TEST
- matcher.add("TEST2", None, Doc(en_vocab, words=["like"]))
- doc = Doc(en_vocab, words=["I", "like", "Google", "Now", "best"])
- assert "TEST" in matcher
- assert len(matcher) == 2
- assert len(matcher(doc)) == 2
- # removing TEST does not remove the entry for TEST2
- matcher.remove("TEST")
- assert "TEST" not in matcher
- assert len(matcher) == 1
- assert len(matcher(doc)) == 1
- assert matcher(doc)[0][0] == en_vocab.strings["TEST2"]
- # removing TEST2 removes all
- matcher.remove("TEST2")
- assert "TEST2" not in matcher
- assert len(matcher) == 0
- assert len(matcher(doc)) == 0
-
-
def test_phrase_matcher_string_attrs(en_vocab):
words1 = ["I", "like", "cats"]
pos1 = ["PRON", "VERB", "NOUN"]
diff --git a/spacy/tests/morphology/__init__.py b/spacy/tests/morphology/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/spacy/tests/morphology/test_morph_features.py b/spacy/tests/morphology/test_morph_features.py
deleted file mode 100644
index 4b8f0d754..000000000
--- a/spacy/tests/morphology/test_morph_features.py
+++ /dev/null
@@ -1,48 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import pytest
-from spacy.morphology import Morphology
-from spacy.strings import StringStore, get_string_id
-from spacy.lemmatizer import Lemmatizer
-
-
-@pytest.fixture
-def morphology():
- return Morphology(StringStore(), {}, Lemmatizer())
-
-
-def test_init(morphology):
- pass
-
-
-def test_add_morphology_with_string_names(morphology):
- morphology.add({"Case_gen", "Number_sing"})
-
-
-def test_add_morphology_with_int_ids(morphology):
- morphology.add({get_string_id("Case_gen"), get_string_id("Number_sing")})
-
-
-def test_add_morphology_with_mix_strings_and_ints(morphology):
- morphology.add({get_string_id("PunctSide_ini"), "VerbType_aux"})
-
-
-def test_morphology_tags_hash_distinctly(morphology):
- tag1 = morphology.add({"PunctSide_ini", "VerbType_aux"})
- tag2 = morphology.add({"Case_gen", "Number_sing"})
- assert tag1 != tag2
-
-
-def test_morphology_tags_hash_independent_of_order(morphology):
- tag1 = morphology.add({"Case_gen", "Number_sing"})
- tag2 = morphology.add({"Number_sing", "Case_gen"})
- assert tag1 == tag2
-
-
-def test_update_morphology_tag(morphology):
- tag1 = morphology.add({"Case_gen"})
- tag2 = morphology.update(tag1, {"Number_sing"})
- assert tag1 != tag2
- tag3 = morphology.add({"Number_sing", "Case_gen"})
- assert tag2 == tag3
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index 4dc7542ed..43c00a963 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -2,9 +2,7 @@
from __future__ import unicode_literals
import pytest
-from spacy.lang.en import English
-
-from spacy.pipeline import EntityRecognizer, EntityRuler
+from spacy.pipeline import EntityRecognizer
from spacy.vocab import Vocab
from spacy.syntax.ner import BiluoPushDown
from spacy.gold import GoldParse
@@ -82,190 +80,14 @@ def test_get_oracle_moves_negative_O(tsys, vocab):
assert names
-def test_oracle_moves_missing_B(en_vocab):
- words = ["B", "52", "Bomber"]
- biluo_tags = [None, None, "L-PRODUCT"]
-
- doc = Doc(en_vocab, words=words)
- gold = GoldParse(doc, words=words, entities=biluo_tags)
-
- moves = BiluoPushDown(en_vocab.strings)
- move_types = ("M", "B", "I", "L", "U", "O")
- for tag in biluo_tags:
- if tag is None:
- continue
- elif tag == "O":
- moves.add_action(move_types.index("O"), "")
- else:
- action, label = tag.split("-")
- moves.add_action(move_types.index("B"), label)
- moves.add_action(move_types.index("I"), label)
- moves.add_action(move_types.index("L"), label)
- moves.add_action(move_types.index("U"), label)
- moves.preprocess_gold(gold)
- seq = moves.get_oracle_sequence(doc, gold)
-
-
-def test_oracle_moves_whitespace(en_vocab):
- words = ["production", "\n", "of", "Northrop", "\n", "Corp.", "\n", "'s", "radar"]
- biluo_tags = ["O", "O", "O", "B-ORG", None, "I-ORG", "L-ORG", "O", "O"]
-
- doc = Doc(en_vocab, words=words)
- gold = GoldParse(doc, words=words, entities=biluo_tags)
-
- moves = BiluoPushDown(en_vocab.strings)
- move_types = ("M", "B", "I", "L", "U", "O")
- for tag in biluo_tags:
- if tag is None:
- continue
- elif tag == "O":
- moves.add_action(move_types.index("O"), "")
- else:
- action, label = tag.split("-")
- moves.add_action(move_types.index(action), label)
- moves.preprocess_gold(gold)
- moves.get_oracle_sequence(doc, gold)
-
-
-def test_accept_blocked_token():
- """Test succesful blocking of tokens to be in an entity."""
- # 1. test normal behaviour
- nlp1 = English()
- doc1 = nlp1("I live in New York")
- ner1 = EntityRecognizer(doc1.vocab)
- assert [token.ent_iob_ for token in doc1] == ["", "", "", "", ""]
- assert [token.ent_type_ for token in doc1] == ["", "", "", "", ""]
-
- # Add the OUT action
- ner1.moves.add_action(5, "")
- ner1.add_label("GPE")
- # Get into the state just before "New"
- state1 = ner1.moves.init_batch([doc1])[0]
- ner1.moves.apply_transition(state1, "O")
- ner1.moves.apply_transition(state1, "O")
- ner1.moves.apply_transition(state1, "O")
- # Check that B-GPE is valid.
- assert ner1.moves.is_valid(state1, "B-GPE")
-
- # 2. test blocking behaviour
- nlp2 = English()
- doc2 = nlp2("I live in New York")
- ner2 = EntityRecognizer(doc2.vocab)
-
- # set "New York" to a blocked entity
- doc2.ents = [(0, 3, 5)]
- assert [token.ent_iob_ for token in doc2] == ["", "", "", "B", "B"]
- assert [token.ent_type_ for token in doc2] == ["", "", "", "", ""]
-
- # Check that B-GPE is now invalid.
- ner2.moves.add_action(4, "")
- ner2.moves.add_action(5, "")
- ner2.add_label("GPE")
- state2 = ner2.moves.init_batch([doc2])[0]
- ner2.moves.apply_transition(state2, "O")
- ner2.moves.apply_transition(state2, "O")
- ner2.moves.apply_transition(state2, "O")
- # we can only use U- for "New"
- assert not ner2.moves.is_valid(state2, "B-GPE")
- assert ner2.moves.is_valid(state2, "U-")
- ner2.moves.apply_transition(state2, "U-")
- # we can only use U- for "York"
- assert not ner2.moves.is_valid(state2, "B-GPE")
- assert ner2.moves.is_valid(state2, "U-")
-
-
-def test_overwrite_token():
- nlp = English()
- ner1 = nlp.create_pipe("ner")
- nlp.add_pipe(ner1, name="ner")
- nlp.begin_training()
-
- # The untrained NER will predict O for each token
- doc = nlp("I live in New York")
- assert [token.ent_iob_ for token in doc] == ["O", "O", "O", "O", "O"]
- assert [token.ent_type_ for token in doc] == ["", "", "", "", ""]
-
- # Check that a new ner can overwrite O
- ner2 = EntityRecognizer(doc.vocab)
- ner2.moves.add_action(5, "")
- ner2.add_label("GPE")
- state = ner2.moves.init_batch([doc])[0]
- assert ner2.moves.is_valid(state, "B-GPE")
- assert ner2.moves.is_valid(state, "U-GPE")
- ner2.moves.apply_transition(state, "B-GPE")
- assert ner2.moves.is_valid(state, "I-GPE")
- assert ner2.moves.is_valid(state, "L-GPE")
-
-
-def test_ruler_before_ner():
- """ Test that an NER works after an entity_ruler: the second can add annotations """
- nlp = English()
-
- # 1 : Entity Ruler - should set "this" to B and everything else to empty
- ruler = EntityRuler(nlp)
- patterns = [{"label": "THING", "pattern": "This"}]
- ruler.add_patterns(patterns)
- nlp.add_pipe(ruler)
-
- # 2: untrained NER - should set everything else to O
- untrained_ner = nlp.create_pipe("ner")
- untrained_ner.add_label("MY_LABEL")
- nlp.add_pipe(untrained_ner)
- nlp.begin_training()
-
- doc = nlp("This is Antti Korhonen speaking in Finland")
- expected_iobs = ["B", "O", "O", "O", "O", "O", "O"]
- expected_types = ["THING", "", "", "", "", "", ""]
- assert [token.ent_iob_ for token in doc] == expected_iobs
- assert [token.ent_type_ for token in doc] == expected_types
-
-
-def test_ner_before_ruler():
- """ Test that an entity_ruler works after an NER: the second can overwrite O annotations """
- nlp = English()
-
- # 1: untrained NER - should set everything to O
- untrained_ner = nlp.create_pipe("ner")
- untrained_ner.add_label("MY_LABEL")
- nlp.add_pipe(untrained_ner, name="uner")
- nlp.begin_training()
-
- # 2 : Entity Ruler - should set "this" to B and keep everything else O
- ruler = EntityRuler(nlp)
- patterns = [{"label": "THING", "pattern": "This"}]
- ruler.add_patterns(patterns)
- nlp.add_pipe(ruler)
-
- doc = nlp("This is Antti Korhonen speaking in Finland")
- expected_iobs = ["B", "O", "O", "O", "O", "O", "O"]
- expected_types = ["THING", "", "", "", "", "", ""]
- assert [token.ent_iob_ for token in doc] == expected_iobs
- assert [token.ent_type_ for token in doc] == expected_types
-
-
-def test_block_ner():
- """ Test functionality for blocking tokens so they can't be in a named entity """
- # block "Antti L Korhonen" from being a named entity
- nlp = English()
- nlp.add_pipe(BlockerComponent1(2, 5))
- untrained_ner = nlp.create_pipe("ner")
- untrained_ner.add_label("MY_LABEL")
- nlp.add_pipe(untrained_ner, name="uner")
- nlp.begin_training()
- doc = nlp("This is Antti L Korhonen speaking in Finland")
- expected_iobs = ["O", "O", "B", "B", "B", "O", "O", "O"]
- expected_types = ["", "", "", "", "", "", "", ""]
- assert [token.ent_iob_ for token in doc] == expected_iobs
- assert [token.ent_type_ for token in doc] == expected_types
-
-
-class BlockerComponent1(object):
- name = "my_blocker"
-
- def __init__(self, start, end):
- self.start = start
- self.end = end
-
- def __call__(self, doc):
- doc.ents = [(0, self.start, self.end)]
- return doc
+def test_doc_add_entities_set_ents_iob(en_vocab):
+ doc = Doc(en_vocab, words=["This", "is", "a", "lion"])
+ ner = EntityRecognizer(en_vocab)
+ ner.begin_training([])
+ ner(doc)
+ assert len(list(doc.ents)) == 0
+ assert [w.ent_iob_ for w in doc] == (["O"] * len(doc))
+ doc.ents = [(doc.vocab.strings["ANIMAL"], 3, 4)]
+ assert [w.ent_iob_ for w in doc] == ["", "", "", "B"]
+ doc.ents = [(doc.vocab.strings["WORD"], 0, 2)]
+ assert [w.ent_iob_ for w in doc] == ["B", "I", "", ""]
diff --git a/spacy/tests/regression/test_issue1-1000.py b/spacy/tests/regression/test_issue1-1000.py
index b3f347765..febf2b5b3 100644
--- a/spacy/tests/regression/test_issue1-1000.py
+++ b/spacy/tests/regression/test_issue1-1000.py
@@ -426,7 +426,7 @@ def test_issue957(en_tokenizer):
def test_issue999(train_data):
"""Test that adding entities and resuming training works passably OK.
There are two issues here:
- 1) We have to read labels. This isn't very nice.
+    1) We have to re-add labels. This isn't very nice.
2) There's no way to set the learning rate for the weight update, so we
end up out-of-scale, causing it to learn too fast.
"""
diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py
index 520090bb4..24f725ab8 100644
--- a/spacy/tests/regression/test_issue1501-2000.py
+++ b/spacy/tests/regression/test_issue1501-2000.py
@@ -187,7 +187,7 @@ def test_issue1799():
def test_issue1807():
"""Test vocab.set_vector also adds the word to the vocab."""
- vocab = Vocab(vectors_name="test_issue1807")
+ vocab = Vocab()
assert "hello" not in vocab
vocab.set_vector("hello", numpy.ones((50,), dtype="f"))
assert "hello" in vocab
diff --git a/spacy/tests/regression/test_issue2501-3000.py b/spacy/tests/regression/test_issue2501-3000.py
index a0b1e2aac..cf29c2535 100644
--- a/spacy/tests/regression/test_issue2501-3000.py
+++ b/spacy/tests/regression/test_issue2501-3000.py
@@ -184,7 +184,7 @@ def test_issue2833(en_vocab):
def test_issue2871():
"""Test that vectors recover the correct key for spaCy reserved words."""
words = ["dog", "cat", "SUFFIX"]
- vocab = Vocab(vectors_name="test_issue2871")
+ vocab = Vocab()
vocab.vectors.resize(shape=(3, 10))
vector_data = numpy.zeros((3, 10), dtype="f")
for word in words:
diff --git a/spacy/tests/regression/test_issue3001-3500.py b/spacy/tests/regression/test_issue3001-3500.py
index c430678d3..3b0c2f1ed 100644
--- a/spacy/tests/regression/test_issue3001-3500.py
+++ b/spacy/tests/regression/test_issue3001-3500.py
@@ -30,20 +30,20 @@ def test_issue3002():
def test_issue3009(en_vocab):
"""Test problem with matcher quantifiers"""
patterns = [
- [{"LEMMA": "have"}, {"LOWER": "to"}, {"LOWER": "do"}, {"TAG": "IN"}],
+ [{"LEMMA": "have"}, {"LOWER": "to"}, {"LOWER": "do"}, {"POS": "ADP"}],
[
{"LEMMA": "have"},
{"IS_ASCII": True, "IS_PUNCT": False, "OP": "*"},
{"LOWER": "to"},
{"LOWER": "do"},
- {"TAG": "IN"},
+ {"POS": "ADP"},
],
[
{"LEMMA": "have"},
{"IS_ASCII": True, "IS_PUNCT": False, "OP": "?"},
{"LOWER": "to"},
{"LOWER": "do"},
- {"TAG": "IN"},
+ {"POS": "ADP"},
],
]
words = ["also", "has", "to", "do", "with"]
diff --git a/spacy/tests/regression/test_issue4042.py b/spacy/tests/regression/test_issue4042.py
deleted file mode 100644
index 500be9f2a..000000000
--- a/spacy/tests/regression/test_issue4042.py
+++ /dev/null
@@ -1,82 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-import spacy
-from spacy.pipeline import EntityRecognizer, EntityRuler
-from spacy.lang.en import English
-from spacy.tokens import Span
-from spacy.util import ensure_path
-
-from ..util import make_tempdir
-
-
-def test_issue4042():
- """Test that serialization of an EntityRuler before NER works fine."""
- nlp = English()
-
- # add ner pipe
- ner = nlp.create_pipe("ner")
- ner.add_label("SOME_LABEL")
- nlp.add_pipe(ner)
- nlp.begin_training()
-
- # Add entity ruler
- ruler = EntityRuler(nlp)
- patterns = [
- {"label": "MY_ORG", "pattern": "Apple"},
- {"label": "MY_GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]},
- ]
- ruler.add_patterns(patterns)
- nlp.add_pipe(ruler, before="ner") # works fine with "after"
- doc1 = nlp("What do you think about Apple ?")
- assert doc1.ents[0].label_ == "MY_ORG"
-
- with make_tempdir() as d:
- output_dir = ensure_path(d)
- if not output_dir.exists():
- output_dir.mkdir()
- nlp.to_disk(output_dir)
-
- nlp2 = spacy.load(output_dir)
- doc2 = nlp2("What do you think about Apple ?")
- assert doc2.ents[0].label_ == "MY_ORG"
-
-
-def test_issue4042_bug2():
- """
- Test that serialization of an NER works fine when new labels were added.
- This is the second bug of two bugs underlying the issue 4042.
- """
- nlp1 = English()
- vocab = nlp1.vocab
-
- # add ner pipe
- ner1 = nlp1.create_pipe("ner")
- ner1.add_label("SOME_LABEL")
- nlp1.add_pipe(ner1)
- nlp1.begin_training()
-
- # add a new label to the doc
- doc1 = nlp1("What do you think about Apple ?")
- assert len(ner1.labels) == 1
- assert "SOME_LABEL" in ner1.labels
- apple_ent = Span(doc1, 5, 6, label="MY_ORG")
- doc1.ents = list(doc1.ents) + [apple_ent]
-
- # reapply the NER - at this point it should resize itself
- ner1(doc1)
- assert len(ner1.labels) == 2
- assert "SOME_LABEL" in ner1.labels
- assert "MY_ORG" in ner1.labels
-
- with make_tempdir() as d:
- # assert IO goes fine
- output_dir = ensure_path(d)
- if not output_dir.exists():
- output_dir.mkdir()
- ner1.to_disk(output_dir)
-
- nlp2 = English(vocab)
- ner2 = EntityRecognizer(vocab)
- ner2.from_disk(output_dir)
- assert len(ner2.labels) == 2
diff --git a/spacy/tests/regression/test_issue4054.py b/spacy/tests/regression/test_issue4054.py
index cc84cebf8..2c9d73751 100644
--- a/spacy/tests/regression/test_issue4054.py
+++ b/spacy/tests/regression/test_issue4054.py
@@ -2,12 +2,12 @@
from __future__ import unicode_literals
from spacy.vocab import Vocab
+
import spacy
from spacy.lang.en import English
+from spacy.tests.util import make_tempdir
from spacy.util import ensure_path
-from ..util import make_tempdir
-
def test_issue4054(en_vocab):
"""Test that a new blank model can be made with a vocab from file,
diff --git a/spacy/tests/regression/test_issue4267.py b/spacy/tests/regression/test_issue4267.py
deleted file mode 100644
index 5fc61e142..000000000
--- a/spacy/tests/regression/test_issue4267.py
+++ /dev/null
@@ -1,42 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-import pytest
-
-import spacy
-
-from spacy.lang.en import English
-from spacy.pipeline import EntityRuler
-from spacy.tokens import Span
-
-
-def test_issue4267():
- """ Test that running an entity_ruler after ner gives consistent results"""
- nlp = English()
- ner = nlp.create_pipe("ner")
- ner.add_label("PEOPLE")
- nlp.add_pipe(ner)
- nlp.begin_training()
-
- assert "ner" in nlp.pipe_names
-
- # assert that we have correct IOB annotations
- doc1 = nlp("hi")
- assert doc1.is_nered
- for token in doc1:
- assert token.ent_iob == 2
-
- # add entity ruler and run again
- ruler = EntityRuler(nlp)
- patterns = [{"label": "SOFTWARE", "pattern": "spacy"}]
-
- ruler.add_patterns(patterns)
- nlp.add_pipe(ruler)
- assert "entity_ruler" in nlp.pipe_names
- assert "ner" in nlp.pipe_names
-
- # assert that we still have correct IOB annotations
- doc2 = nlp("hi")
- assert doc2.is_nered
- for token in doc2:
- assert token.ent_iob == 2
diff --git a/spacy/tests/regression/test_issue4278.py b/spacy/tests/regression/test_issue4278.py
index cb09340ff..4c85d15c4 100644
--- a/spacy/tests/regression/test_issue4278.py
+++ b/spacy/tests/regression/test_issue4278.py
@@ -13,7 +13,7 @@ class DummyPipe(Pipe):
def predict(self, docs):
return ([1, 2, 3], [4, 5, 6])
- def set_annotations(self, docs, scores, tensors=None):
+ def set_annotations(self, docs, scores, tensor=None):
return docs
diff --git a/spacy/tests/regression/test_issue4313.py b/spacy/tests/regression/test_issue4313.py
deleted file mode 100644
index c68f745a7..000000000
--- a/spacy/tests/regression/test_issue4313.py
+++ /dev/null
@@ -1,39 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-from collections import defaultdict
-
-from spacy.pipeline import EntityRecognizer
-
-from spacy.lang.en import English
-from spacy.tokens import Span
-
-
-def test_issue4313():
- """ This should not crash or exit with some strange error code """
- beam_width = 16
- beam_density = 0.0001
- nlp = English()
- ner = EntityRecognizer(nlp.vocab)
- ner.add_label("SOME_LABEL")
- ner.begin_training([])
- nlp.add_pipe(ner)
-
- # add a new label to the doc
- doc = nlp("What do you think about Apple ?")
- assert len(ner.labels) == 1
- assert "SOME_LABEL" in ner.labels
- apple_ent = Span(doc, 5, 6, label="MY_ORG")
- doc.ents = list(doc.ents) + [apple_ent]
-
- # ensure the beam_parse still works with the new label
- docs = [doc]
- beams = nlp.entity.beam_parse(
- docs, beam_width=beam_width, beam_density=beam_density
- )
-
- for doc, beam in zip(docs, beams):
- entity_scores = defaultdict(float)
- for score, ents in nlp.entity.moves.get_beam_parses(beam):
- for start, end, label in ents:
- entity_scores[(start, end, label)] += score
diff --git a/spacy/tests/serialize/test_serialize_kb.py b/spacy/tests/serialize/test_serialize_kb.py
index b19c11864..67fd9f0d4 100644
--- a/spacy/tests/serialize/test_serialize_kb.py
+++ b/spacy/tests/serialize/test_serialize_kb.py
@@ -1,10 +1,10 @@
# coding: utf-8
from __future__ import unicode_literals
-from spacy.util import ensure_path
-from spacy.kb import KnowledgeBase
-
from ..util import make_tempdir
+from ...util import ensure_path
+
+from spacy.kb import KnowledgeBase
def test_serialize_kb_disk(en_vocab):
diff --git a/spacy/tests/test_displacy.py b/spacy/tests/test_displacy.py
index 2d1f1bd8f..5e99d261a 100644
--- a/spacy/tests/test_displacy.py
+++ b/spacy/tests/test_displacy.py
@@ -32,7 +32,7 @@ def test_displacy_parse_deps(en_vocab):
assert isinstance(deps, dict)
assert deps["words"] == [
{"text": "This", "tag": "DET"},
- {"text": "is", "tag": "AUX"},
+ {"text": "is", "tag": "VERB"},
{"text": "a", "tag": "DET"},
{"text": "sentence", "tag": "NOUN"},
]
diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py
index 4f79c4463..860540be2 100644
--- a/spacy/tests/test_gold.py
+++ b/spacy/tests/test_gold.py
@@ -3,12 +3,8 @@ from __future__ import unicode_literals
from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags
from spacy.gold import spans_from_biluo_tags, GoldParse
-from spacy.gold import GoldCorpus, docs_to_json
-from spacy.lang.en import English
from spacy.tokens import Doc
-from .util import make_tempdir
import pytest
-import srsly
def test_gold_biluo_U(en_vocab):
@@ -85,28 +81,3 @@ def test_gold_ner_missing_tags(en_tokenizer):
doc = en_tokenizer("I flew to Silicon Valley via London.")
biluo_tags = [None, "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
gold = GoldParse(doc, entities=biluo_tags) # noqa: F841
-
-
-def test_roundtrip_docs_to_json():
- text = "I flew to Silicon Valley via London."
- cats = {"TRAVEL": 1.0, "BAKING": 0.0}
- nlp = English()
- doc = nlp(text)
- doc.cats = cats
- doc[0].is_sent_start = True
- for i in range(1, len(doc)):
- doc[i].is_sent_start = False
-
- with make_tempdir() as tmpdir:
- json_file = tmpdir / "roundtrip.json"
- srsly.write_json(json_file, [docs_to_json(doc)])
- goldcorpus = GoldCorpus(str(json_file), str(json_file))
-
- reloaded_doc, goldparse = next(goldcorpus.train_docs(nlp))
-
- assert len(doc) == goldcorpus.count_train()
- assert text == reloaded_doc.text
- assert "TRAVEL" in goldparse.cats
- assert "BAKING" in goldparse.cats
- assert cats["TRAVEL"] == goldparse.cats["TRAVEL"]
- assert cats["BAKING"] == goldparse.cats["BAKING"]
diff --git a/spacy/tests/test_scorer.py b/spacy/tests/test_scorer.py
index 9cc4f75b2..a747d3adb 100644
--- a/spacy/tests/test_scorer.py
+++ b/spacy/tests/test_scorer.py
@@ -1,12 +1,9 @@
# coding: utf-8
from __future__ import unicode_literals
-from numpy.testing import assert_almost_equal, assert_array_almost_equal
-import pytest
from pytest import approx
from spacy.gold import GoldParse
-from spacy.scorer import Scorer, ROCAUCScore
-from spacy.scorer import _roc_auc_score, _roc_curve
+from spacy.scorer import Scorer
from .util import get_doc
test_ner_cardinal = [
@@ -69,73 +66,3 @@ def test_ner_per_type(en_vocab):
assert results["ents_per_type"]["ORG"]["p"] == 50
assert results["ents_per_type"]["ORG"]["r"] == 100
assert results["ents_per_type"]["ORG"]["f"] == approx(66.66666)
-
-
-def test_roc_auc_score():
- # Binary classification, toy tests from scikit-learn test suite
- y_true = [0, 1]
- y_score = [0, 1]
- tpr, fpr, _ = _roc_curve(y_true, y_score)
- roc_auc = _roc_auc_score(y_true, y_score)
- assert_array_almost_equal(tpr, [0, 0, 1])
- assert_array_almost_equal(fpr, [0, 1, 1])
- assert_almost_equal(roc_auc, 1.0)
-
- y_true = [0, 1]
- y_score = [1, 0]
- tpr, fpr, _ = _roc_curve(y_true, y_score)
- roc_auc = _roc_auc_score(y_true, y_score)
- assert_array_almost_equal(tpr, [0, 1, 1])
- assert_array_almost_equal(fpr, [0, 0, 1])
- assert_almost_equal(roc_auc, 0.0)
-
- y_true = [1, 0]
- y_score = [1, 1]
- tpr, fpr, _ = _roc_curve(y_true, y_score)
- roc_auc = _roc_auc_score(y_true, y_score)
- assert_array_almost_equal(tpr, [0, 1])
- assert_array_almost_equal(fpr, [0, 1])
- assert_almost_equal(roc_auc, 0.5)
-
- y_true = [1, 0]
- y_score = [1, 0]
- tpr, fpr, _ = _roc_curve(y_true, y_score)
- roc_auc = _roc_auc_score(y_true, y_score)
- assert_array_almost_equal(tpr, [0, 0, 1])
- assert_array_almost_equal(fpr, [0, 1, 1])
- assert_almost_equal(roc_auc, 1.0)
-
- y_true = [1, 0]
- y_score = [0.5, 0.5]
- tpr, fpr, _ = _roc_curve(y_true, y_score)
- roc_auc = _roc_auc_score(y_true, y_score)
- assert_array_almost_equal(tpr, [0, 1])
- assert_array_almost_equal(fpr, [0, 1])
- assert_almost_equal(roc_auc, 0.5)
-
- # same result as above with ROCAUCScore wrapper
- score = ROCAUCScore()
- score.score_set(0.5, 1)
- score.score_set(0.5, 0)
- assert_almost_equal(score.score, 0.5)
-
- # check that errors are raised in undefined cases and score is -inf
- y_true = [0, 0]
- y_score = [0.25, 0.75]
- with pytest.raises(ValueError):
- _roc_auc_score(y_true, y_score)
-
- score = ROCAUCScore()
- score.score_set(0.25, 0)
- score.score_set(0.75, 0)
- assert score.score == -float("inf")
-
- y_true = [1, 1]
- y_score = [0.25, 0.75]
- with pytest.raises(ValueError):
- _roc_auc_score(y_true, y_score)
-
- score = ROCAUCScore()
- score.score_set(0.25, 1)
- score.score_set(0.75, 1)
- assert score.score == -float("inf")
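
The removed block above only concerns the ROC/AUC helpers; the NER scoring path used by the surviving `test_ner_per_type` is sketched below. The entity offsets and labels here are invented for illustration and are not taken from the test file.

```python
# Sketch only: how Scorer produces the ents_p/ents_r/ents_f and ents_per_type
# numbers that the remaining test asserts on.
from spacy.lang.en import English
from spacy.gold import GoldParse
from spacy.scorer import Scorer
from spacy.tokens import Span

nlp = English()
doc = nlp("Google rebrands its business apps")
doc.ents = [Span(doc, 0, 1, label="ORG")]        # predicted entity
gold = GoldParse(doc, entities=[(0, 6, "ORG")])  # gold entity as character offsets
scorer = Scorer()
scorer.score(doc, gold)
print(scorer.scores["ents_p"], scorer.scores["ents_r"], scorer.scores["ents_f"])
print(scorer.scores["ents_per_type"]["ORG"])     # per-label p/r/f breakdown
```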
diff --git a/spacy/tests/vocab_vectors/test_lookups.py b/spacy/tests/vocab_vectors/test_lookups.py
index f78dd33c4..16ffe83fc 100644
--- a/spacy/tests/vocab_vectors/test_lookups.py
+++ b/spacy/tests/vocab_vectors/test_lookups.py
@@ -2,8 +2,7 @@
from __future__ import unicode_literals
import pytest
-from spacy.lookups import Lookups, Table
-from spacy.strings import get_string_id
+from spacy.lookups import Lookups
from spacy.vocab import Vocab
from ..util import make_tempdir
@@ -20,9 +19,9 @@ def test_lookups_api():
table = lookups.get_table(table_name)
assert table.name == table_name
assert len(table) == 2
- assert table["hello"] == "world"
- table["a"] = "b"
- assert table["a"] == "b"
+ assert table.get("hello") == "world"
+ table.set("a", "b")
+ assert table.get("a") == "b"
table = lookups.get_table(table_name)
assert len(table) == 3
with pytest.raises(KeyError):
@@ -37,44 +36,8 @@ def test_lookups_api():
lookups.get_table(table_name)
-def test_table_api():
- table = Table(name="table")
- assert table.name == "table"
- assert len(table) == 0
- assert "abc" not in table
- data = {"foo": "bar", "hello": "world"}
- table = Table(name="table", data=data)
- assert len(table) == len(data)
- assert "foo" in table
- assert get_string_id("foo") in table
- assert table["foo"] == "bar"
- assert table[get_string_id("foo")] == "bar"
- assert table.get("foo") == "bar"
- assert table.get("abc") is None
- table["abc"] = 123
- assert table["abc"] == 123
- assert table[get_string_id("abc")] == 123
- table.set("def", 456)
- assert table["def"] == 456
- assert table[get_string_id("def")] == 456
-
-
-def test_table_api_to_from_bytes():
- data = {"foo": "bar", "hello": "world", "abc": 123}
- table = Table(name="table", data=data)
- table_bytes = table.to_bytes()
- new_table = Table().from_bytes(table_bytes)
- assert new_table.name == "table"
- assert len(new_table) == 3
- assert new_table["foo"] == "bar"
- assert new_table[get_string_id("foo")] == "bar"
- new_table2 = Table(data={"def": 456})
- new_table2.from_bytes(table_bytes)
- assert len(new_table2) == 3
- assert "def" not in new_table2
-
-
-@pytest.mark.skip(reason="This fails on Python 3.5")
+# This fails on Python 3.5
+@pytest.mark.xfail
def test_lookups_to_from_bytes():
lookups = Lookups()
lookups.add_table("table1", {"foo": "bar", "hello": "world"})
@@ -87,14 +50,15 @@ def test_lookups_to_from_bytes():
assert "table2" in new_lookups
table1 = new_lookups.get_table("table1")
assert len(table1) == 2
- assert table1["foo"] == "bar"
+ assert table1.get("foo") == "bar"
table2 = new_lookups.get_table("table2")
assert len(table2) == 3
- assert table2["b"] == 2
+ assert table2.get("b") == 2
assert new_lookups.to_bytes() == lookups_bytes
-@pytest.mark.skip(reason="This fails on Python 3.5")
+# This fails on Python 3.5
+@pytest.mark.xfail
def test_lookups_to_from_disk():
lookups = Lookups()
lookups.add_table("table1", {"foo": "bar", "hello": "world"})
@@ -108,13 +72,14 @@ def test_lookups_to_from_disk():
assert "table2" in new_lookups
table1 = new_lookups.get_table("table1")
assert len(table1) == 2
- assert table1["foo"] == "bar"
+ assert table1.get("foo") == "bar"
table2 = new_lookups.get_table("table2")
assert len(table2) == 3
- assert table2["b"] == 2
+ assert table2.get("b") == 2
-@pytest.mark.skip(reason="This fails on Python 3.5")
+# This fails on Python 3.5
+@pytest.mark.xfail
def test_lookups_to_from_bytes_via_vocab():
table_name = "test"
vocab = Vocab()
@@ -128,11 +93,12 @@ def test_lookups_to_from_bytes_via_vocab():
assert table_name in new_vocab.lookups
table = new_vocab.lookups.get_table(table_name)
assert len(table) == 2
- assert table["hello"] == "world"
+ assert table.get("hello") == "world"
assert new_vocab.to_bytes() == vocab_bytes
-@pytest.mark.skip(reason="This fails on Python 3.5")
+# This fails on Python 3.5
+@pytest.mark.xfail
def test_lookups_to_from_disk_via_vocab():
table_name = "test"
vocab = Vocab()
@@ -147,4 +113,4 @@ def test_lookups_to_from_disk_via_vocab():
assert table_name in new_vocab.lookups
table = new_vocab.lookups.get_table(table_name)
assert len(table) == 2
- assert table["hello"] == "world"
+ assert table.get("hello") == "world"
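
The hunk above switches the tests from dict-style item access back to explicit `get`/`set` calls on `Table`. Both styles are summarized in this sketch; the table name is arbitrary, and which style actually works depends on the spaCy version the tests run against.

```python
# Sketch only: the two Table access styles this hunk switches between.
from spacy.lookups import Lookups

lookups = Lookups()
lookups.add_table("lemma_exc", {"hello": "world"})
table = lookups.get_table("lemma_exc")

# Style used on the "+" lines (older API): explicit get/set methods
table.set("a", "b")
assert table.get("hello") == "world"
assert table.get("a") == "b"

# Style used on the "-" lines (newer API): dict-like item access
# table["a"] = "b"
# assert table["hello"] == "world"
```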
diff --git a/spacy/tests/vocab_vectors/test_vectors.py b/spacy/tests/vocab_vectors/test_vectors.py
index 4226bca3b..2a828de9c 100644
--- a/spacy/tests/vocab_vectors/test_vectors.py
+++ b/spacy/tests/vocab_vectors/test_vectors.py
@@ -259,7 +259,7 @@ def test_vectors_doc_doc_similarity(vocab, text1, text2):
def test_vocab_add_vector():
- vocab = Vocab(vectors_name="test_vocab_add_vector")
+ vocab = Vocab()
data = numpy.ndarray((5, 3), dtype="f")
data[0] = 1.0
data[1] = 2.0
@@ -272,7 +272,7 @@ def test_vocab_add_vector():
def test_vocab_prune_vectors():
- vocab = Vocab(vectors_name="test_vocab_prune_vectors")
+ vocab = Vocab()
_ = vocab["cat"] # noqa: F841
_ = vocab["dog"] # noqa: F841
_ = vocab["kitten"] # noqa: F841
diff --git a/spacy/tokens/__init__.py b/spacy/tokens/__init__.py
index 536ec8349..5722d45bc 100644
--- a/spacy/tokens/__init__.py
+++ b/spacy/tokens/__init__.py
@@ -4,6 +4,5 @@ from __future__ import unicode_literals
from .doc import Doc
from .token import Token
from .span import Span
-from ._serialize import DocBin
-__all__ = ["Doc", "Token", "Span", "DocBin"]
+__all__ = ["Doc", "Token", "Span"]
diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx
index 5b0747fa0..741be7e6a 100644
--- a/spacy/tokens/_retokenize.pyx
+++ b/spacy/tokens/_retokenize.pyx
@@ -146,12 +146,11 @@ def _merge(Doc doc, merges):
syntactic root of the span.
RETURNS (Token): The first newly merged token.
"""
- cdef int i, merge_index, start, end, token_index, current_span_index, current_offset, offset, span_index
+ cdef int i, merge_index, start, end, token_index
cdef Span span
cdef const LexemeC* lex
cdef TokenC* token
cdef Pool mem = Pool()
- cdef int merged_iob = 0
tokens = mem.alloc(len(merges), sizeof(TokenC))
spans = []
diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py
index 634d7450a..41f524839 100644
--- a/spacy/tokens/_serialize.py
+++ b/spacy/tokens/_serialize.py
@@ -8,77 +8,36 @@ from thinc.neural.ops import NumpyOps
from ..compat import copy_reg
from ..tokens import Doc
-from ..attrs import SPACY, ORTH, intify_attrs
-from ..errors import Errors
+from ..attrs import SPACY, ORTH
-class DocBin(object):
- """Pack Doc objects for binary serialization.
-
- The DocBin class lets you efficiently serialize the information from a
- collection of Doc objects. You can control which information is serialized
- by passing a list of attribute IDs, and optionally also specify whether the
- user data is serialized. The DocBin is faster and produces smaller data
- sizes than pickle, and allows you to deserialize without executing arbitrary
- Python code.
-
- The serialization format is gzipped msgpack, where the msgpack object has
- the following structure:
-
- {
- "attrs": List[uint64], # e.g. [TAG, HEAD, ENT_IOB, ENT_TYPE]
- "tokens": bytes, # Serialized numpy uint64 array with the token data
- "spaces": bytes, # Serialized numpy boolean array with spaces data
- "lengths": bytes, # Serialized numpy int32 array with the doc lengths
- "strings": List[unicode] # List of unique strings in the token data
- }
-
- Strings for the words, tags, labels etc are represented by 64-bit hashes in
- the token data, and every string that occurs at least once is passed via the
- strings object. This means the storage is more efficient if you pack more
- documents together, because you have less duplication in the strings.
-
- A notable downside to this format is that you can't easily extract just one
- document from the DocBin.
- """
+class DocBox(object):
+ """Serialize analyses from a collection of doc objects."""
def __init__(self, attrs=None, store_user_data=False):
- """Create a DocBin object to hold serialized annotations.
+ """Create a DocBox object, to hold serialized annotations.
attrs (list): List of attributes to serialize. 'orth' and 'spacy' are
always serialized, so they're not required. Defaults to None.
- store_user_data (bool): Whether to include the `Doc.user_data`.
- RETURNS (DocBin): The newly constructed object.
-
- DOCS: https://spacy.io/api/docbin#init
"""
attrs = attrs or []
- attrs = sorted(intify_attrs(attrs))
+ # Ensure ORTH is always attrs[0]
self.attrs = [attr for attr in attrs if attr != ORTH and attr != SPACY]
- self.attrs.insert(0, ORTH) # Ensure ORTH is always attrs[0]
+ self.attrs.insert(0, ORTH)
self.tokens = []
self.spaces = []
self.user_data = []
self.strings = set()
self.store_user_data = store_user_data
- def __len__(self):
- """RETURNS: The number of Doc objects added to the DocBin."""
- return len(self.tokens)
-
def add(self, doc):
- """Add a Doc's annotations to the DocBin for serialization.
-
- doc (Doc): The Doc object to add.
-
- DOCS: https://spacy.io/api/docbin#add
- """
+ """Add a doc's annotations to the DocBox for serialization."""
array = doc.to_array(self.attrs)
if len(array.shape) == 1:
array = array.reshape((array.shape[0], 1))
self.tokens.append(array)
spaces = doc.to_array(SPACY)
- assert array.shape[0] == spaces.shape[0] # this should never happen
+ assert array.shape[0] == spaces.shape[0]
spaces = spaces.reshape((spaces.shape[0], 1))
self.spaces.append(numpy.asarray(spaces, dtype=bool))
self.strings.update(w.text for w in doc)
@@ -86,13 +45,7 @@ class DocBin(object):
self.user_data.append(srsly.msgpack_dumps(doc.user_data))
def get_docs(self, vocab):
- """Recover Doc objects from the annotations, using the given vocab.
-
- vocab (Vocab): The shared vocab.
- YIELDS (Doc): The Doc objects.
-
- DOCS: https://spacy.io/api/docbin#get_docs
- """
+ """Recover Doc objects from the annotations, using the given vocab."""
for string in self.strings:
vocab[string]
orth_col = self.attrs.index(ORTH)
@@ -107,16 +60,8 @@ class DocBin(object):
yield doc
def merge(self, other):
- """Extend the annotations of this DocBin with the annotations from
- another. Will raise an error if the pre-defined attrs of the two
- DocBins don't match.
-
- other (DocBin): The DocBin to merge into the current bin.
-
- DOCS: https://spacy.io/api/docbin#merge
- """
- if self.attrs != other.attrs:
- raise ValueError(Errors.E166.format(current=self.attrs, other=other.attrs))
+ """Extend the annotations of this DocBox with the annotations from another."""
+ assert self.attrs == other.attrs
self.tokens.extend(other.tokens)
self.spaces.extend(other.spaces)
self.strings.update(other.strings)
@@ -124,14 +69,9 @@ class DocBin(object):
self.user_data.extend(other.user_data)
def to_bytes(self):
- """Serialize the DocBin's annotations to a bytestring.
-
- RETURNS (bytes): The serialized DocBin.
-
- DOCS: https://spacy.io/api/docbin#to_bytes
- """
+ """Serialize the DocBox's annotations into a byte string."""
for tokens in self.tokens:
- assert len(tokens.shape) == 2, tokens.shape # this should never happen
+ assert len(tokens.shape) == 2, tokens.shape
lengths = [len(tokens) for tokens in self.tokens]
msg = {
"attrs": self.attrs,
@@ -144,15 +84,9 @@ class DocBin(object):
msg["user_data"] = self.user_data
return gzip.compress(srsly.msgpack_dumps(msg))
- def from_bytes(self, bytes_data):
- """Deserialize the DocBin's annotations from a bytestring.
-
- bytes_data (bytes): The data to load from.
- RETURNS (DocBin): The loaded DocBin.
-
- DOCS: https://spacy.io/api/docbin#from_bytes
- """
- msg = srsly.msgpack_loads(gzip.decompress(bytes_data))
+ def from_bytes(self, string):
+ """Deserialize the DocBox's annotations from a byte string."""
+ msg = srsly.msgpack_loads(gzip.decompress(string))
self.attrs = msg["attrs"]
self.strings = set(msg["strings"])
lengths = numpy.fromstring(msg["lengths"], dtype="int32")
@@ -166,35 +100,35 @@ class DocBin(object):
if self.store_user_data and "user_data" in msg:
self.user_data = list(msg["user_data"])
for tokens in self.tokens:
- assert len(tokens.shape) == 2, tokens.shape # this should never happen
+ assert len(tokens.shape) == 2, tokens.shape
return self
-def merge_bins(bins):
+def merge_boxes(boxes):
merged = None
- for byte_string in bins:
+ for byte_string in boxes:
if byte_string is not None:
- doc_bin = DocBin(store_user_data=True).from_bytes(byte_string)
+ box = DocBox(store_user_data=True).from_bytes(byte_string)
if merged is None:
- merged = doc_bin
+ merged = box
else:
- merged.merge(doc_bin)
+ merged.merge(box)
if merged is not None:
return merged.to_bytes()
else:
return b""
-def pickle_bin(doc_bin):
- return (unpickle_bin, (doc_bin.to_bytes(),))
+def pickle_box(box):
+ return (unpickle_box, (box.to_bytes(),))
-def unpickle_bin(byte_string):
- return DocBin().from_bytes(byte_string)
+def unpickle_box(byte_string):
+ return DocBox().from_bytes(byte_string)
-copy_reg.pickle(DocBin, pickle_bin, unpickle_bin)
+copy_reg.pickle(DocBox, pickle_box, unpickle_box)
# Compatibility, as we had named it this previously.
-Binder = DocBin
+Binder = DocBox
-__all__ = ["DocBin"]
+__all__ = ["DocBox"]
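
Since the rewritten docstring above is much terser than the one it replaces, here is a sketch of the round trip the class supports. It imports `DocBox` from the private `spacy.tokens._serialize` module exactly as defined in this file; in trees where the class is still called `DocBin`, the name and import differ accordingly.

```python
# Sketch only: pack several Docs, serialize to gzipped msgpack bytes, and
# recover them with a shared vocab, as described by the docstring removed above.
import spacy
from spacy.attrs import TAG, ENT_IOB, ENT_TYPE
from spacy.tokens._serialize import DocBox  # DocBin in other versions

nlp = spacy.blank("en")
box = DocBox(attrs=[TAG, ENT_IOB, ENT_TYPE], store_user_data=True)
for text in ["Hello world", "This is a test"]:
    box.add(nlp(text))

data = box.to_bytes()                      # gzip-compressed msgpack payload
new_box = DocBox().from_bytes(data)
docs = list(new_box.get_docs(nlp.vocab))   # Doc objects rebuilt from the arrays
assert [doc.text for doc in docs] == ["Hello world", "This is a test"]
```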
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 80a808bae..e863b0807 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -256,7 +256,7 @@ cdef class Doc:
def is_nered(self):
"""Check if the document has named entities set. Will return True if
*any* of the tokens has a named entity tag set (even if the others are
- unknown values).
+        unknown values).
"""
if len(self) == 0:
return True
@@ -525,11 +525,13 @@ cdef class Doc:
def __set__(self, ents):
# TODO:
- # 1. Test basic data-driven ORTH gazetteer
- # 2. Test more nuanced date and currency regex
+ # 1. Allow negative matches
+ # 2. Ensure pre-set NERs are not over-written during statistical
+ # prediction
+ # 3. Test basic data-driven ORTH gazetteer
+ # 4. Test more nuanced date and currency regex
tokens_in_ents = {}
cdef attr_t entity_type
- cdef attr_t kb_id
cdef int ent_start, ent_end
for ent_info in ents:
entity_type, kb_id, ent_start, ent_end = get_entity_info(ent_info)
@@ -543,31 +545,27 @@ cdef class Doc:
tokens_in_ents[token_index] = (ent_start, ent_end, entity_type, kb_id)
cdef int i
for i in range(self.length):
- # default values
- entity_type = 0
- kb_id = 0
-
-                # Set ent_iob to Missing (0) by default unless this token was nered before
- ent_iob = 0
- if self.c[i].ent_iob != 0:
- ent_iob = 2
-
- # overwrite if the token was part of a specified entity
- if i in tokens_in_ents.keys():
- ent_start, ent_end, entity_type, kb_id = tokens_in_ents[i]
- if entity_type is None or entity_type <= 0:
- # Blocking this token from being overwritten by downstream NER
- ent_iob = 3
- elif ent_start == i:
- # Marking the start of an entity
- ent_iob = 3
- else:
- # Marking the inside of an entity
- ent_iob = 1
-
- self.c[i].ent_type = entity_type
- self.c[i].ent_kb_id = kb_id
- self.c[i].ent_iob = ent_iob
+ self.c[i].ent_type = 0
+ self.c[i].ent_kb_id = 0
+ self.c[i].ent_iob = 0 # Means missing.
+ cdef attr_t ent_type
+ cdef int start, end
+ for ent_info in ents:
+ ent_type, ent_kb_id, start, end = get_entity_info(ent_info)
+ if ent_type is None or ent_type < 0:
+ # Mark as O
+ for i in range(start, end):
+ self.c[i].ent_type = 0
+ self.c[i].ent_kb_id = 0
+ self.c[i].ent_iob = 2
+ else:
+ # Mark (inside) as I
+ for i in range(start, end):
+ self.c[i].ent_type = ent_type
+ self.c[i].ent_kb_id = ent_kb_id
+ self.c[i].ent_iob = 1
+ # Set start as B
+ self.c[start].ent_iob = 3
@property
def noun_chunks(self):
@@ -1091,37 +1089,6 @@ cdef class Doc:
data["_"][attr] = value
return data
- def to_utf8_array(self, int nr_char=-1):
- """Encode word strings to utf8, and export to a fixed-width array
- of characters. Characters are placed into the array in the order:
- 0, -1, 1, -2, etc
- For example, if the array is sliced array[:, :8], the array will
- contain the first 4 characters and last 4 characters of each word ---
- with the middle characters clipped out. The value 255 is used as a pad
- value.
- """
- byte_strings = [token.orth_.encode('utf8') for token in self]
- if nr_char == -1:
- nr_char = max(len(bs) for bs in byte_strings)
- cdef np.ndarray output = numpy.zeros((len(byte_strings), nr_char), dtype='uint8')
- output.fill(255)
- cdef int i, j, start_idx, end_idx
- cdef bytes byte_string
- cdef unsigned char utf8_char
- for i, byte_string in enumerate(byte_strings):
- j = 0
- start_idx = 0
- end_idx = len(byte_string) - 1
- while j < nr_char and start_idx <= end_idx:
- output[i, j] = byte_string[start_idx]
- start_idx += 1
- j += 1
- if j < nr_char and start_idx <= end_idx:
- output[i, j] = byte_string[end_idx]
- end_idx -= 1
- j += 1
- return output
-
cdef int token_by_start(const TokenC* tokens, int length, int start_char) except -2:
cdef int i
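
The rewritten `ents` setter above first clears the per-token entity state and then marks each provided span: B (3) on the first token, I (1) inside, O (2) over spans with a missing or negative type, leaving untouched tokens as missing (0). A sketch of the observable effect, assuming this patch's version of the setter:

```python
# Sketch only: observable effect of the ents setter rewritten above.
# ent_iob codes: 0 = missing (""), 1 = "I", 2 = "O", 3 = "B".
import spacy
from spacy.tokens import Span

nlp = spacy.blank("en")
doc = nlp("I like New York in Autumn")
doc.ents = [Span(doc, 2, 4, label="GPE")]  # "New York"

print([(t.text, t.ent_iob_, t.ent_type_) for t in doc])
# With this patch's setter, tokens outside the span keep iob 0 (shown as ""),
# "New" comes out as ("New", "B", "GPE") and "York" as ("York", "I", "GPE").
```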
diff --git a/spacy/tokens/morphanalysis.pxd b/spacy/tokens/morphanalysis.pxd
deleted file mode 100644
index 22844454a..000000000
--- a/spacy/tokens/morphanalysis.pxd
+++ /dev/null
@@ -1,9 +0,0 @@
-from ..vocab cimport Vocab
-from ..typedefs cimport hash_t
-from ..structs cimport MorphAnalysisC
-
-
-cdef class MorphAnalysis:
- cdef readonly Vocab vocab
- cdef hash_t key
- cdef MorphAnalysisC c
diff --git a/spacy/tokens/morphanalysis.pyx b/spacy/tokens/morphanalysis.pyx
deleted file mode 100644
index e09870741..000000000
--- a/spacy/tokens/morphanalysis.pyx
+++ /dev/null
@@ -1,423 +0,0 @@
-from libc.string cimport memset
-
-from ..vocab cimport Vocab
-from ..typedefs cimport hash_t, attr_t
-from ..morphology cimport list_features, check_feature, get_field, tag_to_json
-
-from ..strings import get_string_id
-
-
-cdef class MorphAnalysis:
- """Control access to morphological features for a token."""
- def __init__(self, Vocab vocab, features=tuple()):
- self.vocab = vocab
- self.key = self.vocab.morphology.add(features)
- analysis = self.vocab.morphology.tags.get(self.key)
- if analysis is not NULL:
- self.c = analysis[0]
- else:
- memset(&self.c, 0, sizeof(self.c))
-
- @classmethod
- def from_id(cls, Vocab vocab, hash_t key):
- """Create a morphological analysis from a given ID."""
- cdef MorphAnalysis morph = MorphAnalysis.__new__(MorphAnalysis, vocab)
- morph.vocab = vocab
- morph.key = key
- analysis = vocab.morphology.tags.get(key)
- if analysis is not NULL:
- morph.c = analysis[0]
- else:
- memset(&morph.c, 0, sizeof(morph.c))
- return morph
-
- def __contains__(self, feature):
- """Test whether the morphological analysis contains some feature."""
- cdef attr_t feat_id = get_string_id(feature)
- return check_feature(&self.c, feat_id)
-
- def __iter__(self):
- """Iterate over the features in the analysis."""
- cdef attr_t feature
- for feature in list_features(&self.c):
- yield self.vocab.strings[feature]
-
- def __len__(self):
- """The number of features in the analysis."""
- return self.c.length
-
- def __str__(self):
- return self.to_json()
-
- def __repr__(self):
- return self.to_json()
-
- def __hash__(self):
- return self.key
-
- def get(self, unicode field):
- """Retrieve a feature by field."""
- cdef int field_id = self.vocab.morphology._feat_map.attr2field[field]
- return self.vocab.strings[get_field(&self.c, field_id)]
-
- def to_json(self):
- """Produce a json serializable representation, which will be a list of
- strings.
- """
- return tag_to_json(&self.c)
-
- @property
- def is_base_form(self):
- raise NotImplementedError
-
- @property
- def pos(self):
- return self.c.pos
-
- @property
- def pos_(self):
- return self.vocab.strings[self.c.pos]
-
- property id:
- def __get__(self):
- return self.key
-
- property abbr:
- def __get__(self):
- return self.c.abbr
-
- property adp_type:
- def __get__(self):
- return self.c.adp_type
-
- property adv_type:
- def __get__(self):
- return self.c.adv_type
-
- property animacy:
- def __get__(self):
- return self.c.animacy
-
- property aspect:
- def __get__(self):
- return self.c.aspect
-
- property case:
- def __get__(self):
- return self.c.case
-
- property conj_type:
- def __get__(self):
- return self.c.conj_type
-
- property connegative:
- def __get__(self):
- return self.c.connegative
-
- property definite:
- def __get__(self):
- return self.c.definite
-
- property degree:
- def __get__(self):
- return self.c.degree
-
- property derivation:
- def __get__(self):
- return self.c.derivation
-
- property echo:
- def __get__(self):
- return self.c.echo
-
- property foreign:
- def __get__(self):
- return self.c.foreign
-
- property gender:
- def __get__(self):
- return self.c.gender
-
- property hyph:
- def __get__(self):
- return self.c.hyph
-
- property inf_form:
- def __get__(self):
- return self.c.inf_form
-
- property mood:
- def __get__(self):
- return self.c.mood
-
- property name_type:
- def __get__(self):
- return self.c.name_type
-
- property negative:
- def __get__(self):
- return self.c.negative
-
- property noun_type:
- def __get__(self):
- return self.c.noun_type
-
- property number:
- def __get__(self):
- return self.c.number
-
- property num_form:
- def __get__(self):
- return self.c.num_form
-
- property num_type:
- def __get__(self):
- return self.c.num_type
-
- property num_value:
- def __get__(self):
- return self.c.num_value
-
- property part_form:
- def __get__(self):
- return self.c.part_form
-
- property part_type:
- def __get__(self):
- return self.c.part_type
-
- property person:
- def __get__(self):
- return self.c.person
-
- property polite:
- def __get__(self):
- return self.c.polite
-
- property polarity:
- def __get__(self):
- return self.c.polarity
-
- property poss:
- def __get__(self):
- return self.c.poss
-
- property prefix:
- def __get__(self):
- return self.c.prefix
-
- property prep_case:
- def __get__(self):
- return self.c.prep_case
-
- property pron_type:
- def __get__(self):
- return self.c.pron_type
-
- property punct_side:
- def __get__(self):
- return self.c.punct_side
-
- property punct_type:
- def __get__(self):
- return self.c.punct_type
-
- property reflex:
- def __get__(self):
- return self.c.reflex
-
- property style:
- def __get__(self):
- return self.c.style
-
- property style_variant:
- def __get__(self):
- return self.c.style_variant
-
- property tense:
- def __get__(self):
- return self.c.tense
-
- property typo:
- def __get__(self):
- return self.c.typo
-
- property verb_form:
- def __get__(self):
- return self.c.verb_form
-
- property voice:
- def __get__(self):
- return self.c.voice
-
- property verb_type:
- def __get__(self):
- return self.c.verb_type
-
- property abbr_:
- def __get__(self):
- return self.vocab.strings[self.c.abbr]
-
- property adp_type_:
- def __get__(self):
- return self.vocab.strings[self.c.adp_type]
-
- property adv_type_:
- def __get__(self):
- return self.vocab.strings[self.c.adv_type]
-
- property animacy_:
- def __get__(self):
- return self.vocab.strings[self.c.animacy]
-
- property aspect_:
- def __get__(self):
- return self.vocab.strings[self.c.aspect]
-
- property case_:
- def __get__(self):
- return self.vocab.strings[self.c.case]
-
- property conj_type_:
- def __get__(self):
- return self.vocab.strings[self.c.conj_type]
-
- property connegative_:
- def __get__(self):
- return self.vocab.strings[self.c.connegative]
-
- property definite_:
- def __get__(self):
- return self.vocab.strings[self.c.definite]
-
- property degree_:
- def __get__(self):
- return self.vocab.strings[self.c.degree]
-
- property derivation_:
- def __get__(self):
- return self.vocab.strings[self.c.derivation]
-
- property echo_:
- def __get__(self):
- return self.vocab.strings[self.c.echo]
-
- property foreign_:
- def __get__(self):
- return self.vocab.strings[self.c.foreign]
-
- property gender_:
- def __get__(self):
- return self.vocab.strings[self.c.gender]
-
- property hyph_:
- def __get__(self):
- return self.vocab.strings[self.c.hyph]
-
- property inf_form_:
- def __get__(self):
- return self.vocab.strings[self.c.inf_form]
-
- property name_type_:
- def __get__(self):
- return self.vocab.strings[self.c.name_type]
-
- property negative_:
- def __get__(self):
- return self.vocab.strings[self.c.negative]
-
- property mood_:
- def __get__(self):
- return self.vocab.strings[self.c.mood]
-
- property number_:
- def __get__(self):
- return self.vocab.strings[self.c.number]
-
- property num_form_:
- def __get__(self):
- return self.vocab.strings[self.c.num_form]
-
- property num_type_:
- def __get__(self):
- return self.vocab.strings[self.c.num_type]
-
- property num_value_:
- def __get__(self):
- return self.vocab.strings[self.c.num_value]
-
- property part_form_:
- def __get__(self):
- return self.vocab.strings[self.c.part_form]
-
- property part_type_:
- def __get__(self):
- return self.vocab.strings[self.c.part_type]
-
- property person_:
- def __get__(self):
- return self.vocab.strings[self.c.person]
-
- property polite_:
- def __get__(self):
- return self.vocab.strings[self.c.polite]
-
- property polarity_:
- def __get__(self):
- return self.vocab.strings[self.c.polarity]
-
- property poss_:
- def __get__(self):
- return self.vocab.strings[self.c.poss]
-
- property prefix_:
- def __get__(self):
- return self.vocab.strings[self.c.prefix]
-
- property prep_case_:
- def __get__(self):
- return self.vocab.strings[self.c.prep_case]
-
- property pron_type_:
- def __get__(self):
- return self.vocab.strings[self.c.pron_type]
-
- property punct_side_:
- def __get__(self):
- return self.vocab.strings[self.c.punct_side]
-
- property punct_type_:
- def __get__(self):
- return self.vocab.strings[self.c.punct_type]
-
- property reflex_:
- def __get__(self):
- return self.vocab.strings[self.c.reflex]
-
- property style_:
- def __get__(self):
- return self.vocab.strings[self.c.style]
-
- property style_variant_:
- def __get__(self):
- return self.vocab.strings[self.c.style_variant]
-
- property tense_:
- def __get__(self):
- return self.vocab.strings[self.c.tense]
-
- property typo_:
- def __get__(self):
- return self.vocab.strings[self.c.typo]
-
- property verb_form_:
- def __get__(self):
- return self.vocab.strings[self.c.verb_form]
-
- property voice_:
- def __get__(self):
- return self.vocab.strings[self.c.voice]
-
- property verb_type_:
- def __get__(self):
- return self.vocab.strings[self.c.verb_type]
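
The file deleted above defined the whole `MorphAnalysis` surface (construction from a feature set, `__contains__`, `__iter__`, `__len__`, `get`, `to_json`, plus the per-field properties). Here is a sketch of how it was reached before this patch, via the `Token.morph` property that the token.pyx hunk below also removes. With a blank pipeline no features are set, so the outputs are empty, but the calls run against the pre-patch tree.

```python
# Sketch only: exercising the MorphAnalysis API deleted above. This targets
# the pre-patch tree; after this patch neither the class nor Token.morph exists.
from spacy.lang.en import English

nlp = English()
doc = nlp("She was reading")
morph = doc[2].morph             # MorphAnalysis for "reading"
print(len(morph))                # number of morphological features set
print(list(morph))               # feature names, via __iter__
print(morph.to_json())           # JSON-serializable list of feature strings
print("VerbForm_part" in morph)  # __contains__ tests a single feature
```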
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index 8b15a4223..07c6f1c99 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -26,7 +26,6 @@ from .. import util
from ..compat import is_config
from ..errors import Errors, Warnings, user_warning, models_warning
from .underscore import Underscore, get_ext_args
-from .morphanalysis cimport MorphAnalysis
cdef class Token:
@@ -219,10 +218,6 @@ cdef class Token:
xp = get_array_module(vector)
return (xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm))
- @property
- def morph(self):
- return MorphAnalysis.from_id(self.vocab, self.c.morph)
-
@property
def lex_id(self):
"""RETURNS (int): Sequential ID of the token's lexical type."""
@@ -335,7 +330,7 @@ cdef class Token:
"""
def __get__(self):
if self.c.lemma == 0:
- lemma_ = self.vocab.morphology.lemmatizer.lookup(self.orth_, orth=self.orth)
+ lemma_ = self.vocab.morphology.lemmatizer.lookup(self.orth_)
return self.vocab.strings[lemma_]
else:
return self.c.lemma
@@ -754,8 +749,7 @@ cdef class Token:
def ent_iob_(self):
"""IOB code of named entity tag. "B" means the token begins an entity,
"I" means it is inside an entity, "O" means it is outside an entity,
- and "" means no entity tag is set. "B" with an empty ent_type
- means that the token is blocked from further processing by NER.
+ and "" means no entity tag is set.
RETURNS (unicode): IOB code of named entity tag.
"""
@@ -863,7 +857,7 @@ cdef class Token:
"""
def __get__(self):
if self.c.lemma == 0:
- return self.vocab.morphology.lemmatizer.lookup(self.orth_, orth=self.orth)
+ return self.vocab.morphology.lemmatizer.lookup(self.orth_)
else:
return self.vocab.strings[self.c.lemma]
diff --git a/spacy/util.py b/spacy/util.py
index dbe965392..e88d66452 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -136,7 +136,7 @@ def load_language_data(path):
def get_module_path(module):
if not hasattr(module, "__module__"):
- raise ValueError(Errors.E169.format(module=repr(module)))
+ raise ValueError("Can't find module {}".format(repr(module)))
return Path(sys.modules[module.__module__].__file__).parent
diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx
index 3c238fe2d..2cb5b077f 100644
--- a/spacy/vectors.pyx
+++ b/spacy/vectors.pyx
@@ -63,7 +63,7 @@ cdef class Vectors:
shape (tuple): Size of the table, as (# entries, # columns)
data (numpy.ndarray): The vector data.
keys (iterable): A sequence of keys, aligned with the data.
- name (unicode): A name to identify the vectors table.
+ name (string): A name to identify the vectors table.
RETURNS (Vectors): The newly created object.
DOCS: https://spacy.io/api/vectors#init
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 62c1791b9..7e360d409 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -18,10 +18,10 @@ from .structs cimport SerializedLexemeC
from .compat import copy_reg, basestring_
from .errors import Errors
from .lemmatizer import Lemmatizer
+from .lookups import Lookups
from .attrs import intify_attrs, NORM
from .vectors import Vectors
from ._ml import link_vectors_to_models
-from .lookups import Lookups
from . import util
@@ -33,8 +33,7 @@ cdef class Vocab:
DOCS: https://spacy.io/api/vocab
"""
def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None,
- strings=tuple(), lookups=None, oov_prob=-20., vectors_name=None,
- **deprecated_kwargs):
+ strings=tuple(), lookups=None, oov_prob=-20., **deprecated_kwargs):
"""Create the vocabulary.
lex_attr_getters (dict): A dictionary mapping attribute IDs to
@@ -45,7 +44,6 @@ cdef class Vocab:
strings (StringStore): StringStore that maps strings to integers, and
vice versa.
lookups (Lookups): Container for large lookup tables and dictionaries.
- name (unicode): Optional name to identify the vectors table.
RETURNS (Vocab): The newly constructed object.
"""
lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {}
@@ -64,7 +62,7 @@ cdef class Vocab:
_ = self[string]
self.lex_attr_getters = lex_attr_getters
self.morphology = Morphology(self.strings, tag_map, lemmatizer)
- self.vectors = Vectors(name=vectors_name)
+ self.vectors = Vectors()
self.lookups = lookups
@property
@@ -320,7 +318,7 @@ cdef class Vocab:
keys = xp.asarray([key for (prob, i, key) in priority], dtype="uint64")
keep = xp.ascontiguousarray(self.vectors.data[indices[:nr_row]])
toss = xp.ascontiguousarray(self.vectors.data[indices[nr_row:]])
- self.vectors = Vectors(data=keep, keys=keys, name=self.vectors.name)
+ self.vectors = Vectors(data=keep, keys=keys)
syn_keys, syn_rows, scores = self.vectors.most_similar(toss, batch_size=batch_size)
remap = {}
for i, key in enumerate(keys[nr_row:]):
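
The vocab.pyx hunks above drop the `vectors_name` argument and stop passing `name=` when the vectors table is rebuilt in `prune_vectors`. A sketch of the two construction styles follows; whether `Vocab` accepts `vectors_name` depends on which side of this patch the installed version sits.

```python
# Sketch only: Vocab/Vectors construction with and without a vectors name.
from spacy.vocab import Vocab
from spacy.vectors import Vectors

# After this patch: the vectors table is created without an explicit name.
vocab = Vocab()
assert isinstance(vocab.vectors, Vectors)

# Before this patch (and in later releases), a name can be attached so that
# vector tables can be told apart, e.g. in a model's meta.json:
# vocab = Vocab(vectors_name="en_core_web_md.vectors")
# assert vocab.vectors.name == "en_core_web_md.vectors"
```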
diff --git a/website/README.md b/website/README.md
index a02d5a151..be817225d 100644
--- a/website/README.md
+++ b/website/README.md
@@ -309,7 +309,7 @@ indented block as plain text and preserve whitespace.
### Using spaCy
import spacy
nlp = spacy.load("en_core_web_sm")
-doc = nlp("This is a sentence.")
+doc = nlp(u"This is a sentence.")
for token in doc:
print(token.text, token.pos_)
```
@@ -335,9 +335,9 @@ from spacy.matcher import Matcher
nlp = spacy.load('en_core_web_sm')
matcher = Matcher(nlp.vocab)
-pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}]
-matcher.add("HelloWorld", None, pattern)
-doc = nlp("Hello, world! Hello world!")
+pattern = [{'LOWER': 'hello'}, {'IS_PUNCT': True}, {'LOWER': 'world'}]
+matcher.add('HelloWorld', None, pattern)
+doc = nlp(u'Hello, world! Hello world!')
matches = matcher(doc)
```
@@ -360,7 +360,7 @@ interactive widget defaults to a regular code block.
### {executable="true"}
import spacy
nlp = spacy.load("en_core_web_sm")
-doc = nlp("This is a sentence.")
+doc = nlp(u"This is a sentence.")
for token in doc:
print(token.text, token.pos_)
```
@@ -457,8 +457,7 @@ sit amet dignissim justo congue.
## Setup and installation {#setup}
Before running the setup, make sure your versions of
-[Node](https://nodejs.org/en/) and [npm](https://www.npmjs.com/) are up to date.
-Node v10.15 or later is required.
+[Node](https://nodejs.org/en/) and [npm](https://www.npmjs.com/) are up to date. Node v10.15 or later is required.
```bash
# Clone the repository
diff --git a/website/docs/api/annotation.md b/website/docs/api/annotation.md
index fb8b67c1e..7f7b46260 100644
--- a/website/docs/api/annotation.md
+++ b/website/docs/api/annotation.md
@@ -16,7 +16,7 @@ menu:
> ```python
> from spacy.lang.en import English
> nlp = English()
-> tokens = nlp("Some\\nspaces and\\ttab characters")
+> tokens = nlp(u"Some\\nspaces and\\ttab characters")
> tokens_text = [t.text for t in tokens]
> assert tokens_text == ["Some", "\\n", "spaces", " ", "and", "\\t", "tab", "characters"]
> ```
@@ -80,8 +80,8 @@ training corpus and can be defined in the respective language data's
-spaCy maps all language-specific part-of-speech tags to a small, fixed set of
-word type tags following the
+spaCy also maps all language-specific part-of-speech tags to a small, fixed set
+of word type tags following the
[Universal Dependencies scheme](http://universaldependencies.org/u/pos/). The
universal tags don't code for any morphological features and only cover the word
type. They're available as the [`Token.pos`](/api/token#attributes) and
@@ -552,10 +552,6 @@ spaCy's JSON format, you can use the
"last": int, # index of last token
"label": string # phrase label
}]
- }],
- "cats": [{ # new in v2.2: categories for text classifier
- "label": string, # text category label
- "value": float / bool # label applies (1.0/true) or not (0.0/false)
}]
}]
}]
diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md
index 7b20b76de..c5e77dc0d 100644
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@@ -8,7 +8,6 @@ menu:
- ['Info', 'info']
- ['Validate', 'validate']
- ['Convert', 'convert']
- - ['Debug data', 'debug-data']
- ['Train', 'train']
- ['Pretrain', 'pretrain']
- ['Init Model', 'init-model']
@@ -23,11 +22,11 @@ type `spacy --help`.
## Download {#download}
Download [models](/usage/models) for spaCy. The downloader finds the
-best-matching compatible version, uses `pip install` to download the model as a
-package and creates a [shortcut link](/usage/models#usage) if the model was
-downloaded via a shortcut. Direct downloads don't perform any compatibility
-checks and require the model name to be specified with its version (e.g.
-`en_core_web_sm-2.2.0`).
+best-matching compatible version, uses pip to download the model as a package
+and automatically creates a [shortcut link](/usage/models#usage) to load the
+model by name. Direct downloads don't perform any compatibility checks and
+require the model name to be specified with its version (e.g.
+`en_core_web_sm-2.0.0`).
> #### Downloading best practices
>
@@ -40,16 +39,16 @@ checks and require the model name to be specified with its version (e.g.
> also allow you to add it as a versioned package dependency to your project.
```bash
-$ python -m spacy download [model] [--direct] [pip args]
+$ python -m spacy download [model] [--direct]
```
-| Argument | Type | Description |
-| ------------------------------------- | ------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `model` | positional | Model name or shortcut (`en`, `de`, `en_core_web_sm`). |
-| `--direct`, `-d` | flag | Force direct download of exact model version. |
-| pip args 2.1 | - | Additional installation options to be passed to `pip install` when installing the model package. For example, `--user` to install to the user home directory or `--no-deps` to not install model dependencies. |
-| `--help`, `-h` | flag | Show help message and available arguments. |
-| **CREATES** | directory, symlink | The installed model package in your `site-packages` directory and a shortcut link as a symlink in `spacy/data` if installed via shortcut. |
+| Argument | Type | Description |
+| ---------------------------------- | ------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `model` | positional | Model name or shortcut (`en`, `de`, `en_core_web_sm`). |
+| `--direct`, `-d` | flag | Force direct download of exact model version. |
+| other 2.1 | - | Additional installation options to be passed to `pip install` when installing the model package. For example, `--user` to install to the user home directory. |
+| `--help`, `-h` | flag | Show help message and available arguments. |
+| **CREATES** | directory, symlink | The installed model package in your `site-packages` directory and a shortcut link as a symlink in `spacy/data`. |
## Link {#link}
@@ -182,166 +181,6 @@ All output files generated by this command are compatible with
| `ner` | NER with IOB/IOB2 tags, one token per line with columns separated by whitespace. The first column is the token and the final column is the IOB tag. Sentences are separated by blank lines and documents are separated by the line `-DOCSTART- -X- O O`. Supports CoNLL 2003 NER format. See [sample data](https://github.com/explosion/spaCy/tree/master/examples/training/ner_example_data). |
| `iob` | NER with IOB/IOB2 tags, one sentence per line with tokens separated by whitespace and annotation separated by `|`, either `word|B-ENT` or `word|POS|B-ENT`. See [sample data](https://github.com/explosion/spaCy/tree/master/examples/training/ner_example_data). |
-## Debug data {#debug-data new="2.2"}
-
-Analyze, debug and validate your training and development data, get useful
-stats, and find problems like invalid entity annotations, cyclic dependencies,
-low data labels and more.
-
-```bash
-$ python -m spacy debug-data [lang] [train_path] [dev_path] [--base-model] [--pipeline] [--ignore-warnings] [--verbose] [--no-format]
-```
-
-| Argument | Type | Description |
-| -------------------------- | ---------- | -------------------------------------------------------------------------------------------------- |
-| `lang` | positional | Model language. |
-| `train_path` | positional | Location of JSON-formatted training data. Can be a file or a directory of files. |
-| `dev_path` | positional | Location of JSON-formatted development data for evaluation. Can be a file or a directory of files. |
-| `--base-model`, `-b` | option | Optional name of base model to update. Can be any loadable spaCy model. |
-| `--pipeline`, `-p` | option | Comma-separated names of pipeline components to train. Defaults to `'tagger,parser,ner'`. |
-| `--ignore-warnings`, `-IW` | flag | Ignore warnings, only show stats and errors. |
-| `--verbose`, `-V` | flag | Print additional information and explanations. |
-| `--no-format`, `-NF`      | flag       | Don't pretty-print the results. Use this if you want to write to a file.                            |
-
-
-
-```
-=========================== Data format validation ===========================
-✔ Corpus is loadable
-
-=============================== Training stats ===============================
-Training pipeline: tagger, parser, ner
-Starting with blank model 'en'
-18127 training docs
-2939 evaluation docs
-⚠ 34 training examples also in evaluation data
-
-============================== Vocab & Vectors ==============================
-ℹ 2083156 total words in the data (56962 unique)
-⚠ 13020 misaligned tokens in the training data
-⚠ 2423 misaligned tokens in the dev data
-10 most common words: 'the' (98429), ',' (91756), '.' (87073), 'to' (50058),
-'of' (49559), 'and' (44416), 'a' (34010), 'in' (31424), 'that' (22792), 'is'
-(18952)
-ℹ No word vectors present in the model
-
-========================== Named Entity Recognition ==========================
-ℹ 18 new labels, 0 existing labels
-528978 missing values (tokens with '-' label)
-New: 'ORG' (23860), 'PERSON' (21395), 'GPE' (21193), 'DATE' (18080), 'CARDINAL'
-(10490), 'NORP' (9033), 'MONEY' (5164), 'PERCENT' (3761), 'ORDINAL' (2122),
-'LOC' (2113), 'TIME' (1616), 'WORK_OF_ART' (1229), 'QUANTITY' (1150), 'FAC'
-(1134), 'EVENT' (974), 'PRODUCT' (935), 'LAW' (444), 'LANGUAGE' (338)
-✔ Good amount of examples for all labels
-✔ Examples without occurences available for all labels
-✔ No entities consisting of or starting/ending with whitespace
-
-=========================== Part-of-speech Tagging ===========================
-ℹ 49 labels in data (57 labels in tag map)
-'NN' (266331), 'IN' (227365), 'DT' (185600), 'NNP' (164404), 'JJ' (119830),
-'NNS' (110957), '.' (101482), ',' (92476), 'RB' (90090), 'PRP' (90081), 'VB'
-(74538), 'VBD' (68199), 'CC' (62862), 'VBZ' (50712), 'VBP' (43420), 'VBN'
-(42193), 'CD' (40326), 'VBG' (34764), 'TO' (31085), 'MD' (25863), 'PRP$'
-(23335), 'HYPH' (13833), 'POS' (13427), 'UH' (13322), 'WP' (10423), 'WDT'
-(9850), 'RP' (8230), 'WRB' (8201), ':' (8168), '''' (7392), '``' (6984), 'NNPS'
-(5817), 'JJR' (5689), '$' (3710), 'EX' (3465), 'JJS' (3118), 'RBR' (2872),
-'-RRB-' (2825), '-LRB-' (2788), 'PDT' (2078), 'XX' (1316), 'RBS' (1142), 'FW'
-(794), 'NFP' (557), 'SYM' (440), 'WP$' (294), 'LS' (293), 'ADD' (191), 'AFX'
-(24)
-✔ All labels present in tag map for language 'en'
-
-============================= Dependency Parsing =============================
-ℹ Found 111703 sentences with an average length of 18.6 words.
-ℹ Found 2251 nonprojective train sentences
-ℹ Found 303 nonprojective dev sentences
-ℹ 47 labels in train data
-ℹ 211 labels in projectivized train data
-'punct' (236796), 'prep' (188853), 'pobj' (182533), 'det' (172674), 'nsubj'
-(169481), 'compound' (116142), 'ROOT' (111697), 'amod' (107945), 'dobj' (93540),
-'aux' (86802), 'advmod' (86197), 'cc' (62679), 'conj' (59575), 'poss' (36449),
-'ccomp' (36343), 'advcl' (29017), 'mark' (27990), 'nummod' (24582), 'relcl'
-(21359), 'xcomp' (21081), 'attr' (18347), 'npadvmod' (17740), 'acomp' (17204),
-'auxpass' (15639), 'appos' (15368), 'neg' (15266), 'nsubjpass' (13922), 'case'
-(13408), 'acl' (12574), 'pcomp' (10340), 'nmod' (9736), 'intj' (9285), 'prt'
-(8196), 'quantmod' (7403), 'dep' (4300), 'dative' (4091), 'agent' (3908), 'expl'
-(3456), 'parataxis' (3099), 'oprd' (2326), 'predet' (1946), 'csubj' (1494),
-'subtok' (1147), 'preconj' (692), 'meta' (469), 'csubjpass' (64), 'iobj' (1)
-⚠ Low number of examples for label 'iobj' (1)
-⚠ Low number of examples for 130 labels in the projectivized dependency
-trees used for training. You may want to projectivize labels such as punct
-before training in order to improve parser performance.
-⚠ Projectivized labels with low numbers of examples: appos||attr: 12
-advmod||dobj: 13 prep||ccomp: 12 nsubjpass||ccomp: 15 pcomp||prep: 14
-amod||dobj: 9 attr||xcomp: 14 nmod||nsubj: 17 prep||advcl: 2 prep||prep: 5
-nsubj||conj: 12 advcl||advmod: 18 ccomp||advmod: 11 ccomp||pcomp: 5 acl||pobj:
-10 npadvmod||acomp: 7 dobj||pcomp: 14 nsubjpass||pcomp: 1 nmod||pobj: 8
-amod||attr: 6 nmod||dobj: 12 aux||conj: 1 neg||conj: 1 dative||xcomp: 11
-pobj||dative: 3 xcomp||acomp: 19 advcl||pobj: 2 nsubj||advcl: 2 csubj||ccomp: 1
-advcl||acl: 1 relcl||nmod: 2 dobj||advcl: 10 advmod||advcl: 3 nmod||nsubjpass: 6
-amod||pobj: 5 cc||neg: 1 attr||ccomp: 16 advcl||xcomp: 3 nmod||attr: 4
-advcl||nsubjpass: 5 advcl||ccomp: 4 ccomp||conj: 1 punct||acl: 1 meta||acl: 1
-parataxis||acl: 1 prep||acl: 1 amod||nsubj: 7 ccomp||ccomp: 3 acomp||xcomp: 5
-dobj||acl: 5 prep||oprd: 6 advmod||acl: 2 dative||advcl: 1 pobj||agent: 5
-xcomp||amod: 1 dep||advcl: 1 prep||amod: 8 relcl||compound: 1 advcl||csubj: 3
-npadvmod||conj: 2 npadvmod||xcomp: 4 advmod||nsubj: 3 ccomp||amod: 7
-advcl||conj: 1 nmod||conj: 2 advmod||nsubjpass: 2 dep||xcomp: 2 appos||ccomp: 1
-advmod||dep: 1 advmod||advmod: 5 aux||xcomp: 8 dep||advmod: 1 dative||ccomp: 2
-prep||dep: 1 conj||conj: 1 dep||ccomp: 4 cc||ROOT: 1 prep||ROOT: 1 nsubj||pcomp:
-3 advmod||prep: 2 relcl||dative: 1 acl||conj: 1 advcl||attr: 4 prep||npadvmod: 1
-nsubjpass||xcomp: 1 neg||advmod: 1 xcomp||oprd: 1 advcl||advcl: 1 dobj||dep: 3
-nsubjpass||parataxis: 1 attr||pcomp: 1 ccomp||parataxis: 1 advmod||attr: 1
-nmod||oprd: 1 appos||nmod: 2 advmod||relcl: 1 appos||npadvmod: 1 appos||conj: 1
-prep||expl: 1 nsubjpass||conj: 1 punct||pobj: 1 cc||pobj: 1 conj||pobj: 1
-punct||conj: 1 ccomp||dep: 1 oprd||xcomp: 3 ccomp||xcomp: 1 ccomp||nsubj: 1
-nmod||dep: 1 xcomp||ccomp: 1 acomp||advcl: 1 intj||advmod: 1 advmod||acomp: 2
-relcl||oprd: 1 advmod||prt: 1 advmod||pobj: 1 appos||nummod: 1 relcl||npadvmod:
-3 mark||advcl: 1 aux||ccomp: 1 amod||nsubjpass: 1 npadvmod||advmod: 1 conj||dep:
-1 nummod||pobj: 1 amod||npadvmod: 1 intj||pobj: 1 nummod||npadvmod: 1
-xcomp||xcomp: 1 aux||dep: 1 advcl||relcl: 1
-⚠ The following labels were found only in the train data: xcomp||amod,
-advcl||relcl, prep||nsubjpass, acl||nsubj, nsubjpass||conj, xcomp||oprd,
-advmod||conj, advmod||advmod, iobj, advmod||nsubjpass, dobj||conj, ccomp||amod,
-meta||acl, xcomp||xcomp, prep||attr, prep||ccomp, advcl||acomp, acl||dobj,
-advcl||advcl, pobj||agent, prep||advcl, nsubjpass||xcomp, prep||dep,
-acomp||xcomp, aux||ccomp, ccomp||dep, conj||dep, relcl||compound,
-nsubjpass||ccomp, nmod||dobj, advmod||advcl, advmod||acl, dobj||advcl,
-dative||xcomp, prep||nsubj, ccomp||ccomp, nsubj||ccomp, xcomp||acomp,
-prep||acomp, dep||advmod, acl||pobj, appos||dobj, npadvmod||acomp, cc||ROOT,
-relcl||nsubj, nmod||pobj, acl||nsubjpass, ccomp||advmod, pcomp||prep,
-amod||dobj, advmod||attr, advcl||csubj, appos||attr, dobj||pcomp, prep||ROOT,
-relcl||pobj, advmod||pobj, amod||nsubj, ccomp||xcomp, prep||oprd,
-npadvmod||advmod, appos||nummod, advcl||pobj, neg||advmod, acl||attr,
-appos||nsubjpass, csubj||ccomp, amod||nsubjpass, intj||pobj, dep||advcl,
-cc||neg, xcomp||ccomp, dative||ccomp, nmod||oprd, pobj||dative, prep||dobj,
-dep||ccomp, relcl||attr, ccomp||nsubj, advcl||xcomp, nmod||dep, advcl||advmod,
-ccomp||conj, pobj||prep, advmod||acomp, advmod||relcl, attr||pcomp,
-ccomp||parataxis, oprd||xcomp, intj||advmod, nmod||nsubjpass, prep||npadvmod,
-parataxis||acl, prep||pobj, advcl||dobj, amod||pobj, prep||acl, conj||pobj,
-advmod||dep, punct||pobj, ccomp||acomp, acomp||advcl, nummod||npadvmod,
-dobj||dep, npadvmod||xcomp, advcl||conj, relcl||npadvmod, punct||acl,
-relcl||dobj, dobj||xcomp, nsubjpass||parataxis, dative||advcl, relcl||nmod,
-advcl||ccomp, appos||npadvmod, ccomp||pcomp, prep||amod, mark||advcl,
-prep||advmod, prep||xcomp, appos||nsubj, attr||ccomp, advmod||prt, dobj||ccomp,
-aux||conj, advcl||nsubj, conj||conj, advmod||ccomp, advcl||nsubjpass,
-attr||xcomp, nmod||conj, npadvmod||conj, relcl||dative, prep||expl,
-nsubjpass||pcomp, advmod||xcomp, advmod||dobj, appos||pobj, nsubj||conj,
-relcl||nsubjpass, advcl||attr, appos||ccomp, advmod||prep, prep||conj,
-nmod||attr, punct||conj, neg||conj, dep||xcomp, aux||xcomp, dobj||acl,
-nummod||pobj, amod||npadvmod, nsubj||pcomp, advcl||acl, appos||nmod,
-relcl||oprd, prep||prep, cc||pobj, nmod||nsubj, amod||attr, aux||dep,
-appos||conj, advmod||nsubj, nsubj||advcl, acl||conj
-To train a parser, your data should include at least 20 instances of each label.
-⚠ Multiple root labels (ROOT, nsubj, aux, npadvmod, prep) found in
-training data. spaCy's parser uses a single root label ROOT so this distinction
-will not be available.
-
-================================== Summary ==================================
-✔ 5 checks passed
-⚠ 8 warnings
-```
-
-
-
## Train {#train}
Train a model. Expects data in spaCy's
@@ -361,41 +200,36 @@ will only train the tagger and parser.
```bash
$ python -m spacy train [lang] [output_path] [train_path] [dev_path]
-[--base-model] [--pipeline] [--vectors] [--n-iter] [--n-early-stopping]
-[--n-examples] [--use-gpu] [--version] [--meta-path] [--init-tok2vec]
-[--parser-multitasks] [--entity-multitasks] [--gold-preproc] [--noise-level]
-[--orth-variant-level] [--learn-tokens] [--textcat-arch] [--textcat-multilabel]
-[--textcat-positive-label] [--verbose]
+[--base-model] [--pipeline] [--vectors] [--n-iter] [--n-early-stopping] [--n-examples] [--use-gpu]
+[--version] [--meta-path] [--init-tok2vec] [--parser-multitasks]
+[--entity-multitasks] [--gold-preproc] [--noise-level] [--learn-tokens]
+[--verbose]
```
-| Argument | Type | Description |
-| --------------------------------------------------------------- | ------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `lang` | positional | Model language. |
-| `output_path` | positional | Directory to store model in. Will be created if it doesn't exist. |
-| `train_path` | positional | Location of JSON-formatted training data. Can be a file or a directory of files. |
-| `dev_path` | positional | Location of JSON-formatted development data for evaluation. Can be a file or a directory of files. |
-| `--base-model`, `-b` 2.1 | option | Optional name of base model to update. Can be any loadable spaCy model. |
-| `--pipeline`, `-p` 2.1 | option | Comma-separated names of pipeline components to train. Defaults to `'tagger,parser,ner'`. |
-| `--vectors`, `-v` | option | Model to load vectors from. |
-| `--n-iter`, `-n` | option | Number of iterations (default: `30`). |
-| `--n-early-stopping`, `-ne` | option | Maximum number of training epochs without dev accuracy improvement. |
-| `--n-examples`, `-ns` | option | Number of examples to use (defaults to `0` for all examples). |
-| `--use-gpu`, `-g` | option | Whether to use GPU. Can be either `0`, `1` or `-1`. |
-| `--version`, `-V` | option | Model version. Will be written out to the model's `meta.json` after training. |
-| `--meta-path`, `-m` 2 | option | Optional path to model [`meta.json`](/usage/training#models-generating). All relevant properties like `lang`, `pipeline` and `spacy_version` will be overwritten. |
-| `--init-tok2vec`, `-t2v` 2.1 | option | Path to pretrained weights for the token-to-vector parts of the models. See `spacy pretrain`. Experimental. |
-| `--parser-multitasks`, `-pt` | option | Side objectives for parser CNN, e.g. `'dep'` or `'dep,tag'` |
-| `--entity-multitasks`, `-et` | option | Side objectives for NER CNN, e.g. `'dep'` or `'dep,tag'` |
-| `--noise-level`, `-nl` | option | Float indicating the amount of corruption for data augmentation. |
-| `--orth-variant-level`, `-ovl` 2.2 | option | Float indicating the orthography variation for data augmentation (e.g. `0.3` for making 30% of occurrences of some tokens subject to replacement). |
-| `--gold-preproc`, `-G` | flag | Use gold preprocessing. |
-| `--learn-tokens`, `-T`                                           | flag          | Make parser learn gold-standard tokenization by merging subtokens. Typically used for languages like Chinese.                                      |
-| `--textcat-multilabel`, `-TML` 2.2 | flag | Text classification classes aren't mutually exclusive (multilabel). |
-| `--textcat-arch`, `-ta` 2.2 | option | Text classification model architecture. Defaults to `"bow"`. |
-| `--textcat-positive-label`, `-tpl` 2.2 | option | Text classification positive label for binary classes with two labels. |
-| `--verbose`, `-VV` 2.0.13 | flag | Show more detailed messages during training. |
-| `--help`, `-h` | flag | Show help message and available arguments. |
-| **CREATES** | model, pickle | A spaCy model on each epoch. |
+| Argument | Type | Description |
+| ----------------------------------------------------- | ------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `lang` | positional | Model language. |
+| `output_path` | positional | Directory to store model in. Will be created if it doesn't exist. |
+| `train_path` | positional | Location of JSON-formatted training data. Can be a file or a directory of files. |
+| `dev_path` | positional | Location of JSON-formatted development data for evaluation. Can be a file or a directory of files. |
+| `--base-model`, `-b` 2.1 | option | Optional name of base model to update. Can be any loadable spaCy model. |
+| `--pipeline`, `-p` 2.1 | option | Comma-separated names of pipeline components to train. Defaults to `'tagger,parser,ner'`. |
+| `--vectors`, `-v` | option | Model to load vectors from. |
+| `--n-iter`, `-n` | option | Number of iterations (default: `30`). |
+| `--n-early-stopping`, `-ne` | option | Maximum number of training epochs without dev accuracy improvement. |
+| `--n-examples`, `-ns` | option | Number of examples to use (defaults to `0` for all examples). |
+| `--use-gpu`, `-g` | option | Whether to use GPU. Can be either `0`, `1` or `-1`. |
+| `--version`, `-V` | option | Model version. Will be written out to the model's `meta.json` after training. |
+| `--meta-path`, `-m` 2 | option | Optional path to model [`meta.json`](/usage/training#models-generating). All relevant properties like `lang`, `pipeline` and `spacy_version` will be overwritten. |
+| `--init-tok2vec`, `-t2v` 2.1 | option | Path to pretrained weights for the token-to-vector parts of the models. See `spacy pretrain`. Experimental. |
+| `--parser-multitasks`, `-pt` | option | Side objectives for parser CNN, e.g. `'dep'` or `'dep,tag'` |
+| `--entity-multitasks`, `-et` | option | Side objectives for NER CNN, e.g. `'dep'` or `'dep,tag'` |
+| `--noise-level`, `-nl` | option | Float indicating the amount of corruption for data augmentation. |
+| `--gold-preproc`, `-G` | flag | Use gold preprocessing. |
+| `--learn-tokens`, `-T`                                 | flag          | Make parser learn gold-standard tokenization by merging subtokens. Typically used for languages like Chinese.                                      |
+| `--verbose`, `-VV` 2.0.13 | flag | Show more detailed messages during training. |
+| `--help`, `-h` | flag | Show help message and available arguments. |
+| **CREATES** | model, pickle | A spaCy model on each epoch. |
### Environment variables for hyperparameters {#train-hyperparams new="2"}
@@ -540,7 +374,6 @@ $ python -m spacy init-model [lang] [output_dir] [--jsonl-loc] [--vectors-loc]
| `--jsonl-loc`, `-j` | option | Optional location of JSONL-formatted [vocabulary file](/api/annotation#vocab-jsonl) with lexical attributes. |
| `--vectors-loc`, `-v` | option | Optional location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. |
| `--prune-vectors`, `-V` | flag | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. |
-| `--vectors-name`, `-vn` | option | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. |
| **CREATES** | model | A spaCy model containing the vocab and vectors. |
## Evaluate {#evaluate new="2"}
diff --git a/website/docs/api/cython-classes.md b/website/docs/api/cython-classes.md
index 77d6fdd10..4d188d90f 100644
--- a/website/docs/api/cython-classes.md
+++ b/website/docs/api/cython-classes.md
@@ -45,9 +45,9 @@ Append a token to the `Doc`. The token can be provided as a
> from spacy.vocab cimport Vocab
>
> doc = Doc(Vocab())
-> lexeme = doc.vocab.get("hello")
+> lexeme = doc.vocab.get(u'hello')
> doc.push_back(lexeme, True)
-> assert doc.text == "hello "
+> assert doc.text == u'hello '
> ```
| Name | Type | Description |
@@ -164,7 +164,7 @@ vocabulary.
> #### Example
>
> ```python
-> lexeme = vocab.get(vocab.mem, "hello")
+> lexeme = vocab.get(vocab.mem, u'hello')
> ```
| Name | Type | Description |
diff --git a/website/docs/api/cython-structs.md b/website/docs/api/cython-structs.md
index 935bce25d..0e427a8d5 100644
--- a/website/docs/api/cython-structs.md
+++ b/website/docs/api/cython-structs.md
@@ -88,7 +88,7 @@ Find a token in a `TokenC*` array by the offset of its first character.
> from spacy.tokens.doc cimport Doc, token_by_start
> from spacy.vocab cimport Vocab
>
-> doc = Doc(Vocab(), words=["hello", "world"])
+> doc = Doc(Vocab(), words=[u'hello', u'world'])
> assert token_by_start(doc.c, doc.length, 6) == 1
> assert token_by_start(doc.c, doc.length, 4) == -1
> ```
@@ -110,7 +110,7 @@ Find a token in a `TokenC*` array by the offset of its final character.
> from spacy.tokens.doc cimport Doc, token_by_end
> from spacy.vocab cimport Vocab
>
-> doc = Doc(Vocab(), words=["hello", "world"])
+> doc = Doc(Vocab(), words=[u'hello', u'world'])
> assert token_by_end(doc.c, doc.length, 5) == 0
> assert token_by_end(doc.c, doc.length, 1) == -1
> ```
@@ -134,7 +134,7 @@ attribute, in order to make the parse tree navigation consistent.
> from spacy.tokens.doc cimport Doc, set_children_from_heads
> from spacy.vocab cimport Vocab
>
-> doc = Doc(Vocab(), words=["Baileys", "from", "a", "shoe"])
+> doc = Doc(Vocab(), words=[u'Baileys', u'from', u'a', u'shoe'])
> doc.c[0].head = 0
> doc.c[1].head = 0
> doc.c[2].head = 3
diff --git a/website/docs/api/dependencyparser.md b/website/docs/api/dependencyparser.md
index df0df3e38..58acc4425 100644
--- a/website/docs/api/dependencyparser.md
+++ b/website/docs/api/dependencyparser.md
@@ -58,7 +58,7 @@ and all pipeline components are applied to the `Doc` in order. Both
>
> ```python
> parser = DependencyParser(nlp.vocab)
-> doc = nlp("This is a sentence.")
+> doc = nlp(u"This is a sentence.")
> # This usually happens under the hood
> processed = parser(doc)
> ```
diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md
index ad684f51e..431d3a092 100644
--- a/website/docs/api/doc.md
+++ b/website/docs/api/doc.md
@@ -20,11 +20,11 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the
>
> ```python
> # Construction 1
-> doc = nlp("Some text")
+> doc = nlp(u"Some text")
>
> # Construction 2
> from spacy.tokens import Doc
-> words = ["hello", "world", "!"]
+> words = [u"hello", u"world", u"!"]
> spaces = [True, False, False]
> doc = Doc(nlp.vocab, words=words, spaces=spaces)
> ```
@@ -45,7 +45,7 @@ Negative indexing is supported, and follows the usual Python semantics, i.e.
> #### Example
>
> ```python
-> doc = nlp("Give it back! He pleaded.")
+> doc = nlp(u"Give it back! He pleaded.")
> assert doc[0].text == "Give"
> assert doc[-1].text == "."
> span = doc[1:3]
@@ -76,8 +76,8 @@ Iterate over `Token` objects, from which the annotations can be easily accessed.
> #### Example
>
> ```python
-> doc = nlp("Give it back")
-> assert [t.text for t in doc] == ["Give", "it", "back"]
+> doc = nlp(u'Give it back')
+> assert [t.text for t in doc] == [u'Give', u'it', u'back']
> ```
This is the main way of accessing [`Token`](/api/token) objects, which are the
@@ -96,7 +96,7 @@ Get the number of tokens in the document.
> #### Example
>
> ```python
-> doc = nlp("Give it back! He pleaded.")
+> doc = nlp(u"Give it back! He pleaded.")
> assert len(doc) == 7
> ```
@@ -114,9 +114,9 @@ details, see the documentation on
>
> ```python
> from spacy.tokens import Doc
-> city_getter = lambda doc: any(city in doc.text for city in ("New York", "Paris", "Berlin"))
-> Doc.set_extension("has_city", getter=city_getter)
-> doc = nlp("I like New York")
+> city_getter = lambda doc: any(city in doc.text for city in ('New York', 'Paris', 'Berlin'))
+> Doc.set_extension('has_city', getter=city_getter)
+> doc = nlp(u'I like New York')
> assert doc._.has_city
> ```
@@ -192,8 +192,8 @@ the character indices don't map to a valid span.
> #### Example
>
> ```python
-> doc = nlp("I like New York")
-> span = doc.char_span(7, 15, label="GPE")
+> doc = nlp(u"I like New York")
+> span = doc.char_span(7, 15, label=u"GPE")
> assert span.text == "New York"
> ```
@@ -213,8 +213,8 @@ using an average of word vectors.
> #### Example
>
> ```python
-> apples = nlp("I like apples")
-> oranges = nlp("I like oranges")
+> apples = nlp(u"I like apples")
+> oranges = nlp(u"I like oranges")
> apples_oranges = apples.similarity(oranges)
> oranges_apples = oranges.similarity(apples)
> assert apples_oranges == oranges_apples
@@ -235,7 +235,7 @@ attribute ID.
>
> ```python
> from spacy.attrs import ORTH
-> doc = nlp("apple apple orange banana")
+> doc = nlp(u"apple apple orange banana")
> assert doc.count_by(ORTH) == {7024L: 1, 119552L: 1, 2087L: 2}
> doc.to_array([ORTH])
> # array([[11880], [11880], [7561], [12800]])
@@ -255,7 +255,7 @@ ancestor is found, e.g. if span excludes a necessary ancestor.
> #### Example
>
> ```python
-> doc = nlp("This is a test")
+> doc = nlp(u"This is a test")
> matrix = doc.get_lca_matrix()
> # array([[0, 1, 1, 1], [1, 1, 1, 1], [1, 1, 2, 3], [1, 1, 3, 3]], dtype=int32)
> ```
@@ -274,7 +274,7 @@ They'll be added to an `"_"` key in the data, e.g. `"_": {"foo": "bar"}`.
> #### Example
>
> ```python
-> doc = nlp("Hello")
+> doc = nlp(u"Hello")
> json_doc = doc.to_json()
> ```
>
@@ -342,7 +342,7 @@ array of attributes.
> ```python
> from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
> from spacy.tokens import Doc
-> doc = nlp("Hello world!")
+> doc = nlp(u"Hello world!")
> np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
> doc2 = Doc(doc.vocab, words=[t.text for t in doc])
> doc2.from_array([LOWER, POS, ENT_TYPE, IS_ALPHA], np_array)
@@ -396,7 +396,7 @@ Serialize, i.e. export the document contents to a binary string.
> #### Example
>
> ```python
-> doc = nlp("Give it back! He pleaded.")
+> doc = nlp(u"Give it back! He pleaded.")
> doc_bytes = doc.to_bytes()
> ```
@@ -413,9 +413,10 @@ Deserialize, i.e. import the document contents from a binary string.
>
> ```python
> from spacy.tokens import Doc
-> doc = nlp("Give it back! He pleaded.")
-> doc_bytes = doc.to_bytes()
-> doc2 = Doc(doc.vocab).from_bytes(doc_bytes)
+> text = u"Give it back! He pleaded."
+> doc = nlp(text)
+> bytes = doc.to_bytes()
+> doc2 = Doc(doc.vocab).from_bytes(bytes)
> assert doc.text == doc2.text
> ```
@@ -456,9 +457,9 @@ dictionary mapping attribute names to values as the `"_"` key.
> #### Example
>
> ```python
-> doc = nlp("I like David Bowie")
+> doc = nlp(u"I like David Bowie")
> with doc.retokenize() as retokenizer:
-> attrs = {"LEMMA": "David Bowie"}
+> attrs = {"LEMMA": u"David Bowie"}
> retokenizer.merge(doc[2:4], attrs=attrs)
> ```
@@ -488,7 +489,7 @@ underlying lexeme (if they're context-independent lexical attributes like
> #### Example
>
> ```python
-> doc = nlp("I live in NewYork")
+> doc = nlp(u"I live in NewYork")
> with doc.retokenize() as retokenizer:
> heads = [(doc[3], 1), doc[2]]
> attrs = {"POS": ["PROPN", "PROPN"],
@@ -520,9 +521,9 @@ and end token boundaries, the document remains unchanged.
> #### Example
>
> ```python
-> doc = nlp("Los Angeles start.")
+> doc = nlp(u"Los Angeles start.")
> doc.merge(0, len("Los Angeles"), "NNP", "Los Angeles", "GPE")
-> assert [t.text for t in doc] == ["Los Angeles", "start", "."]
+> assert [t.text for t in doc] == [u"Los Angeles", u"start", u"."]
> ```
| Name | Type | Description |
@@ -540,11 +541,11 @@ objects, if the entity recognizer has been applied.
> #### Example
>
> ```python
-> doc = nlp("Mr. Best flew to New York on Saturday morning.")
+> doc = nlp(u"Mr. Best flew to New York on Saturday morning.")
> ents = list(doc.ents)
> assert ents[0].label == 346
-> assert ents[0].label_ == "PERSON"
-> assert ents[0].text == "Mr. Best"
+> assert ents[0].label_ == u"PERSON"
+> assert ents[0].text == u"Mr. Best"
> ```
| Name | Type | Description |
@@ -562,10 +563,10 @@ relative clauses.
> #### Example
>
> ```python
-> doc = nlp("A phrase with another phrase occurs.")
+> doc = nlp(u"A phrase with another phrase occurs.")
> chunks = list(doc.noun_chunks)
-> assert chunks[0].text == "A phrase"
-> assert chunks[1].text == "another phrase"
+> assert chunks[0].text == u"A phrase"
+> assert chunks[1].text == u"another phrase"
> ```
| Name | Type | Description |
@@ -582,10 +583,10 @@ will be unavailable.
> #### Example
>
> ```python
-> doc = nlp("This is a sentence. Here's another...")
+> doc = nlp(u"This is a sentence. Here's another...")
> sents = list(doc.sents)
> assert len(sents) == 2
-> assert [s.root.text for s in sents] == ["is", "'s"]
+> assert [s.root.text for s in sents] == [u"is", u"'s"]
> ```
| Name | Type | Description |
@@ -599,7 +600,7 @@ A boolean value indicating whether a word vector is associated with the object.
> #### Example
>
> ```python
-> doc = nlp("I like apples")
+> doc = nlp(u"I like apples")
> assert doc.has_vector
> ```
@@ -615,8 +616,8 @@ vectors.
> #### Example
>
> ```python
-> doc = nlp("I like apples")
-> assert doc.vector.dtype == "float32"
+> doc = nlp(u"I like apples")
+> assert doc.vector.dtype == 'float32'
> assert doc.vector.shape == (300,)
> ```
@@ -631,8 +632,8 @@ The L2 norm of the document's vector representation.
> #### Example
>
> ```python
-> doc1 = nlp("I like apples")
-> doc2 = nlp("I like oranges")
+> doc1 = nlp(u"I like apples")
+> doc2 = nlp(u"I like oranges")
> doc1.vector_norm # 4.54232424414368
> doc2.vector_norm # 3.304373298575751
> assert doc1.vector_norm != doc2.vector_norm
diff --git a/website/docs/api/docbin.md b/website/docs/api/docbin.md
deleted file mode 100644
index a4525906e..000000000
--- a/website/docs/api/docbin.md
+++ /dev/null
@@ -1,149 +0,0 @@
----
-title: DocBin
-tag: class
-new: 2.2
-teaser: Pack Doc objects for binary serialization
-source: spacy/tokens/_serialize.py
----
-
-The `DocBin` class lets you efficiently serialize the information from a
-collection of `Doc` objects. You can control which information is serialized by
-passing a list of attribute IDs, and optionally also specify whether the user
-data is serialized. The `DocBin` is faster and produces smaller data sizes than
-pickle, and allows you to deserialize without executing arbitrary Python code. A
-notable downside to this format is that you can't easily extract just one
-document from the `DocBin`. The serialization format is gzipped msgpack, where
-the msgpack object has the following structure:
-
-```python
-### msgpack object structure
-{
- "attrs": List[uint64], # e.g. [TAG, HEAD, ENT_IOB, ENT_TYPE]
- "tokens": bytes, # Serialized numpy uint64 array with the token data
- "spaces": bytes, # Serialized numpy boolean array with spaces data
- "lengths": bytes, # Serialized numpy int32 array with the doc lengths
- "strings": List[unicode] # List of unique strings in the token data
-}
-```
-
-Strings for the words, tags, labels etc. are represented by 64-bit hashes in the
-token data, and every string that occurs at least once is passed via the strings
-object. This means the storage is more efficient if you pack more documents
-together, because you have less duplication in the strings. For usage examples,
-see the docs on [serializing `Doc` objects](/usage/saving-loading#docs).
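
As a rough, non-authoritative sketch of the round trip described above (the attribute names and texts are illustrative, and `spacy.blank` is used so no model download is needed):

```python
import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")
# ORTH and SPACY are always stored; the extra attrs here are illustrative
doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"])
for text in ["Packing docs is efficient.", "Strings are stored once."]:
    doc_bin.add(nlp(text))

# Round-trip through bytes and rebuild the Doc objects with a shared vocab
data = doc_bin.to_bytes()
docs = list(DocBin().from_bytes(data).get_docs(nlp.vocab))
assert len(docs) == 2
```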
-
-## DocBin.\_\_init\_\_ {#init tag="method"}
-
-Create a `DocBin` object to hold serialized annotations.
-
-> #### Example
->
-> ```python
-> from spacy.tokens import DocBin
-> doc_bin = DocBin(attrs=["ENT_IOB", "ENT_TYPE"])
-> ```
-
-| Argument | Type | Description |
-| ----------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `attrs` | list | List of attributes to serialize. `orth` (hash of token text) and `spacy` (whether the token is followed by whitespace) are always serialized, so they're not required. Defaults to `None`. |
-| `store_user_data` | bool | Whether to include the `Doc.user_data`. Defaults to `False`. |
-| **RETURNS** | `DocBin` | The newly constructed object. |
-
-## DocBin.\_\_len\_\_ {#len tag="method"}
-
-Get the number of `Doc` objects that were added to the `DocBin`.
-
-> #### Example
->
-> ```python
-> doc_bin = DocBin(attrs=["LEMMA"])
-> doc = nlp("This is a document to serialize.")
-> doc_bin.add(doc)
-> assert len(doc_bin) == 1
-> ```
-
-| Argument | Type | Description |
-| ----------- | ---- | ------------------------------------------- |
-| **RETURNS** | int | The number of `Doc`s added to the `DocBin`. |
-
-## DocBin.add {#add tag="method"}
-
-Add a `Doc`'s annotations to the `DocBin` for serialization.
-
-> #### Example
->
-> ```python
-> doc_bin = DocBin(attrs=["LEMMA"])
-> doc = nlp("This is a document to serialize.")
-> doc_bin.add(doc)
-> ```
-
-| Argument | Type | Description |
-| -------- | ----- | ------------------------ |
-| `doc` | `Doc` | The `Doc` object to add. |
-
-## DocBin.get_docs {#get_docs tag="method"}
-
-Recover `Doc` objects from the annotations, using the given vocab.
-
-> #### Example
->
-> ```python
-> docs = list(doc_bin.get_docs(nlp.vocab))
-> ```
-
-| Argument | Type | Description |
-| ---------- | ------- | ------------------ |
-| `vocab` | `Vocab` | The shared vocab. |
-| **YIELDS** | `Doc` | The `Doc` objects. |
-
-## DocBin.merge {#merge tag="method"}
-
-Extend the annotations of this `DocBin` with the annotations from another. Will
-raise an error if the pre-defined attrs of the two `DocBin`s don't match.
-
-> #### Example
->
-> ```python
-> doc_bin1 = DocBin(attrs=["LEMMA", "POS"])
-> doc_bin1.add(nlp("Hello world"))
-> doc_bin2 = DocBin(attrs=["LEMMA", "POS"])
-> doc_bin2.add(nlp("This is a sentence"))
-> merged_bins = doc_bin1.merge(doc_bin2)
-> assert len(merged_bins) == 2
-> ```
-
-| Argument | Type | Description |
-| -------- | -------- | ------------------------------------------- |
-| `other` | `DocBin` | The `DocBin` to merge into the current bin. |
-
-## DocBin.to_bytes {#to_bytes tag="method"}
-
-Serialize the `DocBin`'s annotations to a bytestring.
-
-> #### Example
->
-> ```python
-> doc_bin = DocBin(attrs=["DEP", "HEAD"])
-> doc_bin_bytes = doc_bin.to_bytes()
-> ```
-
-| Argument | Type | Description |
-| ----------- | ----- | ------------------------ |
-| **RETURNS** | bytes | The serialized `DocBin`. |
-
-## DocBin.from_bytes {#from_bytes tag="method"}
-
-Deserialize the `DocBin`'s annotations from a bytestring.
-
-> #### Example
->
-> ```python
-> doc_bin_bytes = doc_bin.to_bytes()
-> new_doc_bin = DocBin().from_bytes(doc_bin_bytes)
-> ```
-
-| Argument | Type | Description |
-| ------------ | -------- | ---------------------- |
-| `bytes_data` | bytes | The data to load from. |
-| **RETURNS** | `DocBin` | The loaded `DocBin`. |
diff --git a/website/docs/api/entitylinker.md b/website/docs/api/entitylinker.md
deleted file mode 100644
index 88131761f..000000000
--- a/website/docs/api/entitylinker.md
+++ /dev/null
@@ -1,300 +0,0 @@
----
-title: EntityLinker
-teaser:
- Functionality to disambiguate a named entity in text to a unique knowledge
- base identifier.
-tag: class
-source: spacy/pipeline/pipes.pyx
-new: 2.2
----
-
-This class is a subclass of `Pipe` and follows the same API. The pipeline
-component is available in the [processing pipeline](/usage/processing-pipelines)
-via the ID `"entity_linker"`.
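
As a minimal, hedged sketch of wiring the component into a pipeline (the knowledge base below is a toy with made-up IDs, frequencies and vectors; see [`set_kb`](/api/entitylinker#set_kb)):

```python
from spacy.lang.en import English
from spacy.kb import KnowledgeBase

nlp = English()
# Toy knowledge base; entity IDs, frequencies and vectors are illustrative
kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
kb.add_entity(entity="Q42", freq=12, entity_vector=[1.0, 0.0, 0.0])
kb.add_alias(alias="Douglas", entities=["Q42"], probabilities=[0.9])

# Create the component via its string name, attach the KB and add it last
entity_linker = nlp.create_pipe("entity_linker")
entity_linker.set_kb(kb)
nlp.add_pipe(entity_linker, last=True)
```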
-
-## EntityLinker.Model {#model tag="classmethod"}
-
-Initialize a model for the pipe. The model should implement the
-`thinc.neural.Model` API, and should contain a field `tok2vec` that contains the
-context encoder. Wrappers are under development for most major machine learning
-libraries.
-
-| Name | Type | Description |
-| ----------- | ------ | ------------------------------------- |
-| `**kwargs` | - | Parameters for initializing the model |
-| **RETURNS** | object | The initialized model. |
-
-## EntityLinker.\_\_init\_\_ {#init tag="method"}
-
-Create a new pipeline instance. In your application, you would normally use a
-shortcut for this and instantiate the component using its string name and
-[`nlp.create_pipe`](/api/language#create_pipe).
-
-> #### Example
->
-> ```python
-> # Construction via create_pipe
-> entity_linker = nlp.create_pipe("entity_linker")
->
-> # Construction from class
-> from spacy.pipeline import EntityLinker
-> entity_linker = EntityLinker(nlp.vocab)
-> entity_linker.from_disk("/path/to/model")
-> ```
-
-| Name | Type | Description |
-| -------------- | ----------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `vocab` | `Vocab` | The shared vocabulary. |
-| `model` | `thinc.neural.Model` / `True` | The model powering the pipeline component. If no model is supplied, the model is created when you call `begin_training`, `from_disk` or `from_bytes`. |
-| `hidden_width` | int | Width of the hidden layer of the entity linking model, defaults to 128. |
-| `incl_prior` | bool | Whether or not to include prior probabilities in the model. Defaults to True. |
-| `incl_context` | bool                          | Whether or not to include the local context in the model (if not: only prior probabilities are used). Defaults to True.                                |
-| **RETURNS** | `EntityLinker` | The newly constructed object. |
-
-## EntityLinker.\_\_call\_\_ {#call tag="method"}
-
-Apply the pipe to one document. The document is modified in place, and returned.
-This usually happens under the hood when the `nlp` object is called on a text
-and all pipeline components are applied to the `Doc` in order. Both
-[`__call__`](/api/entitylinker#call) and [`pipe`](/api/entitylinker#pipe)
-delegate to the [`predict`](/api/entitylinker#predict) and
-[`set_annotations`](/api/entitylinker#set_annotations) methods.
-
-> #### Example
->
-> ```python
-> entity_linker = EntityLinker(nlp.vocab)
-> doc = nlp("This is a sentence.")
-> # This usually happens under the hood
-> processed = entity_linker(doc)
-> ```
-
-| Name | Type | Description |
-| ----------- | ----- | ------------------------ |
-| `doc` | `Doc` | The document to process. |
-| **RETURNS** | `Doc` | The processed document. |
-
-## EntityLinker.pipe {#pipe tag="method"}
-
-Apply the pipe to a stream of documents. This usually happens under the hood
-when the `nlp` object is called on a text and all pipeline components are
-applied to the `Doc` in order. Both [`__call__`](/api/entitylinker#call) and
-[`pipe`](/api/entitylinker#pipe) delegate to the
-[`predict`](/api/entitylinker#predict) and
-[`set_annotations`](/api/entitylinker#set_annotations) methods.
-
-> #### Example
->
-> ```python
-> entity_linker = EntityLinker(nlp.vocab)
-> for doc in entity_linker.pipe(docs, batch_size=50):
-> pass
-> ```
-
-| Name | Type | Description |
-| ------------ | -------- | ------------------------------------------------------ |
-| `stream` | iterable | A stream of documents. |
-| `batch_size` | int | The number of texts to buffer. Defaults to `128`. |
-| **YIELDS** | `Doc` | Processed documents in the order of the original text. |
-
-## EntityLinker.predict {#predict tag="method"}
-
-Apply the pipeline's model to a batch of docs, without modifying them.
-
-> #### Example
->
-> ```python
-> entity_linker = EntityLinker(nlp.vocab)
-> kb_ids, tensors = entity_linker.predict([doc1, doc2])
-> ```
-
-| Name | Type | Description |
-| ----------- | -------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `docs` | iterable | The documents to predict. |
-| **RETURNS** | tuple | A `(kb_ids, tensors)` tuple where `kb_ids` are the model's predicted KB identifiers for the entities in the `docs`, and `tensors` are the token representations used to predict these identifiers. |
-
-## EntityLinker.set_annotations {#set_annotations tag="method"}
-
-Modify a batch of documents, using pre-computed entity IDs for a list of named
-entities.
-
-> #### Example
->
-> ```python
-> entity_linker = EntityLinker(nlp.vocab)
-> kb_ids, tensors = entity_linker.predict([doc1, doc2])
-> entity_linker.set_annotations([doc1, doc2], kb_ids, tensors)
-> ```
-
-| Name | Type | Description |
-| --------- | -------- | ------------------------------------------------------------------------------------------------- |
-| `docs` | iterable | The documents to modify. |
-| `kb_ids` | iterable | The knowledge base identifiers for the entities in the docs, predicted by `EntityLinker.predict`. |
-| `tensors` | iterable | The token representations used to predict the identifiers. |
-
-## EntityLinker.update {#update tag="method"}
-
-Learn from a batch of documents and gold-standard information, updating both the
-pipe's entity linking model and context encoder. Delegates to
-[`predict`](/api/entitylinker#predict) and
-[`get_loss`](/api/entitylinker#get_loss).
-
-> #### Example
->
-> ```python
-> entity_linker = EntityLinker(nlp.vocab)
-> losses = {}
-> optimizer = nlp.begin_training()
-> entity_linker.update([doc1, doc2], [gold1, gold2], losses=losses, sgd=optimizer)
-> ```
-
-| Name | Type | Description |
-| -------- | -------- | ------------------------------------------------------------------------------------------------------- |
-| `docs` | iterable | A batch of documents to learn from. |
-| `golds` | iterable | The gold-standard data. Must have the same length as `docs`. |
-| `drop` | float | The dropout rate, used both for the EL model and the context encoder. |
-| `sgd` | callable | The optimizer for the EL model. Should take two arguments `weights` and `gradient`, and an optional ID. |
-| `losses` | dict | Optional record of the loss during training. The value keyed by the model's name is updated. |
-
-## EntityLinker.get_loss {#get_loss tag="method"}
-
-Find the loss and gradient of loss for the entities in a batch of documents and
-their predicted scores.
-
-> #### Example
->
-> ```python
-> entity_linker = EntityLinker(nlp.vocab)
-> kb_ids, tensors = entity_linker.predict(docs)
-> loss, d_loss = entity_linker.get_loss(docs, [gold1, gold2], kb_ids, tensors)
-> ```
-
-| Name | Type | Description |
-| ----------- | -------- | ------------------------------------------------------------ |
-| `docs` | iterable | The batch of documents. |
-| `golds` | iterable | The gold-standard data. Must have the same length as `docs`. |
-| `kb_ids` | iterable | KB identifiers representing the model's predictions. |
-| `tensors` | iterable | The token representations used to predict the identifiers |
-| **RETURNS** | tuple | The loss and the gradient, i.e. `(loss, gradient)`. |
-
-## EntityLinker.set_kb {#set_kb tag="method"}
-
-Define the knowledge base (KB) used for disambiguating named entities to KB
-identifiers.
-
-> #### Example
->
-> ```python
-> entity_linker = EntityLinker(nlp.vocab)
-> entity_linker.set_kb(kb)
-> ```
-
-| Name | Type | Description |
-| ---- | --------------- | ------------------------------- |
-| `kb` | `KnowledgeBase` | The [`KnowledgeBase`](/api/kb). |
-
-## EntityLinker.begin_training {#begin_training tag="method"}
-
-Initialize the pipe for training, using data examples if available. If no model
-has been initialized yet, the model is added. Before calling this method, a
-knowledge base should have been defined with
-[`set_kb`](/api/entitylinker#set_kb).
-
-> #### Example
->
-> ```python
-> entity_linker = EntityLinker(nlp.vocab)
-> entity_linker.set_kb(kb)
-> nlp.add_pipe(entity_linker, last=True)
-> optimizer = entity_linker.begin_training(pipeline=nlp.pipeline)
-> ```
-
-| Name | Type | Description |
-| ------------- | -------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `gold_tuples` | iterable | Optional gold-standard annotations from which to construct [`GoldParse`](/api/goldparse) objects. |
-| `pipeline` | list | Optional list of pipeline components that this component is part of. |
-| `sgd` | callable | An optional optimizer. Should take two arguments `weights` and `gradient`, and an optional ID. Will be created via [`EntityLinker`](/api/entitylinker#create_optimizer) if not set. |
-| **RETURNS** | callable | An optimizer. |
-
-## EntityLinker.create_optimizer {#create_optimizer tag="method"}
-
-Create an optimizer for the pipeline component.
-
-> #### Example
->
-> ```python
-> entity_linker = EntityLinker(nlp.vocab)
-> optimizer = entity_linker.create_optimizer()
-> ```
-
-| Name | Type | Description |
-| ----------- | -------- | -------------- |
-| **RETURNS** | callable | The optimizer. |
-
-## EntityLinker.use_params {#use_params tag="method, contextmanager"}
-
-Modify the pipe's EL model, to use the given parameter values.
-
-> #### Example
->
-> ```python
-> entity_linker = EntityLinker(nlp.vocab)
-> with entity_linker.use_params(optimizer.averages):
-> entity_linker.to_disk("/best_model")
-> ```
-
-| Name | Type | Description |
-| -------- | ---- | ---------------------------------------------------------------------------------------------------------- |
-| `params` | dict | The parameter values to use in the model. At the end of the context, the original parameters are restored. |
-
-## EntityLinker.to_disk {#to_disk tag="method"}
-
-Serialize the pipe to disk.
-
-> #### Example
->
-> ```python
-> entity_linker = EntityLinker(nlp.vocab)
-> entity_linker.to_disk("/path/to/entity_linker")
-> ```
-
-| Name | Type | Description |
-| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
-| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
-| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
-
-## EntityLinker.from_disk {#from_disk tag="method"}
-
-Load the pipe from disk. Modifies the object in place and returns it.
-
-> #### Example
->
-> ```python
-> entity_linker = EntityLinker(nlp.vocab)
-> entity_linker.from_disk("/path/to/entity_linker")
-> ```
-
-| Name | Type | Description |
-| ----------- | ---------------- | -------------------------------------------------------------------------- |
-| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
-| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
-| **RETURNS** | `EntityLinker` | The modified `EntityLinker` object. |
-
-## Serialization fields {#serialization-fields}
-
-During serialization, spaCy will export several data fields used to restore
-different aspects of the object. If needed, you can exclude them from
-serialization by passing in the string names via the `exclude` argument.
-
-> #### Example
->
-> ```python
-> data = entity_linker.to_disk("/path", exclude=["vocab"])
-> ```
-
-| Name | Description |
-| ------- | -------------------------------------------------------------- |
-| `vocab` | The shared [`Vocab`](/api/vocab). |
-| `cfg` | The config file. You usually don't want to exclude this. |
-| `model` | The binary model data. You usually don't want to exclude this. |
-| `kb` | The knowledge base. You usually don't want to exclude this. |
diff --git a/website/docs/api/entityrecognizer.md b/website/docs/api/entityrecognizer.md
index 9a2766c07..7279a7f77 100644
--- a/website/docs/api/entityrecognizer.md
+++ b/website/docs/api/entityrecognizer.md
@@ -58,7 +58,7 @@ and all pipeline components are applied to the `Doc` in order. Both
>
> ```python
> ner = EntityRecognizer(nlp.vocab)
-> doc = nlp("This is a sentence.")
+> doc = nlp(u"This is a sentence.")
> # This usually happens under the hood
> processed = ner(doc)
> ```
@@ -99,7 +99,7 @@ Apply the pipeline's model to a batch of docs, without modifying them.
>
> ```python
> ner = EntityRecognizer(nlp.vocab)
-> scores, tensors = ner.predict([doc1, doc2])
+> scores = ner.predict([doc1, doc2])
> ```
| Name | Type | Description |
@@ -115,15 +115,14 @@ Modify a batch of documents, using pre-computed scores.
>
> ```python
> ner = EntityRecognizer(nlp.vocab)
-> scores, tensors = ner.predict([doc1, doc2])
-> ner.set_annotations([doc1, doc2], scores, tensors)
+> scores = ner.predict([doc1, doc2])
+> ner.set_annotations([doc1, doc2], scores)
> ```
-| Name | Type | Description |
-| --------- | -------- | ---------------------------------------------------------- |
-| `docs` | iterable | The documents to modify. |
-| `scores` | - | The scores to set, produced by `EntityRecognizer.predict`. |
-| `tensors` | iterable | The token representations used to predict the scores. |
+| Name | Type | Description |
+| -------- | -------- | ---------------------------------------------------------- |
+| `docs` | iterable | The documents to modify. |
+| `scores` | - | The scores to set, produced by `EntityRecognizer.predict`. |
## EntityRecognizer.update {#update tag="method"}
@@ -211,13 +210,13 @@ Modify the pipe's model, to use the given parameter values.
>
> ```python
> ner = EntityRecognizer(nlp.vocab)
-> with ner.use_params(optimizer.averages):
+> with ner.use_params():
> ner.to_disk("/best_model")
> ```
| Name | Type | Description |
| -------- | ---- | ---------------------------------------------------------------------------------------------------------- |
-| `params` | dict | The parameter values to use in the model. At the end of the context, the original parameters are restored. |
+| `params` | - | The parameter values to use in the model. At the end of the context, the original parameters are restored. |
## EntityRecognizer.add_label {#add_label tag="method"}
diff --git a/website/docs/api/entityruler.md b/website/docs/api/entityruler.md
index 5b93fceac..006ba90e6 100644
--- a/website/docs/api/entityruler.md
+++ b/website/docs/api/entityruler.md
@@ -10,9 +10,7 @@ token-based rules or exact phrase matches. It can be combined with the
statistical [`EntityRecognizer`](/api/entityrecognizer) to boost accuracy, or
used on its own to implement a purely rule-based entity recognition system.
After initialization, the component is typically added to the processing
-pipeline using [`nlp.add_pipe`](/api/language#add_pipe). For usage examples, see
-the docs on
-[rule-based entity recognition](/usage/rule-based-matching#entityruler).
+pipeline using [`nlp.add_pipe`](/api/language#add_pipe).
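
A minimal sketch of that typical setup (the labels and patterns below are purely illustrative):

```python
from spacy.lang.en import English
from spacy.pipeline import EntityRuler

nlp = English()
ruler = EntityRuler(nlp)
# One phrase pattern and one token pattern, both illustrative
ruler.add_patterns([
    {"label": "ORG", "pattern": "Apple"},
    {"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}]},
])
nlp.add_pipe(ruler)

doc = nlp("Apple is opening an office in San Francisco")
print([(ent.text, ent.label_) for ent in doc.ents])
```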
## EntityRuler.\_\_init\_\_ {#init tag="method"}
diff --git a/website/docs/api/goldparse.md b/website/docs/api/goldparse.md
index 2dd24316f..5a2d8a110 100644
--- a/website/docs/api/goldparse.md
+++ b/website/docs/api/goldparse.md
@@ -23,7 +23,6 @@ gradient for those labels will be zero.
| `deps` | iterable | A sequence of strings, representing the syntactic relation types. |
| `entities` | iterable | A sequence of named entity annotations, either as BILUO tag strings, or as `(start_char, end_char, label)` tuples, representing the entity positions. If BILUO tag strings, you can specify missing values by setting the tag to None. |
| `cats` | dict | Labels for text classification. Each key in the dictionary may be a string or an int, or a `(start_char, end_char, label)` tuple, indicating that the label is applied to only part of the document (usually a sentence). |
-| `links` | dict | Labels for entity linking. A dict with `(start_char, end_char)` keys, and the values being dicts with `kb_id:value` entries, representing external KB IDs mapped to either 1.0 (positive) or 0.0 (negative). |
| **RETURNS** | `GoldParse` | The newly constructed object. |
## GoldParse.\_\_len\_\_ {#len tag="method"}
@@ -44,17 +43,16 @@ Whether the provided syntactic annotations form a projective dependency tree.
## Attributes {#attributes}
-| Name | Type | Description |
-| ------------------------------------ | ---- | -------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `words` | list | The words. |
-| `tags` | list | The part-of-speech tag annotations. |
-| `heads` | list | The syntactic head annotations. |
-| `labels` | list | The syntactic relation-type annotations. |
-| `ner` | list | The named entity annotations as BILUO tags. |
-| `cand_to_gold` | list | The alignment from candidate tokenization to gold tokenization. |
-| `gold_to_cand` | list | The alignment from gold tokenization to candidate tokenization. |
-| `cats` 2 | list | Entries in the list should be either a label, or a `(start, end, label)` triple. The tuple form is used for categories applied to spans of the document. |
-| `links` 2.2 | dict | Keys in the dictionary are `(start_char, end_char)` triples, and the values are dictionaries with `kb_id:value` entries. |
+| Name | Type | Description |
+| --------------------------------- | ---- | -------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `words` | list | The words. |
+| `tags` | list | The part-of-speech tag annotations. |
+| `heads` | list | The syntactic head annotations. |
+| `labels` | list | The syntactic relation-type annotations. |
+| `ner` | list | The named entity annotations as BILUO tags. |
+| `cand_to_gold` | list | The alignment from candidate tokenization to gold tokenization. |
+| `gold_to_cand` | list | The alignment from gold tokenization to candidate tokenization. |
+| `cats` 2 | list | Entries in the list should be either a label, or a `(start, end, label)` triple. The tuple form is used for categories applied to spans of the document. |
## Utilities {#util}
@@ -69,7 +67,7 @@ Convert a list of Doc objects into the
> ```python
> from spacy.gold import docs_to_json
>
-> doc = nlp("I like London")
+> doc = nlp(u"I like London")
> json_data = docs_to_json([doc])
> ```
@@ -150,7 +148,7 @@ single-token entity.
> ```python
> from spacy.gold import biluo_tags_from_offsets
>
-> doc = nlp("I like London.")
+> doc = nlp(u"I like London.")
> entities = [(7, 13, "LOC")]
> tags = biluo_tags_from_offsets(doc, entities)
> assert tags == ["O", "O", "U-LOC", "O"]
@@ -172,7 +170,7 @@ entity offsets.
> ```python
> from spacy.gold import offsets_from_biluo_tags
>
-> doc = nlp("I like London.")
+> doc = nlp(u"I like London.")
> tags = ["O", "O", "U-LOC", "O"]
> entities = offsets_from_biluo_tags(doc, tags)
> assert entities == [(7, 13, "LOC")]
@@ -195,7 +193,7 @@ token-based tags, e.g. to overwrite the `doc.ents`.
> ```python
> from spacy.gold import spans_from_biluo_tags
>
-> doc = nlp("I like London.")
+> doc = nlp(u"I like London.")
> tags = ["O", "O", "U-LOC", "O"]
> doc.ents = spans_from_biluo_tags(doc, tags)
> ```
diff --git a/website/docs/api/kb.md b/website/docs/api/kb.md
deleted file mode 100644
index 639ababb6..000000000
--- a/website/docs/api/kb.md
+++ /dev/null
@@ -1,268 +0,0 @@
----
-title: KnowledgeBase
-teaser: A storage class for entities and aliases of a specific knowledge base (ontology)
-tag: class
-source: spacy/kb.pyx
-new: 2.2
----
-
-The `KnowledgeBase` object provides a method to generate [`Candidate`](/api/kb/#candidate_init)
-objects, which are plausible external identifiers given a certain textual mention.
-Each such `Candidate` holds information from the relevant KB entity, such as its
-frequency in text and possible aliases.
-Each entity in the knowledge base also has a pre-trained entity vector of a fixed size.
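
For orientation, a minimal sketch of that workflow (all entity IDs, frequencies, vectors and probabilities are made up for illustration):

```python
from spacy.kb import KnowledgeBase
from spacy.vocab import Vocab

kb = KnowledgeBase(vocab=Vocab(), entity_vector_length=3)

# Register two entities with illustrative frequencies and vectors
kb.add_entity(entity="Q42", freq=32, entity_vector=[1.0, 0.0, 0.0])
kb.add_entity(entity="Q463035", freq=111, entity_vector=[0.0, 1.0, 0.0])

# Map the mention "Douglas" to both entities with prior probabilities
kb.add_alias(alias="Douglas", entities=["Q42", "Q463035"], probabilities=[0.6, 0.3])

# Retrieve plausible candidates for the mention
candidates = kb.get_candidates("Douglas")
print([(c.entity_, c.prior_prob) for c in candidates])
```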
-
-## KnowledgeBase.\_\_init\_\_ {#init tag="method"}
-
-Create the knowledge base.
-
-> #### Example
->
-> ```python
-> from spacy.kb import KnowledgeBase
-> vocab = nlp.vocab
-> kb = KnowledgeBase(vocab=vocab, entity_vector_length=64)
-> ```
-
-| Name | Type | Description |
-| ----------------------- | ---------------- | ----------------------------------------- |
-| `vocab` | `Vocab` | A `Vocab` object. |
-| `entity_vector_length` | int | Length of the fixed-size entity vectors. |
-| **RETURNS** | `KnowledgeBase` | The newly constructed object. |
-
-
-## KnowledgeBase.entity_vector_length {#entity_vector_length tag="property"}
-
-The length of the fixed-size entity vectors in the knowledge base.
-
-| Name | Type | Description |
-| ----------- | ---- | ----------------------------------------- |
-| **RETURNS** | int | Length of the fixed-size entity vectors. |
-
-## KnowledgeBase.add_entity {#add_entity tag="method"}
-
-Add an entity to the knowledge base, specifying its corpus frequency
-and entity vector, which should be of length [`entity_vector_length`](/api/kb#entity_vector_length).
-
-> #### Example
->
-> ```python
-> kb.add_entity(entity="Q42", freq=32, entity_vector=vector1)
-> kb.add_entity(entity="Q463035", freq=111, entity_vector=vector2)
-> ```
-
-| Name | Type | Description |
-| --------------- | ------------- | ------------------------------------------------- |
-| `entity` | unicode | The unique entity identifier |
-| `freq` | float | The frequency of the entity in a typical corpus |
-| `entity_vector` | vector | The pre-trained vector of the entity |
-
-## KnowledgeBase.set_entities {#set_entities tag="method"}
-
-Define the full list of entities in the knowledge base, specifying the corpus frequency
-and entity vector for each entity.
-
-> #### Example
->
-> ```python
-> kb.set_entities(entity_list=["Q42", "Q463035"], freq_list=[32, 111], vector_list=[vector1, vector2])
-> ```
-
-| Name | Type | Description |
-| ------------- | ------------- | ------------------------------------------------- |
-| `entity_list` | iterable | List of unique entity identifiers |
-| `freq_list` | iterable | List of entity frequencies |
-| `vector_list` | iterable | List of entity vectors |
-
-## KnowledgeBase.add_alias {#add_alias tag="method"}
-
-Add an alias or mention to the knowledge base, specifying its potential KB identifiers
-and their prior probabilities. The entity identifiers should refer to entities previously
-added with [`add_entity`](/api/kb#add_entity) or [`set_entities`](/api/kb#set_entities).
-The sum of the prior probabilities should not exceed 1.
-
-> #### Example
->
-> ```python
-> kb.add_alias(alias="Douglas", entities=["Q42", "Q463035"], probabilities=[0.6, 0.3])
-> ```
-
-| Name | Type | Description |
-| -------------- | ------------- | -------------------------------------------------- |
-| `alias` | unicode | The textual mention or alias |
-| `entities` | iterable | The potential entities that the alias may refer to |
-| `probabilities`| iterable | The prior probabilities of each entity |
-
-## KnowledgeBase.\_\_len\_\_ {#len tag="method"}
-
-Get the total number of entities in the knowledge base.
-
-> #### Example
->
-> ```python
-> total_entities = len(kb)
-> ```
-
-| Name | Type | Description |
-| ----------- | ---- | --------------------------------------------- |
-| **RETURNS** | int | The number of entities in the knowledge base. |
-
-## KnowledgeBase.get_entity_strings {#get_entity_strings tag="method"}
-
-Get a list of all entity IDs in the knowledge base.
-
-> #### Example
->
-> ```python
-> all_entities = kb.get_entity_strings()
-> ```
-
-| Name | Type | Description |
-| ----------- | ---- | --------------------------------------------- |
-| **RETURNS** | list | The list of entities in the knowledge base. |
-
-## KnowledgeBase.get_size_aliases {#get_size_aliases tag="method"}
-
-Get the total number of aliases in the knowledge base.
-
-> #### Example
->
-> ```python
-> total_aliases = kb.get_size_aliases()
-> ```
-
-| Name | Type | Description |
-| ----------- | ---- | --------------------------------------------- |
-| **RETURNS** | int | The number of aliases in the knowledge base. |
-
-## KnowledgeBase.get_alias_strings {#get_alias_strings tag="method"}
-
-Get a list of all aliases in the knowledge base.
-
-> #### Example
->
-> ```python
-> all_aliases = kb.get_alias_strings()
-> ```
-
-| Name | Type | Description |
-| ----------- | ---- | --------------------------------------------- |
-| **RETURNS** | list | The list of aliases in the knowledge base. |
-
-## KnowledgeBase.get_candidates {#get_candidates tag="method"}
-
-Given a certain textual mention as input, retrieve a list of candidate entities
-of type [`Candidate`](/api/kb/#candidate_init).
-
-> #### Example
->
-> ```python
-> candidates = kb.get_candidates("Douglas")
-> ```
-
-| Name | Type | Description |
-| ------------- | ------------- | -------------------------------------------------- |
-| `alias` | unicode | The textual mention or alias |
-| **RETURNS** | iterable | The list of relevant `Candidate` objects |
-
-## KnowledgeBase.get_vector {#get_vector tag="method"}
-
-Given a certain entity ID, retrieve its pre-trained entity vector.
-
-> #### Example
->
-> ```python
-> vector = kb.get_vector("Q42")
-> ```
-
-| Name | Type | Description |
-| ------------- | ------------- | -------------------------------------------------- |
-| `entity` | unicode | The entity ID |
-| **RETURNS** | vector | The entity vector |
-
-## KnowledgeBase.get_prior_prob {#get_prior_prob tag="method"}
-
-Given a certain entity ID and a certain textual mention, retrieve
-the prior probability of the fact that the mention links to the entity ID.
-
-> #### Example
->
-> ```python
-> probability = kb.get_prior_prob("Q42", "Douglas")
-> ```
-
-| Name | Type | Description |
-| ------------- | ------------- | --------------------------------------------------------------- |
-| `entity` | unicode | The entity ID |
-| `alias` | unicode | The textual mention or alias |
-| **RETURNS** | float | The prior probability of the `alias` referring to the `entity` |
-
-## KnowledgeBase.dump {#dump tag="method"}
-
-Save the current state of the knowledge base to a directory.
-
-> #### Example
->
-> ```python
-> kb.dump(loc)
-> ```
-
-| Name | Type | Description |
-| ------------- | ---------------- | ------------------------------------------------------------------------------------------------------------------------ |
-| `loc` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
-
-## KnowledgeBase.load_bulk {#load_bulk tag="method"}
-
-Restore the state of the knowledge base from a given directory. Note that the [`Vocab`](/api/vocab)
-should also be the same as the one used to create the KB.
-
-> #### Example
->
-> ```python
-> from spacy.kb import KnowledgeBase
-> from spacy.vocab import Vocab
-> vocab = Vocab().from_disk("/path/to/vocab")
-> kb = KnowledgeBase(vocab=vocab, entity_vector_length=64)
-> kb.load_bulk("/path/to/kb")
-> ```
-
-
-| Name | Type | Description |
-| ----------- | ---------------- | ----------------------------------------------------------------------------------------- |
-| `loc` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
-| **RETURNS** | `KnowledgeBase` | The modified `KnowledgeBase` object. |
-
-
-## Candidate.\_\_init\_\_ {#candidate_init tag="method"}
-
-Construct a `Candidate` object. Usually this constructor is not called directly,
-but instead these objects are returned by the [`get_candidates`](/api/kb#get_candidates) method
-of a `KnowledgeBase`.
-
-> #### Example
->
-> ```python
-> from spacy.kb import Candidate
-> candidate = Candidate(kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob)
-> ```
-
-| Name | Type | Description |
-| ------------- | --------------- | -------------------------------------------------------------- |
-| `kb` | `KnowledgeBase` | The knowledge base that defined this candidate. |
-| `entity_hash` | int | The hash of the entity's KB ID. |
-| `entity_freq` | float | The entity frequency as recorded in the KB. |
-| `alias_hash` | int | The hash of the textual mention or alias. |
-| `prior_prob` | float | The prior probability of the `alias` referring to the `entity` |
-| **RETURNS** | `Candidate` | The newly constructed object. |
-
-## Candidate attributes {#candidate_attributes}
-
-| Name | Type | Description |
-| ---------------------- | ------------ | ------------------------------------------------------------------ |
-| `entity` | int | The entity's unique KB identifier |
-| `entity_` | unicode | The entity's unique KB identifier |
-| `alias` | int | The alias or textual mention |
-| `alias_` | unicode | The alias or textual mention |
-| `prior_prob` | long | The prior probability of the `alias` referring to the `entity` |
-| `entity_freq` | long | The frequency of the entity in a typical corpus |
-| `entity_vector` | vector | The pre-trained vector of the entity |
diff --git a/website/docs/api/language.md b/website/docs/api/language.md
index c44339ff5..3fcdeb195 100644
--- a/website/docs/api/language.md
+++ b/website/docs/api/language.md
@@ -45,7 +45,7 @@ contain arbitrary whitespace. Alignment into the original string is preserved.
> #### Example
>
> ```python
-> doc = nlp("An example sentence. Another sentence.")
+> doc = nlp(u"An example sentence. Another sentence.")
> assert (doc[0].text, doc[0].head.tag_) == ("An", "NN")
> ```
@@ -61,8 +61,8 @@ Pipeline components to prevent from being loaded can now be added as a list to
`disable`, instead of specifying one keyword argument per component.
```diff
-- doc = nlp("I don't want parsed", parse=False)
-+ doc = nlp("I don't want parsed", disable=["parser"])
+- doc = nlp(u"I don't want parsed", parse=False)
++ doc = nlp(u"I don't want parsed", disable=["parser"])
```
@@ -86,7 +86,7 @@ multiprocessing.
> #### Example
>
> ```python
-> texts = ["One document.", "...", "Lots of documents"]
+> texts = [u"One document.", u"...", u"Lots of documents"]
> for doc in nlp.pipe(texts, batch_size=50):
> assert doc.is_parsed
> ```
@@ -140,7 +140,6 @@ Evaluate a model's pipeline components.
| `batch_size` | int | The batch size to use. |
| `scorer` | `Scorer` | Optional [`Scorer`](/api/scorer) to use. If not passed in, a new one will be created. |
| `component_cfg` 2.1 | dict | Config parameters for specific pipeline components, keyed by component name. |
-| **RETURNS** | Scorer | The scorer containing the evaluation scores. |
## Language.begin_training {#begin_training tag="method"}
@@ -444,16 +443,15 @@ per component.
## Attributes {#attributes}
-| Name | Type | Description |
-| ------------------------------------------ | ----------- | ----------------------------------------------------------------------------------------------- |
-| `vocab` | `Vocab` | A container for the lexical types. |
-| `tokenizer` | `Tokenizer` | The tokenizer. |
-| `make_doc` | `callable` | Callable that takes a unicode text and returns a `Doc`. |
-| `pipeline` | list | List of `(name, component)` tuples describing the current processing pipeline, in order. |
-| `pipe_names` 2 | list | List of pipeline component names, in order. |
-| `pipe_labels` 2.2 | dict | List of labels set by the pipeline components, if available, keyed by component name. |
-| `meta` | dict | Custom meta data for the Language class. If a model is loaded, contains meta data of the model. |
-| `path` 2 | `Path` | Path to the model data directory, if a model is loaded. Otherwise `None`. |
+| Name | Type | Description |
+| --------------------------------------- | ------------------ | ----------------------------------------------------------------------------------------------- |
+| `vocab` | `Vocab` | A container for the lexical types. |
+| `tokenizer` | `Tokenizer` | The tokenizer. |
+| `make_doc` | `lambda text: Doc` | Create a `Doc` object from unicode text. |
+| `pipeline` | list | List of `(name, component)` tuples describing the current processing pipeline, in order. |
+| `pipe_names` 2 | list | List of pipeline component names, in order. |
+| `meta` | dict | Custom meta data for the Language class. If a model is loaded, contains meta data of the model. |
+| `path` 2 | `Path` | Path to the model data directory, if a model is loaded. Otherwise `None`. |
## Class attributes {#class-attributes}
diff --git a/website/docs/api/lemmatizer.md b/website/docs/api/lemmatizer.md
index 805e96b0f..7bc2691e5 100644
--- a/website/docs/api/lemmatizer.md
+++ b/website/docs/api/lemmatizer.md
@@ -35,10 +35,10 @@ Lemmatize a string.
>
> ```python
> from spacy.lemmatizer import Lemmatizer
-> rules = {"noun": [["s", ""]]}
-> lemmatizer = Lemmatizer(index={}, exceptions={}, rules=rules)
-> lemmas = lemmatizer("ducks", "NOUN")
-> assert lemmas == ["duck"]
+> from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES
+> lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
+> lemmas = lemmatizer(u"ducks", u"NOUN")
+> assert lemmas == [u"duck"]
> ```
| Name | Type | Description |
@@ -52,22 +52,21 @@ Lemmatize a string.
Look up a lemma in the lookup table, if available. If no lemma is found, the
original string is returned. Languages can provide a
-[lookup table](/usage/adding-languages#lemmatizer) via the `resources`, set on
-the individual `Language` class.
+[lookup table](/usage/adding-languages#lemmatizer) via the `lemma_lookup`
+variable, set on the individual `Language` class.
> #### Example
>
> ```python
-> lookup = {"going": "go"}
+> lookup = {u"going": u"go"}
> lemmatizer = Lemmatizer(lookup=lookup)
-> assert lemmatizer.lookup("going") == "go"
+> assert lemmatizer.lookup(u"going") == u"go"
> ```
-| Name | Type | Description |
-| ----------- | ------- | ----------------------------------------------------------------------------------------------------------- |
-| `string` | unicode | The string to look up. |
-| `orth` | int | Optional hash of the string to look up. If not set, the string will be used and hashed. Defaults to `None`. |
-| **RETURNS** | unicode | The lemma if the string was found, otherwise the original string. |
+| Name | Type | Description |
+| ----------- | ------- | ----------------------------------------------------------------- |
+| `string` | unicode | The string to look up. |
+| **RETURNS** | unicode | The lemma if the string was found, otherwise the original string. |
## Lemmatizer.is_base_form {#is_base_form tag="method"}
diff --git a/website/docs/api/lexeme.md b/website/docs/api/lexeme.md
index 398b71708..018dc72d8 100644
--- a/website/docs/api/lexeme.md
+++ b/website/docs/api/lexeme.md
@@ -27,7 +27,7 @@ Change the value of a boolean flag.
>
> ```python
> COOL_FLAG = nlp.vocab.add_flag(lambda text: False)
-> nlp.vocab["spaCy"].set_flag(COOL_FLAG, True)
+> nlp.vocab[u'spaCy'].set_flag(COOL_FLAG, True)
> ```
| Name | Type | Description |
@@ -42,9 +42,9 @@ Check the value of a boolean flag.
> #### Example
>
> ```python
-> is_my_library = lambda text: text in ["spaCy", "Thinc"]
+> is_my_library = lambda text: text in [u"spaCy", u"Thinc"]
> MY_LIBRARY = nlp.vocab.add_flag(is_my_library)
-> assert nlp.vocab["spaCy"].check_flag(MY_LIBRARY) == True
+> assert nlp.vocab[u"spaCy"].check_flag(MY_LIBRARY) == True
> ```
| Name | Type | Description |
@@ -59,8 +59,8 @@ Compute a semantic similarity estimate. Defaults to cosine over vectors.
> #### Example
>
> ```python
-> apple = nlp.vocab["apple"]
-> orange = nlp.vocab["orange"]
+> apple = nlp.vocab[u"apple"]
+> orange = nlp.vocab[u"orange"]
> apple_orange = apple.similarity(orange)
> orange_apple = orange.similarity(apple)
> assert apple_orange == orange_apple
@@ -78,7 +78,7 @@ A boolean value indicating whether a word vector is associated with the lexeme.
> #### Example
>
> ```python
-> apple = nlp.vocab["apple"]
+> apple = nlp.vocab[u"apple"]
> assert apple.has_vector
> ```
@@ -93,7 +93,7 @@ A real-valued meaning representation.
> #### Example
>
> ```python
-> apple = nlp.vocab["apple"]
+> apple = nlp.vocab[u"apple"]
> assert apple.vector.dtype == "float32"
> assert apple.vector.shape == (300,)
> ```
@@ -109,8 +109,8 @@ The L2 norm of the lexeme's vector representation.
> #### Example
>
> ```python
-> apple = nlp.vocab["apple"]
-> pasta = nlp.vocab["pasta"]
+> apple = nlp.vocab[u"apple"]
+> pasta = nlp.vocab[u"pasta"]
> apple.vector_norm # 7.1346845626831055
> pasta.vector_norm # 7.759851932525635
> assert apple.vector_norm != pasta.vector_norm
diff --git a/website/docs/api/lookups.md b/website/docs/api/lookups.md
deleted file mode 100644
index 9878546ea..000000000
--- a/website/docs/api/lookups.md
+++ /dev/null
@@ -1,318 +0,0 @@
----
-title: Lookups
-teaser: A container for large lookup tables and dictionaries
-tag: class
-source: spacy/lookups.py
-new: 2.2
----
-
-This class allows convenient access to large lookup tables and dictionaries,
-e.g. lemmatization data or tokenizer exception lists using Bloom filters.
-Lookups are available via the [`Vocab`](/api/vocab) as `vocab.lookups`, so they
-can be accessed before the pipeline components are applied (e.g. in the
-tokenizer and lemmatizer), as well as within the pipeline components via
-`doc.vocab.lookups`.
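
A minimal sketch of that access pattern in isolation (the table name and contents are illustrative):

```python
from spacy.lookups import Lookups

lookups = Lookups()
# Keys are hashed internally; string keys work transparently
table = lookups.add_table("lemma_lookup", {"going": "go", "went": "go"})
assert lookups.has_table("lemma_lookup")
assert table["going"] == "go"

# Round-trip through bytes, e.g. for model packaging
data = lookups.to_bytes()
restored = Lookups().from_bytes(data)
assert restored.get_table("lemma_lookup")["went"] == "go"
```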
-
-## Lookups.\_\_init\_\_ {#init tag="method"}
-
-Create a `Lookups` object.
-
-> #### Example
->
-> ```python
-> from spacy.lookups import Lookups
-> lookups = Lookups()
-> ```
-
-| Name | Type | Description |
-| ----------- | --------- | ----------------------------- |
-| **RETURNS** | `Lookups` | The newly constructed object. |
-
-## Lookups.\_\_len\_\_ {#len tag="method"}
-
-Get the current number of tables in the lookups.
-
-> #### Example
->
-> ```python
-> lookups = Lookups()
-> assert len(lookups) == 0
-> ```
-
-| Name | Type | Description |
-| ----------- | ---- | ------------------------------------ |
-| **RETURNS** | int | The number of tables in the lookups. |
-
-## Lookups.\_\_contains\_\_ {#contains tag="method"}
-
-Check if the lookups contain a table of a given name. Delegates to
-[`Lookups.has_table`](/api/lookups#has_table).
-
-> #### Example
->
-> ```python
-> lookups = Lookups()
-> lookups.add_table("some_table")
-> assert "some_table" in lookups
-> ```
-
-| Name | Type | Description |
-| ----------- | ------- | ----------------------------------------------- |
-| `name` | unicode | Name of the table. |
-| **RETURNS** | bool | Whether a table of that name is in the lookups. |
-
-## Lookups.tables {#tables tag="property"}
-
-Get the names of all tables in the lookups.
-
-> #### Example
->
-> ```python
-> lookups = Lookups()
-> lookups.add_table("some_table")
-> assert lookups.tables == ["some_table"]
-> ```
-
-| Name | Type | Description |
-| ----------- | ---- | ----------------------------------- |
-| **RETURNS** | list | Names of the tables in the lookups. |
-
-## Lookups.add_table {#add_table tag="method"}
-
-Add a new table with optional data to the lookups. Raises an error if the table
-exists.
-
-> #### Example
->
-> ```python
-> lookups = Lookups()
-> lookups.add_table("some_table", {"foo": "bar"})
-> ```
-
-| Name | Type | Description |
-| ----------- | ----------------------------- | ---------------------------------- |
-| `name` | unicode | Unique name of the table. |
-| `data` | dict | Optional data to add to the table. |
-| **RETURNS** | [`Table`](/api/lookups#table) | The newly added table. |
-
-## Lookups.get_table {#get_table tag="method"}
-
-Get a table from the lookups. Raises an error if the table doesn't exist.
-
-> #### Example
->
-> ```python
-> lookups = Lookups()
-> lookups.add_table("some_table", {"foo": "bar"})
-> table = lookups.get_table("some_table")
-> assert table["foo"] == "bar"
-> ```
-
-| Name | Type | Description |
-| ----------- | ----------------------------- | ------------------ |
-| `name` | unicode | Name of the table. |
-| **RETURNS** | [`Table`](/api/lookups#table) | The table. |
-
-## Lookups.remove_table {#remove_table tag="method"}
-
-Remove a table from the lookups. Raises an error if the table doesn't exist.
-
-> #### Example
->
-> ```python
-> lookups = Lookups()
-> lookups.add_table("some_table")
-> removed_table = lookups.remove_table("some_table")
-> assert "some_table" not in lookups
-> ```
-
-| Name | Type | Description |
-| ----------- | ----------------------------- | ---------------------------- |
-| `name` | unicode | Name of the table to remove. |
-| **RETURNS** | [`Table`](/api/lookups#table) | The removed table. |
-
-## Lookups.has_table {#has_table tag="method"}
-
-Check if the lookups contain a table of a given name. Equivalent to
-[`Lookups.__contains__`](/api/lookups#contains).
-
-> #### Example
->
-> ```python
-> lookups = Lookups()
-> lookups.add_table("some_table")
-> assert lookups.has_table("some_table")
-> ```
-
-| Name | Type | Description |
-| ----------- | ------- | ----------------------------------------------- |
-| `name` | unicode | Name of the table. |
-| **RETURNS** | bool | Whether a table of that name is in the lookups. |
-
-## Lookups.to_bytes {#to_bytes tag="method"}
-
-Serialize the lookups to a bytestring.
-
-> #### Example
->
-> ```python
-> lookup_bytes = lookups.to_bytes()
-> ```
-
-| Name | Type | Description |
-| ----------- | ----- | ----------------------- |
-| **RETURNS** | bytes | The serialized lookups. |
-
-## Lookups.from_bytes {#from_bytes tag="method"}
-
-Load the lookups from a bytestring.
-
-> #### Example
->
-> ```python
-> lookup_bytes = lookups.to_bytes()
-> lookups = Lookups()
-> lookups.from_bytes(lookup_bytes)
-> ```
-
-| Name | Type | Description |
-| ------------ | --------- | ---------------------- |
-| `bytes_data` | bytes | The data to load from. |
-| **RETURNS** | `Lookups` | The loaded lookups. |
-
-## Lookups.to_disk {#to_disk tag="method"}
-
-Save the lookups to a directory as `lookups.bin`. Expects a path to a directory,
-which will be created if it doesn't exist.
-
-> #### Example
->
-> ```python
-> lookups.to_disk("/path/to/lookups")
-> ```
-
-| Name | Type | Description |
-| ------ | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
-| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
-
-## Lookups.from_disk {#from_disk tag="method"}
-
-Load lookups from a directory containing a `lookups.bin`. Will skip loading if
-the file doesn't exist.
-
-> #### Example
->
-> ```python
-> from spacy.lookups import Lookups
-> lookups = Lookups()
-> lookups.from_disk("/path/to/lookups")
-> ```
-
-| Name | Type | Description |
-| ----------- | ---------------- | -------------------------------------------------------------------------- |
-| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
-| **RETURNS** | `Lookups` | The loaded lookups. |
-
-## Table {#table tag="class, ordereddict"}
-
-A table in the lookups. Subclass of `OrderedDict` that implements a slightly
-more consistent and unified API and includes a Bloom filter to speed up missed
-lookups. Supports **all other methods and attributes** of `OrderedDict` /
-`dict`, and the customized methods listed here. Methods that get or set keys
-accept both integers and strings (which will be hashed before being added to the
-table).
-
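-A small sketch of the hashed-key behavior described above (`hash_string` is
-used only to show that string keys are stored by their 64-bit hash; the table
-contents are made up):
-
-```python
-from spacy.lookups import Table
-from spacy.strings import hash_string
-
-table = Table(name="demo")
-table.set("cats", "cat")                    # the string key is hashed on insert
-assert table["cats"] == "cat"               # lookup works by string ...
-assert table[hash_string("cats")] == "cat"  # ... or directly by the hash
-```
-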
-### Table.\_\_init\_\_ {#table.init tag="method"}
-
-Initialize a new table.
-
-> #### Example
->
-> ```python
-> from spacy.lookups import Table
-> data = {"foo": "bar", "baz": 100}
-> table = Table(name="some_table", data=data)
-> assert "foo" in table
-> assert table["foo"] == "bar"
-> ```
-
-| Name | Type | Description |
-| ----------- | ------- | ---------------------------------- |
-| `name`      | unicode | Optional table name for reference. |
-| `data`      | dict    | Optional data to add to the table. |
-| **RETURNS** | `Table` | The newly constructed object. |
-
-### Table.from_dict {#table.from_dict tag="classmethod"}
-
-Initialize a new table from a dict.
-
-> #### Example
->
-> ```python
-> from spacy.lookups import Table
-> data = {"foo": "bar", "baz": 100}
-> table = Table.from_dict(data, name="some_table")
-> ```
-
-| Name | Type | Description |
-| ----------- | ------- | ---------------------------------- |
-| `data` | dict | The dictionary. |
-| `name` | unicode | Optional table name for reference. |
-| **RETURNS** | `Table` | The newly constructed object. |
-
-### Table.set {#table.set tag="method"}
-
-Set a new key / value pair. String keys will be hashed. Same as
-`table[key] = value`.
-
-> #### Example
->
-> ```python
-> from spacy.lookups import Table
-> table = Table()
-> table.set("foo", "bar")
-> assert table["foo"] == "bar"
-> ```
-
-| Name | Type | Description |
-| ------- | ------------- | ----------- |
-| `key` | unicode / int | The key. |
-| `value` | - | The value. |
-
-### Table.to_bytes {#table.to_bytes tag="method"}
-
-Serialize the table to a bytestring.
-
-> #### Example
->
-> ```python
-> table_bytes = table.to_bytes()
-> ```
-
-| Name | Type | Description |
-| ----------- | ----- | --------------------- |
-| **RETURNS** | bytes | The serialized table. |
-
-### Table.from_bytes {#table.from_bytes tag="method"}
-
-Load a table from a bytestring.
-
-> #### Example
->
-> ```python
-> table_bytes = table.to_bytes()
-> table = Table()
-> table.from_bytes(table_bytes)
-> ```
-
-| Name | Type | Description |
-| ------------ | ------- | ----------------- |
-| `bytes_data` | bytes | The data to load. |
-| **RETURNS** | `Table` | The loaded table. |
-
-### Attributes {#table-attributes}
-
-| Name | Type | Description |
-| -------------- | --------------------------- | ----------------------------------------------------- |
-| `name` | unicode | Table name. |
-| `default_size` | int                         | Default size of the Bloom filter if no data is provided. |
-| `bloom`        | `preshed.bloom.BloomFilter` | The Bloom filter, used to speed up missed lookups.       |
diff --git a/website/docs/api/matcher.md b/website/docs/api/matcher.md
index 84d9ed888..fb0ba1617 100644
--- a/website/docs/api/matcher.md
+++ b/website/docs/api/matcher.md
@@ -50,7 +50,7 @@ Find all token sequences matching the supplied patterns on the `Doc`.
> matcher = Matcher(nlp.vocab)
> pattern = [{"LOWER": "hello"}, {"LOWER": "world"}]
> matcher.add("HelloWorld", None, pattern)
-> doc = nlp("hello world!")
+> doc = nlp(u'hello world!')
> matches = matcher(doc)
> ```
@@ -147,7 +147,7 @@ overwritten.
> matcher = Matcher(nlp.vocab)
> matcher.add("HelloWorld", on_match, [{"LOWER": "hello"}, {"LOWER": "world"}])
> matcher.add("GoogleMaps", on_match, [{"ORTH": "Google"}, {"ORTH": "Maps"}])
-> doc = nlp("HELLO WORLD on Google Maps.")
+> doc = nlp(u"HELLO WORLD on Google Maps.")
> matches = matcher(doc)
> ```
diff --git a/website/docs/api/phrasematcher.md b/website/docs/api/phrasematcher.md
index 40b8d6c1a..c61fa575d 100644
--- a/website/docs/api/phrasematcher.md
+++ b/website/docs/api/phrasematcher.md
@@ -59,8 +59,8 @@ Find all token sequences matching the supplied patterns on the `Doc`.
> from spacy.matcher import PhraseMatcher
>
> matcher = PhraseMatcher(nlp.vocab)
-> matcher.add("OBAMA", None, nlp("Barack Obama"))
-> doc = nlp("Barack Obama lifts America one last time in emotional farewell")
+> matcher.add("OBAMA", None, nlp(u"Barack Obama"))
+> doc = nlp(u"Barack Obama lifts America one last time in emotional farewell")
> matches = matcher(doc)
> ```
@@ -99,7 +99,7 @@ patterns.
> ```python
> matcher = PhraseMatcher(nlp.vocab)
> assert len(matcher) == 0
-> matcher.add("OBAMA", None, nlp("Barack Obama"))
+> matcher.add("OBAMA", None, nlp(u"Barack Obama"))
> assert len(matcher) == 1
> ```
@@ -116,7 +116,7 @@ Check whether the matcher contains rules for a match ID.
> ```python
> matcher = PhraseMatcher(nlp.vocab)
> assert "OBAMA" not in matcher
-> matcher.add("OBAMA", None, nlp("Barack Obama"))
+> matcher.add("OBAMA", None, nlp(u"Barack Obama"))
> assert "OBAMA" in matcher
> ```
@@ -140,10 +140,10 @@ overwritten.
> print('Matched!', matches)
>
> matcher = PhraseMatcher(nlp.vocab)
-> matcher.add("OBAMA", on_match, nlp("Barack Obama"))
-> matcher.add("HEALTH", on_match, nlp("health care reform"),
-> nlp("healthcare reform"))
-> doc = nlp("Barack Obama urges Congress to find courage to defend his healthcare reforms")
+> matcher.add("OBAMA", on_match, nlp(u"Barack Obama"))
+> matcher.add("HEALTH", on_match, nlp(u"health care reform"),
+> nlp(u"healthcare reform"))
+> doc = nlp(u"Barack Obama urges Congress to find courage to defend his healthcare reforms")
> matches = matcher(doc)
> ```
@@ -152,22 +152,3 @@ overwritten.
| `match_id` | unicode | An ID for the thing you're matching. |
| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
| `*docs` | list | `Doc` objects of the phrases to match. |
-
-## PhraseMatcher.remove {#remove tag="method" new="2.2"}
-
-Remove a rule from the matcher by match ID. A `KeyError` is raised if the key
-does not exist.
-
-> #### Example
->
-> ```python
-> matcher = PhraseMatcher(nlp.vocab)
-> matcher.add("OBAMA", None, nlp("Barack Obama"))
-> assert "OBAMA" in matcher
-> matcher.remove("OBAMA")
-> assert "OBAMA" not in matcher
-> ```
-
-| Name | Type | Description |
-| ----- | ------- | ------------------------- |
-| `key` | unicode | The ID of the match rule. |
diff --git a/website/docs/api/pipeline-functions.md b/website/docs/api/pipeline-functions.md
index 6e2b473b1..63b3cd164 100644
--- a/website/docs/api/pipeline-functions.md
+++ b/website/docs/api/pipeline-functions.md
@@ -17,13 +17,13 @@ the processing pipeline using [`nlp.add_pipe`](/api/language#add_pipe).
> #### Example
>
> ```python
-> texts = [t.text for t in nlp("I have a blue car")]
+> texts = [t.text for t in nlp(u"I have a blue car")]
> assert texts == ["I", "have", "a", "blue", "car"]
>
> merge_nps = nlp.create_pipe("merge_noun_chunks")
> nlp.add_pipe(merge_nps)
>
-> texts = [t.text for t in nlp("I have a blue car")]
+> texts = [t.text for t in nlp(u"I have a blue car")]
> assert texts == ["I", "have", "a blue car"]
> ```
@@ -50,13 +50,13 @@ the processing pipeline using [`nlp.add_pipe`](/api/language#add_pipe).
> #### Example
>
> ```python
-> texts = [t.text for t in nlp("I like David Bowie")]
+> texts = [t.text for t in nlp(u"I like David Bowie")]
> assert texts == ["I", "like", "David", "Bowie"]
>
> merge_ents = nlp.create_pipe("merge_entities")
> nlp.add_pipe(merge_ents)
>
-> texts = [t.text for t in nlp("I like David Bowie")]
+> texts = [t.text for t in nlp(u"I like David Bowie")]
> assert texts == ["I", "like", "David Bowie"]
> ```
diff --git a/website/docs/api/scorer.md b/website/docs/api/scorer.md
index 35348217b..2af4ec0ce 100644
--- a/website/docs/api/scorer.md
+++ b/website/docs/api/scorer.md
@@ -46,16 +46,14 @@ Update the evaluation scores from a single [`Doc`](/api/doc) /
## Properties
-| Name | Type | Description |
-| ----------------------------------------------- | ----- | --------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `token_acc` | float | Tokenization accuracy. |
-| `tags_acc` | float | Part-of-speech tag accuracy (fine grained tags, i.e. `Token.tag`). |
-| `uas` | float | Unlabelled dependency score. |
-| `las` | float | Labelled dependency score. |
-| `ents_p` | float | Named entity accuracy (precision). |
-| `ents_r` | float | Named entity accuracy (recall). |
-| `ents_f` | float | Named entity accuracy (F-score). |
-| `ents_per_type` 2.1.5 | dict | Scores per entity label. Keyed by label, mapped to a dict of `p`, `r` and `f` scores. |
-| `textcat_score` 2.2 | float | F-score on positive label for binary exclusive, macro-averaged F-score for 3+ exclusive, macro-averaged AUC ROC score for multilabel (`-1` if undefined). |
-| `textcats_per_cat` 2.2 | dict | Scores per textcat label, keyed by label. |
-| `scores` | dict | All scores, keyed by type. |
+| Name | Type | Description |
+| ---------------------------------------------- | ----- | ------------------------------------------------------------------------------------------------------------- |
+| `token_acc` | float | Tokenization accuracy. |
+| `tags_acc` | float | Part-of-speech tag accuracy (fine grained tags, i.e. `Token.tag`). |
+| `uas` | float | Unlabelled dependency score. |
+| `las` | float | Labelled dependency score. |
+| `ents_p` | float | Named entity accuracy (precision). |
+| `ents_r` | float | Named entity accuracy (recall). |
+| `ents_f` | float | Named entity accuracy (F-score). |
+| `ents_per_type` 2.1.5 | dict | Scores per entity label. Keyed by label, mapped to a dict of `p`, `r` and `f` scores. |
+| `scores` | dict | All scores with keys `uas`, `las`, `ents_p`, `ents_r`, `ents_f`, `ents_per_type`, `tags_acc` and `token_acc`. |
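+
+A short sketch of reading these properties off a `Scorer` (here the scorer is
+assumed to come back from [`Language.evaluate`](/api/language#evaluate), and
+`dev_data` stands in for an iterable of `(doc, gold)` pairs you already have):
+
+```python
+scorer = nlp.evaluate(dev_data)
+print(scorer.token_acc, scorer.tags_acc)            # tokenization / tagging
+print(scorer.uas, scorer.las)                       # dependency scores
+print(scorer.ents_p, scorer.ents_r, scorer.ents_f)  # NER precision / recall / F
+print(scorer.scores)                                # everything above, keyed by name
+```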
diff --git a/website/docs/api/sentencizer.md b/website/docs/api/sentencizer.md
index 237cd6a8a..26d205c24 100644
--- a/website/docs/api/sentencizer.md
+++ b/website/docs/api/sentencizer.md
@@ -59,7 +59,7 @@ the component has been added to the pipeline using
> nlp = English()
> sentencizer = nlp.create_pipe("sentencizer")
> nlp.add_pipe(sentencizer)
-> doc = nlp("This is a sentence. This is another sentence.")
+> doc = nlp(u"This is a sentence. This is another sentence.")
> assert len(list(doc.sents)) == 2
> ```
diff --git a/website/docs/api/span.md b/website/docs/api/span.md
index 64b77b89d..c807c7bbf 100644
--- a/website/docs/api/span.md
+++ b/website/docs/api/span.md
@@ -13,20 +13,19 @@ Create a Span object from the slice `doc[start : end]`.
> #### Example
>
> ```python
-> doc = nlp("Give it back! He pleaded.")
+> doc = nlp(u"Give it back! He pleaded.")
> span = doc[1:4]
-> assert [t.text for t in span] == ["it", "back", "!"]
+> assert [t.text for t in span] == [u"it", u"back", u"!"]
> ```
-| Name | Type | Description |
-| ----------- | ---------------------------------------- | ----------------------------------------------------------------------------------------------------------------- |
-| `doc` | `Doc` | The parent document. |
-| `start` | int | The index of the first token of the span. |
-| `end` | int | The index of the first token after the span. |
-| `label` | int / unicode | A label to attach to the span, e.g. for named entities. As of v2.1, the label can also be a unicode string. |
-| `kb_id` | int / unicode | A knowledge base ID to attach to the span, e.g. for named entities. The ID can be an integer or a unicode string. |
-| `vector` | `numpy.ndarray[ndim=1, dtype='float32']` | A meaning representation of the span. |
-| **RETURNS** | `Span` | The newly constructed object. |
+| Name | Type | Description |
+| ----------- | ---------------------------------------- | ----------------------------------------------------------------------------------------------------------- |
+| `doc` | `Doc` | The parent document. |
+| `start` | int | The index of the first token of the span. |
+| `end` | int | The index of the first token after the span. |
+| `label` | int / unicode | A label to attach to the span, e.g. for named entities. As of v2.1, the label can also be a unicode string. |
+| `vector` | `numpy.ndarray[ndim=1, dtype='float32']` | A meaning representation of the span. |
+| **RETURNS** | `Span` | The newly constructed object. |
## Span.\_\_getitem\_\_ {#getitem tag="method"}
@@ -35,7 +34,7 @@ Get a `Token` object.
> #### Example
>
> ```python
-> doc = nlp("Give it back! He pleaded.")
+> doc = nlp(u"Give it back! He pleaded.")
> span = doc[1:4]
> assert span[1].text == "back"
> ```
@@ -50,9 +49,9 @@ Get a `Span` object.
> #### Example
>
> ```python
-> doc = nlp("Give it back! He pleaded.")
+> doc = nlp(u"Give it back! He pleaded.")
> span = doc[1:4]
-> assert span[1:3].text == "back!"
+> assert span[1:3].text == u"back!"
> ```
| Name | Type | Description |
@@ -67,9 +66,9 @@ Iterate over `Token` objects.
> #### Example
>
> ```python
-> doc = nlp("Give it back! He pleaded.")
+> doc = nlp(u"Give it back! He pleaded.")
> span = doc[1:4]
-> assert [t.text for t in span] == ["it", "back", "!"]
+> assert [t.text for t in span] == [u"it", u"back", u"!"]
> ```
| Name | Type | Description |
@@ -83,7 +82,7 @@ Get the number of tokens in the span.
> #### Example
>
> ```python
-> doc = nlp("Give it back! He pleaded.")
+> doc = nlp(u"Give it back! He pleaded.")
> span = doc[1:4]
> assert len(span) == 3
> ```
@@ -102,9 +101,9 @@ For details, see the documentation on
>
> ```python
> from spacy.tokens import Span
-> city_getter = lambda span: any(city in span.text for city in ("New York", "Paris", "Berlin"))
+> city_getter = lambda span: any(city in span.text for city in (u"New York", u"Paris", u"Berlin"))
> Span.set_extension("has_city", getter=city_getter)
-> doc = nlp("I like New York in Autumn")
+> doc = nlp(u"I like New York in Autumn")
> assert doc[1:4]._.has_city
> ```
@@ -180,7 +179,7 @@ using an average of word vectors.
> #### Example
>
> ```python
-> doc = nlp("green apples and red oranges")
+> doc = nlp(u"green apples and red oranges")
> green_apples = doc[:2]
> red_oranges = doc[3:]
> apples_oranges = green_apples.similarity(red_oranges)
@@ -202,7 +201,7 @@ ancestor is found, e.g. if span excludes a necessary ancestor.
> #### Example
>
> ```python
-> doc = nlp("I like New York in Autumn")
+> doc = nlp(u"I like New York in Autumn")
> span = doc[1:4]
> matrix = span.get_lca_matrix()
> # array([[0, 0, 0], [0, 1, 2], [0, 2, 2]], dtype=int32)
@@ -222,7 +221,7 @@ shape `(N, M)`, where `N` is the length of the document. The values will be
>
> ```python
> from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
-> doc = nlp("I like New York in Autumn.")
+> doc = nlp(u"I like New York in Autumn.")
> span = doc[2:3]
> # All strings mapped to integers, for easy export to numpy
> np_array = span.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
@@ -248,11 +247,11 @@ Retokenize the document, such that the span is merged into a single token.
> #### Example
>
> ```python
-> doc = nlp("I like New York in Autumn.")
+> doc = nlp(u"I like New York in Autumn.")
> span = doc[2:4]
> span.merge()
> assert len(doc) == 6
-> assert doc[2].text == "New York"
+> assert doc[2].text == u"New York"
> ```
| Name | Type | Description |
@@ -268,12 +267,12 @@ if the entity recognizer has been applied.
> #### Example
>
> ```python
-> doc = nlp("Mr. Best flew to New York on Saturday morning.")
+> doc = nlp(u"Mr. Best flew to New York on Saturday morning.")
> span = doc[0:6]
> ents = list(span.ents)
> assert ents[0].label == 346
> assert ents[0].label_ == "PERSON"
-> assert ents[0].text == "Mr. Best"
+> assert ents[0].text == u"Mr. Best"
> ```
| Name | Type | Description |
@@ -287,10 +286,10 @@ Create a new `Doc` object corresponding to the `Span`, with a copy of the data.
> #### Example
>
> ```python
-> doc = nlp("I like New York in Autumn.")
+> doc = nlp(u"I like New York in Autumn.")
> span = doc[2:4]
> doc2 = span.as_doc()
-> assert doc2.text == "New York"
+> assert doc2.text == u"New York"
> ```
| Name | Type | Description |
@@ -307,12 +306,12 @@ taken.
> #### Example
>
> ```python
-> doc = nlp("I like New York in Autumn.")
+> doc = nlp(u"I like New York in Autumn.")
> i, like, new, york, in_, autumn, dot = range(len(doc))
-> assert doc[new].head.text == "York"
-> assert doc[york].head.text == "like"
+> assert doc[new].head.text == u"York"
+> assert doc[york].head.text == u"like"
> new_york = doc[new:york+1]
-> assert new_york.root.text == "York"
+> assert new_york.root.text == u"York"
> ```
| Name | Type | Description |
@@ -326,9 +325,9 @@ A tuple of tokens coordinated to `span.root`.
> #### Example
>
> ```python
-> doc = nlp("I like apples and oranges")
+> doc = nlp(u"I like apples and oranges")
> apples_conjuncts = doc[2:3].conjuncts
-> assert [t.text for t in apples_conjuncts] == ["oranges"]
+> assert [t.text for t in apples_conjuncts] == [u"oranges"]
> ```
| Name | Type | Description |
@@ -342,9 +341,9 @@ Tokens that are to the left of the span, whose heads are within the span.
> #### Example
>
> ```python
-> doc = nlp("I like New York in Autumn.")
+> doc = nlp(u"I like New York in Autumn.")
> lefts = [t.text for t in doc[3:7].lefts]
-> assert lefts == ["New"]
+> assert lefts == [u"New"]
> ```
| Name | Type | Description |
@@ -358,9 +357,9 @@ Tokens that are to the right of the span, whose heads are within the span.
> #### Example
>
> ```python
-> doc = nlp("I like New York in Autumn.")
+> doc = nlp(u"I like New York in Autumn.")
> rights = [t.text for t in doc[2:4].rights]
-> assert rights == ["in"]
+> assert rights == [u"in"]
> ```
| Name | Type | Description |
@@ -375,7 +374,7 @@ the span.
> #### Example
>
> ```python
-> doc = nlp("I like New York in Autumn.")
+> doc = nlp(u"I like New York in Autumn.")
> assert doc[3:7].n_lefts == 1
> ```
@@ -391,7 +390,7 @@ the span.
> #### Example
>
> ```python
-> doc = nlp("I like New York in Autumn.")
+> doc = nlp(u"I like New York in Autumn.")
> assert doc[2:4].n_rights == 1
> ```
@@ -406,9 +405,9 @@ Tokens within the span and tokens which descend from them.
> #### Example
>
> ```python
-> doc = nlp("Give it back! He pleaded.")
+> doc = nlp(u"Give it back! He pleaded.")
> subtree = [t.text for t in doc[:3].subtree]
-> assert subtree == ["Give", "it", "back", "!"]
+> assert subtree == [u"Give", u"it", u"back", u"!"]
> ```
| Name | Type | Description |
@@ -422,7 +421,7 @@ A boolean value indicating whether a word vector is associated with the object.
> #### Example
>
> ```python
-> doc = nlp("I like apples")
+> doc = nlp(u"I like apples")
> assert doc[1:].has_vector
> ```
@@ -438,7 +437,7 @@ vectors.
> #### Example
>
> ```python
-> doc = nlp("I like apples")
+> doc = nlp(u"I like apples")
> assert doc[1:].vector.dtype == "float32"
> assert doc[1:].vector.shape == (300,)
> ```
@@ -454,7 +453,7 @@ The L2 norm of the span's vector representation.
> #### Example
>
> ```python
-> doc = nlp("I like apples")
+> doc = nlp(u"I like apples")
> doc[1:].vector_norm # 4.800883928527915
> doc[2:].vector_norm # 6.895897646384268
> assert doc[1:].vector_norm != doc[2:].vector_norm
@@ -479,11 +478,9 @@ The L2 norm of the span's vector representation.
| `text_with_ws` | unicode | The text content of the span with a trailing whitespace character if the last token has one. |
| `orth` | int | ID of the verbatim text content. |
| `orth_` | unicode | Verbatim text content (identical to `Span.text`). Exists mostly for consistency with the other attributes. |
-| `label` | int | The hash value of the span's label. |
+| `label` | int | The span's label. |
| `label_` | unicode | The span's label. |
| `lemma_` | unicode | The span's lemma. |
-| `kb_id` | int | The hash value of the knowledge base ID referred to by the span. |
-| `kb_id_` | unicode | The knowledge base ID referred to by the span. |
| `ent_id` | int | The hash value of the named entity the token is an instance of. |
| `ent_id_` | unicode | The string ID of the named entity the token is an instance of. |
| `sentiment` | float | A scalar value indicating the positivity or negativity of the span. |
diff --git a/website/docs/api/stringstore.md b/website/docs/api/stringstore.md
index 268f19125..40d27a62a 100644
--- a/website/docs/api/stringstore.md
+++ b/website/docs/api/stringstore.md
@@ -16,7 +16,7 @@ Create the `StringStore`.
>
> ```python
> from spacy.strings import StringStore
-> stringstore = StringStore(["apple", "orange"])
+> stringstore = StringStore([u"apple", u"orange"])
> ```
| Name | Type | Description |
@@ -31,7 +31,7 @@ Get the number of strings in the store.
> #### Example
>
> ```python
-> stringstore = StringStore(["apple", "orange"])
+> stringstore = StringStore([u"apple", u"orange"])
> assert len(stringstore) == 2
> ```
@@ -46,10 +46,10 @@ Retrieve a string from a given hash, or vice versa.
> #### Example
>
> ```python
-> stringstore = StringStore(["apple", "orange"])
-> apple_hash = stringstore["apple"]
+> stringstore = StringStore([u"apple", u"orange"])
+> apple_hash = stringstore[u"apple"]
> assert apple_hash == 8566208034543834098
-> assert stringstore[apple_hash] == "apple"
+> assert stringstore[apple_hash] == u"apple"
> ```
| Name | Type | Description |
@@ -64,9 +64,9 @@ Check whether a string is in the store.
> #### Example
>
> ```python
-> stringstore = StringStore(["apple", "orange"])
-> assert "apple" in stringstore
-> assert not "cherry" in stringstore
+> stringstore = StringStore([u"apple", u"orange"])
+> assert u"apple" in stringstore
+> assert not u"cherry" in stringstore
> ```
| Name | Type | Description |
@@ -82,9 +82,9 @@ store will always include an empty string `''` at position `0`.
> #### Example
>
> ```python
-> stringstore = StringStore(["apple", "orange"])
+> stringstore = StringStore([u"apple", u"orange"])
> all_strings = [s for s in stringstore]
-> assert all_strings == ["apple", "orange"]
+> assert all_strings == [u"apple", u"orange"]
> ```
| Name | Type | Description |
@@ -98,12 +98,12 @@ Add a string to the `StringStore`.
> #### Example
>
> ```python
-> stringstore = StringStore(["apple", "orange"])
-> banana_hash = stringstore.add("banana")
+> stringstore = StringStore([u"apple", u"orange"])
+> banana_hash = stringstore.add(u"banana")
> assert len(stringstore) == 3
> assert banana_hash == 2525716904149915114
-> assert stringstore[banana_hash] == "banana"
-> assert stringstore["banana"] == banana_hash
+> assert stringstore[banana_hash] == u"banana"
+> assert stringstore[u"banana"] == banana_hash
> ```
| Name | Type | Description |
@@ -182,7 +182,7 @@ Get a 64-bit hash for a given string.
>
> ```python
> from spacy.strings import hash_string
-> assert hash_string("apple") == 8566208034543834098
+> assert hash_string(u"apple") == 8566208034543834098
> ```
| Name | Type | Description |
diff --git a/website/docs/api/tagger.md b/website/docs/api/tagger.md
index bd3382f89..a1d921b41 100644
--- a/website/docs/api/tagger.md
+++ b/website/docs/api/tagger.md
@@ -57,7 +57,7 @@ and all pipeline components are applied to the `Doc` in order. Both
>
> ```python
> tagger = Tagger(nlp.vocab)
-> doc = nlp("This is a sentence.")
+> doc = nlp(u"This is a sentence.")
> # This usually happens under the hood
> processed = tagger(doc)
> ```
@@ -97,7 +97,7 @@ Apply the pipeline's model to a batch of docs, without modifying them.
>
> ```python
> tagger = Tagger(nlp.vocab)
-> scores, tensors = tagger.predict([doc1, doc2])
+> scores = tagger.predict([doc1, doc2])
> ```
| Name | Type | Description |
@@ -113,15 +113,14 @@ Modify a batch of documents, using pre-computed scores.
>
> ```python
> tagger = Tagger(nlp.vocab)
-> scores, tensors = tagger.predict([doc1, doc2])
-> tagger.set_annotations([doc1, doc2], scores, tensors)
+> scores = tagger.predict([doc1, doc2])
+> tagger.set_annotations([doc1, doc2], scores)
> ```
-| Name | Type | Description |
-| --------- | -------- | ----------------------------------------------------- |
-| `docs` | iterable | The documents to modify. |
-| `scores` | - | The scores to set, produced by `Tagger.predict`. |
-| `tensors` | iterable | The token representations used to predict the scores. |
+| Name | Type | Description |
+| -------- | -------- | ------------------------------------------------ |
+| `docs` | iterable | The documents to modify. |
+| `scores` | - | The scores to set, produced by `Tagger.predict`. |
## Tagger.update {#update tag="method"}
diff --git a/website/docs/api/textcategorizer.md b/website/docs/api/textcategorizer.md
index 1a0280265..310122b9c 100644
--- a/website/docs/api/textcategorizer.md
+++ b/website/docs/api/textcategorizer.md
@@ -75,7 +75,7 @@ delegate to the [`predict`](/api/textcategorizer#predict) and
>
> ```python
> textcat = TextCategorizer(nlp.vocab)
-> doc = nlp("This is a sentence.")
+> doc = nlp(u"This is a sentence.")
> # This usually happens under the hood
> processed = textcat(doc)
> ```
@@ -116,7 +116,7 @@ Apply the pipeline's model to a batch of docs, without modifying them.
>
> ```python
> textcat = TextCategorizer(nlp.vocab)
-> scores, tensors = textcat.predict([doc1, doc2])
+> scores = textcat.predict([doc1, doc2])
> ```
| Name | Type | Description |
@@ -132,15 +132,14 @@ Modify a batch of documents, using pre-computed scores.
>
> ```python
> textcat = TextCategorizer(nlp.vocab)
-> scores, tensors = textcat.predict([doc1, doc2])
-> textcat.set_annotations([doc1, doc2], scores, tensors)
+> scores = textcat.predict([doc1, doc2])
+> textcat.set_annotations([doc1, doc2], scores)
> ```
-| Name | Type | Description |
-| --------- | -------- | --------------------------------------------------------- |
-| `docs` | iterable | The documents to modify. |
-| `scores` | - | The scores to set, produced by `TextCategorizer.predict`. |
-| `tensors` | iterable | The token representations used to predict the scores. |
+| Name | Type | Description |
+| -------- | -------- | --------------------------------------------------------- |
+| `docs` | iterable | The documents to modify. |
+| `scores` | - | The scores to set, produced by `TextCategorizer.predict`. |
## TextCategorizer.update {#update tag="method"}
@@ -228,13 +227,13 @@ Modify the pipe's model, to use the given parameter values.
>
> ```python
> textcat = TextCategorizer(nlp.vocab)
-> with textcat.use_params(optimizer.averages):
+> with textcat.use_params(optimizer.averages):
> textcat.to_disk("/best_model")
> ```
| Name | Type | Description |
| -------- | ---- | ---------------------------------------------------------------------------------------------------------- |
-| `params` | dict | The parameter values to use in the model. At the end of the context, the original parameters are restored. |
+| `params` | - | The parameter values to use in the model. At the end of the context, the original parameters are restored. |
## TextCategorizer.add_label {#add_label tag="method"}
diff --git a/website/docs/api/token.md b/website/docs/api/token.md
index 8d7ee5928..24816b401 100644
--- a/website/docs/api/token.md
+++ b/website/docs/api/token.md
@@ -12,9 +12,9 @@ Construct a `Token` object.
> #### Example
>
> ```python
-> doc = nlp("Give it back! He pleaded.")
+> doc = nlp(u"Give it back! He pleaded.")
> token = doc[0]
-> assert token.text == "Give"
+> assert token.text == u"Give"
> ```
| Name | Type | Description |
@@ -31,7 +31,7 @@ The number of unicode characters in the token, i.e. `token.text`.
> #### Example
>
> ```python
-> doc = nlp("Give it back! He pleaded.")
+> doc = nlp(u"Give it back! He pleaded.")
> token = doc[0]
> assert len(token) == 4
> ```
@@ -50,9 +50,9 @@ For details, see the documentation on
>
> ```python
> from spacy.tokens import Token
-> fruit_getter = lambda token: token.text in ("apple", "pear", "banana")
+> fruit_getter = lambda token: token.text in (u"apple", u"pear", u"banana")
> Token.set_extension("is_fruit", getter=fruit_getter)
-> doc = nlp("I have an apple")
+> doc = nlp(u"I have an apple")
> assert doc[3]._.is_fruit
> ```
@@ -128,7 +128,7 @@ Check the value of a boolean flag.
>
> ```python
> from spacy.attrs import IS_TITLE
-> doc = nlp("Give it back! He pleaded.")
+> doc = nlp(u"Give it back! He pleaded.")
> token = doc[0]
> assert token.check_flag(IS_TITLE) == True
> ```
@@ -145,7 +145,7 @@ Compute a semantic similarity estimate. Defaults to cosine over vectors.
> #### Example
>
> ```python
-> apples, _, oranges = nlp("apples and oranges")
+> apples, _, oranges = nlp(u"apples and oranges")
> apples_oranges = apples.similarity(oranges)
> oranges_apples = oranges.similarity(apples)
> assert apples_oranges == oranges_apples
@@ -163,9 +163,9 @@ Get a neighboring token.
> #### Example
>
> ```python
-> doc = nlp("Give it back! He pleaded.")
+> doc = nlp(u"Give it back! He pleaded.")
> give_nbor = doc[0].nbor()
-> assert give_nbor.text == "it"
+> assert give_nbor.text == u"it"
> ```
| Name | Type | Description |
@@ -181,7 +181,7 @@ dependency tree.
> #### Example
>
> ```python
-> doc = nlp("Give it back! He pleaded.")
+> doc = nlp(u"Give it back! He pleaded.")
> give = doc[0]
> it = doc[1]
> assert give.is_ancestor(it)
@@ -199,11 +199,11 @@ The rightmost token of this token's syntactic descendants.
> #### Example
>
> ```python
-> doc = nlp("Give it back! He pleaded.")
+> doc = nlp(u"Give it back! He pleaded.")
> it_ancestors = doc[1].ancestors
-> assert [t.text for t in it_ancestors] == ["Give"]
+> assert [t.text for t in it_ancestors] == [u"Give"]
> he_ancestors = doc[4].ancestors
-> assert [t.text for t in he_ancestors] == ["pleaded"]
+> assert [t.text for t in he_ancestors] == [u"pleaded"]
> ```
| Name | Type | Description |
@@ -217,9 +217,9 @@ A tuple of coordinated tokens, not including the token itself.
> #### Example
>
> ```python
-> doc = nlp("I like apples and oranges")
+> doc = nlp(u"I like apples and oranges")
> apples_conjuncts = doc[2].conjuncts
-> assert [t.text for t in apples_conjuncts] == ["oranges"]
+> assert [t.text for t in apples_conjuncts] == [u"oranges"]
> ```
| Name | Type | Description |
@@ -233,9 +233,9 @@ A sequence of the token's immediate syntactic children.
> #### Example
>
> ```python
-> doc = nlp("Give it back! He pleaded.")
+> doc = nlp(u"Give it back! He pleaded.")
> give_children = doc[0].children
-> assert [t.text for t in give_children] == ["it", "back", "!"]
+> assert [t.text for t in give_children] == [u"it", u"back", u"!"]
> ```
| Name | Type | Description |
@@ -249,9 +249,9 @@ The leftward immediate children of the word, in the syntactic dependency parse.
> #### Example
>
> ```python
-> doc = nlp("I like New York in Autumn.")
+> doc = nlp(u"I like New York in Autumn.")
> lefts = [t.text for t in doc[3].lefts]
-> assert lefts == ["New"]
+> assert lefts == [u'New']
> ```
| Name | Type | Description |
@@ -265,9 +265,9 @@ The rightward immediate children of the word, in the syntactic dependency parse.
> #### Example
>
> ```python
-> doc = nlp("I like New York in Autumn.")
+> doc = nlp(u"I like New York in Autumn.")
> rights = [t.text for t in doc[3].rights]
-> assert rights == ["in"]
+> assert rights == [u"in"]
> ```
| Name | Type | Description |
@@ -282,7 +282,7 @@ dependency parse.
> #### Example
>
> ```python
-> doc = nlp("I like New York in Autumn.")
+> doc = nlp(u"I like New York in Autumn.")
> assert doc[3].n_lefts == 1
> ```
@@ -298,7 +298,7 @@ dependency parse.
> #### Example
>
> ```python
-> doc = nlp("I like New York in Autumn.")
+> doc = nlp(u"I like New York in Autumn.")
> assert doc[3].n_rights == 1
> ```
@@ -313,9 +313,9 @@ A sequence containing the token and all the token's syntactic descendants.
> #### Example
>
> ```python
-> doc = nlp("Give it back! He pleaded.")
+> doc = nlp(u"Give it back! He pleaded.")
> give_subtree = doc[0].subtree
-> assert [t.text for t in give_subtree] == ["Give", "it", "back", "!"]
+> assert [t.text for t in give_subtree] == [u"Give", u"it", u"back", u"!"]
> ```
| Name | Type | Description |
@@ -330,7 +330,7 @@ unknown. Defaults to `True` for the first token in the `Doc`.
> #### Example
>
> ```python
-> doc = nlp("Give it back! He pleaded.")
+> doc = nlp(u"Give it back! He pleaded.")
> assert doc[4].is_sent_start
> assert not doc[5].is_sent_start
> ```
@@ -361,7 +361,7 @@ A boolean value indicating whether a word vector is associated with the token.
> #### Example
>
> ```python
-> doc = nlp("I like apples")
+> doc = nlp(u"I like apples")
> apples = doc[2]
> assert apples.has_vector
> ```
@@ -377,7 +377,7 @@ A real-valued meaning representation.
> #### Example
>
> ```python
-> doc = nlp("I like apples")
+> doc = nlp(u"I like apples")
> apples = doc[2]
> assert apples.vector.dtype == "float32"
> assert apples.vector.shape == (300,)
@@ -394,7 +394,7 @@ The L2 norm of the token's vector representation.
> #### Example
>
> ```python
-> doc = nlp("I like apples and pasta")
+> doc = nlp(u"I like apples and pasta")
> apples = doc[2]
> pasta = doc[4]
> apples.vector_norm # 6.89589786529541
@@ -425,10 +425,8 @@ The L2 norm of the token's vector representation.
| `i` | int | The index of the token within the parent document. |
| `ent_type` | int | Named entity type. |
| `ent_type_` | unicode | Named entity type. |
-| `ent_iob` | int | IOB code of named entity tag. `3` means the token begins an entity, `2` means it is outside an entity, `1` means it is inside an entity, and `0` means no entity tag is set. |
+| `ent_iob`                            | int     | IOB code of named entity tag. `3` means the token begins an entity, `2` means it is outside an entity, `1` means it is inside an entity, and `0` means no entity tag is set. |
| `ent_iob_` | unicode | IOB code of named entity tag. "B" means the token begins an entity, "I" means it is inside an entity, "O" means it is outside an entity, and "" means no entity tag is set. |
-| `ent_kb_id` 2.2 | int | Knowledge base ID that refers to the named entity this token is a part of, if any. |
-| `ent_kb_id_` 2.2 | unicode | Knowledge base ID that refers to the named entity this token is a part of, if any. |
| `ent_id` | int | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. |
| `ent_id_` | unicode | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. |
| `lemma` | int | Base form of the token, with no inflectional suffixes. |
diff --git a/website/docs/api/tokenizer.md b/website/docs/api/tokenizer.md
index d6ab73f14..ce1ba9a21 100644
--- a/website/docs/api/tokenizer.md
+++ b/website/docs/api/tokenizer.md
@@ -5,9 +5,7 @@ tag: class
source: spacy/tokenizer.pyx
---
-Segment text, and create `Doc` objects with the discovered segment boundaries.
-For a deeper understanding, see the docs on
-[how spaCy's tokenizer works](/usage/linguistic-features#how-tokenizer-works).
+Segment text, and create `Doc` objects with the discovered segment boundaries. For a deeper understanding, see the docs on [how spaCy's tokenizer works](/usage/linguistic-features#how-tokenizer-works).
## Tokenizer.\_\_init\_\_ {#init tag="method"}
@@ -51,7 +49,7 @@ Tokenize a string.
> #### Example
>
> ```python
-> tokens = tokenizer("This is a sentence")
+> tokens = tokenizer(u"This is a sentence")
> assert len(tokens) == 4
> ```
@@ -67,7 +65,7 @@ Tokenize a stream of texts.
> #### Example
>
> ```python
-> texts = ["One document.", "...", "Lots of documents"]
+> texts = [u"One document.", u"...", u"Lots of documents"]
> for doc in tokenizer.pipe(texts, batch_size=50):
> pass
> ```
@@ -111,15 +109,14 @@ if no suffix rules match.
Add a special-case tokenization rule. This mechanism is also used to add custom
tokenizer exceptions to the language data. See the usage guide on
-[adding languages](/usage/adding-languages#tokenizer-exceptions) and
-[linguistic features](/usage/linguistic-features#special-cases) for more details
-and examples.
+[adding languages](/usage/adding-languages#tokenizer-exceptions) and [linguistic features](/usage/linguistic-features#special-cases) for more
+details and examples.
> #### Example
>
> ```python
-> from spacy.attrs import ORTH, NORM
-> case = [{ORTH: "do"}, {ORTH: "n't", NORM: "not"}]
+> from spacy.attrs import ORTH, LEMMA
+> case = [{ORTH: "do"}, {ORTH: "n't", LEMMA: "not"}]
> tokenizer.add_special_case("don't", case)
> ```
diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md
index 50ba0e3d9..9d166a5c5 100644
--- a/website/docs/api/top-level.md
+++ b/website/docs/api/top-level.md
@@ -112,10 +112,10 @@ list of available terms, see
> #### Example
>
> ```python
-> spacy.explain("NORP")
+> spacy.explain(u"NORP")
> # Nationalities or religious or political groups
>
-> doc = nlp("Hello world")
+> doc = nlp(u"Hello world")
> for word in doc:
> print(word.text, word.tag_, spacy.explain(word.tag_))
> # Hello UH interjection
@@ -181,8 +181,8 @@ browser. Will run a simple web server.
> import spacy
> from spacy import displacy
> nlp = spacy.load("en_core_web_sm")
-> doc1 = nlp("This is a sentence.")
-> doc2 = nlp("This is another sentence.")
+> doc1 = nlp(u"This is a sentence.")
+> doc2 = nlp(u"This is another sentence.")
> displacy.serve([doc1, doc2], style="dep")
> ```
@@ -192,7 +192,7 @@ browser. Will run a simple web server.
| `style` | unicode | Visualization style, `'dep'` or `'ent'`. | `'dep'` |
| `page` | bool | Render markup as full HTML page. | `True` |
| `minify` | bool | Minify HTML markup. | `False` |
-| `options` | dict | [Visualizer-specific options](#displacy_options), e.g. colors. | `{}` |
+| `options` | dict | [Visualizer-specific options](#displacy_options), e.g. colors. | `{}` |
| `manual` | bool | Don't parse `Doc` and instead, expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. | `False` |
| `port` | int | Port to serve visualization. | `5000` |
| `host` | unicode | Host to serve visualization. | `'0.0.0.0'` |
@@ -207,7 +207,7 @@ Render a dependency parse tree or named entity visualization.
> import spacy
> from spacy import displacy
> nlp = spacy.load("en_core_web_sm")
-> doc = nlp("This is a sentence.")
+> doc = nlp(u"This is a sentence.")
> html = displacy.render(doc, style="dep")
> ```
@@ -218,7 +218,7 @@ Render a dependency parse tree or named entity visualization.
| `page` | bool | Render markup as full HTML page. | `False` |
| `minify` | bool | Minify HTML markup. | `False` |
| `jupyter` | bool | Explicitly enable or disable "[Jupyter](http://jupyter.org/) mode" to return markup ready to be rendered in a notebook. Detected automatically if `None`. | `None` |
-| `options` | dict | [Visualizer-specific options](#displacy_options), e.g. colors. | `{}` |
+| `options` | dict | [Visualizer-specific options](#displacy_options), e.g. colors. | `{}` |
| `manual` | bool | Don't parse `Doc` and instead, expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. | `False` |
| **RETURNS** | unicode | Rendered HTML markup. |
@@ -262,18 +262,15 @@ If a setting is not present in the options, the default value will be used.
> displacy.serve(doc, style="ent", options=options)
> ```
-| Name | Type | Description | Default |
-| --------------------------------------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------ |
-| `ents` | list | Entity types to highlight (`None` for all types). | `None` |
-| `colors` | dict | Color overrides. Entity types in uppercase should be mapped to color names or values. | `{}` |
-| `template` 2.2 | unicode | Optional template to overwrite the HTML used to render entity spans. Should be a format string and can use `{bg}`, `{text}` and `{label}`. | see [`templates.py`](https://github.com/explosion/spaCy/blob/master/spacy/displacy/templates.py) |
+| Name | Type | Description | Default |
+| -------- | ---- | ------------------------------------------------------------------------------------- | ------- |
+| `ents` | list | Entity types to highlight (`None` for all types). | `None` |
+| `colors` | dict | Color overrides. Entity types in uppercase should be mapped to color names or values. | `{}` |
By default, displaCy comes with colors for all
[entity types supported by spaCy](/api/annotation#named-entities). If you're
using custom entity types, you can use the `colors` setting to add your own
-colors for them. Your application or model package can also expose a
-[`spacy_displacy_colors` entry point](/usage/saving-loading#entry-points-displacy)
-to add custom labels and their colors automatically.
+colors for them.
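+
+For example, a custom entity type could be given its own color like this (the
+label `"FRUIT"` and the hex value are invented for illustration, and `doc` is
+reused from the example above):
+
+```python
+colors = {"FRUIT": "#3dff74"}
+options = {"ents": ["FRUIT"], "colors": colors}
+displacy.serve(doc, style="ent", options=options)
+```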
## Utility functions {#util source="spacy/util.py"}
@@ -514,9 +511,9 @@ an error if key doesn't match `ORTH` values.
>
> ```python
> BASE = {"a.": [{ORTH: "a."}], ":)": [{ORTH: ":)"}]}
-> NEW = {"a.": [{ORTH: "a.", NORM: "all"}]}
+> NEW = {"a.": [{ORTH: "a.", LEMMA: "all"}]}
> exceptions = util.update_exc(BASE, NEW)
-> # {"a.": [{ORTH: "a.", NORM: "all"}], ":)": [{ORTH: ":)"}]}
+> # {"a.": [{ORTH: "a.", LEMMA: "all"}], ":)": [{ORTH: ":)"}]}
> ```
| Name | Type | Description |
@@ -651,11 +648,11 @@ for batching. Larger `bufsize` means less bias.
> shuffled = itershuffle(values)
> ```
-| Name | Type | Description |
-| ---------- | -------- | ----------------------------------- |
-| `iterable` | iterable | Iterator to shuffle. |
-| `bufsize` | int | Items to hold back (default: 1000). |
-| **YIELDS** | iterable | The shuffled iterator. |
+| Name | Type | Description |
+| ---------- | -------- | ------------------------------------- |
+| `iterable` | iterable | Iterator to shuffle. |
+| `bufsize` | int | Items to hold back (default: 1000). |
+| **YIELDS** | iterable | The shuffled iterator. |
### util.filter_spans {#util.filter_spans tag="function" new="2.1.4"}
diff --git a/website/docs/api/vectors.md b/website/docs/api/vectors.md
index ae62d8cfc..c04085091 100644
--- a/website/docs/api/vectors.md
+++ b/website/docs/api/vectors.md
@@ -26,7 +26,7 @@ you can add vectors to later.
> empty_vectors = Vectors(shape=(10000, 300))
>
> data = numpy.zeros((3, 300), dtype='f')
-> keys = ["cat", "dog", "rat"]
+> keys = [u"cat", u"dog", u"rat"]
> vectors = Vectors(data=data, keys=keys)
> ```
@@ -35,7 +35,6 @@ you can add vectors to later.
| `data` | `ndarray[ndim=1, dtype='float32']` | The vector data. |
| `keys` | iterable | A sequence of keys aligned with the data. |
| `shape` | tuple | Size of the table as `(n_entries, n_columns)`, the number of entries and number of columns. Not required if you're initializing the object with `data` and `keys`. |
-| `name` | unicode | A name to identify the vectors table. |
| **RETURNS** | `Vectors` | The newly created object. |
## Vectors.\_\_getitem\_\_ {#getitem tag="method"}
@@ -46,9 +45,9 @@ raised.
> #### Example
>
> ```python
-> cat_id = nlp.vocab.strings["cat"]
+> cat_id = nlp.vocab.strings[u"cat"]
> cat_vector = nlp.vocab.vectors[cat_id]
-> assert cat_vector == nlp.vocab["cat"].vector
+> assert cat_vector == nlp.vocab[u"cat"].vector
> ```
| Name | Type | Description |
@@ -63,7 +62,7 @@ Set a vector for the given key.
> #### Example
>
> ```python
-> cat_id = nlp.vocab.strings["cat"]
+> cat_id = nlp.vocab.strings[u"cat"]
> vector = numpy.random.uniform(-1, 1, (300,))
> nlp.vocab.vectors[cat_id] = vector
> ```
@@ -110,7 +109,7 @@ Check whether a key has been mapped to a vector entry in the table.
> #### Example
>
> ```python
-> cat_id = nlp.vocab.strings["cat"]
+> cat_id = nlp.vocab.strings[u"cat"]
> nlp.vocab.vectors.add(cat_id, numpy.random.uniform(-1, 1, (300,)))
> assert cat_id in nlp.vocab.vectors
> ```
@@ -133,9 +132,9 @@ mapping separately. If you need to manage the strings, you should use the
>
> ```python
> vector = numpy.random.uniform(-1, 1, (300,))
-> cat_id = nlp.vocab.strings["cat"]
+> cat_id = nlp.vocab.strings[u"cat"]
> nlp.vocab.vectors.add(cat_id, vector=vector)
-> nlp.vocab.vectors.add("dog", row=0)
+> nlp.vocab.vectors.add(u"dog", row=0)
> ```
| Name | Type | Description |
@@ -219,8 +218,8 @@ Look up one or more keys by row, or vice versa.
> #### Example
>
> ```python
-> row = nlp.vocab.vectors.find(key="cat")
-> rows = nlp.vocab.vectors.find(keys=["cat", "dog"])
+> row = nlp.vocab.vectors.find(key=u"cat")
+> rows = nlp.vocab.vectors.find(keys=[u"cat", u"dog"])
> key = nlp.vocab.vectors.find(row=256)
> keys = nlp.vocab.vectors.find(rows=[18, 256, 985])
> ```
@@ -242,7 +241,7 @@ vector table.
>
> ```python
> vectors = Vectors(shape=(1, 300))
-> vectors.add("cat", numpy.random.uniform(-1, 1, (300,)))
+> vectors.add(u"cat", numpy.random.uniform(-1, 1, (300,)))
> rows, dims = vectors.shape
> assert rows == 1
> assert dims == 300
@@ -277,7 +276,7 @@ If a table is full, it can be resized using
>
> ```python
> vectors = Vectors(shape=(1, 300))
-> vectors.add("cat", numpy.random.uniform(-1, 1, (300,)))
+> vectors.add(u"cat", numpy.random.uniform(-1, 1, (300,)))
> assert vectors.is_full
> ```
diff --git a/website/docs/api/vocab.md b/website/docs/api/vocab.md
index ea0c2d219..cd21a91d6 100644
--- a/website/docs/api/vocab.md
+++ b/website/docs/api/vocab.md
@@ -18,17 +18,16 @@ Create the vocabulary.
>
> ```python
> from spacy.vocab import Vocab
-> vocab = Vocab(strings=["hello", "world"])
+> vocab = Vocab(strings=[u"hello", u"world"])
> ```
-| Name | Type | Description |
-| ------------------------------------------- | -------------------- | ------------------------------------------------------------------------------------------------------------------ |
-| `lex_attr_getters` | dict | A dictionary mapping attribute IDs to functions to compute them. Defaults to `None`. |
-| `tag_map` | dict | A dictionary mapping fine-grained tags to coarse-grained parts-of-speech, and optionally morphological attributes. |
-| `lemmatizer` | object | A lemmatizer. Defaults to `None`. |
-| `strings` | `StringStore` / list | A [`StringStore`](/api/stringstore) that maps strings to hash values, and vice versa, or a list of strings. |
-| `vectors_name` 2.2 | unicode | A name to identify the vectors table. |
-| **RETURNS** | `Vocab` | The newly constructed object. |
+| Name | Type | Description |
+| ------------------ | -------------------- | ------------------------------------------------------------------------------------------------------------------ |
+| `lex_attr_getters` | dict | A dictionary mapping attribute IDs to functions to compute them. Defaults to `None`. |
+| `tag_map` | dict | A dictionary mapping fine-grained tags to coarse-grained parts-of-speech, and optionally morphological attributes. |
+| `lemmatizer` | object | A lemmatizer. Defaults to `None`. |
+| `strings` | `StringStore` / list | A [`StringStore`](/api/stringstore) that maps strings to hash values, and vice versa, or a list of strings. |
+| **RETURNS** | `Vocab` | The newly constructed object. |
## Vocab.\_\_len\_\_ {#len tag="method"}
@@ -37,7 +36,7 @@ Get the current number of lexemes in the vocabulary.
> #### Example
>
> ```python
-> doc = nlp("This is a sentence.")
+> doc = nlp(u"This is a sentence.")
> assert len(nlp.vocab) > 0
> ```
@@ -53,8 +52,8 @@ unicode string is given, a new lexeme is created and stored.
> #### Example
>
> ```python
-> apple = nlp.vocab.strings["apple"]
-> assert nlp.vocab[apple] == nlp.vocab["apple"]
+> apple = nlp.vocab.strings[u"apple"]
+> assert nlp.vocab[apple] == nlp.vocab[u"apple"]
> ```
| Name | Type | Description |
@@ -85,8 +84,8 @@ given string, you need to look it up in
> #### Example
>
> ```python
-> apple = nlp.vocab.strings["apple"]
-> oov = nlp.vocab.strings["dskfodkfos"]
+> apple = nlp.vocab.strings[u"apple"]
+> oov = nlp.vocab.strings[u"dskfodkfos"]
> assert apple in nlp.vocab
> assert oov not in nlp.vocab
> ```
@@ -107,11 +106,11 @@ using `token.check_flag(flag_id)`.
>
> ```python
> def is_my_product(text):
-> products = ["spaCy", "Thinc", "displaCy"]
+> products = [u"spaCy", u"Thinc", u"displaCy"]
> return text in products
>
> MY_PRODUCT = nlp.vocab.add_flag(is_my_product)
-> doc = nlp("I like spaCy")
+> doc = nlp(u"I like spaCy")
> assert doc[2].check_flag(MY_PRODUCT) == True
> ```
@@ -171,7 +170,7 @@ or hash value. If no vectors data is loaded, a `ValueError` is raised.
> #### Example
>
> ```python
-> nlp.vocab.get_vector("apple")
+> nlp.vocab.get_vector(u"apple")
> ```
| Name | Type | Description |
@@ -187,7 +186,7 @@ or hash value.
> #### Example
>
> ```python
-> nlp.vocab.set_vector("apple", array([...]))
+> nlp.vocab.set_vector(u"apple", array([...]))
> ```
| Name | Type | Description |
@@ -203,8 +202,8 @@ Words can be looked up by string or hash value.
> #### Example
>
> ```python
-> if nlp.vocab.has_vector("apple"):
-> vector = nlp.vocab.get_vector("apple")
+> if nlp.vocab.has_vector(u"apple"):
+> vector = nlp.vocab.get_vector(u"apple")
> ```
| Name | Type | Description |
@@ -283,9 +282,9 @@ Load state from a binary string.
> #### Example
>
> ```python
-> apple_id = nlp.vocab.strings["apple"]
+> apple_id = nlp.vocab.strings[u"apple"]
> assert type(apple_id) == int
-> PERSON = nlp.vocab.strings["PERSON"]
+> PERSON = nlp.vocab.strings[u"PERSON"]
> assert type(PERSON) == int
> ```
@@ -294,7 +293,6 @@ Load state from a binary string.
| `strings` | `StringStore` | A table managing the string-to-int mapping. |
| `vectors` 2 | `Vectors` | A table associating word IDs to word vectors. |
| `vectors_length` | int | Number of dimensions for each word vector. |
-| `lookups` | `Lookups` | The available lookup tables in this vocab. |
| `writing_system` 2.1 | dict | A dict with information about the language's writing system. |
## Serialization fields {#serialization-fields}
@@ -315,4 +313,3 @@ serialization by passing in the string names via the `exclude` argument.
| `strings` | The strings in the [`StringStore`](/api/stringstore). |
| `lexemes` | The lexeme data. |
| `vectors` | The word vectors, if available. |
-| `lookups` | The lookup tables, if available. |
diff --git a/website/docs/images/displacy-ent-snek.html b/website/docs/images/displacy-ent-snek.html
deleted file mode 100644
index 1e4920fb5..000000000
--- a/website/docs/images/displacy-ent-snek.html
+++ /dev/null
@@ -1,18 +0,0 @@
-
- 🌱🌿 🐍 SNEK ____ 🌳🌲 ____ 👨🌾 HUMAN 🏘️
-
diff --git a/website/docs/usage/101/_named-entities.md b/website/docs/usage/101/_named-entities.md
index 1ecaf9fe7..54db6dbe8 100644
--- a/website/docs/usage/101/_named-entities.md
+++ b/website/docs/usage/101/_named-entities.md
@@ -12,7 +12,7 @@ Named entities are available as the `ents` property of a `Doc`:
import spacy
nlp = spacy.load("en_core_web_sm")
-doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
+doc = nlp(u"Apple is looking at buying U.K. startup for $1 billion")
for ent in doc.ents:
print(ent.text, ent.start_char, ent.end_char, ent.label_)
@@ -21,7 +21,7 @@ for ent in doc.ents:
> - **Text:** The original entity text.
> - **Start:** Index of start of entity in the `Doc`.
> - **End:** Index of end of entity in the `Doc`.
-> - **Label:** Entity label, i.e. type.
+> - **Label:** Entity label, i.e. type.
| Text | Start | End | Label | Description |
| ----------- | :---: | :-: | ------- | ---------------------------------------------------- |
diff --git a/website/docs/usage/101/_pipelines.md b/website/docs/usage/101/_pipelines.md
index d33ea45fd..68308a381 100644
--- a/website/docs/usage/101/_pipelines.md
+++ b/website/docs/usage/101/_pipelines.md
@@ -12,14 +12,14 @@ passed on to the next component.
> - **Creates:** Objects, attributes and properties modified and set by the
> component.
-| Name | Component | Creates | Description |
-| ----------------- | ------------------------------------------------------------------ | ----------------------------------------------------------- | ------------------------------------------------ |
-| **tokenizer** | [`Tokenizer`](/api/tokenizer) | `Doc` | Segment text into tokens. |
-| **tagger** | [`Tagger`](/api/tagger) | `Doc[i].tag` | Assign part-of-speech tags. |
-| **parser** | [`DependencyParser`](/api/dependencyparser) | `Doc[i].head`, `Doc[i].dep`, `Doc.sents`, `Doc.noun_chunks` | Assign dependency labels. |
-| **ner** | [`EntityRecognizer`](/api/entityrecognizer) | `Doc.ents`, `Doc[i].ent_iob`, `Doc[i].ent_type` | Detect and label named entities. |
-| **textcat** | [`TextCategorizer`](/api/textcategorizer) | `Doc.cats` | Assign document labels. |
-| ... | [custom components](/usage/processing-pipelines#custom-components) | `Doc._.xxx`, `Token._.xxx`, `Span._.xxx` | Assign custom attributes, methods or properties. |
+| Name | Component | Creates | Description |
+| ------------- | ------------------------------------------------------------------ | ----------------------------------------------------------- | ------------------------------------------------ |
+| **tokenizer** | [`Tokenizer`](/api/tokenizer) | `Doc` | Segment text into tokens. |
+| **tagger** | [`Tagger`](/api/tagger) | `Doc[i].tag` | Assign part-of-speech tags. |
+| **parser** | [`DependencyParser`](/api/dependencyparser) | `Doc[i].head`, `Doc[i].dep`, `Doc.sents`, `Doc.noun_chunks` | Assign dependency labels. |
+| **ner** | [`EntityRecognizer`](/api/entityrecognizer) | `Doc.ents`, `Doc[i].ent_iob`, `Doc[i].ent_type` | Detect and label named entities. |
+| **textcat** | [`TextCategorizer`](/api/textcategorizer) | `Doc.cats` | Assign document labels. |
+| ... | [custom components](/usage/processing-pipelines#custom-components) | `Doc._.xxx`, `Token._.xxx`, `Span._.xxx` | Assign custom attributes, methods or properties. |
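+
+To see which of these components a loaded model actually includes, you can
+check `nlp.pipe_names` (a minimal sketch, assuming the small English model is
+installed):
+
+```python
+import spacy
+
+nlp = spacy.load("en_core_web_sm")
+print(nlp.pipe_names)  # e.g. ['tagger', 'parser', 'ner']
+```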
The processing pipeline always **depends on the statistical model** and its
capabilities. For example, a pipeline can only include an entity recognizer
@@ -49,10 +49,6 @@ them, its dependency predictions may be different. Similarly, it matters if you
add the [`EntityRuler`](/api/entityruler) before or after the statistical entity
recognizer: if it's added before, the entity recognizer will take the existing
entities into account when making predictions.
-The [`EntityLinker`](/api/entitylinker), which resolves named entities to
-knowledge base IDs, should be preceded by
-a pipeline component that recognizes entities such as the
-[`EntityRecognizer`](/api/entityrecognizer).
diff --git a/website/docs/usage/101/_pos-deps.md b/website/docs/usage/101/_pos-deps.md
index 9d04d6ffc..d86ee123d 100644
--- a/website/docs/usage/101/_pos-deps.md
+++ b/website/docs/usage/101/_pos-deps.md
@@ -15,8 +15,8 @@ need to add an underscore `_` to its name:
### {executable="true"}
import spacy
-nlp = spacy.load("en_core_web_sm")
-doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
+nlp = spacy.load('en_core_web_sm')
+doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')
for token in doc:
print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
@@ -45,7 +45,7 @@ for token in doc:
| for | for | `ADP` | `IN` | `prep` | `xxx` | `True` | `True` |
| \$ | \$ | `SYM` | `$` | `quantmod` | `$` | `False` | `False` |
| 1 | 1 | `NUM` | `CD` | `compound` | `d` | `False` | `False` |
-| billion | billion | `NUM` | `CD` | `pobj` | `xxxx` | `True` | `False` |
+| billion | billion | `NUM` | `CD` | `pobj` | `xxxx` | `True` | `False` |
> #### Tip: Understanding tags and labels
>
diff --git a/website/docs/usage/101/_serialization.md b/website/docs/usage/101/_serialization.md
index 01a9c39d1..828b796b3 100644
--- a/website/docs/usage/101/_serialization.md
+++ b/website/docs/usage/101/_serialization.md
@@ -13,9 +13,9 @@ file or a byte string. This process is called serialization. spaCy comes with
> object to and from disk, but it's also used for distributed computing, e.g.
> with
> [PySpark](https://spark.apache.org/docs/0.9.0/python-programming-guide.html)
-> or [Dask](https://dask.org). When you unpickle an object, you're agreeing to
-> execute whatever code it contains. It's like calling `eval()` on a string – so
-> don't unpickle objects from untrusted sources.
+> or [Dask](http://dask.pydata.org/en/latest/). When you unpickle an object,
+> you're agreeing to execute whatever code it contains. It's like calling
+> `eval()` on a string – so don't unpickle objects from untrusted sources.
All container classes, i.e. [`Language`](/api/language) (`nlp`),
[`Doc`](/api/doc), [`Vocab`](/api/vocab) and [`StringStore`](/api/stringstore)
diff --git a/website/docs/usage/101/_tokenization.md b/website/docs/usage/101/_tokenization.md
index 764f1e62a..e5f3d3080 100644
--- a/website/docs/usage/101/_tokenization.md
+++ b/website/docs/usage/101/_tokenization.md
@@ -9,7 +9,7 @@ tokens, and we can iterate over them:
import spacy
nlp = spacy.load("en_core_web_sm")
-doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
+doc = nlp(u"Apple is looking at buying U.K. startup for $1 billion")
for token in doc:
print(token.text)
```
diff --git a/website/docs/usage/101/_training.md b/website/docs/usage/101/_training.md
index baf3a1891..61e047748 100644
--- a/website/docs/usage/101/_training.md
+++ b/website/docs/usage/101/_training.md
@@ -20,7 +20,7 @@ difference, the more significant the gradient and the updates to our model.
![The training process](../../images/training.svg)
When training a model, we don't just want it to memorize our examples – we want
-it to come up with a theory that can be **generalized across other examples**.
+it to come up with a theory that can be **generalized across other examples**.
After all, we don't just want the model to learn that this one instance of
"Amazon" right here is a company – we want it to learn that "Amazon", in
contexts _like this_, is most likely a company. That's why the training data
diff --git a/website/docs/usage/101/_vectors-similarity.md b/website/docs/usage/101/_vectors-similarity.md
index 73c35950f..2001d1481 100644
--- a/website/docs/usage/101/_vectors-similarity.md
+++ b/website/docs/usage/101/_vectors-similarity.md
@@ -48,8 +48,8 @@ norm, which can be used to normalize vectors.
### {executable="true"}
import spacy
-nlp = spacy.load("en_core_web_md")
-tokens = nlp("dog cat banana afskfsd")
+nlp = spacy.load('en_core_web_md')
+tokens = nlp(u'dog cat banana afskfsd')
for token in tokens:
print(token.text, token.has_vector, token.vector_norm, token.is_oov)
@@ -88,8 +88,8 @@ definition of similarity.
### {executable="true"}
import spacy
-nlp = spacy.load("en_core_web_md") # make sure to use larger model!
-tokens = nlp("dog cat banana")
+nlp = spacy.load('en_core_web_md') # make sure to use larger model!
+tokens = nlp(u'dog cat banana')
for token1 in tokens:
for token2 in tokens:
diff --git a/website/docs/usage/adding-languages.md b/website/docs/usage/adding-languages.md
index 94d75ea31..374d948b2 100644
--- a/website/docs/usage/adding-languages.md
+++ b/website/docs/usage/adding-languages.md
@@ -71,19 +71,21 @@ from the global rules. Others, like the tokenizer and norm exceptions, are very
specific and will make a big difference to spaCy's performance on the particular
language and training a language model.
-| Variable | Type | Description |
-| ---------------------- | ----- | ---------------------------------------------------------------------------------------------------------- |
-| `STOP_WORDS` | set | Individual words. |
-| `TOKENIZER_EXCEPTIONS` | dict | Keyed by strings mapped to list of one dict per token with token attributes. |
-| `TOKEN_MATCH` | regex | Regexes to match complex tokens, e.g. URLs. |
-| `NORM_EXCEPTIONS` | dict | Keyed by strings, mapped to their norms. |
-| `TOKENIZER_PREFIXES` | list | Strings or regexes, usually not customized. |
-| `TOKENIZER_SUFFIXES` | list | Strings or regexes, usually not customized. |
-| `TOKENIZER_INFIXES` | list | Strings or regexes, usually not customized. |
-| `LEX_ATTRS` | dict | Attribute ID mapped to function. |
-| `SYNTAX_ITERATORS` | dict | Iterator ID mapped to function. Currently only supports `'noun_chunks'`. |
-| `TAG_MAP` | dict | Keyed by strings mapped to [Universal Dependencies](http://universaldependencies.org/u/pos/all.html) tags. |
-| `MORPH_RULES` | dict | Keyed by strings mapped to a dict of their morphological features. |
+| Variable | Type | Description |
+| ----------------------------------------- | ----- | ---------------------------------------------------------------------------------------------------------- |
+| `STOP_WORDS` | set | Individual words. |
+| `TOKENIZER_EXCEPTIONS` | dict | Keyed by strings mapped to list of one dict per token with token attributes. |
+| `TOKEN_MATCH` | regex | Regexes to match complex tokens, e.g. URLs. |
+| `NORM_EXCEPTIONS` | dict | Keyed by strings, mapped to their norms. |
+| `TOKENIZER_PREFIXES` | list | Strings or regexes, usually not customized. |
+| `TOKENIZER_SUFFIXES` | list | Strings or regexes, usually not customized. |
+| `TOKENIZER_INFIXES` | list | Strings or regexes, usually not customized. |
+| `LEX_ATTRS` | dict | Attribute ID mapped to function. |
+| `SYNTAX_ITERATORS` | dict | Iterator ID mapped to function. Currently only supports `'noun_chunks'`. |
+| `LOOKUP` | dict | Keyed by strings mapping to their lemma. |
+| `LEMMA_RULES`, `LEMMA_INDEX`, `LEMMA_EXC` | dict | Lemmatization rules, keyed by part of speech. |
+| `TAG_MAP` | dict | Keyed by strings mapped to [Universal Dependencies](http://universaldependencies.org/u/pos/all.html) tags. |
+| `MORPH_RULES` | dict | Keyed by strings mapped to a dict of their morphological features. |
> #### Should I ever update the global data?
>
@@ -211,7 +213,9 @@ spaCy's [tokenization algorithm](/usage/linguistic-features#how-tokenizer-works)
lets you deal with whitespace-delimited chunks separately. This makes it easy to
define special-case rules, without worrying about how they interact with the
rest of the tokenizer. Whenever the key string is matched, the special-case rule
-is applied, giving the defined sequence of tokens.
+is applied, giving the defined sequence of tokens. You can also attach
+attributes to the subtokens covered by your special case, such as the
+subtokens' `LEMMA` or `TAG`.
Tokenizer exceptions can be added in the following format:
@@ -219,8 +223,8 @@ Tokenizer exceptions can be added in the following format:
### tokenizer_exceptions.py (excerpt)
TOKENIZER_EXCEPTIONS = {
"don't": [
- {ORTH: "do"},
- {ORTH: "n't", NORM: "not"}]
+ {ORTH: "do", LEMMA: "do"},
+ {ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"}]
}
```
@@ -229,12 +233,41 @@ TOKENIZER_EXCEPTIONS = {
If an exception consists of more than one token, the `ORTH` values combined
always need to **match the original string**. The way the original string is
split up can be pretty arbitrary sometimes – for example `"gonna"` is split into
-`"gon"` (norm "going") and `"na"` (norm "to"). Because of how the tokenizer
+`"gon"` (lemma "go") and `"na"` (lemma "to"). Because of how the tokenizer
works, it's currently not possible to split single-letter strings into multiple
tokens.
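+
+For example, a simplified sketch of that `"gonna"` entry – the actual exception
+may set additional attributes such as `NORM`:
+
+```python
+### tokenizer_exceptions.py (excerpt)
+TOKENIZER_EXCEPTIONS = {
+    "gonna": [
+        {ORTH: "gon", LEMMA: "go"},
+        {ORTH: "na", LEMMA: "to"}]
+}
+```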
+Unambiguous abbreviations, like month names or locations in English, should be
+added to exceptions with a lemma assigned, for example
+`{ORTH: "Jan.", LEMMA: "January"}`. Since the exceptions are added in Python,
+you can use custom logic to generate them more efficiently and make your data
+less verbose. How you do this ultimately depends on the language. Here's an
+example of how exceptions for time formats like "1a.m." and "1am" are generated
+in the English
+[`tokenizer_exceptions.py`](https://github.com/explosion/spaCy/tree/master/spacy/lang/en/tokenizer_exceptions.py):
+
+```python
+### tokenizer_exceptions.py (excerpt)
+# use short, internal variable for readability
+_exc = {}
+
+for h in range(1, 12 + 1):
+ for period in ["a.m.", "am"]:
+ # always keep an eye on string interpolation!
+ _exc["%d%s" % (h, period)] = [
+ {ORTH: "%d" % h},
+ {ORTH: period, LEMMA: "a.m."}]
+ for period in ["p.m.", "pm"]:
+ _exc["%d%s" % (h, period)] = [
+ {ORTH: "%d" % h},
+ {ORTH: period, LEMMA: "p.m."}]
+
+# only declare this at the bottom
+TOKENIZER_EXCEPTIONS = _exc
+```
+
> #### Generating tokenizer exceptions
>
> Keep in mind that generating exceptions only makes sense if there's a clearly
@@ -242,8 +275,7 @@ tokens.
> This is not always the case – in Spanish for instance, infinitive or
> imperative reflexive verbs and pronouns are one token (e.g. "vestirme"). In
> cases like this, spaCy shouldn't be generating exceptions for _all verbs_.
-> Instead, this will be handled at a later stage after part-of-speech tagging
-> and lemmatization.
+> Instead, this will be handled at a later stage during lemmatization.
When adding the tokenizer exceptions to the `Defaults`, you can use the
[`update_exc`](/api/top-level#util.update_exc) helper function to merge them
@@ -260,23 +292,33 @@ custom one.
from ...util import update_exc
BASE_EXCEPTIONS = {"a.": [{ORTH: "a."}], ":)": [{ORTH: ":)"}]}
-TOKENIZER_EXCEPTIONS = {"a.": [{ORTH: "a.", NORM: "all"}]}
+TOKENIZER_EXCEPTIONS = {"a.": [{ORTH: "a.", LEMMA: "all"}]}
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-# {"a.": [{ORTH: "a.", NORM: "all"}], ":)": [{ORTH: ":)"}]}
+# {"a.": [{ORTH: "a.", LEMMA: "all"}], ":)": [{ORTH: ":)"}]}
```
+
+
+Unlike verbs and common nouns, there's no clear base form of a personal pronoun.
+Should the lemma of "me" be "I", or should we normalize person as well, giving
+"it" — or maybe "he"? spaCy's solution is to introduce a novel symbol, `-PRON-`,
+which is used as the lemma for all personal pronouns.
+
+
+
### Norm exceptions {#norm-exceptions new="2"}
-In addition to `ORTH`, tokenizer exceptions can also set a `NORM` attribute.
-This is useful to specify a normalized version of the token – for example, the
-norm of "n't" is "not". By default, a token's norm equals its lowercase text. If
-the lowercase spelling of a word exists, norms should always be in lowercase.
+In addition to `ORTH` or `LEMMA`, tokenizer exceptions can also set a `NORM`
+attribute. This is useful to specify a normalized version of the token – for
+example, the norm of "n't" is "not". By default, a token's norm equals its
+lowercase text. If the lowercase spelling of a word exists, norms should always
+be in lowercase.
> #### Norms vs. lemmas
>
> ```python
-> doc = nlp("I'm gonna realise")
+> doc = nlp(u"I'm gonna realise")
> norms = [token.norm_ for token in doc]
> lemmas = [token.lemma_ for token in doc]
> assert norms == ["i", "am", "going", "to", "realize"]
@@ -396,10 +438,10 @@ iterators:
> #### Noun chunks example
>
> ```python
-> doc = nlp("A phrase with another phrase occurs.")
+> doc = nlp(u"A phrase with another phrase occurs.")
> chunks = list(doc.noun_chunks)
-> assert chunks[0].text == "A phrase"
-> assert chunks[1].text == "another phrase"
+> assert chunks[0].text == u"A phrase"
+> assert chunks[1].text == u"another phrase"
> ```
| Language | Code | Source |
@@ -416,50 +458,27 @@ the quickest and easiest way to get started. The data is stored in a dictionary
mapping a string to its lemma. To determine a token's lemma, spaCy simply looks
it up in the table. Here's an example from the Spanish language data:
-```json
-### lang/es/lemma_lookup.json (excerpt)
-{
- "aba": "abar",
- "ababa": "abar",
- "ababais": "abar",
- "ababan": "abar",
- "ababanes": "ababán",
- "ababas": "abar",
- "ababoles": "ababol",
- "ababábites": "ababábite"
+```python
+### lang/es/lemmatizer.py (excerpt)
+LOOKUP = {
+ "aba": "abar",
+ "ababa": "abar",
+ "ababais": "abar",
+ "ababan": "abar",
+ "ababanes": "ababán",
+ "ababas": "abar",
+ "ababoles": "ababol",
+ "ababábites": "ababábite"
}
```
-#### Adding JSON resources {#lemmatizer-resources new="2.2"}
-
-As of v2.2, resources for the lemmatizer are stored as JSON and loaded via the
-new [`Lookups`](/api/lookups) class. This allows easier access to the data,
-serialization with the models and file compression on disk (so your spaCy
-installation is smaller). Resource files can be provided via the `resources`
-attribute on the custom language subclass. All paths are relative to the
-language data directory, i.e. the directory the language's `__init__.py` is in.
+To provide a lookup lemmatizer for your language, import the lookup table and
+add it to the `Language` class as `lemma_lookup`:
```python
-resources = {
- "lemma_lookup": "lemmatizer/lemma_lookup.json",
- "lemma_rules": "lemmatizer/lemma_rules.json",
- "lemma_index": "lemmatizer/lemma_index.json",
- "lemma_exc": "lemmatizer/lemma_exc.json",
-}
+lemma_lookup = LOOKUP
```
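+
+A minimal sketch of how this typically looks in a language's `__init__.py` – the
+class name below is just a placeholder, and the other `Defaults` attributes are
+assumed to be defined as usual:
+
+```python
+### __init__.py (excerpt)
+from .lemmatizer import LOOKUP
+from ...language import Language
+
+class XxxxxDefaults(Language.Defaults):
+    # ... tokenizer exceptions, stop words etc. are defined here as well
+    lemma_lookup = LOOKUP
+```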
-> #### Lookups example
->
-> ```python
-> table = nlp.vocab.lookups.get_table("my_table")
-> value = table.get("some_key")
-> ```
-
-If your language needs other large dictionaries and resources, you can also add
-those files here. The data will become available via a [`Lookups`](/api/lookups)
-table in `nlp.vocab.lookups`, and you'll be able to access it from the tokenizer
-or a custom pipeline component (via `doc.vocab.lookups`).
-
### Tag map {#tag-map}
Most treebanks define a custom part-of-speech tag scheme, striking a balance
diff --git a/website/docs/usage/facts-figures.md b/website/docs/usage/facts-figures.md
index 40b39d871..a3683b668 100644
--- a/website/docs/usage/facts-figures.md
+++ b/website/docs/usage/facts-figures.md
@@ -26,7 +26,7 @@ Here's a quick comparison of the functionalities offered by spaCy,
| Sentence segmentation | ✅ | ✅ | ✅ |
| Dependency parsing | ✅ | ❌ | ✅ |
| Entity recognition | ✅ | ✅ | ✅ |
-| Entity linking | ✅ | ❌ | ❌ |
+| Entity linking | ❌ | ❌ | ❌ |
| Coreference resolution | ❌ | ❌ | ✅ |
### When should I use what? {#comparison-usage}
diff --git a/website/docs/usage/index.md b/website/docs/usage/index.md
index 1d6c0574c..1ffd0de0d 100644
--- a/website/docs/usage/index.md
+++ b/website/docs/usage/index.md
@@ -392,7 +392,7 @@ from is called `spacy`. So, when using spaCy, never call anything else `spacy`.
```python
-doc = nlp("They are")
+doc = nlp(u"They are")
print(doc[0].lemma_)
# -PRON-
```
diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md
index 4128fa73f..66ad816f5 100644
--- a/website/docs/usage/linguistic-features.md
+++ b/website/docs/usage/linguistic-features.md
@@ -69,6 +69,7 @@ of the two. The system works as follows:
morphological information, without consulting the context of the token. The
lemmatizer also accepts list-based exception files, acquired from
[WordNet](https://wordnet.princeton.edu/).
+
## Dependency Parsing {#dependency-parse model="parser"}
@@ -92,7 +93,7 @@ get the noun chunks in a document, simply iterate over
import spacy
nlp = spacy.load("en_core_web_sm")
-doc = nlp("Autonomous cars shift insurance liability toward manufacturers")
+doc = nlp(u"Autonomous cars shift insurance liability toward manufacturers")
for chunk in doc.noun_chunks:
print(chunk.text, chunk.root.text, chunk.root.dep_,
chunk.root.head.text)
@@ -123,7 +124,7 @@ get the string value with `.dep_`.
import spacy
nlp = spacy.load("en_core_web_sm")
-doc = nlp("Autonomous cars shift insurance liability toward manufacturers")
+doc = nlp(u"Autonomous cars shift insurance liability toward manufacturers")
for token in doc:
print(token.text, token.dep_, token.head.text, token.head.pos_,
[child for child in token.children])
@@ -160,7 +161,7 @@ import spacy
from spacy.symbols import nsubj, VERB
nlp = spacy.load("en_core_web_sm")
-doc = nlp("Autonomous cars shift insurance liability toward manufacturers")
+doc = nlp(u"Autonomous cars shift insurance liability toward manufacturers")
# Finding a verb with a subject from below — good
verbs = set()
@@ -203,7 +204,7 @@ children.
import spacy
nlp = spacy.load("en_core_web_sm")
-doc = nlp("bright red apples on the tree")
+doc = nlp(u"bright red apples on the tree")
print([token.text for token in doc[2].lefts]) # ['bright', 'red']
print([token.text for token in doc[2].rights]) # ['on']
print(doc[2].n_lefts) # 2
@@ -215,7 +216,7 @@ print(doc[2].n_rights) # 1
import spacy
nlp = spacy.load("de_core_news_sm")
-doc = nlp("schöne rote Äpfel auf dem Baum")
+doc = nlp(u"schöne rote Äpfel auf dem Baum")
print([token.text for token in doc[2].lefts]) # ['schöne', 'rote']
print([token.text for token in doc[2].rights]) # ['auf']
```
@@ -239,7 +240,7 @@ sequence of tokens. You can walk up the tree with the
import spacy
nlp = spacy.load("en_core_web_sm")
-doc = nlp("Credit and mortgage account holders must submit their requests")
+doc = nlp(u"Credit and mortgage account holders must submit their requests")
root = [token for token in doc if token.head == token][0]
subject = list(root.lefts)[0]
@@ -269,7 +270,7 @@ end-point of a range, don't forget to `+1`!
import spacy
nlp = spacy.load("en_core_web_sm")
-doc = nlp("Credit and mortgage account holders must submit their requests")
+doc = nlp(u"Credit and mortgage account holders must submit their requests")
span = doc[doc[4].left_edge.i : doc[4].right_edge.i+1]
with doc.retokenize() as retokenizer:
retokenizer.merge(span)
@@ -310,7 +311,7 @@ import spacy
from spacy import displacy
nlp = spacy.load("en_core_web_sm")
-doc = nlp("Autonomous cars shift insurance liability toward manufacturers")
+doc = nlp(u"Autonomous cars shift insurance liability toward manufacturers")
# Since this is an interactive Jupyter environment, we can use displacy.render here
displacy.render(doc, style='dep')
```
@@ -335,7 +336,7 @@ the `nlp` object.
```python
nlp = spacy.load("en_core_web_sm", disable=["parser"])
nlp = English().from_disk("/model", disable=["parser"])
-doc = nlp("I don't want parsed", disable=["parser"])
+doc = nlp(u"I don't want parsed", disable=["parser"])
```
@@ -349,10 +350,10 @@ Language class via [`from_disk`](/api/language#from_disk).
```diff
+ nlp = spacy.load("en_core_web_sm", disable=["parser"])
-+ doc = nlp("I don't want parsed", disable=["parser"])
++ doc = nlp(u"I don't want parsed", disable=["parser"])
- nlp = spacy.load("en_core_web_sm", parser=False)
-- doc = nlp("I don't want parsed", parse=False)
+- doc = nlp(u"I don't want parsed", parse=False)
```
@@ -397,7 +398,7 @@ on a token, it will return an empty string.
import spacy
nlp = spacy.load("en_core_web_sm")
-doc = nlp("San Francisco considers banning sidewalk delivery robots")
+doc = nlp(u"San Francisco considers banning sidewalk delivery robots")
# document level
ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]
@@ -406,8 +407,8 @@ print(ents)
# token level
ent_san = [doc[0].text, doc[0].ent_iob_, doc[0].ent_type_]
ent_francisco = [doc[1].text, doc[1].ent_iob_, doc[1].ent_type_]
-print(ent_san) # ['San', 'B', 'GPE']
-print(ent_francisco) # ['Francisco', 'I', 'GPE']
+print(ent_san) # [u'San', u'B', u'GPE']
+print(ent_francisco) # [u'Francisco', u'I', u'GPE']
```
| Text | ent_iob | ent_iob\_ | ent_type\_ | Description |
@@ -434,17 +435,18 @@ import spacy
from spacy.tokens import Span
nlp = spacy.load("en_core_web_sm")
-doc = nlp("FB is hiring a new Vice President of global policy")
+doc = nlp(u"FB is hiring a new Vice President of global policy")
ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]
print('Before', ents)
# the model didn't recognise "FB" as an entity :(
-fb_ent = Span(doc, 0, 1, label="ORG") # create a Span for the new entity
+ORG = doc.vocab.strings[u"ORG"] # get hash value of entity label
+fb_ent = Span(doc, 0, 1, label=ORG) # create a Span for the new entity
doc.ents = list(doc.ents) + [fb_ent]
ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]
print('After', ents)
-# [('FB', 0, 2, 'ORG')] 🎉
+# [(u'FB', 0, 2, 'ORG')] 🎉
```
Keep in mind that you need to create a `Span` with the start and end index of
@@ -466,13 +468,13 @@ import spacy
from spacy.attrs import ENT_IOB, ENT_TYPE
nlp = spacy.load("en_core_web_sm")
-doc = nlp.make_doc("London is a big city in the United Kingdom.")
+doc = nlp.make_doc(u"London is a big city in the United Kingdom.")
print("Before", doc.ents) # []
header = [ENT_IOB, ENT_TYPE]
attr_array = numpy.zeros((len(doc), len(header)))
attr_array[0, 0] = 3 # B
-attr_array[0, 1] = doc.vocab.strings["GPE"]
+attr_array[0, 1] = doc.vocab.strings[u"GPE"]
doc.from_array(header, attr_array)
print("After", doc.ents) # [London]
```
@@ -531,8 +533,8 @@ train_data = [
```
```python
-doc = Doc(nlp.vocab, ["rats", "make", "good", "pets"])
-gold = GoldParse(doc, entities=["U-ANIMAL", "O", "O", "O"])
+doc = Doc(nlp.vocab, [u"rats", u"make", u"good", u"pets"])
+gold = GoldParse(doc, entities=[u"U-ANIMAL", u"O", u"O", u"O"])
```
@@ -563,7 +565,7 @@ For more details and examples, see the
import spacy
from spacy import displacy
-text = "When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously."
+text = u"When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously."
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)
@@ -574,52 +576,6 @@ import DisplacyEntHtml from 'images/displacy-ent2.html'
-## Entity Linking {#entity-linking}
-
-To ground the named entities into the "real world", spaCy provides functionality
-to perform entity linking, which resolves a textual entity to a unique
-identifier from a knowledge base (KB). The
-[processing scripts](https://github.com/explosion/spaCy/tree/master/bin/wiki_entity_linking)
-we provide use WikiData identifiers, but you can create your own
-[`KnowledgeBase`](/api/kb) and
-[train a new Entity Linking model](/usage/training#entity-linker) using that
-custom-made KB.
-
-### Accessing entity identifiers {#entity-linking-accessing}
-
-The annotated KB identifier is accessible as either a hash value or as a string,
-using the attributes `ent.kb_id` and `ent.kb_id_` of a [`Span`](/api/span)
-object, or the `ent_kb_id` and `ent_kb_id_` attributes of a
-[`Token`](/api/token) object.
-
-```python
-import spacy
-
-nlp = spacy.load("my_custom_el_model")
-doc = nlp("Ada Lovelace was born in London")
-
-# document level
-ents = [(e.text, e.label_, e.kb_id_) for e in doc.ents]
-print(ents) # [('Ada Lovelace', 'PERSON', 'Q7259'), ('London', 'GPE', 'Q84')]
-
-# token level
-ent_ada_0 = [doc[0].text, doc[0].ent_type_, doc[0].ent_kb_id_]
-ent_ada_1 = [doc[1].text, doc[1].ent_type_, doc[1].ent_kb_id_]
-ent_london_5 = [doc[5].text, doc[5].ent_type_, doc[5].ent_kb_id_]
-print(ent_ada_0) # ['Ada', 'PERSON', 'Q7259']
-print(ent_ada_1) # ['Lovelace', 'PERSON', 'Q7259']
-print(ent_london_5) # ['London', 'GPE', 'Q84']
-```
-
-| Text | ent_type\_ | ent_kb_id\_ |
-| -------- | ---------- | ----------- |
-| Ada | `"PERSON"` | `"Q7259"` |
-| Lovelace | `"PERSON"` | `"Q7259"` |
-| was | - | - |
-| born | - | - |
-| in | - | - |
-| London | `"GPE"` | `"Q84"` |
-
## Tokenization {#tokenization}
Tokenization is the task of splitting a text into meaningful segments, called
@@ -649,7 +605,7 @@ import Tokenization101 from 'usage/101/\_tokenization.md'
data in
[`spacy/lang`](https://github.com/explosion/spaCy/tree/master/spacy/lang). The
tokenizer exceptions define special cases like "don't" in English, which needs
-to be split into two tokens: `{ORTH: "do"}` and `{ORTH: "n't", NORM: "not"}`.
+to be split into two tokens: `{ORTH: "do"}` and `{ORTH: "n't", LEMMA: "not"}`.
The prefixes, suffixes and infixes mostly define punctuation rules – for
example, when to split off periods (at the end of a sentence), and when to leave
tokens containing periods intact (abbreviations like "U.S.").
@@ -688,36 +644,53 @@ this specific field. Here's how to add a special case rule to an existing
```python
### {executable="true"}
import spacy
-from spacy.symbols import ORTH
+from spacy.symbols import ORTH, LEMMA, POS, TAG
nlp = spacy.load("en_core_web_sm")
-doc = nlp("gimme that") # phrase to tokenize
+doc = nlp(u"gimme that") # phrase to tokenize
print([w.text for w in doc]) # ['gimme', 'that']
-# Add special case rule
-special_case = [{ORTH: "gim"}, {ORTH: "me"}]
-nlp.tokenizer.add_special_case("gimme", special_case)
+# add special case rule
+special_case = [{ORTH: u"gim", LEMMA: u"give", POS: u"VERB"}, {ORTH: u"me"}]
+nlp.tokenizer.add_special_case(u"gimme", special_case)
-# Check new tokenization
-print([w.text for w in nlp("gimme that")]) # ['gim', 'me', 'that']
+# check new tokenization
+print([w.text for w in nlp(u"gimme that")]) # ['gim', 'me', 'that']
+
+# Pronoun lemma is returned as -PRON-!
+print([w.lemma_ for w in nlp(u"gimme that")]) # ['give', '-PRON-', 'that']
```
+
+
+For details on spaCy's custom pronoun lemma `-PRON-`,
+[see here](/usage/#pron-lemma).
+
+
+
The special case doesn't have to match an entire whitespace-delimited substring.
The tokenizer will incrementally split off punctuation, and keep looking up the
remaining substring:
```python
-assert "gimme" not in [w.text for w in nlp("gimme!")]
-assert "gimme" not in [w.text for w in nlp('("...gimme...?")')]
+assert "gimme" not in [w.text for w in nlp(u"gimme!")]
+assert "gimme" not in [w.text for w in nlp(u'("...gimme...?")')]
```
The special case rules have precedence over the punctuation splitting:
```python
-nlp.tokenizer.add_special_case("...gimme...?", [{ORTH: "...gimme...?"}])
-assert len(nlp("...gimme...?")) == 1
+special_case = [{ORTH: u"...gimme...?", LEMMA: u"give", TAG: u"VB"}]
+nlp.tokenizer.add_special_case(u"...gimme...?", special_case)
+assert len(nlp(u"...gimme...?")) == 1
```
+Because the special-case rules allow you to set arbitrary token attributes, such
+as the part-of-speech, lemma, etc., they make a good mechanism for arbitrary
+fix-up rules. Having this logic live in the tokenizer isn't very satisfying from
+a design perspective, however, so the API may eventually be exposed on the
+[`Language`](/api/language) class itself.
+
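+For example, a small sketch building on the example above – the `"plz"` rule is
+purely illustrative:
+
+```python
+from spacy.symbols import ORTH, NORM, TAG
+
+# Attach a norm and tag to "plz" whenever it occurs as its own token
+special_case = [{ORTH: u"plz", NORM: u"please", TAG: u"UH"}]
+nlp.tokenizer.add_special_case(u"plz", special_case)
+```
+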
### How spaCy's tokenizer works {#how-tokenizer-works}
spaCy introduces a novel tokenization algorithm that gives a better balance
@@ -817,7 +790,7 @@ def custom_tokenizer(nlp):
nlp = spacy.load("en_core_web_sm")
nlp.tokenizer = custom_tokenizer(nlp)
-doc = nlp("hello-world.")
+doc = nlp(u"hello-world.")
print([t.text for t in doc])
```
@@ -934,7 +907,7 @@ class WhitespaceTokenizer(object):
nlp = spacy.load("en_core_web_sm")
nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)
-doc = nlp("What's happened to me? he thought. It wasn't a dream.")
+doc = nlp(u"What's happened to me? he thought. It wasn't a dream.")
print([t.text for t in doc])
```
@@ -959,7 +932,7 @@ from spacy.tokens import Doc
from spacy.lang.en import English
nlp = English()
-doc = Doc(nlp.vocab, words=["Hello", ",", "world", "!"],
+doc = Doc(nlp.vocab, words=[u"Hello", u",", u"world", u"!"],
spaces=[False, True, False, False])
print([(t.text, t.text_with_ws, t.whitespace_) for t in doc])
```
@@ -976,8 +949,8 @@ from spacy.tokens import Doc
from spacy.lang.en import English
nlp = English()
-bad_spaces = Doc(nlp.vocab, words=["Hello", ",", "world", "!"])
-good_spaces = Doc(nlp.vocab, words=["Hello", ",", "world", "!"],
+bad_spaces = Doc(nlp.vocab, words=[u"Hello", u",", u"world", u"!"])
+good_spaces = Doc(nlp.vocab, words=[u"Hello", u",", u"world", u"!"],
spaces=[False, True, False, False])
print(bad_spaces.text) # 'Hello , world !'
@@ -1259,7 +1232,7 @@ that yields [`Span`](/api/span) objects.
import spacy
nlp = spacy.load("en_core_web_sm")
-doc = nlp("This is a sentence. This is another sentence.")
+doc = nlp(u"This is a sentence. This is another sentence.")
for sent in doc.sents:
print(sent.text)
```
@@ -1279,7 +1252,7 @@ from spacy.lang.en import English
nlp = English() # just the language with no model
sentencizer = nlp.create_pipe("sentencizer")
nlp.add_pipe(sentencizer)
-doc = nlp("This is a sentence. This is another sentence.")
+doc = nlp(u"This is a sentence. This is another sentence.")
for sent in doc.sents:
print(sent.text)
```
@@ -1315,7 +1288,7 @@ take advantage of dependency-based sentence segmentation.
### {executable="true"}
import spacy
-text = "this is a sentence...hello...and another sentence."
+text = u"this is a sentence...hello...and another sentence."
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)
diff --git a/website/docs/usage/models.md b/website/docs/usage/models.md
index c9b22279d..5df4ab458 100644
--- a/website/docs/usage/models.md
+++ b/website/docs/usage/models.md
@@ -106,7 +106,7 @@ python -m spacy download en_core_web_sm
python -m spacy download en
# Download exact model version (doesn't create shortcut link)
-python -m spacy download en_core_web_sm-2.2.0 --direct
+python -m spacy download en_core_web_sm-2.1.0 --direct
```
The download command will [install the model](/usage/models#download-pip) via
@@ -120,7 +120,7 @@ python -m spacy download en_core_web_sm
```python
import spacy
nlp = spacy.load("en_core_web_sm")
-doc = nlp("This is a sentence.")
+doc = nlp(u"This is a sentence.")
```
@@ -145,10 +145,10 @@ click on the archive link and copy it to your clipboard.
```bash
# With external URL
-pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz
+pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.1.0/en_core_web_sm-2.1.0.tar.gz
# With local file
-pip install /Users/you/en_core_web_sm-2.2.0.tar.gz
+pip install /Users/you/en_core_web_sm-2.1.0.tar.gz
```
By default, this will install the model into your `site-packages` directory. You
@@ -173,13 +173,13 @@ model data.
```yaml
### Directory structure {highlight="7"}
-└── en_core_web_md-2.2.0.tar.gz # downloaded archive
+└── en_core_web_md-2.1.0.tar.gz # downloaded archive
├── meta.json # model meta data
├── setup.py # setup file for pip installation
└── en_core_web_md # 📦 model package
├── __init__.py # init for pip installation
├── meta.json # model meta data
- └── en_core_web_md-2.2.0 # model data
+ └── en_core_web_md-2.1.0 # model data
```
You can place the **model package directory** anywhere on your local file
@@ -197,7 +197,7 @@ nlp = spacy.load("en_core_web_sm") # load model package "en_core_web_s
nlp = spacy.load("/path/to/en_core_web_sm") # load package from a directory
nlp = spacy.load("en") # load model with shortcut link "en"
-doc = nlp("This is a sentence.")
+doc = nlp(u"This is a sentence.")
```
@@ -269,7 +269,7 @@ also `import` it and then call its `load()` method with no arguments:
import en_core_web_sm
nlp = en_core_web_sm.load()
-doc = nlp("This is a sentence.")
+doc = nlp(u"This is a sentence.")
```
How you choose to load your models ultimately depends on personal preference.
@@ -325,8 +325,8 @@ URLs.
```text
### requirements.txt
-spacy>=2.2.0,<3.0.0
-https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz#egg=en_core_web_sm
+spacy>=2.0.0,<3.0.0
+https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.1.0/en_core_web_sm-2.1.0.tar.gz#egg=en_core_web_sm
```
Specifying `#egg=` with the package name tells pip which package to expect from
diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md
index dcd182965..f3c59da7b 100644
--- a/website/docs/usage/processing-pipelines.md
+++ b/website/docs/usage/processing-pipelines.md
@@ -20,7 +20,7 @@ component** on the `Doc`, in order. It then returns the processed `Doc` that you
can work with.
```python
-doc = nlp("This is a text")
+doc = nlp(u"This is a text")
```
When processing large volumes of text, the statistical models are usually more
@@ -29,7 +29,7 @@ efficient if you let them work on batches of texts. spaCy's
processed `Doc` objects. The batching is done internally.
```diff
-texts = ["This is a text", "These are lots of texts", "..."]
+texts = [u"This is a text", u"These are lots of texts", u"..."]
- docs = [nlp(text) for text in texts]
+ docs = list(nlp.pipe(texts))
```
@@ -172,7 +172,7 @@ which is then processed by the component next in the pipeline.
```python
### The pipeline under the hood
-doc = nlp.make_doc("This is a sentence") # create a Doc from raw text
+doc = nlp.make_doc(u"This is a sentence") # create a Doc from raw text
for name, proc in nlp.pipeline: # iterate over components in order
doc = proc(doc) # apply each component
```
@@ -213,7 +213,6 @@ require them in the pipeline settings in your model's `meta.json`.
| `tagger` | [`Tagger`](/api/tagger) | Assign part-of-speech tags. |
| `parser` | [`DependencyParser`](/api/dependencyparser) | Assign dependency labels. |
| `ner` | [`EntityRecognizer`](/api/entityrecognizer) | Assign named entities. |
-| `entity_linker` | [`EntityLinker`](/api/entitylinker) | Assign knowledge base IDs to named entities. Should be added after the entity recognizer. |
| `textcat` | [`TextCategorizer`](/api/textcategorizer) | Assign text categories. |
| `entity_ruler` | [`EntityRuler`](/api/entityruler) | Assign named entities based on pattern rules. |
| `sentencizer` | [`Sentencizer`](/api/sentencizer) | Add rule-based sentence segmentation without the dependency parse. |
@@ -263,12 +262,12 @@ blocks.
### Disable for block
# 1. Use as a contextmanager
with nlp.disable_pipes("tagger", "parser"):
- doc = nlp("I won't be tagged and parsed")
-doc = nlp("I will be tagged and parsed")
+ doc = nlp(u"I won't be tagged and parsed")
+doc = nlp(u"I will be tagged and parsed")
# 2. Restore manually
disabled = nlp.disable_pipes("ner")
-doc = nlp("I won't have named entities")
+doc = nlp(u"I won't have named entities")
disabled.restore()
```
@@ -295,11 +294,11 @@ initializing a Language class via [`from_disk`](/api/language#from_disk).
```diff
- nlp = spacy.load('en', tagger=False, entity=False)
-- doc = nlp("I don't want parsed", parse=False)
+- doc = nlp(u"I don't want parsed", parse=False)
+ nlp = spacy.load("en", disable=["ner"])
+ nlp.remove_pipe("parser")
-+ doc = nlp("I don't want parsed")
++ doc = nlp(u"I don't want parsed")
```
@@ -376,7 +375,7 @@ def my_component(doc):
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe(my_component, name="print_info", last=True)
print(nlp.pipe_names) # ['tagger', 'parser', 'ner', 'print_info']
-doc = nlp("This is a sentence.")
+doc = nlp(u"This is a sentence.")
```
@@ -426,14 +425,14 @@ class EntityMatcher(object):
return doc
nlp = spacy.load("en_core_web_sm")
-terms = ("cat", "dog", "tree kangaroo", "giant sea spider")
+terms = (u"cat", u"dog", u"tree kangaroo", u"giant sea spider")
entity_matcher = EntityMatcher(nlp, terms, "ANIMAL")
nlp.add_pipe(entity_matcher, after="ner")
print(nlp.pipe_names) # The components in the pipeline
-doc = nlp("This is a text about Barack Obama and a tree kangaroo")
+doc = nlp(u"This is a text about Barack Obama and a tree kangaroo")
print([(ent.text, ent.label_) for ent in doc.ents])
```
@@ -471,7 +470,7 @@ def custom_sentencizer(doc):
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe(custom_sentencizer, before="parser") # Insert before the parser
-doc = nlp("This is. A sentence. | This is. Another sentence.")
+doc = nlp(u"This is. A sentence. | This is. Another sentence.")
for sent in doc.sents:
print(sent.text)
```
@@ -517,7 +516,7 @@ config parameters are passed all the way down from
components with custom settings:
```python
-nlp = spacy.load("your_custom_model", terms=["tree kangaroo"], label="ANIMAL")
+nlp = spacy.load("your_custom_model", terms=(u"tree kangaroo"), label="ANIMAL")
```
@@ -617,7 +616,7 @@ raise an `AttributeError`.
### Example
from spacy.tokens import Doc, Span, Token
-fruits = ["apple", "pear", "banana", "orange", "strawberry"]
+fruits = [u"apple", u"pear", u"banana", u"orange", u"strawberry"]
is_fruit_getter = lambda token: token.text in fruits
has_fruit_getter = lambda obj: any([t.text in fruits for t in obj])
@@ -629,7 +628,7 @@ Span.set_extension("has_fruit", getter=has_fruit_getter)
> #### Usage example
>
> ```python
-> doc = nlp("I have an apple and a melon")
+> doc = nlp(u"I have an apple and a melon")
> assert doc[3]._.is_fruit # get Token attributes
> assert not doc[0]._.is_fruit
> assert doc._.has_fruit # get Doc attributes
diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md
index 4c398ecd0..1d67625a5 100644
--- a/website/docs/usage/rule-based-matching.md
+++ b/website/docs/usage/rule-based-matching.md
@@ -90,7 +90,7 @@ the pattern is not going to produce any results. When developing complex
patterns, make sure to check examples against spaCy's tokenization:
```python
-doc = nlp("A complex-example,!")
+doc = nlp(u"A complex-example,!")
print([token.text for token in doc])
```
@@ -113,7 +113,7 @@ matcher = Matcher(nlp.vocab)
pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}]
matcher.add("HelloWorld", None, pattern)
-doc = nlp("Hello, world! Hello world!")
+doc = nlp(u"Hello, world! Hello world!")
matches = matcher(doc)
for match_id, start, end in matches:
string_id = nlp.vocab.strings[match_id] # Get string representation
@@ -447,7 +447,7 @@ def add_event_ent(matcher, doc, i, matches):
pattern = [{"ORTH": "Google"}, {"ORTH": "I"}, {"ORTH": "/"}, {"ORTH": "O"}]
matcher.add("GoogleIO", add_event_ent, pattern)
-doc = nlp("This is a text about Google I/O")
+doc = nlp(u"This is a text about Google I/O")
matches = matcher(doc)
```
@@ -539,7 +539,7 @@ class BadHTMLMerger(object):
nlp = spacy.load("en_core_web_sm")
html_merger = BadHTMLMerger(nlp)
nlp.add_pipe(html_merger, last=True) # Add component to the pipeline
-doc = nlp("Hello world! This is a test.")
+doc = nlp(u"Hello world! This is a test.")
for token in doc:
print(token.text, token._.bad_html)
@@ -617,7 +617,7 @@ def collect_sents(matcher, doc, i, matches):
pattern = [{"LOWER": "facebook"}, {"LEMMA": "be"}, {"POS": "ADV", "OP": "*"},
{"POS": "ADJ"}]
matcher.add("FacebookIs", collect_sents, pattern) # add pattern
-doc = nlp("I'd say that Facebook is evil. – Facebook is pretty cool, right?")
+doc = nlp(u"I'd say that Facebook is evil. – Facebook is pretty cool, right?")
matches = matcher(doc)
# Serve visualization of sentences containing match with displaCy
@@ -673,7 +673,7 @@ pattern = [{"ORTH": "("}, {"SHAPE": "ddd"}, {"ORTH": ")"}, {"SHAPE": "ddd"},
{"ORTH": "-", "OP": "?"}, {"SHAPE": "ddd"}]
matcher.add("PHONE_NUMBER", None, pattern)
-doc = nlp("Call me at (123) 456 789 or (123) 456 789!")
+doc = nlp(u"Call me at (123) 456 789 or (123) 456 789!")
print([t.text for t in doc])
matches = matcher(doc)
for match_id, start, end in matches:
@@ -719,8 +719,8 @@ from spacy.matcher import Matcher
nlp = English() # We only want the tokenizer, so no need to load a model
matcher = Matcher(nlp.vocab)
-pos_emoji = ["😀", "😃", "😂", "🤣", "😊", "😍"] # Positive emoji
-neg_emoji = ["😞", "😠", "😩", "😢", "😭", "😒"] # Negative emoji
+pos_emoji = [u"😀", u"😃", u"😂", u"🤣", u"😊", u"😍"] # Positive emoji
+neg_emoji = [u"😞", u"😠", u"😩", u"😢", u"😭", u"😒"] # Negative emoji
# Add patterns to match one or more emoji tokens
pos_patterns = [[{"ORTH": emoji}] for emoji in pos_emoji]
@@ -740,7 +740,7 @@ matcher.add("SAD", label_sentiment, *neg_patterns) # Add negative pattern
# Add pattern for valid hashtag, i.e. '#' plus any ASCII token
matcher.add("HASHTAG", None, [{"ORTH": "#"}, {"IS_ASCII": True}])
-doc = nlp("Hello world 😀 #MondayMotivation")
+doc = nlp(u"Hello world 😀 #MondayMotivation")
matches = matcher(doc)
for match_id, start, end in matches:
string_id = doc.vocab.strings[match_id] # Look up string ID
@@ -797,7 +797,7 @@ matcher.add("HASHTAG", None, [{"ORTH": "#"}, {"IS_ASCII": True}])
# Register token extension
Token.set_extension("is_hashtag", default=False)
-doc = nlp("Hello world 😀 #MondayMotivation")
+doc = nlp(u"Hello world 😀 #MondayMotivation")
matches = matcher(doc)
hashtags = []
for match_id, start, end in matches:
@@ -838,13 +838,13 @@ from spacy.matcher import PhraseMatcher
nlp = spacy.load('en_core_web_sm')
matcher = PhraseMatcher(nlp.vocab)
-terms = ["Barack Obama", "Angela Merkel", "Washington, D.C."]
+terms = [u"Barack Obama", u"Angela Merkel", u"Washington, D.C."]
# Only run nlp.make_doc to speed things up
patterns = [nlp.make_doc(text) for text in terms]
matcher.add("TerminologyList", None, *patterns)
-doc = nlp("German Chancellor Angela Merkel and US President Barack Obama "
- "converse in the Oval Office inside the White House in Washington, D.C.")
+doc = nlp(u"German Chancellor Angela Merkel and US President Barack Obama "
+ u"converse in the Oval Office inside the White House in Washington, D.C.")
matches = matcher(doc)
for match_id, start, end in matches:
span = doc[start:end]
@@ -853,8 +853,8 @@ for match_id, start, end in matches:
Since spaCy is used for processing both the patterns and the text to be matched,
you won't have to worry about specific tokenization – for example, you can
-simply pass in `nlp("Washington, D.C.")` and won't have to write a complex token
-pattern covering the exact tokenization of the term.
+simply pass in `nlp(u"Washington, D.C.")` and won't have to write a complex
+token pattern covering the exact tokenization of the term.
@@ -889,10 +889,10 @@ from spacy.matcher import PhraseMatcher
nlp = English()
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
-patterns = [nlp.make_doc(name) for name in ["Angela Merkel", "Barack Obama"]]
+patterns = [nlp.make_doc(name) for name in [u"Angela Merkel", u"Barack Obama"]]
matcher.add("Names", None, *patterns)
-doc = nlp("angela merkel and us president barack Obama")
+doc = nlp(u"angela merkel and us president barack Obama")
for match_id, start, end in matcher(doc):
print("Matched based on lowercase token text:", doc[start:end])
```
@@ -924,9 +924,9 @@ from spacy.matcher import PhraseMatcher
nlp = English()
matcher = PhraseMatcher(nlp.vocab, attr="SHAPE")
-matcher.add("IP", None, nlp("127.0.0.1"), nlp("127.127.0.0"))
+matcher.add("IP", None, nlp(u"127.0.0.1"), nlp(u"127.127.0.0"))
-doc = nlp("Often the router will have an IP address such as 192.168.1.1 or 192.168.2.1.")
+doc = nlp(u"Often the router will have an IP address such as 192.168.1.1 or 192.168.2.1.")
for match_id, start, end in matcher(doc):
print("Matched based on token shape:", doc[start:end])
```
@@ -982,7 +982,7 @@ patterns = [{"label": "ORG", "pattern": "Apple"},
ruler.add_patterns(patterns)
nlp.add_pipe(ruler)
-doc = nlp("Apple is opening its first big office in San Francisco.")
+doc = nlp(u"Apple is opening its first big office in San Francisco.")
print([(ent.text, ent.label_) for ent in doc.ents])
```
@@ -1006,7 +1006,7 @@ patterns = [{"label": "ORG", "pattern": "MyCorp Inc."}]
ruler.add_patterns(patterns)
nlp.add_pipe(ruler)
-doc = nlp("MyCorp Inc. is a company in the U.S.")
+doc = nlp(u"MyCorp Inc. is a company in the U.S.")
print([(ent.text, ent.label_) for ent in doc.ents])
```
diff --git a/website/docs/usage/saving-loading.md b/website/docs/usage/saving-loading.md
index 3d904f01a..81e90dcc7 100644
--- a/website/docs/usage/saving-loading.md
+++ b/website/docs/usage/saving-loading.md
@@ -59,45 +59,12 @@ initializes the language class, creates and adds the pipeline components and
_then_ loads in the binary data. You can read more about this process
[here](/usage/processing-pipelines#pipelines).
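+
+Roughly, that process looks like the following sketch – the variable names here
+are only illustrative, and the actual values come from the model's `meta.json`:
+
+```python
+import spacy
+
+lang_cls = spacy.util.get_lang_class(lang)  # get Language class, e.g. English
+nlp = lang_cls()                            # initialize it
+for name in pipeline:                       # e.g. ["tagger", "parser", "ner"]
+    component = nlp.create_pipe(name)       # create each pipeline component
+    nlp.add_pipe(component)                 # add it to the pipeline
+nlp.from_disk(model_data_path)              # load in the binary data
+```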
-### Serializing Doc objects efficiently {#docs new="2.2"}
-
-If you're working with lots of data, you'll probably need to pass analyses
-between machines, either to use something like [Dask](https://dask.org) or
-[Spark](https://spark.apache.org), or even just to save out work to disk. Often
-it's sufficient to use the [`Doc.to_array`](/api/doc#to_array) functionality for
-this, and just serialize the numpy arrays – but other times you want a more
-general way to save and restore `Doc` objects.
-
-The [`DocBin`](/api/docbin) class makes it easy to serialize and deserialize a
-collection of `Doc` objects together, and is much more efficient than calling
-[`Doc.to_bytes`](/api/doc#to_bytes) on each individual `Doc` object. You can
-also control what data gets saved, and you can merge pallets together for easy
-map/reduce-style processing.
-
-```python
-### {highlight="4,8,9,13,14"}
-import spacy
-from spacy.tokens import DocBin
-
-doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"], store_user_data=True)
-texts = ["Some text", "Lots of texts...", "..."]
-nlp = spacy.load("en_core_web_sm")
-for doc in nlp.pipe(texts):
- doc_bin.add(doc)
-bytes_data = docbin.to_bytes()
-
-# Deserialize later, e.g. in a new process
-nlp = spacy.blank("en")
-doc_bin = DocBin().from_bytes(bytes_data)
-docs = list(doc_bin.get_docs(nlp.vocab))
-```
-
### Using Pickle {#pickle}
> #### Example
>
> ```python
-> doc = nlp("This is a text.")
+> doc = nlp(u"This is a text.")
> data = pickle.dumps(doc)
> ```
@@ -117,8 +84,8 @@ the _same_ `Vocab` object, it will only be included once.
```python
### Pickling objects with shared data {highlight="8-9"}
-doc1 = nlp("Hello world")
-doc2 = nlp("This is a test")
+doc1 = nlp(u"Hello world")
+doc2 = nlp(u"This is a test")
doc1_data = pickle.dumps(doc1)
doc2_data = pickle.dumps(doc2)
@@ -271,31 +238,13 @@ custom components to spaCy automatically.
## Using entry points {#entry-points new="2.1"}
-Entry points let you expose parts of a Python package you write to other Python
-packages. This lets one application easily customize the behavior of another, by
-exposing an entry point in its `setup.py`. For a quick and fun intro to entry
-points in Python, check out
-[this excellent blog post](https://amir.rachum.com/blog/2017/07/28/python-entry-points/).
-spaCy can load custom function from several different entry points to add
-pipeline component factories, language classes and other settings. To make spaCy
-use your entry points, your package needs to expose them and it needs to be
-installed in the same environment – that's it.
-
-| Entry point | Description |
-| ------------------------------------------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| [`spacy_factories`](#entry-points-components) | Group of entry points for pipeline component factories to add to [`Language.factories`](/usage/processing-pipelines#custom-components-factories), keyed by component name. |
-| [`spacy_languages`](#entry-points-languages) | Group of entry points for custom [`Language` subclasses](/usage/adding-languages), keyed by language shortcut. |
-| [`spacy_displacy_colors`](#entry-points-displacy) 2.2 | Group of entry points of custom label colors for the [displaCy visualizer](/usage/visualizers#ent). The key name doesn't matter, but it should point to a dict of labels and color values. Useful for custom models that predict different entity types. |
-
-### Custom components via entry points {#entry-points-components}
-
When you load a model, spaCy will generally use the model's `meta.json` to set
up the language class and construct the pipeline. The pipeline is specified as a
list of strings, e.g. `"pipeline": ["tagger", "parser", "ner"]`. For each of
those strings, spaCy will call `nlp.create_pipe` and look up the name in the
-[built-in factories](/usage/processing-pipelines#custom-components-factories).
-If your model wanted to specify its own custom components, you usually have to
-write to `Language.factories` _before_ loading the model.
+[built-in factories](#custom-components-factories). If your model wanted to
+specify its own custom components, you usually have to write to
+`Language.factories` _before_ loading the model.
```python
pipe = nlp.create_pipe("custom_component") # fails 👎
@@ -311,11 +260,13 @@ added to the built-in factories when the `Language` class is initialized. If a
package in the same environment exposes spaCy entry points, all of this happens
automatically and no further user action is required.
-To stick with the theme of
-[this entry points blog post](https://amir.rachum.com/blog/2017/07/28/python-entry-points/),
-consider the following custom spaCy extension which is initialized with the
-shared `nlp` object and will print a snake when it's called as a pipeline
-component.
+#### Custom components via entry points {#entry-points-components}
+
+For a quick and fun intro to entry points in Python, I recommend
+[this excellent blog post](https://amir.rachum.com/blog/2017/07/28/python-entry-points/).
+To stick with the theme of the post, consider the following custom spaCy
+extension which is initialized with the shared `nlp` object and will print a
+snake when it's called as a pipeline component.
> #### Package directory structure
>
@@ -353,13 +304,15 @@ entry to the factories, you can now expose it in your `setup.py` via the
`entry_points` dictionary:
```python
-### setup.py {highlight="5-7"}
+### setup.py {highlight="5-8"}
from setuptools import setup
setup(
name="snek",
entry_points={
- "spacy_factories": ["snek = snek:SnekFactory"]
+ "spacy_factories": [
+ "snek = snek:SnekFactory"
+ ]
}
)
```
@@ -380,7 +333,7 @@ spaCy is now able to create the pipeline component `'snek'`:
>>> nlp = English()
>>> snek = nlp.create_pipe("snek") # this now works! 🐍🎉
>>> nlp.add_pipe(snek)
->>> doc = nlp("I am snek")
+>>> doc = nlp(u"I am snek")
--..,_ _,.--.
`'.'. .'`__ o `;__.
'.'. .'.'` '---'` `
@@ -457,7 +410,7 @@ The above example will serialize the current snake in a `snek.txt` in the model
data directory. When a model using the `snek` component is loaded, it will open
the `snek.txt` and make it available to the component.
-### Custom language classes via entry points {#entry-points-languages}
+#### Custom language classes via entry points {#entry-points-languages}
To stay with the theme of the previous example and
[this blog post on entry points](https://amir.rachum.com/blog/2017/07/28/python-entry-points/),
@@ -493,8 +446,12 @@ from setuptools import setup
setup(
name="snek",
entry_points={
- "spacy_factories": ["snek = snek:SnekFactory"],
-+ "spacy_languages": ["snk = snek:SnekLanguage"]
+ "spacy_factories": [
+ "snek = snek:SnekFactory"
+            ],
++ "spacy_languages": [
++                "snk = snek:SnekLanguage"
++ ]
}
)
```
@@ -524,50 +481,6 @@ SnekLanguage = get_lang_class("snk")
nlp = SnekLanguage()
```
-### Custom displaCy colors via entry points {#entry-points-displacy new="2.2"}
-
-If you're training a named entity recognition model for a custom domain, you may
-end up training different labels that don't have pre-defined colors in the
-[`displacy` visualizer](/usage/visualizers#ent). The `spacy_displacy_colors`
-entry point lets you define a dictionary of entity labels mapped to their color
-values. It's added to the pre-defined colors and can also overwrite existing
-values.
-
-> #### Domain-specific NER labels
->
-> Good examples of models with domain-specific label schemes are
-> [scispaCy](/universe/project/scispacy) and
-> [Blackstone](/universe/project/blackstone).
-
-```python
-### snek.py
-displacy_colors = {"SNEK": "#3dff74", "HUMAN": "#cfc5ff"}
-```
-
-Given the above colors, the entry point can be defined as follows. Entry points
-need to have a name, so we use the key `colors`. However, the name doesn't
-matter and whatever is defined in the entry point group will be used.
-
-```diff
-### setup.py
-from setuptools import setup
-
-setup(
- name="snek",
- entry_points={
-+ "spacy_displacy_colors": ["colors = snek:displacy_colors"]
- }
-)
-```
-
-After installing the package, the the custom colors will be used when
-visualizing text with `displacy`. Whenever the label `SNEK` is assigned, it will
-be displayed in `#3dff74`.
-
-import DisplaCyEntSnekHtml from 'images/displacy-ent-snek.html'
-
-
-
## Saving, loading and distributing models {#models}
After training your model, you'll usually want to save its state, and load it
diff --git a/website/docs/usage/spacy-101.md b/website/docs/usage/spacy-101.md
index 4bfecb3a9..03feb03b1 100644
--- a/website/docs/usage/spacy-101.md
+++ b/website/docs/usage/spacy-101.md
@@ -122,7 +122,6 @@ related to more general machine learning functionality.
| **Lemmatization** | Assigning the base forms of words. For example, the lemma of "was" is "be", and the lemma of "rats" is "rat". |
| **Sentence Boundary Detection** (SBD) | Finding and segmenting individual sentences. |
| **Named Entity Recognition** (NER) | Labelling named "real-world" objects, like persons, companies or locations. |
-| **Entity Linking** (EL) | Disambiguating textual entities to unique identifiers in a Knowledge Base. |
| **Similarity** | Comparing words, text spans and documents and how similar they are to each other. |
| **Text Classification** | Assigning categories or labels to a whole document, or parts of a document. |
| **Rule-based Matching** | Finding sequences of tokens based on their texts and linguistic annotations, similar to regular expressions. |
@@ -179,7 +178,7 @@ processed `Doc`:
import spacy
nlp = spacy.load("en_core_web_sm")
-doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
+doc = nlp(u"Apple is looking at buying U.K. startup for $1 billion")
for token in doc:
print(token.text, token.pos_, token.dep_)
```
@@ -298,8 +297,8 @@ its hash, or a hash to get its string:
import spacy
nlp = spacy.load("en_core_web_sm")
-doc = nlp("I love coffee")
-print(doc.vocab.strings["coffee"]) # 3197928453018144401
+doc = nlp(u"I love coffee")
+print(doc.vocab.strings[u"coffee"]) # 3197928453018144401
print(doc.vocab.strings[3197928453018144401]) # 'coffee'
```
@@ -322,7 +321,7 @@ ever change. Its hash value will also always be the same.
import spacy
nlp = spacy.load("en_core_web_sm")
-doc = nlp("I love coffee")
+doc = nlp(u"I love coffee")
for word in doc:
lexeme = doc.vocab[word.text]
print(lexeme.text, lexeme.orth, lexeme.shape_, lexeme.prefix_, lexeme.suffix_,
@@ -363,14 +362,14 @@ from spacy.tokens import Doc
from spacy.vocab import Vocab
nlp = spacy.load("en_core_web_sm")
-doc = nlp("I love coffee") # Original Doc
-print(doc.vocab.strings["coffee"]) # 3197928453018144401
+doc = nlp(u"I love coffee") # Original Doc
+print(doc.vocab.strings[u"coffee"]) # 3197928453018144401
print(doc.vocab.strings[3197928453018144401]) # 'coffee' 👍
empty_doc = Doc(Vocab()) # New Doc with empty Vocab
# empty_doc.vocab.strings[3197928453018144401] will raise an error :(
-empty_doc.vocab.strings.add("coffee") # Add "coffee" and generate hash
+empty_doc.vocab.strings.add(u"coffee") # Add "coffee" and generate hash
print(empty_doc.vocab.strings[3197928453018144401]) # 'coffee' 👍
new_doc = Doc(doc.vocab) # Create new doc with first doc's vocab
@@ -384,79 +383,6 @@ spaCy will also export the `Vocab` when you save a `Doc` or `nlp` object. This
will give you the object and its encoded annotations, plus the "key" to decode
it.
-## Knowledge Base {#kb}
-
-To support the entity linking task, spaCy stores external knowledge in a
-[`KnowledgeBase`](/api/kb). The knowledge base (KB) uses the `Vocab` to store
-its data efficiently.
-
-> - **Mention**: A textual occurrence of a named entity, e.g. 'Miss Lovelace'.
-> - **KB ID**: A unique identifier refering to a particular real-world concept,
-> e.g. 'Q7259'.
-> - **Alias**: A plausible synonym or description for a certain KB ID, e.g. 'Ada
-> Lovelace'.
-> - **Prior probability**: The probability of a certain mention resolving to a
-> certain KB ID, prior to knowing anything about the context in which the
-> mention is used.
-> - **Entity vector**: A pretrained word vector capturing the entity
-> description.
-
-A knowledge base is created by first adding all entities to it. Next, for each
-potential mention or alias, a list of relevant KB IDs and their prior
-probabilities is added. The sum of these prior probabilities should never exceed
-1 for any given alias.
-
-```python
-### {executable="true"}
-import spacy
-from spacy.kb import KnowledgeBase
-
-# load the model and create an empty KB
-nlp = spacy.load('en_core_web_sm')
-kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
-
-# adding entities
-kb.add_entity(entity="Q1004791", freq=6, entity_vector=[0, 3, 5])
-kb.add_entity(entity="Q42", freq=342, entity_vector=[1, 9, -3])
-kb.add_entity(entity="Q5301561", freq=12, entity_vector=[-2, 4, 2])
-
-# adding aliases
-kb.add_alias(alias="Douglas", entities=["Q1004791", "Q42", "Q5301561"], probabilities=[0.6, 0.1, 0.2])
-kb.add_alias(alias="Douglas Adams", entities=["Q42"], probabilities=[0.9])
-
-print()
-print("Number of entities in KB:",kb.get_size_entities()) # 3
-print("Number of aliases in KB:", kb.get_size_aliases()) # 2
-```
-
-### Candidate generation
-
-Given a textual entity, the Knowledge Base can provide a list of plausible
-candidates or entity identifiers. The [`EntityLinker`](/api/entitylinker) will
-take this list of candidates as input, and disambiguate the mention to the most
-probable identifier, given the document context.
-
-```python
-### {executable="true"}
-import spacy
-from spacy.kb import KnowledgeBase
-
-nlp = spacy.load('en_core_web_sm')
-kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
-
-# adding entities
-kb.add_entity(entity="Q1004791", freq=6, entity_vector=[0, 3, 5])
-kb.add_entity(entity="Q42", freq=342, entity_vector=[1, 9, -3])
-kb.add_entity(entity="Q5301561", freq=12, entity_vector=[-2, 4, 2])
-
-# adding aliases
-kb.add_alias(alias="Douglas", entities=["Q1004791", "Q42", "Q5301561"], probabilities=[0.6, 0.1, 0.2])
-
-candidates = kb.get_candidates("Douglas")
-for c in candidates:
- print(" ", c.entity_, c.prior_prob, c.entity_vector)
-```
-
## Serialization {#serialization}
import Serialization101 from 'usage/101/\_serialization.md'
@@ -515,11 +441,11 @@ python -m spacy download de_core_news_sm
import spacy
nlp = spacy.load("en_core_web_sm")
-doc = nlp("Hello, world. Here are two sentences.")
+doc = nlp(u"Hello, world. Here are two sentences.")
print([t.text for t in doc])
nlp_de = spacy.load("de_core_news_sm")
-doc_de = nlp_de("Ich bin ein Berliner.")
+doc_de = nlp_de(u"Ich bin ein Berliner.")
print([t.text for t in doc_de])
```
@@ -538,8 +464,8 @@ print([t.text for t in doc_de])
import spacy
nlp = spacy.load("en_core_web_sm")
-doc = nlp("Peach emoji is where it has always been. Peach is the superior "
- "emoji. It's outranking eggplant 🍑 ")
+doc = nlp(u"Peach emoji is where it has always been. Peach is the superior "
+ u"emoji. It's outranking eggplant 🍑 ")
print(doc[0].text) # 'Peach'
print(doc[1].text) # 'emoji'
print(doc[-1].text) # '🍑'
@@ -567,7 +493,7 @@ print(sentences[1].text) # 'Peach is the superior emoji.'
import spacy
nlp = spacy.load("en_core_web_sm")
-doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
+doc = nlp(u"Apple is looking at buying U.K. startup for $1 billion")
apple = doc[0]
print("Fine-grained POS tag", apple.pos_, apple.pos)
print("Coarse-grained POS tag", apple.tag_, apple.tag)
@@ -595,20 +521,20 @@ print("Like an email address?", billion.like_email)
import spacy
nlp = spacy.load("en_core_web_sm")
-doc = nlp("I love coffee")
+doc = nlp(u"I love coffee")
-coffee_hash = nlp.vocab.strings["coffee"] # 3197928453018144401
+coffee_hash = nlp.vocab.strings[u"coffee"] # 3197928453018144401
coffee_text = nlp.vocab.strings[coffee_hash] # 'coffee'
print(coffee_hash, coffee_text)
print(doc[2].orth, coffee_hash) # 3197928453018144401
print(doc[2].text, coffee_text) # 'coffee'
-beer_hash = doc.vocab.strings.add("beer") # 3073001599257881079
+beer_hash = doc.vocab.strings.add(u"beer") # 3073001599257881079
beer_text = doc.vocab.strings[beer_hash] # 'beer'
print(beer_hash, beer_text)
-unicorn_hash = doc.vocab.strings.add("🦄") # 18234233413267120783
-unicorn_text = doc.vocab.strings[unicorn_hash] # '🦄'
+unicorn_hash = doc.vocab.strings.add(u"🦄 ") # 18234233413267120783
+unicorn_text = doc.vocab.strings[unicorn_hash] # '🦄 '
print(unicorn_hash, unicorn_text)
```
@@ -624,17 +550,19 @@ print(unicorn_hash, unicorn_text)
```python
### {executable="true"}
import spacy
-from spacy.tokens import Span
nlp = spacy.load("en_core_web_sm")
-doc = nlp("San Francisco considers banning sidewalk delivery robots")
+doc = nlp(u"San Francisco considers banning sidewalk delivery robots")
for ent in doc.ents:
print(ent.text, ent.start_char, ent.end_char, ent.label_)
-doc = nlp("FB is hiring a new VP of global policy")
-doc.ents = [Span(doc, 0, 1, label="ORG")]
+from spacy.tokens import Span
+
+doc = nlp(u"FB is hiring a new VP of global policy")
+doc.ents = [Span(doc, 0, 1, label=doc.vocab.strings[u"ORG"])]
for ent in doc.ents:
print(ent.text, ent.start_char, ent.end_char, ent.label_)
+
```
@@ -650,7 +578,7 @@ import spacy
import random
nlp = spacy.load("en_core_web_sm")
-train_data = [("Uber blew through $1 million", {"entities": [(0, 4, "ORG")]})]
+train_data = [(u"Uber blew through $1 million", {"entities": [(0, 4, "ORG")]})]
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
with nlp.disable_pipes(*other_pipes):
@@ -678,11 +606,11 @@ nlp.to_disk("/model")
```python
from spacy import displacy
-doc_dep = nlp("This is a sentence.")
+doc_dep = nlp(u"This is a sentence.")
displacy.serve(doc_dep, style="dep")
-doc_ent = nlp("When Sebastian Thrun started working on self-driving cars at Google "
- "in 2007, few people outside of the company took him seriously.")
+doc_ent = nlp(u"When Sebastian Thrun started working on self-driving cars at Google "
+ u"in 2007, few people outside of the company took him seriously.")
displacy.serve(doc_ent, style="ent")
```
@@ -700,7 +628,7 @@ displacy.serve(doc_ent, style="ent")
import spacy
nlp = spacy.load("en_core_web_md")
-doc = nlp("Apple and banana are similar. Pasta and hippo aren't.")
+doc = nlp(u"Apple and banana are similar. Pasta and hippo aren't.")
apple = doc[0]
banana = doc[2]
@@ -762,7 +690,7 @@ pattern2 = [[{"ORTH": emoji, "OP": "+"}] for emoji in ["😀", "😂", "🤣", "
matcher.add("GoogleIO", None, pattern1) # Match "Google I/O" or "Google i/o"
matcher.add("HAPPY", set_sentiment, *pattern2) # Match one or more happy emoji
-doc = nlp("A text about Google I/O 😀😀")
+doc = nlp(u"A text about Google I/O 😀😀")
matches = matcher(doc)
for match_id, start, end in matches:
@@ -782,7 +710,7 @@ print("Sentiment", doc.sentiment)
### Minibatched stream processing {#lightning-tour-minibatched}
```python
-texts = ["One document.", "...", "Lots of documents"]
+texts = [u"One document.", u"...", u"Lots of documents"]
# .pipe streams input, and produces streaming output
iter_texts = (texts[i % 3] for i in range(100000000))
for i, doc in enumerate(nlp.pipe(iter_texts, batch_size=50)):
@@ -798,8 +726,8 @@ for i, doc in enumerate(nlp.pipe(iter_texts, batch_size=50)):
import spacy
nlp = spacy.load("en_core_web_sm")
-doc = nlp("When Sebastian Thrun started working on self-driving cars at Google "
- "in 2007, few people outside of the company took him seriously.")
+doc = nlp(u"When Sebastian Thrun started working on self-driving cars at Google "
+ u"in 2007, few people outside of the company took him seriously.")
dep_labels = []
for token in doc:
@@ -824,7 +752,7 @@ import spacy
from spacy.attrs import ORTH, LIKE_URL
nlp = spacy.load("en_core_web_sm")
-doc = nlp("Check out https://spacy.io")
+doc = nlp(u"Check out https://spacy.io")
for token in doc:
print(token.text, token.orth, token.like_url)
@@ -870,7 +798,7 @@ def put_spans_around_tokens(doc):
nlp = spacy.load("en_core_web_sm")
-doc = nlp("This is a test.\\n\\nHello world.")
+doc = nlp(u"This is a test.\\n\\nHello world.")
html = put_spans_around_tokens(doc)
print(html)
```
diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md
index f84fd0ed4..b84bf4e12 100644
--- a/website/docs/usage/training.md
+++ b/website/docs/usage/training.md
@@ -6,14 +6,12 @@ menu:
- ['NER', 'ner']
- ['Tagger & Parser', 'tagger-parser']
- ['Text Classification', 'textcat']
- - ['Entity Linking', 'entity-linker']
- ['Tips and Advice', 'tips']
---
This guide describes how to train new statistical models for spaCy's
-part-of-speech tagger, named entity recognizer, dependency parser, text
-classifier and entity linker. Once the model is trained, you can then
-[save and load](/usage/saving-loading#models) it.
+part-of-speech tagger, named entity recognizer and dependency parser. Once the
+model is trained, you can then [save and load](/usage/saving-loading#models) it.
## Training basics {#basics}
@@ -41,19 +39,6 @@ mkdir models
python -m spacy train es models ancora-json/es_ancora-ud-train.json ancora-json/es_ancora-ud-dev.json
```
-
-
-If you're running spaCy v2.2 or above, you can use the
-[`debug-data` command](/api/cli#debug-data) to analyze and validate your
-training and development data, get useful stats, and find problems like invalid
-entity annotations, cyclic dependencies, low data labels and more.
-
-```bash
-$ python -m spacy debug-data en train.json dev.json --verbose
-```
-
-
-
You can also use the [`gold.docs_to_json`](/api/goldparse#docs_to_json) helper
to convert a list of `Doc` objects to spaCy's JSON training format.
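For illustration, a minimal sketch of that conversion step (assuming spaCy v2.x, with the bundled `srsly` package used here to write the JSON file):

```python
import spacy
import srsly
from spacy.gold import docs_to_json

nlp = spacy.load("en_core_web_sm")
docs = list(nlp.pipe([u"Uber blew through $1 million", u"Google rebrands its business apps"]))

# Convert the Docs to spaCy's JSON training format and write them to disk.
# The training format is a list of documents, hence the wrapping list.
json_data = docs_to_json(docs)
srsly.write_json("./train.json", [json_data])
```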
@@ -236,10 +221,10 @@ of being dropped.
> - [`begin_training()`](/api/language#begin_training): Start the training and
> return an optimizer function to update the model's weights. Can take an
-> optional function converting the training data to spaCy's training format.
-> - [`update()`](/api/language#update): Update the model with the training
-> example and gold data.
-> - [`to_disk()`](/api/language#to_disk): Save the updated model to a directory.
+> optional function converting the training data to spaCy's training format.
+> - [`update()`](/api/language#update): Update the model with the training
+>   example and gold data.
+> - [`to_disk()`](/api/language#to_disk): Save the updated model to a directory.
```python
### Example training loop
@@ -298,10 +283,10 @@ imports. It also makes it easier to structure and load your training data.
```python
### Simple training loop
TRAIN_DATA = [
- ("Uber blew through $1 million a week", {"entities": [(0, 4, "ORG")]}),
- ("Google rebrands its business apps", {"entities": [(0, 6, "ORG")]})]
+ (u"Uber blew through $1 million a week", {"entities": [(0, 4, "ORG")]}),
+ (u"Google rebrands its business apps", {"entities": [(0, 6, "ORG")]})]
-nlp = spacy.blank("en")
+nlp = spacy.blank('en')
optimizer = nlp.begin_training()
for i in range(20):
random.shuffle(TRAIN_DATA)
@@ -498,7 +483,7 @@ like this:
![Custom dependencies](../images/displacy-custom-parser.svg)
```python
-doc = nlp("find a hotel with good wifi")
+doc = nlp(u"find a hotel with good wifi")
print([(t.text, t.dep_, t.head.text) for t in doc if t.dep_ != '-'])
# [('find', 'ROOT', 'find'), ('hotel', 'PLACE', 'find'),
# ('good', 'QUALITY', 'wifi'), ('wifi', 'ATTRIBUTE', 'hotel')]
@@ -596,76 +581,6 @@ https://github.com/explosion/spaCy/tree/master/examples/training/train_textcat.p
7. **Save** the trained model using [`nlp.to_disk`](/api/language#to_disk).
8. **Test** the model to make sure the text classifier works as expected.
-## Entity linking {#entity-linker}
-
-To train an entity linking model, you first need to define a knowledge base
-(KB).
-
-### Creating a knowledge base {#kb}
-
-A KB consists of a list of entities with unique identifiers. Each such entity
-has an entity vector that will be used to measure similarity with the context in
-which an entity is used. These vectors are pretrained and stored in the KB
-before the entity linking model will be trained.
-
-The following example shows how to build a knowledge base from scratch, given a
-list of entities and potential aliases. The script further demonstrates how to
-pretrain and store the entity vectors. To run this example, the script needs
-access to a `vocab` instance or an `nlp` model with pretrained word embeddings.
-
-```python
-https://github.com/explosion/spaCy/tree/master/examples/training/pretrain_kb.py
-```
-
-#### Step by step guide {#step-by-step-kb}
-
-1. **Load the model** you want to start with, or create an **empty model** using
- [`spacy.blank`](/api/top-level#spacy.blank) with the ID of your language and
- a pre-defined [`vocab`](/api/vocab) object.
-2. **Pretrain the entity embeddings** by running the descriptions of the
- entities through a simple encoder-decoder network. The current implementation
- requires the `nlp` model to have access to pre-trained word embeddings, but a
- custom implementation of this encoding step can also be used.
-3. **Construct the KB** by defining all entities with their pretrained vectors,
- and all aliases with their prior probabilities.
-4. **Save** the KB using [`kb.dump`](/api/kb#dump).
-5. **Test** the KB to make sure the entities were added correctly.
-
-### Training an entity linking model {#entity-linker-model}
-
-This example shows how to create an entity linker pipe using a previously
-created knowledge base. The entity linker pipe is then trained with your own
-examples. To do so, you'll need to provide **example texts**, and the
-**character offsets** and **knowledge base identifiers** of each entity
-contained in the texts.
-
-```python
-https://github.com/explosion/spaCy/tree/master/examples/training/train_entity_linker.py
-```
-
-#### Step by step guide {#step-by-step-entity-linker}
-
-1. **Load the KB** you want to start with, and specify the path to the `Vocab`
- object that was used to create this KB. Then, create an **empty model** using
- [`spacy.blank`](/api/top-level#spacy.blank) with the ID of your language.
- Don't forget to add the KB to the entity linker, and to add the entity linker
- to the pipeline. In practical applications, you will want a more advanced
- pipeline that also includes a component for
- [named entity recognition](/usage/training#ner). If you're using a model with
- additional components, make sure to disable all other pipeline components
- during training using [`nlp.disable_pipes`](/api/language#disable_pipes).
- This way, you'll only be training the entity linker.
-2. **Shuffle and loop over** the examples. For each example, **update the
- model** by calling [`nlp.update`](/api/language#update), which steps through
- the annotated examples of the input. For each combination of a mention in
- text and a potential KB identifier, the model makes a **prediction** whether
- or not this is the correct match. It then consults the annotations to see
- whether it was right. If it was wrong, it adjusts its weights so that the
- correct combination will score higher next time.
-3. **Save** the trained model using [`nlp.to_disk`](/api/language#to_disk).
-4. **Test** the model to make sure the entities in the training data are
- recognized correctly.
-
## Optimization tips and advice {#tips}
There are lots of conflicting "recipes" for training deep neural networks at the
diff --git a/website/docs/usage/v2-1.md b/website/docs/usage/v2-1.md
index 4a8ef5a37..d3c9fb504 100644
--- a/website/docs/usage/v2-1.md
+++ b/website/docs/usage/v2-1.md
@@ -99,8 +99,8 @@ flexibility.
>
> ```python
> matcher = PhraseMatcher(nlp.vocab, attr="POS")
-> matcher.add("PATTERN", None, nlp("I love cats"))
-> doc = nlp("You like dogs")
+> matcher.add("PATTERN", None, nlp(u"I love cats"))
+> doc = nlp(u"You like dogs")
> matches = matcher(doc)
> ```
@@ -122,9 +122,9 @@ or `POS` for finding sequences of the same part-of-speech tags.
> #### Example
>
> ```python
-> doc = nlp("I like David Bowie")
+> doc = nlp(u"I like David Bowie")
> with doc.retokenize() as retokenizer:
-> attrs = {"LEMMA": "David Bowie"}
+> attrs = {"LEMMA": u"David Bowie"}
> retokenizer.merge(doc[2:4], attrs=attrs)
> ```
diff --git a/website/docs/usage/v2-2.md b/website/docs/usage/v2-2.md
deleted file mode 100644
index d256037ac..000000000
--- a/website/docs/usage/v2-2.md
+++ /dev/null
@@ -1,351 +0,0 @@
----
-title: What's New in v2.2
-teaser: New features, backwards incompatibilities and migration guide
-menu:
- - ['New Features', 'features']
- - ['Backwards Incompatibilities', 'incompat']
----
-
-## New Features {#features hidden="true"}
-
-spaCy v2.2 features improved statistical models, new pretrained models for
-Norwegian and Lithuanian, better Dutch NER, as well as a new mechanism for
-storing language data that makes the installation about **15× smaller** on
-disk. We've also added a new class to efficiently **serialize annotations**, an
-improved and **10× faster** phrase matching engine, built-in scoring and
-**CLI training for text classification**, a new command to analyze and **debug
-training data**, data augmentation during training and more. For the full
-changelog, see the
-[release notes on GitHub](https://github.com/explosion/spaCy/releases/tag/v2.2.0).
-
-
-
-### Better pretrained models and more languages {#models}
-
-> #### Example
->
-> ```bash
-> python -m spacy download nl_core_news_sm
-> python -m spacy download nb_core_news_sm
-> python -m spacy download lt_core_news_sm
-> ```
-
-The new version also features new and re-trained models for all languages and
-resolves a number of data bugs. The [Dutch model](/models/nl) has been retrained
-with a new and custom-labelled NER corpus using the same extended label scheme
-as the English models. It should now produce significantly better NER results
-overall. We've also added new core models for [Norwegian](/models/nb) (MIT) and
-[Lithuanian](/models/lt) (CC BY-SA).
-
-
-
-**Usage:** [Models directory](/models) **Benchmarks: **
-[Release notes](https://github.com/explosion/spaCy/releases/tag/v2.2.0)
-
-
-
-### Serializable lookup table and dictionary API {#lookups}
-
-> #### Example
->
-> ```python
-> data = {"foo": "bar"}
-> nlp.vocab.lookups.add_table("my_dict", data)
->
-> def custom_component(doc):
-> table = doc.vocab.lookups.get_table("my_dict")
-> print(table.get("foo")) # look something up
-> return doc
-> ```
-
-The new `Lookups` API lets you add large dictionaries and lookup tables to the
-`Vocab` and access them from the tokenizer or custom components and extension
-attributes. Internally, the tables use Bloom filters for efficient lookup
-checks. They're also fully serializable out-of-the-box. All large data resources
-included with spaCy now use this API and are additionally compressed at build
-time. This allowed us to make the installed library roughly **15 times smaller
-on disk**.
-
-
-
-**API:** [`Lookups`](/api/lookups) **Usage: **
-[Adding languages: Lemmatizer](/usage/adding-languages#lemmatizer)
-
-
-
-### Text classification scores and CLI training {#train-textcat-cli}
-
-> #### Example
->
-> ```bash
-> $ python -m spacy train en /output /train /dev \\
-> --pipeline textcat --textcat-arch simple_cnn \\
-> --textcat-multilabel
-> ```
-
-When training your models using the `spacy train` command, you can now also
-include text categories in the JSON-formatted training data. The `Scorer` and
-`nlp.evaluate` now report the text classification scores, calculated as the
-F-score on positive label for binary exclusive tasks, the macro-averaged F-score
-for 3+ exclusive labels or the macro-averaged AUC ROC score for multilabel
-classification.
-
-
-
-**API:** [`spacy train`](/api/cli#train), [`Scorer`](/api/scorer),
-[`Language.evaluate`](/api/language#evaluate)
-
-
-
-### New DocBin class to efficiently serialize Doc collections
-
-> #### Example
->
-> ```python
-> from spacy.tokens import DocBin
-> doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"], store_user_data=True)
-> for doc in nlp.pipe(texts):
-> doc_bin.add(doc)
-> bytes_data = doc_bin.to_bytes()
-> # Deserialize later, e.g. in a new process
-> nlp = spacy.blank("en")
-> doc_bin = DocBin().from_bytes(bytes_data)
-> docs = list(doc_bin.get_docs(nlp.vocab))
-> ```
-
-If you're working with lots of data, you'll probably need to pass analyses
-between machines, either to use something like [Dask](https://dask.org) or
-[Spark](https://spark.apache.org), or even just to save out work to disk. Often
-it's sufficient to use the `Doc.to_array` functionality for this, and just
-serialize the numpy arrays – but other times you want a more general way to save
-and restore `Doc` objects.
-
-The new `DocBin` class makes it easy to serialize and deserialize a collection
-of `Doc` objects together, and is much more efficient than calling
-`Doc.to_bytes` on each individual `Doc` object. You can also control what data
-gets saved, and you can merge pallets together for easy map/reduce-style
-processing.
-
-
-
-**API:** [`DocBin`](/api/docbin) **Usage: **
-[Serializing Doc objects](/usage/saving-loading#docs)
-
-
-
-### CLI command to debug and validate training data {#debug-data}
-
-> #### Example
->
-> ```bash
-> $ python -m spacy debug-data en train.json dev.json
-> ```
-
-The new `debug-data` command lets you analyze and validate your training and
-development data, get useful stats, and find problems like invalid entity
-annotations, cyclic dependencies, low data labels and more. If you're training a
-model with `spacy train` and the results seem surprising or confusing,
-`debug-data` may help you track down the problems and improve your training
-data.
-
-
-
-```
-=========================== Data format validation ===========================
-✔ Corpus is loadable
-
-=============================== Training stats ===============================
-Training pipeline: tagger, parser, ner
-Starting with blank model 'en'
-18127 training docs
-2939 evaluation docs
-⚠ 34 training examples also in evaluation data
-
-============================== Vocab & Vectors ==============================
-ℹ 2083156 total words in the data (56962 unique)
-⚠ 13020 misaligned tokens in the training data
-⚠ 2423 misaligned tokens in the dev data
-10 most common words: 'the' (98429), ',' (91756), '.' (87073), 'to' (50058),
-'of' (49559), 'and' (44416), 'a' (34010), 'in' (31424), 'that' (22792), 'is'
-(18952)
-ℹ No word vectors present in the model
-
-========================== Named Entity Recognition ==========================
-ℹ 18 new labels, 0 existing labels
-528978 missing values (tokens with '-' label)
-New: 'ORG' (23860), 'PERSON' (21395), 'GPE' (21193), 'DATE' (18080), 'CARDINAL'
-(10490), 'NORP' (9033), 'MONEY' (5164), 'PERCENT' (3761), 'ORDINAL' (2122),
-'LOC' (2113), 'TIME' (1616), 'WORK_OF_ART' (1229), 'QUANTITY' (1150), 'FAC'
-(1134), 'EVENT' (974), 'PRODUCT' (935), 'LAW' (444), 'LANGUAGE' (338)
-✔ Good amount of examples for all labels
-✔ Examples without occurences available for all labels
-✔ No entities consisting of or starting/ending with whitespace
-
-=========================== Part-of-speech Tagging ===========================
-ℹ 49 labels in data (57 labels in tag map)
-'NN' (266331), 'IN' (227365), 'DT' (185600), 'NNP' (164404), 'JJ' (119830),
-'NNS' (110957), '.' (101482), ',' (92476), 'RB' (90090), 'PRP' (90081), 'VB'
-(74538), 'VBD' (68199), 'CC' (62862), 'VBZ' (50712), 'VBP' (43420), 'VBN'
-(42193), 'CD' (40326), 'VBG' (34764), 'TO' (31085), 'MD' (25863), 'PRP$'
-(23335), 'HYPH' (13833), 'POS' (13427), 'UH' (13322), 'WP' (10423), 'WDT'
-(9850), 'RP' (8230), 'WRB' (8201), ':' (8168), '''' (7392), '``' (6984), 'NNPS'
-(5817), 'JJR' (5689), '$' (3710), 'EX' (3465), 'JJS' (3118), 'RBR' (2872),
-'-RRB-' (2825), '-LRB-' (2788), 'PDT' (2078), 'XX' (1316), 'RBS' (1142), 'FW'
-(794), 'NFP' (557), 'SYM' (440), 'WP$' (294), 'LS' (293), 'ADD' (191), 'AFX'
-(24)
-✔ All labels present in tag map for language 'en'
-
-============================= Dependency Parsing =============================
-ℹ Found 111703 sentences with an average length of 18.6 words.
-ℹ Found 2251 nonprojective train sentences
-ℹ Found 303 nonprojective dev sentences
-ℹ 47 labels in train data
-ℹ 211 labels in projectivized train data
-'punct' (236796), 'prep' (188853), 'pobj' (182533), 'det' (172674), 'nsubj'
-(169481), 'compound' (116142), 'ROOT' (111697), 'amod' (107945), 'dobj' (93540),
-'aux' (86802), 'advmod' (86197), 'cc' (62679), 'conj' (59575), 'poss' (36449),
-'ccomp' (36343), 'advcl' (29017), 'mark' (27990), 'nummod' (24582), 'relcl'
-(21359), 'xcomp' (21081), 'attr' (18347), 'npadvmod' (17740), 'acomp' (17204),
-'auxpass' (15639), 'appos' (15368), 'neg' (15266), 'nsubjpass' (13922), 'case'
-(13408), 'acl' (12574), 'pcomp' (10340), 'nmod' (9736), 'intj' (9285), 'prt'
-(8196), 'quantmod' (7403), 'dep' (4300), 'dative' (4091), 'agent' (3908), 'expl'
-(3456), 'parataxis' (3099), 'oprd' (2326), 'predet' (1946), 'csubj' (1494),
-'subtok' (1147), 'preconj' (692), 'meta' (469), 'csubjpass' (64), 'iobj' (1)
-⚠ Low number of examples for label 'iobj' (1)
-⚠ Low number of examples for 130 labels in the projectivized dependency
-trees used for training. You may want to projectivize labels such as punct
-before training in order to improve parser performance.
-⚠ Projectivized labels with low numbers of examples: appos||attr: 12
-advmod||dobj: 13 prep||ccomp: 12 nsubjpass||ccomp: 15 pcomp||prep: 14
-amod||dobj: 9 attr||xcomp: 14 nmod||nsubj: 17 prep||advcl: 2 prep||prep: 5
-nsubj||conj: 12 advcl||advmod: 18 ccomp||advmod: 11 ccomp||pcomp: 5 acl||pobj:
-10 npadvmod||acomp: 7 dobj||pcomp: 14 nsubjpass||pcomp: 1 nmod||pobj: 8
-amod||attr: 6 nmod||dobj: 12 aux||conj: 1 neg||conj: 1 dative||xcomp: 11
-pobj||dative: 3 xcomp||acomp: 19 advcl||pobj: 2 nsubj||advcl: 2 csubj||ccomp: 1
-advcl||acl: 1 relcl||nmod: 2 dobj||advcl: 10 advmod||advcl: 3 nmod||nsubjpass: 6
-amod||pobj: 5 cc||neg: 1 attr||ccomp: 16 advcl||xcomp: 3 nmod||attr: 4
-advcl||nsubjpass: 5 advcl||ccomp: 4 ccomp||conj: 1 punct||acl: 1 meta||acl: 1
-parataxis||acl: 1 prep||acl: 1 amod||nsubj: 7 ccomp||ccomp: 3 acomp||xcomp: 5
-dobj||acl: 5 prep||oprd: 6 advmod||acl: 2 dative||advcl: 1 pobj||agent: 5
-xcomp||amod: 1 dep||advcl: 1 prep||amod: 8 relcl||compound: 1 advcl||csubj: 3
-npadvmod||conj: 2 npadvmod||xcomp: 4 advmod||nsubj: 3 ccomp||amod: 7
-advcl||conj: 1 nmod||conj: 2 advmod||nsubjpass: 2 dep||xcomp: 2 appos||ccomp: 1
-advmod||dep: 1 advmod||advmod: 5 aux||xcomp: 8 dep||advmod: 1 dative||ccomp: 2
-prep||dep: 1 conj||conj: 1 dep||ccomp: 4 cc||ROOT: 1 prep||ROOT: 1 nsubj||pcomp:
-3 advmod||prep: 2 relcl||dative: 1 acl||conj: 1 advcl||attr: 4 prep||npadvmod: 1
-nsubjpass||xcomp: 1 neg||advmod: 1 xcomp||oprd: 1 advcl||advcl: 1 dobj||dep: 3
-nsubjpass||parataxis: 1 attr||pcomp: 1 ccomp||parataxis: 1 advmod||attr: 1
-nmod||oprd: 1 appos||nmod: 2 advmod||relcl: 1 appos||npadvmod: 1 appos||conj: 1
-prep||expl: 1 nsubjpass||conj: 1 punct||pobj: 1 cc||pobj: 1 conj||pobj: 1
-punct||conj: 1 ccomp||dep: 1 oprd||xcomp: 3 ccomp||xcomp: 1 ccomp||nsubj: 1
-nmod||dep: 1 xcomp||ccomp: 1 acomp||advcl: 1 intj||advmod: 1 advmod||acomp: 2
-relcl||oprd: 1 advmod||prt: 1 advmod||pobj: 1 appos||nummod: 1 relcl||npadvmod:
-3 mark||advcl: 1 aux||ccomp: 1 amod||nsubjpass: 1 npadvmod||advmod: 1 conj||dep:
-1 nummod||pobj: 1 amod||npadvmod: 1 intj||pobj: 1 nummod||npadvmod: 1
-xcomp||xcomp: 1 aux||dep: 1 advcl||relcl: 1
-⚠ The following labels were found only in the train data: xcomp||amod,
-advcl||relcl, prep||nsubjpass, acl||nsubj, nsubjpass||conj, xcomp||oprd,
-advmod||conj, advmod||advmod, iobj, advmod||nsubjpass, dobj||conj, ccomp||amod,
-meta||acl, xcomp||xcomp, prep||attr, prep||ccomp, advcl||acomp, acl||dobj,
-advcl||advcl, pobj||agent, prep||advcl, nsubjpass||xcomp, prep||dep,
-acomp||xcomp, aux||ccomp, ccomp||dep, conj||dep, relcl||compound,
-nsubjpass||ccomp, nmod||dobj, advmod||advcl, advmod||acl, dobj||advcl,
-dative||xcomp, prep||nsubj, ccomp||ccomp, nsubj||ccomp, xcomp||acomp,
-prep||acomp, dep||advmod, acl||pobj, appos||dobj, npadvmod||acomp, cc||ROOT,
-relcl||nsubj, nmod||pobj, acl||nsubjpass, ccomp||advmod, pcomp||prep,
-amod||dobj, advmod||attr, advcl||csubj, appos||attr, dobj||pcomp, prep||ROOT,
-relcl||pobj, advmod||pobj, amod||nsubj, ccomp||xcomp, prep||oprd,
-npadvmod||advmod, appos||nummod, advcl||pobj, neg||advmod, acl||attr,
-appos||nsubjpass, csubj||ccomp, amod||nsubjpass, intj||pobj, dep||advcl,
-cc||neg, xcomp||ccomp, dative||ccomp, nmod||oprd, pobj||dative, prep||dobj,
-dep||ccomp, relcl||attr, ccomp||nsubj, advcl||xcomp, nmod||dep, advcl||advmod,
-ccomp||conj, pobj||prep, advmod||acomp, advmod||relcl, attr||pcomp,
-ccomp||parataxis, oprd||xcomp, intj||advmod, nmod||nsubjpass, prep||npadvmod,
-parataxis||acl, prep||pobj, advcl||dobj, amod||pobj, prep||acl, conj||pobj,
-advmod||dep, punct||pobj, ccomp||acomp, acomp||advcl, nummod||npadvmod,
-dobj||dep, npadvmod||xcomp, advcl||conj, relcl||npadvmod, punct||acl,
-relcl||dobj, dobj||xcomp, nsubjpass||parataxis, dative||advcl, relcl||nmod,
-advcl||ccomp, appos||npadvmod, ccomp||pcomp, prep||amod, mark||advcl,
-prep||advmod, prep||xcomp, appos||nsubj, attr||ccomp, advmod||prt, dobj||ccomp,
-aux||conj, advcl||nsubj, conj||conj, advmod||ccomp, advcl||nsubjpass,
-attr||xcomp, nmod||conj, npadvmod||conj, relcl||dative, prep||expl,
-nsubjpass||pcomp, advmod||xcomp, advmod||dobj, appos||pobj, nsubj||conj,
-relcl||nsubjpass, advcl||attr, appos||ccomp, advmod||prep, prep||conj,
-nmod||attr, punct||conj, neg||conj, dep||xcomp, aux||xcomp, dobj||acl,
-nummod||pobj, amod||npadvmod, nsubj||pcomp, advcl||acl, appos||nmod,
-relcl||oprd, prep||prep, cc||pobj, nmod||nsubj, amod||attr, aux||dep,
-appos||conj, advmod||nsubj, nsubj||advcl, acl||conj
-To train a parser, your data should include at least 20 instances of each label.
-⚠ Multiple root labels (ROOT, nsubj, aux, npadvmod, prep) found in
-training data. spaCy's parser uses a single root label ROOT so this distinction
-will not be available.
-
-================================== Summary ==================================
-✔ 5 checks passed
-⚠ 8 warnings
-```
-
-
-
-
-
-**API:** [`spacy debug-data`](/api/cli#debug-data)
-
-
-
-## Backwards incompatibilities {#incompat}
-
-
-
-If you've been training **your own models**, you'll need to **retrain** them
-with the new version. Also don't forget to upgrade all models to the latest
-versions. Models for v2.0 or v2.1 aren't compatible with models for v2.2. To
-check if all of your models are up to date, you can run the
-[`spacy validate`](/api/cli#validate) command.
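For reference, that check is a single CLI call, run in the environment where spaCy and your models are installed:

```bash
python -m spacy validate
```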
-
-
-
-- The [Dutch model](/models/nl) has been trained on a new NER corpus (custom
- labelled UD instead of WikiNER), so its predictions may be very different
- compared to the previous version. The results should be significantly better
- and more generalizable, though.
-- The [`spacy download`](/api/cli#download) command does **not** set the
- `--no-deps` pip argument anymore by default, meaning that model package
- dependencies (if available) will now be also downloaded and installed. If
- spaCy (which is also a model dependency) is not installed in the current
- environment, e.g. if a user has built from source, `--no-deps` is added back
- automatically to prevent spaCy from being downloaded and installed again from
- pip.
-- The built-in
- [`biluo_tags_from_offsets`](/api/goldparse#biluo_tags_from_offsets) converter
- is now stricter and will raise an error if entities are overlapping (instead
- of silently skipping them). If your data contains invalid entity annotations,
- make sure to clean it and resolve conflicts. You can now also use the new
- `debug-data` command to find problems in your data.
-- Pipeline components can now overwrite IOB tags of tokens that are not yet part
- of an entity. Once a token has an `ent_iob` value set, it won't be reset to an
- "unset" state and will always have at least `O` assigned. `list(doc.ents)` now
- actually keeps the annotations on the token level consistent, instead of
- resetting `O` to an empty string.
-- The default punctuation in the [`Sentencizer`](/api/sentencizer) has been
- extended and now includes more characters common in various languages. This
- also means that the results it produces may change, depending on your text. If
- you want the previous behaviour with limited characters, set
- `punct_chars=[".", "!", "?"]` on initialization.
-- The [`PhraseMatcher`](/api/phrasematcher) algorithm was rewritten from scratch
- and it's now 10× faster. The rewrite also resolved a few subtle bugs
- with very large terminology lists. So if you were matching large lists, you
- may see slightly different results – however, the results should now be fully
- correct. See [this PR](https://github.com/explosion/spaCy/pull/4309) for more
- details.
-- Lemmatization tables (rules, exceptions, index and lookups) are now part of
- the `Vocab` and serialized with it. This means that serialized objects (`nlp`,
- pipeline components, vocab) will now include additional data, and models
- written to disk will include additional files.
-- The `Serbian` language class (introduced in v2.1.8) incorrectly used the
- language code `rs` instead of `sr`. This has now been fixed, so `Serbian` is
- now available via `spacy.lang.sr`.
-- The `"sources"` in the `meta.json` have changed from a list of strings to a
- list of dicts. This is mostly internals, but if your code used
- `nlp.meta["sources"]`, you might have to update it.
diff --git a/website/docs/usage/v2.md b/website/docs/usage/v2.md
index 0ac8bfe75..a412eeba4 100644
--- a/website/docs/usage/v2.md
+++ b/website/docs/usage/v2.md
@@ -156,7 +156,7 @@ spaCy or plug in your own machine learning models.
> for itn in range(100):
> for doc, gold in train_data:
> nlp.update([doc], [gold])
-> doc = nlp("This is a text.")
+> doc = nlp(u"This is a text.")
> print(doc.cats)
> ```
@@ -179,13 +179,13 @@ network to assign position-sensitive vectors to each word in the document.
> #### Example
>
> ```python
-> doc = nlp("I love coffee")
-> assert doc.vocab.strings["coffee"] == 3197928453018144401
-> assert doc.vocab.strings[3197928453018144401] == "coffee"
+> doc = nlp(u"I love coffee")
+> assert doc.vocab.strings[u"coffee"] == 3197928453018144401
+> assert doc.vocab.strings[3197928453018144401] == u"coffee"
>
-> beer_hash = doc.vocab.strings.add("beer")
-> assert doc.vocab.strings["beer"] == beer_hash
-> assert doc.vocab.strings[beer_hash] == "beer"
+> beer_hash = doc.vocab.strings.add(u"beer")
+> assert doc.vocab.strings[u"beer"] == beer_hash
+> assert doc.vocab.strings[beer_hash] == u"beer"
> ```
The [`StringStore`](/api/stringstore) now resolves all strings to hash values
@@ -275,7 +275,7 @@ language, you can import the class directly, e.g.
>
> ```python
> from spacy import displacy
-> doc = nlp("This is a sentence about Facebook.")
+> doc = nlp(u"This is a sentence about Facebook.")
> displacy.serve(doc, style="dep") # run the web server
> html = displacy.render(doc, style="ent") # generate HTML
> ```
@@ -322,7 +322,7 @@ lookup-based lemmatization – and **many new languages**!
> matcher.add('HEARTS', None, [{"ORTH": "❤️", "OP": '+'}])
>
> phrasematcher = PhraseMatcher(nlp.vocab)
-> phrasematcher.add("OBAMA", None, nlp("Barack Obama"))
+> phrasematcher.add("OBAMA", None, nlp(u"Barack Obama"))
> ```
Patterns can now be added to the matcher by calling
@@ -477,12 +477,12 @@ to the `disable` keyword argument on load, or by using
[`disable_pipes`](/api/language#disable_pipes) as a method or context manager:
```diff
-- nlp = spacy.load("en_core_web_sm", tagger=False, entity=False)
-- doc = nlp("I don't want parsed", parse=False)
+- nlp = spacy.load("en", tagger=False, entity=False)
+- doc = nlp(u"I don't want parsed", parse=False)
-+ nlp = spacy.load("en_core_web_sm", disable=["tagger", "ner"])
++ nlp = spacy.load("en", disable=["tagger", "ner"])
+ with nlp.disable_pipes("parser"):
-+ doc = nlp("I don't want parsed")
++ doc = nlp(u"I don't want parsed")
```
To add spaCy's built-in pipeline components to your pipeline, you can still
@@ -539,7 +539,7 @@ This means that your application can – and should – only pass around `Doc`
objects and refer to them as the single source of truth.
```diff
-- doc = nlp("This is a regular doc")
+- doc = nlp(u"This is a regular doc")
- doc_array = doc.to_array(["ORTH", "POS"])
- doc_with_meta = {"doc_array": doc_array, "meta": get_doc_meta(doc_array)}
@@ -556,11 +556,11 @@ utilities that interact with the pipeline, consider moving this logic into its
own extension module.
```diff
-- doc = nlp("Doc with a standard pipeline")
+- doc = nlp(u"Doc with a standard pipeline")
- meta = get_meta(doc)
+ nlp.add_pipe(meta_component)
-+ doc = nlp("Doc with a custom pipeline that assigns meta")
++ doc = nlp(u"Doc with a custom pipeline that assigns meta")
+ meta = doc._.meta
```
@@ -572,12 +572,12 @@ to call [`StringStore.add`](/api/stringstore#add) explicitly. You can also now
be sure that the string-to-hash mapping will always match across vocabularies.
```diff
-- nlp.vocab.strings["coffee"] # 3672
-- other_nlp.vocab.strings["coffee"] # 40259
+- nlp.vocab.strings[u"coffee"] # 3672
+- other_nlp.vocab.strings[u"coffee"] # 40259
-+ nlp.vocab.strings.add("coffee")
-+ nlp.vocab.strings["coffee"] # 3197928453018144401
-+ other_nlp.vocab.strings["coffee"] # 3197928453018144401
++ nlp.vocab.strings.add(u"coffee")
++ nlp.vocab.strings[u"coffee"] # 3197928453018144401
++ other_nlp.vocab.strings[u"coffee"] # 3197928453018144401
```
### Adding patterns and callbacks to the matcher {#migrating-matcher}
diff --git a/website/docs/usage/vectors-similarity.md b/website/docs/usage/vectors-similarity.md
index 53648f66e..f7c9d1cd9 100644
--- a/website/docs/usage/vectors-similarity.md
+++ b/website/docs/usage/vectors-similarity.md
@@ -74,8 +74,8 @@ path to [`spacy.load()`](/api/top-level#spacy.load).
```python
nlp_latin = spacy.load("/tmp/la_vectors_wiki_lg")
-doc1 = nlp_latin("Caecilius est in horto")
-doc2 = nlp_latin("servus est in atrio")
+doc1 = nlp_latin(u"Caecilius est in horto")
+doc2 = nlp_latin(u"servus est in atrio")
doc1.similarity(doc2)
```
@@ -168,9 +168,10 @@ vectors to the vocabulary, you can use the
### Adding vectors
from spacy.vocab import Vocab
-vector_data = {"dog": numpy.random.uniform(-1, 1, (300,)),
- "cat": numpy.random.uniform(-1, 1, (300,)),
- "orange": numpy.random.uniform(-1, 1, (300,))}
+vector_data = {u"dog": numpy.random.uniform(-1, 1, (300,)),
+ u"cat": numpy.random.uniform(-1, 1, (300,)),
+ u"orange": numpy.random.uniform(-1, 1, (300,))}
+
vocab = Vocab()
for word, vector in vector_data.items():
vocab.set_vector(word, vector)
@@ -240,7 +241,7 @@ import cupy.cuda
from spacy.vectors import Vectors
vector_table = numpy.zeros((3, 300), dtype="f")
-vectors = Vectors(["dog", "cat", "orange"], vector_table)
+vectors = Vectors([u"dog", u"cat", u"orange"], vector_table)
with cupy.cuda.Device(0):
vectors.data = cupy.asarray(vectors.data)
```
@@ -251,6 +252,6 @@ import torch
from spacy.vectors import Vectors
vector_table = numpy.zeros((3, 300), dtype="f")
-vectors = Vectors(["dog", "cat", "orange"], vector_table)
+vectors = Vectors([u"dog", u"cat", u"orange"], vector_table)
vectors.data = torch.Tensor(vectors.data).cuda(0)
```
diff --git a/website/docs/usage/visualizers.md b/website/docs/usage/visualizers.md
index dd0b0eb50..6172d2f48 100644
--- a/website/docs/usage/visualizers.md
+++ b/website/docs/usage/visualizers.md
@@ -48,7 +48,7 @@ import spacy
from spacy import displacy
nlp = spacy.load("en_core_web_sm")
-doc = nlp("This is a sentence.")
+doc = nlp(u"This is a sentence.")
displacy.serve(doc, style="dep")
```
@@ -101,7 +101,7 @@ import spacy
from spacy import displacy
nlp = spacy.load("en_core_web_sm")
-text = """In ancient Rome, some neighbors live in three adjacent houses. In the center is the house of Senex, who lives there with wife Domina, son Hero, and several slaves, including head slave Hysterium and the musical's main character Pseudolus. A slave belonging to Hero, Pseudolus wishes to buy, win, or steal his freedom. One of the neighboring houses is owned by Marcus Lycus, who is a buyer and seller of beautiful women; the other belongs to the ancient Erronius, who is abroad searching for his long-lost children (stolen in infancy by pirates). One day, Senex and Domina go on a trip and leave Pseudolus in charge of Hero. Hero confides in Pseudolus that he is in love with the lovely Philia, one of the courtesans in the House of Lycus (albeit still a virgin)."""
+text = u"""In ancient Rome, some neighbors live in three adjacent houses. In the center is the house of Senex, who lives there with wife Domina, son Hero, and several slaves, including head slave Hysterium and the musical's main character Pseudolus. A slave belonging to Hero, Pseudolus wishes to buy, win, or steal his freedom. One of the neighboring houses is owned by Marcus Lycus, who is a buyer and seller of beautiful women; the other belongs to the ancient Erronius, who is abroad searching for his long-lost children (stolen in infancy by pirates). One day, Senex and Domina go on a trip and leave Pseudolus in charge of Hero. Hero confides in Pseudolus that he is in love with the lovely Philia, one of the courtesans in the House of Lycus (albeit still a virgin)."""
doc = nlp(text)
sentence_spans = list(doc.sents)
displacy.serve(sentence_spans, style="dep")
@@ -117,7 +117,7 @@ text.
import spacy
from spacy import displacy
-text = "When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously."
+text = u"When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously."
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)
@@ -168,7 +168,7 @@ add a headline to each visualization, you can add a `title` to its `user_data`.
User data is never touched or modified by spaCy.
```python
-doc = nlp("This is a sentence about Google.")
+doc = nlp(u"This is a sentence about Google.")
doc.user_data["title"] = "This is a title"
displacy.serve(doc, style="ent")
```
@@ -193,7 +193,7 @@ import spacy
from spacy import displacy
# In[2]:
-doc = nlp("Rats are various medium-sized, long-tailed rodents.")
+doc = nlp(u"Rats are various medium-sized, long-tailed rodents.")
displacy.render(doc, style="dep")
# In[3]:
@@ -209,6 +209,7 @@ rendering if auto-detection fails.
+
![displaCy visualizer in a Jupyter notebook](../images/displacy_jupyter.jpg)
Internally, displaCy imports `display` and `HTML` from `IPython.core.display`
@@ -235,8 +236,8 @@ import spacy
from spacy import displacy
nlp = spacy.load("en_core_web_sm")
-doc1 = nlp("This is a sentence.")
-doc2 = nlp("This is another sentence.")
+doc1 = nlp(u"This is a sentence.")
+doc2 = nlp(u"This is another sentence.")
html = displacy.render([doc1, doc2], style="dep", page=True)
```
@@ -280,7 +281,7 @@ from spacy import displacy
from pathlib import Path
nlp = spacy.load("en_core_web_sm")
-sentences = ["This is an example.", "This is another one."]
+sentences = [u"This is an example.", u"This is another one."]
for sent in sentences:
doc = nlp(sent)
svg = displacy.render(doc, style="dep", jupyter=False)
diff --git a/website/meta/languages.json b/website/meta/languages.json
index 09a17b568..77b46c798 100644
--- a/website/meta/languages.json
+++ b/website/meta/languages.json
@@ -65,14 +65,19 @@
"example": "Αυτή είναι μια πρόταση.",
"has_examples": true
},
+ {
+ "code": "xx",
+ "name": "Multi-language",
+ "models": ["xx_ent_wiki_sm"],
+ "example": "This is a sentence about Facebook."
+ },
{ "code": "sv", "name": "Swedish", "has_examples": true },
{ "code": "fi", "name": "Finnish", "has_examples": true },
{
"code": "nb",
"name": "Norwegian Bokmål",
"example": "Dette er en setning.",
- "has_examples": true,
- "models": ["nb_core_news_sm"]
+ "has_examples": true
},
{ "code": "da", "name": "Danish", "example": "Dette er en sætning.", "has_examples": true },
{ "code": "hu", "name": "Hungarian", "example": "Ez egy mondat.", "has_examples": true },
@@ -122,7 +127,7 @@
{ "code": "bg", "name": "Bulgarian", "example": "Това е изречение", "has_examples": true },
{ "code": "cs", "name": "Czech" },
{ "code": "is", "name": "Icelandic" },
- { "code": "lt", "name": "Lithuanian", "has_examples": true, "models": ["lt_core_news_sm"] },
+ { "code": "lt", "name": "Lithuanian" },
{ "code": "lv", "name": "Latvian" },
{ "code": "sr", "name": "Serbian" },
{ "code": "sk", "name": "Slovak" },
@@ -177,15 +182,10 @@
"code": "vi",
"name": "Vietnamese",
"dependencies": [{ "name": "Pyvi", "url": "https://github.com/trungtv/pyvi" }]
- },
- {
- "code": "xx",
- "name": "Multi-language",
- "models": ["xx_ent_wiki_sm"],
- "example": "This is a sentence about Facebook."
}
],
"licenses": [
+ { "id": "CC BY 4.0", "url": "https://creativecommons.org/licenses/by/4.0/" },
{ "id": "CC BY 4.0", "url": "https://creativecommons.org/licenses/by/4.0/" },
{ "id": "CC BY-SA", "url": "https://creativecommons.org/licenses/by-sa/3.0/" },
{ "id": "CC BY-SA 3.0", "url": "https://creativecommons.org/licenses/by-sa/3.0/" },
diff --git a/website/meta/sidebars.json b/website/meta/sidebars.json
index 68d46605f..31083b091 100644
--- a/website/meta/sidebars.json
+++ b/website/meta/sidebars.json
@@ -9,7 +9,6 @@
{ "text": "Models & Languages", "url": "/usage/models" },
{ "text": "Facts & Figures", "url": "/usage/facts-figures" },
{ "text": "spaCy 101", "url": "/usage/spacy-101" },
- { "text": "New in v2.2", "url": "/usage/v2-2" },
{ "text": "New in v2.1", "url": "/usage/v2-1" },
{ "text": "New in v2.0", "url": "/usage/v2" }
]
@@ -76,7 +75,6 @@
{ "text": "Tagger", "url": "/api/tagger" },
{ "text": "DependencyParser", "url": "/api/dependencyparser" },
{ "text": "EntityRecognizer", "url": "/api/entityrecognizer" },
- { "text": "EntityLinker", "url": "/api/entitylinker" },
{ "text": "TextCategorizer", "url": "/api/textcategorizer" },
{ "text": "Matcher", "url": "/api/matcher" },
{ "text": "PhraseMatcher", "url": "/api/phrasematcher" },
@@ -91,12 +89,9 @@
{ "text": "Vocab", "url": "/api/vocab" },
{ "text": "StringStore", "url": "/api/stringstore" },
{ "text": "Vectors", "url": "/api/vectors" },
- { "text": "Lookups", "url": "/api/lookups" },
- { "text": "KnowledgeBase", "url": "/api/kb" },
{ "text": "GoldParse", "url": "/api/goldparse" },
{ "text": "GoldCorpus", "url": "/api/goldcorpus" },
- { "text": "Scorer", "url": "/api/scorer" },
- { "text": "DocBin", "url": "/api/docbin" }
+ { "text": "Scorer", "url": "/api/scorer" }
]
},
{
diff --git a/website/meta/site.json b/website/meta/site.json
index 0325e78ca..edb60ab0c 100644
--- a/website/meta/site.json
+++ b/website/meta/site.json
@@ -23,6 +23,7 @@
"apiKey": "371e26ed49d29a27bd36273dfdaf89af",
"indexName": "spacy"
},
+ "spacyVersion": "2.1",
"binderUrl": "ines/spacy-io-binder",
"binderBranch": "live",
"binderVersion": "2.1.8",
diff --git a/website/meta/universe.json b/website/meta/universe.json
index 66b5e4ba7..2997f9300 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -119,14 +119,14 @@
"emoji = Emoji(nlp)",
"nlp.add_pipe(emoji, first=True)",
"",
- "doc = nlp('This is a test 😻 👍🏿')",
+ "doc = nlp(u'This is a test 😻 👍🏿')",
"assert doc._.has_emoji == True",
"assert doc[2:5]._.has_emoji == True",
"assert doc[0]._.is_emoji == False",
"assert doc[4]._.is_emoji == True",
- "assert doc[5]._.emoji_desc == 'thumbs up dark skin tone'",
+ "assert doc[5]._.emoji_desc == u'thumbs up dark skin tone'",
"assert len(doc._.emoji) == 2",
- "assert doc._.emoji[1] == ('👍🏿', 5, 'thumbs up dark skin tone')"
+ "assert doc._.emoji[1] == (u'👍🏿', 5, u'thumbs up dark skin tone')"
],
"author": "Ines Montani",
"author_links": {
@@ -432,21 +432,17 @@
{
"id": "neuralcoref",
"slogan": "State-of-the-art coreference resolution based on neural nets and spaCy",
- "description": "This coreference resolution module is based on the super fast [spaCy](https://spacy.io/) parser and uses the neural net scoring model described in [Deep Reinforcement Learning for Mention-Ranking Coreference Models](http://cs.stanford.edu/people/kevclark/resources/clark-manning-emnlp2016-deep.pdf) by Kevin Clark and Christopher D. Manning, EMNLP 2016. Since ✨Neuralcoref v2.0, you can train the coreference resolution system on your own dataset — e.g., another language than English! — **provided you have an annotated dataset**. Note that to use neuralcoref with spaCy > 2.1.0, you'll have to install neuralcoref from source.",
+ "description": "This coreference resolution module is based on the super fast [spaCy](https://spacy.io/) parser and uses the neural net scoring model described in [Deep Reinforcement Learning for Mention-Ranking Coreference Models](http://cs.stanford.edu/people/kevclark/resources/clark-manning-emnlp2016-deep.pdf) by Kevin Clark and Christopher D. Manning, EMNLP 2016. With ✨Neuralcoref v2.0, you should now be able to train the coreference resolution system on your own dataset — e.g., another language than English! — **provided you have an annotated dataset**.",
"github": "huggingface/neuralcoref",
"thumb": "https://i.imgur.com/j6FO9O6.jpg",
"code_example": [
- "import spacy",
- "import neuralcoref",
+ "from neuralcoref import Coref",
"",
- "nlp = spacy.load('en')",
- "neuralcoref.add_to_pipe(nlp)",
- "doc1 = nlp('My sister has a dog. She loves him.')",
- "print(doc1._.coref_clusters)",
- "",
- "doc2 = nlp('Angela lives in Boston. She is quite happy in that city.')",
- "for ent in doc2.ents:",
- " print(ent._.coref_cluster)"
+ "coref = Coref()",
+ "clusters = coref.one_shot_coref(utterances=u\"She loves him.\", context=u\"My sister has a dog.\")",
+ "mentions = coref.get_mentions()",
+ "utterances = coref.get_utterances()",
+ "resolved_utterance_text = coref.get_resolved_utterances()"
],
"author": "Hugging Face",
"author_links": {
@@ -739,7 +735,7 @@
"slogan": "Use NLP to go beyond vanilla word2vec",
"description": "sense2vec ([Trask et. al](https://arxiv.org/abs/1511.06388), 2015) is a nice twist on [word2vec](https://en.wikipedia.org/wiki/Word2vec) that lets you learn more interesting, detailed and context-sensitive word vectors. For an interactive example of the technology, see our [sense2vec demo](https://explosion.ai/demos/sense2vec) that lets you explore semantic similarities across all Reddit comments of 2015.",
"github": "explosion/sense2vec",
- "pip": "sense2vec==1.0.0a1",
+ "pip": "sense2vec==1.0.0a0",
"thumb": "https://i.imgur.com/awfdhX6.jpg",
"image": "https://explosion.ai/assets/img/demos/sense2vec.png",
"url": "https://explosion.ai/demos/sense2vec",
@@ -751,8 +747,8 @@
"s2v = Sense2VecComponent('/path/to/reddit_vectors-1.1.0')",
"nlp.add_pipe(s2v)",
"",
- "doc = nlp(\"A sentence about natural language processing.\")",
- "assert doc[3].text == 'natural language processing'",
+ "doc = nlp(u\"A sentence about natural language processing.\")",
+ "assert doc[3].text == u'natural language processing'",
"freq = doc[3]._.s2v_freq",
"vector = doc[3]._.s2v_vec",
"most_similar = doc[3]._.s2v_most_similar(3)",
@@ -1301,7 +1297,7 @@
"",
"nlp = spacy.load('en')",
"nlp.add_pipe(BeneparComponent('benepar_en'))",
- "doc = nlp('The time for action is now. It's never too late to do something.')",
+ "doc = nlp(u'The time for action is now. It's never too late to do something.')",
"sent = list(doc.sents)[0]",
"print(sent._.parse_string)",
"# (S (NP (NP (DT The) (NN time)) (PP (IN for) (NP (NN action)))) (VP (VBZ is) (ADVP (RB now))) (. .))",
@@ -1434,7 +1430,7 @@
"thumb": "https://i.imgur.com/3y2uPUv.jpg",
"code_example": [
"import spacy",
- "from spacy_wordnet.wordnet_annotator import WordnetAnnotator ",
+ "from spacy_wordnet.wornet_annotator import WordnetAnnotator ",
"",
"# Load an spacy model (supported models are \"es\" and \"en\") ",
"nlp = spacy.load('en')",
diff --git a/website/src/components/table.js b/website/src/components/table.js
index 85b8e2144..3c345b046 100644
--- a/website/src/components/table.js
+++ b/website/src/components/table.js
@@ -42,19 +42,12 @@ function isFootRow(children) {
return false
}
-export const Table = ({ fixed, className, ...props }) => {
- const tableClassNames = classNames(classes.root, className, {
- [classes.fixed]: fixed,
- })
- return
-}
-
+export const Table = props =>
export const Th = props =>
-export const Tr = ({ evenodd = true, children, ...props }) => {
+export const Tr = ({ children, ...props }) => {
const foot = isFootRow(children)
- const trClasssNames = classNames({
- [classes.tr]: evenodd,
+ const trClasssNames = classNames(classes.tr, {
[classes.footer]: foot,
'table-footer': foot,
})
diff --git a/website/src/styles/accordion.module.sass b/website/src/styles/accordion.module.sass
index bdcbba9ac..707e29aef 100644
--- a/website/src/styles/accordion.module.sass
+++ b/website/src/styles/accordion.module.sass
@@ -13,7 +13,6 @@
width: 100%
padding: 1rem 1.5rem
border-radius: var(--border-radius)
- text-align: left
&:focus
background: var(--color-theme-opaque)
diff --git a/website/src/styles/code.module.sass b/website/src/styles/code.module.sass
index b268904f5..f72f1ffe6 100644
--- a/website/src/styles/code.module.sass
+++ b/website/src/styles/code.module.sass
@@ -56,7 +56,6 @@
.wrap
white-space: pre-wrap
- word-wrap: break-word
.title,
.juniper-button
diff --git a/website/src/styles/grid.module.sass b/website/src/styles/grid.module.sass
index 482ad03cf..63ea3d160 100644
--- a/website/src/styles/grid.module.sass
+++ b/website/src/styles/grid.module.sass
@@ -37,5 +37,5 @@ $flex-gap: 2rem
.narrow
grid-column-gap: $grid-gap-narrow
-.spacing:not(:empty)
+.spacing
margin-bottom: var(--spacing-md)
diff --git a/website/src/styles/table.module.sass b/website/src/styles/table.module.sass
index 68cc4bace..3e73ffb7f 100644
--- a/website/src/styles/table.module.sass
+++ b/website/src/styles/table.module.sass
@@ -6,9 +6,6 @@
margin-bottom: var(--spacing-md)
max-width: 100%
-.fixed
- table-layout: fixed
-
.tr
thead &:nth-child(odd)
background: transparent
diff --git a/website/src/templates/models.js b/website/src/templates/models.js
index 159744aa8..4713f4b34 100644
--- a/website/src/templates/models.js
+++ b/website/src/templates/models.js
@@ -14,15 +14,13 @@ import Icon from '../components/icon'
import Link from '../components/link'
import Grid from '../components/grid'
import Infobox from '../components/infobox'
-import Accordion from '../components/accordion'
-import { join, arrayToObj, abbrNum, markdownToReact, isString } from '../components/util'
+import { join, arrayToObj, abbrNum, markdownToReact } from '../components/util'
const MODEL_META = {
core: 'Vocabulary, syntax, entities, vectors',
core_sm: 'Vocabulary, syntax, entities',
dep: 'Vocabulary, syntax',
ent: 'Named entities',
- pytt: 'PyTorch Transformers',
vectors: 'Word vectors',
web: 'written text (blogs, news, comments)',
news: 'written text (news, media)',
@@ -45,12 +43,6 @@ const MODEL_META = {
compat: 'Latest compatible model version for your spaCy installation',
}
-const LABEL_SCHEME_META = {
- tagger: 'Part-of-speech tags via Token.tag_',
- parser: 'Dependency labels via Token.dep_',
- ner: 'Named entity labels',
-}
-
const MARKDOWN_COMPONENTS = {
code: InlineCode,
}
@@ -104,23 +96,11 @@ function formatModelMeta(data) {
author: data.author,
url: data.url,
license: data.license,
- labels: data.labels,
vectors: formatVectors(data.vectors),
accuracy: formatAccuracy(data.accuracy),
}
}
-function formatSources(data = []) {
- const sources = data.map(s => (isString(s) ? { name: s } : s))
- return sources.map(({ name, url, author }, i) => (
- <>
- {i > 0 && }
- {name && url ? {name} : name}
- {author && ` (${author})`}
- >
- ))
-}
-
const Help = ({ children }) => (
@@ -155,12 +135,11 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
const releaseUrl = `https://github.com/${repo}/releases/${releaseTag}`
const pipeline =
meta.pipeline && join(meta.pipeline.map(p => {p} ))
- const sources = formatSources(meta.sources)
+ const sources = meta.sources && join(meta.sources)
const author = !meta.url ? meta.author : {meta.author}
const licenseUrl = licenses[meta.license] ? licenses[meta.license].url : null
const license = licenseUrl ? {meta.license} : meta.license
const hasInteractiveCode = size === 'sm' && hasExamples && !isError
- const labels = meta.labels
const rows = [
{ label: 'Language', tag: langId, content: langName },
@@ -239,11 +218,11 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
)}
-
+
{accuracy &&
accuracy.map(({ label, items }, i) =>
!items ? null : (
-
+
{label}
@@ -281,46 +260,6 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
].join('\n')}
)}
- {labels && (
-
-
- The statistical components included in this model package assign the
- following labels. The labels are specific to the corpus that the model was
- trained on. To see the description of a label, you can use{' '}
-
- spacy.explain
-
- .
-
-
-
- {Object.keys(labels).map(pipe => {
- const labelNames = labels[pipe] || []
- const help = LABEL_SCHEME_META[pipe]
- return (
-
-
-
- {pipe} {help && {help} }
-
-
-
- {labelNames.map((label, i) => (
- <>
- {i > 0 && ', '}
-
- {label}
-
- >
- ))}
-
-
- )
- })}
-
-
-
- )}
)
}
diff --git a/website/src/widgets/landing.js b/website/src/widgets/landing.js
index 91fd756fa..e9dec87f4 100644
--- a/website/src/widgets/landing.js
+++ b/website/src/widgets/landing.js
@@ -150,24 +150,6 @@ const Landing = ({ data }) => {
-
- Prodigy is an annotation tool so efficient that data scientists
- can do the annotation themselves, enabling a new level of rapid iteration.
- Whether you're working on entity recognition, intent detection or image
- classification, Prodigy can help you train and evaluate your
- models faster. Stream in your own examples or real-world data from live APIs,
- update your model in real-time and chain models together to build more complex
- systems.
-
-
{
research, development and applications, with keynotes by Sebastian Ruder
(DeepMind) and Yoav Goldberg (Allen AI).
+
+
+ Prodigy is an annotation tool so efficient that data scientists
+ can do the annotation themselves, enabling a new level of rapid iteration.
+ Whether you're working on entity recognition, intent detection or image
+ classification, Prodigy can help you train and evaluate your
+ models faster. Stream in your own examples or real-world data from live APIs,
+ update your model in real-time and chain models together to build more complex
+ systems.
+
diff --git a/website/src/widgets/quickstart-models.js b/website/src/widgets/quickstart-models.js
index d116fae0a..83bb4527b 100644
--- a/website/src/widgets/quickstart-models.js
+++ b/website/src/widgets/quickstart-models.js
@@ -65,7 +65,7 @@ const QuickstartInstall = ({ id, title, description, defaultLang, children }) =>
nlp = {pkg}.load()
- doc = nlp("{exampleText}")
+ doc = nlp(u"{exampleText}")
print([
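Note: the added u prefix keeps the rendered quickstart snippet valid as unicode input on Python 2 as well as Python 3. A sketch of what the rendered snippet could look like after this change — the model package and example sentence below are placeholders standing in for the {pkg} and {exampleText} template values, and the snippet assumes that model is installed:

    import spacy

    # stand-in for the {pkg} value rendered by the widget
    nlp = spacy.load("en_core_web_sm")
    # the u prefix matches the change above
    doc = nlp(u"This is a sentence.")
    print([(w.text, w.pos_) for w in doc])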
From 7c701784e58f2ca140a1ef2b1dd6ee4efc1095a4 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Mon, 30 Sep 2019 12:01:09 +0200
Subject: [PATCH 012/496] Update models.js
---
website/src/templates/models.js | 42 ++++++++++++++++++++++++++++++++-
1 file changed, 41 insertions(+), 1 deletion(-)
diff --git a/website/src/templates/models.js b/website/src/templates/models.js
index 4713f4b34..1466235e4 100644
--- a/website/src/templates/models.js
+++ b/website/src/templates/models.js
@@ -218,7 +218,7 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
)}
-
+
{accuracy &&
accuracy.map(({ label, items }, i) =>
!items ? null : (
@@ -260,6 +260,46 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
].join('\n')}
)}
+ {labels && (
+
+
+ The statistical components included in this model package assign the
+ following labels. The labels are specific to the corpus that the model was
+ trained on. To see the description of a label, you can use{' '}
+
+ spacy.explain
+
+ .
+
+
+
+ {Object.keys(labels).map(pipe => {
+ const labelNames = labels[pipe] || []
+ const help = LABEL_SCHEME_META[pipe]
+ return (
+
+
+
+ {pipe} {help && {help} }
+
+
+
+ {labelNames.map((label, i) => (
+ <>
+ {i > 0 && ', '}
+
+ {label}
+
+ >
+ ))}
+
+
+ )
+ })}
+
+
+
+ )}
)
}
From 88fee1a768c120874b3222bb5e1b7adff841fc7c Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Mon, 30 Sep 2019 13:22:17 +0200
Subject: [PATCH 013/496] Update models.js
---
website/src/templates/models.js | 27 ++++++++++++++++++++++++---
1 file changed, 24 insertions(+), 3 deletions(-)
diff --git a/website/src/templates/models.js b/website/src/templates/models.js
index 1466235e4..159744aa8 100644
--- a/website/src/templates/models.js
+++ b/website/src/templates/models.js
@@ -14,13 +14,15 @@ import Icon from '../components/icon'
import Link from '../components/link'
import Grid from '../components/grid'
import Infobox from '../components/infobox'
-import { join, arrayToObj, abbrNum, markdownToReact } from '../components/util'
+import Accordion from '../components/accordion'
+import { join, arrayToObj, abbrNum, markdownToReact, isString } from '../components/util'
const MODEL_META = {
core: 'Vocabulary, syntax, entities, vectors',
core_sm: 'Vocabulary, syntax, entities',
dep: 'Vocabulary, syntax',
ent: 'Named entities',
+ pytt: 'PyTorch Transformers',
vectors: 'Word vectors',
web: 'written text (blogs, news, comments)',
news: 'written text (news, media)',
@@ -43,6 +45,12 @@ const MODEL_META = {
compat: 'Latest compatible model version for your spaCy installation',
}
+const LABEL_SCHEME_META = {
+ tagger: 'Part-of-speech tags via Token.tag_',
+ parser: 'Dependency labels via Token.dep_',
+ ner: 'Named entity labels',
+}
+
const MARKDOWN_COMPONENTS = {
code: InlineCode,
}
@@ -96,11 +104,23 @@ function formatModelMeta(data) {
author: data.author,
url: data.url,
license: data.license,
+ labels: data.labels,
vectors: formatVectors(data.vectors),
accuracy: formatAccuracy(data.accuracy),
}
}
+function formatSources(data = []) {
+ const sources = data.map(s => (isString(s) ? { name: s } : s))
+ return sources.map(({ name, url, author }, i) => (
+ <>
+ {i > 0 && }
+ {name && url ? {name} : name}
+ {author && ` (${author})`}
+ >
+ ))
+}
+
const Help = ({ children }) => (
@@ -135,11 +155,12 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
const releaseUrl = `https://github.com/${repo}/releases/${releaseTag}`
const pipeline =
meta.pipeline && join(meta.pipeline.map(p => {p} ))
- const sources = meta.sources && join(meta.sources)
+ const sources = formatSources(meta.sources)
const author = !meta.url ? meta.author : {meta.author}
const licenseUrl = licenses[meta.license] ? licenses[meta.license].url : null
const license = licenseUrl ? {meta.license} : meta.license
const hasInteractiveCode = size === 'sm' && hasExamples && !isError
+ const labels = meta.labels
const rows = [
{ label: 'Language', tag: langId, content: langName },
@@ -222,7 +243,7 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
{accuracy &&
accuracy.map(({ label, items }, i) =>
!items ? null : (
-
+
{label}
From 31cebf66a8005464944d5363ec5e0263e0cd25d6 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Mon, 30 Sep 2019 13:50:08 +0200
Subject: [PATCH 014/496] Update universe.json
---
website/meta/universe.json | 14 ++++++++++++++
1 file changed, 14 insertions(+)
diff --git a/website/meta/universe.json b/website/meta/universe.json
index 2997f9300..8d5227b38 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -1103,6 +1103,20 @@
"youtube": "WnGPv6HnBok",
"category": ["videos"]
},
+ {
+ "type": "education",
+ "id": "video-intro-to-nlp-episode-2",
+ "title": "Intro to NLP with spaCy",
+ "slogan": "Episode 2: Rule-based Matching",
+ "description": "In this new video series, data science instructor Vincent Warmerdam gets started with spaCy, an open-source library for Natural Language Processing in Python. His mission: building a system to automatically detect programming languages in large volumes of text. Follow his process from the first idea to a prototype all the way to data collection and training a statistical named entity recogntion model from scratch.",
+ "author": "Vincent Warmerdam",
+ "author_links": {
+ "twitter": "fishnets88",
+ "github": "koaning"
+ },
+ "youtube": "KL4-Mpgbahw",
+ "category": ["videos"]
+ },
{
"type": "education",
"id": "video-spacy-irl-entity-linking",
From 6316243941bdff2188ceea4517f5603ba314a691 Mon Sep 17 00:00:00 2001
From: Nipun Sadvilkar
Date: Wed, 30 Oct 2019 16:43:29 +0530
Subject: [PATCH 015/496] =?UTF-8?q?=E2=9C=A8=20=20project:=20pySBD=20-=20P?=
=?UTF-8?q?ython=20Sentence=20Boundary=20Disambiguation=20(#4455)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
* ✨ project: pySBD - Python Sentence Boundary Disambiguation
* 📝 Update links and description
* 🐛 Fix missing comma
* Update universe.json
pysbd as a spacy component through entrypoints
* 🚨 Fix universe.json
* 📝 Update code_example
---
website/meta/universe.json | 26 ++++++++++++++++++++++++++
1 file changed, 26 insertions(+)
diff --git a/website/meta/universe.json b/website/meta/universe.json
index bc8a27a1a..e64e462d8 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -1801,6 +1801,32 @@
"github": "microsoft"
}
},
+ {
+ "id": "python-sentence-boundary-disambiguation",
+ "title": "pySBD - python Sentence Boundary Disambiguation",
+ "slogan": "a rule-based sentence boundary detection that works out-of-the-box",
+ "github": "nipunsadvilkar/pySBD",
+ "description": "pySBD is 'real-world' sentence segmenter which extracts a reasonable sentences when the format and domain of the input text are unknown. It is a rules-based algorithm based on [The Golden Rules](https://s3.amazonaws.com/tm-town-nlp-resources/golden_rules.txt) - a set of tests to check accuracy of segmenter in regards to edge case scenarios developed by [TM-Town](https://www.tm-town.com/) dev team. pySBD is python port of ruby gem [Pragmatic Segmenter](https://github.com/diasks2/pragmatic_segmenter).",
+ "pip": "pysbd",
+ "category": ["scientific"],
+ "tags": ["sentence segmentation"],
+ "code_example": [
+ "from pysbd.util import PySBDFactory",
+ "",
+ "nlp = spacy.blank('en')",
+ "nlp.add_pipe(PySBDFactory(nlp))",
+ "",
+ "doc = nlp('My name is Jonas E. Smith. Please turn to p. 55.')",
+ "print(list(doc.sents))",
+ "# [My name is Jonas E. Smith., Please turn to p. 55.]"
+ ],
+ "author": "Nipun Sadvilkar",
+ "author_links": {
+ "twitter": "nipunsadvilkar",
+ "github": "nipunsadvilkar",
+ "website": "https://nipunsadvilkar.github.io"
+ }
+ },
{
"id": "cookiecutter-spacy-fastapi",
"title": "cookiecutter-spacy-fastapi",
From 4cbc172cc6ceeff608df5197b2402c2657465f73 Mon Sep 17 00:00:00 2001
From: Neel Kamath
Date: Wed, 30 Oct 2019 17:50:46 +0530
Subject: [PATCH 016/496] Add "spaCy Server" to spaCy Universe (#4553)
* Add "spaCy Server" to spaCy Universe
* Accept the spaCy Contributor Agreement
---
.github/contributors/neelkamath.md | 106 +++++++++++++++++++++++++++++
website/meta/universe.json | 20 ++++++
2 files changed, 126 insertions(+)
create mode 100644 .github/contributors/neelkamath.md
diff --git a/.github/contributors/neelkamath.md b/.github/contributors/neelkamath.md
new file mode 100644
index 000000000..76502e7c0
--- /dev/null
+++ b/.github/contributors/neelkamath.md
@@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+ * you hereby assign to us joint ownership, and to the extent that such
+ assignment is or becomes invalid, ineffective or unenforceable, you hereby
+ grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+ royalty-free, unrestricted license to exercise all rights under those
+ copyrights. This includes, at our option, the right to sublicense these same
+ rights to third parties through multiple levels of sublicensees or other
+ licensing arrangements;
+
+ * you agree that each of us can do all things in relation to your
+ contribution as if each of us were the sole owners, and if one of us makes
+ a derivative work of your contribution, the one who makes the derivative
+ work (or has it made) will be the sole owner of that derivative work;
+
+ * you agree that you will not assert any moral rights in your contribution
+ against us, our licensees or transferees;
+
+ * you agree that we may register a copyright in your contribution and
+ exercise all ownership rights associated with it; and
+
+ * you agree that neither of us has any duty to consult with, obtain the
+ consent of, pay or render an accounting to the other for any use or
+ distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+ * make, have made, use, sell, offer to sell, import, and otherwise transfer
+ your contribution in whole or in part, alone or in combination with or
+ included in any product, work or materials arising out of the project to
+ which your contribution was submitted, and
+
+ * at our option, to sublicense these same rights to third parties through
+ multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+ * Each contribution that you submit is and shall be an original work of
+ authorship and you can legally grant the rights set out in this SCA;
+
+ * to the best of your knowledge, each contribution will not violate any
+ third party's copyrights, trademarks, patents, or other intellectual
+ property rights; and
+
+ * each contribution shall be in compliance with U.S. export control laws and
+ other applicable export and import laws. You agree to notify us if you
+ become aware of any circumstance which would make any of the foregoing
+ representations inaccurate in any respect. We may publicly disclose your
+ participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statements below. Please do NOT
+mark both statements:
+
+ * [x] I am signing on behalf of myself as an individual and no other person
+ or entity, including my employer, has or will have rights with respect to my
+ contributions.
+
+ * [ ] I am signing on behalf of my employer or a legal entity and I have the
+ actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field | Entry |
+|------------------------------- | ---------------------- |
+| Name | Neel Kamath |
+| Company name (if applicable) | |
+| Title or role (if applicable) | |
+| Date | October 30, 2019 |
+| GitHub username | neelkamath |
+| Website (optional) | https://neelkamath.com |
diff --git a/website/meta/universe.json b/website/meta/universe.json
index e64e462d8..714d15b2f 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -1,5 +1,25 @@
{
"resources": [
+ {
+ "id": "spacy-server",
+ "title": "spaCy Server",
+ "slogan": "\uD83E\uDD9C Containerized HTTP API for spaCy NLP",
+ "description": "For developers who need programming language agnostic NLP, spaCy Server is a containerized HTTP API that provides industrial-strength natural language processing. Unlike other servers, our server is fast, idiomatic, and well documented.",
+ "github": "neelkamath/spacy-server",
+ "code_example": [
+ "docker run --rm -dp 8080:8080 neelkamath/spacy-server",
+ "curl http://localhost:8080/ner -H 'Content-Type: application/json' -d '{\"sections\": [\"My name is John Doe. I grew up in California.\"]}'"
+ ],
+ "code_language": "shell",
+ "url": "https://hub.docker.com/r/neelkamath/spacy-server",
+ "author": "Neel Kamath",
+ "author_links": {
+ "github": "neelkamath",
+ "website": "https://neelkamath.com"
+ },
+ "category": ["apis"],
+ "tags": ["docker"]
+ },
{
"id": "nlp-architect",
"title": "NLP Architect",
From d8c2365b04547f2380ec2c9896803a36fd6a59ae Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Wed, 30 Oct 2019 13:29:00 +0100
Subject: [PATCH 017/496] Update universe.json [ci skip]
---
website/meta/universe.json | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/website/meta/universe.json b/website/meta/universe.json
index 714d15b2f..749abc659 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -1824,9 +1824,9 @@
{
"id": "python-sentence-boundary-disambiguation",
"title": "pySBD - python Sentence Boundary Disambiguation",
- "slogan": "a rule-based sentence boundary detection that works out-of-the-box",
+ "slogan": "Rule-based sentence boundary detection that works out-of-the-box",
"github": "nipunsadvilkar/pySBD",
- "description": "pySBD is 'real-world' sentence segmenter which extracts a reasonable sentences when the format and domain of the input text are unknown. It is a rules-based algorithm based on [The Golden Rules](https://s3.amazonaws.com/tm-town-nlp-resources/golden_rules.txt) - a set of tests to check accuracy of segmenter in regards to edge case scenarios developed by [TM-Town](https://www.tm-town.com/) dev team. pySBD is python port of ruby gem [Pragmatic Segmenter](https://github.com/diasks2/pragmatic_segmenter).",
+ "description": "pySBD is 'real-world' sentence segmenter which extracts reasonable sentences when the format and domain of the input text are unknown. It is a rules-based algorithm based on [The Golden Rules](https://s3.amazonaws.com/tm-town-nlp-resources/golden_rules.txt) - a set of tests to check accuracy of segmenter in regards to edge case scenarios developed by [TM-Town](https://www.tm-town.com/) dev team. pySBD is python port of ruby gem [Pragmatic Segmenter](https://github.com/diasks2/pragmatic_segmenter).",
"pip": "pysbd",
"category": ["scientific"],
"tags": ["sentence segmentation"],
@@ -1845,7 +1845,7 @@
"twitter": "nipunsadvilkar",
"github": "nipunsadvilkar",
"website": "https://nipunsadvilkar.github.io"
- }
+ }
},
{
"id": "cookiecutter-spacy-fastapi",
From 86c3185f34e02ab81356e89b4848046c028151b6 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Wed, 30 Oct 2019 14:31:40 +0100
Subject: [PATCH 018/496] Update syntax iterators [ci skip]
---
website/docs/usage/adding-languages.md | 17 +++++++++++------
1 file changed, 11 insertions(+), 6 deletions(-)
diff --git a/website/docs/usage/adding-languages.md b/website/docs/usage/adding-languages.md
index 36e6e6809..4b12c6be1 100644
--- a/website/docs/usage/adding-languages.md
+++ b/website/docs/usage/adding-languages.md
@@ -402,12 +402,17 @@ iterators:
> assert chunks[1].text == "another phrase"
> ```
-| Language | Code | Source |
-| -------- | ---- | ----------------------------------------------------------------------------------------------------------------- |
-| English | `en` | [`lang/en/syntax_iterators.py`](https://github.com/explosion/spaCy/tree/master/spacy/lang/en/syntax_iterators.py) |
-| German | `de` | [`lang/de/syntax_iterators.py`](https://github.com/explosion/spaCy/tree/master/spacy/lang/de/syntax_iterators.py) |
-| French | `fr` | [`lang/fr/syntax_iterators.py`](https://github.com/explosion/spaCy/tree/master/spacy/lang/fr/syntax_iterators.py) |
-| Spanish | `es` | [`lang/es/syntax_iterators.py`](https://github.com/explosion/spaCy/tree/master/spacy/lang/es/syntax_iterators.py) |
+| Language | Code | Source |
+| ---------------- | ---- | ----------------------------------------------------------------------------------------------------------------- |
+| English | `en` | [`lang/en/syntax_iterators.py`](https://github.com/explosion/spaCy/tree/master/spacy/lang/en/syntax_iterators.py) |
+| German | `de` | [`lang/de/syntax_iterators.py`](https://github.com/explosion/spaCy/tree/master/spacy/lang/de/syntax_iterators.py) |
+| French | `fr` | [`lang/fr/syntax_iterators.py`](https://github.com/explosion/spaCy/tree/master/spacy/lang/fr/syntax_iterators.py) |
+| Spanish | `es` | [`lang/es/syntax_iterators.py`](https://github.com/explosion/spaCy/tree/master/spacy/lang/es/syntax_iterators.py) |
+| Greek | `el` | [`lang/el/syntax_iterators.py`](https://github.com/explosion/spaCy/tree/master/spacy/lang/el/syntax_iterators.py) |
+| Norwegian Bokmål | `nb` | [`lang/nb/syntax_iterators.py`](https://github.com/explosion/spaCy/tree/master/spacy/lang/nb/syntax_iterators.py) |
+| Swedish | `sv` | [`lang/sv/syntax_iterators.py`](https://github.com/explosion/spaCy/tree/master/spacy/lang/sv/syntax_iterators.py) |
+| Indonesian | `id` | [`lang/id/syntax_iterators.py`](https://github.com/explosion/spaCy/tree/master/spacy/lang/id/syntax_iterators.py) |
+| Persian | `fa` | [`lang/fa/syntax_iterators.py`](https://github.com/explosion/spaCy/tree/master/spacy/lang/fa/syntax_iterators.py) |
### Lemmatizer {#lemmatizer new="2"}
From e48a09df4eb580bd4ffb2270d9034dc69e081742 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem
Date: Mon, 11 Nov 2019 17:35:27 +0100
Subject: [PATCH 019/496] Example class for training data (#4543)
* OrigAnnot class instead of gold.orig_annot list of zipped tuples
* from_orig to replace from_annot_tuples
* rename to RawAnnot
* some unit tests for GoldParse creation and internal format
* removing orig_annot and switching to lists instead of tuple
* rewriting tuples to use RawAnnot (+ debug statements, WIP)
* fix pop() changing the data
* small fixes
* pop-append fixes
* return RawAnnot for existing GoldParse to have uniform interface
* clean up imports
* fix merge_sents
* add unit test for 4402 with new structure (not working yet)
* introduce DocAnnot
* typo fixes
* add unit test for merge_sents
* rename from_orig to from_raw
* fixing unit tests
* fix nn parser
* read_annots to produce text, doc_annot pairs
* _make_golds fix
* rename golds_to_gold_annots
* small fixes
* fix encoding
* have golds_to_gold_annots use DocAnnot
* missed a spot
* merge_sents as function in DocAnnot
* allow specifying only part of the token-level annotations
* refactor with Example class + underlying dicts
* pipeline components to work with Example objects (wip)
* input checking
* fix yielding
* fix calls to update
* small fixes
* fix scorer unit test with new format
* fix kwargs order
* fixes for ud and conllu scripts
* fix reading data for conllu script
* add in proper errors (not fixed numbering yet to avoid merge conflicts)
* fixing a few more small bugs
* fix EL script
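A minimal sketch of the calling convention this series moves to, pieced together from the hunks below (Example, add_doc_annotation, add_token_annotation and the single-batch nlp.update(batch, ...) call all appear in the diffs; the docs, golds, nlp and optimizer variables are assumed to come from an existing training script):

    from spacy.gold import Example

    # before: parallel lists passed separately
    # nlp.update(docs, golds, sgd=optimizer, drop=0.2, losses=losses)

    # after: one list of Example objects, each wrapping a Doc plus its gold annotations
    examples = []
    for doc, gold in zip(docs, golds):
        example = Example(doc=doc)
        example.add_doc_annotation(cats=gold.cats)
        example.add_token_annotation(**gold.orig.to_dict())
        example.goldparse = gold
        examples.append(example)

    losses = {}
    nlp.update(examples, sgd=optimizer, drop=0.2, losses=losses)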
---
bin/ud/ud_run_test.py | 13 +-
bin/ud/ud_train.py | 68 +--
bin/wiki_entity_linking/kb_creator.py | 2 +-
.../wikidata_train_entity_linker.py | 4 +-
examples/training/conllu.py | 78 +--
examples/training/ner_multitask_objective.py | 29 +-
examples/training/pretrain_textcat.py | 5 +-
examples/training/rehearsal.py | 3 +-
examples/training/train_entity_linker.py | 4 +-
examples/training/train_intent_parser.py | 3 +-
examples/training/train_ner.py | 4 +-
examples/training/train_new_entity_type.py | 3 +-
examples/training/train_parser.py | 3 +-
examples/training/train_tagger.py | 3 +-
examples/training/train_textcat.py | 3 +-
spacy/cli/converters/conllu2json.py | 62 +-
spacy/cli/debug_data.py | 40 +-
spacy/cli/evaluate.py | 8 +-
spacy/cli/pretrain.py | 3 +-
spacy/cli/train.py | 39 +-
spacy/errors.py | 6 +
spacy/gold.pxd | 32 +-
spacy/gold.pyx | 575 +++++++++++++-----
spacy/language.py | 119 ++--
spacy/pipeline/morphologizer.pyx | 12 +-
spacy/pipeline/pipes.pyx | 337 ++++++----
spacy/scorer.py | 28 +-
spacy/syntax/arc_eager.pyx | 12 +-
spacy/syntax/ner.pyx | 6 +-
spacy/syntax/nn_parser.pyx | 71 +--
spacy/syntax/nonproj.pyx | 53 +-
spacy/tests/parser/test_add_label.py | 4 +-
spacy/tests/parser/test_arc_eager_oracle.py | 11 +-
spacy/tests/parser/test_neural_parser.py | 4 +-
spacy/tests/parser/test_preset_sbd.py | 2 +-
spacy/tests/pipeline/test_textcat.py | 2 +-
spacy/tests/regression/test_issue1-1000.py | 2 +-
spacy/tests/regression/test_issue1501-2000.py | 8 +-
spacy/tests/regression/test_issue2501-3000.py | 2 +-
spacy/tests/regression/test_issue3611.py | 4 +-
spacy/tests/regression/test_issue4030.py | 4 +-
spacy/tests/regression/test_issue4348.py | 3 +-
spacy/tests/regression/test_issue4402.py | 5 +-
spacy/tests/test_gold.py | 163 ++++-
spacy/tests/test_language.py | 18 +-
spacy/tests/test_scorer.py | 16 +-
spacy/tokenizer.pyx | 2 +-
spacy/util.py | 16 +-
48 files changed, 1178 insertions(+), 716 deletions(-)
diff --git a/bin/ud/ud_run_test.py b/bin/ud/ud_run_test.py
index 7cb270d84..70c6be0d0 100644
--- a/bin/ud/ud_run_test.py
+++ b/bin/ud/ud_run_test.py
@@ -13,23 +13,12 @@ import srsly
import spacy
import spacy.util
from spacy.tokens import Token, Doc
-from spacy.gold import GoldParse
-from spacy.util import compounding, minibatch_by_words
-from spacy.syntax.nonproj import projectivize
from spacy.matcher import Matcher
-# from spacy.morphology import Fused_begin, Fused_inside
-from spacy import displacy
-from collections import defaultdict, Counter
-from timeit import default_timer as timer
Fused_begin = None
Fused_inside = None
-import itertools
-import random
-import numpy.random
-
from . import conll17_ud_eval
from spacy import lang
@@ -268,7 +257,7 @@ def load_nlp(experiments_dir, corpus):
return nlp
-def initialize_pipeline(nlp, docs, golds, config, device):
+def initialize_pipeline(nlp, examples, config, device):
nlp.add_pipe(nlp.create_pipe("parser"))
return nlp
diff --git a/bin/ud/ud_train.py b/bin/ud/ud_train.py
index 945bf57eb..b6a44b861 100644
--- a/bin/ud/ud_train.py
+++ b/bin/ud/ud_train.py
@@ -7,24 +7,20 @@ from __future__ import unicode_literals
import plac
from pathlib import Path
import re
-import sys
import json
import spacy
import spacy.util
from bin.ud import conll17_ud_eval
from spacy.tokens import Token, Doc
-from spacy.gold import GoldParse
+from spacy.gold import GoldParse, Example
from spacy.util import compounding, minibatch, minibatch_by_words
from spacy.syntax.nonproj import projectivize
from spacy.matcher import Matcher
from spacy import displacy
-from collections import defaultdict, Counter
-from timeit import default_timer as timer
+from collections import defaultdict
-import itertools
import random
-import numpy.random
from spacy import lang
from spacy.lang import zh
@@ -56,7 +52,7 @@ def read_data(
max_doc_length=None,
limit=None,
):
- """Read the CONLLU format into (Doc, GoldParse) tuples. If raw_text=True,
+ """Read the CONLLU format into Example objects. If raw_text=True,
include Doc objects created using nlp.make_doc and then aligned against
the gold-standard sequences. If oracle_segments=True, include Doc objects
created from the gold-standard segments. At least one must be True."""
@@ -101,15 +97,16 @@ def read_data(
docs.append(doc)
golds.append(gold)
if limit and len(docs) >= limit:
- return docs, golds
+ return golds_to_gold_data(docs, golds)
if raw_text and sent_annots:
doc, gold = _make_gold(nlp, None, sent_annots)
docs.append(doc)
golds.append(gold)
if limit and len(docs) >= limit:
- return docs, golds
- return docs, golds
+ return golds_to_gold_data(docs, golds)
+ return golds_to_gold_data(docs, golds)
+
def _parse_morph_string(morph_string):
if morph_string == '_':
@@ -123,6 +120,7 @@ def _parse_morph_string(morph_string):
output.append('%s_%s' % (key, value.lower()))
return set(output)
+
def read_conllu(file_):
docs = []
sent = []
@@ -183,16 +181,18 @@ def _make_gold(nlp, text, sent_annots, drop_deps=0.0):
#############################
-def golds_to_gold_tuples(docs, golds):
- """Get out the annoying 'tuples' format used by begin_training, given the
+def golds_to_gold_data(docs, golds):
+ """Get out the training data format used by begin_training, given the
GoldParse objects."""
- tuples = []
+ data = []
for doc, gold in zip(docs, golds):
- text = doc.text
- ids, words, tags, heads, labels, iob = zip(*gold.orig_annot)
- sents = [((ids, words, tags, heads, labels, iob), [])]
- tuples.append((text, sents))
- return tuples
+ example = Example(doc=doc)
+ example.add_doc_annotation(cats=gold.cats)
+ token_annotation_dict = gold.orig.to_dict()
+ example.add_token_annotation(**token_annotation_dict)
+ example.goldparse = gold
+ data.append(example)
+ return data
##############
@@ -348,7 +348,7 @@ def load_nlp(corpus, config, vectors=None):
return nlp
-def initialize_pipeline(nlp, docs, golds, config, device):
+def initialize_pipeline(nlp, examples, config, device):
nlp.add_pipe(nlp.create_pipe("tagger", config={"set_morphology": False}))
nlp.add_pipe(nlp.create_pipe("morphologizer"))
nlp.add_pipe(nlp.create_pipe("parser"))
@@ -356,14 +356,15 @@ def initialize_pipeline(nlp, docs, golds, config, device):
nlp.parser.add_multitask_objective("tag")
if config.multitask_sent:
nlp.parser.add_multitask_objective("sent_start")
- for gold in golds:
+ for ex in examples:
+ gold = ex.gold
for tag in gold.tags:
if tag is not None:
nlp.tagger.add_label(tag)
if torch is not None and device != -1:
torch.set_default_tensor_type("torch.cuda.FloatTensor")
optimizer = nlp.begin_training(
- lambda: golds_to_gold_tuples(docs, golds),
+ lambda: examples,
device=device,
subword_features=config.subword_features,
conv_depth=config.conv_depth,
@@ -504,20 +505,20 @@ def main(
print("Train and evaluate", corpus, "using lang", paths.lang)
nlp = load_nlp(paths.lang, config, vectors=vectors_dir)
- docs, golds = read_data(
+ examples = read_data(
nlp,
- paths.train.conllu.open(),
- paths.train.text.open(),
+ paths.train.conllu.open(encoding="utf8"),
+ paths.train.text.open(encoding="utf8"),
max_doc_length=config.max_doc_length,
limit=limit,
)
- optimizer = initialize_pipeline(nlp, docs, golds, config, gpu_device)
+ optimizer = initialize_pipeline(nlp, examples, config, gpu_device)
batch_sizes = compounding(config.min_batch_size, config.max_batch_size, 1.001)
beam_prob = compounding(0.2, 0.8, 1.001)
for i in range(config.nr_epoch):
- docs, golds = read_data(
+ examples = read_data(
nlp,
paths.train.conllu.open(encoding="utf8"),
paths.train.text.open(encoding="utf8"),
@@ -526,22 +527,19 @@ def main(
oracle_segments=use_oracle_segments,
raw_text=not use_oracle_segments,
)
- Xs = list(zip(docs, golds))
- random.shuffle(Xs)
+ random.shuffle(examples)
if config.batch_by_words:
- batches = minibatch_by_words(Xs, size=batch_sizes)
+ batches = minibatch_by_words(examples, size=batch_sizes)
else:
- batches = minibatch(Xs, size=batch_sizes)
+ batches = minibatch(examples, size=batch_sizes)
losses = {}
- n_train_words = sum(len(doc) for doc in docs)
+ n_train_words = sum(len(ex.doc) for ex in examples)
with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
for batch in batches:
- batch_docs, batch_gold = zip(*batch)
- pbar.update(sum(len(doc) for doc in batch_docs))
+ pbar.update(sum(len(ex.doc) for ex in batch))
nlp.parser.cfg["beam_update_prob"] = next(beam_prob)
nlp.update(
- batch_docs,
- batch_gold,
+ batch,
sgd=optimizer,
drop=config.dropout,
losses=losses,
diff --git a/bin/wiki_entity_linking/kb_creator.py b/bin/wiki_entity_linking/kb_creator.py
index 7778fc701..8691308e0 100644
--- a/bin/wiki_entity_linking/kb_creator.py
+++ b/bin/wiki_entity_linking/kb_creator.py
@@ -46,7 +46,7 @@ def _define_entities(nlp, kb, entity_def_path, entity_descr_path, min_entity_fre
" cf. https://spacy.io/usage/models#languages."
)
- logger.info("Filtering entities with fewer than {} mentions".format(min_entity_freq))
+ logger.info("Filtering entities with fewer than {} mentions or no description".format(min_entity_freq))
entity_frequencies = io.read_entity_to_count(entity_freq_path)
# filter the entities for in the KB by frequency, because there's just too much data (8M entities) otherwise
filtered_title_to_id, entity_list, description_list, frequency_list = get_filtered_entities(
diff --git a/bin/wiki_entity_linking/wikidata_train_entity_linker.py b/bin/wiki_entity_linking/wikidata_train_entity_linker.py
index 8635ae547..6b5f4c30d 100644
--- a/bin/wiki_entity_linking/wikidata_train_entity_linker.py
+++ b/bin/wiki_entity_linking/wikidata_train_entity_linker.py
@@ -131,10 +131,8 @@ def main(
with nlp.disable_pipes(*other_pipes):
for batch in batches:
try:
- docs, golds = zip(*batch)
nlp.update(
- docs=docs,
- golds=golds,
+ examples=batch,
sgd=optimizer,
drop=dropout,
losses=losses,
diff --git a/examples/training/conllu.py b/examples/training/conllu.py
index dfc790456..ba3cf450c 100644
--- a/examples/training/conllu.py
+++ b/examples/training/conllu.py
@@ -11,10 +11,9 @@ import json
import spacy
import spacy.util
from spacy.tokens import Token, Doc
-from spacy.gold import GoldParse
+from spacy.gold import GoldParse, Example
from spacy.syntax.nonproj import projectivize
-from collections import defaultdict, Counter
-from timeit import default_timer as timer
+from collections import defaultdict
from spacy.matcher import Matcher
import itertools
@@ -33,25 +32,25 @@ random.seed(0)
numpy.random.seed(0)
-def minibatch_by_words(items, size=5000):
- random.shuffle(items)
+def minibatch_by_words(examples, size=5000):
+ random.shuffle(examples)
if isinstance(size, int):
size_ = itertools.repeat(size)
else:
size_ = size
- items = iter(items)
+ examples = iter(examples)
while True:
batch_size = next(size_)
batch = []
while batch_size >= 0:
try:
- doc, gold = next(items)
+ example = next(examples)
except StopIteration:
if batch:
yield batch
return
- batch_size -= len(doc)
- batch.append((doc, gold))
+ batch_size -= len(example.doc)
+ batch.append(example)
if batch:
yield batch
else:
@@ -78,7 +77,7 @@ def read_data(
max_doc_length=None,
limit=None,
):
- """Read the CONLLU format into (Doc, GoldParse) tuples. If raw_text=True,
+ """Read the CONLLU format into Example objects. If raw_text=True,
include Doc objects created using nlp.make_doc and then aligned against
the gold-standard sequences. If oracle_segments=True, include Doc objects
created from the gold-standard segments. At least one must be True."""
@@ -119,15 +118,15 @@ def read_data(
docs.append(doc)
golds.append(gold)
if limit and len(docs) >= limit:
- return docs, golds
+ return golds_to_gold_data(docs, golds)
if raw_text and sent_annots:
doc, gold = _make_gold(nlp, None, sent_annots)
docs.append(doc)
golds.append(gold)
if limit and len(docs) >= limit:
- return docs, golds
- return docs, golds
+ return golds_to_gold_data(docs, golds)
+ return golds_to_gold_data(docs, golds)
def read_conllu(file_):
@@ -181,16 +180,18 @@ def _make_gold(nlp, text, sent_annots):
#############################
-def golds_to_gold_tuples(docs, golds):
- """Get out the annoying 'tuples' format used by begin_training, given the
+def golds_to_gold_data(docs, golds):
+ """Get out the training data format used by begin_training, given the
GoldParse objects."""
- tuples = []
+ data = []
for doc, gold in zip(docs, golds):
- text = doc.text
- ids, words, tags, heads, labels, iob = zip(*gold.orig_annot)
- sents = [((ids, words, tags, heads, labels, iob), [])]
- tuples.append((text, sents))
- return tuples
+ example = Example(doc=doc)
+ example.add_doc_annotation(cats=gold.cats)
+ token_annotation_dict = gold.orig.to_dict()
+ example.add_token_annotation(**token_annotation_dict)
+ example.goldparse = gold
+ data.append(example)
+ return data
##############
@@ -290,9 +291,9 @@ def get_token_conllu(token, i):
return "\n".join(lines)
-Token.set_extension("get_conllu_lines", method=get_token_conllu)
-Token.set_extension("begins_fused", default=False)
-Token.set_extension("inside_fused", default=False)
+Token.set_extension("get_conllu_lines", method=get_token_conllu, force=True)
+Token.set_extension("begins_fused", default=False, force=True)
+Token.set_extension("inside_fused", default=False, force=True)
##################
@@ -308,7 +309,7 @@ def load_nlp(corpus, config):
return nlp
-def initialize_pipeline(nlp, docs, golds, config):
+def initialize_pipeline(nlp, examples, config):
nlp.add_pipe(nlp.create_pipe("parser"))
if config.multitask_tag:
nlp.parser.add_multitask_objective("tag")
@@ -316,18 +317,19 @@ def initialize_pipeline(nlp, docs, golds, config):
nlp.parser.add_multitask_objective("sent_start")
nlp.parser.moves.add_action(2, "subtok")
nlp.add_pipe(nlp.create_pipe("tagger"))
- for gold in golds:
- for tag in gold.tags:
+ for ex in examples:
+ for tag in ex.gold.tags:
if tag is not None:
nlp.tagger.add_label(tag)
# Replace labels that didn't make the frequency cutoff
actions = set(nlp.parser.labels)
label_set = set([act.split("-")[1] for act in actions if "-" in act])
- for gold in golds:
+ for ex in examples:
+ gold = ex.gold
for i, label in enumerate(gold.labels):
if label is not None and label not in label_set:
gold.labels[i] = label.split("||")[0]
- return nlp.begin_training(lambda: golds_to_gold_tuples(docs, golds))
+ return nlp.begin_training(lambda: examples)
########################
@@ -401,28 +403,26 @@ def main(ud_dir, parses_dir, config, corpus, limit=0):
print("Train and evaluate", corpus, "using lang", paths.lang)
nlp = load_nlp(paths.lang, config)
- docs, golds = read_data(
+ examples = read_data(
nlp,
- paths.train.conllu.open(),
- paths.train.text.open(),
+ paths.train.conllu.open(encoding="utf8"),
+ paths.train.text.open(encoding="utf8"),
max_doc_length=config.max_doc_length,
limit=limit,
)
- optimizer = initialize_pipeline(nlp, docs, golds, config)
+ optimizer = initialize_pipeline(nlp, examples, config)
for i in range(config.nr_epoch):
- docs = [nlp.make_doc(doc.text) for doc in docs]
- batches = minibatch_by_words(list(zip(docs, golds)), size=config.batch_size)
+ docs = [nlp.make_doc(example.doc.text) for example in examples]
+ batches = minibatch_by_words(examples, size=config.batch_size)
losses = {}
n_train_words = sum(len(doc) for doc in docs)
with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
for batch in batches:
- batch_docs, batch_gold = zip(*batch)
- pbar.update(sum(len(doc) for doc in batch_docs))
+ pbar.update(sum(len(ex.doc) for ex in batch))
nlp.update(
- batch_docs,
- batch_gold,
+ examples=batch,
sgd=optimizer,
drop=config.dropout,
losses=losses,
diff --git a/examples/training/ner_multitask_objective.py b/examples/training/ner_multitask_objective.py
index 4bf7a008f..7561d4877 100644
--- a/examples/training/ner_multitask_objective.py
+++ b/examples/training/ner_multitask_objective.py
@@ -31,14 +31,13 @@ random.seed(0)
PWD = os.path.dirname(__file__)
-TRAIN_DATA = list(read_json_file(
- os.path.join(PWD, "ner_example_data", "ner-sent-per-line.json")))
+TRAIN_DATA = list(read_json_file(os.path.join(PWD, "training-data.json")))
-def get_position_label(i, words, tags, heads, labels, ents):
+def get_position_label(i, token_annotation):
"""Return labels indicating the position of the word in the document.
"""
- if len(words) < 20:
+ if len(token_annotation.words) < 20:
return "short-doc"
elif i == 0:
return "first-word"
@@ -46,7 +45,7 @@ def get_position_label(i, words, tags, heads, labels, ents):
return "early-word"
elif i < 20:
return "mid-word"
- elif i == len(words) - 1:
+ elif i == len(token_annotation.words) - 1:
return "last-word"
else:
return "late-word"
@@ -60,17 +59,17 @@ def main(n_iter=10):
print(nlp.pipeline)
print("Create data", len(TRAIN_DATA))
- optimizer = nlp.begin_training(get_gold_tuples=lambda: TRAIN_DATA)
+ optimizer = nlp.begin_training(get_examples=lambda: TRAIN_DATA)
for itn in range(n_iter):
random.shuffle(TRAIN_DATA)
losses = {}
- for text, annot_brackets in TRAIN_DATA:
- for annotations, _ in annot_brackets:
- doc = Doc(nlp.vocab, words=annotations[1])
- gold = GoldParse.from_annot_tuples(doc, annotations)
+ for example in TRAIN_DATA:
+ for token_annotation in example.token_annotations:
+ doc = Doc(nlp.vocab, words=token_annotation.words)
+ gold = GoldParse.from_annotation(doc, example.doc_annotation, token_annotation)
+
nlp.update(
- [doc], # batch of texts
- [gold], # batch of annotations
+ examples=[(doc, gold)], # 1 example
drop=0.2, # dropout - make it harder to memorise data
sgd=optimizer, # callable to update weights
losses=losses,
@@ -78,9 +77,9 @@ def main(n_iter=10):
print(losses.get("nn_labeller", 0.0), losses["ner"])
# test the trained model
- for text, _ in TRAIN_DATA:
- if text is not None:
- doc = nlp(text)
+ for example in TRAIN_DATA:
+ if example.text is not None:
+ doc = nlp(example.text)
print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])
diff --git a/examples/training/pretrain_textcat.py b/examples/training/pretrain_textcat.py
index e45f3345e..828479881 100644
--- a/examples/training/pretrain_textcat.py
+++ b/examples/training/pretrain_textcat.py
@@ -116,7 +116,7 @@ def train_tensorizer(nlp, texts, dropout, n_iter):
losses = {}
for i, batch in enumerate(minibatch(tqdm.tqdm(texts))):
docs = [nlp.make_doc(text) for text in batch]
- tensorizer.update(docs, None, losses=losses, sgd=optimizer, drop=dropout)
+ tensorizer.update((docs, None), losses=losses, sgd=optimizer, drop=dropout)
print(losses)
return optimizer
@@ -147,8 +147,7 @@ def train_textcat(nlp, n_texts, n_iter=10):
# batch up the examples using spaCy's minibatch
batches = minibatch(tqdm.tqdm(train_data), size=2)
for batch in batches:
- texts, annotations = zip(*batch)
- nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
+ nlp.update(batch, sgd=optimizer, drop=0.2, losses=losses)
with textcat.model.use_params(optimizer.averages):
# evaluate on the dev data split off in load_data()
scores = evaluate_textcat(nlp.tokenizer, textcat, dev_texts, dev_cats)
diff --git a/examples/training/rehearsal.py b/examples/training/rehearsal.py
index 123f5049d..b08ba9f9a 100644
--- a/examples/training/rehearsal.py
+++ b/examples/training/rehearsal.py
@@ -74,8 +74,7 @@ def main(model_name, unlabelled_loc):
# batch up the examples using spaCy's minibatch
raw_batches = minibatch(raw_docs, size=4)
for batch in minibatch(TRAIN_DATA, size=sizes):
- docs, golds = zip(*batch)
- nlp.update(docs, golds, sgd=optimizer, drop=dropout, losses=losses)
+ nlp.update(batch, sgd=optimizer, drop=dropout, losses=losses)
raw_batch = list(next(raw_batches))
nlp.rehearse(raw_batch, sgd=optimizer, losses=r_losses)
print("Losses", losses)
diff --git a/examples/training/train_entity_linker.py b/examples/training/train_entity_linker.py
index d2b2c2417..9d7357b8c 100644
--- a/examples/training/train_entity_linker.py
+++ b/examples/training/train_entity_linker.py
@@ -108,10 +108,8 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
# batch up the examples using spaCy's minibatch
batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
for batch in batches:
- texts, annotations = zip(*batch)
nlp.update(
- texts, # batch of texts
- annotations, # batch of annotations
+ batch,
drop=0.2, # dropout - make it harder to memorise data
losses=losses,
sgd=optimizer,
diff --git a/examples/training/train_intent_parser.py b/examples/training/train_intent_parser.py
index 08d06bd4c..beb39fa1d 100644
--- a/examples/training/train_intent_parser.py
+++ b/examples/training/train_intent_parser.py
@@ -133,8 +133,7 @@ def main(model=None, output_dir=None, n_iter=15):
# batch up the examples using spaCy's minibatch
batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
for batch in batches:
- texts, annotations = zip(*batch)
- nlp.update(texts, annotations, sgd=optimizer, losses=losses)
+ nlp.update(batch, sgd=optimizer, losses=losses)
print("Losses", losses)
# test the trained model
diff --git a/examples/training/train_ner.py b/examples/training/train_ner.py
index 49c25654c..e83d5cd0d 100644
--- a/examples/training/train_ner.py
+++ b/examples/training/train_ner.py
@@ -67,10 +67,8 @@ def main(model=None, output_dir=None, n_iter=100):
# batch up the examples using spaCy's minibatch
batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
for batch in batches:
- texts, annotations = zip(*batch)
nlp.update(
- texts, # batch of texts
- annotations, # batch of annotations
+ batch,
drop=0.5, # dropout - make it harder to memorise data
losses=losses,
)
diff --git a/examples/training/train_new_entity_type.py b/examples/training/train_new_entity_type.py
index e3a76f0c0..7fe443fc2 100644
--- a/examples/training/train_new_entity_type.py
+++ b/examples/training/train_new_entity_type.py
@@ -104,8 +104,7 @@ def main(model=None, new_model_name="animal", output_dir=None, n_iter=30):
batches = minibatch(TRAIN_DATA, size=sizes)
losses = {}
for batch in batches:
- texts, annotations = zip(*batch)
- nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses)
+ nlp.update(batch, sgd=optimizer, drop=0.35, losses=losses)
print("Losses", losses)
# test the trained model
diff --git a/examples/training/train_parser.py b/examples/training/train_parser.py
index aa60af00b..6db8af854 100644
--- a/examples/training/train_parser.py
+++ b/examples/training/train_parser.py
@@ -74,8 +74,7 @@ def main(model=None, output_dir=None, n_iter=15):
# batch up the examples using spaCy's minibatch
batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
for batch in batches:
- texts, annotations = zip(*batch)
- nlp.update(texts, annotations, sgd=optimizer, losses=losses)
+ nlp.update(batch, sgd=optimizer, losses=losses)
print("Losses", losses)
# test the trained model
diff --git a/examples/training/train_tagger.py b/examples/training/train_tagger.py
index 7136273b3..06e05f6cd 100644
--- a/examples/training/train_tagger.py
+++ b/examples/training/train_tagger.py
@@ -65,8 +65,7 @@ def main(lang="en", output_dir=None, n_iter=25):
# batch up the examples using spaCy's minibatch
batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
for batch in batches:
- texts, annotations = zip(*batch)
- nlp.update(texts, annotations, sgd=optimizer, losses=losses)
+ nlp.update(batch, sgd=optimizer, losses=losses)
print("Losses", losses)
# test the trained model
diff --git a/examples/training/train_textcat.py b/examples/training/train_textcat.py
index 4d4ebf396..128773c0a 100644
--- a/examples/training/train_textcat.py
+++ b/examples/training/train_textcat.py
@@ -82,8 +82,7 @@ def main(model=None, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=None
random.shuffle(train_data)
batches = minibatch(train_data, size=batch_sizes)
for batch in batches:
- texts, annotations = zip(*batch)
- nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
+ nlp.update(batch, sgd=optimizer, drop=0.2, losses=losses)
with textcat.model.use_params(optimizer.averages):
# evaluate on the dev data split off in load_data()
scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
diff --git a/spacy/cli/converters/conllu2json.py b/spacy/cli/converters/conllu2json.py
index 8f2900a9b..43216c943 100644
--- a/spacy/cli/converters/conllu2json.py
+++ b/spacy/cli/converters/conllu2json.py
@@ -3,6 +3,7 @@ from __future__ import unicode_literals
import re
+from spacy.gold import Example
from ...gold import iob_to_biluo
@@ -19,21 +20,21 @@ def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None, **_):
# by @katarkor
docs = []
sentences = []
- conll_tuples = read_conllx(input_data, use_morphology=use_morphology)
+ conll_data = read_conllx(input_data, use_morphology=use_morphology)
checked_for_ner = False
has_ner_tags = False
- for i, (raw_text, tokens) in enumerate(conll_tuples):
- sentence, brackets = tokens[0]
- if not checked_for_ner:
- has_ner_tags = is_ner(sentence[5][0])
- checked_for_ner = True
- sentences.append(generate_sentence(sentence, has_ner_tags))
- # Real-sized documents could be extracted using the comments on the
- # conluu document
- if len(sentences) % n_sents == 0:
- doc = create_doc(sentences, i)
- docs.append(doc)
- sentences = []
+ for i, example in enumerate(conll_data):
+ for token_annotation in example.token_annotations:
+ if not checked_for_ner:
+ has_ner_tags = is_ner(token_annotation.entities[0])
+ checked_for_ner = True
+ sentences.append(generate_sentence(token_annotation, has_ner_tags))
+ # Real-sized documents could be extracted using the comments on the
+ # conllu document
+ if len(sentences) % n_sents == 0:
+ doc = create_doc(sentences, i)
+ docs.append(doc)
+ sentences = []
return docs
@@ -52,15 +53,15 @@ def is_ner(tag):
def read_conllx(input_data, use_morphology=False, n=0):
+ """ Yield example data points, one for each sentence """
i = 0
for sent in input_data.strip().split("\n\n"):
lines = sent.strip().split("\n")
if lines:
while lines[0].startswith("#"):
lines.pop(0)
- tokens = []
+ ids, words, tags, heads, deps, ents = [], [], [], [], [], []
for line in lines:
-
parts = line.split("\t")
id_, word, lemma, pos, tag, morph, head, dep, _1, iob = parts
if "-" in id_ or "." in id_:
@@ -72,14 +73,22 @@ def read_conllx(input_data, use_morphology=False, n=0):
tag = pos if tag == "_" else tag
tag = tag + "__" + morph if use_morphology else tag
iob = iob if iob else "O"
- tokens.append((id_, word, tag, head, dep, iob))
+
+ ids.append(id_)
+ words.append(word)
+ tags.append(tag)
+ heads.append(head)
+ deps.append(dep)
+ ents.append(iob)
except: # noqa: E722
print(line)
raise
- tuples = [list(t) for t in zip(*tokens)]
- yield (None, [[tuples, []]])
+ example = Example(doc=None)
+ example.add_token_annotation(ids=ids, words=words, tags=tags,
+ heads=heads, deps=deps, entities=ents)
+ yield example
i += 1
- if n >= 1 and i >= n:
+ if 1 <= n <= i:
break
@@ -107,20 +116,19 @@ def simplify_tags(iob):
return new_iob
-def generate_sentence(sent, has_ner_tags):
- (id_, word, tag, head, dep, iob) = sent
+def generate_sentence(token_annotation, has_ner_tags):
sentence = {}
tokens = []
if has_ner_tags:
- iob = simplify_tags(iob)
+ iob = simplify_tags(token_annotation.entities)
biluo = iob_to_biluo(iob)
- for i, id in enumerate(id_):
+ for i, id in enumerate(token_annotation.ids):
token = {}
token["id"] = id
- token["orth"] = word[i]
- token["tag"] = tag[i]
- token["head"] = head[i] - id
- token["dep"] = dep[i]
+ token["orth"] = token_annotation.words[i]
+ token["tag"] = token_annotation.tags[i]
+ token["head"] = token_annotation.heads[i] - id
+ token["dep"] = token_annotation.deps[i]
if has_ner_tags:
token["ner"] = biluo[i]
tokens.append(token)
diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index 8161ddf45..76276ee56 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -80,16 +80,16 @@ def debug_data(
with msg.loading("Loading corpus..."):
corpus = GoldCorpus(train_path, dev_path)
try:
- train_docs = list(corpus.train_docs(nlp))
- train_docs_unpreprocessed = list(
- corpus.train_docs_without_preprocessing(nlp)
+ train_dataset = list(corpus.train_dataset(nlp))
+ train_dataset_unpreprocessed = list(
+ corpus.train_dataset_without_preprocessing(nlp)
)
except ValueError as e:
loading_train_error_message = "Training data cannot be loaded: {}".format(
str(e)
)
try:
- dev_docs = list(corpus.dev_docs(nlp))
+ dev_dataset = list(corpus.dev_dataset(nlp))
except ValueError as e:
loading_dev_error_message = "Development data cannot be loaded: {}".format(
str(e)
@@ -102,10 +102,10 @@ def debug_data(
sys.exit(1)
msg.good("Corpus is loadable")
- # Create all gold data here to avoid iterating over the train_docs constantly
- gold_train_data = _compile_gold(train_docs, pipeline)
- gold_train_unpreprocessed_data = _compile_gold(train_docs_unpreprocessed, pipeline)
- gold_dev_data = _compile_gold(dev_docs, pipeline)
+ # Create all gold data here to avoid iterating over the train_dataset constantly
+ gold_train_data = _compile_gold(train_dataset, pipeline)
+ gold_train_unpreprocessed_data = _compile_gold(train_dataset_unpreprocessed, pipeline)
+ gold_dev_data = _compile_gold(dev_dataset, pipeline)
train_texts = gold_train_data["texts"]
dev_texts = gold_dev_data["texts"]
@@ -118,19 +118,19 @@ def debug_data(
msg.text("Starting with base model '{}'".format(base_model))
else:
msg.text("Starting with blank model '{}'".format(lang))
- msg.text("{} training docs".format(len(train_docs)))
- msg.text("{} evaluation docs".format(len(dev_docs)))
+ msg.text("{} training docs".format(len(train_dataset)))
+ msg.text("{} evaluation docs".format(len(gold_dev_data)))
overlap = len(train_texts.intersection(dev_texts))
if overlap:
msg.warn("{} training examples also in evaluation data".format(overlap))
else:
msg.good("No overlap between training and evaluation data")
- if not base_model and len(train_docs) < BLANK_MODEL_THRESHOLD:
+ if not base_model and len(train_dataset) < BLANK_MODEL_THRESHOLD:
text = "Low number of examples to train from a blank model ({})".format(
- len(train_docs)
+ len(train_dataset)
)
- if len(train_docs) < BLANK_MODEL_MIN_THRESHOLD:
+ if len(train_dataset) < BLANK_MODEL_MIN_THRESHOLD:
msg.fail(text)
else:
msg.warn(text)
@@ -238,7 +238,7 @@ def debug_data(
has_low_data_warning = True
with msg.loading("Analyzing label distribution..."):
- neg_docs = _get_examples_without_label(train_docs, label)
+ neg_docs = _get_examples_without_label(train_dataset, label)
if neg_docs == 0:
msg.warn(
"No examples for texts WITHOUT new label '{}'".format(label)
@@ -358,7 +358,7 @@ def debug_data(
msg.info(
"Found {} sentence{} with an average length of {:.1f} words.".format(
gold_train_data["n_sents"],
- "s" if len(train_docs) > 1 else "",
+ "s" if len(train_dataset) > 1 else "",
gold_train_data["n_words"] / gold_train_data["n_sents"],
)
)
@@ -536,7 +536,7 @@ def _load_file(file_path, msg):
)
-def _compile_gold(train_docs, pipeline):
+def _compile_gold(examples, pipeline):
data = {
"ner": Counter(),
"cats": Counter(),
@@ -553,7 +553,9 @@ def _compile_gold(train_docs, pipeline):
"n_cats_multilabel": 0,
"texts": set(),
}
- for doc, gold in train_docs:
+ for example in examples:
+ gold = example.gold
+ doc = example.doc
valid_words = [x for x in gold.words if x is not None]
data["words"].update(valid_words)
data["n_words"] += len(valid_words)
@@ -598,8 +600,8 @@ def _format_labels(labels, counts=False):
def _get_examples_without_label(data, label):
count = 0
- for doc, gold in data:
- labels = [label.split("-")[1] for label in gold.ner if label not in ("O", "-")]
+ for ex in data:
+ labels = [label.split("-")[1] for label in ex.gold.ner if label not in ("O", "-")]
if label not in labels:
count += 1
return count
diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py
index 1114ada08..e5b2d0f02 100644
--- a/spacy/cli/evaluate.py
+++ b/spacy/cli/evaluate.py
@@ -45,11 +45,11 @@ def evaluate(
msg.fail("Visualization output directory not found", displacy_path, exits=1)
corpus = GoldCorpus(data_path, data_path)
nlp = util.load_model(model)
- dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc))
+ dev_dataset = list(corpus.dev_dataset(nlp, gold_preproc=gold_preproc))
begin = timer()
- scorer = nlp.evaluate(dev_docs, verbose=False)
+ scorer = nlp.evaluate(dev_dataset, verbose=False)
end = timer()
- nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
+ nwords = sum(len(ex.doc) for ex in dev_dataset)
results = {
"Time": "%.2f s" % (end - begin),
"Words": nwords,
@@ -66,7 +66,7 @@ def evaluate(
msg.table(results, title="Results")
if displacy_path:
- docs, golds = zip(*dev_docs)
+ docs = [ex.doc for ex in dev_dataset]
render_deps = "parser" in nlp.meta.get("pipeline", [])
render_ents = "ner" in nlp.meta.get("pipeline", [])
render_parses(
diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py
index f7236f7de..59269cb85 100644
--- a/spacy/cli/pretrain.py
+++ b/spacy/cli/pretrain.py
@@ -14,6 +14,7 @@ from thinc.neural.util import prefer_gpu
from wasabi import Printer
import srsly
+from spacy.gold import Example
from ..errors import Errors
from ..tokens import Doc
from ..attrs import ID, HEAD
@@ -221,7 +222,7 @@ def pretrain(
skip_counter = 0
for epoch in range(epoch_start, n_iter + epoch_start):
for batch_id, batch in enumerate(
- util.minibatch_by_words(((text, None) for text in texts), size=batch_size)
+ util.minibatch_by_words((Example(doc=text) for text in texts), size=batch_size)
):
docs, count = make_docs(
nlp,
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 13fcae37f..24255437c 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -236,7 +236,7 @@ def train(
optimizer = create_default_optimizer(Model.ops)
else:
# Start with a blank model, call begin_training
- optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
+ optimizer = nlp.begin_training(lambda: corpus.train_examples, device=use_gpu)
nlp._optimizer = None
@@ -261,7 +261,7 @@ def train(
"problem with two labels.".format(textcat_positive_label),
exits=1,
)
- train_docs = corpus.train_docs(
+ train_data = corpus.train_data(
nlp,
noise_level=noise_level,
gold_preproc=gold_preproc,
@@ -271,9 +271,9 @@ def train(
train_labels = set()
if textcat_multilabel:
multilabel_found = False
- for text, gold in train_docs:
- train_labels.update(gold.cats.keys())
- if list(gold.cats.values()).count(1.0) != 1:
+ for ex in train_data:
+ train_labels.update(ex.gold.cats.keys())
+ if list(ex.gold.cats.values()).count(1.0) != 1:
multilabel_found = True
if not multilabel_found and not base_model:
msg.warn(
@@ -283,9 +283,9 @@ def train(
"mutually-exclusive classes."
)
if not textcat_multilabel:
- for text, gold in train_docs:
- train_labels.update(gold.cats.keys())
- if list(gold.cats.values()).count(1.0) != 1 and not base_model:
+ for ex in train_data:
+ train_labels.update(ex.gold.cats.keys())
+ if list(ex.gold.cats.values()).count(1.0) != 1 and not base_model:
msg.warn(
"Some textcat training instances do not have exactly "
"one positive label. Modifying training options to "
@@ -341,7 +341,7 @@ def train(
iter_since_best = 0
best_score = 0.0
for i in range(n_iter):
- train_docs = corpus.train_docs(
+        train_data = corpus.train_dataset(
nlp,
noise_level=noise_level,
orth_variant_level=orth_variant_level,
@@ -357,13 +357,11 @@ def train(
words_seen = 0
with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
losses = {}
- for batch in util.minibatch_by_words(train_docs, size=batch_sizes):
+ for batch in util.minibatch_by_words(train_data, size=batch_sizes):
if not batch:
continue
- docs, golds = zip(*batch)
nlp.update(
- docs,
- golds,
+ batch,
sgd=optimizer,
drop=next(dropout_rates),
losses=losses,
@@ -373,6 +371,7 @@ def train(
# which use unlabelled data to reduce overfitting.
raw_batch = list(next(raw_batches))
nlp.rehearse(raw_batch, sgd=optimizer, losses=losses)
+ docs = [ex.doc for ex in batch]
if not int(os.environ.get("LOG_FRIENDLY", 0)):
pbar.update(sum(len(doc) for doc in docs))
words_seen += sum(len(doc) for doc in docs)
@@ -385,16 +384,16 @@ def train(
for name, component in nlp_loaded.pipeline:
if hasattr(component, "cfg"):
component.cfg["beam_width"] = beam_width
- dev_docs = list(
- corpus.dev_docs(
+ dev_dataset = list(
+ corpus.dev_dataset(
nlp_loaded,
gold_preproc=gold_preproc,
ignore_misaligned=True,
)
)
- nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
+ nwords = sum(len(ex.doc) for ex in dev_dataset)
start_time = timer()
- scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose)
+ scorer = nlp_loaded.evaluate(dev_dataset, verbose=verbose)
end_time = timer()
if use_gpu < 0:
gpu_wps = None
@@ -406,15 +405,15 @@ def train(
for name, component in nlp_loaded.pipeline:
if hasattr(component, "cfg"):
component.cfg["beam_width"] = beam_width
- dev_docs = list(
- corpus.dev_docs(
+ dev_dataset = list(
+ corpus.dev_dataset(
nlp_loaded,
gold_preproc=gold_preproc,
ignore_misaligned=True,
)
)
start_time = timer()
- scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose)
+ scorer = nlp_loaded.evaluate(dev_dataset, verbose=verbose)
end_time = timer()
cpu_wps = nwords / (end_time - start_time)
acc_loc = output_path / ("model%d" % i) / "accuracy.json"
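
Condensed, the refactored inner loop hands whole batches of Example objects to nlp.update
instead of unzipping docs and golds. A sketch with placeholder hyperparameters, assuming
nlp already has its pipeline components added and corpus is a GoldCorpus as in the
evaluation sketch above:

    from spacy import util

    optimizer = nlp.begin_training(lambda: corpus.train_examples)
    losses = {}
    train_data = corpus.train_dataset(nlp, gold_preproc=False)
    for batch in util.minibatch_by_words(train_data, size=1000):  # placeholder batch size
        if not batch:
            continue
        nlp.update(batch, sgd=optimizer, drop=0.2, losses=losses)
    print(losses)
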
diff --git a/spacy/errors.py b/spacy/errors.py
index c708f0a5b..d2898cf53 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -530,6 +530,12 @@ class Errors(object):
"{obj}.{attr}\nAttribute '{attr}' does not exist on {obj}.")
E186 = ("'{tok_a}' and '{tok_b}' are different texts.")
+ # TODO: fix numbering after merging develop into master
+ E998 = ("Can only create GoldParse's from Example's without a Doc, "
+ "if get_gold_parses() is called with a Vocab object.")
+ E999 = ("Encountered an unexpected format for the dictionary holding "
+ "gold annotations: {gold_dict}")
+
@add_codes
class TempErrors(object):
diff --git a/spacy/gold.pxd b/spacy/gold.pxd
index 20a25a939..6027d85b6 100644
--- a/spacy/gold.pxd
+++ b/spacy/gold.pxd
@@ -1,6 +1,6 @@
from cymem.cymem cimport Pool
-from .structs cimport TokenC
+from spacy.tokens import Doc
from .typedefs cimport attr_t
from .syntax.transition_system cimport Transition
@@ -19,6 +19,7 @@ cdef class GoldParse:
cdef Pool mem
cdef GoldParseC c
+ cdef readonly TokenAnnotation orig
cdef int length
cdef public int loss
@@ -29,13 +30,36 @@ cdef class GoldParse:
cdef public list labels
cdef public dict orths
cdef public list ner
- cdef public list ents
cdef public dict brackets
- cdef public object cats
+ cdef public dict cats
cdef public dict links
cdef readonly list cand_to_gold
cdef readonly list gold_to_cand
- cdef readonly list orig_annot
+
+
+cdef class TokenAnnotation:
+ cdef public list ids
+ cdef public list words
+ cdef public list tags
+ cdef public list heads
+ cdef public list deps
+ cdef public list entities
+ cdef public list morphology
+ cdef public list brackets
+
+
+cdef class DocAnnotation:
+ cdef public object cats
+ cdef public object links
+
+
+cdef class Example:
+ cdef public object doc
+ cdef public list token_annotations
+ cdef public DocAnnotation doc_annotation
+ cdef public object make_projective
+ cdef public object ignore_misaligned
+ cdef public object goldparse
diff --git a/spacy/gold.pyx b/spacy/gold.pyx
index 5aecc2584..ea3589ea5 100644
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -14,11 +14,8 @@ import srsly
from .syntax import nonproj
from .tokens import Doc, Span
from .errors import Errors, AlignmentError
-from .compat import path2str
+from .compat import path2str, basestring_
from . import util
-from .util import minibatch, itershuffle
-
-from libc.stdio cimport FILE, fopen, fclose, fread, fwrite, feof, fseek
USE_NEW_ALIGN = False
@@ -54,25 +51,6 @@ def tags_to_entities(tags):
return entities
-def merge_sents(sents):
- m_deps = [[], [], [], [], [], []]
- m_cats = {}
- m_brackets = []
- i = 0
- for (ids, words, tags, heads, labels, ner), (cats, brackets) in sents:
- m_deps[0].extend(id_ + i for id_ in ids)
- m_deps[1].extend(words)
- m_deps[2].extend(tags)
- m_deps[3].extend(head + i for head in heads)
- m_deps[4].extend(labels)
- m_deps[5].extend(ner)
- m_brackets.extend((b["first"] + i, b["last"] + i, b["label"])
- for b in brackets)
- m_cats.update(cats)
- i += len(ids)
- return [(m_deps, (m_cats, m_brackets))]
-
-
_ALIGNMENT_NORM_MAP = [("``", "'"), ("''", "'"), ('"', "'"), ("`", "'")]
@@ -211,14 +189,14 @@ class GoldCorpus(object):
def __init__(self, train, dev, gold_preproc=False, limit=None):
"""Create a GoldCorpus.
- train_path (unicode or Path): File or directory of training data.
- dev_path (unicode or Path): File or directory of development data.
+ train (unicode or Path): File or directory of training data.
+ dev (unicode or Path): File or directory of development data.
RETURNS (GoldCorpus): The newly created object.
"""
self.limit = limit
if isinstance(train, str) or isinstance(train, Path):
- train = self.read_tuples(self.walk_corpus(train))
- dev = self.read_tuples(self.walk_corpus(dev))
+ train = self.read_examples(self.walk_corpus(train))
+ dev = self.read_examples(self.walk_corpus(dev))
# Write temp directory with one doc per file, so we can shuffle and stream
self.tmp_dir = Path(tempfile.mkdtemp())
self.write_msgpack(self.tmp_dir / "train", train, limit=self.limit)
@@ -228,13 +206,15 @@ class GoldCorpus(object):
shutil.rmtree(path2str(self.tmp_dir))
@staticmethod
- def write_msgpack(directory, doc_tuples, limit=0):
+ def write_msgpack(directory, examples, limit=0):
if not directory.exists():
directory.mkdir()
n = 0
- for i, doc_tuple in enumerate(doc_tuples):
- srsly.write_msgpack(directory / "{}.msg".format(i), [doc_tuple])
- n += len(doc_tuple[1])
+ for i, example in enumerate(examples):
+ ex_dict = example.to_dict()
+ text = example.text
+ srsly.write_msgpack(directory / "{}.msg".format(i), (text, ex_dict))
+ n += len(example.token_annotations)
if limit and n >= limit:
break
@@ -259,128 +239,144 @@ class GoldCorpus(object):
return locs
@staticmethod
- def read_tuples(locs, limit=0):
+ def read_examples(locs, limit=0):
+ """ Yield training examples """
i = 0
for loc in locs:
loc = util.ensure_path(loc)
if loc.parts[-1].endswith("json"):
- gold_tuples = read_json_file(loc)
+ examples = read_json_file(loc)
elif loc.parts[-1].endswith("jsonl"):
gold_tuples = srsly.read_jsonl(loc)
first_gold_tuple = next(gold_tuples)
gold_tuples = itertools.chain([first_gold_tuple], gold_tuples)
# TODO: proper format checks with schemas
if isinstance(first_gold_tuple, dict):
- gold_tuples = read_json_object(gold_tuples)
+ if first_gold_tuple.get("paragraphs", None):
+ examples = read_json_object(gold_tuples)
+ elif first_gold_tuple.get("doc_annotation", None):
+ examples = []
+ for ex_dict in gold_tuples:
+ doc = ex_dict.get("doc", None)
+ if doc is None:
+ doc = ex_dict.get("text", None)
+ examples.append(Example.from_dict(ex_dict, doc=doc))
+
elif loc.parts[-1].endswith("msg"):
- gold_tuples = srsly.read_msgpack(loc)
+ text, ex_dict = srsly.read_msgpack(loc)
+ examples = [Example.from_dict(ex_dict, doc=text)]
else:
supported = ("json", "jsonl", "msg")
raise ValueError(Errors.E124.format(path=path2str(loc), formats=supported))
- for item in gold_tuples:
- yield item
- i += len(item[1])
+ for example in examples:
+ yield example
+ i += len(example.token_annotations)
if limit and i >= limit:
return
@property
- def dev_tuples(self):
+ def dev_examples(self):
locs = (self.tmp_dir / "dev").iterdir()
- yield from self.read_tuples(locs, limit=self.limit)
+ yield from self.read_examples(locs, limit=self.limit)
@property
- def train_tuples(self):
+ def train_examples(self):
locs = (self.tmp_dir / "train").iterdir()
- yield from self.read_tuples(locs, limit=self.limit)
+ yield from self.read_examples(locs, limit=self.limit)
def count_train(self):
+        # TODO: should this count words or sentences?
n = 0
i = 0
- for raw_text, paragraph_tuples in self.train_tuples:
- for sent_tuples, brackets in paragraph_tuples:
- n += len(sent_tuples[1])
+ for example in self.train_examples:
+ for token_annotation in example.token_annotations:
+ n += len(token_annotation.words)
if self.limit and i >= self.limit:
break
i += 1
return n
- def train_docs(self, nlp, gold_preproc=False, max_length=None,
+ def train_dataset(self, nlp, gold_preproc=False, max_length=None,
noise_level=0.0, orth_variant_level=0.0,
ignore_misaligned=False):
locs = list((self.tmp_dir / 'train').iterdir())
random.shuffle(locs)
- train_tuples = self.read_tuples(locs, limit=self.limit)
- gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc,
+ train_examples = self.read_examples(locs, limit=self.limit)
+ gold_examples = self.iter_gold_docs(nlp, train_examples, gold_preproc,
max_length=max_length,
noise_level=noise_level,
orth_variant_level=orth_variant_level,
make_projective=True,
ignore_misaligned=ignore_misaligned)
- yield from gold_docs
+ yield from gold_examples
- def train_docs_without_preprocessing(self, nlp, gold_preproc=False):
- gold_docs = self.iter_gold_docs(nlp, self.train_tuples, gold_preproc=gold_preproc)
- yield from gold_docs
+ def train_dataset_without_preprocessing(self, nlp, gold_preproc=False):
+ examples = self.iter_gold_docs(nlp, self.train_examples, gold_preproc=gold_preproc)
+ yield from examples
- def dev_docs(self, nlp, gold_preproc=False, ignore_misaligned=False):
- gold_docs = self.iter_gold_docs(nlp, self.dev_tuples, gold_preproc=gold_preproc,
+ def dev_dataset(self, nlp, gold_preproc=False, ignore_misaligned=False):
+ examples = self.iter_gold_docs(nlp, self.dev_examples, gold_preproc=gold_preproc,
ignore_misaligned=ignore_misaligned)
- yield from gold_docs
+ yield from examples
@classmethod
- def iter_gold_docs(cls, nlp, tuples, gold_preproc, max_length=None,
+ def iter_gold_docs(cls, nlp, examples, gold_preproc, max_length=None,
noise_level=0.0, orth_variant_level=0.0, make_projective=False,
ignore_misaligned=False):
- for raw_text, paragraph_tuples in tuples:
+ """ Setting gold_preproc will result in creating a doc per 'sentence' """
+ for example in examples:
if gold_preproc:
- raw_text = None
+ example.doc = None
else:
- paragraph_tuples = merge_sents(paragraph_tuples)
- docs, paragraph_tuples = cls._make_docs(nlp, raw_text,
- paragraph_tuples, gold_preproc, noise_level=noise_level,
- orth_variant_level=orth_variant_level)
- golds = cls._make_golds(docs, paragraph_tuples, make_projective,
- ignore_misaligned=ignore_misaligned)
- for doc, gold in zip(docs, golds):
- if gold is not None:
- if (not max_length) or len(doc) < max_length:
- yield doc, gold
+ example = example.merge_sents()
+ example.make_projective = make_projective
+ example.ignore_misaligned = ignore_misaligned
+ examples = cls._make_docs(nlp, example,
+ gold_preproc, noise_level=noise_level,
+ orth_variant_level=orth_variant_level)
+ examples = cls._make_golds(examples, vocab=nlp.vocab)
+ for ex in examples:
+ if ex.gold is not None:
+ if (not max_length) or len(ex.doc) < max_length:
+ yield ex
@classmethod
- def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc, noise_level=0.0, orth_variant_level=0.0):
- if raw_text is not None:
- raw_text, paragraph_tuples = make_orth_variants(nlp, raw_text, paragraph_tuples, orth_variant_level=orth_variant_level)
- raw_text = add_noise(raw_text, noise_level)
- return [nlp.make_doc(raw_text)], paragraph_tuples
+ def _make_docs(cls, nlp, example, gold_preproc, noise_level=0.0, orth_variant_level=0.0):
+        # TODO: the gold_preproc argument is currently not used here
+ if example.text is not None:
+ var_example = make_orth_variants(nlp, example, orth_variant_level=orth_variant_level)
+ var_text = add_noise(var_example.text, noise_level)
+ var_doc = nlp.make_doc(var_text)
+ var_example.doc = var_doc
+ return [var_example]
else:
- docs = []
- raw_text, paragraph_tuples = make_orth_variants(nlp, None, paragraph_tuples, orth_variant_level=orth_variant_level)
- return [Doc(nlp.vocab, words=add_noise(sent_tuples[1], noise_level))
- for (sent_tuples, brackets) in paragraph_tuples], paragraph_tuples
-
+ var_example = make_orth_variants(nlp, example, orth_variant_level=orth_variant_level)
+ doc_examples = []
+ for token_annotation in var_example.token_annotations:
+ t_doc = Doc(nlp.vocab, words=add_noise(token_annotation.words, noise_level))
+ doc_example = Example(doc_annotation=example.doc_annotation,
+ token_annotations=[token_annotation],
+ doc=t_doc)
+ doc_examples.append(doc_example)
+ return doc_examples
@classmethod
- def _make_golds(cls, docs, paragraph_tuples, make_projective, ignore_misaligned=False):
- if len(docs) != len(paragraph_tuples):
- n_annots = len(paragraph_tuples)
- raise ValueError(Errors.E070.format(n_docs=len(docs), n_annots=n_annots))
- golds = []
- for doc, (sent_tuples, (cats, brackets)) in zip(docs, paragraph_tuples):
- try:
- gold = GoldParse.from_annot_tuples(doc, sent_tuples, cats=cats,
- make_projective=make_projective)
- except AlignmentError:
- if ignore_misaligned:
- gold = None
- else:
- raise
- golds.append(gold)
- return golds
+ def _make_golds(cls, examples, vocab=None):
+ gold_examples = []
+ for example in examples:
+ gold_parses = example.get_gold_parses(vocab=vocab)
+ for (doc, gold) in gold_parses:
+ ex = Example(doc=doc)
+ ex.goldparse = gold
+ gold_examples.append(ex)
+ return gold_examples
-
-def make_orth_variants(nlp, raw, paragraph_tuples, orth_variant_level=0.0):
+def make_orth_variants(nlp, example, orth_variant_level=0.0):
if random.random() >= orth_variant_level:
- return raw, paragraph_tuples
+ return example
+ if not example.token_annotations:
+ return example
+ raw = example.text
if random.random() >= 0.5:
lower = True
if raw is not None:
@@ -388,38 +384,47 @@ def make_orth_variants(nlp, raw, paragraph_tuples, orth_variant_level=0.0):
ndsv = nlp.Defaults.single_orth_variants
ndpv = nlp.Defaults.paired_orth_variants
# modify words in paragraph_tuples
- variant_paragraph_tuples = []
- for sent_tuples, brackets in paragraph_tuples:
- ids, words, tags, heads, labels, ner = sent_tuples
- if lower:
- words = [w.lower() for w in words]
- # single variants
- punct_choices = [random.choice(x["variants"]) for x in ndsv]
- for word_idx in range(len(words)):
- for punct_idx in range(len(ndsv)):
- if tags[word_idx] in ndsv[punct_idx]["tags"] \
- and words[word_idx] in ndsv[punct_idx]["variants"]:
- words[word_idx] = punct_choices[punct_idx]
- # paired variants
- punct_choices = [random.choice(x["variants"]) for x in ndpv]
- for word_idx in range(len(words)):
- for punct_idx in range(len(ndpv)):
- if tags[word_idx] in ndpv[punct_idx]["tags"] \
- and words[word_idx] in itertools.chain.from_iterable(ndpv[punct_idx]["variants"]):
- # backup option: random left vs. right from pair
- pair_idx = random.choice([0, 1])
- # best option: rely on paired POS tags like `` / ''
- if len(ndpv[punct_idx]["tags"]) == 2:
- pair_idx = ndpv[punct_idx]["tags"].index(tags[word_idx])
- # next best option: rely on position in variants
- # (may not be unambiguous, so order of variants matters)
- else:
- for pair in ndpv[punct_idx]["variants"]:
- if words[word_idx] in pair:
- pair_idx = pair.index(words[word_idx])
- words[word_idx] = punct_choices[punct_idx][pair_idx]
+ variant_example = Example(doc=raw)
+ for token_annotation in example.token_annotations:
+ words = token_annotation.words
+ tags = token_annotation.tags
+ if not words or not tags:
+ # add the unmodified annotation
+ token_dict = token_annotation.to_dict()
+ variant_example.add_token_annotation(**token_dict)
+ else:
+ if lower:
+ words = [w.lower() for w in words]
+ # single variants
+ punct_choices = [random.choice(x["variants"]) for x in ndsv]
+ for word_idx in range(len(words)):
+ for punct_idx in range(len(ndsv)):
+ if tags[word_idx] in ndsv[punct_idx]["tags"] \
+ and words[word_idx] in ndsv[punct_idx]["variants"]:
+ words[word_idx] = punct_choices[punct_idx]
+ # paired variants
+ punct_choices = [random.choice(x["variants"]) for x in ndpv]
+ for word_idx in range(len(words)):
+ for punct_idx in range(len(ndpv)):
+ if tags[word_idx] in ndpv[punct_idx]["tags"] \
+ and words[word_idx] in itertools.chain.from_iterable(ndpv[punct_idx]["variants"]):
+ # backup option: random left vs. right from pair
+ pair_idx = random.choice([0, 1])
+ # best option: rely on paired POS tags like `` / ''
+ if len(ndpv[punct_idx]["tags"]) == 2:
+ pair_idx = ndpv[punct_idx]["tags"].index(tags[word_idx])
+ # next best option: rely on position in variants
+ # (may not be unambiguous, so order of variants matters)
+ else:
+ for pair in ndpv[punct_idx]["variants"]:
+ if words[word_idx] in pair:
+ pair_idx = pair.index(words[word_idx])
+ words[word_idx] = punct_choices[punct_idx][pair_idx]
- variant_paragraph_tuples.append(((ids, words, tags, heads, labels, ner), brackets))
+ token_dict = token_annotation.to_dict()
+ token_dict["words"] = words
+ token_dict["tags"] = tags
+ variant_example.add_token_annotation(**token_dict)
# modify raw to match variant_paragraph_tuples
if raw is not None:
variants = []
@@ -437,9 +442,8 @@ def make_orth_variants(nlp, raw, paragraph_tuples, orth_variant_level=0.0):
while raw_idx < len(raw) and re.match("\s", raw[raw_idx]):
variant_raw += raw[raw_idx]
raw_idx += 1
- for sent_tuples, brackets in variant_paragraph_tuples:
- ids, words, tags, heads, labels, ner = sent_tuples
- for word in words:
+ for token_annotation in variant_example.token_annotations:
+ for word in token_annotation.words:
match_found = False
# add identical word
if word not in variants and raw[raw_idx:].startswith(word):
@@ -457,13 +461,14 @@ def make_orth_variants(nlp, raw, paragraph_tuples, orth_variant_level=0.0):
# something went wrong, abort
# (add a warning message?)
if not match_found:
- return raw, paragraph_tuples
+ return example
# add following whitespace
while raw_idx < len(raw) and re.match("\s", raw[raw_idx]):
variant_raw += raw[raw_idx]
raw_idx += 1
- return variant_raw, variant_paragraph_tuples
- return raw, variant_paragraph_tuples
+ variant_example.doc = variant_raw
+ return variant_example
+ return variant_example
def add_noise(orig, noise_level):
@@ -488,30 +493,27 @@ def _corrupt(c, noise_level):
def read_json_object(json_corpus_section):
"""Take a list of JSON-formatted documents (e.g. from an already loaded
- training data file) and yield tuples in the GoldParse format.
+ training data file) and yield annotations in the GoldParse format.
json_corpus_section (list): The data.
- YIELDS (tuple): The reformatted data.
+ YIELDS (Example): The reformatted data - one training example per paragraph
"""
for json_doc in json_corpus_section:
- tuple_doc = json_to_tuple(json_doc)
- for tuple_paragraph in tuple_doc:
- yield tuple_paragraph
+ examples = json_to_examples(json_doc)
+ for ex in examples:
+ yield ex
-def json_to_tuple(doc):
- """Convert an item in the JSON-formatted training data to the tuple format
+def json_to_examples(doc):
+ """Convert an item in the JSON-formatted training data to the format
used by GoldParse.
doc (dict): One entry in the training data.
- YIELDS (tuple): The reformatted data.
+ YIELDS (Example): The reformatted data - one training example per paragraph
"""
paragraphs = []
for paragraph in doc["paragraphs"]:
- sents = []
- cats = {}
- for cat in paragraph.get("cats", {}):
- cats[cat["label"]] = cat["value"]
+ example = Example(doc=paragraph.get("raw", None))
for sent in paragraph["sentences"]:
words = []
ids = []
@@ -529,11 +531,14 @@ def json_to_tuple(doc):
if labels[-1].lower() == "root":
labels[-1] = "ROOT"
ner.append(token.get("ner", "-"))
- sents.append([
- [ids, words, tags, heads, labels, ner],
- [cats, sent.get("brackets", [])]])
- if sents:
- yield [paragraph.get("raw", None), sents]
+ example.add_token_annotation(ids=ids, words=words, tags=tags,
+ heads=heads, deps=labels, entities=ner,
+ brackets=sent.get("brackets", []))
+ cats = {}
+ for cat in paragraph.get("cats", {}):
+ cats[cat["label"]] = cat["value"]
+ example.add_doc_annotation(cats=cats)
+ yield example
def read_json_file(loc, docs_filter=None, limit=None):
@@ -545,8 +550,8 @@ def read_json_file(loc, docs_filter=None, limit=None):
for doc in _json_iterate(loc):
if docs_filter is not None and not docs_filter(doc):
continue
- for json_tuple in json_to_tuple(doc):
- yield json_tuple
+ for json_data in json_to_examples(doc):
+ yield json_data
def _json_iterate(loc):
@@ -639,21 +644,254 @@ def _consume_ent(tags):
return [start] + middle + [end]
+cdef class TokenAnnotation:
+ def __init__(self, ids=None, words=None, tags=None, heads=None, deps=None, entities=None, morphology=None, brackets=None):
+ self.ids = ids if ids else []
+ self.words = words if words else []
+ self.tags = tags if tags else []
+ self.heads = heads if heads else []
+ self.deps = deps if deps else []
+ self.entities = entities if entities else []
+ self.brackets = brackets if brackets else []
+ self.morphology = morphology if morphology else []
+
+ @classmethod
+ def from_dict(cls, token_dict):
+ return cls(ids=token_dict.get("ids", None),
+ words=token_dict.get("words", None),
+ tags=token_dict.get("tags", None),
+ heads=token_dict.get("heads", None),
+ deps=token_dict.get("deps", None),
+ entities=token_dict.get("entities", None),
+ morphology=token_dict.get("morphology", None),
+ brackets=token_dict.get("brackets", None))
+
+ def to_dict(self):
+ return {"ids": self.ids,
+ "words": self.words,
+ "tags": self.tags,
+ "heads": self.heads,
+ "deps": self.deps,
+ "entities": self.entities,
+ "morphology": self.morphology,
+ "brackets": self.brackets}
+
+
+cdef class DocAnnotation:
+ def __init__(self, cats=None, links=None):
+ self.cats = cats if cats else {}
+ self.links = links if links else {}
+
+ @classmethod
+ def from_dict(cls, doc_dict):
+ return cls(cats=doc_dict.get("cats", None), links=doc_dict.get("links", None))
+
+ def to_dict(self):
+ return {"cats": self.cats, "links": self.links}
+
+
+cdef class Example:
+ def __init__(self, doc_annotation=None, token_annotations=None, doc=None,
+ make_projective=False, ignore_misaligned=False, goldparse=None):
+ """ Doc can either be text, or an actual Doc """
+ self.doc = doc
+ self.doc_annotation = doc_annotation if doc_annotation else DocAnnotation()
+ self.token_annotations = token_annotations if token_annotations else []
+ self.make_projective = make_projective
+ self.ignore_misaligned = ignore_misaligned
+ self.goldparse = goldparse
+
+ @classmethod
+ def from_gold(cls, goldparse, doc=None):
+ doc_annotation = DocAnnotation(cats=goldparse.cats, links=goldparse.links)
+ token_annotation = goldparse.get_token_annotation()
+ return cls(doc_annotation, [token_annotation], doc)
+
+ @classmethod
+ def from_dict(cls, example_dict, doc=None):
+ token_dicts = example_dict["token_annotations"]
+ token_annotations = [TokenAnnotation.from_dict(t) for t in token_dicts]
+ doc_dict = example_dict["doc_annotation"]
+ doc_annotation = DocAnnotation.from_dict(doc_dict)
+ return cls(doc_annotation, token_annotations, doc)
+
+ def to_dict(self):
+ """ Note that this method does NOT export the doc, only the annotations ! """
+ token_dicts = [t.to_dict() for t in self.token_annotations]
+ doc_dict = self.doc_annotation.to_dict()
+ return {"token_annotations": token_dicts, "doc_annotation": doc_dict}
+
+ @property
+ def text(self):
+ if self.doc is None:
+ return None
+ if isinstance(self.doc, Doc):
+ return self.doc.text
+ return self.doc
+
+ @property
+ def gold(self):
+ if self.goldparse is None:
+ doc, gold = self.get_gold_parses(merge=True)[0]
+ self.goldparse = gold
+ return self.goldparse
+
+ def add_token_annotation(self, ids=None, words=None, tags=None, heads=None,
+ deps=None, entities=None, morphology=None, brackets=None):
+ t = TokenAnnotation(ids=ids, words=words, tags=tags,
+ heads=heads, deps=deps, entities=entities,
+ morphology=morphology, brackets=brackets)
+ self.token_annotations.append(t)
+
+ def add_doc_annotation(self, cats=None, links=None):
+ if cats:
+ self.doc_annotation.cats.update(cats)
+ if links:
+ self.doc_annotation.links.update(links)
+
+ def merge_sents(self):
+ """ Merge the list of token annotations into one object and return this new object """
+ m_example = Example(doc=self.doc, doc_annotation=self.doc_annotation)
+ m_ids, m_words, m_tags, m_heads, m_deps, m_ents, m_morph = [], [], [], [], [], [], []
+ m_brackets = []
+ i = 0
+ for t in self.token_annotations:
+ m_ids.extend(id_ + i for id_ in t.ids)
+ m_words.extend(t.words)
+ m_tags.extend(t.tags)
+            m_heads.extend(head + i if head is not None else None for head in t.heads)
+ m_deps.extend(t.deps)
+ m_ents.extend(t.entities)
+ m_morph.extend(t.morphology)
+ m_brackets.extend((b["first"] + i, b["last"] + i, b["label"])
+ for b in t.brackets)
+ i += len(t.ids)
+ m_example.add_token_annotation(ids=m_ids, words=m_words, tags=m_tags,
+ heads=m_heads, deps=m_deps, entities=m_ents,
+ morphology=m_morph, brackets=m_brackets)
+ return m_example
+
+
+ def get_gold_parses(self, merge=False, vocab=None):
+ """Return a list of (doc, GoldParse) objects.
+        If merge is set to True, all token annotations are first merged into one, yielding a single (doc, GoldParse) pair."""
+ d = self.doc_annotation
+ # merging different sentences
+ if merge:
+ merged_example = self.merge_sents()
+            assert len(merged_example.token_annotations) == 1
+ t = merged_example.token_annotations[0]
+ m_doc = merged_example.doc
+ if not m_doc:
+ if not vocab:
+ raise ValueError(Errors.E998)
+ m_doc = Doc(vocab, words=t.words)
+ try:
+ gp = GoldParse.from_annotation(m_doc, d, t, make_projective=self.make_projective)
+ except AlignmentError:
+ if self.ignore_misaligned:
+ gp = None
+ else:
+ raise
+ return [(self.doc, gp)]
+ # we only have one sentence and an appropriate doc
+ elif len(self.token_annotations) == 1 and self.doc is not None:
+ t = self.token_annotations[0]
+ try:
+ gp = GoldParse.from_annotation(self.doc, d, t, make_projective=self.make_projective)
+ except AlignmentError:
+ if self.ignore_misaligned:
+ gp = None
+ else:
+ raise
+ return [(self.doc, gp)]
+ # not merging: one GoldParse per 'sentence', defining docs with the words from each sentence
+ else:
+ parses = []
+ for t in self.token_annotations:
+ if not vocab:
+ raise ValueError(Errors.E998)
+ t_doc = Doc(vocab, words=t.words)
+ try:
+ gp = GoldParse.from_annotation(t_doc, d, t, make_projective=self.make_projective)
+ except AlignmentError:
+ if self.ignore_misaligned:
+ gp = None
+ else:
+ raise
+ if gp is not None:
+ parses.append((t_doc, gp))
+ return parses
+
+ @classmethod
+ def to_example_objects(cls, examples, make_doc=None, keep_raw_text=False):
+ """
+ Return a list of Example objects, from a variety of input formats.
+ make_doc needs to be provided when the examples contain text strings and keep_raw_text=False
+ """
+ if isinstance(examples, Example):
+ return [examples]
+ if isinstance(examples, tuple):
+ examples = [examples]
+ converted_examples = []
+ for ex in examples:
+ # convert string to Doc to Example
+ if isinstance(ex, basestring_):
+ if keep_raw_text:
+ converted_examples.append(Example(doc=ex))
+ else:
+ doc = make_doc(ex)
+ converted_examples.append(Example(doc=doc))
+ # convert Doc to Example
+ elif isinstance(ex, Doc):
+ converted_examples.append(Example(doc=ex))
+ # convert tuples to Example
+ elif isinstance(ex, tuple) and len(ex) == 2:
+ doc, gold = ex
+ gold_dict = {}
+ # convert string to Doc
+ if isinstance(doc, basestring_) and not keep_raw_text:
+ doc = make_doc(doc)
+ # convert dict to GoldParse
+ if isinstance(gold, dict):
+ gold_dict = gold
+ if doc is not None or gold.get("words", None) is not None:
+ gold = GoldParse(doc, **gold)
+ else:
+ gold = None
+ if gold is not None:
+ converted_examples.append(Example.from_gold(goldparse=gold, doc=doc))
+ else:
+ raise ValueError(Errors.E999.format(gold_dict=gold_dict))
+ else:
+ converted_examples.append(ex)
+ return converted_examples
+
+
cdef class GoldParse:
"""Collection for training annotations.
DOCS: https://spacy.io/api/goldparse
"""
@classmethod
- def from_annot_tuples(cls, doc, annot_tuples, cats=None, make_projective=False):
- _, words, tags, heads, deps, entities = annot_tuples
- return cls(doc, words=words, tags=tags, heads=heads, deps=deps,
- entities=entities, cats=cats,
+ def from_annotation(cls, doc, doc_annotation, token_annotation, make_projective=False):
+ return cls(doc, words=token_annotation.words, tags=token_annotation.tags,
+ heads=token_annotation.heads, deps=token_annotation.deps, entities=token_annotation.entities,
+ morphology=token_annotation.morphology, cats=doc_annotation.cats, links=doc_annotation.links,
make_projective=make_projective)
- def __init__(self, doc, annot_tuples=None, words=None, tags=None, morphology=None,
+ def get_token_annotation(self):
+ ids = None
+ if self.words:
+ ids = list(range(len(self.words)))
+
+ return TokenAnnotation(ids=ids, words=self.words, tags=self.tags,
+ heads=self.heads, deps=self.labels, entities=self.ner,
+ morphology=self.morphology)
+
+ def __init__(self, doc, words=None, tags=None, morphology=None,
heads=None, deps=None, entities=None, make_projective=False,
- cats=None, links=None, **_):
+ cats=None, links=None):
"""Create a GoldParse. The fields will not be initialized if len(doc) is zero.
doc (Doc): The document the annotations refer to.
@@ -688,19 +926,19 @@ cdef class GoldParse:
self.length = len(doc)
self.cats = {} if cats is None else dict(cats)
- self.links = links
+ self.links = {} if links is None else dict(links)
# avoid allocating memory if the doc does not contain any tokens
if self.length > 0:
- if words is None:
+ if not words:
words = [token.text for token in doc]
- if tags is None:
+ if not tags:
tags = [None for _ in words]
- if heads is None:
+ if not heads:
heads = [None for _ in words]
- if deps is None:
+ if not deps:
deps = [None for _ in words]
- if morphology is None:
+ if not morphology:
morphology = [None for _ in words]
if entities is None:
entities = ["-" for _ in words]
@@ -710,7 +948,7 @@ cdef class GoldParse:
# Translate the None values to '-', to make processing easier.
# See Issue #2603
entities = [(ent if ent is not None else "-") for ent in entities]
- if not isinstance(entities[0], basestring):
+ if not isinstance(entities[0], basestring_):
# Assume we have entities specified by character offset.
entities = biluo_tags_from_offsets(doc, entities)
@@ -745,8 +983,9 @@ cdef class GoldParse:
self.cand_to_gold = [(j if j >= 0 else None) for j in i2j]
self.gold_to_cand = [(i if i >= 0 else None) for i in j2i]
- annot_tuples = (range(len(words)), words, tags, heads, deps, entities)
- self.orig_annot = list(zip(*annot_tuples))
+ self.orig = TokenAnnotation(ids=list(range(len(words))), words=words, tags=tags,
+ heads=heads, deps=deps, entities=entities, morphology=morphology,
+ brackets=[])
for i, gold_i in enumerate(self.cand_to_gold):
if doc[i].text.isspace():
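
To make the new containers concrete, here is a small, hypothetical sketch that builds an
Example by hand and turns it into (Doc, GoldParse) pairs through the API added above; the
sentence, tags and entity labels are invented for illustration:

    from spacy.gold import Example
    from spacy.vocab import Vocab

    example = Example()                      # no Doc attached yet
    example.add_token_annotation(
        ids=[0, 1, 2],
        words=["I", "like", "London"],
        tags=["PRP", "VBP", "NNP"],
        entities=["O", "O", "U-GPE"],
    )
    example.add_doc_annotation(cats={"TRAVEL": 1.0})

    # without a Doc, get_gold_parses() needs a Vocab to build one (see E998 above)
    for doc, gold in example.get_gold_parses(vocab=Vocab()):
        print([t.text for t in doc], gold.ner, gold.cats)
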
diff --git a/spacy/language.py b/spacy/language.py
index d53710f58..3106c6afe 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -3,6 +3,8 @@ from __future__ import absolute_import, unicode_literals
import random
import itertools
+
+from spacy.gold import Example
from spacy.util import minibatch
import weakref
import functools
@@ -409,7 +411,7 @@ class Language(object):
def __call__(self, text, disable=[], component_cfg=None):
"""Apply the pipeline to some text. The text can span multiple sentences,
- and can contain arbtrary whitespace. Alignment into the original string
+ and can contain arbitrary whitespace. Alignment into the original string
is preserved.
text (unicode): The text to be processed.
@@ -452,30 +454,10 @@ class Language(object):
def make_doc(self, text):
return self.tokenizer(text)
- def _format_docs_and_golds(self, docs, golds):
- """Format golds and docs before update models."""
- expected_keys = ("words", "tags", "heads", "deps", "entities", "cats", "links")
- gold_objs = []
- doc_objs = []
- for doc, gold in zip(docs, golds):
- if isinstance(doc, basestring_):
- doc = self.make_doc(doc)
- if not isinstance(gold, GoldParse):
- unexpected = [k for k in gold if k not in expected_keys]
- if unexpected:
- err = Errors.E151.format(unexp=unexpected, exp=expected_keys)
- raise ValueError(err)
- gold = GoldParse(doc, **gold)
- doc_objs.append(doc)
- gold_objs.append(gold)
-
- return doc_objs, gold_objs
-
- def update(self, docs, golds, drop=0.0, sgd=None, losses=None, component_cfg=None):
+ def update(self, examples, drop=0.0, sgd=None, losses=None, component_cfg=None):
"""Update the models in the pipeline.
- docs (iterable): A batch of `Doc` objects.
- golds (iterable): A batch of `GoldParse` objects.
+ examples (iterable): A batch of `Example` or `Doc` objects.
drop (float): The dropout rate.
sgd (callable): An optimizer.
losses (dict): Dictionary to update with the loss, keyed by component.
@@ -484,18 +466,16 @@ class Language(object):
DOCS: https://spacy.io/api/language#update
"""
- if len(docs) != len(golds):
- raise IndexError(Errors.E009.format(n_docs=len(docs), n_golds=len(golds)))
- if len(docs) == 0:
+ if len(examples) == 0:
return
+ examples = Example.to_example_objects(examples, make_doc=self.make_doc)
+
if sgd is None:
if self._optimizer is None:
self._optimizer = create_default_optimizer(Model.ops)
sgd = self._optimizer
- # Allow dict of args to GoldParse, instead of GoldParse objects.
- docs, golds = self._format_docs_and_golds(docs, golds)
- grads = {}
+ grads = {}
def get_grads(W, dW, key=None):
grads[key] = (W, dW)
@@ -512,18 +492,18 @@ class Language(object):
grads = {}
kwargs = component_cfg.get(name, {})
kwargs.setdefault("drop", drop)
- proc.update(docs, golds, sgd=get_grads, losses=losses, **kwargs)
+ proc.update(examples, sgd=get_grads, losses=losses, **kwargs)
for key, (W, dW) in grads.items():
sgd(W, dW, key=key)
- def rehearse(self, docs, sgd=None, losses=None, config=None):
+ def rehearse(self, examples, sgd=None, losses=None, config=None):
"""Make a "rehearsal" update to the models in the pipeline, to prevent
forgetting. Rehearsal updates run an initial copy of the model over some
data, and update the model so its current predictions are more like the
initial ones. This is useful for keeping a pretrained model on-track,
even if you're updating it with a smaller set of examples.
- docs (iterable): A batch of `Doc` objects.
+ examples (iterable): A batch of `Doc` objects.
drop (float): The dropout rate.
sgd (callable): An optimizer.
RETURNS (dict): Results from the update.
@@ -531,22 +511,18 @@ class Language(object):
EXAMPLE:
>>> raw_text_batches = minibatch(raw_texts)
>>> for labelled_batch in minibatch(zip(train_docs, train_golds)):
- >>> docs, golds = zip(*train_docs)
- >>> nlp.update(docs, golds)
+ >>> nlp.update(labelled_batch)
>>> raw_batch = [nlp.make_doc(text) for text in next(raw_text_batches)]
>>> nlp.rehearse(raw_batch)
"""
# TODO: document
- if len(docs) == 0:
+ if len(examples) == 0:
return
+ examples = Example.to_example_objects(examples, make_doc=self.make_doc)
if sgd is None:
if self._optimizer is None:
self._optimizer = create_default_optimizer(Model.ops)
sgd = self._optimizer
- docs = list(docs)
- for i, doc in enumerate(docs):
- if isinstance(doc, basestring_):
- docs[i] = self.make_doc(doc)
pipes = list(self.pipeline)
random.shuffle(pipes)
if config is None:
@@ -563,44 +539,45 @@ class Language(object):
if not hasattr(proc, "rehearse"):
continue
grads = {}
- proc.rehearse(docs, sgd=get_grads, losses=losses, **config.get(name, {}))
+ proc.rehearse(examples, sgd=get_grads, losses=losses, **config.get(name, {}))
for key, (W, dW) in grads.items():
sgd(W, dW, key=key)
return losses
- def preprocess_gold(self, docs_golds):
+ def preprocess_gold(self, examples):
"""Can be called before training to pre-process gold data. By default,
it handles nonprojectivity and adds missing tags to the tag map.
- docs_golds (iterable): Tuples of `Doc` and `GoldParse` objects.
- YIELDS (tuple): Tuples of preprocessed `Doc` and `GoldParse` objects.
+ examples (iterable): `Example` objects.
+        YIELDS (Example): Preprocessed `Example` objects.
"""
for name, proc in self.pipeline:
if hasattr(proc, "preprocess_gold"):
- docs_golds = proc.preprocess_gold(docs_golds)
- for doc, gold in docs_golds:
- yield doc, gold
+ examples = proc.preprocess_gold(examples)
+ for ex in examples:
+ yield ex
- def begin_training(self, get_gold_tuples=None, sgd=None, component_cfg=None, **cfg):
+ def begin_training(self, get_examples=None, sgd=None, component_cfg=None, **cfg):
"""Allocate models, pre-process training data and acquire a trainer and
optimizer. Used as a contextmanager.
- get_gold_tuples (function): Function returning gold data
+ get_examples (function): Function returning example training data (TODO: document format change since 3.0)
component_cfg (dict): Config parameters for specific components.
**cfg: Config parameters.
RETURNS: An optimizer.
DOCS: https://spacy.io/api/language#begin_training
"""
- if get_gold_tuples is None:
- get_gold_tuples = lambda: []
+ # TODO: throw warning when get_gold_tuples is provided instead of get_examples
+ if get_examples is None:
+ get_examples = lambda: []
# Populate vocab
else:
- for _, annots_brackets in get_gold_tuples():
- _ = annots_brackets.pop()
- for annots, _ in annots_brackets:
- for word in annots[1]:
+ for example in get_examples():
+ for token_annotation in example.token_annotations:
+ for word in token_annotation.words:
_ = self.vocab[word] # noqa: F841
+
if cfg.get("device", -1) >= 0:
util.use_gpu(cfg["device"])
if self.vocab.vectors.data.shape[1] >= 1:
@@ -618,7 +595,7 @@ class Language(object):
kwargs = component_cfg.get(name, {})
kwargs.update(cfg)
proc.begin_training(
- get_gold_tuples,
+ get_examples,
pipeline=self.pipeline,
sgd=self._optimizer,
**kwargs
@@ -650,11 +627,11 @@ class Language(object):
return self._optimizer
def evaluate(
- self, docs_golds, verbose=False, batch_size=256, scorer=None, component_cfg=None
+ self, examples, verbose=False, batch_size=256, scorer=None, component_cfg=None
):
"""Evaluate a model's pipeline components.
- docs_golds (iterable): Tuples of `Doc` and `GoldParse` objects.
+ examples (iterable): `Example` objects.
verbose (bool): Print debugging information.
batch_size (int): Batch size to use.
scorer (Scorer): Optional `Scorer` to use. If not passed in, a new one
@@ -665,30 +642,24 @@ class Language(object):
DOCS: https://spacy.io/api/language#evaluate
"""
+ examples = Example.to_example_objects(examples, make_doc=self.make_doc)
if scorer is None:
scorer = Scorer(pipeline=self.pipeline)
if component_cfg is None:
component_cfg = {}
- docs, golds = zip(*docs_golds)
- docs = [
- self.make_doc(doc) if isinstance(doc, basestring_) else doc for doc in docs
- ]
- golds = list(golds)
for name, pipe in self.pipeline:
kwargs = component_cfg.get(name, {})
kwargs.setdefault("batch_size", batch_size)
if not hasattr(pipe, "pipe"):
- docs = _pipe(pipe, docs, kwargs)
+ examples = _pipe(pipe, examples, kwargs)
else:
- docs = pipe.pipe(docs, **kwargs)
- for doc, gold in zip(docs, golds):
- if not isinstance(gold, GoldParse):
- gold = GoldParse(doc, **gold)
+ examples = pipe.pipe(examples, as_example=True, **kwargs)
+ for ex in examples:
if verbose:
- print(doc)
+ print(ex.doc)
kwargs = component_cfg.get("scorer", {})
kwargs.setdefault("verbose", verbose)
- scorer.score(doc, gold, **kwargs)
+ scorer.score(ex, **kwargs)
return scorer
@contextmanager
@@ -733,6 +704,7 @@ class Language(object):
cleanup=False,
component_cfg=None,
n_process=1,
+ as_example=False
):
"""Process texts as a stream, and yield `Doc` objects in order.
@@ -770,6 +742,7 @@ class Language(object):
batch_size=batch_size,
disable=disable,
component_cfg=component_cfg,
+ as_example=False
)
for doc, context in izip(docs, contexts):
yield (doc, context)
@@ -1095,15 +1068,15 @@ class DisabledPipes(list):
self[:] = []
-def _pipe(docs, proc, kwargs):
+def _pipe(proc, examples, kwargs):
# We added some args for pipe that __call__ doesn't expect.
kwargs = dict(kwargs)
for arg in ["n_threads", "batch_size"]:
if arg in kwargs:
kwargs.pop(arg)
- for doc in docs:
- doc = proc(doc, **kwargs)
- yield doc
+ for ex in examples:
+ ex = proc(ex, **kwargs)
+ yield ex
def _apply_pipes(make_doc, pipes, reciever, sender):
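
Since Language.update and Language.evaluate now normalise their input through
Example.to_example_objects, the older calling conventions keep working. A hedged sketch of
the accepted formats on a blank pipeline (text and annotations are made up):

    import spacy
    from spacy.gold import Example

    nlp = spacy.blank("en")
    mixed_batch = [
        "Raw text only",                                    # plain string
        nlp.make_doc("A pre-made Doc"),                     # Doc object
        ("I like London", {"entities": [(7, 13, "GPE")]}),  # (text, annotation dict) tuple
    ]
    examples = Example.to_example_objects(mixed_batch, make_doc=nlp.make_doc)
    for ex in examples:
        print(type(ex.doc).__name__, repr(ex.text))
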
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index 72e31f120..adcff9280 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -97,18 +97,19 @@ class Morphologizer(Pipe):
if doc[j].morph.pos != 0:
doc.c[j].pos = doc[j].morph.pos
- def update(self, docs, golds, drop=0., sgd=None, losses=None):
+ def update(self, examples, drop=0., sgd=None, losses=None):
if losses is not None and self.name not in losses:
losses[self.name] = 0.
+ docs = [self._get_doc(ex) for ex in examples]
tag_scores, bp_tag_scores = self.model.begin_update(docs, drop=drop)
- loss, d_tag_scores = self.get_loss(docs, golds, tag_scores)
+ loss, d_tag_scores = self.get_loss(examples, tag_scores)
bp_tag_scores(d_tag_scores, sgd=sgd)
if losses is not None:
losses[self.name] += loss
- def get_loss(self, docs, golds, scores):
+ def get_loss(self, examples, scores):
guesses = []
for doc_scores in scores:
guesses.append(scores_to_guesses(doc_scores, self.model.softmax.out_sizes))
@@ -122,7 +123,9 @@ class Morphologizer(Pipe):
# Do this on CPU, as we can't vectorize easily.
target = numpy.zeros(scores.shape, dtype='f')
field_sizes = self.model.softmax.out_sizes
- for doc, gold in zip(docs, golds):
+ for example in examples:
+ doc = example.doc
+ gold = example.gold
for t, features in enumerate(gold.morphology):
if features is None:
target[idx] = scores[idx]
@@ -146,6 +149,7 @@ class Morphologizer(Pipe):
scores = self.model.ops.asarray(scores, dtype='f')
d_scores = scores - target
loss = (d_scores**2).sum()
+ docs = [self._get_doc(ex) for ex in examples]
d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
return float(loss), d_scores
diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index d29cf9ce9..1d67d8e16 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -13,6 +13,7 @@ from thinc.misc import LayerNorm
from thinc.neural.util import to_categorical
from thinc.neural.util import get_array_module
+from spacy.gold import Example
from ..tokens.doc cimport Doc
from ..syntax.nn_parser cimport Parser
from ..syntax.ner cimport BiluoPushDown
@@ -59,11 +60,17 @@ class Pipe(object):
def from_nlp(cls, nlp, **cfg):
return cls(nlp.vocab, **cfg)
+ def _get_doc(self, example):
+ """ Use this method if the `example` method can be both a Doc or an Example """
+ if isinstance(example, Doc):
+ return example
+ return example.doc
+
def __init__(self, vocab, model=True, **cfg):
"""Create a new pipe instance."""
raise NotImplementedError
- def __call__(self, doc):
+ def __call__(self, example):
"""Apply the pipe to one document. The document is
modified in-place, and returned.
@@ -71,12 +78,16 @@ class Pipe(object):
and `set_annotations()` methods.
"""
self.require_model()
+ doc = self._get_doc(example)
predictions = self.predict([doc])
if isinstance(predictions, tuple) and len(predictions) == 2:
scores, tensors = predictions
self.set_annotations([doc], scores, tensors=tensors)
else:
self.set_annotations([doc], predictions)
+ if isinstance(example, Example):
+ example.doc = doc
+ return example
return doc
def require_model(self):
@@ -84,21 +95,30 @@ class Pipe(object):
if getattr(self, "model", None) in (None, True, False):
raise ValueError(Errors.E109.format(name=self.name))
- def pipe(self, stream, batch_size=128, n_threads=-1):
+ def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
"""Apply the pipe to a stream of documents.
Both __call__ and pipe should delegate to the `predict()`
and `set_annotations()` methods.
"""
- for docs in util.minibatch(stream, size=batch_size):
- docs = list(docs)
+ for examples in util.minibatch(stream, size=batch_size):
+ examples = list(examples)
+ docs = [self._get_doc(ex) for ex in examples]
predictions = self.predict(docs)
             if isinstance(predictions, tuple) and len(predictions) == 2:
scores, tensors = predictions
self.set_annotations(docs, scores, tensors=tensors)
else:
self.set_annotations(docs, predictions)
- yield from docs
+
+ if as_example:
+                annotated_examples = []
+                for ex, doc in zip(examples, docs):
+                    ex.doc = doc
+                    annotated_examples.append(ex)
+                yield from annotated_examples
+ else:
+ yield from docs
def predict(self, docs):
"""Apply the pipeline's model to a batch of docs, without
@@ -111,7 +131,7 @@ class Pipe(object):
"""Modify a batch of documents, using pre-computed scores."""
raise NotImplementedError
- def update(self, docs, golds, drop=0.0, sgd=None, losses=None):
+ def update(self, examples, drop=0.0, sgd=None, losses=None):
"""Learn from a batch of documents and gold-standard information,
updating the pipe's model.
@@ -119,12 +139,12 @@ class Pipe(object):
"""
pass
- def rehearse(self, docs, sgd=None, losses=None, **config):
+ def rehearse(self, examples, sgd=None, losses=None, **config):
pass
- def get_loss(self, docs, golds, scores):
+ def get_loss(self, examples, scores):
"""Find the loss and gradient of loss for the batch of
- documents and their predicted scores."""
+ examples (with embedded docs) and their predicted scores."""
raise NotImplementedError
def add_label(self, label):
@@ -140,7 +160,7 @@ class Pipe(object):
return create_default_optimizer(self.model.ops, **self.cfg.get("optimizer", {}))
def begin_training(
- self, get_gold_tuples=lambda: [], pipeline=None, sgd=None, **kwargs
+ self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs
):
"""Initialize the pipe for training, using data exampes if available.
If no model has been initialized yet, the model is added."""
@@ -264,29 +284,41 @@ class Tensorizer(Pipe):
self.cfg = dict(cfg)
self.cfg.setdefault("cnn_maxout_pieces", 3)
- def __call__(self, doc):
+ def __call__(self, example):
"""Add context-sensitive vectors to a `Doc`, e.g. from a CNN or LSTM
model. Vectors are set to the `Doc.tensor` attribute.
docs (Doc or iterable): One or more documents to add vectors to.
RETURNS (dict or None): Intermediate computations.
"""
+ doc = self._get_doc(example)
tokvecses = self.predict([doc])
self.set_annotations([doc], tokvecses)
+ if isinstance(example, Example):
+ example.doc = doc
+ return example
return doc
- def pipe(self, stream, batch_size=128, n_threads=-1):
+ def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
"""Process `Doc` objects as a stream.
- stream (iterator): A sequence of `Doc` objects to process.
- batch_size (int): Number of `Doc` objects to group.
- YIELDS (iterator): A sequence of `Doc` objects, in order of input.
+ stream (iterator): A sequence of `Doc` or `Example` objects to process.
+ batch_size (int): Number of `Doc` or `Example` objects to group.
+ YIELDS (iterator): A sequence of `Doc` or `Example` objects, in order of input.
"""
- for docs in util.minibatch(stream, size=batch_size):
- docs = list(docs)
+ for examples in util.minibatch(stream, size=batch_size):
+ docs = [self._get_doc(ex) for ex in examples]
tensors = self.predict(docs)
self.set_annotations(docs, tensors)
- yield from docs
+
+ if as_example:
+                annotated_examples = []
+                for ex, doc in zip(examples, docs):
+                    ex.doc = doc
+                    annotated_examples.append(ex)
+                yield from annotated_examples
+ else:
+ yield from docs
def predict(self, docs):
"""Return a single tensor for a batch of documents.
@@ -310,7 +342,7 @@ class Tensorizer(Pipe):
raise ValueError(Errors.E076.format(rows=tensor.shape[0], words=len(doc)))
doc.tensor = tensor
- def update(self, docs, golds, state=None, drop=0.0, sgd=None, losses=None):
+ def update(self, examples, state=None, drop=0.0, sgd=None, losses=None):
"""Update the model.
docs (iterable): A batch of `Doc` objects.
@@ -320,17 +352,16 @@ class Tensorizer(Pipe):
RETURNS (dict): Results from the update.
"""
self.require_model()
- if isinstance(docs, Doc):
- docs = [docs]
+ examples = Example.to_example_objects(examples)
inputs = []
bp_inputs = []
for tok2vec in self.input_models:
- tensor, bp_tensor = tok2vec.begin_update(docs, drop=drop)
+ tensor, bp_tensor = tok2vec.begin_update([ex.doc for ex in examples], drop=drop)
inputs.append(tensor)
bp_inputs.append(bp_tensor)
inputs = self.model.ops.xp.hstack(inputs)
scores, bp_scores = self.model.begin_update(inputs, drop=drop)
- loss, d_scores = self.get_loss(docs, golds, scores)
+ loss, d_scores = self.get_loss(examples, scores)
d_inputs = bp_scores(d_scores, sgd=sgd)
d_inputs = self.model.ops.xp.split(d_inputs, len(self.input_models), axis=1)
for d_input, bp_input in zip(d_inputs, bp_inputs):
@@ -340,18 +371,19 @@ class Tensorizer(Pipe):
losses[self.name] += loss
return loss
- def get_loss(self, docs, golds, prediction):
- ids = self.model.ops.flatten([doc.to_array(ID).ravel() for doc in docs])
+ def get_loss(self, examples, prediction):
+ examples = Example.to_example_objects(examples)
+ ids = self.model.ops.flatten([ex.doc.to_array(ID).ravel() for ex in examples])
target = self.vocab.vectors.data[ids]
d_scores = (prediction - target) / prediction.shape[0]
loss = (d_scores ** 2).sum()
return loss, d_scores
- def begin_training(self, gold_tuples=lambda: [], pipeline=None, sgd=None, **kwargs):
+ def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs):
"""Allocate models, pre-process training data and acquire an
optimizer.
- gold_tuples (iterable): Gold-standard training data.
+        get_examples (function): Function that returns gold-standard training data.
pipeline (list): The pipeline the model is part of.
"""
if pipeline is not None:
@@ -391,17 +423,30 @@ class Tagger(Pipe):
else:
return chain(self.model.tok2vec, flatten)
- def __call__(self, doc):
+ def __call__(self, example):
+ doc = self._get_doc(example)
tags, tokvecs = self.predict([doc])
self.set_annotations([doc], tags, tensors=tokvecs)
+ if isinstance(example, Example):
+ example.doc = doc
+ return example
return doc
- def pipe(self, stream, batch_size=128, n_threads=-1):
- for docs in util.minibatch(stream, size=batch_size):
- docs = list(docs)
+ def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
+ for examples in util.minibatch(stream, size=batch_size):
+ examples = list(examples)
+ docs = [self._get_doc(ex) for ex in examples]
tag_ids, tokvecs = self.predict(docs)
self.set_annotations(docs, tag_ids, tensors=tokvecs)
- yield from docs
+
+ if as_example:
+                annotated_examples = []
+                for ex, doc in zip(examples, docs):
+                    ex.doc = doc
+                    annotated_examples.append(ex)
+                yield from annotated_examples
+ else:
+ yield from docs
def predict(self, docs):
self.require_model()
@@ -452,47 +497,51 @@ class Tagger(Pipe):
doc.extend_tensor(tensors[i])
doc.is_tagged = True
- def update(self, docs, golds, drop=0., sgd=None, losses=None):
+ def update(self, examples, drop=0., sgd=None, losses=None):
self.require_model()
+ examples = Example.to_example_objects(examples)
if losses is not None and self.name not in losses:
losses[self.name] = 0.
- if not any(len(doc) for doc in docs):
+ if not any(len(ex.doc) if ex.doc else 0 for ex in examples):
# Handle cases where there are no tokens in any docs.
return
- tag_scores, bp_tag_scores = self.model.begin_update(docs, drop=drop)
- loss, d_tag_scores = self.get_loss(docs, golds, tag_scores)
+ tag_scores, bp_tag_scores = self.model.begin_update([ex.doc for ex in examples], drop=drop)
+ loss, d_tag_scores = self.get_loss(examples, tag_scores)
bp_tag_scores(d_tag_scores, sgd=sgd)
if losses is not None:
losses[self.name] += loss
- def rehearse(self, docs, drop=0., sgd=None, losses=None):
+ def rehearse(self, examples, drop=0., sgd=None, losses=None):
"""Perform a 'rehearsal' update, where we try to match the output of
an initial model.
"""
if self._rehearsal_model is None:
return
+ examples = Example.to_example_objects(examples)
+ docs = [ex.doc for ex in examples]
if not any(len(doc) for doc in docs):
# Handle cases where there are no tokens in any docs.
return
guesses, backprop = self.model.begin_update(docs, drop=drop)
- target = self._rehearsal_model(docs)
+        target = self._rehearsal_model(docs)
gradient = guesses - target
backprop(gradient, sgd=sgd)
if losses is not None:
losses.setdefault(self.name, 0.0)
losses[self.name] += (gradient**2).sum()
- def get_loss(self, docs, golds, scores):
+ def get_loss(self, examples, scores):
scores = self.model.ops.flatten(scores)
tag_index = {tag: i for i, tag in enumerate(self.labels)}
cdef int idx = 0
correct = numpy.zeros((scores.shape[0],), dtype="i")
guesses = scores.argmax(axis=1)
known_labels = numpy.ones((scores.shape[0], 1), dtype="f")
- for gold in golds:
+ for ex in examples:
+ gold = ex.gold
for tag in gold.tags:
if tag is None:
correct[idx] = guesses[idx]
@@ -506,20 +555,20 @@ class Tagger(Pipe):
d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])
d_scores *= self.model.ops.asarray(known_labels)
loss = (d_scores**2).sum()
+ docs = [ex.doc for ex in examples]
d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
return float(loss), d_scores
- def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None,
+ def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None,
**kwargs):
lemma_tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"]
if not any(table in self.vocab.lookups for table in lemma_tables):
user_warning(Warnings.W022)
orig_tag_map = dict(self.vocab.morphology.tag_map)
new_tag_map = OrderedDict()
- for raw_text, annots_brackets in get_gold_tuples():
- for annots, brackets in annots_brackets:
- ids, words, tags, heads, deps, ents = annots
- for tag in tags:
+ for example in get_examples():
+ for token_annotation in example.token_annotations:
+ for tag in token_annotation.tags:
if tag in orig_tag_map:
new_tag_map[tag] = orig_tag_map[tag]
else:
@@ -698,14 +747,14 @@ class MultitaskObjective(Tagger):
def set_annotations(self, docs, dep_ids, tensors=None):
pass
- def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, tok2vec=None,
+ def begin_training(self, get_examples=lambda: [], pipeline=None, tok2vec=None,
sgd=None, **kwargs):
- gold_tuples = nonproj.preprocess_training_data(get_gold_tuples())
- for raw_text, annots_brackets in gold_tuples:
- for annots, brackets in annots_brackets:
- ids, words, tags, heads, deps, ents = annots
- for i in range(len(ids)):
- label = self.make_label(i, words, tags, heads, deps, ents)
+ gold_examples = nonproj.preprocess_training_data(get_examples())
+ # for raw_text, doc_annot in gold_tuples:
+ for example in gold_examples:
+ for token_annotation in example.token_annotations:
+ for i in range(len(token_annotation.ids)):
+ label = self.make_label(i, token_annotation)
if label is not None and label not in self.labels:
self.labels[label] = len(self.labels)
if self.model is True:
@@ -735,18 +784,17 @@ class MultitaskObjective(Tagger):
scores = self.model.softmax(tokvecs)
return tokvecs, scores
- def get_loss(self, docs, golds, scores):
- if len(docs) != len(golds):
- raise ValueError(Errors.E077.format(value="loss", n_docs=len(docs),
- n_golds=len(golds)))
+ def get_loss(self, examples, scores):
cdef int idx = 0
correct = numpy.zeros((scores.shape[0],), dtype="i")
guesses = scores.argmax(axis=1)
+ golds = [ex.gold for ex in examples]
+ docs = [ex.doc for ex in examples]
for i, gold in enumerate(golds):
for j in range(len(docs[i])):
- # Handes alignment for tokenization differences
- label = self.make_label(j, gold.words, gold.tags,
- gold.heads, gold.labels, gold.ents)
+ # Handles alignment for tokenization differences
+ token_annotation = gold.get_token_annotation()
+ label = self.make_label(j, token_annotation)
if label is None or label not in self.labels:
correct[idx] = guesses[idx]
else:
@@ -758,39 +806,39 @@ class MultitaskObjective(Tagger):
return float(loss), d_scores
@staticmethod
- def make_dep(i, words, tags, heads, deps, ents):
- if deps[i] is None or heads[i] is None:
+ def make_dep(i, token_annotation):
+ if token_annotation.deps[i] is None or token_annotation.heads[i] is None:
return None
- return deps[i]
+ return token_annotation.deps[i]
@staticmethod
- def make_tag(i, words, tags, heads, deps, ents):
- return tags[i]
+ def make_tag(i, token_annotation):
+ return token_annotation.tags[i]
@staticmethod
- def make_ent(i, words, tags, heads, deps, ents):
- if ents is None:
+ def make_ent(i, token_annotation):
+ if token_annotation.entities is None:
return None
- return ents[i]
+ return token_annotation.entities[i]
@staticmethod
- def make_dep_tag_offset(i, words, tags, heads, deps, ents):
- if deps[i] is None or heads[i] is None:
+ def make_dep_tag_offset(i, token_annotation):
+ if token_annotation.deps[i] is None or token_annotation.heads[i] is None:
return None
- offset = heads[i] - i
+ offset = token_annotation.heads[i] - i
offset = min(offset, 2)
offset = max(offset, -2)
- return "%s-%s:%d" % (deps[i], tags[i], offset)
+ return "%s-%s:%d" % (token_annotation.deps[i], token_annotation.tags[i], offset)
@staticmethod
- def make_ent_tag(i, words, tags, heads, deps, ents):
- if ents is None or ents[i] is None:
+ def make_ent_tag(i, token_annotation):
+ if token_annotation.entities is None or token_annotation.entities[i] is None:
return None
else:
- return "%s-%s" % (tags[i], ents[i])
+ return "%s-%s" % (token_annotation.tags[i], token_annotation.entities[i])
@staticmethod
- def make_sent_start(target, words, tags, heads, deps, ents, cache=True, _cache={}):
+ def make_sent_start(target, token_annotation, cache=True, _cache={}):
"""A multi-task objective for representing sentence boundaries,
using BILU scheme. (O is impossible)
@@ -799,6 +847,8 @@ class MultitaskObjective(Tagger):
of gold data. You can pass cache=False if you know the cache will
do the wrong thing.
"""
+ words = token_annotation.words
+ heads = token_annotation.heads
assert len(words) == len(heads)
assert target < len(words), (target, len(words))
if cache:
@@ -857,7 +907,7 @@ class ClozeMultitask(Pipe):
def set_annotations(self, docs, dep_ids, tensors=None):
pass
- def begin_training(self, get_gold_tuples=lambda: [], pipeline=None,
+ def begin_training(self, get_examples=lambda: [], pipeline=None,
tok2vec=None, sgd=None, **kwargs):
link_vectors_to_models(self.vocab)
if self.model is True:
@@ -874,25 +924,26 @@ class ClozeMultitask(Pipe):
vectors = self.model.output_layer(tokvecs)
return tokvecs, vectors
- def get_loss(self, docs, vectors, prediction):
+ def get_loss(self, examples, vectors, prediction):
# The simplest way to implement this would be to vstack the
# token.vector values, but that's a bit inefficient, especially on GPU.
# Instead we fetch the index into the vectors table for each of our tokens,
# and look them up all at once. This prevents data copying.
- ids = self.model.ops.flatten([doc.to_array(ID).ravel() for doc in docs])
+ ids = self.model.ops.flatten([ex.doc.to_array(ID).ravel() for ex in examples])
target = vectors[ids]
loss, gradient = get_cossim_loss(prediction, target, ignore_zeros=True)
return float(loss), gradient
- def update(self, docs, golds, drop=0., sgd=None, losses=None):
+ def update(self, examples, drop=0., sgd=None, losses=None):
pass
- def rehearse(self, docs, drop=0., sgd=None, losses=None):
+ def rehearse(self, examples, drop=0., sgd=None, losses=None):
self.require_model()
+ examples = Example.to_example_objects(examples)
if losses is not None and self.name not in losses:
losses[self.name] = 0.
- predictions, bp_predictions = self.model.begin_update(docs, drop=drop)
- loss, d_predictions = self.get_loss(docs, self.vocab.vectors.data, predictions)
+ predictions, bp_predictions = self.model.begin_update([ex.doc for ex in examples], drop=drop)
+ loss, d_predictions = self.get_loss(examples, self.vocab.vectors.data, predictions)
bp_predictions(d_predictions, sgd=sgd)
if losses is not None:
@@ -947,12 +998,21 @@ class TextCategorizer(Pipe):
def labels(self, value):
self.cfg["labels"] = tuple(value)
- def pipe(self, stream, batch_size=128, n_threads=-1):
- for docs in util.minibatch(stream, size=batch_size):
- docs = list(docs)
+ def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
+ for examples in util.minibatch(stream, size=batch_size):
+ examples = list(examples)
+ docs = [self._get_doc(ex) for ex in examples]
scores, tensors = self.predict(docs)
self.set_annotations(docs, scores, tensors=tensors)
- yield from docs
+
+ if as_example:
+ annotated_examples = []
+ for ex, doc in zip(examples, docs):
+ ex.doc = doc
+ annotated_examples.append(ex)
+ yield from annotated_examples
+ else:
+ yield from docs
def predict(self, docs):
self.require_model()
@@ -973,33 +1033,37 @@ class TextCategorizer(Pipe):
for j, label in enumerate(self.labels):
doc.cats[label] = float(scores[i, j])
- def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None):
+ def update(self, examples, state=None, drop=0., sgd=None, losses=None):
self.require_model()
- if not any(len(doc) for doc in docs):
+ examples = Example.to_example_objects(examples)
+ if not any(len(ex.doc) if ex.doc else 0 for ex in examples):
# Handle cases where there are no tokens in any docs.
return
- scores, bp_scores = self.model.begin_update(docs, drop=drop)
- loss, d_scores = self.get_loss(docs, golds, scores)
+ scores, bp_scores = self.model.begin_update([ex.doc for ex in examples], drop=drop)
+ loss, d_scores = self.get_loss(examples, scores)
bp_scores(d_scores, sgd=sgd)
if losses is not None:
losses.setdefault(self.name, 0.0)
losses[self.name] += loss
- def rehearse(self, docs, drop=0., sgd=None, losses=None):
+ def rehearse(self, examples, drop=0., sgd=None, losses=None):
if self._rehearsal_model is None:
return
+ examples = Example.to_example_objects(examples)
+ docs = [ex.doc for ex in examples]
if not any(len(doc) for doc in docs):
# Handle cases where there are no tokens in any docs.
return
scores, bp_scores = self.model.begin_update(docs, drop=drop)
- target = self._rehearsal_model(docs)
+ target = self._rehearsal_model(examples)
gradient = scores - target
bp_scores(gradient, sgd=sgd)
if losses is not None:
losses.setdefault(self.name, 0.0)
losses[self.name] += (gradient**2).sum()
- def get_loss(self, docs, golds, scores):
+ def get_loss(self, examples, scores):
+ golds = [ex.gold for ex in examples]
truths = numpy.zeros((len(golds), len(self.labels)), dtype="f")
not_missing = numpy.ones((len(golds), len(self.labels)), dtype="f")
for i, gold in enumerate(golds):
@@ -1032,11 +1096,10 @@ class TextCategorizer(Pipe):
self.labels = tuple(list(self.labels) + [label])
return 1
- def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None, **kwargs):
- for raw_text, annot_brackets in get_gold_tuples():
- for _, (cats, _2) in annot_brackets:
- for cat in cats:
- self.add_label(cat)
+ def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs):
+ for example in get_examples():
+ for cat in example.doc_annotation.cats:
+ self.add_label(cat)
if self.model is True:
self.cfg["pretrained_vectors"] = kwargs.get("pretrained_vectors")
self.require_labels()
@@ -1074,10 +1137,10 @@ cdef class DependencyParser(Parser):
labeller = MultitaskObjective(self.vocab, target=target)
self._multitasks.append(labeller)
- def init_multitask_objectives(self, get_gold_tuples, pipeline, sgd=None, **cfg):
+ def init_multitask_objectives(self, get_examples, pipeline, sgd=None, **cfg):
for labeller in self._multitasks:
tok2vec = self.model.tok2vec
- labeller.begin_training(get_gold_tuples, pipeline=pipeline,
+ labeller.begin_training(get_examples, pipeline=pipeline,
tok2vec=tok2vec, sgd=sgd)
def __reduce__(self):
@@ -1116,10 +1179,10 @@ cdef class EntityRecognizer(Parser):
labeller = MultitaskObjective(self.vocab, target=target)
self._multitasks.append(labeller)
- def init_multitask_objectives(self, get_gold_tuples, pipeline, sgd=None, **cfg):
+ def init_multitask_objectives(self, get_examples, pipeline, sgd=None, **cfg):
for labeller in self._multitasks:
tok2vec = self.model.tok2vec
- labeller.begin_training(get_gold_tuples, pipeline=pipeline,
+ labeller.begin_training(get_examples, pipeline=pipeline,
tok2vec=tok2vec)
def __reduce__(self):
@@ -1175,7 +1238,7 @@ class EntityLinker(Pipe):
if getattr(self, "kb", None) in (None, True, False):
raise ValueError(Errors.E139.format(name=self.name))
- def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None, **kwargs):
+ def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs):
self.require_kb()
self.cfg["entity_width"] = self.kb.entity_vector_length
@@ -1187,25 +1250,18 @@ class EntityLinker(Pipe):
return sgd
- def update(self, docs, golds, state=None, drop=0.0, sgd=None, losses=None):
+ def update(self, examples, state=None, drop=0.0, sgd=None, losses=None):
self.require_model()
self.require_kb()
-
if losses is not None:
losses.setdefault(self.name, 0.0)
-
- if not docs or not golds:
+ if not examples:
return 0
-
- if len(docs) != len(golds):
- raise ValueError(Errors.E077.format(value="EL training", n_docs=len(docs),
- n_golds=len(golds)))
-
- if isinstance(docs, Doc):
- docs = [docs]
- golds = [golds]
+ examples = Example.to_example_objects(examples)
sentence_docs = []
+ docs = [ex.doc for ex in examples]
+ golds = [ex.gold for ex in examples]
for doc, gold in zip(docs, golds):
ents_by_offset = dict()
@@ -1219,19 +1275,19 @@ class EntityLinker(Pipe):
ent = ents_by_offset[(start, end)]
for kb_id, value in kb_dict.items():
- # Currently only training on the positive instances
+ # Currently only training on the positive instances - we assume there is at least 1 per doc/gold
if value:
sentence_docs.append(ent.sent.as_doc())
sentence_encodings, bp_context = self.model.begin_update(sentence_docs, drop=drop)
- loss, d_scores = self.get_similarity_loss(scores=sentence_encodings, golds=golds, docs=None)
+ loss, d_scores = self.get_similarity_loss(scores=sentence_encodings, golds=golds)
bp_context(d_scores, sgd=sgd)
if losses is not None:
losses[self.name] += loss
return loss
- def get_similarity_loss(self, docs, golds, scores):
+ def get_similarity_loss(self, golds, scores):
entity_encodings = []
for gold in golds:
for entity, kb_dict in gold.links.items():
@@ -1244,16 +1300,16 @@ class EntityLinker(Pipe):
entity_encodings = self.model.ops.asarray(entity_encodings, dtype="float32")
if scores.shape != entity_encodings.shape:
- raise RuntimeError(Errors.E147.format(method="get_loss", msg="gold entities do not match up"))
+ raise RuntimeError(Errors.E147.format(method="get_similarity_loss", msg="gold entities do not match up"))
loss, gradients = get_cossim_loss(yh=scores, y=entity_encodings)
loss = loss / len(entity_encodings)
return loss, gradients
- def get_loss(self, docs, golds, scores):
+ def get_loss(self, examples, scores):
cats = []
- for gold in golds:
- for entity, kb_dict in gold.links.items():
+ for ex in examples:
+ for entity, kb_dict in ex.gold.links.items():
for kb_id, value in kb_dict.items():
cats.append([value])
@@ -1266,17 +1322,30 @@ class EntityLinker(Pipe):
loss = loss / len(cats)
return loss, d_scores
- def __call__(self, doc):
+ def __call__(self, example):
+ doc = self._get_doc(example)
kb_ids, tensors = self.predict([doc])
self.set_annotations([doc], kb_ids, tensors=tensors)
+ if isinstance(example, Example):
+ example.doc = doc
+ return example
return doc
- def pipe(self, stream, batch_size=128, n_threads=-1):
- for docs in util.minibatch(stream, size=batch_size):
- docs = list(docs)
+ def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
+ for examples in util.minibatch(stream, size=batch_size):
+ examples = list(examples)
+ docs = [self._get_doc(ex) for ex in examples]
kb_ids, tensors = self.predict(docs)
self.set_annotations(docs, kb_ids, tensors=tensors)
- yield from docs
+
+ if as_example:
+ annotated_examples = []
+ for ex, doc in zip(examples, docs):
+ ex.doc = doc
+ annotated_examples.append(ex)
+ yield from annotated_examples
+ else:
+ yield from docs
def predict(self, docs):
""" Return the KB IDs for each entity in each doc, including NIL if there is no prediction """
@@ -1408,7 +1477,7 @@ class EntityLinker(Pipe):
util.from_disk(path, deserialize, exclude)
return self
- def rehearse(self, docs, sgd=None, losses=None, **config):
+ def rehearse(self, examples, sgd=None, losses=None, **config):
raise NotImplementedError
def add_label(self, label):
@@ -1416,7 +1485,7 @@ class EntityLinker(Pipe):
@component("sentencizer", assigns=["token.is_sent_start", "doc.sents"])
-class Sentencizer(object):
+class Sentencizer(Pipe):
"""Segment the Doc into sentences using a rule-based strategy.
DOCS: https://spacy.io/api/sentencizer
@@ -1451,14 +1520,15 @@ class Sentencizer(object):
def from_nlp(cls, nlp, **cfg):
return cls(**cfg)
- def __call__(self, doc):
+ def __call__(self, example):
"""Apply the sentencizer to a Doc and set Token.is_sent_start.
- doc (Doc): The document to process.
- RETURNS (Doc): The processed Doc.
+ example (Doc or Example): The document to process.
+ RETURNS (Doc or Example): The processed Doc or Example.
DOCS: https://spacy.io/api/sentencizer#call
"""
+ doc = self._get_doc(example)
start = 0
seen_period = False
for i, token in enumerate(doc):
@@ -1472,6 +1542,9 @@ class Sentencizer(object):
seen_period = True
if start < len(doc):
doc[start].is_sent_start = True
+ if isinstance(example, Example):
+ example.doc = doc
+ return example
return doc
def to_bytes(self, **kwargs):
diff --git a/spacy/scorer.py b/spacy/scorer.py
index 0b4843f41..25c6935f3 100644
--- a/spacy/scorer.py
+++ b/spacy/scorer.py
@@ -3,7 +3,7 @@ from __future__ import division, print_function, unicode_literals
import numpy as np
-from .gold import tags_to_entities, GoldParse
+from .gold import tags_to_entities, GoldParse, DocAnnotation
from .errors import Errors
@@ -217,11 +217,10 @@ class Scorer(object):
"textcats_per_cat": self.textcats_per_cat,
}
- def score(self, doc, gold, verbose=False, punct_labels=("p", "punct")):
+ def score(self, example, verbose=False, punct_labels=("p", "punct")):
"""Update the evaluation scores from a single Doc / GoldParse pair.
- doc (Doc): The predicted annotations.
- gold (GoldParse): The correct annotations.
+ example (Example): The predicted annotations + correct annotations.
verbose (bool): Print debugging information.
punct_labels (tuple): Dependency labels for punctuation. Used to
evaluate dependency attachments to punctuation if `eval_punct` is
@@ -229,15 +228,22 @@ class Scorer(object):
DOCS: https://spacy.io/api/scorer#score
"""
+ if isinstance(example, tuple) and len(example) == 2:
+ doc, gold = example
+ else:
+ gold = example.gold
+ doc = example.doc
+
if len(doc) != len(gold):
- gold = GoldParse.from_annot_tuples(
- doc, tuple(zip(*gold.orig_annot)) + (gold.cats,)
- )
+ doc_annotation = DocAnnotation(cats=gold.cats)
+ token_annotation = gold.orig
+ gold = GoldParse.from_annotation(doc, doc_annotation, [token_annotation])
+ orig = gold.orig
gold_deps = set()
gold_deps_per_dep = {}
gold_tags = set()
- gold_ents = set(tags_to_entities([annot[-1] for annot in gold.orig_annot]))
- for id_, word, tag, head, dep, ner in gold.orig_annot:
+ gold_ents = set(tags_to_entities(orig.entities))
+ for id_, tag, head, dep in zip(orig.ids, orig.tags, orig.heads, orig.deps):
gold_tags.add((id_, tag))
if dep not in (None, "") and dep.lower() not in punct_labels:
gold_deps.add((id_, head, dep.lower()))
@@ -272,7 +278,7 @@ class Scorer(object):
if token.dep_.lower() not in cand_deps_per_dep:
cand_deps_per_dep[token.dep_.lower()] = set()
cand_deps_per_dep[token.dep_.lower()].add((gold_i, gold_head, token.dep_.lower()))
- if "-" not in [token[-1] for token in gold.orig_annot]:
+ if "-" not in orig.entities:
# Find all NER labels in gold and doc
ent_labels = set([x[0] for x in gold_ents] + [k.label_ for k in doc.ents])
# Set up all labels for per type scoring and prepare gold per type
@@ -336,7 +342,7 @@ class Scorer(object):
Errors.E162.format(model_labels=model_labels, eval_labels=eval_labels)
)
if verbose:
- gold_words = [item[1] for item in gold.orig_annot]
+ gold_words = orig.words
for w_id, h_id, dep in cand_deps - gold_deps:
print("F", gold_words[w_id], dep, gold_words[h_id])
for w_id, h_id, dep in gold_deps - cand_deps:
diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx
index eb39124ce..0a99609a8 100644
--- a/spacy/syntax/arc_eager.pyx
+++ b/spacy/syntax/arc_eager.pyx
@@ -341,10 +341,10 @@ cdef class ArcEager(TransitionSystem):
for label in kwargs.get('right_labels', []):
actions[RIGHT][label] = 1
actions[REDUCE][label] = 1
- for raw_text, sents in kwargs.get('gold_parses', []):
- for (ids, words, tags, heads, labels, iob), ctnts in sents:
- heads, labels = nonproj.projectivize(heads, labels)
- for child, head, label in zip(ids, heads, labels):
+ for example in kwargs.get('gold_parses', []):
+ for token_annotation in example.token_annotations:
+ heads, labels = nonproj.projectivize(token_annotation.heads, token_annotation.deps)
+ for child, head, label in zip(token_annotation.ids, heads, labels):
if label.upper() == 'ROOT' :
label = 'ROOT'
if head == child:
@@ -397,7 +397,9 @@ cdef class ArcEager(TransitionSystem):
self.strings[state.safe_get(i).dep]))
else:
predicted.add((i, state.H(i), 'ROOT'))
- id_, word, tag, head, dep, ner = gold.orig_annot[gold.cand_to_gold[i]]
+ id_ = gold.orig.ids[gold.cand_to_gold[i]]
+ head = gold.orig.heads[gold.cand_to_gold[i]]
+ dep = gold.orig.deps[gold.cand_to_gold[i]]
truth.add((id_, head, dep))
return truth == predicted
diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx
index 9f8ad418c..d791534ee 100644
--- a/spacy/syntax/ner.pyx
+++ b/spacy/syntax/ner.pyx
@@ -72,9 +72,9 @@ cdef class BiluoPushDown(TransitionSystem):
for action in (BEGIN, IN, LAST, UNIT):
actions[action][entity_type] = 1
moves = ('M', 'B', 'I', 'L', 'U')
- for raw_text, sents in kwargs.get('gold_parses', []):
- for (ids, words, tags, heads, labels, biluo), _ in sents:
- for i, ner_tag in enumerate(biluo):
+ for example in kwargs.get('gold_parses', []):
+ for token_annotation in example.token_annotations:
+ for i, ner_tag in enumerate(token_annotation.entities):
if ner_tag != 'O' and ner_tag != '-':
_, label = ner_tag.split('-', 1)
for action in (BEGIN, IN, LAST, UNIT):
diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index 0ed7e6952..8fec87c50 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -27,6 +27,7 @@ from thinc.neural.util import get_array_module
from thinc.linalg cimport Vec, VecVec
import srsly
+from spacy.gold import Example
from ._parser_model cimport alloc_activations, free_activations
from ._parser_model cimport predict_states, arg_max_if_valid
from ._parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss
@@ -193,7 +194,7 @@ cdef class Parser:
# Defined in subclasses, to avoid circular import
raise NotImplementedError
- def init_multitask_objectives(self, get_gold_tuples, pipeline, **cfg):
+ def init_multitask_objectives(self, get_examples, pipeline, **cfg):
'''Setup models for secondary objectives, to benefit from multi-task
learning. This method is intended to be overridden by subclasses.
@@ -203,9 +204,9 @@ cdef class Parser:
'''
pass
- def preprocess_gold(self, docs_golds):
- for doc, gold in docs_golds:
- yield doc, gold
+ def preprocess_gold(self, examples):
+ for ex in examples:
+ yield ex
def use_params(self, params):
# Can't decorate cdef class :(. Workaround.
@@ -411,35 +412,31 @@ cdef class Parser:
beam.check_done(_beam_utils.check_final_state, NULL)
return [b for b in beams if not b.is_done]
- def update(self, docs, golds, drop=0., sgd=None, losses=None):
+ def update(self, examples, drop=0., sgd=None, losses=None):
self.require_model()
- if isinstance(docs, Doc) and isinstance(golds, GoldParse):
- docs = [docs]
- golds = [golds]
- if len(docs) != len(golds):
- raise ValueError(Errors.E077.format(value='update', n_docs=len(docs),
- n_golds=len(golds)))
+ examples = Example.to_example_objects(examples)
+
if losses is None:
losses = {}
losses.setdefault(self.name, 0.)
for multitask in self._multitasks:
- multitask.update(docs, golds, drop=drop, sgd=sgd)
+ multitask.update(examples, drop=drop, sgd=sgd)
# The probability we use beam update, instead of falling back to
# a greedy update
beam_update_prob = self.cfg.get('beam_update_prob', 0.5)
if self.cfg.get('beam_width', 1) >= 2 and numpy.random.random() < beam_update_prob:
- return self.update_beam(docs, golds, self.cfg.get('beam_width', 1),
+ return self.update_beam(examples, self.cfg.get('beam_width', 1),
drop=drop, sgd=sgd, losses=losses,
beam_density=self.cfg.get('beam_density', 0.001))
# Chop sequences into lengths of this many transitions, to make the
# batch uniform length.
cut_gold = numpy.random.choice(range(20, 100))
- states, golds, max_steps = self._init_gold_batch(docs, golds, max_length=cut_gold)
+ states, golds, max_steps = self._init_gold_batch(examples, max_length=cut_gold)
states_golds = [(s, g) for (s, g) in zip(states, golds)
if not s.is_final() and g is not None]
# Prepare the stepwise model, and get the callback for finishing the batch
- model, finish_update = self.model.begin_update(docs, drop=drop)
+ model, finish_update = self.model.begin_update([ex.doc for ex in examples], drop=drop)
for _ in range(max_steps):
if not states_golds:
break
@@ -454,19 +451,19 @@ cdef class Parser:
finish_update(golds, sgd=sgd)
return losses
- def rehearse(self, docs, sgd=None, losses=None, **cfg):
+ def rehearse(self, examples, sgd=None, losses=None, **cfg):
"""Perform a "rehearsal" update, to prevent catastrophic forgetting."""
- if isinstance(docs, Doc):
- docs = [docs]
+ examples = Example.to_example_objects(examples)
if losses is None:
losses = {}
for multitask in self._multitasks:
if hasattr(multitask, 'rehearse'):
- multitask.rehearse(docs, losses=losses, sgd=sgd)
+ multitask.rehearse(examples, losses=losses, sgd=sgd)
if self._rehearsal_model is None:
return None
losses.setdefault(self.name, 0.)
+ docs = [ex.doc for ex in examples]
states = self.moves.init_batch(docs)
# This is pretty dirty, but the NER can resize itself in init_batch,
# if labels are missing. We therefore have to check whether we need to
@@ -494,15 +491,20 @@ cdef class Parser:
losses[self.name] += loss / n_scores
return losses
- def update_beam(self, docs, golds, width, drop=0., sgd=None, losses=None,
+ def update_beam(self, examples, width, drop=0., sgd=None, losses=None,
beam_density=0.0):
+ examples = Example.to_example_objects(examples)
+ docs = [ex.doc for ex in examples]
+ golds = [ex.gold for ex in examples]
+ new_golds = []
lengths = [len(d) for d in docs]
states = self.moves.init_batch(docs)
for gold in golds:
self.moves.preprocess_gold(gold)
+ new_golds.append(gold)
model, finish_update = self.model.begin_update(docs, drop=drop)
states_d_scores, backprops, beams = _beam_utils.update_beam(
- self.moves, self.nr_feature, 10000, states, golds, model.state2vec,
+ self.moves, self.nr_feature, 10000, states, new_golds, model.state2vec,
model.vec2scores, width, drop=drop, losses=losses,
beam_density=beam_density)
for i, d_scores in enumerate(states_d_scores):
@@ -522,7 +524,7 @@ cdef class Parser:
for beam in beams:
_beam_utils.cleanup_beam(beam)
- def _init_gold_batch(self, whole_docs, whole_golds, min_length=5, max_length=500):
+ def _init_gold_batch(self, whole_examples, min_length=5, max_length=500):
"""Make a square batch, of length equal to the shortest doc. A long
doc will get multiple states. Let's say we have a doc of length 2*N,
where N is the shortest doc. We'll make two states, one representing
@@ -530,6 +532,8 @@ cdef class Parser:
cdef:
StateClass state
Transition action
+ whole_docs = [ex.doc for ex in whole_examples]
+ whole_golds = [ex.gold for ex in whole_examples]
whole_states = self.moves.init_batch(whole_docs)
max_length = max(min_length, min(max_length, min([len(doc) for doc in whole_docs])))
max_moves = 0
@@ -592,14 +596,14 @@ cdef class Parser:
return create_default_optimizer(self.model.ops,
**self.cfg.get('optimizer', {}))
- def begin_training(self, get_gold_tuples, pipeline=None, sgd=None, **cfg):
+ def begin_training(self, get_examples, pipeline=None, sgd=None, **cfg):
if 'model' in cfg:
self.model = cfg['model']
- if not hasattr(get_gold_tuples, '__call__'):
- gold_tuples = get_gold_tuples
- get_gold_tuples = lambda: gold_tuples
+ if not hasattr(get_examples, '__call__'):
+ gold_tuples = get_examples
+ get_examples = lambda: gold_tuples
cfg.setdefault('min_action_freq', 30)
- actions = self.moves.get_actions(gold_parses=get_gold_tuples(),
+ actions = self.moves.get_actions(gold_parses=get_examples(),
min_freq=cfg.get('min_action_freq', 30),
learn_tokens=self.cfg.get("learn_tokens", False))
for action, labels in self.moves.labels.items():
@@ -615,15 +619,14 @@ cdef class Parser:
sgd = self.create_optimizer()
doc_sample = []
gold_sample = []
- for raw_text, annots_brackets in islice(get_gold_tuples(), 1000):
- for annots, brackets in annots_brackets:
- ids, words, tags, heads, deps, ents = annots
- doc_sample.append(Doc(self.vocab, words=words))
- gold_sample.append(GoldParse(doc_sample[-1], words=words, tags=tags,
- heads=heads, deps=deps, entities=ents))
+ for example in islice(get_examples(), 1000):
+ parses = example.get_gold_parses(merge=False, vocab=self.vocab)
+ for doc, gold in parses:
+ doc_sample.append(doc)
+ gold_sample.append(gold)
self.model.begin_training(doc_sample, gold_sample)
if pipeline is not None:
- self.init_multitask_objectives(get_gold_tuples, pipeline, sgd=sgd, **cfg)
+ self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **cfg)
link_vectors_to_models(self.vocab)
else:
if sgd is None:
diff --git a/spacy/syntax/nonproj.pyx b/spacy/syntax/nonproj.pyx
index 53e8a9cfe..c7ed25948 100644
--- a/spacy/syntax/nonproj.pyx
+++ b/spacy/syntax/nonproj.pyx
@@ -9,6 +9,7 @@ from __future__ import unicode_literals
from copy import copy
+from spacy.gold import Example
from ..tokens.doc cimport Doc, set_children_from_heads
from ..errors import Errors
@@ -77,39 +78,42 @@ def decompose(label):
def is_decorated(label):
return DELIMITER in label
-def count_decorated_labels(gold_tuples):
+def count_decorated_labels(gold_data):
freqs = {}
- for raw_text, sents in gold_tuples:
- for (ids, words, tags, heads, labels, iob), ctnts in sents:
- proj_heads, deco_labels = projectivize(heads, labels)
+ for example in gold_data:
+ for token_annotation in example.token_annotations:
+ proj_heads, deco_deps = projectivize(token_annotation.heads, token_annotation.deps)
# set the label to ROOT for each root dependent
- deco_labels = ['ROOT' if head == i else deco_labels[i]
+ deco_deps = ['ROOT' if head == i else deco_deps[i]
for i, head in enumerate(proj_heads)]
# count label frequencies
- for label in deco_labels:
+ for label in deco_deps:
if is_decorated(label):
freqs[label] = freqs.get(label, 0) + 1
return freqs
-def preprocess_training_data(gold_tuples, label_freq_cutoff=30):
+def preprocess_training_data(gold_data, label_freq_cutoff=30):
preprocessed = []
freqs = {}
- for raw_text, sents in gold_tuples:
- prepro_sents = []
- for (ids, words, tags, heads, labels, iob), ctnts in sents:
- proj_heads, deco_labels = projectivize(heads, labels)
+ for example in gold_data:
+ new_example = Example(doc=example.doc)
+ for token_annotation in example.token_annotations:
+ proj_heads, deco_deps = projectivize(token_annotation.heads, token_annotation.deps)
# set the label to ROOT for each root dependent
- deco_labels = ['ROOT' if head == i else deco_labels[i]
+ deco_deps = ['ROOT' if head == i else deco_deps[i]
for i, head in enumerate(proj_heads)]
# count label frequencies
if label_freq_cutoff > 0:
- for label in deco_labels:
+ for label in deco_deps:
if is_decorated(label):
freqs[label] = freqs.get(label, 0) + 1
- prepro_sents.append(
- ((ids, words, tags, proj_heads, deco_labels, iob), ctnts))
- preprocessed.append((raw_text, prepro_sents))
+ # TODO: the code would be less ugly if heads and deps were changed in-place, but is this OK upstream?
+ proj_token_dict = token_annotation.to_dict()
+ proj_token_dict["heads"] = proj_heads
+ proj_token_dict["deps"] = deco_deps
+ new_example.add_token_annotation(**proj_token_dict)
+ preprocessed.append(new_example)
if label_freq_cutoff > 0:
return _filter_labels(preprocessed, label_freq_cutoff, freqs)
return preprocessed
@@ -203,20 +207,21 @@ def _find_new_head(token, headlabel):
return token.head
-def _filter_labels(gold_tuples, cutoff, freqs):
+def _filter_labels(examples, cutoff, freqs):
# throw away infrequent decorated labels
# can't learn them reliably anyway and keeps label set smaller
filtered = []
- for raw_text, sents in gold_tuples:
- filtered_sents = []
- for (ids, words, tags, heads, labels, iob), ctnts in sents:
+ for example in examples:
+ new_example = Example(doc=example.doc)
+ for token_annotation in example.token_annotations:
filtered_labels = []
- for label in labels:
+ for label in token_annotation.deps:
if is_decorated(label) and freqs.get(label, 0) < cutoff:
filtered_labels.append(decompose(label)[0])
else:
filtered_labels.append(label)
- filtered_sents.append(
- ((ids, words, tags, heads, filtered_labels, iob), ctnts))
- filtered.append((raw_text, filtered_sents))
+ filtered_token_dict = token_annotation.to_dict()
+ filtered_token_dict["deps"] = filtered_labels
+ new_example.add_token_annotation(**filtered_token_dict)
+ filtered.append(new_example)
return filtered
diff --git a/spacy/tests/parser/test_add_label.py b/spacy/tests/parser/test_add_label.py
index 4ab9c1e70..bee9db82e 100644
--- a/spacy/tests/parser/test_add_label.py
+++ b/spacy/tests/parser/test_add_label.py
@@ -37,7 +37,7 @@ def _train_parser(parser):
losses = {}
doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
gold = GoldParse(doc, heads=[1, 1, 3, 3], deps=["left", "ROOT", "left", "ROOT"])
- parser.update([doc], [gold], sgd=sgd, losses=losses)
+ parser.update((doc, gold), sgd=sgd, losses=losses)
return parser
@@ -51,7 +51,7 @@ def test_add_label(parser):
gold = GoldParse(
doc, heads=[1, 1, 3, 3], deps=["right", "ROOT", "left", "ROOT"]
)
- parser.update([doc], [gold], sgd=sgd, losses=losses)
+ parser.update((doc, gold), sgd=sgd, losses=losses)
doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
doc = parser(doc)
assert doc[0].dep_ == "right"
diff --git a/spacy/tests/parser/test_arc_eager_oracle.py b/spacy/tests/parser/test_arc_eager_oracle.py
index 41b7a4861..0d9bd1ad0 100644
--- a/spacy/tests/parser/test_arc_eager_oracle.py
+++ b/spacy/tests/parser/test_arc_eager_oracle.py
@@ -130,18 +130,25 @@ annot_tuples = [
def test_get_oracle_actions():
+ ids, words, tags, heads, deps, ents = [], [], [], [], [], []
+ for id_, word, tag, head, dep, ent in annot_tuples:
+ ids.append(id_)
+ words.append(word)
+ tags.append(tag)
+ heads.append(head)
+ deps.append(dep)
+ ents.append(ent)
doc = Doc(Vocab(), words=[t[1] for t in annot_tuples])
parser = DependencyParser(doc.vocab)
parser.moves.add_action(0, "")
parser.moves.add_action(1, "")
parser.moves.add_action(1, "")
parser.moves.add_action(4, "ROOT")
- for i, (id_, word, tag, head, dep, ent) in enumerate(annot_tuples):
+ for i, (head, dep) in enumerate(zip(heads, deps)):
if head > i:
parser.moves.add_action(2, dep)
elif head < i:
parser.moves.add_action(3, dep)
- ids, words, tags, heads, deps, ents = zip(*annot_tuples)
heads, deps = projectivize(heads, deps)
gold = GoldParse(doc, words=words, tags=tags, heads=heads, deps=deps)
parser.moves.preprocess_gold(gold)
diff --git a/spacy/tests/parser/test_neural_parser.py b/spacy/tests/parser/test_neural_parser.py
index 062c76ae3..468b3ff40 100644
--- a/spacy/tests/parser/test_neural_parser.py
+++ b/spacy/tests/parser/test_neural_parser.py
@@ -67,7 +67,7 @@ def test_update_doc(parser, model, doc, gold):
def optimize(weights, gradient, key=None):
weights -= 0.001 * gradient
- parser.update([doc], [gold], sgd=optimize)
+ parser.update((doc, gold), sgd=optimize)
@pytest.mark.xfail
@@ -83,4 +83,4 @@ def test_update_doc_beam(parser, model, doc, gold):
def optimize(weights, gradient, key=None):
weights -= 0.001 * gradient
- parser.update_beam([doc], [gold], sgd=optimize)
+ parser.update_beam((doc, gold), sgd=optimize)
diff --git a/spacy/tests/parser/test_preset_sbd.py b/spacy/tests/parser/test_preset_sbd.py
index 70beb2f60..d935494d6 100644
--- a/spacy/tests/parser/test_preset_sbd.py
+++ b/spacy/tests/parser/test_preset_sbd.py
@@ -30,7 +30,7 @@ def parser(vocab):
losses = {}
doc = Doc(vocab, words=["a", "b", "c", "d"])
gold = GoldParse(doc, heads=[1, 1, 3, 3], deps=["left", "ROOT", "left", "ROOT"])
- parser.update([doc], [gold], sgd=sgd, losses=losses)
+ parser.update((doc, gold), sgd=sgd, losses=losses)
return parser
diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
index ef70dc013..e967fffaf 100644
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@@ -24,7 +24,7 @@ def test_simple_train():
("bbbbbbbbb", 0.0),
("aaaaaa", 1),
]:
- nlp.update([text], [{"cats": {"answer": answer}}])
+ nlp.update((text, {"cats": {"answer": answer}}))
doc = nlp("aaa")
assert "answer" in doc.cats
assert doc.cats["answer"] >= 0.5
diff --git a/spacy/tests/regression/test_issue1-1000.py b/spacy/tests/regression/test_issue1-1000.py
index 6d88d68c2..61d2c9cd2 100644
--- a/spacy/tests/regression/test_issue1-1000.py
+++ b/spacy/tests/regression/test_issue1-1000.py
@@ -451,7 +451,7 @@ def test_issue999(train_data):
for itn in range(100):
random.shuffle(TRAIN_DATA)
for raw_text, entity_offsets in TRAIN_DATA:
- nlp.update([raw_text], [{"entities": entity_offsets}])
+ nlp.update((raw_text, {"entities": entity_offsets}))
with make_tempdir() as model_dir:
nlp.to_disk(model_dir)
diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py
index e498417d1..ace25f8cc 100644
--- a/spacy/tests/regression/test_issue1501-2000.py
+++ b/spacy/tests/regression/test_issue1501-2000.py
@@ -5,6 +5,8 @@ import pytest
import gc
import numpy
import copy
+
+from spacy.gold import Example
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.lex_attrs import is_stop
@@ -270,9 +272,9 @@ def test_issue1963(en_tokenizer):
@pytest.mark.parametrize("label", ["U-JOB-NAME"])
def test_issue1967(label):
ner = EntityRecognizer(Vocab())
- entry = ([0], ["word"], ["tag"], [0], ["dep"], [label])
- gold_parses = [(None, [(entry, None)])]
- ner.moves.get_actions(gold_parses=gold_parses)
+ example = Example(doc=None)
+ example.add_token_annotation(ids=[0], words=["word"], tags=["tag"], heads=[0], deps=["dep"], entities=[label])
+ ner.moves.get_actions(gold_parses=[example])
def test_issue1971(en_vocab):
diff --git a/spacy/tests/regression/test_issue2501-3000.py b/spacy/tests/regression/test_issue2501-3000.py
index 73ff7376a..0acb25e90 100644
--- a/spacy/tests/regression/test_issue2501-3000.py
+++ b/spacy/tests/regression/test_issue2501-3000.py
@@ -157,7 +157,7 @@ def test_issue2800():
losses = {}
random.shuffle(train_data)
for statement, entities in train_data:
- nlp.update([statement], [entities], sgd=optimizer, losses=losses, drop=0.5)
+ nlp.update((statement, entities), sgd=optimizer, losses=losses, drop=0.5)
def test_issue2822(it_tokenizer):
diff --git a/spacy/tests/regression/test_issue3611.py b/spacy/tests/regression/test_issue3611.py
index 3c4836264..bc8603888 100644
--- a/spacy/tests/regression/test_issue3611.py
+++ b/spacy/tests/regression/test_issue3611.py
@@ -41,10 +41,8 @@ def test_issue3611():
batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
for batch in batches:
- texts, annotations = zip(*batch)
nlp.update(
- docs=texts,
- golds=annotations,
+ examples=batch,
sgd=optimizer,
drop=0.1,
losses=losses,
diff --git a/spacy/tests/regression/test_issue4030.py b/spacy/tests/regression/test_issue4030.py
index ed219573f..e774feb2d 100644
--- a/spacy/tests/regression/test_issue4030.py
+++ b/spacy/tests/regression/test_issue4030.py
@@ -41,10 +41,8 @@ def test_issue4030():
batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
for batch in batches:
- texts, annotations = zip(*batch)
nlp.update(
- docs=texts,
- golds=annotations,
+ examples=batch,
sgd=optimizer,
drop=0.1,
losses=losses,
diff --git a/spacy/tests/regression/test_issue4348.py b/spacy/tests/regression/test_issue4348.py
index 9391c3529..b0583f717 100644
--- a/spacy/tests/regression/test_issue4348.py
+++ b/spacy/tests/regression/test_issue4348.py
@@ -19,5 +19,4 @@ def test_issue4348():
losses = {}
batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
for batch in batches:
- texts, annotations = zip(*batch)
- nlp.update(texts, annotations, sgd=optimizer, losses=losses)
+ nlp.update(batch, sgd=optimizer, losses=losses)
diff --git a/spacy/tests/regression/test_issue4402.py b/spacy/tests/regression/test_issue4402.py
index 2e1b69000..bf103a389 100644
--- a/spacy/tests/regression/test_issue4402.py
+++ b/spacy/tests/regression/test_issue4402.py
@@ -11,15 +11,14 @@ from spacy.tests.util import make_tempdir
def test_issue4402():
nlp = English()
with make_tempdir() as tmpdir:
- print("temp", tmpdir)
json_path = tmpdir / "test4402.json"
srsly.write_json(json_path, json_data)
corpus = GoldCorpus(str(json_path), str(json_path))
- train_docs = list(corpus.train_docs(nlp, gold_preproc=True, max_length=0))
+ train_data = list(corpus.train_dataset(nlp, gold_preproc=True, max_length=0))
# assert that the data got split into 4 sentences
- assert len(train_docs) == 4
+ assert len(train_data) == 4
json_data = [
diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py
index 731a1b5c2..c1bdfcc4d 100644
--- a/spacy/tests/test_gold.py
+++ b/spacy/tests/test_gold.py
@@ -1,11 +1,12 @@
# coding: utf-8
from __future__ import unicode_literals
-from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags
+from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags, Example, DocAnnotation
from spacy.gold import spans_from_biluo_tags, GoldParse, iob_to_biluo
from spacy.gold import GoldCorpus, docs_to_json, align
from spacy.lang.en import English
from spacy.tokens import Doc
+from spacy.util import compounding, minibatch
from .util import make_tempdir
import pytest
import srsly
@@ -119,12 +120,13 @@ def test_roundtrip_docs_to_json():
with make_tempdir() as tmpdir:
json_file = tmpdir / "roundtrip.json"
srsly.write_json(json_file, [docs_to_json(doc)])
- goldcorpus = GoldCorpus(str(json_file), str(json_file))
+ goldcorpus = GoldCorpus(train=str(json_file), dev=str(json_file))
- reloaded_doc, goldparse = next(goldcorpus.train_docs(nlp))
+ reloaded_example = next(goldcorpus.train_dataset(nlp))
+ goldparse = reloaded_example.gold
assert len(doc) == goldcorpus.count_train()
- assert text == reloaded_doc.text
+ assert text == reloaded_example.text
assert tags == goldparse.tags
assert deps == goldparse.labels
assert heads == goldparse.heads
@@ -140,10 +142,11 @@ def test_roundtrip_docs_to_json():
srsly.write_jsonl(jsonl_file, [docs_to_json(doc)])
goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))
- reloaded_doc, goldparse = next(goldcorpus.train_docs(nlp))
+ reloaded_example = next(goldcorpus.train_dataset(nlp))
+ goldparse = reloaded_example.gold
assert len(doc) == goldcorpus.count_train()
- assert text == reloaded_doc.text
+ assert text == reloaded_example.text
assert tags == goldparse.tags
assert deps == goldparse.labels
assert heads == goldparse.heads
@@ -160,13 +163,14 @@ def test_roundtrip_docs_to_json():
srsly.write_jsonl(jsonl_file, [docs_to_json(doc)])
goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))
# load and rewrite as JSONL tuples
- srsly.write_jsonl(jsonl_file, goldcorpus.train_tuples)
+ srsly.write_jsonl(jsonl_file, goldcorpus.train_examples)
goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))
- reloaded_doc, goldparse = next(goldcorpus.train_docs(nlp))
+ reloaded_example = next(goldcorpus.train_dataset(nlp))
+ goldparse = reloaded_example.gold
assert len(doc) == goldcorpus.count_train()
- assert text == reloaded_doc.text
+ assert text == reloaded_example.text
assert tags == goldparse.tags
assert deps == goldparse.labels
assert heads == goldparse.heads
@@ -217,3 +221,144 @@ def test_goldparse_startswith_space(en_tokenizer):
assert g.words == [" ", "a"]
assert g.ner == [None, "U-DATE"]
assert g.labels == [None, "ROOT"]
+
+
+def test_gold_constructor():
+ """Test that the GoldParse constructor works fine"""
+ nlp = English()
+ doc = nlp("This is a sentence")
+ gold = GoldParse(doc, cats={"cat1": 1.0, "cat2": 0.0})
+
+ assert gold.cats["cat1"]
+ assert not gold.cats["cat2"]
+ assert gold.words == ["This", "is", "a", "sentence"]
+
+
+def test_gold_orig_annot():
+ nlp = English()
+ doc = nlp("This is a sentence")
+ gold = GoldParse(doc, cats={"cat1": 1.0, "cat2": 0.0})
+
+ assert gold.orig.words == ["This", "is", "a", "sentence"]
+ assert gold.cats["cat1"]
+
+ doc_annotation = DocAnnotation(cats={"cat1": 0.0, "cat2": 1.0})
+ gold2 = GoldParse.from_annotation(doc, doc_annotation, gold.orig)
+ assert gold2.orig.words == ["This", "is", "a", "sentence"]
+ assert not gold2.cats["cat1"]
+
+
+def test_tuple_format_implicit():
+ """Test tuple format with implicit GoldParse creation"""
+
+ train_data = [
+ ("Uber blew through $1 million a week", {"entities": [(0, 4, "ORG")]}),
+ (
+ "Spotify steps up Asia expansion",
+ {"entities": [(0, 8, "ORG"), (17, 21, "LOC")]},
+ ),
+ ("Google rebrands its business apps", {"entities": [(0, 6, "ORG")]}),
+ ]
+
+ _train(train_data)
+
+
+def test_tuple_format_implicit_invalid():
+ """Test that an error is thrown for an implicit invalid GoldParse field"""
+
+ train_data = [
+ ("Uber blew through $1 million a week", {"frumble": [(0, 4, "ORG")]}),
+ (
+ "Spotify steps up Asia expansion",
+ {"entities": [(0, 8, "ORG"), (17, 21, "LOC")]},
+ ),
+ ("Google rebrands its business apps", {"entities": [(0, 6, "ORG")]}),
+ ]
+
+ with pytest.raises(TypeError):
+ _train(train_data)
+
+
+def _train(train_data):
+ nlp = English()
+ ner = nlp.create_pipe("ner")
+ ner.add_label("ORG")
+ ner.add_label("LOC")
+ nlp.add_pipe(ner)
+
+ optimizer = nlp.begin_training()
+ for i in range(5):
+ losses = {}
+ batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
+ for batch in batches:
+ nlp.update(batch, sgd=optimizer, losses=losses)
+
+
+tokens_1 = {
+ "ids": [1, 2, 3],
+ "words": ["Hi", "there", "everyone"],
+ "tags": ["INTJ", "ADV", "PRON"],
+}
+
+tokens_2 = {
+ "ids": [1, 2, 3, 4],
+ "words": ["It", "is", "just", "me"],
+ "tags": ["PRON", "AUX", "ADV", "PRON"],
+}
+
+text0 = "Hi there everyone It is just me"
+
+
+def test_merge_sents():
+ nlp = English()
+ example = Example()
+ example.add_token_annotation(**tokens_1)
+ example.add_token_annotation(**tokens_2)
+ assert len(example.get_gold_parses(merge=False, vocab=nlp.vocab)) == 2
+ assert len(example.get_gold_parses(merge=True, vocab=nlp.vocab)) == 1 # this shouldn't change the original object
+
+ merged_example = example.merge_sents()
+
+ token_annotation_1 = example.token_annotations[0]
+ assert token_annotation_1.ids == [1, 2, 3]
+ assert token_annotation_1.words == ["Hi", "there", "everyone"]
+ assert token_annotation_1.tags == ["INTJ", "ADV", "PRON"]
+
+ token_annotation_m = merged_example.token_annotations[0]
+ assert token_annotation_m.ids == [1, 2, 3, 4, 5, 6, 7]
+ assert token_annotation_m.words == ["Hi", "there", "everyone", "It", "is", "just", "me"]
+ assert token_annotation_m.tags == ["INTJ", "ADV", "PRON", "PRON", "AUX", "ADV", "PRON"]
+
+
+def test_tuples_to_example():
+ ex = Example()
+ ex.add_token_annotation(**tokens_1)
+ ex.add_token_annotation(**tokens_2)
+ ex.add_doc_annotation(cats={"TRAVEL": 1.0, "BAKING": 0.0})
+ ex_dict = ex.to_dict()
+
+ token_dicts = [
+ {
+ "ids": [1, 2, 3],
+ "words": ["Hi", "there", "everyone"],
+ "tags": ["INTJ", "ADV", "PRON"],
+ "heads": [],
+ "deps": [],
+ "entities": [],
+ "morphology": [],
+ "brackets": [],
+ },
+ {
+ "ids": [1, 2, 3, 4],
+ "words": ["It", "is", "just", "me"],
+ "tags": ["PRON", "AUX", "ADV", "PRON"],
+ "heads": [],
+ "deps": [],
+ "entities": [],
+ "morphology": [],
+ "brackets": [],
+ },
+ ]
+ doc_dict = {"cats": {"TRAVEL": 1.0, "BAKING": 0.0}, "links": {}}
+
+ assert ex_dict == {"token_annotations": token_dicts, "doc_annotation": doc_dict}
diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py
index d5398c145..2b0bcc15e 100644
--- a/spacy/tests/test_language.py
+++ b/spacy/tests/test_language.py
@@ -31,20 +31,20 @@ def test_language_update(nlp):
doc = Doc(nlp.vocab, words=text.split(" "))
gold = GoldParse(doc, **annots)
# Update with doc and gold objects
- nlp.update([doc], [gold])
+ nlp.update((doc, gold))
# Update with text and dict
- nlp.update([text], [annots])
+ nlp.update((text, annots))
# Update with doc object and dict
- nlp.update([doc], [annots])
+ nlp.update((doc, annots))
# Update with text and gold object
- nlp.update([text], [gold])
+ nlp.update((text, gold))
+ # Update with empty doc and gold object
+ nlp.update((None, gold))
# Update badly
- with pytest.raises(IndexError):
- nlp.update([doc], [])
- with pytest.raises(IndexError):
- nlp.update([], [gold])
with pytest.raises(ValueError):
- nlp.update([text], [wrongkeyannots])
+ nlp.update((doc, None))
+ with pytest.raises(TypeError):
+ nlp.update((text, wrongkeyannots))
def test_language_evaluate(nlp):
diff --git a/spacy/tests/test_scorer.py b/spacy/tests/test_scorer.py
index c59358a6b..e8d74c405 100644
--- a/spacy/tests/test_scorer.py
+++ b/spacy/tests/test_scorer.py
@@ -4,7 +4,7 @@ from __future__ import unicode_literals
from numpy.testing import assert_almost_equal, assert_array_almost_equal
import pytest
from pytest import approx
-from spacy.gold import GoldParse
+from spacy.gold import Example, GoldParse
from spacy.scorer import Scorer, ROCAUCScore
from spacy.scorer import _roc_auc_score, _roc_curve
from .util import get_doc
@@ -40,7 +40,7 @@ def test_las_per_type(en_vocab):
deps=annot["deps"],
)
gold = GoldParse(doc, heads=annot["heads"], deps=annot["deps"])
- scorer.score(doc, gold)
+ scorer.score((doc, gold))
results = scorer.scores
assert results["uas"] == 100
@@ -63,7 +63,7 @@ def test_las_per_type(en_vocab):
)
gold = GoldParse(doc, heads=annot["heads"], deps=annot["deps"])
doc[0].dep_ = "compound"
- scorer.score(doc, gold)
+ scorer.score((doc, gold))
results = scorer.scores
assert results["uas"] == 100
@@ -85,8 +85,9 @@ def test_ner_per_type(en_vocab):
words=input_.split(" "),
ents=[[0, 1, "CARDINAL"], [2, 3, "CARDINAL"]],
)
- gold = GoldParse(doc, entities=annot["entities"])
- scorer.score(doc, gold)
+ ex = Example(doc=doc)
+ ex.add_token_annotation(entities=annot["entities"])
+ scorer.score(ex)
results = scorer.scores
assert results["ents_p"] == 100
@@ -105,8 +106,9 @@ def test_ner_per_type(en_vocab):
words=input_.split(" "),
ents=[[0, 1, "ORG"], [5, 6, "GPE"], [6, 7, "ORG"]],
)
- gold = GoldParse(doc, entities=annot["entities"])
- scorer.score(doc, gold)
+ ex = Example(doc=doc)
+ ex.add_token_annotation(entities=annot["entities"])
+ scorer.score(ex)
results = scorer.scores
assert results["ents_p"] == approx(66.66666)
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index b39bb1ecb..262f19941 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -158,7 +158,7 @@ cdef class Tokenizer:
doc.c[doc.length - 1].spacy = string[-1] == " " and not in_ws
return doc
- def pipe(self, texts, batch_size=1000, n_threads=-1):
+ def pipe(self, texts, batch_size=1000, n_threads=-1, as_example=False):
"""Tokenize a stream of texts.
texts: A sequence of unicode texts.
diff --git a/spacy/util.py b/spacy/util.py
index 74e4cc1c6..f9e51f7d5 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -616,31 +616,25 @@ def decaying(start, stop, decay):
curr -= decay
-def minibatch_by_words(items, size, tuples=True, count_words=len):
+def minibatch_by_words(examples, size, tuples=True, count_words=len):
"""Create minibatches of a given number of words."""
if isinstance(size, int):
size_ = itertools.repeat(size)
else:
size_ = size
- items = iter(items)
+ examples = iter(examples)
while True:
batch_size = next(size_)
batch = []
while batch_size >= 0:
try:
- if tuples:
- doc, gold = next(items)
- else:
- doc = next(items)
+ example = next(examples)
except StopIteration:
if batch:
yield batch
return
- batch_size -= count_words(doc)
- if tuples:
- batch.append((doc, gold))
- else:
- batch.append(doc)
+ batch_size -= count_words(example.doc)
+ batch.append(example)
if batch:
yield batch
From 3ac4e8eb7a6c688ddc7abd205e2ed7060cbf0798 Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Wed, 13 Nov 2019 15:25:03 +0100
Subject: [PATCH 020/496] Fix minor issues in debug-data (#4636)
* Add error in debug-data if no dev docs are available (see #4575)
* Update debug-data for GoldCorpus / Example
* Ignore None label in misaligned NER data
---
spacy/cli/debug_data.py | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index 76276ee56..ed19703ac 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -121,6 +121,8 @@ def debug_data(
msg.text("{} training docs".format(len(train_dataset)))
msg.text("{} evaluation docs".format(len(gold_dev_data)))
+ if not len(gold_dev_data):
+ msg.fail("No evaluation docs")
overlap = len(train_texts.intersection(dev_texts))
if overlap:
msg.warn("{} training examples also in evaluation data".format(overlap))
@@ -181,7 +183,7 @@ def debug_data(
if "ner" in pipeline:
# Get all unique NER labels present in the data
labels = set(
- label for label in gold_train_data["ner"] if label not in ("O", "-")
+ label for label in gold_train_data["ner"] if label not in ("O", "-", None)
)
label_counts = gold_train_data["ner"]
model_labels = _get_labels_from_model(nlp, "ner")
@@ -601,7 +603,7 @@ def _format_labels(labels, counts=False):
def _get_examples_without_label(data, label):
count = 0
for ex in data:
- labels = [label.split("-")[1] for label in ex.gold.ner if label not in ("O", "-")]
+ labels = [label.split("-")[1] for label in ex.gold.ner if label not in ("O", "-", None)]
if label not in labels:
count += 1
return count
From d67b0f196a2fc09479099a52d64462527c83a647 Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Wed, 13 Nov 2019 21:22:18 +0100
Subject: [PATCH 021/496] Fix initialization of token mappings in new align
(#4640)
Initialize all values in `a2b` and `b2a`, since `numpy.empty()` otherwise
leaves them as unspecified integers.
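A minimal sketch of the difference (illustrative only, not part of the patch;
the array size is arbitrary):

    import numpy

    # numpy.empty() hands back uninitialized memory, so the array may hold
    # arbitrary leftover integers.
    a2b = numpy.empty(5, dtype="i")

    # Filling with -1 gives every position a well-defined "unaligned" value
    # before the alignment loop assigns real indices.
    a2b.fill(-1)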
---
spacy/gold.pyx | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/spacy/gold.pyx b/spacy/gold.pyx
index ea3589ea5..d79bc8205 100644
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -129,6 +129,8 @@ def align(tokens_a, tokens_b):
cost = 0
a2b = numpy.empty(len(tokens_a), dtype="i")
b2a = numpy.empty(len(tokens_b), dtype="i")
+ a2b.fill(-1)
+ b2a.fill(-1)
a2b_multi = {}
b2a_multi = {}
i = 0
@@ -138,7 +140,6 @@ def align(tokens_a, tokens_b):
while i < len(tokens_a) and j < len(tokens_b):
a = tokens_a[i][offset_a:]
b = tokens_b[j][offset_b:]
- a2b[i] = b2a[j] = -1
if a == b:
if offset_a == offset_b == 0:
a2b[i] = j
From faaa832518228f29e5351676400ed8688cc4482e Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Wed, 13 Nov 2019 21:24:35 +0100
Subject: [PATCH 022/496] Generalize handling of tokenizer special cases
(#4259)
* Generalize handling of tokenizer special cases
Handle tokenizer special cases more generally by using the Matcher
internally to match special cases after the affix/token_match
tokenization is complete.
Instead of only matching special cases while processing balanced or
nearly balanced prefixes and suffixes, this recognizes special cases in
a wider range of contexts:
* Allows arbitrary numbers of prefixes/affixes around special cases
* Allows special cases separated by infixes
Existing tests/settings that couldn't be preserved as before:
* The emoticon '")' is no longer a supported special case
* The emoticon ':)' in "example:)" is a false positive again
When merged with #4258 (or the relevant cache bugfix), the affix and
token_match properties should be modified to flush and reload all
special cases to use the updated internal tokenization with the Matcher.
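A rough sketch of the behaviour this enables (illustrative only; the "lemme"
rule and the sample text are made up and not part of this patch):

    from spacy.lang.en import English
    from spacy.symbols import ORTH

    nlp = English()
    # Register a custom special case that should always be split in two.
    nlp.tokenizer.add_special_case("lemme", [{ORTH: "lem"}, {ORTH: "me"}])

    # With the generalized handling, the special case is still recognized
    # when it is wrapped in prefixes/suffixes such as brackets.
    print([t.text for t in nlp("(lemme)")])  # expected: ['(', 'lem', 'me', ')']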
* Remove accidentally added test case
* Really remove accidentally added test
* Reload special cases when necessary
Reload special cases when affixes or token_match are modified. Skip
reloading during initialization.
* Update error code number
* Fix offset and whitespace in Matcher special cases
* Fix offset bugs when merging and splitting tokens
* Set final whitespace on final token in inserted special case
* Improve cache flushing in tokenizer
* Separate cache and specials memory (temporarily)
* Flush cache when adding special cases
* Repeated `self._cache = PreshMap()` and `self._specials = PreshMap()`
are necessary due to this bug:
https://github.com/explosion/preshed/issues/21
* Remove reinitialized PreshMaps on cache flush
* Update UD bin scripts
* Update imports for `bin/`
* Add all currently supported languages
* Update subtok merger for new Matcher validation
* Modify blinded check to look at tokens instead of lemmas (for corpora
with tokens but not lemmas like Telugu)
* Use special Matcher only for cases with affixes
* Reinsert specials cache checks during normal tokenization for special
cases as much as possible
* Additionally include specials cache checks while splitting on infixes
* Since the special Matcher needs consistent affix-only tokenization
for the special cases themselves, introduce the argument
`with_special_cases` in order to do tokenization with or without
specials cache checks
* After normal tokenization, postprocess with special cases Matcher for
special cases containing affixes
* Replace PhraseMatcher with Aho-Corasick
Replace PhraseMatcher with the Aho-Corasick algorithm over numpy arrays
of the hash values for the relevant attribute. The implementation is
based on FlashText.
The speed should be similar to the previous PhraseMatcher. It is now
possible to easily remove match IDs and matches don't go missing with
large keyword lists / vocabularies.
Fixes #4308.
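For reference, a rough usage sketch of the add/remove API described above
(illustrative only; the match ID and terms are made up):

    from spacy.lang.en import English
    from spacy.matcher import PhraseMatcher

    nlp = English()
    matcher = PhraseMatcher(nlp.vocab)
    # Keyword patterns are plain Doc objects; matching runs over token hash values.
    matcher.add("TECH_ORGS", None, nlp("Google"), nlp("Spotify"))

    doc = nlp("Google rebrands its business apps")
    matches = matcher(doc)  # list of (match_id, start, end) tuples

    # Match IDs can now be removed again without leaving stale entries behind.
    matcher.remove("TECH_ORGS")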
* Restore support for pickling
* Fix internal keyword add/remove for numpy arrays
* Add test for #4248, clean up test
* Improve efficiency of special cases handling
* Use PhraseMatcher instead of Matcher
* Improve efficiency of merging/splitting special cases in document
* Process merge/splits in one pass without repeated token shifting
* Merge in place if no splits
* Update error message number
* Remove UD script modifications
Only used for timing/testing, should be a separate PR
* Remove final traces of UD script modifications
* Add missing loop for match ID set in search loop
* Remove cruft in matching loop for partial matches
There was a bit of unnecessary code left over from FlashText in the
matching loop to handle partial token matches, which we don't have with
PhraseMatcher.
* Replace dict trie with MapStruct trie
* Fix how match ID hash is stored/added
* Update fix for match ID vocab
* Switch from map_get_unless_missing to map_get
* Switch from numpy array to Token.get_struct_attr
Access token attributes directly in Doc instead of making a copy of the
relevant values in a numpy array.
Add unsatisfactory warning for hash collision with reserved terminal
hash key. (Ideally it would change the reserved terminal hash and redo
the whole trie, but for now, I'm hoping there won't be collisions.)
* Restructure imports to export find_matches
* Implement full remove()
Remove unnecessary trie paths and free unused maps.
Parallel to Matcher, raise KeyError when attempting to remove a match ID
that has not been added.
* Switch to PhraseMatcher.find_matches
* Switch to local cdef functions for span filtering
* Switch special case reload threshold to variable
Refer to variable instead of hard-coded threshold
* Move more of special case retokenize to cdef nogil
Move as much of the special case retokenization to nogil as possible.
* Rewrap sort as stdsort for OS X
* Rewrap stdsort with specific types
* Switch to qsort
* Fix merge
* Improve cmp functions
* Fix realloc
* Fix realloc again
* Initialize span struct while retokenizing
* Temporarily skip retokenizing
* Revert "Move more of special case retokenize to cdef nogil"
This reverts commit 0b7e52c797cd8ff1548f214bd4186ebb3a7ce8b1.
* Revert "Switch to qsort"
This reverts commit a98d71a942fc9bca531cf5eb05cf89fa88153b60.
* Fix specials check while caching
* Modify URL test with emoticons
The multiple suffix tests result in the emoticon `:>`, which is now
retokenized into one token as a special case after the suffixes are
split off.
* Refactor _apply_special_cases()
* Use cdef ints for span info used in multiple spots
* Modify _filter_special_spans() to prefer earlier
Parallel to #4414, modify _filter_special_spans() so that the earlier
span is preferred for overlapping spans of the same length.
* Replace MatchStruct with Entity
Replace MatchStruct with Entity since the existing Entity struct is
nearly identical.
* Replace Entity with more general SpanC
* Replace MatchStruct with SpanC
* Add error in debug-data if no dev docs are available (see #4575)
* Update azure-pipelines.yml
* Revert "Update azure-pipelines.yml"
This reverts commit ed1060cf59e5895b5fe92ad5b894fd1078ec4c49.
* Use latest wasabi
* Reorganise install_requires
* add dframcy to universe.json (#4580)
* Update universe.json [ci skip]
* Fix multiprocessing for as_tuples=True (#4582)
* Fix conllu script (#4579)
* force extensions to avoid clash between example scripts
* fix arg order and default file encoding
* add example config for conllu script
* newline
* move extension definitions to main function
* few more encodings fixes
* Add load_from_docbin example [ci skip]
TODO: upload the file somewhere
* Update README.md
* Add warnings about 3.8 (resolves #4593) [ci skip]
* Fixed typo: Added space between "recognize" and "various" (#4600)
* Fix DocBin.merge() example (#4599)
* Replace function registries with catalogue (#4584)
* Replace functions registries with catalogue
* Update __init__.py
* Fix test
* Revert unrelated flag [ci skip]
* Bugfix/dep matcher issue 4590 (#4601)
* add contributor agreement for prilopes
* add test for issue #4590
* fix on_match params for DependencyMatcher (#4590)
* Minor updates to language example sentences (#4608)
* Add punctuation to Spanish example sentences
* Combine multilanguage examples for lang xx
* Add punctuation to nb examples
* Always realloc to a larger size
Avoid potential (unlikely) edge case and cymem error seen in #4604.
* Add error in debug-data if no dev docs are available (see #4575)
* Update debug-data for GoldCorpus / Example
* Ignore None label in misaligned NER data
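
As a rough illustration of the PhraseMatcher behaviour described above (not
part of the patch), the sketch below assumes the spaCy v2.2 API
(`PhraseMatcher(vocab, attr=...)`, `add(key, on_match, *docs)`):

```python
import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.blank("en")
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
patterns = [nlp("machine learning"), nlp("natural language processing")]
matcher.add("TECH", None, *patterns)  # v2.x signature: (key, on_match, *docs)

doc = nlp("I like natural language processing and machine learning.")
# Matches are (match_id, start, end) tuples over token indices
for match_id, start, end in matcher(doc):
    print(nlp.vocab.strings[match_id], doc[start:end].text)

# Removing a key is now supported cleanly; removing an unknown key raises KeyError
matcher.remove("TECH")
assert matcher(doc) == []
```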
---
.github/contributors/prilopes.md | 106 ++++++++
README.md | 10 +-
bin/ud/ud_train.py | 13 +-
examples/load_from_docbin.py | 45 +++
examples/training/conllu-config.json | 1 +
examples/training/conllu.py | 10 +-
requirements.txt | 4 +-
setup.cfg | 12 +-
spacy/__init__.py | 2 +-
spacy/__main__.py | 4 +-
spacy/cli/download.py | 5 +-
spacy/cli/evaluate.py | 3 +-
spacy/cli/info.py | 3 +-
spacy/cli/init_model.py | 3 +-
spacy/cli/link.py | 3 +-
spacy/cli/package.py | 3 +-
spacy/cli/pretrain.py | 3 +-
spacy/cli/profile.py | 3 +-
spacy/cli/train.py | 3 +-
spacy/cli/validate.py | 3 +-
spacy/compat.py | 5 -
spacy/displacy/render.py | 4 +-
spacy/errors.py | 3 +
spacy/lang/es/examples.py | 16 +-
spacy/lang/nb/examples.py | 6 +-
spacy/lang/tokenizer_exceptions.py | 1 -
spacy/lang/xx/examples.py | 99 +++++++
spacy/language.py | 7 +-
spacy/matcher/dependencymatcher.pyx | 2 +-
spacy/ml/common.py | 6 +-
spacy/ml/tok2vec.py | 18 +-
spacy/tests/regression/test_issue4590.py | 34 +++
spacy/tests/test_architectures.py | 19 ++
spacy/tests/test_register_architecture.py | 19 --
spacy/tests/tokenizer/test_exceptions.py | 9 +-
spacy/tests/tokenizer/test_tokenizer.py | 21 ++
spacy/tests/tokenizer/test_urls.py | 15 +-
spacy/tokenizer.pxd | 34 ++-
spacy/tokenizer.pyx | 316 ++++++++++++++++++----
spacy/util.py | 113 ++------
website/docs/api/docbin.md | 4 +-
website/docs/usage/101/_named-entities.md | 2 +-
website/docs/usage/index.md | 11 +
website/meta/universe.json | 24 ++
44 files changed, 754 insertions(+), 273 deletions(-)
create mode 100644 .github/contributors/prilopes.md
create mode 100644 examples/load_from_docbin.py
create mode 100644 examples/training/conllu-config.json
create mode 100644 spacy/lang/xx/examples.py
create mode 100644 spacy/tests/regression/test_issue4590.py
create mode 100644 spacy/tests/test_architectures.py
delete mode 100644 spacy/tests/test_register_architecture.py
diff --git a/.github/contributors/prilopes.md b/.github/contributors/prilopes.md
new file mode 100644
index 000000000..ad111d4de
--- /dev/null
+++ b/.github/contributors/prilopes.md
@@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+ * you hereby assign to us joint ownership, and to the extent that such
+ assignment is or becomes invalid, ineffective or unenforceable, you hereby
+ grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+ royalty-free, unrestricted license to exercise all rights under those
+ copyrights. This includes, at our option, the right to sublicense these same
+ rights to third parties through multiple levels of sublicensees or other
+ licensing arrangements;
+
+ * you agree that each of us can do all things in relation to your
+ contribution as if each of us were the sole owners, and if one of us makes
+ a derivative work of your contribution, the one who makes the derivative
+ work (or has it made will be the sole owner of that derivative work;
+
+ * you agree that you will not assert any moral rights in your contribution
+ against us, our licensees or transferees;
+
+ * you agree that we may register a copyright in your contribution and
+ exercise all ownership rights associated with it; and
+
+ * you agree that neither of us has any duty to consult with, obtain the
+ consent of, pay or render an accounting to the other for any use or
+ distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+ * make, have made, use, sell, offer to sell, import, and otherwise transfer
+ your contribution in whole or in part, alone or in combination with or
+ included in any product, work or materials arising out of the project to
+ which your contribution was submitted, and
+
+ * at our option, to sublicense these same rights to third parties through
+ multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+ * Each contribution that you submit is and shall be an original work of
+ authorship and you can legally grant the rights set out in this SCA;
+
+ * to the best of your knowledge, each contribution will not violate any
+ third party's copyrights, trademarks, patents, or other intellectual
+ property rights; and
+
+ * each contribution shall be in compliance with U.S. export control laws and
+ other applicable export and import laws. You agree to notify us if you
+ become aware of any circumstance which would make any of the foregoing
+ representations inaccurate in any respect. We may publicly disclose your
+ participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statement below. Please do NOT
+mark both statements:
+
+ * [x] I am signing on behalf of myself as an individual and no other person
+ or entity, including my employer, has or will have rights with respect to my
+ contributions.
+
+ * [ ] I am signing on behalf of my employer or a legal entity and I have the
+ actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field | Entry |
+|------------------------------- | -------------------- |
+| Name | Priscilla Lopes |
+| Company name (if applicable) | |
+| Title or role (if applicable) | |
+| Date | 2019-11-06 |
+| GitHub username | prilopes |
+| Website (optional) | |
diff --git a/README.md b/README.md
index 99d66bb31..980fc5b0b 100644
--- a/README.md
+++ b/README.md
@@ -104,6 +104,13 @@ For detailed installation instructions, see the
[pip]: https://pypi.org/project/spacy/
[conda]: https://anaconda.org/conda-forge/spacy
+> ⚠️ **Important note for Python 3.8:** We can't yet ship pre-compiled binary
+> wheels for spaCy that work on Python 3.8, as we're still waiting for our CI
+> providers and other tooling to support it. This means that in order to run
+> spaCy on Python 3.8, you'll need [a compiler installed](#source) and compile
+> the library and its Cython dependencies locally. If this is causing problems
+> for you, the easiest solution is to **use Python 3.7** in the meantime.
+
### pip
Using pip, spaCy releases are available as source packages and binary wheels (as
@@ -180,9 +187,6 @@ pointing pip to a path or URL.
# download best-matching version of specific model for your spaCy installation
python -m spacy download en_core_web_sm
-# out-of-the-box: download best-matching default model
-python -m spacy download en
-
# pip install .tar.gz archive from path or URL
pip install /Users/you/en_core_web_sm-2.2.0.tar.gz
pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz
diff --git a/bin/ud/ud_train.py b/bin/ud/ud_train.py
index b6a44b861..75bf55771 100644
--- a/bin/ud/ud_train.py
+++ b/bin/ud/ud_train.py
@@ -323,11 +323,6 @@ def get_token_conllu(token, i):
return "\n".join(lines)
-Token.set_extension("get_conllu_lines", method=get_token_conllu, force=True)
-Token.set_extension("begins_fused", default=False, force=True)
-Token.set_extension("inside_fused", default=False, force=True)
-
-
##################
# Initialization #
##################
@@ -460,13 +455,13 @@ class TreebankPaths(object):
@plac.annotations(
ud_dir=("Path to Universal Dependencies corpus", "positional", None, Path),
+ parses_dir=("Directory to write the development parses", "positional", None, Path),
corpus=(
- "UD corpus to train and evaluate on, e.g. en, es_ancora, etc",
+ "UD corpus to train and evaluate on, e.g. UD_Spanish-AnCora",
"positional",
None,
str,
),
- parses_dir=("Directory to write the development parses", "positional", None, Path),
config=("Path to json formatted config file", "option", "C", Path),
limit=("Size limit", "option", "n", int),
gpu_device=("Use GPU", "option", "g", int),
@@ -491,6 +486,10 @@ def main(
# temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
import tqdm
+ Token.set_extension("get_conllu_lines", method=get_token_conllu)
+ Token.set_extension("begins_fused", default=False)
+ Token.set_extension("inside_fused", default=False)
+
spacy.util.fix_random_seed()
lang.zh.Chinese.Defaults.use_jieba = False
lang.ja.Japanese.Defaults.use_janome = False
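For reference, a minimal sketch (not part of the patch) of the extension
registration pattern the hunk above moves into `main()`; now that `force=True`
is gone, `Token.has_extension` can guard against double registration:

```python
from spacy.tokens import Token

def register_conllu_extensions():
    # Registering twice without force=True raises an error, so guard the calls
    for name in ("begins_fused", "inside_fused"):
        if not Token.has_extension(name):
            Token.set_extension(name, default=False)
```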
diff --git a/examples/load_from_docbin.py b/examples/load_from_docbin.py
new file mode 100644
index 000000000..f26e7fc49
--- /dev/null
+++ b/examples/load_from_docbin.py
@@ -0,0 +1,45 @@
+# coding: utf-8
+"""
+Example of loading previously parsed text using spaCy's DocBin class. The example
+performs an entity count to show that the annotations are available.
+For more details, see https://spacy.io/usage/saving-loading#docs
+Installation:
+python -m spacy download en_core_web_lg
+Usage:
+python examples/load_from_docbin.py en_core_web_lg RC_2015-03-9.spacy
+"""
+from __future__ import unicode_literals
+
+import spacy
+from spacy.tokens import DocBin
+from timeit import default_timer as timer
+from collections import Counter
+
+EXAMPLE_PARSES_PATH = "RC_2015-03-9.spacy"
+
+
+def main(model="en_core_web_lg", docbin_path=EXAMPLE_PARSES_PATH):
+ nlp = spacy.load(model)
+ print("Reading data from {}".format(docbin_path))
+ with open(docbin_path, "rb") as file_:
+ bytes_data = file_.read()
+ nr_word = 0
+ start_time = timer()
+ entities = Counter()
+ docbin = DocBin().from_bytes(bytes_data)
+ for doc in docbin.get_docs(nlp.vocab):
+ nr_word += len(doc)
+ entities.update((e.label_, e.text) for e in doc.ents)
+ end_time = timer()
+ msg = "Loaded {nr_word} words in {seconds} seconds ({wps} words per second)"
+ wps = nr_word / (end_time - start_time)
+ print(msg.format(nr_word=nr_word, seconds=end_time - start_time, wps=wps))
+ print("Most common entities:")
+ for (label, entity), freq in entities.most_common(30):
+ print(freq, entity, label)
+
+
+if __name__ == "__main__":
+ import plac
+
+ plac.call(main)
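A hypothetical companion snippet (not part of the patch) showing how a
`.spacy` file like the one this example loads could be produced with DocBin;
the output path and texts below are made up:

```python
import spacy
from spacy.tokens import DocBin

nlp = spacy.load("en_core_web_lg")
texts = ["Apple is looking at buying U.K. startup for $1 billion."]
doc_bin = DocBin(attrs=["ENT_IOB", "ENT_TYPE"])
for doc in nlp.pipe(texts):
    doc_bin.add(doc)
with open("parses.spacy", "wb") as file_:  # hypothetical output path
    file_.write(doc_bin.to_bytes())
```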
diff --git a/examples/training/conllu-config.json b/examples/training/conllu-config.json
new file mode 100644
index 000000000..9a11dd96b
--- /dev/null
+++ b/examples/training/conllu-config.json
@@ -0,0 +1 @@
+{"nr_epoch": 3, "batch_size": 24, "dropout": 0.001, "vectors": 0, "multitask_tag": 0, "multitask_sent": 0}
diff --git a/examples/training/conllu.py b/examples/training/conllu.py
index ba3cf450c..08febda50 100644
--- a/examples/training/conllu.py
+++ b/examples/training/conllu.py
@@ -383,20 +383,24 @@ class TreebankPaths(object):
@plac.annotations(
ud_dir=("Path to Universal Dependencies corpus", "positional", None, Path),
+ parses_dir=("Directory to write the development parses", "positional", None, Path),
+ config=("Path to json formatted config file", "positional", None, Config.load),
corpus=(
- "UD corpus to train and evaluate on, e.g. en, es_ancora, etc",
+ "UD corpus to train and evaluate on, e.g. UD_Spanish-AnCora",
"positional",
None,
str,
),
- parses_dir=("Directory to write the development parses", "positional", None, Path),
- config=("Path to json formatted config file", "positional", None, Config.load),
limit=("Size limit", "option", "n", int),
)
def main(ud_dir, parses_dir, config, corpus, limit=0):
# temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
import tqdm
+ Token.set_extension("get_conllu_lines", method=get_token_conllu)
+ Token.set_extension("begins_fused", default=False)
+ Token.set_extension("inside_fused", default=False)
+
paths = TreebankPaths(ud_dir, corpus)
if not (parses_dir / corpus).exists():
(parses_dir / corpus).mkdir()
diff --git a/requirements.txt b/requirements.txt
index ad7059f3a..12f19bb88 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,14 +4,14 @@ preshed>=3.0.2,<3.1.0
thinc>=7.3.0,<7.4.0
blis>=0.4.0,<0.5.0
murmurhash>=0.28.0,<1.1.0
-wasabi>=0.3.0,<1.1.0
+wasabi>=0.4.0,<1.1.0
srsly>=0.1.0,<1.1.0
+catalogue>=0.0.7,<1.1.0
# Third party dependencies
numpy>=1.15.0
requests>=2.13.0,<3.0.0
plac>=0.9.6,<1.2.0
pathlib==1.0.1; python_version < "3.4"
-importlib_metadata>=0.20; python_version < "3.8"
# Optional dependencies
jsonschema>=2.6.0,<3.1.0
# Development dependencies
diff --git a/setup.cfg b/setup.cfg
index 51e722354..940066a9e 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -40,19 +40,21 @@ setup_requires =
murmurhash>=0.28.0,<1.1.0
thinc>=7.3.0,<7.4.0
install_requires =
- setuptools
- numpy>=1.15.0
+ # Our libraries
murmurhash>=0.28.0,<1.1.0
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
thinc>=7.3.0,<7.4.0
blis>=0.4.0,<0.5.0
+ wasabi>=0.4.0,<1.1.0
+ srsly>=0.1.0,<1.1.0
+ catalogue>=0.0.7,<1.1.0
+ # Third-party dependencies
+ setuptools
+ numpy>=1.15.0
plac>=0.9.6,<1.2.0
requests>=2.13.0,<3.0.0
- wasabi>=0.3.0,<1.1.0
- srsly>=0.1.0,<1.1.0
pathlib==1.0.1; python_version < "3.4"
- importlib_metadata>=0.20; python_version < "3.8"
[options.extras_require]
lookups =
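The new `catalogue` requirement backs the registry changes from "Replace
function registries with catalogue (#4584)". A rough sketch of the library's
core API (the namespace and function names below are illustrative, not
spaCy's actual registries):

```python
import catalogue

# Create a registry in a namespace; entry_points=True would also collect
# functions registered by other installed packages.
architectures = catalogue.create("myproject", "architectures")

@architectures.register("small_model.v1")
def make_small_model():
    return {"layers": 2}

func = architectures.get("small_model.v1")
print(architectures.get_all())  # {"small_model.v1": <function make_small_model>}
```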
diff --git a/spacy/__init__.py b/spacy/__init__.py
index 57701179f..4a0d16a49 100644
--- a/spacy/__init__.py
+++ b/spacy/__init__.py
@@ -15,7 +15,7 @@ from .glossary import explain
from .about import __version__
from .errors import Errors, Warnings, deprecation_warning
from . import util
-from .util import register_architecture, get_architecture
+from .util import registry
from .language import component
diff --git a/spacy/__main__.py b/spacy/__main__.py
index 716561566..2c285095e 100644
--- a/spacy/__main__.py
+++ b/spacy/__main__.py
@@ -7,12 +7,10 @@ from __future__ import print_function
if __name__ == "__main__":
import plac
import sys
- from wasabi import Printer
+ from wasabi import msg
from spacy.cli import download, link, info, package, train, pretrain, convert
from spacy.cli import init_model, profile, evaluate, validate, debug_data
- msg = Printer()
-
commands = {
"download": download,
"link": link,
diff --git a/spacy/cli/download.py b/spacy/cli/download.py
index c57e2364b..19f3e7860 100644
--- a/spacy/cli/download.py
+++ b/spacy/cli/download.py
@@ -6,16 +6,13 @@ import requests
import os
import subprocess
import sys
-from wasabi import Printer
+from wasabi import msg
from .link import link
from ..util import get_package_path
from .. import about
-msg = Printer()
-
-
@plac.annotations(
model=("Model to download (shortcut or name)", "positional", None, str),
direct=("Force direct download of name + version", "flag", "d", bool),
diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py
index e5b2d0f02..a3193a5cf 100644
--- a/spacy/cli/evaluate.py
+++ b/spacy/cli/evaluate.py
@@ -3,7 +3,7 @@ from __future__ import unicode_literals, division, print_function
import plac
from timeit import default_timer as timer
-from wasabi import Printer
+from wasabi import msg
from ..gold import GoldCorpus
from .. import util
@@ -32,7 +32,6 @@ def evaluate(
Evaluate a model. To render a sample of parses in a HTML file, set an
output directory as the displacy_path argument.
"""
- msg = Printer()
util.fix_random_seed()
if gpu_id >= 0:
util.use_gpu(gpu_id)
diff --git a/spacy/cli/info.py b/spacy/cli/info.py
index 3655327ef..080d0dc77 100644
--- a/spacy/cli/info.py
+++ b/spacy/cli/info.py
@@ -4,7 +4,7 @@ from __future__ import unicode_literals
import plac
import platform
from pathlib import Path
-from wasabi import Printer
+from wasabi import msg
import srsly
from ..compat import path2str, basestring_, unicode_
@@ -23,7 +23,6 @@ def info(model=None, markdown=False, silent=False):
specified as an argument, print model information. Flag --markdown
prints details in Markdown for easy copy-pasting to GitHub issues.
"""
- msg = Printer()
if model:
if util.is_package(model):
model_path = util.get_package_path(model)
diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py
index c285a12a6..cda21cbcc 100644
--- a/spacy/cli/init_model.py
+++ b/spacy/cli/init_model.py
@@ -11,7 +11,7 @@ import tarfile
import gzip
import zipfile
import srsly
-from wasabi import Printer
+from wasabi import msg
from ..vectors import Vectors
from ..errors import Errors, Warnings, user_warning
@@ -24,7 +24,6 @@ except ImportError:
DEFAULT_OOV_PROB = -20
-msg = Printer()
@plac.annotations(
diff --git a/spacy/cli/link.py b/spacy/cli/link.py
index 6b719ffe6..8117829b5 100644
--- a/spacy/cli/link.py
+++ b/spacy/cli/link.py
@@ -3,7 +3,7 @@ from __future__ import unicode_literals
import plac
from pathlib import Path
-from wasabi import Printer
+from wasabi import msg
from ..compat import symlink_to, path2str
from .. import util
@@ -20,7 +20,6 @@ def link(origin, link_name, force=False, model_path=None):
either the name of a pip package, or the local path to the model data
directory. Linking models allows loading them via spacy.load(link_name).
"""
- msg = Printer()
if util.is_package(origin):
model_path = util.get_package_path(origin)
else:
diff --git a/spacy/cli/package.py b/spacy/cli/package.py
index e99a6d5ff..8ed92259c 100644
--- a/spacy/cli/package.py
+++ b/spacy/cli/package.py
@@ -4,7 +4,7 @@ from __future__ import unicode_literals
import plac
import shutil
from pathlib import Path
-from wasabi import Printer, get_raw_input
+from wasabi import msg, get_raw_input
import srsly
from ..compat import path2str
@@ -27,7 +27,6 @@ def package(input_dir, output_dir, meta_path=None, create_meta=False, force=Fals
set and a meta.json already exists in the output directory, the existing
values will be used as the defaults in the command-line prompt.
"""
- msg = Printer()
input_path = util.ensure_path(input_dir)
output_path = util.ensure_path(output_dir)
meta_path = util.ensure_path(meta_path)
diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py
index 59269cb85..68038bc5c 100644
--- a/spacy/cli/pretrain.py
+++ b/spacy/cli/pretrain.py
@@ -11,7 +11,7 @@ from pathlib import Path
from thinc.v2v import Affine, Maxout
from thinc.misc import LayerNorm as LN
from thinc.neural.util import prefer_gpu
-from wasabi import Printer
+from wasabi import msg
import srsly
from spacy.gold import Example
@@ -123,7 +123,6 @@ def pretrain(
for key in config:
if isinstance(config[key], Path):
config[key] = str(config[key])
- msg = Printer()
util.fix_random_seed(seed)
has_gpu = prefer_gpu()
diff --git a/spacy/cli/profile.py b/spacy/cli/profile.py
index 201ab13d5..4995224f3 100644
--- a/spacy/cli/profile.py
+++ b/spacy/cli/profile.py
@@ -9,7 +9,7 @@ import pstats
import sys
import itertools
import thinc.extra.datasets
-from wasabi import Printer
+from wasabi import msg
from ..util import load_model
@@ -26,7 +26,6 @@ def profile(model, inputs=None, n_texts=10000):
It can either be provided as a JSONL file, or be read from sys.stdin.
If no input file is specified, the IMDB dataset is loaded via Thinc.
"""
- msg = Printer()
if inputs is not None:
inputs = _read_inputs(inputs, msg)
if inputs is None:
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 24255437c..622a9ca97 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -8,7 +8,7 @@ from thinc.neural._classes.model import Model
from timeit import default_timer as timer
import shutil
import srsly
-from wasabi import Printer
+from wasabi import msg
import contextlib
import random
@@ -89,7 +89,6 @@ def train(
# temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
import tqdm
- msg = Printer()
util.fix_random_seed()
util.set_env_log(verbose)
diff --git a/spacy/cli/validate.py b/spacy/cli/validate.py
index 38f8d2313..93abad6f6 100644
--- a/spacy/cli/validate.py
+++ b/spacy/cli/validate.py
@@ -5,7 +5,7 @@ from pathlib import Path
import sys
import requests
import srsly
-from wasabi import Printer
+from wasabi import msg
from ..compat import path2str
from ..util import get_data_path
@@ -17,7 +17,6 @@ def validate():
Validate that the currently installed version of spaCy is compatible
with the installed models. Should be run after `pip install -U spacy`.
"""
- msg = Printer()
with msg.loading("Loading compatibility table..."):
r = requests.get(about.__compatibility__)
if r.status_code != 200:
diff --git a/spacy/compat.py b/spacy/compat.py
index 5bff28815..0ea31c6b3 100644
--- a/spacy/compat.py
+++ b/spacy/compat.py
@@ -36,11 +36,6 @@ try:
except ImportError:
cupy = None
-try: # Python 3.8
- import importlib.metadata as importlib_metadata
-except ImportError:
- import importlib_metadata # noqa: F401
-
try:
from thinc.neural.optimizers import Optimizer # noqa: F401
except ImportError:
diff --git a/spacy/displacy/render.py b/spacy/displacy/render.py
index 17b67940a..d6e33437b 100644
--- a/spacy/displacy/render.py
+++ b/spacy/displacy/render.py
@@ -5,7 +5,7 @@ import uuid
from .templates import TPL_DEP_SVG, TPL_DEP_WORDS, TPL_DEP_ARCS, TPL_ENTS
from .templates import TPL_ENT, TPL_ENT_RTL, TPL_FIGURE, TPL_TITLE, TPL_PAGE
-from ..util import minify_html, escape_html, get_entry_points, ENTRY_POINTS
+from ..util import minify_html, escape_html, registry
from ..errors import Errors
@@ -242,7 +242,7 @@ class EntityRenderer(object):
"CARDINAL": "#e4e7d2",
"PERCENT": "#e4e7d2",
}
- user_colors = get_entry_points(ENTRY_POINTS.displacy_colors)
+ user_colors = registry.displacy_colors.get_all()
for user_color in user_colors.values():
colors.update(user_color)
colors.update(options.get("colors", {}))
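`registry.displacy_colors` is filled from package entry points, so third-party
packages can still inject custom colors. A hypothetical plugin's `setup.py`
might look roughly like this (package and module names are made up; the
entry-point group follows spaCy's `spacy_displacy_colors` convention):

```python
from setuptools import setup

# my_colors.py would contain e.g.:  displacy_colors = {"FRUIT": "#ff6347"}
setup(
    name="my-displacy-colors",          # hypothetical package
    py_modules=["my_colors"],
    entry_points={
        "spacy_displacy_colors": ["my_colors = my_colors:displacy_colors"]
    },
)
```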
diff --git a/spacy/errors.py b/spacy/errors.py
index d2898cf53..0b6a6775c 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -529,6 +529,9 @@ class Errors(object):
E185 = ("Received invalid attribute in component attribute declaration: "
"{obj}.{attr}\nAttribute '{attr}' does not exist on {obj}.")
E186 = ("'{tok_a}' and '{tok_b}' are different texts.")
+ E187 = ("Tokenizer special cases are not allowed to modify the text. "
+ "This would map '{chunk}' to '{orth}' given token attributes "
+ "'{token_attrs}'.")
# TODO: fix numbering after merging develop into master
E998 = ("Can only create GoldParse's from Example's without a Doc, "
diff --git a/spacy/lang/es/examples.py b/spacy/lang/es/examples.py
index 96ff9c1ed..0e31b56af 100644
--- a/spacy/lang/es/examples.py
+++ b/spacy/lang/es/examples.py
@@ -11,12 +11,12 @@ Example sentences to test spaCy and its language models.
sentences = [
- "Apple está buscando comprar una startup del Reino Unido por mil millones de dólares",
- "Los coches autónomos delegan la responsabilidad del seguro en sus fabricantes",
- "San Francisco analiza prohibir los robots delivery",
- "Londres es una gran ciudad del Reino Unido",
- "El gato come pescado",
- "Veo al hombre con el telescopio",
- "La araña come moscas",
- "El pingüino incuba en su nido",
+ "Apple está buscando comprar una startup del Reino Unido por mil millones de dólares.",
+ "Los coches autónomos delegan la responsabilidad del seguro en sus fabricantes.",
+ "San Francisco analiza prohibir los robots delivery.",
+ "Londres es una gran ciudad del Reino Unido.",
+ "El gato come pescado.",
+ "Veo al hombre con el telescopio.",
+ "La araña come moscas.",
+ "El pingüino incuba en su nido.",
]
diff --git a/spacy/lang/nb/examples.py b/spacy/lang/nb/examples.py
index 72d6b5a71..c15426ded 100644
--- a/spacy/lang/nb/examples.py
+++ b/spacy/lang/nb/examples.py
@@ -11,8 +11,8 @@ Example sentences to test spaCy and its language models.
sentences = [
- "Apple vurderer å kjøpe britisk oppstartfirma for en milliard dollar",
- "Selvkjørende biler flytter forsikringsansvaret over på produsentene ",
- "San Francisco vurderer å forby robotbud på fortauene",
+ "Apple vurderer å kjøpe britisk oppstartfirma for en milliard dollar.",
+ "Selvkjørende biler flytter forsikringsansvaret over på produsentene.",
+ "San Francisco vurderer å forby robotbud på fortauene.",
"London er en stor by i Storbritannia.",
]
diff --git a/spacy/lang/tokenizer_exceptions.py b/spacy/lang/tokenizer_exceptions.py
index 57771cca4..3ea2bc3e9 100644
--- a/spacy/lang/tokenizer_exceptions.py
+++ b/spacy/lang/tokenizer_exceptions.py
@@ -114,7 +114,6 @@ emoticons = set(
(-:
=)
(=
-")
:]
:-]
[:
diff --git a/spacy/lang/xx/examples.py b/spacy/lang/xx/examples.py
new file mode 100644
index 000000000..38cd5e0cd
--- /dev/null
+++ b/spacy/lang/xx/examples.py
@@ -0,0 +1,99 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.xx.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+# combined examples from de/en/es/fr/it/nl/pl/pt/ru
+
+sentences = [
+ "Die ganze Stadt ist ein Startup: Shenzhen ist das Silicon Valley für Hardware-Firmen",
+ "Wie deutsche Startups die Technologie vorantreiben wollen: Künstliche Intelligenz",
+ "Trend zum Urlaub in Deutschland beschert Gastwirten mehr Umsatz",
+ "Bundesanwaltschaft erhebt Anklage gegen mutmaßlichen Schweizer Spion",
+ "San Francisco erwägt Verbot von Lieferrobotern",
+ "Autonome Fahrzeuge verlagern Haftpflicht auf Hersteller",
+ "Wo bist du?",
+ "Was ist die Hauptstadt von Deutschland?",
+ "Apple is looking at buying U.K. startup for $1 billion",
+ "Autonomous cars shift insurance liability toward manufacturers",
+ "San Francisco considers banning sidewalk delivery robots",
+ "London is a big city in the United Kingdom.",
+ "Where are you?",
+ "Who is the president of France?",
+ "What is the capital of the United States?",
+ "When was Barack Obama born?",
+ "Apple está buscando comprar una startup del Reino Unido por mil millones de dólares.",
+ "Los coches autónomos delegan la responsabilidad del seguro en sus fabricantes.",
+ "San Francisco analiza prohibir los robots delivery.",
+ "Londres es una gran ciudad del Reino Unido.",
+ "El gato come pescado.",
+ "Veo al hombre con el telescopio.",
+ "La araña come moscas.",
+ "El pingüino incuba en su nido.",
+ "Apple cherche à acheter une start-up anglaise pour 1 milliard de dollars",
+ "Les voitures autonomes déplacent la responsabilité de l'assurance vers les constructeurs",
+ "San Francisco envisage d'interdire les robots coursiers sur les trottoirs",
+ "Londres est une grande ville du Royaume-Uni",
+ "L’Italie choisit ArcelorMittal pour reprendre la plus grande aciérie d’Europe",
+ "Apple lance HomePod parce qu'il se sent menacé par l'Echo d'Amazon",
+ "La France ne devrait pas manquer d'électricité cet été, même en cas de canicule",
+ "Nouvelles attaques de Trump contre le maire de Londres",
+ "Où es-tu ?",
+ "Qui est le président de la France ?",
+ "Où est la capitale des États-Unis ?",
+ "Quand est né Barack Obama ?",
+ "Apple vuole comprare una startup del Regno Unito per un miliardo di dollari",
+ "Le automobili a guida autonoma spostano la responsabilità assicurativa verso i produttori",
+ "San Francisco prevede di bandire i robot di consegna porta a porta",
+ "Londra è una grande città del Regno Unito.",
+ "Apple overweegt om voor 1 miljard een U.K. startup te kopen",
+ "Autonome auto's verschuiven de verzekeringverantwoordelijkheid naar producenten",
+ "San Francisco overweegt robots op voetpaden te verbieden",
+ "Londen is een grote stad in het Verenigd Koninkrijk",
+ "Poczuł przyjemną woń mocnej kawy.",
+ "Istnieje wiele dróg oddziaływania substancji psychoaktywnej na układ nerwowy.",
+ "Powitał mnie biało-czarny kot, płosząc siedzące na płocie trzy dorodne dudki.",
+ "Nowy abonament pod lupą Komisji Europejskiej",
+ "Czy w ciągu ostatnich 48 godzin spożyłeś leki zawierające paracetamol?",
+ "Kto ma ochotę zapoznać się z innymi niż w książkach przygodami Muminków i ich przyjaciół, temu polecam komiks Tove Jansson „Muminki i morze”.",
+ "Apple está querendo comprar uma startup do Reino Unido por 100 milhões de dólares.",
+ "Carros autônomos empurram a responsabilidade do seguro para os fabricantes.."
+ "São Francisco considera banir os robôs de entrega que andam pelas calçadas.",
+ "Londres é a maior cidade do Reino Unido.",
+ # Translations from English:
+ "Apple рассматривает возможность покупки стартапа из Соединённого Королевства за $1 млрд",
+ "Беспилотные автомобили перекладывают страховую ответственность на производителя",
+ "В Сан-Франциско рассматривается возможность запрета роботов-курьеров, которые перемещаются по тротуару",
+ "Лондон — это большой город в Соединённом Королевстве",
+ # Native Russian sentences:
+ # Colloquial:
+ "Да, нет, наверное!", # Typical polite refusal
+ "Обратите внимание на необыкновенную красоту этого города-героя Москвы, столицы нашей Родины!", # From a tour guide speech
+ # Examples of Bookish Russian:
+ # Quote from "The Golden Calf"
+ "Рио-де-Жанейро — это моя мечта, и не смейте касаться её своими грязными лапами!",
+ # Quotes from "Ivan Vasilievich changes his occupation"
+ "Ты пошто боярыню обидел, смерд?!!",
+ "Оставь меня, старушка, я в печали!",
+ # Quotes from Dostoevsky:
+ "Уж коли я, такой же, как и ты, человек грешный, над тобой умилился и пожалел тебя, кольми паче бог",
+ "В мечтах я нередко, говорит, доходил до страстных помыслов о служении человечеству и может быть действительно пошел бы на крест за людей, если б это вдруг как-нибудь потребовалось, а между тем я двух дней не в состоянии прожить ни с кем в одной комнате, о чем знаю из опыта",
+ "Зато всегда так происходило, что чем более я ненавидел людей в частности, тем пламеннее становилась любовь моя к человечеству вообще",
+ # Quotes from Chekhov:
+ "Ненужные дела и разговоры всё об одном отхватывают на свою долю лучшую часть времени, лучшие силы, и в конце концов остается какая-то куцая, бескрылая жизнь, какая-то чепуха, и уйти и бежать нельзя, точно сидишь в сумасшедшем доме или в арестантских ротах!",
+ # Quotes from Turgenev:
+ "Нравится тебе женщина, старайся добиться толку; а нельзя — ну, не надо, отвернись — земля не клином сошлась",
+ "Узенькое местечко, которое я занимаю, до того крохотно в сравнении с остальным пространством, где меня нет и где дела до меня нет; и часть времени, которую мне удастся прожить, так ничтожна перед вечностью, где меня не было и не будет...",
+ # Quotes from newspapers:
+ # Komsomolskaya Pravda:
+ "На заседании президиума правительства Москвы принято решение присвоить статус инвестиционного приоритетного проекта города Москвы киностудии Союзмультфильм",
+ "Глава Минобороны Сергей Шойгу заявил, что обстановка на этом стратегическом направлении требует непрерывного совершенствования боевого состава войск",
+ # Argumenty i Facty:
+ "На реплику лже-Говина — дескать, он (Волков) будет лучшим революционером — Стамп с энтузиазмом ответил: Непременно!",
+]
diff --git a/spacy/language.py b/spacy/language.py
index 3106c6afe..c84f597d9 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -53,8 +53,8 @@ class BaseDefaults(object):
filenames = {name: root / filename for name, filename in cls.resources}
if LANG in cls.lex_attr_getters:
lang = cls.lex_attr_getters[LANG](None)
- user_lookups = util.get_entry_point(util.ENTRY_POINTS.lookups, lang, {})
- filenames.update(user_lookups)
+ if lang in util.registry.lookups:
+ filenames.update(util.registry.lookups.get(lang))
lookups = Lookups()
for name, filename in filenames.items():
data = util.load_language_data(filename)
@@ -157,7 +157,7 @@ class Language(object):
100,000 characters in one text.
RETURNS (Language): The newly constructed object.
"""
- user_factories = util.get_entry_points(util.ENTRY_POINTS.factories)
+ user_factories = util.registry.factories.get_all()
self.factories.update(user_factories)
self._meta = dict(meta)
self._path = None
@@ -741,6 +741,7 @@ class Language(object):
texts,
batch_size=batch_size,
disable=disable,
+ n_process=n_process,
component_cfg=component_cfg,
as_example=False
)
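A minimal sketch of the path this hunk fixes: passing `n_process` through when
`as_tuples=True`, so multiprocessing also works for (text, context) tuples.

```python
import spacy

def main():
    nlp = spacy.blank("en")
    data = [("A first text.", {"id": 1}), ("A second text.", {"id": 2})]
    for doc, context in nlp.pipe(data, as_tuples=True, n_process=2):
        print(context["id"], len(doc))

if __name__ == "__main__":  # required for multiprocessing on some platforms
    main()
```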
diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx
index ae2ad3ca6..56d27024d 100644
--- a/spacy/matcher/dependencymatcher.pyx
+++ b/spacy/matcher/dependencymatcher.pyx
@@ -240,7 +240,7 @@ cdef class DependencyMatcher:
for i, (ent_id, nodes) in enumerate(matched_key_trees):
on_match = self._callbacks.get(ent_id)
if on_match is not None:
- on_match(self, doc, i, matches)
+ on_match(self, doc, i, matched_key_trees)
return matched_key_trees
def recurse(self,tree,id_to_position,_node_operator_map,int patternLength,visitedNodes,matched_trees):
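Sketch of the callback contract the one-line fix restores: `on_match` now
receives the same `matched_key_trees` list that the matcher call returns (see
`spacy/tests/regression/test_issue4590.py` further down for a full pattern):

```python
from spacy.matcher import DependencyMatcher
from spacy.vocab import Vocab

def on_match(matcher, doc, i, matches):
    # `matches` is the full result list; matches[i] is roughly a
    # (match_id, matched_token_ids) pair for the pattern that fired
    match_id, token_ids = matches[i]
    print(doc.vocab.strings[match_id], token_ids)

matcher = DependencyMatcher(Vocab())
# matcher.add("PATTERN", on_match, pattern) would register the callback
```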
diff --git a/spacy/ml/common.py b/spacy/ml/common.py
index 963d4dc35..f90b53a15 100644
--- a/spacy/ml/common.py
+++ b/spacy/ml/common.py
@@ -3,10 +3,10 @@ from __future__ import unicode_literals
from thinc.api import chain
from thinc.v2v import Maxout
from thinc.misc import LayerNorm
-from ..util import register_architecture, make_layer
+from ..util import registry, make_layer
-@register_architecture("thinc.FeedForward.v1")
+@registry.architectures.register("thinc.FeedForward.v1")
def FeedForward(config):
layers = [make_layer(layer_cfg) for layer_cfg in config["layers"]]
model = chain(*layers)
@@ -14,7 +14,7 @@ def FeedForward(config):
return model
-@register_architecture("spacy.LayerNormalizedMaxout.v1")
+@registry.architectures.register("spacy.LayerNormalizedMaxout.v1")
def LayerNormalizedMaxout(config):
width = config["width"]
pieces = config["pieces"]
diff --git a/spacy/ml/tok2vec.py b/spacy/ml/tok2vec.py
index 0b30551b5..8f86475ef 100644
--- a/spacy/ml/tok2vec.py
+++ b/spacy/ml/tok2vec.py
@@ -6,11 +6,11 @@ from thinc.v2v import Maxout, Model
from thinc.i2v import HashEmbed, StaticVectors
from thinc.t2t import ExtractWindow
from thinc.misc import Residual, LayerNorm, FeatureExtracter
-from ..util import make_layer, register_architecture
+from ..util import make_layer, registry
from ._wire import concatenate_lists
-@register_architecture("spacy.Tok2Vec.v1")
+@registry.architectures.register("spacy.Tok2Vec.v1")
def Tok2Vec(config):
doc2feats = make_layer(config["@doc2feats"])
embed = make_layer(config["@embed"])
@@ -24,13 +24,13 @@ def Tok2Vec(config):
return tok2vec
-@register_architecture("spacy.Doc2Feats.v1")
+@registry.architectures.register("spacy.Doc2Feats.v1")
def Doc2Feats(config):
columns = config["columns"]
return FeatureExtracter(columns)
-@register_architecture("spacy.MultiHashEmbed.v1")
+@registry.architectures.register("spacy.MultiHashEmbed.v1")
def MultiHashEmbed(config):
# For backwards compatibility with models before the architecture registry,
# we have to be careful to get exactly the same model structure. One subtle
@@ -78,7 +78,7 @@ def MultiHashEmbed(config):
return layer
-@register_architecture("spacy.CharacterEmbed.v1")
+@registry.architectures.register("spacy.CharacterEmbed.v1")
def CharacterEmbed(config):
from .. import _ml
@@ -94,7 +94,7 @@ def CharacterEmbed(config):
return model
-@register_architecture("spacy.MaxoutWindowEncoder.v1")
+@registry.architectures.register("spacy.MaxoutWindowEncoder.v1")
def MaxoutWindowEncoder(config):
nO = config["width"]
nW = config["window_size"]
@@ -110,7 +110,7 @@ def MaxoutWindowEncoder(config):
return model
-@register_architecture("spacy.MishWindowEncoder.v1")
+@registry.architectures.register("spacy.MishWindowEncoder.v1")
def MishWindowEncoder(config):
from thinc.v2v import Mish
@@ -124,12 +124,12 @@ def MishWindowEncoder(config):
return model
-@register_architecture("spacy.PretrainedVectors.v1")
+@registry.architectures.register("spacy.PretrainedVectors.v1")
def PretrainedVectors(config):
return StaticVectors(config["vectors_name"], config["width"], config["column"])
-@register_architecture("spacy.TorchBiLSTMEncoder.v1")
+@registry.architectures.register("spacy.TorchBiLSTMEncoder.v1")
def TorchBiLSTMEncoder(config):
import torch.nn
from thinc.extra.wrappers import PyTorchWrapperRNN
diff --git a/spacy/tests/regression/test_issue4590.py b/spacy/tests/regression/test_issue4590.py
new file mode 100644
index 000000000..6a43dfea9
--- /dev/null
+++ b/spacy/tests/regression/test_issue4590.py
@@ -0,0 +1,34 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+from mock import Mock
+from spacy.matcher import DependencyMatcher
+from ..util import get_doc
+
+
+def test_issue4590(en_vocab):
+ """Test that matches param in on_match method are the same as matches run with no on_match method"""
+ pattern = [
+ {"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}},
+ {"SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"}, "PATTERN": {"ORTH": "fox"}},
+ {"SPEC": {"NODE_NAME": "quick", "NBOR_RELOP": ".", "NBOR_NAME": "jumped"}, "PATTERN": {"ORTH": "fox"}},
+ ]
+
+ on_match = Mock()
+
+ matcher = DependencyMatcher(en_vocab)
+ matcher.add("pattern", on_match, pattern)
+
+ text = "The quick brown fox jumped over the lazy fox"
+ heads = [3, 2, 1, 1, 0, -1, 2, 1, -3]
+ deps = ["det", "amod", "amod", "nsubj", "prep", "pobj", "det", "amod"]
+
+ doc = get_doc(en_vocab, text.split(), heads=heads, deps=deps)
+
+ matches = matcher(doc)
+
+ on_match_args = on_match.call_args
+
+ assert on_match_args[0][3] == matches
+
diff --git a/spacy/tests/test_architectures.py b/spacy/tests/test_architectures.py
new file mode 100644
index 000000000..77f1af020
--- /dev/null
+++ b/spacy/tests/test_architectures.py
@@ -0,0 +1,19 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import pytest
+from spacy import registry
+from thinc.v2v import Affine
+from catalogue import RegistryError
+
+
+@registry.architectures.register("my_test_function")
+def create_model(nr_in, nr_out):
+ return Affine(nr_in, nr_out)
+
+
+def test_get_architecture():
+ arch = registry.architectures.get("my_test_function")
+ assert arch is create_model
+ with pytest.raises(RegistryError):
+ registry.architectures.get("not_an_existing_key")
diff --git a/spacy/tests/test_register_architecture.py b/spacy/tests/test_register_architecture.py
deleted file mode 100644
index 0c1b5b16f..000000000
--- a/spacy/tests/test_register_architecture.py
+++ /dev/null
@@ -1,19 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-import pytest
-from spacy import register_architecture
-from spacy import get_architecture
-from thinc.v2v import Affine
-
-
-@register_architecture("my_test_function")
-def create_model(nr_in, nr_out):
- return Affine(nr_in, nr_out)
-
-
-def test_get_architecture():
- arch = get_architecture("my_test_function")
- assert arch is create_model
- with pytest.raises(KeyError):
- get_architecture("not_an_existing_key")
diff --git a/spacy/tests/tokenizer/test_exceptions.py b/spacy/tests/tokenizer/test_exceptions.py
index a79363abb..c2011487e 100644
--- a/spacy/tests/tokenizer/test_exceptions.py
+++ b/spacy/tests/tokenizer/test_exceptions.py
@@ -7,7 +7,7 @@ import pytest
def test_tokenizer_handles_emoticons(tokenizer):
# Tweebo challenge (CMU)
- text = """:o :/ :'( >:o (: :) >.< XD -__- o.O ;D :-) @_@ :P 8D :1 >:( :D =| ") :> ...."""
+ text = """:o :/ :'( >:o (: :) >.< XD -__- o.O ;D :-) @_@ :P 8D :1 >:( :D =| :> ...."""
tokens = tokenizer(text)
assert tokens[0].text == ":o"
assert tokens[1].text == ":/"
@@ -28,12 +28,11 @@ def test_tokenizer_handles_emoticons(tokenizer):
assert tokens[16].text == ">:("
assert tokens[17].text == ":D"
assert tokens[18].text == "=|"
- assert tokens[19].text == '")'
- assert tokens[20].text == ":>"
- assert tokens[21].text == "...."
+ assert tokens[19].text == ":>"
+ assert tokens[20].text == "...."
-@pytest.mark.parametrize("text,length", [("example:)", 3), ("108)", 2), ("XDN", 1)])
+@pytest.mark.parametrize("text,length", [("108)", 2), ("XDN", 1)])
def test_tokenizer_excludes_false_pos_emoticons(tokenizer, text, length):
tokens = tokenizer(text)
assert len(tokens) == length
diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/tokenizer/test_tokenizer.py
index 803c31abf..5ac681c5e 100644
--- a/spacy/tests/tokenizer/test_tokenizer.py
+++ b/spacy/tests/tokenizer/test_tokenizer.py
@@ -108,6 +108,12 @@ def test_tokenizer_add_special_case(tokenizer, text, tokens):
assert doc[1].text == tokens[1]["orth"]
+@pytest.mark.parametrize("text,tokens", [("lorem", [{"orth": "lo"}, {"orth": "re"}])])
+def test_tokenizer_validate_special_case(tokenizer, text, tokens):
+ with pytest.raises(ValueError):
+ tokenizer.add_special_case(text, tokens)
+
+
@pytest.mark.parametrize(
"text,tokens", [("lorem", [{"orth": "lo", "tag": "NN"}, {"orth": "rem"}])]
)
@@ -120,3 +126,18 @@ def test_tokenizer_add_special_case_tag(text, tokens):
assert doc[0].tag_ == tokens[0]["tag"]
assert doc[0].pos_ == "NOUN"
assert doc[1].text == tokens[1]["orth"]
+
+
+def test_tokenizer_special_cases_with_affixes(tokenizer):
+ text = '(((_SPECIAL_ A/B, A/B-A/B")'
+ tokenizer.add_special_case("_SPECIAL_", [{"orth": "_SPECIAL_"}])
+ tokenizer.add_special_case("A/B", [{"orth": "A/B"}])
+ doc = tokenizer(text)
+ assert [token.text for token in doc] == ["(", "(", "(", "_SPECIAL_", "A/B", ",", "A/B", "-", "A/B", '"', ")"]
+
+
+def test_tokenizer_special_cases_with_period(tokenizer):
+ text = "_SPECIAL_."
+ tokenizer.add_special_case("_SPECIAL_", [{"orth": "_SPECIAL_"}])
+ doc = tokenizer(text)
+ assert [token.text for token in doc] == ["_SPECIAL_", "."]
diff --git a/spacy/tests/tokenizer/test_urls.py b/spacy/tests/tokenizer/test_urls.py
index 0e287aada..a1017bac8 100644
--- a/spacy/tests/tokenizer/test_urls.py
+++ b/spacy/tests/tokenizer/test_urls.py
@@ -3,6 +3,8 @@ from __future__ import unicode_literals
import pytest
+from spacy.lang.tokenizer_exceptions import BASE_EXCEPTIONS
+
URLS_BASIC = [
"http://www.nytimes.com/2016/04/20/us/politics/new-york-primary-preview.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=a-lede-package-region®ion=top-news&WT.nav=top-news&_r=0",
@@ -194,7 +196,12 @@ def test_tokenizer_handles_two_prefix_url(tokenizer, prefix1, prefix2, url):
@pytest.mark.parametrize("url", URLS_FULL)
def test_tokenizer_handles_two_suffix_url(tokenizer, suffix1, suffix2, url):
tokens = tokenizer(url + suffix1 + suffix2)
- assert len(tokens) == 3
- assert tokens[0].text == url
- assert tokens[1].text == suffix1
- assert tokens[2].text == suffix2
+ if suffix1 + suffix2 in BASE_EXCEPTIONS:
+ assert len(tokens) == 2
+ assert tokens[0].text == url
+ assert tokens[1].text == suffix1 + suffix2
+ else:
+ assert len(tokens) == 3
+ assert tokens[0].text == url
+ assert tokens[1].text == suffix1
+ assert tokens[2].text == suffix2
diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd
index dadbad7bd..ba22f7782 100644
--- a/spacy/tokenizer.pxd
+++ b/spacy/tokenizer.pxd
@@ -4,10 +4,11 @@ from preshed.maps cimport PreshMap
from cymem.cymem cimport Pool
from .typedefs cimport hash_t
-from .structs cimport LexemeC, TokenC
+from .structs cimport LexemeC, SpanC, TokenC
from .strings cimport StringStore
from .tokens.doc cimport Doc
from .vocab cimport Vocab, LexemesOrTokens, _Cached
+from .matcher.phrasematcher cimport PhraseMatcher
cdef class Tokenizer:
@@ -21,15 +22,32 @@ cdef class Tokenizer:
cdef object _suffix_search
cdef object _infix_finditer
cdef object _rules
+ cdef PhraseMatcher _special_matcher
+ cdef int _property_init_count
+ cdef int _property_init_max
cpdef Doc tokens_from_list(self, list strings)
+ cdef Doc _tokenize_affixes(self, unicode string, bint with_special_cases)
+ cdef int _apply_special_cases(self, Doc doc) except -1
+ cdef void _filter_special_spans(self, vector[SpanC] &original,
+ vector[SpanC] &filtered, int doc_len) nogil
+ cdef object _prepare_special_spans(self, Doc doc,
+ vector[SpanC] &filtered)
+ cdef int _retokenize_special_spans(self, Doc doc, TokenC* tokens,
+ object span_data)
cdef int _try_cache(self, hash_t key, Doc tokens) except -1
- cdef int _tokenize(self, Doc tokens, unicode span, hash_t key) except -1
- cdef unicode _split_affixes(self, Pool mem, unicode string, vector[LexemeC*] *prefixes,
- vector[LexemeC*] *suffixes, int* has_special)
+ cdef int _try_specials(self, hash_t key, Doc tokens,
+ int* has_special) except -1
+ cdef int _tokenize(self, Doc tokens, unicode span, hash_t key,
+ int* has_special, bint with_special_cases) except -1
+ cdef unicode _split_affixes(self, Pool mem, unicode string,
+ vector[LexemeC*] *prefixes,
+ vector[LexemeC*] *suffixes, int* has_special,
+ bint with_special_cases)
cdef int _attach_tokens(self, Doc tokens, unicode string,
- vector[LexemeC*] *prefixes, vector[LexemeC*] *suffixes) except -1
-
- cdef int _save_cached(self, const TokenC* tokens, hash_t key, int has_special,
- int n) except -1
+ vector[LexemeC*] *prefixes,
+ vector[LexemeC*] *suffixes, int* has_special,
+ bint with_special_cases) except -1
+ cdef int _save_cached(self, const TokenC* tokens, hash_t key,
+ int* has_special, int n) except -1
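A rough pure-Python rendering (not part of the patch) of the overlap filtering
that `_filter_special_spans`, declared above, performs: longer special-case
spans win, and for equal length the earlier span is preferred, matching the
commit notes.

```python
def filter_special_spans(spans):
    """spans: list of (start, end) token offsets; returns the kept spans."""
    seen = set()
    kept = []
    # longest first; ties broken in favour of the earlier start
    for start, end in sorted(spans, key=lambda s: (-(s[1] - s[0]), s[0])):
        if start not in seen and (end - 1) not in seen:
            kept.append((start, end))
            seen.update(range(start, end))
    return sorted(kept)

print(filter_special_spans([(0, 2), (1, 3), (2, 4)]))  # -> [(0, 2), (2, 4)]
```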
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 262f19941..13f799f84 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -5,6 +5,8 @@ from __future__ import unicode_literals
from cython.operator cimport dereference as deref
from cython.operator cimport preincrement as preinc
+from libc.string cimport memcpy, memset
+from libcpp.set cimport set as stdset
from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
cimport cython
@@ -19,6 +21,9 @@ from .compat import unescape_unicode
from .errors import Errors, Warnings, deprecation_warning
from . import util
+from .attrs import intify_attrs
+from .lexeme cimport EMPTY_LEXEME
+from .symbols import ORTH
cdef class Tokenizer:
"""Segment text, and create Doc objects with the discovered segment
@@ -57,9 +62,10 @@ cdef class Tokenizer:
self.infix_finditer = infix_finditer
self.vocab = vocab
self._rules = {}
- if rules is not None:
- for chunk, substrings in sorted(rules.items()):
- self.add_special_case(chunk, substrings)
+ self._special_matcher = PhraseMatcher(self.vocab)
+ self._load_special_cases(rules)
+ self._property_init_count = 0
+ self._property_init_max = 4
property token_match:
def __get__(self):
@@ -67,7 +73,9 @@ cdef class Tokenizer:
def __set__(self, token_match):
self._token_match = token_match
- self._flush_cache()
+ self._reload_special_cases()
+ if self._property_init_count <= self._property_init_max:
+ self._property_init_count += 1
property prefix_search:
def __get__(self):
@@ -75,7 +83,9 @@ cdef class Tokenizer:
def __set__(self, prefix_search):
self._prefix_search = prefix_search
- self._flush_cache()
+ self._reload_special_cases()
+ if self._property_init_count <= self._property_init_max:
+ self._property_init_count += 1
property suffix_search:
def __get__(self):
@@ -83,7 +93,9 @@ cdef class Tokenizer:
def __set__(self, suffix_search):
self._suffix_search = suffix_search
- self._flush_cache()
+ self._reload_special_cases()
+ if self._property_init_count <= self._property_init_max:
+ self._property_init_count += 1
property infix_finditer:
def __get__(self):
@@ -91,7 +103,9 @@ cdef class Tokenizer:
def __set__(self, infix_finditer):
self._infix_finditer = infix_finditer
- self._flush_cache()
+ self._reload_special_cases()
+ if self._property_init_count <= self._property_init_max:
+ self._property_init_count += 1
def __reduce__(self):
args = (self.vocab,
@@ -106,7 +120,6 @@ cdef class Tokenizer:
deprecation_warning(Warnings.W002)
return Doc(self.vocab, words=strings)
- @cython.boundscheck(False)
def __call__(self, unicode string):
"""Tokenize a string.
@@ -115,6 +128,17 @@ cdef class Tokenizer:
DOCS: https://spacy.io/api/tokenizer#call
"""
+ doc = self._tokenize_affixes(string, True)
+ self._apply_special_cases(doc)
+ return doc
+
+ @cython.boundscheck(False)
+ cdef Doc _tokenize_affixes(self, unicode string, bint with_special_cases):
+ """Tokenize according to affix and token_match settings.
+
+ string (unicode): The string to tokenize.
+ RETURNS (Doc): A container for linguistic annotations.
+ """
if len(string) >= (2 ** 30):
raise ValueError(Errors.E025.format(length=len(string)))
cdef int length = len(string)
@@ -123,7 +147,9 @@ cdef class Tokenizer:
return doc
cdef int i = 0
cdef int start = 0
- cdef bint cache_hit
+ cdef int has_special = 0
+ cdef bint specials_hit = 0
+ cdef bint cache_hit = 0
cdef bint in_ws = string[0].isspace()
cdef unicode span
# The task here is much like string.split, but not quite
@@ -139,9 +165,14 @@ cdef class Tokenizer:
# we don't have to create the slice when we hit the cache.
span = string[start:i]
key = hash_string(span)
- cache_hit = self._try_cache(key, doc)
- if not cache_hit:
- self._tokenize(doc, span, key)
+ specials_hit = 0
+ cache_hit = 0
+ if with_special_cases:
+ specials_hit = self._try_specials(key, doc, &has_special)
+ if not specials_hit:
+ cache_hit = self._try_cache(key, doc)
+ if not specials_hit and not cache_hit:
+ self._tokenize(doc, span, key, &has_special, with_special_cases)
if uc == ' ':
doc.c[doc.length - 1].spacy = True
start = i + 1
@@ -152,9 +183,14 @@ cdef class Tokenizer:
if start < i:
span = string[start:]
key = hash_string(span)
- cache_hit = self._try_cache(key, doc)
- if not cache_hit:
- self._tokenize(doc, span, key)
+ specials_hit = 0
+ cache_hit = 0
+ if with_special_cases:
+ specials_hit = self._try_specials(key, doc, &has_special)
+ if not specials_hit:
+ cache_hit = self._try_cache(key, doc)
+ if not specials_hit and not cache_hit:
+ self._tokenize(doc, span, key, &has_special, with_special_cases)
doc.c[doc.length - 1].spacy = string[-1] == " " and not in_ws
return doc
@@ -174,23 +210,141 @@ cdef class Tokenizer:
yield self(text)
def _flush_cache(self):
- self._reset_cache([key for key in self._cache if not key in self._specials])
+ self._reset_cache([key for key in self._cache])
def _reset_cache(self, keys):
for k in keys:
+ cached = <_Cached*>self._cache.get(k)
del self._cache[k]
- if not k in self._specials:
- cached = <_Cached*>self._cache.get(k)
- if cached is not NULL:
- self.mem.free(cached)
+ if cached is not NULL:
+ self.mem.free(cached)
- def _reset_specials(self):
+ def _flush_specials(self):
for k in self._specials:
cached = <_Cached*>self._specials.get(k)
del self._specials[k]
if cached is not NULL:
self.mem.free(cached)
+ cdef int _apply_special_cases(self, Doc doc) except -1:
+ """Retokenize doc according to special cases.
+
+ doc (Doc): Document.
+ """
+ cdef int i
+ cdef int max_length = 0
+ cdef bint modify_in_place
+ cdef Pool mem = Pool()
+ cdef vector[SpanC] c_matches
+ cdef vector[SpanC] c_filtered
+ cdef int offset
+ cdef int modified_doc_length
+ # Find matches for special cases
+ self._special_matcher.find_matches(doc, &c_matches)
+ # Skip processing if no matches
+ if c_matches.size() == 0:
+ return True
+ self._filter_special_spans(c_matches, c_filtered, doc.length)
+ # Put span info in span.start-indexed dict and calculate maximum
+ # intermediate document size
+ (span_data, max_length, modify_in_place) = self._prepare_special_spans(doc, c_filtered)
+ # If modifications never increase doc length, can modify in place
+ if modify_in_place:
+ tokens = doc.c
+ # Otherwise create a separate array to store modified tokens
+ else:
+ tokens = mem.alloc(max_length, sizeof(TokenC))
+ # Modify tokenization according to filtered special cases
+ offset = self._retokenize_special_spans(doc, tokens, span_data)
+ # Allocate more memory for doc if needed
+ modified_doc_length = doc.length + offset
+ while modified_doc_length >= doc.max_length:
+ doc._realloc(doc.max_length * 2)
+ # If not modified in place, copy tokens back to doc
+ if not modify_in_place:
+ memcpy(doc.c, tokens, max_length * sizeof(TokenC))
+ for i in range(doc.length + offset, doc.length):
+ memset(&doc.c[i], 0, sizeof(TokenC))
+ doc.c[i].lex = &EMPTY_LEXEME
+ doc.length = doc.length + offset
+ return True
+
+ cdef void _filter_special_spans(self, vector[SpanC] &original, vector[SpanC] &filtered, int doc_len) nogil:
+
+ cdef int seen_i
+ cdef SpanC span
+ cdef stdset[int] seen_tokens
+ stdsort(original.begin(), original.end(), len_start_cmp)
+ cdef int orig_i = original.size() - 1
+ while orig_i >= 0:
+ span = original[orig_i]
+ if not seen_tokens.count(span.start) and not seen_tokens.count(span.end - 1):
+ filtered.push_back(span)
+ for seen_i in range(span.start, span.end):
+ seen_tokens.insert(seen_i)
+ orig_i -= 1
+ stdsort(filtered.begin(), filtered.end(), start_cmp)
+
+ cdef object _prepare_special_spans(self, Doc doc, vector[SpanC] &filtered):
+ spans = [doc[match.start:match.end] for match in filtered]
+ cdef bint modify_in_place = True
+ cdef int curr_length = doc.length
+ cdef int max_length
+ cdef int span_length_diff = 0
+ span_data = {}
+ for span in spans:
+ rule = self._rules.get(span.text, None)
+ span_length_diff = 0
+ if rule:
+ span_length_diff = len(rule) - (span.end - span.start)
+ if span_length_diff > 0:
+ modify_in_place = False
+ curr_length += span_length_diff
+ if curr_length > max_length:
+ max_length = curr_length
+ span_data[span.start] = (span.text, span.start, span.end, span_length_diff)
+ return (span_data, max_length, modify_in_place)
+
+ cdef int _retokenize_special_spans(self, Doc doc, TokenC* tokens, object span_data):
+ cdef int i = 0
+ cdef int j = 0
+ cdef int offset = 0
+ cdef _Cached* cached
+ cdef int idx_offset = 0
+ cdef int orig_final_spacy
+ cdef int orig_idx
+ cdef int span_start
+ cdef int span_end
+ while i < doc.length:
+ if not i in span_data:
+ tokens[i + offset] = doc.c[i]
+ i += 1
+ else:
+ span = span_data[i]
+ span_start = span[1]
+ span_end = span[2]
+ cached = <_Cached*>self._specials.get(hash_string(span[0]))
+ if cached == NULL:
+ # Copy original tokens if no rule found
+ for j in range(span_end - span_start):
+ tokens[i + offset + j] = doc.c[i + j]
+ i += span_end - span_start
+ else:
+ # Copy special case tokens into doc and adjust token and
+ # character offsets
+ idx_offset = 0
+ orig_final_spacy = doc.c[span_end + offset - 1].spacy
+ orig_idx = doc.c[i].idx
+ for j in range(cached.length):
+ tokens[i + offset + j] = cached.data.tokens[j]
+ tokens[i + offset + j].idx = orig_idx + idx_offset
+ idx_offset += cached.data.tokens[j].lex.length + \
+ 1 if cached.data.tokens[j].spacy else 0
+ tokens[i + offset + cached.length - 1].spacy = orig_final_spacy
+ i += span_end - span_start
+ offset += span[3]
+ return offset
+
cdef int _try_cache(self, hash_t key, Doc tokens) except -1:
cached = <_Cached*>self._cache.get(key)
if cached == NULL:
@@ -204,22 +358,33 @@ cdef class Tokenizer:
tokens.push_back(&cached.data.tokens[i], False)
return True
- cdef int _tokenize(self, Doc tokens, unicode span, hash_t orig_key) except -1:
+ cdef int _try_specials(self, hash_t key, Doc tokens, int* has_special) except -1:
+ cached = <_Cached*>self._specials.get(key)
+ if cached == NULL:
+ return False
+ cdef int i
+ for i in range(cached.length):
+ tokens.push_back(&cached.data.tokens[i], False)
+ has_special[0] = 1
+ return True
+
+ cdef int _tokenize(self, Doc tokens, unicode span, hash_t orig_key, int* has_special, bint with_special_cases) except -1:
cdef vector[LexemeC*] prefixes
cdef vector[LexemeC*] suffixes
cdef int orig_size
- cdef int has_special = 0
orig_size = tokens.length
span = self._split_affixes(tokens.mem, span, &prefixes, &suffixes,
- &has_special)
- self._attach_tokens(tokens, span, &prefixes, &suffixes)
+ has_special, with_special_cases)
+ self._attach_tokens(tokens, span, &prefixes, &suffixes, has_special,
+ with_special_cases)
self._save_cached(&tokens.c[orig_size], orig_key, has_special,
tokens.length - orig_size)
cdef unicode _split_affixes(self, Pool mem, unicode string,
vector[const LexemeC*] *prefixes,
vector[const LexemeC*] *suffixes,
- int* has_special):
+ int* has_special,
+ bint with_special_cases):
cdef size_t i
cdef unicode prefix
cdef unicode suffix
@@ -231,29 +396,24 @@ cdef class Tokenizer:
and not self.find_prefix(string) \
and not self.find_suffix(string):
break
- if self._specials.get(hash_string(string)) != NULL:
- has_special[0] = 1
+ if with_special_cases and self._specials.get(hash_string(string)) != NULL:
break
last_size = len(string)
pre_len = self.find_prefix(string)
if pre_len != 0:
prefix = string[:pre_len]
minus_pre = string[pre_len:]
- # Check whether we've hit a special-case
- if minus_pre and self._specials.get(hash_string(minus_pre)) != NULL:
+ if minus_pre and with_special_cases and self._specials.get(hash_string(minus_pre)) != NULL:
string = minus_pre
prefixes.push_back(self.vocab.get(mem, prefix))
- has_special[0] = 1
break
suf_len = self.find_suffix(string)
if suf_len != 0:
suffix = string[-suf_len:]
minus_suf = string[:-suf_len]
- # Check whether we've hit a special-case
- if minus_suf and (self._specials.get(hash_string(minus_suf)) != NULL):
+ if minus_suf and with_special_cases and self._specials.get(hash_string(minus_suf)) != NULL:
string = minus_suf
suffixes.push_back(self.vocab.get(mem, suffix))
- has_special[0] = 1
break
if pre_len and suf_len and (pre_len + suf_len) <= len(string):
string = string[pre_len:-suf_len]
@@ -265,15 +425,15 @@ cdef class Tokenizer:
elif suf_len:
string = minus_suf
suffixes.push_back(self.vocab.get(mem, suffix))
- if string and (self._specials.get(hash_string(string)) != NULL):
- has_special[0] = 1
- break
return string
cdef int _attach_tokens(self, Doc tokens, unicode string,
vector[const LexemeC*] *prefixes,
- vector[const LexemeC*] *suffixes) except -1:
- cdef bint cache_hit
+ vector[const LexemeC*] *suffixes,
+ int* has_special,
+ bint with_special_cases) except -1:
+ cdef bint specials_hit = 0
+ cdef bint cache_hit = 0
cdef int split, end
cdef const LexemeC* const* lexemes
cdef const LexemeC* lexeme
@@ -283,8 +443,12 @@ cdef class Tokenizer:
for i in range(prefixes.size()):
tokens.push_back(prefixes[0][i], False)
if string:
- cache_hit = self._try_cache(hash_string(string), tokens)
- if cache_hit:
+ if with_special_cases:
+ specials_hit = self._try_specials(hash_string(string), tokens,
+ has_special)
+ if not specials_hit:
+ cache_hit = self._try_cache(hash_string(string), tokens)
+ if specials_hit or cache_hit:
pass
elif self.token_match and self.token_match(string):
# We're always saying 'no' to spaces here -- the caller will
@@ -329,7 +493,7 @@ cdef class Tokenizer:
tokens.push_back(lexeme, False)
cdef int _save_cached(self, const TokenC* tokens, hash_t key,
- int has_special, int n) except -1:
+ int* has_special, int n) except -1:
cdef int i
if n <= 0:
# avoid mem alloc of zero length
@@ -338,7 +502,7 @@ cdef class Tokenizer:
if self.vocab._by_orth.get(tokens[i].lex.orth) == NULL:
return 0
# See #1250
- if has_special:
+ if has_special[0]:
return 0
cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
cached.length = n
@@ -391,10 +555,24 @@ cdef class Tokenizer:
match = self.suffix_search(string)
return (match.end() - match.start()) if match is not None else 0
- def _load_special_tokenization(self, special_cases):
+ def _load_special_cases(self, special_cases):
"""Add special-case tokenization rules."""
- for chunk, substrings in sorted(special_cases.items()):
- self.add_special_case(chunk, substrings)
+ if special_cases is not None:
+ for chunk, substrings in sorted(special_cases.items()):
+ self._validate_special_case(chunk, substrings)
+ self.add_special_case(chunk, substrings)
+
+ def _validate_special_case(self, chunk, substrings):
+ """Check whether the `ORTH` fields match the string.
+
+        chunk (unicode): The string to specially tokenize.
+ substrings (iterable): A sequence of dicts, where each dict describes
+ a token and its attributes.
+ """
+ attrs = [intify_attrs(spec, _do_deprecated=True) for spec in substrings]
+ orth = "".join([spec[ORTH] for spec in attrs])
+ if chunk != orth:
+ raise ValueError(Errors.E187.format(chunk=chunk, orth=orth, token_attrs=substrings))
def add_special_case(self, unicode string, substrings):
"""Add a special-case tokenization rule.
@@ -406,6 +584,7 @@ cdef class Tokenizer:
DOCS: https://spacy.io/api/tokenizer#add_special_case
"""
+ self._validate_special_case(string, substrings)
substrings = list(substrings)
cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
cached.length = len(substrings)
@@ -413,15 +592,25 @@ cdef class Tokenizer:
cached.data.tokens = self.vocab.make_fused_token(substrings)
key = hash_string(string)
stale_special = <_Cached*>self._specials.get(key)
- stale_cached = <_Cached*>self._cache.get(key)
- self._flush_cache()
self._specials.set(key, cached)
- self._cache.set(key, cached)
if stale_special is not NULL:
self.mem.free(stale_special)
- if stale_special != stale_cached and stale_cached is not NULL:
- self.mem.free(stale_cached)
self._rules[string] = substrings
+ self._flush_cache()
+ if self.find_prefix(string) or self.find_infix(string) or self.find_suffix(string):
+ self._special_matcher.add(string, None, self._tokenize_affixes(string, False))
+
+ def _reload_special_cases(self):
+ try:
+ self._property_init_count
+ except AttributeError:
+ return
+ # only reload if all 4 of prefix, suffix, infix, token_match have
+        # been initialized
+ if self.vocab is not None and self._property_init_count >= self._property_init_max:
+ self._flush_cache()
+ self._flush_specials()
+ self._load_special_cases(self._rules)
def to_disk(self, path, **kwargs):
"""Save the current state to a directory.
@@ -503,12 +692,9 @@ cdef class Tokenizer:
if data.get("rules"):
# make sure to hard reset the cache to remove data from the default exceptions
self._rules = {}
- self._reset_cache([key for key in self._cache])
- self._reset_specials()
- self._cache = PreshMap()
- self._specials = PreshMap()
- for string, substrings in data.get("rules", {}).items():
- self.add_special_case(string, substrings)
+ self._flush_cache()
+ self._flush_specials()
+ self._load_special_cases(data.get("rules", {}))
return self
@@ -516,3 +702,19 @@ cdef class Tokenizer:
def _get_regex_pattern(regex):
"""Get a pattern string for a regex, or None if the pattern is None."""
return None if regex is None else regex.__self__.pattern
+
+
+cdef extern from "" namespace "std" nogil:
+ void stdsort "sort"(vector[SpanC].iterator,
+ vector[SpanC].iterator,
+ bint (*)(SpanC, SpanC))
+
+
+cdef bint len_start_cmp(SpanC a, SpanC b) nogil:
+ if a.end - a.start == b.end - b.start:
+ return b.start < a.start
+ return a.end - a.start < b.end - b.start
+
+
+cdef bint start_cmp(SpanC a, SpanC b) nogil:
+ return a.start < b.start
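To make the `_validate_special_case()` change above concrete: the `ORTH` values of a special-case rule must now concatenate back to the original string, and the check runs both when rules are loaded and in `add_special_case()`. A minimal sketch of the user-facing behaviour, assuming a blank English pipeline built with the public API (the error code `E187` comes from `spacy.errors`):

```python
# Sketch only: illustrates the ORTH validation added in this patch.
import spacy
from spacy.symbols import ORTH

nlp = spacy.blank("en")

# Valid rule: "do" + "n't" reproduces the original string "don't".
nlp.tokenizer.add_special_case("don't", [{ORTH: "do"}, {ORTH: "n't"}])
print([t.text for t in nlp("I don't know")])  # ['I', 'do', "n't", 'know']

# Invalid rule: the joined ORTH values ("dont") differ from the string,
# so _validate_special_case() raises ValueError (Errors.E187).
try:
    nlp.tokenizer.add_special_case("don't", [{ORTH: "dont"}])
except ValueError as err:
    print("rejected:", err)
```

Note also that `add_special_case()` now flushes the whole cache instead of patching individual entries, and registers rules that contain prefix, infix or suffix material with the internal `_special_matcher` so they can be re-applied by `_apply_special_cases()` after affix splitting.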
diff --git a/spacy/util.py b/spacy/util.py
index f9e51f7d5..21c5ea427 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -13,6 +13,7 @@ import functools
import itertools
import numpy.random
import srsly
+import catalogue
import sys
try:
@@ -27,29 +28,20 @@ except ImportError:
from .symbols import ORTH
from .compat import cupy, CudaStream, path2str, basestring_, unicode_
-from .compat import import_file, importlib_metadata
+from .compat import import_file
from .errors import Errors, Warnings, deprecation_warning
-LANGUAGES = {}
-ARCHITECTURES = {}
_data_path = Path(__file__).parent / "data"
_PRINT_ENV = False
-# NB: Ony ever call this once! If called more than ince within the
-# function, test_issue1506 hangs and it's not 100% clear why.
-AVAILABLE_ENTRY_POINTS = importlib_metadata.entry_points()
-
-
-class ENTRY_POINTS(object):
- """Available entry points to register extensions."""
-
- factories = "spacy_factories"
- languages = "spacy_languages"
- displacy_colors = "spacy_displacy_colors"
- lookups = "spacy_lookups"
- architectures = "spacy_architectures"
+class registry(object):
+ languages = catalogue.create("spacy", "languages", entry_points=True)
+ architectures = catalogue.create("spacy", "architectures", entry_points=True)
+ lookups = catalogue.create("spacy", "lookups", entry_points=True)
+ factories = catalogue.create("spacy", "factories", entry_points=True)
+ displacy_colors = catalogue.create("spacy", "displacy_colors", entry_points=True)
def set_env_log(value):
@@ -65,8 +57,7 @@ def lang_class_is_loaded(lang):
lang (unicode): Two-letter language code, e.g. 'en'.
RETURNS (bool): Whether a Language class has been loaded.
"""
- global LANGUAGES
- return lang in LANGUAGES
+ return lang in registry.languages
def get_lang_class(lang):
@@ -75,19 +66,16 @@ def get_lang_class(lang):
lang (unicode): Two-letter language code, e.g. 'en'.
RETURNS (Language): Language class.
"""
- global LANGUAGES
- # Check if an entry point is exposed for the language code
- entry_point = get_entry_point(ENTRY_POINTS.languages, lang)
- if entry_point is not None:
- LANGUAGES[lang] = entry_point
- return entry_point
- if lang not in LANGUAGES:
+ # Check if language is registered / entry point is available
+ if lang in registry.languages:
+ return registry.languages.get(lang)
+ else:
try:
module = importlib.import_module(".lang.%s" % lang, "spacy")
except ImportError as err:
raise ImportError(Errors.E048.format(lang=lang, err=err))
- LANGUAGES[lang] = getattr(module, module.__all__[0])
- return LANGUAGES[lang]
+ set_lang_class(lang, getattr(module, module.__all__[0]))
+ return registry.languages.get(lang)
def set_lang_class(name, cls):
@@ -96,8 +84,7 @@ def set_lang_class(name, cls):
name (unicode): Name of Language class.
cls (Language): Language class.
"""
- global LANGUAGES
- LANGUAGES[name] = cls
+ registry.languages.register(name, func=cls)
def get_data_path(require_exists=True):
@@ -121,49 +108,11 @@ def set_data_path(path):
_data_path = ensure_path(path)
-def register_architecture(name, arch=None):
- """Decorator to register an architecture. An architecture is a function
- that returns a Thinc Model object.
-
- name (unicode): The name of the architecture to register.
- arch (Model): Optional architecture if function is called directly and
- not used as a decorator.
- RETURNS (callable): Function to register architecture.
- """
- global ARCHITECTURES
- if arch is not None:
- ARCHITECTURES[name] = arch
- return arch
-
- def do_registration(arch):
- ARCHITECTURES[name] = arch
- return arch
-
- return do_registration
-
-
def make_layer(arch_config):
- arch_func = get_architecture(arch_config["arch"])
+ arch_func = registry.architectures.get(arch_config["arch"])
return arch_func(arch_config["config"])
-def get_architecture(name):
- """Get a model architecture function by name. Raises a KeyError if the
- architecture is not found.
-
- name (unicode): The mame of the architecture.
- RETURNS (Model): The architecture.
- """
- # Check if an entry point is exposed for the architecture code
- entry_point = get_entry_point(ENTRY_POINTS.architectures, name)
- if entry_point is not None:
- ARCHITECTURES[name] = entry_point
- if name not in ARCHITECTURES:
- names = ", ".join(sorted(ARCHITECTURES.keys()))
- raise KeyError(Errors.E174.format(name=name, names=names))
- return ARCHITECTURES[name]
-
-
def ensure_path(path):
"""Ensure string is converted to a Path.
@@ -327,34 +276,6 @@ def get_package_path(name):
return Path(pkg.__file__).parent
-def get_entry_points(key):
- """Get registered entry points from other packages for a given key, e.g.
- 'spacy_factories' and return them as a dictionary, keyed by name.
-
- key (unicode): Entry point name.
- RETURNS (dict): Entry points, keyed by name.
- """
- result = {}
- for entry_point in AVAILABLE_ENTRY_POINTS.get(key, []):
- result[entry_point.name] = entry_point.load()
- return result
-
-
-def get_entry_point(key, value, default=None):
- """Check if registered entry point is available for a given name and
- load it. Otherwise, return None.
-
- key (unicode): Entry point name.
- value (unicode): Name of entry point to load.
- default: Optional default value to return.
- RETURNS: The loaded entry point or None.
- """
- for entry_point in AVAILABLE_ENTRY_POINTS.get(key, []):
- if entry_point.name == value:
- return entry_point.load()
- return default
-
-
def is_in_jupyter():
"""Check if user is running spaCy from a Jupyter notebook by detecting the
IPython kernel. Mainly used for the displaCy visualizer.
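To illustrate the `catalogue`-based registries above, here is a hedged sketch of registering and resolving a custom architecture through `make_layer()`. The name `my_arch` is made up for illustration, and a real architecture function would return a Thinc model rather than a dict:

```python
# Sketch only: "my_arch" is a hypothetical architecture name.
from spacy import util


@util.registry.architectures.register("my_arch")
def make_my_arch(config):
    # A real architecture would build and return a Thinc Model from `config`.
    return {"arch": "my_arch", "width": config.get("width", 96)}


assert "my_arch" in util.registry.architectures
layer = util.make_layer({"arch": "my_arch", "config": {"width": 128}})
print(layer)  # {'arch': 'my_arch', 'width': 128}
```

The same pattern replaces the old `LANGUAGES` dict: `set_lang_class()` now calls `registry.languages.register(name, func=cls)`, and `get_lang_class()` only falls back to importing `spacy.lang.<code>` when the code is not registered.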
diff --git a/website/docs/api/docbin.md b/website/docs/api/docbin.md
index 41ebb6075..9f12a07e6 100644
--- a/website/docs/api/docbin.md
+++ b/website/docs/api/docbin.md
@@ -109,8 +109,8 @@ raise an error if the pre-defined attrs of the two `DocBin`s don't match.
> doc_bin1.add(nlp("Hello world"))
> doc_bin2 = DocBin(attrs=["LEMMA", "POS"])
> doc_bin2.add(nlp("This is a sentence"))
-> merged_bins = doc_bin1.merge(doc_bin2)
-> assert len(merged_bins) == 2
+> doc_bin1.merge(doc_bin2)
+> assert len(doc_bin1) == 2
> ```
| Argument | Type | Description |
diff --git a/website/docs/usage/101/_named-entities.md b/website/docs/usage/101/_named-entities.md
index 1ecaf9fe7..0e8784187 100644
--- a/website/docs/usage/101/_named-entities.md
+++ b/website/docs/usage/101/_named-entities.md
@@ -1,5 +1,5 @@
A named entity is a "real-world object" that's assigned a name – for example, a
-person, a country, a product or a book title. spaCy can **recognize**
+person, a country, a product or a book title. spaCy can **recognize**
[various types](/api/annotation#named-entities) of named entities in a document,
by asking the model for a **prediction**. Because models are statistical and
strongly depend on the examples they were trained on, this doesn't always work
diff --git a/website/docs/usage/index.md b/website/docs/usage/index.md
index 441297813..2b0045bc3 100644
--- a/website/docs/usage/index.md
+++ b/website/docs/usage/index.md
@@ -20,6 +20,17 @@ available over [pip](https://pypi.python.org/pypi/spacy) and
> possible, the new docs also include notes on features that have changed in
> v2.0, and features that were introduced in the new version.
+
+
+We can't yet ship pre-compiled binary wheels for spaCy that work on Python 3.8,
+as we're still waiting for our CI providers and other tooling to support it.
+This means that in order to run spaCy on Python 3.8, you'll need
+[a compiler installed](#source) and to compile the library and its Cython
+dependencies locally. If this is causing problems for you, the easiest solution
+is to **use Python 3.7** in the meantime.
+
+
+
## Quickstart {hidden="true"}
import QuickstartInstall from 'widgets/quickstart-install.js'
diff --git a/website/meta/universe.json b/website/meta/universe.json
index 749abc659..40ebfaaa7 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -1861,6 +1861,30 @@
"author_links": {
"github": "microsoft"
}
+ },
+ {
+ "id": "dframcy",
+ "title": "Dframcy",
+ "slogan": "Dataframe Integration with spaCy NLP",
+ "github": "yash1994/dframcy",
+ "description": "DframCy is a light-weight utility module to integrate Pandas Dataframe to spaCy's linguistic annotation and training tasks.",
+ "pip": "dframcy",
+ "category": ["pipeline", "training"],
+ "tags": ["pandas"],
+ "code_example": [
+ "import spacy",
+ "from dframcy import DframCy",
+ "",
+ "nlp = spacy.load('en_core_web_sm')",
+ "dframcy = DframCy(nlp)",
+ "doc = dframcy.nlp(u'Apple is looking at buying U.K. startup for $1 billion')",
+ "annotation_dataframe = dframcy.to_dataframe(doc)"
+ ],
+ "author": "Yash Patadia",
+ "author_links": {
+ "twitter": "PatadiaYash",
+ "github": "yash1994"
+ }
}
],
From 44829950ba8ae019e60560701f7d62069f664cc9 Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Sat, 23 Nov 2019 14:32:15 +0100
Subject: [PATCH 023/496] Fix Example details for train CLI / pipeline
components (#4624)
* Switch to train_dataset() function in train CLI
* Fixes for pipe() methods in pipeline components
* Don't clobber `examples` variable with `as_example` in pipe() methods
* Remove unnecessary traversals of `examples`
* Update Parser.pipe() for Examples
* Add `as_examples` kwarg to `pipe()` with implementation to return
`Example`s
* Accept `Doc` or `Example` in `pipe()` with `_get_doc()` (copied from
`Pipe`)
* Fixes to Example implementation in spacy.gold
* Move `make_projective` from an attribute of Example to an argument of
`Example.get_gold_parses()`
* Heads of 0 are not treated as unset
* Unset heads are set to self rather than `None` (which causes problems
while projectivizing)
* Check for `Doc` (not just not `None`) when creating GoldParses for
pre-merged example
* Don't clobber `examples` variable in `iter_gold_docs()`
* Add/modify gold tests for handling projectivity
* In JSON roundtrip compare results from `dev_dataset` rather than
`train_dataset` to avoid projectivization (and other potential
modifications)
* Add test for projective train vs. nonprojective dev versions of the
same `Doc`
* Handle ignore_misaligned as arg rather than attr
Move `ignore_misaligned` from an attribute of `Example` to an argument
to `Example.get_gold_parses()`, which makes it parallel to
`make_projective` (see the sketch below).
Add test with old and new align that checks whether `ignore_misaligned`
errors are raised as expected (only for new align).
* Remove unused attrs from gold.pxd
Remove `ignore_misaligned` and `make_projective` from `gold.pxd`
* Refer to Example.goldparse in iter_gold_docs()
Use `Example.goldparse` in `iter_gold_docs()` instead of `Example.gold`
because a `None` `GoldParse` is generated with ignore_misaligned and
generating it on-the-fly can raise an unwanted AlignmentError
* Update test for ignore_misaligned
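A hedged, self-contained sketch of the resulting calling convention is shown below; the sentence, annotations and temporary path are illustrative only and mirror the `doc` fixture added to `spacy/tests/test_gold.py`:

```python
# Sketch only: make_projective / ignore_misaligned are now passed as
# arguments rather than stored as attributes on Example.
import tempfile
from pathlib import Path

import srsly
from spacy.gold import GoldCorpus, docs_to_json
from spacy.lang.en import English

nlp = English()
doc = nlp("I flew to London.")
tags = ["PRP", "VBD", "IN", "NNP", "."]
heads = [1, 1, 1, 2, 1]
deps = ["nsubj", "ROOT", "prep", "pobj", "punct"]
for i, token in enumerate(doc):
    token.tag_ = tags[i]
    token.dep_ = deps[i]
    token.head = doc[heads[i]]
doc.is_tagged = True
doc.is_parsed = True

jsonl_file = Path(tempfile.mkdtemp()) / "train.jsonl"
srsly.write_jsonl(jsonl_file, [docs_to_json(doc)])
goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))

# Misaligned examples are skipped instead of raising AlignmentError;
# train_dataset() projectivizes heads while dev_dataset() does not.
train_examples = list(goldcorpus.train_dataset(nlp, ignore_misaligned=True))
dev_examples = list(goldcorpus.dev_dataset(nlp, ignore_misaligned=True))

# The same flags are forwarded down to Example.get_gold_parses():
raw_example = next(iter(goldcorpus.train_examples))
parses = raw_example.get_gold_parses(
    vocab=nlp.vocab, make_projective=True, ignore_misaligned=True
)
```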
---
spacy/cli/train.py | 2 +-
spacy/gold.pxd | 2 -
spacy/gold.pyx | 61 +++++++++--------
spacy/pipeline/pipes.pyx | 36 +++++-----
spacy/syntax/nn_parser.pyx | 22 +++++--
spacy/tests/test_gold.py | 130 +++++++++++++++++++++++++++++++------
6 files changed, 179 insertions(+), 74 deletions(-)
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 622a9ca97..645d1e4d4 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -340,7 +340,7 @@ def train(
iter_since_best = 0
best_score = 0.0
for i in range(n_iter):
- train_data = corpus.train_data(
+ train_data = corpus.train_dataset(
nlp,
noise_level=noise_level,
orth_variant_level=orth_variant_level,
diff --git a/spacy/gold.pxd b/spacy/gold.pxd
index 6027d85b6..8527ba2b6 100644
--- a/spacy/gold.pxd
+++ b/spacy/gold.pxd
@@ -58,8 +58,6 @@ cdef class Example:
cdef public object doc
cdef public list token_annotations
cdef public DocAnnotation doc_annotation
- cdef public object make_projective
- cdef public object ignore_misaligned
cdef public object goldparse
diff --git a/spacy/gold.pyx b/spacy/gold.pyx
index d79bc8205..39e867a33 100644
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -311,47 +311,50 @@ class GoldCorpus(object):
ignore_misaligned=ignore_misaligned)
yield from gold_examples
- def train_dataset_without_preprocessing(self, nlp, gold_preproc=False):
- examples = self.iter_gold_docs(nlp, self.train_examples, gold_preproc=gold_preproc)
+ def train_dataset_without_preprocessing(self, nlp, gold_preproc=False,
+ ignore_misaligned=False):
+ examples = self.iter_gold_docs(nlp, self.train_examples,
+ gold_preproc=gold_preproc,
+ ignore_misaligned=ignore_misaligned)
yield from examples
def dev_dataset(self, nlp, gold_preproc=False, ignore_misaligned=False):
- examples = self.iter_gold_docs(nlp, self.dev_examples, gold_preproc=gold_preproc,
- ignore_misaligned=ignore_misaligned)
+ examples = self.iter_gold_docs(nlp, self.dev_examples,
+ gold_preproc=gold_preproc,
+ ignore_misaligned=ignore_misaligned)
yield from examples
@classmethod
def iter_gold_docs(cls, nlp, examples, gold_preproc, max_length=None,
- noise_level=0.0, orth_variant_level=0.0, make_projective=False,
- ignore_misaligned=False):
+ noise_level=0.0, orth_variant_level=0.0,
+ make_projective=False, ignore_misaligned=False):
""" Setting gold_preproc will result in creating a doc per 'sentence' """
for example in examples:
if gold_preproc:
example.doc = None
else:
example = example.merge_sents()
- example.make_projective = make_projective
- example.ignore_misaligned = ignore_misaligned
- examples = cls._make_docs(nlp, example,
+ example_docs = cls._make_docs(nlp, example,
gold_preproc, noise_level=noise_level,
orth_variant_level=orth_variant_level)
- examples = cls._make_golds(examples, vocab=nlp.vocab)
- for ex in examples:
- if ex.gold is not None:
+ example_golds = cls._make_golds(example_docs, vocab=nlp.vocab,
+ make_projective=make_projective,
+ ignore_misaligned=ignore_misaligned)
+ for ex in example_golds:
+ if ex.goldparse is not None:
if (not max_length) or len(ex.doc) < max_length:
yield ex
@classmethod
def _make_docs(cls, nlp, example, gold_preproc, noise_level=0.0, orth_variant_level=0.0):
+ var_example = make_orth_variants(nlp, example, orth_variant_level=orth_variant_level)
# gold_preproc is not used ?!
if example.text is not None:
- var_example = make_orth_variants(nlp, example, orth_variant_level=orth_variant_level)
var_text = add_noise(var_example.text, noise_level)
var_doc = nlp.make_doc(var_text)
var_example.doc = var_doc
return [var_example]
else:
- var_example = make_orth_variants(nlp, example, orth_variant_level=orth_variant_level)
doc_examples = []
for token_annotation in var_example.token_annotations:
t_doc = Doc(nlp.vocab, words=add_noise(token_annotation.words, noise_level))
@@ -362,10 +365,13 @@ class GoldCorpus(object):
return doc_examples
@classmethod
- def _make_golds(cls, examples, vocab=None):
+ def _make_golds(cls, examples, vocab=None, make_projective=False,
+ ignore_misaligned=False):
gold_examples = []
for example in examples:
- gold_parses = example.get_gold_parses(vocab=vocab)
+ gold_parses = example.get_gold_parses(vocab=vocab,
+ make_projective=make_projective,
+ ignore_misaligned=ignore_misaligned)
for (doc, gold) in gold_parses:
ex = Example(doc=doc)
ex.goldparse = gold
@@ -693,13 +699,11 @@ cdef class DocAnnotation:
cdef class Example:
def __init__(self, doc_annotation=None, token_annotations=None, doc=None,
- make_projective=False, ignore_misaligned=False, goldparse=None):
+ goldparse=None):
""" Doc can either be text, or an actual Doc """
self.doc = doc
self.doc_annotation = doc_annotation if doc_annotation else DocAnnotation()
self.token_annotations = token_annotations if token_annotations else []
- self.make_projective = make_projective
- self.ignore_misaligned = ignore_misaligned
self.goldparse = goldparse
@classmethod
@@ -760,7 +764,7 @@ cdef class Example:
m_ids.extend(id_ + i for id_ in t.ids)
m_words.extend(t.words)
m_tags.extend(t.tags)
- m_heads.extend(head + i if head else None for head in t.heads)
+ m_heads.extend(head + i if head is not None and head >= 0 else head_i + i for head_i, head in enumerate(t.heads))
m_deps.extend(t.deps)
m_ents.extend(t.entities)
m_morph.extend(t.morphology)
@@ -773,7 +777,8 @@ cdef class Example:
return m_example
- def get_gold_parses(self, merge=False, vocab=None):
+ def get_gold_parses(self, merge=False, vocab=None, make_projective=False,
+ ignore_misaligned=False):
"""Return a list of (doc, GoldParse) objects.
If merge is set to True, add all Token annotations to one big list."""
d = self.doc_annotation
@@ -788,20 +793,20 @@ cdef class Example:
raise ValueError(Errors.E998)
m_doc = Doc(vocab, words=t.words)
try:
- gp = GoldParse.from_annotation(m_doc, d, t, make_projective=self.make_projective)
+ gp = GoldParse.from_annotation(m_doc, d, t, make_projective=make_projective)
except AlignmentError:
- if self.ignore_misaligned:
+ if ignore_misaligned:
gp = None
else:
raise
return [(self.doc, gp)]
# we only have one sentence and an appropriate doc
- elif len(self.token_annotations) == 1 and self.doc is not None:
+ elif len(self.token_annotations) == 1 and isinstance(self.doc, Doc):
t = self.token_annotations[0]
try:
- gp = GoldParse.from_annotation(self.doc, d, t, make_projective=self.make_projective)
+ gp = GoldParse.from_annotation(self.doc, d, t, make_projective=make_projective)
except AlignmentError:
- if self.ignore_misaligned:
+ if ignore_misaligned:
gp = None
else:
raise
@@ -814,9 +819,9 @@ cdef class Example:
raise ValueError(Errors.E998)
t_doc = Doc(vocab, words=t.words)
try:
- gp = GoldParse.from_annotation(t_doc, d, t, make_projective=self.make_projective)
+ gp = GoldParse.from_annotation(t_doc, d, t, make_projective=make_projective)
except AlignmentError:
- if self.ignore_misaligned:
+ if ignore_misaligned:
gp = None
else:
raise
diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index 1d67d8e16..04a769b27 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -61,7 +61,7 @@ class Pipe(object):
return cls(nlp.vocab, **cfg)
def _get_doc(self, example):
- """ Use this method if the `example` method can be both a Doc or an Example """
+ """ Use this method if the `example` can be both a Doc or an Example """
if isinstance(example, Doc):
return example
return example.doc
@@ -102,7 +102,6 @@ class Pipe(object):
and `set_annotations()` methods.
"""
for examples in util.minibatch(stream, size=batch_size):
- examples = list(examples)
docs = [self._get_doc(ex) for ex in examples]
predictions = self.predict(docs)
if isinstance(predictions, tuple) and len(tuple) == 2:
@@ -112,11 +111,11 @@ class Pipe(object):
self.set_annotations(docs, predictions)
if as_example:
- examples = []
+ annotated_examples = []
for ex, doc in zip(examples, docs):
ex.doc = doc
- examples.append(ex)
- yield from examples
+ annotated_examples.append(ex)
+ yield from annotated_examples
else:
yield from docs
@@ -312,11 +311,11 @@ class Tensorizer(Pipe):
self.set_annotations(docs, tensors)
if as_example:
- examples = []
+ annotated_examples = []
for ex, doc in zip(examples, docs):
ex.doc = doc
- examples.append(ex)
- yield from examples
+ annotated_examples.append(ex)
+ yield from annotated_examples
else:
yield from docs
@@ -434,17 +433,16 @@ class Tagger(Pipe):
def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
for examples in util.minibatch(stream, size=batch_size):
- examples = list(examples)
docs = [self._get_doc(ex) for ex in examples]
tag_ids, tokvecs = self.predict(docs)
self.set_annotations(docs, tag_ids, tensors=tokvecs)
if as_example:
- examples = []
+ annotated_examples = []
for ex, doc in zip(examples, docs):
ex.doc = doc
- examples.append(ex)
- yield from examples
+ annotated_examples.append(ex)
+ yield from annotated_examples
else:
yield from docs
@@ -1000,17 +998,16 @@ class TextCategorizer(Pipe):
def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
for examples in util.minibatch(stream, size=batch_size):
- examples = list(examples)
docs = [self._get_doc(ex) for ex in examples]
scores, tensors = self.predict(docs)
self.set_annotations(docs, scores, tensors=tensors)
if as_example:
- examples = []
+ annotated_examples = []
for ex, doc in zip(examples, docs):
ex.doc = doc
- examples.append(ex)
- yield from examples
+ annotated_examples.append(ex)
+ yield from annotated_examples
else:
yield from docs
@@ -1333,17 +1330,16 @@ class EntityLinker(Pipe):
def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
for examples in util.minibatch(stream, size=batch_size):
- examples = list(examples)
docs = [self._get_doc(ex) for ex in examples]
kb_ids, tensors = self.predict(docs)
self.set_annotations(docs, kb_ids, tensors=tensors)
if as_example:
- examples = []
+ annotated_examples = []
for ex, doc in zip(examples, docs):
ex.doc = doc
- examples.append(ex)
- yield from examples
+ annotated_examples.append(ex)
+ yield from annotated_examples
else:
yield from docs
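As a quick illustration of the `as_example` round-trip now supported by the `pipe()` methods above, here is a hedged sketch using an untrained tagger (so the predicted tags are arbitrary); it only demonstrates the input and output types:

```python
# Sketch only: pipe() accepts Docs or Examples and, with as_example=True,
# yields the input Examples back with their .doc annotated in place.
from spacy.gold import Example
from spacy.lang.en import English

nlp = English()
tagger = nlp.create_pipe("tagger")
tagger.begin_training()  # untrained model: predictions are arbitrary

examples = [Example(doc=nlp.make_doc(text))
            for text in ("I like cats.", "Dogs bark.")]

for ex in tagger.pipe(examples, as_example=True):
    assert isinstance(ex, Example)
    print([t.tag_ for t in ex.doc])

# With as_example=False (the default), plain Docs are yielded instead.
docs = list(tagger.pipe([nlp.make_doc("Birds fly.")]))
```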
diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index 8fec87c50..073851d8a 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -227,7 +227,8 @@ cdef class Parser:
self.set_annotations([doc], states, tensors=None)
return doc
- def pipe(self, docs, int batch_size=256, int n_threads=-1, beam_width=None):
+ def pipe(self, docs, int batch_size=256, int n_threads=-1, beam_width=None,
+ as_example=False):
"""Process a stream of documents.
stream: The sequence of documents to process.
@@ -240,14 +241,21 @@ cdef class Parser:
cdef Doc doc
for batch in util.minibatch(docs, size=batch_size):
batch_in_order = list(batch)
- by_length = sorted(batch_in_order, key=lambda doc: len(doc))
+ docs = [self._get_doc(ex) for ex in batch_in_order]
+ by_length = sorted(docs, key=lambda doc: len(doc))
for subbatch in util.minibatch(by_length, size=max(batch_size//4, 2)):
subbatch = list(subbatch)
parse_states = self.predict(subbatch, beam_width=beam_width,
beam_density=beam_density)
self.set_annotations(subbatch, parse_states, tensors=None)
- for doc in batch_in_order:
- yield doc
+ if as_example:
+ annotated_examples = []
+ for ex, doc in zip(batch_in_order, docs):
+ ex.doc = doc
+ annotated_examples.append(ex)
+ yield from annotated_examples
+ else:
+ yield from batch_in_order
def require_model(self):
"""Raise an error if the component's model is not initialized."""
@@ -635,6 +643,12 @@ cdef class Parser:
self.cfg.update(cfg)
return sgd
+ def _get_doc(self, example):
+ """ Use this method if the `example` can be both a Doc or an Example """
+ if isinstance(example, Doc):
+ return example
+ return example.doc
+
def to_disk(self, path, exclude=tuple(), **kwargs):
serializers = {
'model': lambda p: (self.model.to_disk(p) if self.model is not True else True),
diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py
index c1bdfcc4d..b43eb3431 100644
--- a/spacy/tests/test_gold.py
+++ b/spacy/tests/test_gold.py
@@ -1,16 +1,40 @@
# coding: utf-8
from __future__ import unicode_literals
+import spacy
+from spacy.errors import AlignmentError
from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags, Example, DocAnnotation
from spacy.gold import spans_from_biluo_tags, GoldParse, iob_to_biluo
from spacy.gold import GoldCorpus, docs_to_json, align
from spacy.lang.en import English
+from spacy.syntax.nonproj import is_nonproj_tree
from spacy.tokens import Doc
from spacy.util import compounding, minibatch
from .util import make_tempdir
import pytest
import srsly
+@pytest.fixture
+def doc():
+ text = "Sarah's sister flew to Silicon Valley via London."
+ tags = ['NNP', 'POS', 'NN', 'VBD', 'IN', 'NNP', 'NNP', 'IN', 'NNP', '.']
+ # head of '.' is intentionally nonprojective for testing
+ heads = [2, 0, 3, 3, 3, 6, 4, 3, 7, 5]
+ deps = ['poss', 'case', 'nsubj', 'ROOT', 'prep', 'compound', 'pobj', 'prep', 'pobj', 'punct']
+ biluo_tags = ["U-PERSON", "O", "O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
+ cats = {"TRAVEL": 1.0, "BAKING": 0.0}
+ nlp = English()
+ doc = nlp(text)
+ for i in range(len(tags)):
+ doc[i].tag_ = tags[i]
+ doc[i].dep_ = deps[i]
+ doc[i].head = doc[heads[i]]
+ doc.ents = spans_from_biluo_tags(doc, biluo_tags)
+ doc.cats = cats
+ doc.is_tagged = True
+ doc.is_parsed = True
+ return doc
+
def test_gold_biluo_U(en_vocab):
words = ["I", "flew", "to", "London", "."]
@@ -98,23 +122,14 @@ def test_iob_to_biluo():
iob_to_biluo(bad_iob)
-def test_roundtrip_docs_to_json():
- text = "I flew to Silicon Valley via London."
- tags = ["PRP", "VBD", "IN", "NNP", "NNP", "IN", "NNP", "."]
- heads = [1, 1, 1, 4, 2, 1, 5, 1]
- deps = ["nsubj", "ROOT", "prep", "compound", "pobj", "prep", "pobj", "punct"]
- biluo_tags = ["O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
- cats = {"TRAVEL": 1.0, "BAKING": 0.0}
+def test_roundtrip_docs_to_json(doc):
nlp = English()
- doc = nlp(text)
- for i in range(len(tags)):
- doc[i].tag_ = tags[i]
- doc[i].dep_ = deps[i]
- doc[i].head = doc[heads[i]]
- doc.ents = spans_from_biluo_tags(doc, biluo_tags)
- doc.cats = cats
- doc.is_tagged = True
- doc.is_parsed = True
+ text = doc.text
+ tags = [t.tag_ for t in doc]
+ deps = [t.dep_ for t in doc]
+ heads = [t.head.i for t in doc]
+ biluo_tags = iob_to_biluo([t.ent_iob_ + "-" + t.ent_type_ if t.ent_type_ else "O" for t in doc])
+ cats = doc.cats
# roundtrip to JSON
with make_tempdir() as tmpdir:
@@ -122,7 +137,7 @@ def test_roundtrip_docs_to_json():
srsly.write_json(json_file, [docs_to_json(doc)])
goldcorpus = GoldCorpus(train=str(json_file), dev=str(json_file))
- reloaded_example = next(goldcorpus.train_dataset(nlp))
+ reloaded_example = next(goldcorpus.dev_dataset(nlp))
goldparse = reloaded_example.gold
assert len(doc) == goldcorpus.count_train()
@@ -142,7 +157,7 @@ def test_roundtrip_docs_to_json():
srsly.write_jsonl(jsonl_file, [docs_to_json(doc)])
goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))
- reloaded_example = next(goldcorpus.train_dataset(nlp))
+ reloaded_example = next(goldcorpus.dev_dataset(nlp))
goldparse = reloaded_example.gold
assert len(doc) == goldcorpus.count_train()
@@ -166,7 +181,7 @@ def test_roundtrip_docs_to_json():
srsly.write_jsonl(jsonl_file, goldcorpus.train_examples)
goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))
- reloaded_example = next(goldcorpus.train_dataset(nlp))
+ reloaded_example = next(goldcorpus.dev_dataset(nlp))
goldparse = reloaded_example.gold
assert len(doc) == goldcorpus.count_train()
@@ -181,6 +196,83 @@ def test_roundtrip_docs_to_json():
assert cats["BAKING"] == goldparse.cats["BAKING"]
+def test_projective_train_vs_nonprojective_dev(doc):
+ nlp = English()
+ text = doc.text
+ deps = [t.dep_ for t in doc]
+ heads = [t.head.i for t in doc]
+
+ with make_tempdir() as tmpdir:
+ jsonl_file = tmpdir / "test.jsonl"
+ # write to JSONL train dicts
+ srsly.write_jsonl(jsonl_file, [docs_to_json(doc)])
+ goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))
+
+ train_reloaded_example = next(goldcorpus.train_dataset(nlp))
+ train_goldparse = train_reloaded_example.gold
+
+ dev_reloaded_example = next(goldcorpus.dev_dataset(nlp))
+ dev_goldparse = dev_reloaded_example.gold
+
+ assert is_nonproj_tree([t.head.i for t in doc]) is True
+ assert is_nonproj_tree(train_goldparse.heads) is False
+ assert heads[:-1] == train_goldparse.heads[:-1]
+ assert heads[-1] != train_goldparse.heads[-1]
+ assert deps[:-1] == train_goldparse.labels[:-1]
+ assert deps[-1] != train_goldparse.labels[-1]
+
+ assert heads == dev_goldparse.heads
+ assert deps == dev_goldparse.labels
+
+
+def test_ignore_misaligned(doc):
+ nlp = English()
+ text = doc.text
+ deps = [t.dep_ for t in doc]
+ heads = [t.head.i for t in doc]
+
+ use_new_align = spacy.gold.USE_NEW_ALIGN
+
+ spacy.gold.USE_NEW_ALIGN = False
+ with make_tempdir() as tmpdir:
+ jsonl_file = tmpdir / "test.jsonl"
+ data = [docs_to_json(doc)]
+ data[0]["paragraphs"][0]["raw"] = text.replace("Sarah", "Jane")
+ # write to JSONL train dicts
+ srsly.write_jsonl(jsonl_file, data)
+ goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))
+
+ train_reloaded_example = next(goldcorpus.train_dataset(nlp))
+
+ spacy.gold.USE_NEW_ALIGN = True
+ with make_tempdir() as tmpdir:
+ jsonl_file = tmpdir / "test.jsonl"
+ data = [docs_to_json(doc)]
+ data[0]["paragraphs"][0]["raw"] = text.replace("Sarah", "Jane")
+ # write to JSONL train dicts
+ srsly.write_jsonl(jsonl_file, data)
+ goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))
+
+ with pytest.raises(AlignmentError):
+ train_reloaded_example = next(goldcorpus.train_dataset(nlp))
+
+ with make_tempdir() as tmpdir:
+ jsonl_file = tmpdir / "test.jsonl"
+ data = [docs_to_json(doc)]
+ data[0]["paragraphs"][0]["raw"] = text.replace("Sarah", "Jane")
+ # write to JSONL train dicts
+ srsly.write_jsonl(jsonl_file, data)
+ goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))
+
+ # doesn't raise an AlignmentError, but there is nothing to iterate over
+ # because the only example can't be aligned
+ train_reloaded_example = list(goldcorpus.train_dataset(nlp,
+ ignore_misaligned=True))
+ assert len(train_reloaded_example) == 0
+
+ spacy.gold.USE_NEW_ALIGN = use_new_align
+
+
# xfail while we have backwards-compatible alignment
@pytest.mark.xfail
@pytest.mark.parametrize(
From 392c4880d9b52c045990d9916a7ed8081573b80b Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Mon, 25 Nov 2019 16:03:28 +0100
Subject: [PATCH 024/496] Restructure Example with merged sents as default
(#4632)
* Switch to train_dataset() function in train CLI
* Fixes for pipe() methods in pipeline components
* Don't clobber `examples` variable with `as_example` in pipe() methods
* Remove unnecessary traversals of `examples`
* Update Parser.pipe() for Examples
* Add `as_examples` kwarg to `pipe()` with implementation to return
`Example`s
* Accept `Doc` or `Example` in `pipe()` with `_get_doc()` (copied from
`Pipe`)
* Fixes to Example implementation in spacy.gold
* Move `make_projective` from an attribute of Example to an argument of
`Example.get_gold_parses()`
* Heads of 0 are not treated as unset
* Unset heads are set to self rather than `None` (which causes problems
while projectivizing)
* Check for `Doc` (not just not `None`) when creating GoldParses for
pre-merged example
* Don't clobber `examples` variable in `iter_gold_docs()`
* Add/modify gold tests for handling projectivity
* In JSON roundtrip compare results from `dev_dataset` rather than
`train_dataset` to avoid projectivization (and other potential
modifications)
* Add test for projective train vs. nonprojective dev versions of the
same `Doc`
* Handle ignore_misaligned as arg rather than attr
Move `ignore_misaligned` from an attribute of `Example` to an argument
to `Example.get_gold_parses()`, which makes it parallel to
`make_projective`.
Add test with old and new align that checks whether `ignore_misaligned`
errors are raised as expected (only for new align).
* Remove unused attrs from gold.pxd
Remove `ignore_misaligned` and `make_projective` from `gold.pxd`
* Restructure Example with merged sents as default
An `Example` now includes a single `TokenAnnotation` that includes all
the information from one `Doc` (=JSON `paragraph`). If required, the
individual sentences can be returned as a list of examples with
`Example.split_sents()`, though the split examples have no raw text (see the sketch below).
* Input/output a single `Example.token_annotation`
* Add `sent_starts` to `TokenAnnotation` to handle sentence boundaries
* Replace `Example.merge_sents()` with `Example.split_sents()`
* Modify components to use a single `Example.token_annotation`
* Pipeline components
* conllu2json converter
* Rework/rename `add_token_annotation()` and `add_doc_annotation()` to
`set_token_annotation()` and `set_doc_annotation()`, functions that set
rather than append/extend.
* Rename `morphology` to `morphs` in `TokenAnnotation` and `GoldParse`
* Add getters to `TokenAnnotation` to supply default values when a given
attribute is not available
* `Example.get_gold_parses()` in `spacy.gold._make_golds()` is only
applied on single examples, so the `GoldParse` is returned saved in the
provided `Example` rather than creating a new `Example` with no other
internal annotation
* Update tests for API changes and `merge_sents()` vs. `split_sents()`
* Refer to Example.goldparse in iter_gold_docs()
Use `Example.goldparse` in `iter_gold_docs()` instead of `Example.gold`
because a `None` `GoldParse` is generated with ignore_misaligned and
generating it on-the-fly can raise an unwanted AlignmentError
* Fix make_orth_variants()
Fix bug in make_orth_variants() related to conversion from multiple to
one TokenAnnotation per Example.
* Add basic test for make_orth_variants()
* Replace try/except with conditionals
* Replace default morph value with set
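A hedged sketch of the restructured API, using only the pieces visible in this patch (`set_token_annotation()`, the new `sent_starts` field and the `get_*()` getters with their defaults); the sentences and annotations are illustrative:

```python
# Sketch only: one Example now carries a single TokenAnnotation for the whole
# paragraph, with sentence boundaries recorded in sent_starts.
from spacy.gold import Example

example = Example(doc="Hello world. Goodbye world.")
example.set_token_annotation(
    ids=[0, 1, 2, 3, 4, 5],
    words=["Hello", "world", ".", "Goodbye", "world", "."],
    tags=["UH", "NN", ".", "UH", "NN", "."],
    heads=[1, 1, 1, 4, 4, 4],
    deps=["intj", "ROOT", "punct", "intj", "ROOT", "punct"],
    sent_starts=[True, False, False, True, False, False],
)

ta = example.token_annotation
assert ta.get_word(1) == "world"
assert ta.get_sent_start(3) is True
assert ta.get_morph(0) == set()  # default when no morphs were provided
assert ta.get_dep(99) == ""      # getters fall back to defaults out of range

# With gold_preproc, the corpus reader now calls Example.split_sents() to get
# one Example per sentence instead of merging sentences up front.
print(len(example.split_sents()))  # expected: 2
```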
---
spacy/cli/converters/conllu2json.py | 23 +-
spacy/gold.pxd | 7 +-
spacy/gold.pyx | 433 ++++++++++--------
spacy/language.py | 5 +-
spacy/pipeline/pipes.pyx | 20 +-
spacy/scorer.py | 2 +-
spacy/syntax/arc_eager.pyx | 26 +-
spacy/syntax/ner.pyx | 11 +-
spacy/syntax/nonproj.pyx | 66 ++-
spacy/tests/regression/test_issue1501-2000.py | 2 +-
spacy/tests/test_gold.py | 107 +++--
spacy/tests/test_scorer.py | 4 +-
12 files changed, 376 insertions(+), 330 deletions(-)
diff --git a/spacy/cli/converters/conllu2json.py b/spacy/cli/converters/conllu2json.py
index 43216c943..ff720f4bf 100644
--- a/spacy/cli/converters/conllu2json.py
+++ b/spacy/cli/converters/conllu2json.py
@@ -24,17 +24,16 @@ def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None, **_):
checked_for_ner = False
has_ner_tags = False
for i, example in enumerate(conll_data):
- for token_annotation in example.token_annotations:
- if not checked_for_ner:
- has_ner_tags = is_ner(token_annotation.entities[0])
- checked_for_ner = True
- sentences.append(generate_sentence(token_annotation, has_ner_tags))
- # Real-sized documents could be extracted using the comments on the
- # conluu document
- if len(sentences) % n_sents == 0:
- doc = create_doc(sentences, i)
- docs.append(doc)
- sentences = []
+ if not checked_for_ner:
+ has_ner_tags = is_ner(example.token_annotation.entities[0])
+ checked_for_ner = True
+ sentences.append(generate_sentence(example.token_annotation, has_ner_tags))
+ # Real-sized documents could be extracted using the comments on the
+ # conllu document
+ if len(sentences) % n_sents == 0:
+ doc = create_doc(sentences, i)
+ docs.append(doc)
+ sentences = []
return docs
@@ -84,7 +83,7 @@ def read_conllx(input_data, use_morphology=False, n=0):
print(line)
raise
example = Example(doc=None)
- example.add_token_annotation(ids=ids, words=words, tags=tags,
+ example.set_token_annotation(ids=ids, words=words, tags=tags,
heads=heads, deps=deps, entities=ents)
yield example
i += 1
diff --git a/spacy/gold.pxd b/spacy/gold.pxd
index 8527ba2b6..247ff8aa1 100644
--- a/spacy/gold.pxd
+++ b/spacy/gold.pxd
@@ -25,7 +25,7 @@ cdef class GoldParse:
cdef public int loss
cdef public list words
cdef public list tags
- cdef public list morphology
+ cdef public list morphs
cdef public list heads
cdef public list labels
cdef public dict orths
@@ -45,7 +45,8 @@ cdef class TokenAnnotation:
cdef public list heads
cdef public list deps
cdef public list entities
- cdef public list morphology
+ cdef public list morphs
+ cdef public list sent_starts
cdef public list brackets
@@ -56,7 +57,7 @@ cdef class DocAnnotation:
cdef class Example:
cdef public object doc
- cdef public list token_annotations
+ cdef public TokenAnnotation token_annotation
cdef public DocAnnotation doc_annotation
cdef public object goldparse
diff --git a/spacy/gold.pyx b/spacy/gold.pyx
index 39e867a33..0659ddd02 100644
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -215,7 +215,7 @@ class GoldCorpus(object):
ex_dict = example.to_dict()
text = example.text
srsly.write_msgpack(directory / "{}.msg".format(i), (text, ex_dict))
- n += len(example.token_annotations)
+ n += 1
if limit and n >= limit:
break
@@ -271,7 +271,7 @@ class GoldCorpus(object):
raise ValueError(Errors.E124.format(path=path2str(loc), formats=supported))
for example in examples:
yield example
- i += len(example.token_annotations)
+ i += 1
if limit and i >= limit:
return
@@ -286,15 +286,14 @@ class GoldCorpus(object):
yield from self.read_examples(locs, limit=self.limit)
def count_train(self):
- # TODO: should this count words or sentences ?
+ """Returns count of words in train examples"""
n = 0
i = 0
for example in self.train_examples:
- for token_annotation in example.token_annotations:
- n += len(token_annotation.words)
- if self.limit and i >= self.limit:
- break
- i += 1
+ n += len(example.token_annotation.words)
+ if self.limit and i >= self.limit:
+ break
+ i += 1
return n
def train_dataset(self, nlp, gold_preproc=False, max_length=None,
@@ -328,18 +327,27 @@ class GoldCorpus(object):
def iter_gold_docs(cls, nlp, examples, gold_preproc, max_length=None,
noise_level=0.0, orth_variant_level=0.0,
make_projective=False, ignore_misaligned=False):
- """ Setting gold_preproc will result in creating a doc per 'sentence' """
+ """ Setting gold_preproc will result in creating a doc per sentence """
for example in examples:
if gold_preproc:
example.doc = None
+ split_examples = example.split_sents()
+ example_golds = []
+ for split_example in split_examples:
+ split_example_docs = cls._make_docs(nlp, split_example,
+ gold_preproc, noise_level=noise_level,
+ orth_variant_level=orth_variant_level)
+ split_example_golds = cls._make_golds(split_example_docs,
+ vocab=nlp.vocab, make_projective=make_projective,
+ ignore_misaligned=ignore_misaligned)
+ example_golds.extend(split_example_golds)
else:
- example = example.merge_sents()
- example_docs = cls._make_docs(nlp, example,
- gold_preproc, noise_level=noise_level,
- orth_variant_level=orth_variant_level)
- example_golds = cls._make_golds(example_docs, vocab=nlp.vocab,
- make_projective=make_projective,
- ignore_misaligned=ignore_misaligned)
+ example_docs = cls._make_docs(nlp, example,
+ gold_preproc, noise_level=noise_level,
+ orth_variant_level=orth_variant_level)
+ example_golds = cls._make_golds(example_docs, vocab=nlp.vocab,
+ make_projective=make_projective,
+ ignore_misaligned=ignore_misaligned)
for ex in example_golds:
if ex.goldparse is not None:
if (not max_length) or len(ex.doc) < max_length:
@@ -353,35 +361,28 @@ class GoldCorpus(object):
var_text = add_noise(var_example.text, noise_level)
var_doc = nlp.make_doc(var_text)
var_example.doc = var_doc
- return [var_example]
else:
- doc_examples = []
- for token_annotation in var_example.token_annotations:
- t_doc = Doc(nlp.vocab, words=add_noise(token_annotation.words, noise_level))
- doc_example = Example(doc_annotation=example.doc_annotation,
- token_annotations=[token_annotation],
- doc=t_doc)
- doc_examples.append(doc_example)
- return doc_examples
+ var_doc = Doc(nlp.vocab, words=add_noise(var_example.token_annotation.words, noise_level))
+ var_example.doc = var_doc
+ return [var_example]
@classmethod
def _make_golds(cls, examples, vocab=None, make_projective=False,
ignore_misaligned=False):
- gold_examples = []
for example in examples:
gold_parses = example.get_gold_parses(vocab=vocab,
make_projective=make_projective,
ignore_misaligned=ignore_misaligned)
- for (doc, gold) in gold_parses:
- ex = Example(doc=doc)
- ex.goldparse = gold
- gold_examples.append(ex)
- return gold_examples
+ assert len(gold_parses) == 1
+ assert gold_parses[0][0] == example.doc
+ example.goldparse = gold_parses[0][1]
+ return examples
+
def make_orth_variants(nlp, example, orth_variant_level=0.0):
if random.random() >= orth_variant_level:
return example
- if not example.token_annotations:
+ if not example.token_annotation:
return example
raw = example.text
if random.random() >= 0.5:
@@ -392,46 +393,46 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0):
ndpv = nlp.Defaults.paired_orth_variants
# modify words in paragraph_tuples
variant_example = Example(doc=raw)
- for token_annotation in example.token_annotations:
- words = token_annotation.words
- tags = token_annotation.tags
- if not words or not tags:
- # add the unmodified annotation
- token_dict = token_annotation.to_dict()
- variant_example.add_token_annotation(**token_dict)
- else:
- if lower:
- words = [w.lower() for w in words]
- # single variants
- punct_choices = [random.choice(x["variants"]) for x in ndsv]
- for word_idx in range(len(words)):
- for punct_idx in range(len(ndsv)):
- if tags[word_idx] in ndsv[punct_idx]["tags"] \
- and words[word_idx] in ndsv[punct_idx]["variants"]:
- words[word_idx] = punct_choices[punct_idx]
- # paired variants
- punct_choices = [random.choice(x["variants"]) for x in ndpv]
- for word_idx in range(len(words)):
- for punct_idx in range(len(ndpv)):
- if tags[word_idx] in ndpv[punct_idx]["tags"] \
- and words[word_idx] in itertools.chain.from_iterable(ndpv[punct_idx]["variants"]):
- # backup option: random left vs. right from pair
- pair_idx = random.choice([0, 1])
- # best option: rely on paired POS tags like `` / ''
- if len(ndpv[punct_idx]["tags"]) == 2:
- pair_idx = ndpv[punct_idx]["tags"].index(tags[word_idx])
- # next best option: rely on position in variants
- # (may not be unambiguous, so order of variants matters)
- else:
- for pair in ndpv[punct_idx]["variants"]:
- if words[word_idx] in pair:
- pair_idx = pair.index(words[word_idx])
- words[word_idx] = punct_choices[punct_idx][pair_idx]
+ token_annotation = example.token_annotation
+ words = token_annotation.words
+ tags = token_annotation.tags
+ if not words or not tags:
+ # add the unmodified annotation
+ token_dict = token_annotation.to_dict()
+ variant_example.set_token_annotation(**token_dict)
+ else:
+ if lower:
+ words = [w.lower() for w in words]
+ # single variants
+ punct_choices = [random.choice(x["variants"]) for x in ndsv]
+ for word_idx in range(len(words)):
+ for punct_idx in range(len(ndsv)):
+ if tags[word_idx] in ndsv[punct_idx]["tags"] \
+ and words[word_idx] in ndsv[punct_idx]["variants"]:
+ words[word_idx] = punct_choices[punct_idx]
+ # paired variants
+ punct_choices = [random.choice(x["variants"]) for x in ndpv]
+ for word_idx in range(len(words)):
+ for punct_idx in range(len(ndpv)):
+ if tags[word_idx] in ndpv[punct_idx]["tags"] \
+ and words[word_idx] in itertools.chain.from_iterable(ndpv[punct_idx]["variants"]):
+ # backup option: random left vs. right from pair
+ pair_idx = random.choice([0, 1])
+ # best option: rely on paired POS tags like `` / ''
+ if len(ndpv[punct_idx]["tags"]) == 2:
+ pair_idx = ndpv[punct_idx]["tags"].index(tags[word_idx])
+ # next best option: rely on position in variants
+ # (may not be unambiguous, so order of variants matters)
+ else:
+ for pair in ndpv[punct_idx]["variants"]:
+ if words[word_idx] in pair:
+ pair_idx = pair.index(words[word_idx])
+ words[word_idx] = punct_choices[punct_idx][pair_idx]
- token_dict = token_annotation.to_dict()
- token_dict["words"] = words
- token_dict["tags"] = tags
- variant_example.add_token_annotation(**token_dict)
+ token_dict = token_annotation.to_dict()
+ token_dict["words"] = words
+ token_dict["tags"] = tags
+ variant_example.set_token_annotation(**token_dict)
# modify raw to match variant_paragraph_tuples
if raw is not None:
variants = []
@@ -449,30 +450,29 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0):
while raw_idx < len(raw) and re.match("\s", raw[raw_idx]):
variant_raw += raw[raw_idx]
raw_idx += 1
- for token_annotation in variant_example.token_annotations:
- for word in token_annotation.words:
- match_found = False
- # add identical word
- if word not in variants and raw[raw_idx:].startswith(word):
- variant_raw += word
- raw_idx += len(word)
- match_found = True
- # add variant word
- else:
- for variant in variants:
- if not match_found and \
- raw[raw_idx:].startswith(variant):
- raw_idx += len(variant)
- variant_raw += word
- match_found = True
- # something went wrong, abort
- # (add a warning message?)
- if not match_found:
- return example
- # add following whitespace
- while raw_idx < len(raw) and re.match("\s", raw[raw_idx]):
- variant_raw += raw[raw_idx]
- raw_idx += 1
+ for word in variant_example.token_annotation.words:
+ match_found = False
+ # add identical word
+ if word not in variants and raw[raw_idx:].startswith(word):
+ variant_raw += word
+ raw_idx += len(word)
+ match_found = True
+ # add variant word
+ else:
+ for variant in variants:
+ if not match_found and \
+ raw[raw_idx:].startswith(variant):
+ raw_idx += len(variant)
+ variant_raw += word
+ match_found = True
+ # something went wrong, abort
+ # (add a warning message?)
+ if not match_found:
+ return example
+ # add following whitespace
+ while raw_idx < len(raw) and re.match("\s", raw[raw_idx]):
+ variant_raw += raw[raw_idx]
+ raw_idx += 1
variant_example.doc = variant_raw
return variant_example
return variant_example
@@ -521,30 +521,43 @@ def json_to_examples(doc):
paragraphs = []
for paragraph in doc["paragraphs"]:
example = Example(doc=paragraph.get("raw", None))
+ words = []
+ ids = []
+ tags = []
+ heads = []
+ labels = []
+ ner = []
+ morphs = []
+ sent_starts = []
+ brackets = []
for sent in paragraph["sentences"]:
- words = []
- ids = []
- tags = []
- heads = []
- labels = []
- ner = []
+ sent_start_i = len(words)
for i, token in enumerate(sent["tokens"]):
words.append(token["orth"])
- ids.append(i)
+ ids.append(token.get('id', sent_start_i + i))
tags.append(token.get('tag', "-"))
- heads.append(token.get("head", 0) + i)
+ heads.append(token.get("head", 0) + sent_start_i + i)
labels.append(token.get("dep", ""))
# Ensure ROOT label is case-insensitive
if labels[-1].lower() == "root":
labels[-1] = "ROOT"
ner.append(token.get("ner", "-"))
- example.add_token_annotation(ids=ids, words=words, tags=tags,
- heads=heads, deps=labels, entities=ner,
- brackets=sent.get("brackets", []))
+ morphs.append(token.get("morph", {}))
+ if i == 0:
+ sent_starts.append(True)
+ else:
+ sent_starts.append(False)
+ if "brackets" in sent:
+ brackets.extend((b["first"] + sent_start_i,
+ b["last"] + sent_start_i, b["label"])
+ for b in sent["brackets"])
cats = {}
for cat in paragraph.get("cats", {}):
cats[cat["label"]] = cat["value"]
- example.add_doc_annotation(cats=cats)
+ example.set_token_annotation(ids=ids, words=words, tags=tags,
+ heads=heads, deps=labels, entities=ner, morphs=morphs,
+ sent_starts=sent_starts, brackets=brackets)
+ example.set_doc_annotation(cats=cats)
yield example
@@ -652,15 +665,16 @@ def _consume_ent(tags):
cdef class TokenAnnotation:
- def __init__(self, ids=None, words=None, tags=None, heads=None, deps=None, entities=None, morphology=None, brackets=None):
+ def __init__(self, ids=None, words=None, tags=None, heads=None, deps=None, entities=None, morphs=None, sent_starts=None, brackets=None):
self.ids = ids if ids else []
self.words = words if words else []
self.tags = tags if tags else []
self.heads = heads if heads else []
self.deps = deps if deps else []
self.entities = entities if entities else []
+ self.morphs = morphs if morphs else []
+ self.sent_starts = sent_starts if sent_starts else []
self.brackets = brackets if brackets else []
- self.morphology = morphology if morphology else []
@classmethod
def from_dict(cls, token_dict):
@@ -670,7 +684,8 @@ cdef class TokenAnnotation:
heads=token_dict.get("heads", None),
deps=token_dict.get("deps", None),
entities=token_dict.get("entities", None),
- morphology=token_dict.get("morphology", None),
+ morphs=token_dict.get("morphs", None),
+ sent_starts=token_dict.get("sent_starts", None),
brackets=token_dict.get("brackets", None))
def to_dict(self):
@@ -680,9 +695,34 @@ cdef class TokenAnnotation:
"heads": self.heads,
"deps": self.deps,
"entities": self.entities,
- "morphology": self.morphology,
+ "morphs": self.morphs,
+ "sent_starts": self.sent_starts,
"brackets": self.brackets}
+ def get_id(self, i):
+ return self.ids[i] if i < len(self.ids) else i
+
+ def get_word(self, i):
+ return self.words[i] if i < len(self.words) else ""
+
+ def get_tag(self, i):
+ return self.tags[i] if i < len(self.tags) else "-"
+
+ def get_head(self, i):
+ return self.heads[i] if i < len(self.heads) else i
+
+ def get_dep(self, i):
+ return self.deps[i] if i < len(self.deps) else ""
+
+ def get_entity(self, i):
+ return self.entities[i] if i < len(self.entities) else "-"
+
+ def get_morph(self, i):
+ return self.morphs[i] if i < len(self.morphs) else set()
+
+ def get_sent_start(self, i):
+ return self.sent_starts[i] if i < len(self.sent_starts) else None
+
cdef class DocAnnotation:
def __init__(self, cats=None, links=None):
@@ -698,33 +738,33 @@ cdef class DocAnnotation:
cdef class Example:
- def __init__(self, doc_annotation=None, token_annotations=None, doc=None,
+ def __init__(self, doc_annotation=None, token_annotation=None, doc=None,
goldparse=None):
""" Doc can either be text, or an actual Doc """
self.doc = doc
self.doc_annotation = doc_annotation if doc_annotation else DocAnnotation()
- self.token_annotations = token_annotations if token_annotations else []
+ self.token_annotation = token_annotation if token_annotation else TokenAnnotation()
self.goldparse = goldparse
@classmethod
def from_gold(cls, goldparse, doc=None):
doc_annotation = DocAnnotation(cats=goldparse.cats, links=goldparse.links)
token_annotation = goldparse.get_token_annotation()
- return cls(doc_annotation, [token_annotation], doc)
+ return cls(doc_annotation, token_annotation, doc)
@classmethod
def from_dict(cls, example_dict, doc=None):
- token_dicts = example_dict["token_annotations"]
- token_annotations = [TokenAnnotation.from_dict(t) for t in token_dicts]
+ token_dict = example_dict["token_annotation"]
+ token_annotation = TokenAnnotation.from_dict(token_dict)
doc_dict = example_dict["doc_annotation"]
doc_annotation = DocAnnotation.from_dict(doc_dict)
- return cls(doc_annotation, token_annotations, doc)
+ return cls(doc_annotation, token_annotation, doc)
def to_dict(self):
""" Note that this method does NOT export the doc, only the annotations ! """
- token_dicts = [t.to_dict() for t in self.token_annotations]
+ token_dict = self.token_annotation.to_dict()
doc_dict = self.doc_annotation.to_dict()
- return {"token_annotations": token_dicts, "doc_annotation": doc_dict}
+ return {"token_annotation": token_dict, "doc_annotation": doc_dict}
@property
def text(self):
@@ -737,96 +777,108 @@ cdef class Example:
@property
def gold(self):
if self.goldparse is None:
- doc, gold = self.get_gold_parses(merge=True)[0]
+ doc, gold = self.get_gold_parses()[0]
self.goldparse = gold
return self.goldparse
- def add_token_annotation(self, ids=None, words=None, tags=None, heads=None,
- deps=None, entities=None, morphology=None, brackets=None):
- t = TokenAnnotation(ids=ids, words=words, tags=tags,
+ def set_token_annotation(self, ids=None, words=None, tags=None, heads=None,
+ deps=None, entities=None, morphs=None,
+ sent_starts=None, brackets=None):
+ self.token_annotation = TokenAnnotation(ids=ids, words=words, tags=tags,
heads=heads, deps=deps, entities=entities,
- morphology=morphology, brackets=brackets)
- self.token_annotations.append(t)
+ morphs=morphs, sent_starts=sent_starts,
+ brackets=brackets)
- def add_doc_annotation(self, cats=None, links=None):
+ def set_doc_annotation(self, cats=None, links=None):
if cats:
- self.doc_annotation.cats.update(cats)
+ self.doc_annotation.cats = cats
if links:
- self.doc_annotation.links.update(links)
+ self.doc_annotation.links = links
- def merge_sents(self):
- """ Merge the list of token annotations into one object and return this new object """
- m_example = Example(doc=self.doc, doc_annotation=self.doc_annotation)
- m_ids, m_words, m_tags, m_heads, m_deps, m_ents, m_morph = [], [], [], [], [], [], []
- m_brackets = []
- i = 0
- for t in self.token_annotations:
- m_ids.extend(id_ + i for id_ in t.ids)
- m_words.extend(t.words)
- m_tags.extend(t.tags)
- m_heads.extend(head + i if head is not None and head >= 0 else head_i + i for head_i, head in enumerate(t.heads))
- m_deps.extend(t.deps)
- m_ents.extend(t.entities)
- m_morph.extend(t.morphology)
- m_brackets.extend((b["first"] + i, b["last"] + i, b["label"])
- for b in t.brackets)
- i += len(t.ids)
- m_example.add_token_annotation(ids=m_ids, words=m_words, tags=m_tags,
- heads=m_heads, deps=m_deps, entities=m_ents,
- morphology=m_morph, brackets=m_brackets)
- return m_example
+ def split_sents(self):
+ """ Split the token annotations into multiple Examples based on
+ sent_starts and return a list of the new Examples"""
+ s_example = Example(doc=None, doc_annotation=self.doc_annotation)
+ s_ids, s_words, s_tags, s_heads = [], [], [], []
+ s_deps, s_ents, s_morphs, s_sent_starts = [], [], [], []
+ s_brackets = []
+ sent_start_i = 0
+ t = self.token_annotation
+ split_examples = []
+ for i in range(len(t.words)):
+ if i > 0 and t.sent_starts[i] == True:
+ s_example.set_token_annotation(ids=s_ids,
+ words=s_words, tags=s_tags, heads=s_heads, deps=s_deps,
+ entities=s_ents, morphs=s_morphs,
+ sent_starts=s_sent_starts, brackets=s_brackets)
+ split_examples.append(s_example)
+ s_example = Example(doc=None, doc_annotation=self.doc_annotation)
+ s_ids, s_words, s_tags, s_heads = [], [], [], []
+ s_deps, s_ents, s_morphs, s_sent_starts = [], [], [], []
+ s_brackets = []
+ sent_start_i = i
+ s_ids.append(t.get_id(i))
+ s_words.append(t.get_word(i))
+ s_tags.append(t.get_tag(i))
+ s_heads.append(t.get_head(i) - sent_start_i)
+ s_deps.append(t.get_dep(i))
+ s_ents.append(t.get_entity(i))
+ s_morphs.append(t.get_morph(i))
+ s_sent_starts.append(t.get_sent_start(i))
+ s_brackets.extend((b[0] - sent_start_i,
+ b[1] - sent_start_i, b[2])
+ for b in t.brackets if b[0] == i)
+ i += 1
+ s_example.set_token_annotation(ids=s_ids, words=s_words, tags=s_tags,
+ heads=s_heads, deps=s_deps, entities=s_ents,
+ morphs=s_morphs, sent_starts=s_sent_starts,
+ brackets=s_brackets)
+ split_examples.append(s_example)
+ return split_examples
- def get_gold_parses(self, merge=False, vocab=None, make_projective=False,
+ def get_gold_parses(self, merge=True, vocab=None, make_projective=False,
ignore_misaligned=False):
"""Return a list of (doc, GoldParse) objects.
- If merge is set to True, add all Token annotations to one big list."""
+ If merge is set to True, keep all Token annotations as one big list."""
d = self.doc_annotation
- # merging different sentences
+ # merge == do not modify Example
if merge:
- merged_example = self.merge_sents()
- assert(len(merged_example.token_annotations)) == 1
- t = merged_example.token_annotations[0]
- m_doc = merged_example.doc
- if not m_doc:
+ t = self.token_annotation
+ doc = self.doc
+ if not self.doc:
if not vocab:
raise ValueError(Errors.E998)
- m_doc = Doc(vocab, words=t.words)
+ doc = Doc(vocab, words=t.words)
try:
- gp = GoldParse.from_annotation(m_doc, d, t, make_projective=make_projective)
+ gp = GoldParse.from_annotation(doc, d, t,
+ make_projective=make_projective)
except AlignmentError:
if ignore_misaligned:
gp = None
else:
raise
- return [(self.doc, gp)]
- # we only have one sentence and an appropriate doc
- elif len(self.token_annotations) == 1 and isinstance(self.doc, Doc):
- t = self.token_annotations[0]
- try:
- gp = GoldParse.from_annotation(self.doc, d, t, make_projective=make_projective)
- except AlignmentError:
- if ignore_misaligned:
- gp = None
- else:
- raise
- return [(self.doc, gp)]
- # not merging: one GoldParse per 'sentence', defining docs with the words from each sentence
+ return [(doc, gp)]
+ # not merging: one GoldParse per sentence, defining docs with the words
+ # from each sentence
else:
parses = []
- for t in self.token_annotations:
+ split_examples = self.split_sents()
+ for split_example in split_examples:
if not vocab:
raise ValueError(Errors.E998)
- t_doc = Doc(vocab, words=t.words)
+ split_doc = Doc(vocab, words=split_example.token_annotation.words)
try:
- gp = GoldParse.from_annotation(t_doc, d, t, make_projective=make_projective)
+ gp = GoldParse.from_annotation(split_doc, d,
+ split_example.token_annotation,
+ make_projective=make_projective)
except AlignmentError:
if ignore_misaligned:
gp = None
else:
raise
if gp is not None:
- parses.append((t_doc, gp))
+ parses.append((split_doc, gp))
return parses
@classmethod
@@ -881,9 +933,14 @@ cdef class GoldParse:
"""
@classmethod
def from_annotation(cls, doc, doc_annotation, token_annotation, make_projective=False):
- return cls(doc, words=token_annotation.words, tags=token_annotation.tags,
- heads=token_annotation.heads, deps=token_annotation.deps, entities=token_annotation.entities,
- morphology=token_annotation.morphology, cats=doc_annotation.cats, links=doc_annotation.links,
+ return cls(doc, words=token_annotation.words,
+ tags=token_annotation.tags,
+ heads=token_annotation.heads,
+ deps=token_annotation.deps,
+ entities=token_annotation.entities,
+ morphs=token_annotation.morphs,
+ cats=doc_annotation.cats,
+ links=doc_annotation.links,
make_projective=make_projective)
def get_token_annotation(self):
@@ -893,9 +950,9 @@ cdef class GoldParse:
return TokenAnnotation(ids=ids, words=self.words, tags=self.tags,
heads=self.heads, deps=self.labels, entities=self.ner,
- morphology=self.morphology)
+ morphs=self.morphs)
- def __init__(self, doc, words=None, tags=None, morphology=None,
+ def __init__(self, doc, words=None, tags=None, morphs=None,
heads=None, deps=None, entities=None, make_projective=False,
cats=None, links=None):
"""Create a GoldParse. The fields will not be initialized if len(doc) is zero.
@@ -944,8 +1001,8 @@ cdef class GoldParse:
heads = [None for _ in words]
if not deps:
deps = [None for _ in words]
- if not morphology:
- morphology = [None for _ in words]
+ if not morphs:
+ morphs = [None for _ in words]
if entities is None:
entities = ["-" for _ in words]
elif len(entities) == 0:
@@ -971,7 +1028,7 @@ cdef class GoldParse:
self.heads = [None] * len(doc)
self.labels = [None] * len(doc)
self.ner = [None] * len(doc)
- self.morphology = [None] * len(doc)
+ self.morphs = [None] * len(doc)
# This needs to be done before we align the words
if make_projective and heads is not None and deps is not None:
@@ -990,7 +1047,7 @@ cdef class GoldParse:
self.gold_to_cand = [(i if i >= 0 else None) for i in j2i]
self.orig = TokenAnnotation(ids=list(range(len(words))), words=words, tags=tags,
- heads=heads, deps=deps, entities=entities, morphology=morphology,
+ heads=heads, deps=deps, entities=entities, morphs=morphs,
brackets=[])
for i, gold_i in enumerate(self.cand_to_gold):
@@ -1000,12 +1057,12 @@ cdef class GoldParse:
self.heads[i] = None
self.labels[i] = None
self.ner[i] = None
- self.morphology[i] = set()
+ self.morphs[i] = set()
if gold_i is None:
if i in i2j_multi:
self.words[i] = words[i2j_multi[i]]
self.tags[i] = tags[i2j_multi[i]]
- self.morphology[i] = morphology[i2j_multi[i]]
+ self.morphs[i] = morphs[i2j_multi[i]]
is_last = i2j_multi[i] != i2j_multi.get(i+1)
is_first = i2j_multi[i] != i2j_multi.get(i-1)
# Set next word in multi-token span as head, until last
@@ -1044,7 +1101,7 @@ cdef class GoldParse:
else:
self.words[i] = words[gold_i]
self.tags[i] = tags[gold_i]
- self.morphology[i] = morphology[gold_i]
+ self.morphs[i] = morphs[gold_i]
if heads[gold_i] is None:
self.heads[i] = None
else:
diff --git a/spacy/language.py b/spacy/language.py
index c84f597d9..8ec602ed7 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -574,9 +574,8 @@ class Language(object):
# Populate vocab
else:
for example in get_examples():
- for token_annotation in example.token_annotations:
- for word in token_annotation.words:
- _ = self.vocab[word] # noqa: F841
+ for word in example.token_annotation.words:
+ _ = self.vocab[word] # noqa: F841
if cfg.get("device", -1) >= 0:
util.use_gpu(cfg["device"])
diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index 04a769b27..56a00e33b 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -565,12 +565,11 @@ class Tagger(Pipe):
orig_tag_map = dict(self.vocab.morphology.tag_map)
new_tag_map = OrderedDict()
for example in get_examples():
- for token_annotation in example.token_annotations:
- for tag in token_annotation.tags:
- if tag in orig_tag_map:
- new_tag_map[tag] = orig_tag_map[tag]
- else:
- new_tag_map[tag] = {POS: X}
+ for tag in example.token_annotation.tags:
+ if tag in orig_tag_map:
+ new_tag_map[tag] = orig_tag_map[tag]
+ else:
+ new_tag_map[tag] = {POS: X}
cdef Vocab vocab = self.vocab
if new_tag_map:
vocab.morphology = Morphology(vocab.strings, new_tag_map,
@@ -750,11 +749,10 @@ class MultitaskObjective(Tagger):
gold_examples = nonproj.preprocess_training_data(get_examples())
# for raw_text, doc_annot in gold_tuples:
for example in gold_examples:
- for token_annotation in example.token_annotations:
- for i in range(len(token_annotation.ids)):
- label = self.make_label(i, token_annotation)
- if label is not None and label not in self.labels:
- self.labels[label] = len(self.labels)
+ for i in range(len(example.token_annotation.ids)):
+ label = self.make_label(i, example.token_annotation)
+ if label is not None and label not in self.labels:
+ self.labels[label] = len(self.labels)
if self.model is True:
token_vector_width = util.env_opt("token_vector_width")
self.model = self.Model(len(self.labels), tok2vec=tok2vec)
diff --git a/spacy/scorer.py b/spacy/scorer.py
index 25c6935f3..723259acd 100644
--- a/spacy/scorer.py
+++ b/spacy/scorer.py
@@ -237,7 +237,7 @@ class Scorer(object):
if len(doc) != len(gold):
doc_annotation = DocAnnotation(cats=gold.cats)
token_annotation = gold.orig
- gold = GoldParse.from_annotation(doc, doc_annotation, [token_annotation])
+ gold = GoldParse.from_annotation(doc, doc_annotation, token_annotation)
orig = gold.orig
gold_deps = set()
gold_deps_per_dep = {}
diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx
index 0a99609a8..d358c1277 100644
--- a/spacy/syntax/arc_eager.pyx
+++ b/spacy/syntax/arc_eager.pyx
@@ -342,19 +342,19 @@ cdef class ArcEager(TransitionSystem):
actions[RIGHT][label] = 1
actions[REDUCE][label] = 1
for example in kwargs.get('gold_parses', []):
- for token_annotation in example.token_annotations:
- heads, labels = nonproj.projectivize(token_annotation.heads, token_annotation.deps)
- for child, head, label in zip(token_annotation.ids, heads, labels):
- if label.upper() == 'ROOT' :
- label = 'ROOT'
- if head == child:
- actions[BREAK][label] += 1
- elif head < child:
- actions[RIGHT][label] += 1
- actions[REDUCE][''] += 1
- elif head > child:
- actions[LEFT][label] += 1
- actions[SHIFT][''] += 1
+ heads, labels = nonproj.projectivize(example.token_annotation.heads,
+ example.token_annotation.deps)
+ for child, head, label in zip(example.token_annotation.ids, heads, labels):
+ if label.upper() == 'ROOT' :
+ label = 'ROOT'
+ if head == child:
+ actions[BREAK][label] += 1
+ elif head < child:
+ actions[RIGHT][label] += 1
+ actions[REDUCE][''] += 1
+ elif head > child:
+ actions[LEFT][label] += 1
+ actions[SHIFT][''] += 1
if min_freq is not None:
for action, label_freqs in actions.items():
for label, freq in list(label_freqs.items()):
diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx
index d791534ee..7467aa342 100644
--- a/spacy/syntax/ner.pyx
+++ b/spacy/syntax/ner.pyx
@@ -73,12 +73,11 @@ cdef class BiluoPushDown(TransitionSystem):
actions[action][entity_type] = 1
moves = ('M', 'B', 'I', 'L', 'U')
for example in kwargs.get('gold_parses', []):
- for token_annotation in example.token_annotations:
- for i, ner_tag in enumerate(token_annotation.entities):
- if ner_tag != 'O' and ner_tag != '-':
- _, label = ner_tag.split('-', 1)
- for action in (BEGIN, IN, LAST, UNIT):
- actions[action][label] += 1
+ for i, ner_tag in enumerate(example.token_annotation.entities):
+ if ner_tag != 'O' and ner_tag != '-':
+ _, label = ner_tag.split('-', 1)
+ for action in (BEGIN, IN, LAST, UNIT):
+ actions[action][label] += 1
return actions
@property
diff --git a/spacy/syntax/nonproj.pyx b/spacy/syntax/nonproj.pyx
index c7ed25948..2ec6b61ac 100644
--- a/spacy/syntax/nonproj.pyx
+++ b/spacy/syntax/nonproj.pyx
@@ -81,15 +81,15 @@ def is_decorated(label):
def count_decorated_labels(gold_data):
freqs = {}
for example in gold_data:
- for token_annotation in example.token_annotations:
- proj_heads, deco_deps = projectivize(token_annotation.heads, token_annotation.deps)
- # set the label to ROOT for each root dependent
- deco_deps = ['ROOT' if head == i else deco_deps[i]
- for i, head in enumerate(proj_heads)]
- # count label frequencies
- for label in deco_deps:
- if is_decorated(label):
- freqs[label] = freqs.get(label, 0) + 1
+ proj_heads, deco_deps = projectivize(example.token_annotation.heads,
+ example.token_annotation.deps)
+ # set the label to ROOT for each root dependent
+ deco_deps = ['ROOT' if head == i else deco_deps[i]
+ for i, head in enumerate(proj_heads)]
+ # count label frequencies
+ for label in deco_deps:
+ if is_decorated(label):
+ freqs[label] = freqs.get(label, 0) + 1
return freqs
@@ -98,21 +98,20 @@ def preprocess_training_data(gold_data, label_freq_cutoff=30):
freqs = {}
for example in gold_data:
new_example = Example(doc=example.doc)
- for token_annotation in example.token_annotations:
- proj_heads, deco_deps = projectivize(token_annotation.heads, token_annotation.deps)
- # set the label to ROOT for each root dependent
- deco_deps = ['ROOT' if head == i else deco_deps[i]
- for i, head in enumerate(proj_heads)]
- # count label frequencies
- if label_freq_cutoff > 0:
- for label in deco_deps:
- if is_decorated(label):
- freqs[label] = freqs.get(label, 0) + 1
- # TODO: the code would be less ugly when changing heads and deps in-place, but is this OK upstream ?
- proj_token_dict = token_annotation.to_dict()
- proj_token_dict["heads"] = proj_heads
- proj_token_dict["deps"] = deco_deps
- new_example.add_token_annotation(**proj_token_dict)
+ proj_heads, deco_deps = projectivize(example.token_annotation.heads,
+ example.token_annotation.deps)
+ # set the label to ROOT for each root dependent
+ deco_deps = ['ROOT' if head == i else deco_deps[i]
+ for i, head in enumerate(proj_heads)]
+ # count label frequencies
+ if label_freq_cutoff > 0:
+ for label in deco_deps:
+ if is_decorated(label):
+ freqs[label] = freqs.get(label, 0) + 1
+ proj_token_dict = example.token_annotation.to_dict()
+ proj_token_dict["heads"] = proj_heads
+ proj_token_dict["deps"] = deco_deps
+ new_example.set_token_annotation(**proj_token_dict)
preprocessed.append(new_example)
if label_freq_cutoff > 0:
return _filter_labels(preprocessed, label_freq_cutoff, freqs)
@@ -213,15 +212,14 @@ def _filter_labels(examples, cutoff, freqs):
filtered = []
for example in examples:
new_example = Example(doc=example.doc)
- for token_annotation in example.token_annotations:
- filtered_labels = []
- for label in token_annotation.deps:
- if is_decorated(label) and freqs.get(label, 0) < cutoff:
- filtered_labels.append(decompose(label)[0])
- else:
- filtered_labels.append(label)
- filtered_token_dict = token_annotation.to_dict()
- filtered_token_dict["deps"] = filtered_labels
- new_example.add_token_annotation(**filtered_token_dict)
+ filtered_labels = []
+ for label in example.token_annotation.deps:
+ if is_decorated(label) and freqs.get(label, 0) < cutoff:
+ filtered_labels.append(decompose(label)[0])
+ else:
+ filtered_labels.append(label)
+ filtered_token_dict = example.token_annotation.to_dict()
+ filtered_token_dict["deps"] = filtered_labels
+ new_example.set_token_annotation(**filtered_token_dict)
filtered.append(new_example)
return filtered
diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py
index ace25f8cc..4b27901ad 100644
--- a/spacy/tests/regression/test_issue1501-2000.py
+++ b/spacy/tests/regression/test_issue1501-2000.py
@@ -273,7 +273,7 @@ def test_issue1963(en_tokenizer):
def test_issue1967(label):
ner = EntityRecognizer(Vocab())
example = Example(doc=None)
- example.add_token_annotation(ids=[0], words=["word"], tags=["tag"], heads=[0], deps=["dep"], entities=[label])
+ example.set_token_annotation(ids=[0], words=["word"], tags=["tag"], heads=[0], deps=["dep"], entities=[label])
ner.moves.get_actions(gold_parses=[example])
diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py
index b43eb3431..d1255c176 100644
--- a/spacy/tests/test_gold.py
+++ b/spacy/tests/test_gold.py
@@ -36,6 +36,16 @@ def doc():
return doc
+@pytest.fixture()
+def merged_dict():
+ return {
+ "ids": [1, 2, 3, 4, 5, 6, 7],
+ "words": ["Hi", "there", "everyone", "It", "is", "just", "me"],
+ "tags": ["INTJ", "ADV", "PRON", "PRON", "AUX", "ADV", "PRON"],
+ "sent_starts": [1, 0, 0, 1, 0, 0, 0, 0],
+ }
+
+
def test_gold_biluo_U(en_vocab):
words = ["I", "flew", "to", "London", "."]
spaces = [True, True, True, False, True]
@@ -231,7 +241,7 @@ def test_ignore_misaligned(doc):
deps = [t.dep_ for t in doc]
heads = [t.head.i for t in doc]
- use_new_align = spacy.gold.USE_NEW_ALIGN
+ saved_use_new_align = spacy.gold.USE_NEW_ALIGN
spacy.gold.USE_NEW_ALIGN = False
with make_tempdir() as tmpdir:
@@ -270,7 +280,25 @@ def test_ignore_misaligned(doc):
ignore_misaligned=True))
assert len(train_reloaded_example) == 0
- spacy.gold.USE_NEW_ALIGN = use_new_align
+ spacy.gold.USE_NEW_ALIGN = saved_use_new_align
+
+
+def test_make_orth_variants(doc):
+ nlp = English()
+ text = doc.text
+ deps = [t.dep_ for t in doc]
+ heads = [t.head.i for t in doc]
+
+ with make_tempdir() as tmpdir:
+ jsonl_file = tmpdir / "test.jsonl"
+ # write to JSONL train dicts
+ srsly.write_jsonl(jsonl_file, [docs_to_json(doc)])
+ goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))
+
+ # due to randomness, test only that this runs with no errors for now
+ train_reloaded_example = next(goldcorpus.train_dataset(nlp,
+ orth_variant_level=0.2))
+ train_goldparse = train_reloaded_example.gold
# xfail while we have backwards-compatible alignment
@@ -386,71 +414,38 @@ def _train(train_data):
nlp.update(batch, sgd=optimizer, losses=losses)
-tokens_1 = {
- "ids": [1, 2, 3],
- "words": ["Hi", "there", "everyone"],
- "tags": ["INTJ", "ADV", "PRON"],
-}
-
-tokens_2 = {
- "ids": [1, 2, 3, 4],
- "words": ["It", "is", "just", "me"],
- "tags": ["PRON", "AUX", "ADV", "PRON"],
-}
-
-text0 = "Hi there everyone It is just me"
-
-
-def test_merge_sents():
+def test_split_sents(merged_dict):
nlp = English()
example = Example()
- example.add_token_annotation(**tokens_1)
- example.add_token_annotation(**tokens_2)
+ example.set_token_annotation(**merged_dict)
assert len(example.get_gold_parses(merge=False, vocab=nlp.vocab)) == 2
- assert len(example.get_gold_parses(merge=True, vocab=nlp.vocab)) == 1 # this shouldn't change the original object
+ assert len(example.get_gold_parses(merge=True, vocab=nlp.vocab)) == 1
- merged_example = example.merge_sents()
+ split_examples = example.split_sents()
+ assert len(split_examples) == 2
- token_annotation_1 = example.token_annotations[0]
+ token_annotation_1 = split_examples[0].token_annotation
assert token_annotation_1.ids == [1, 2, 3]
assert token_annotation_1.words == ["Hi", "there", "everyone"]
assert token_annotation_1.tags == ["INTJ", "ADV", "PRON"]
+ assert token_annotation_1.sent_starts == [1, 0, 0]
- token_annotation_m = merged_example.token_annotations[0]
- assert token_annotation_m.ids == [1, 2, 3, 4, 5, 6, 7]
- assert token_annotation_m.words == ["Hi", "there", "everyone", "It", "is", "just", "me"]
- assert token_annotation_m.tags == ["INTJ", "ADV", "PRON", "PRON", "AUX", "ADV", "PRON"]
+ token_annotation_2 = split_examples[1].token_annotation
+ assert token_annotation_2.ids == [4, 5, 6, 7]
+ assert token_annotation_2.words == ["It", "is", "just", "me"]
+ assert token_annotation_2.tags == ["PRON", "AUX", "ADV", "PRON"]
+ assert token_annotation_2.sent_starts == [1, 0, 0, 0]
-def test_tuples_to_example():
+def test_tuples_to_example(merged_dict):
ex = Example()
- ex.add_token_annotation(**tokens_1)
- ex.add_token_annotation(**tokens_2)
- ex.add_doc_annotation(cats={"TRAVEL": 1.0, "BAKING": 0.0})
+ ex.set_token_annotation(**merged_dict)
+ cats = {"TRAVEL": 1.0, "BAKING": 0.0}
+ ex.set_doc_annotation(cats=cats)
ex_dict = ex.to_dict()
- token_dicts = [
- {
- "ids": [1, 2, 3],
- "words": ["Hi", "there", "everyone"],
- "tags": ["INTJ", "ADV", "PRON"],
- "heads": [],
- "deps": [],
- "entities": [],
- "morphology": [],
- "brackets": [],
- },
- {
- "ids": [1, 2, 3, 4],
- "words": ["It", "is", "just", "me"],
- "tags": ["PRON", "AUX", "ADV", "PRON"],
- "heads": [],
- "deps": [],
- "entities": [],
- "morphology": [],
- "brackets": [],
- },
- ]
- doc_dict = {"cats": {"TRAVEL": 1.0, "BAKING": 0.0}, "links": {}}
-
- assert ex_dict == {"token_annotations": token_dicts, "doc_annotation": doc_dict}
+ assert ex_dict["token_annotation"]["ids"] == merged_dict["ids"]
+ assert ex_dict["token_annotation"]["words"] == merged_dict["words"]
+ assert ex_dict["token_annotation"]["tags"] == merged_dict["tags"]
+ assert ex_dict["token_annotation"]["sent_starts"] == merged_dict["sent_starts"]
+ assert ex_dict["doc_annotation"]["cats"] == cats
diff --git a/spacy/tests/test_scorer.py b/spacy/tests/test_scorer.py
index e8d74c405..92a607e5b 100644
--- a/spacy/tests/test_scorer.py
+++ b/spacy/tests/test_scorer.py
@@ -86,7 +86,7 @@ def test_ner_per_type(en_vocab):
ents=[[0, 1, "CARDINAL"], [2, 3, "CARDINAL"]],
)
ex = Example(doc=doc)
- ex.add_token_annotation(entities=annot["entities"])
+ ex.set_token_annotation(entities=annot["entities"])
scorer.score(ex)
results = scorer.scores
@@ -107,7 +107,7 @@ def test_ner_per_type(en_vocab):
ents=[[0, 1, "ORG"], [5, 6, "GPE"], [6, 7, "ORG"]],
)
ex = Example(doc=doc)
- ex.add_token_annotation(entities=annot["entities"])
+ ex.set_token_annotation(entities=annot["entities"])
scorer.score(ex)
results = scorer.scores
From 0c9640ced3c58bca6a6838c0b2e07c3e8b115e99 Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Mon, 25 Nov 2019 23:13:26 +0100
Subject: [PATCH 025/496] Replace old gold alignment with new gold alignment
(#4710)
Replace the old gold alignment, which allowed for some noise in the alignment between raw and orth, with a new, simpler alignment that requires the raw and orth strings to be identical except for whitespace and capitalization (a minimal sketch follows the change list below).
* Replace old alignment with new alignment, removing `_align.pyx` and
its tests
* Remove all quote normalizations
* Enable test for new align
* Modify test case for quote normalization
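For illustration (not part of the patch), a minimal standalone sketch of the stricter scheme: tokens are only lowercased and stripped of internal whitespace before alignment, so raw and orth must otherwise match exactly. The helper mirrors `_normalize_for_alignment` in `spacy/gold.pyx`; the assertion is illustrative only.

    def normalize_for_alignment(tokens):
        # Lowercase and drop internal whitespace; no quote normalization.
        return [t.replace(" ", "").lower() for t in tokens]

    raw_tokens = ["Hello", "World", "!"]
    orth_tokens = ["hello", "world", "!"]
    # Under the new scheme these align token-for-token, because their
    # normalized forms are identical.
    assert normalize_for_alignment(raw_tokens) == normalize_for_alignment(orth_tokens)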
---
setup.py | 1 -
spacy/_align.pyx | 255 --------------------------------------
spacy/gold.pyx | 47 -------
spacy/tests/test_align.py | 79 ------------
spacy/tests/test_gold.py | 20 +--
5 files changed, 1 insertion(+), 401 deletions(-)
delete mode 100644 spacy/_align.pyx
delete mode 100644 spacy/tests/test_align.py
diff --git a/setup.py b/setup.py
index 1156e7cde..62a09aa73 100755
--- a/setup.py
+++ b/setup.py
@@ -31,7 +31,6 @@ PACKAGES = find_packages()
MOD_NAMES = [
- "spacy._align",
"spacy.parts_of_speech",
"spacy.strings",
"spacy.lexeme",
diff --git a/spacy/_align.pyx b/spacy/_align.pyx
deleted file mode 100644
index 8ae7cdf4e..000000000
--- a/spacy/_align.pyx
+++ /dev/null
@@ -1,255 +0,0 @@
-# cython: infer_types=True
-'''Do Levenshtein alignment, for evaluation of tokenized input.
-
-Random notes:
-
- r i n g
- 0 1 2 3 4
-r 1 0 1 2 3
-a 2 1 1 2 3
-n 3 2 2 1 2
-g 4 3 3 2 1
-
-0,0: (1,1)=min(0+0,1+1,1+1)=0 S
-1,0: (2,1)=min(1+1,0+1,2+1)=1 D
-2,0: (3,1)=min(2+1,3+1,1+1)=2 D
-3,0: (4,1)=min(3+1,4+1,2+1)=3 D
-0,1: (1,2)=min(1+1,2+1,0+1)=1 D
-1,1: (2,2)=min(0+1,1+1,1+1)=1 S
-2,1: (3,2)=min(1+1,1+1,2+1)=2 S or I
-3,1: (4,2)=min(2+1,2+1,3+1)=3 S or I
-0,2: (1,3)=min(2+1,3+1,1+1)=2 I
-1,2: (2,3)=min(1+1,2+1,1+1)=2 S or I
-2,2: (3,3)
-3,2: (4,3)
-At state (i, j) we're asking "How do I transform S[:i+1] to T[:j+1]?"
-
-We know the costs to transition:
-
-S[:i] -> T[:j] (at D[i,j])
-S[:i+1] -> T[:j] (at D[i+1,j])
-S[:i] -> T[:j+1] (at D[i,j+1])
-
-Further, we know we can transform:
-S[:i+1] -> S[:i] (DEL) for 1,
-T[:j+1] -> T[:j] (INS) for 1.
-S[i+1] -> T[j+1] (SUB) for 0 or 1
-
-Therefore we have the costs:
-SUB: Cost(S[:i]->T[:j]) + Cost(S[i]->S[j])
-i.e. D[i, j] + S[i+1] != T[j+1]
-INS: Cost(S[:i+1]->T[:j]) + Cost(T[:j+1]->T[:j])
-i.e. D[i+1,j] + 1
-DEL: Cost(S[:i]->T[:j+1]) + Cost(S[:i+1]->S[:i])
-i.e. D[i,j+1] + 1
-
- Source string S has length m, with index i
- Target string T has length n, with index j
-
- Output two alignment vectors: i2j (length m) and j2i (length n)
- # function LevenshteinDistance(char s[1..m], char t[1..n]):
- # for all i and j, d[i,j] will hold the Levenshtein distance between
- # the first i characters of s and the first j characters of t
- # note that d has (m+1)*(n+1) values
- # set each element in d to zero
- ring rang
- - r i n g
- - 0 0 0 0 0
- r 0 0 0 0 0
- a 0 0 0 0 0
- n 0 0 0 0 0
- g 0 0 0 0 0
-
- # source prefixes can be transformed into empty string by
- # dropping all characters
- # d[i, 0] := i
- ring rang
- - r i n g
- - 0 0 0 0 0
- r 1 0 0 0 0
- a 2 0 0 0 0
- n 3 0 0 0 0
- g 4 0 0 0 0
-
- # target prefixes can be reached from empty source prefix
- # by inserting every character
- # d[0, j] := j
- - r i n g
- - 0 1 2 3 4
- r 1 0 0 0 0
- a 2 0 0 0 0
- n 3 0 0 0 0
- g 4 0 0 0 0
-
-'''
-from __future__ import unicode_literals
-from libc.stdint cimport uint32_t
-import numpy
-cimport numpy as np
-from .compat import unicode_
-from murmurhash.mrmr cimport hash32
-
-
-def align(S, T):
- cdef int m = len(S)
- cdef int n = len(T)
- cdef np.ndarray matrix = numpy.zeros((m+1, n+1), dtype='int32')
- cdef np.ndarray i2j = numpy.zeros((m,), dtype='i')
- cdef np.ndarray j2i = numpy.zeros((n,), dtype='i')
-
- cdef np.ndarray S_arr = _convert_sequence(S)
- cdef np.ndarray T_arr = _convert_sequence(T)
-
- fill_matrix(matrix.data,
- S_arr.data, m, T_arr.data, n)
- fill_i2j(i2j, matrix)
- fill_j2i(j2i, matrix)
- for i in range(i2j.shape[0]):
- if i2j[i] >= 0 and len(S[i]) != len(T[i2j[i]]):
- i2j[i] = -1
- for j in range(j2i.shape[0]):
- if j2i[j] >= 0 and len(T[j]) != len(S[j2i[j]]):
- j2i[j] = -1
- return matrix[-1,-1], i2j, j2i, matrix
-
-
-def multi_align(np.ndarray i2j, np.ndarray j2i, i_lengths, j_lengths):
- '''Let's say we had:
-
- Guess: [aa bb cc dd]
- Truth: [aa bbcc dd]
- i2j: [0, None, -2, 2]
- j2i: [0, -2, 3]
-
- We want:
-
- i2j_multi: {1: 1, 2: 1}
- j2i_multi: {}
- '''
- i2j_miss = _get_regions(i2j, i_lengths)
- j2i_miss = _get_regions(j2i, j_lengths)
-
- i2j_multi, j2i_multi = _get_mapping(i2j_miss, j2i_miss, i_lengths, j_lengths)
- return i2j_multi, j2i_multi
-
-
-def _get_regions(alignment, lengths):
- regions = {}
- start = None
- offset = 0
- for i in range(len(alignment)):
- if alignment[i] < 0:
- if start is None:
- start = offset
- regions.setdefault(start, [])
- regions[start].append(i)
- else:
- start = None
- offset += lengths[i]
- return regions
-
-
-def _get_mapping(miss1, miss2, lengths1, lengths2):
- i2j = {}
- j2i = {}
- for start, region1 in miss1.items():
- if not region1 or start not in miss2:
- continue
- region2 = miss2[start]
- if sum(lengths1[i] for i in region1) == sum(lengths2[i] for i in region2):
- j = region2.pop(0)
- buff = []
- # Consume tokens from region 1, until we meet the length of the
- # first token in region2. If we do, align the tokens. If
- # we exceed the length, break.
- while region1:
- buff.append(region1.pop(0))
- if sum(lengths1[i] for i in buff) == lengths2[j]:
- for i in buff:
- i2j[i] = j
- j2i[j] = buff[-1]
- j += 1
- buff = []
- elif sum(lengths1[i] for i in buff) > lengths2[j]:
- break
- else:
- if buff and sum(lengths1[i] for i in buff) == lengths2[j]:
- for i in buff:
- i2j[i] = j
- j2i[j] = buff[-1]
- return i2j, j2i
-
-
-def _convert_sequence(seq):
- if isinstance(seq, numpy.ndarray):
- return numpy.ascontiguousarray(seq, dtype='uint32_t')
- cdef np.ndarray output = numpy.zeros((len(seq),), dtype='uint32')
- cdef bytes item_bytes
- for i, item in enumerate(seq):
- if item == "``":
- item = '"'
- elif item == "''":
- item = '"'
- if isinstance(item, unicode):
- item_bytes = item.encode('utf8')
- else:
- item_bytes = item
- output[i] = hash32(item_bytes, len(item_bytes), 0)
- return output
-
-
-cdef void fill_matrix(int* D,
- const int* S, int m, const int* T, int n) nogil:
- m1 = m+1
- n1 = n+1
- for i in range(m1*n1):
- D[i] = 0
-
- for i in range(m1):
- D[i*n1] = i
-
- for j in range(n1):
- D[j] = j
-
- cdef int sub_cost, ins_cost, del_cost
- for j in range(n):
- for i in range(m):
- i_j = i*n1 + j
- i1_j1 = (i+1)*n1 + j+1
- i1_j = (i+1)*n1 + j
- i_j1 = i*n1 + j+1
- if S[i] != T[j]:
- sub_cost = D[i_j] + 1
- else:
- sub_cost = D[i_j]
- del_cost = D[i_j1] + 1
- ins_cost = D[i1_j] + 1
- best = min(min(sub_cost, ins_cost), del_cost)
- D[i1_j1] = best
-
-
-cdef void fill_i2j(np.ndarray i2j, np.ndarray D) except *:
- j = D.shape[1]-2
- cdef int i = D.shape[0]-2
- while i >= 0:
- while D[i+1, j] < D[i+1, j+1]:
- j -= 1
- if D[i, j+1] < D[i+1, j+1]:
- i2j[i] = -1
- else:
- i2j[i] = j
- j -= 1
- i -= 1
-
-cdef void fill_j2i(np.ndarray j2i, np.ndarray D) except *:
- i = D.shape[0]-2
- cdef int j = D.shape[1]-2
- while j >= 0:
- while D[i, j+1] < D[i+1, j+1]:
- i -= 1
- if D[i+1, j] < D[i+1, j+1]:
- j2i[j] = -1
- else:
- j2i[j] = i
- i -= 1
- j -= 1
diff --git a/spacy/gold.pyx b/spacy/gold.pyx
index 0659ddd02..f2f127438 100644
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -18,7 +18,6 @@ from .compat import path2str, basestring_
from . import util
-USE_NEW_ALIGN = False
punct_re = re.compile(r"\W")
@@ -51,59 +50,15 @@ def tags_to_entities(tags):
return entities
-_ALIGNMENT_NORM_MAP = [("``", "'"), ("''", "'"), ('"', "'"), ("`", "'")]
-
-
def _normalize_for_alignment(tokens):
tokens = [w.replace(" ", "").lower() for w in tokens]
output = []
for token in tokens:
token = token.replace(" ", "").lower()
- for before, after in _ALIGNMENT_NORM_MAP:
- token = token.replace(before, after)
output.append(token)
return output
-def _align_before_v2_2_2(tokens_a, tokens_b):
- """Calculate alignment tables between two tokenizations, using the Levenshtein
- algorithm. The alignment is case-insensitive.
-
- tokens_a (List[str]): The candidate tokenization.
- tokens_b (List[str]): The reference tokenization.
- RETURNS: (tuple): A 5-tuple consisting of the following information:
- * cost (int): The number of misaligned tokens.
- * a2b (List[int]): Mapping of indices in `tokens_a` to indices in `tokens_b`.
- For instance, if `a2b[4] == 6`, that means that `tokens_a[4]` aligns
- to `tokens_b[6]`. If there's no one-to-one alignment for a token,
- it has the value -1.
- * b2a (List[int]): The same as `a2b`, but mapping the other direction.
- * a2b_multi (Dict[int, int]): A dictionary mapping indices in `tokens_a`
- to indices in `tokens_b`, where multiple tokens of `tokens_a` align to
- the same token of `tokens_b`.
- * b2a_multi (Dict[int, int]): As with `a2b_multi`, but mapping the other
- direction.
- """
- from . import _align
- if tokens_a == tokens_b:
- alignment = numpy.arange(len(tokens_a))
- return 0, alignment, alignment, {}, {}
- tokens_a = [w.replace(" ", "").lower() for w in tokens_a]
- tokens_b = [w.replace(" ", "").lower() for w in tokens_b]
- cost, i2j, j2i, matrix = _align.align(tokens_a, tokens_b)
- i2j_multi, j2i_multi = _align.multi_align(i2j, j2i, [len(w) for w in tokens_a],
- [len(w) for w in tokens_b])
- for i, j in list(i2j_multi.items()):
- if i2j_multi.get(i+1) != j and i2j_multi.get(i-1) != j:
- i2j[i] = j
- i2j_multi.pop(i)
- for j, i in list(j2i_multi.items()):
- if j2i_multi.get(j+1) != i and j2i_multi.get(j-1) != i:
- j2i[j] = i
- j2i_multi.pop(j)
- return cost, i2j, j2i, i2j_multi, j2i_multi
-
-
def align(tokens_a, tokens_b):
"""Calculate alignment tables between two tokenizations.
@@ -122,8 +77,6 @@ def align(tokens_a, tokens_b):
* b2a_multi (Dict[int, int]): As with `a2b_multi`, but mapping the other
direction.
"""
- if not USE_NEW_ALIGN:
- return _align_before_v2_2_2(tokens_a, tokens_b)
tokens_a = _normalize_for_alignment(tokens_a)
tokens_b = _normalize_for_alignment(tokens_b)
cost = 0
diff --git a/spacy/tests/test_align.py b/spacy/tests/test_align.py
deleted file mode 100644
index d6bbab04e..000000000
--- a/spacy/tests/test_align.py
+++ /dev/null
@@ -1,79 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import pytest
-from spacy._align import align, multi_align
-
-
-@pytest.mark.parametrize(
- "string1,string2,cost",
- [
- ("hello", "hell", 1),
- ("rat", "cat", 1),
- ("rat", "rat", 0),
- ("rat", "catsie", 4),
- ("t", "catsie", 5),
- ],
-)
-def test_align_costs(string1, string2, cost):
- output_cost, i2j, j2i, matrix = align(string1, string2)
- assert output_cost == cost
-
-
-@pytest.mark.parametrize(
- "string1,string2,i2j",
- [
- ("hello", "hell", [0, 1, 2, 3, -1]),
- ("rat", "cat", [0, 1, 2]),
- ("rat", "rat", [0, 1, 2]),
- ("rat", "catsie", [0, 1, 2]),
- ("t", "catsie", [2]),
- ],
-)
-def test_align_i2j(string1, string2, i2j):
- output_cost, output_i2j, j2i, matrix = align(string1, string2)
- assert list(output_i2j) == i2j
-
-
-@pytest.mark.parametrize(
- "string1,string2,j2i",
- [
- ("hello", "hell", [0, 1, 2, 3]),
- ("rat", "cat", [0, 1, 2]),
- ("rat", "rat", [0, 1, 2]),
- ("rat", "catsie", [0, 1, 2, -1, -1, -1]),
- ("t", "catsie", [-1, -1, 0, -1, -1, -1]),
- ],
-)
-def test_align_i2j_2(string1, string2, j2i):
- output_cost, output_i2j, output_j2i, matrix = align(string1, string2)
- assert list(output_j2i) == j2i
-
-
-def test_align_strings():
- words1 = ["hello", "this", "is", "test!"]
- words2 = ["hellothis", "is", "test", "!"]
- cost, i2j, j2i, matrix = align(words1, words2)
- assert cost == 4
- assert list(i2j) == [-1, -1, 1, -1]
- assert list(j2i) == [-1, 2, -1, -1]
-
-
-def test_align_many_to_one():
- words1 = ["a", "b", "c", "d", "e", "f", "g", "h"]
- words2 = ["ab", "bc", "e", "fg", "h"]
- cost, i2j, j2i, matrix = align(words1, words2)
- assert list(i2j) == [-1, -1, -1, -1, 2, -1, -1, 4]
- lengths1 = [len(w) for w in words1]
- lengths2 = [len(w) for w in words2]
- i2j_multi, j2i_multi = multi_align(i2j, j2i, lengths1, lengths2)
- assert i2j_multi[0] == 0
- assert i2j_multi[1] == 0
- assert i2j_multi[2] == 1
- assert i2j_multi[3] == 1
- assert i2j_multi[3] == 1
- assert i2j_multi[5] == 3
- assert i2j_multi[6] == 3
-
- assert j2i_multi[0] == 1
- assert j2i_multi[1] == 3
diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py
index d1255c176..639d98859 100644
--- a/spacy/tests/test_gold.py
+++ b/spacy/tests/test_gold.py
@@ -241,20 +241,6 @@ def test_ignore_misaligned(doc):
deps = [t.dep_ for t in doc]
heads = [t.head.i for t in doc]
- saved_use_new_align = spacy.gold.USE_NEW_ALIGN
-
- spacy.gold.USE_NEW_ALIGN = False
- with make_tempdir() as tmpdir:
- jsonl_file = tmpdir / "test.jsonl"
- data = [docs_to_json(doc)]
- data[0]["paragraphs"][0]["raw"] = text.replace("Sarah", "Jane")
- # write to JSONL train dicts
- srsly.write_jsonl(jsonl_file, data)
- goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))
-
- train_reloaded_example = next(goldcorpus.train_dataset(nlp))
-
- spacy.gold.USE_NEW_ALIGN = True
with make_tempdir() as tmpdir:
jsonl_file = tmpdir / "test.jsonl"
data = [docs_to_json(doc)]
@@ -280,8 +266,6 @@ def test_ignore_misaligned(doc):
ignore_misaligned=True))
assert len(train_reloaded_example) == 0
- spacy.gold.USE_NEW_ALIGN = saved_use_new_align
-
def test_make_orth_variants(doc):
nlp = English()
@@ -301,14 +285,12 @@ def test_make_orth_variants(doc):
train_goldparse = train_reloaded_example.gold
-# xfail while we have backwards-compatible alignment
-@pytest.mark.xfail
@pytest.mark.parametrize(
"tokens_a,tokens_b,expected",
[
(["a", "b", "c"], ["ab", "c"], (3, [-1, -1, 1], [-1, 2], {0: 0, 1: 0}, {})),
(
- ["a", "b", "``", "c"],
+ ["a", "b", '"', "c"],
['ab"', "c"],
(4, [-1, -1, -1, 1], [-1, 3], {0: 0, 1: 0, 2: 0}, {}),
),
From 9aab0a55e1aaa2544d1d1e294bd9e87f3db626de Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Tue, 26 Nov 2019 16:05:17 +0100
Subject: [PATCH 026/496] Fix conllu2json converter to output all sentences
(#4716)
Make sure that the last batch of sentences is output if n_sents > 1.
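The pattern, sketched outside the converter with illustrative names:

    def batch_sentences(sentences, n_sents):
        # Group sentences into documents of n_sents each, and flush whatever
        # remains at the end so the final partial batch is not dropped.
        docs, batch = [], []
        for sent in sentences:
            batch.append(sent)
            if len(batch) == n_sents:
                docs.append(list(batch))
                batch = []
        if batch:  # the fix: emit the trailing partial batch
            docs.append(list(batch))
        return docs

    assert batch_sentences(range(5), n_sents=2) == [[0, 1], [2, 3], [4]]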
---
spacy/cli/converters/conllu2json.py | 3 +++
1 file changed, 3 insertions(+)
diff --git a/spacy/cli/converters/conllu2json.py b/spacy/cli/converters/conllu2json.py
index ff720f4bf..c0fd58fb0 100644
--- a/spacy/cli/converters/conllu2json.py
+++ b/spacy/cli/converters/conllu2json.py
@@ -34,6 +34,9 @@ def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None, **_):
doc = create_doc(sentences, i)
docs.append(doc)
sentences = []
+ if sentences:
+ doc = create_doc(sentences, i)
+ docs.append(doc)
return docs
From 9efd3ccbef689230ec2ae53a6432f694d59b48ae Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Tue, 26 Nov 2019 16:10:08 +0100
Subject: [PATCH 027/496] Update conllu2json MISC column handling (#4715)
Update the converter to handle additional annotations in the MISC column (see the sketch after this list):
* `SpaceAfter=No` and set raw text accordingly
* plain NER tag
* name=NER (for NorNE)
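As a rough sketch (not part of the patch), the NER pattern added below extracts a plain or `name=`-prefixed tag from a MISC field, while `SpaceAfter=No` is detected with a substring check; a raw string is used here to avoid escape-sequence warnings:

    import re

    # Same pattern as MISC_NER_PATTERN in conllu2json.py, as a raw string.
    tag_pattern = r"\|?(?:name=)?(([A-Z_]+)-([A-Z_]+)|O)\|?"

    misc = "SpaceAfter=No|name=B-PER"
    match = re.search(tag_pattern, misc)
    assert match.group(1) == "B-PER"                      # full NER tag
    assert (match.group(2), match.group(3)) == ("B", "PER")
    assert "SpaceAfter=No" in misc                        # no space after this token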
---
spacy/cli/converters/conllu2json.py | 68 +++++++++++++++++++----------
spacy/tests/test_cli.py | 27 +++++++++++-
2 files changed, 70 insertions(+), 25 deletions(-)
diff --git a/spacy/cli/converters/conllu2json.py b/spacy/cli/converters/conllu2json.py
index c0fd58fb0..7fa491b9d 100644
--- a/spacy/cli/converters/conllu2json.py
+++ b/spacy/cli/converters/conllu2json.py
@@ -18,21 +18,28 @@ def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None, **_):
"""
# by @dvsrepo, via #11 explosion/spacy-dev-resources
# by @katarkor
+ # name=NER is to handle NorNE
+ MISC_NER_PATTERN = "\|?(?:name=)?(([A-Z_]+)-([A-Z_]+)|O)\|?"
docs = []
+ raw = ""
sentences = []
conll_data = read_conllx(input_data, use_morphology=use_morphology)
checked_for_ner = False
has_ner_tags = False
for i, example in enumerate(conll_data):
if not checked_for_ner:
- has_ner_tags = is_ner(example.token_annotation.entities[0])
+ has_ner_tags = is_ner(example.token_annotation.entities[0],
+ MISC_NER_PATTERN)
checked_for_ner = True
- sentences.append(generate_sentence(example.token_annotation, has_ner_tags))
+ raw += example.text
+ sentences.append(generate_sentence(example.token_annotation,
+ has_ner_tags, MISC_NER_PATTERN))
# Real-sized documents could be extracted using the comments on the
# conllu document
if len(sentences) % n_sents == 0:
- doc = create_doc(sentences, i)
+ doc = create_doc(raw, sentences, i)
docs.append(doc)
+ raw = ""
sentences = []
if sentences:
doc = create_doc(sentences, i)
@@ -40,12 +47,12 @@ def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None, **_):
return docs
-def is_ner(tag):
+def is_ner(tag, tag_pattern):
"""
Check the 10th column of the first token to determine if the file contains
NER tags
"""
- tag_match = re.match("([A-Z_]+)-([A-Z_]+)", tag)
+ tag_match = re.search(tag_pattern, tag)
if tag_match:
return True
elif tag == "O":
@@ -63,9 +70,10 @@ def read_conllx(input_data, use_morphology=False, n=0):
while lines[0].startswith("#"):
lines.pop(0)
ids, words, tags, heads, deps, ents = [], [], [], [], [], []
+ spaces = []
for line in lines:
parts = line.split("\t")
- id_, word, lemma, pos, tag, morph, head, dep, _1, iob = parts
+ id_, word, lemma, pos, tag, morph, head, dep, _1, misc = parts
if "-" in id_ or "." in id_:
continue
try:
@@ -74,18 +82,27 @@ def read_conllx(input_data, use_morphology=False, n=0):
dep = "ROOT" if dep == "root" else dep
tag = pos if tag == "_" else tag
tag = tag + "__" + morph if use_morphology else tag
- iob = iob if iob else "O"
+ ent = misc if misc else "O"
ids.append(id_)
words.append(word)
tags.append(tag)
heads.append(head)
deps.append(dep)
- ents.append(iob)
+ ents.append(ent)
+ if "SpaceAfter=No" in misc:
+ spaces.append(False)
+ else:
+ spaces.append(True)
except: # noqa: E722
print(line)
raise
- example = Example(doc=None)
+ raw = ""
+ for word, space in zip(words, spaces):
+ raw += word
+ if space:
+ raw += " "
+ example = Example(doc=raw)
example.set_token_annotation(ids=ids, words=words, tags=tags,
heads=heads, deps=deps, entities=ents)
yield example
@@ -94,7 +111,7 @@ def read_conllx(input_data, use_morphology=False, n=0):
break
-def simplify_tags(iob):
+def simplify_tags(iob, tag_pattern):
"""
Simplify tags obtained from the dataset in order to follow Wikipedia
scheme (PER, LOC, ORG, MISC). 'PER', 'LOC' and 'ORG' keep their tags, while
@@ -103,26 +120,28 @@ def simplify_tags(iob):
"""
new_iob = []
for tag in iob:
- tag_match = re.match("([A-Z_]+)-([A-Z_]+)", tag)
+ tag_match = re.search(tag_pattern, tag)
+ new_tag = "O"
if tag_match:
- prefix = tag_match.group(1)
- suffix = tag_match.group(2)
- if suffix == "GPE_LOC":
- suffix = "LOC"
- elif suffix == "GPE_ORG":
- suffix = "ORG"
- elif suffix != "PER" and suffix != "LOC" and suffix != "ORG":
- suffix = "MISC"
- tag = prefix + "-" + suffix
- new_iob.append(tag)
+ prefix = tag_match.group(2)
+ suffix = tag_match.group(3)
+ if prefix and suffix:
+ if suffix == "GPE_LOC":
+ suffix = "LOC"
+ elif suffix == "GPE_ORG":
+ suffix = "ORG"
+ elif suffix != "PER" and suffix != "LOC" and suffix != "ORG":
+ suffix = "MISC"
+ new_tag = prefix + "-" + suffix
+ new_iob.append(new_tag)
return new_iob
-def generate_sentence(token_annotation, has_ner_tags):
+def generate_sentence(token_annotation, has_ner_tags, tag_pattern):
sentence = {}
tokens = []
if has_ner_tags:
- iob = simplify_tags(token_annotation.entities)
+ iob = simplify_tags(token_annotation.entities, tag_pattern)
biluo = iob_to_biluo(iob)
for i, id in enumerate(token_annotation.ids):
token = {}
@@ -138,11 +157,12 @@ def generate_sentence(token_annotation, has_ner_tags):
return sentence
-def create_doc(sentences, id):
+def create_doc(raw, sentences, id):
doc = {}
paragraph = {}
doc["id"] = id
doc["paragraphs"] = []
+ paragraph["raw"] = raw.strip()
paragraph["sentences"] = sentences
doc["paragraphs"].append(paragraph)
return doc
diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py
index 6dce649a9..2ce76b9ba 100644
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@@ -32,6 +32,32 @@ def test_cli_converters_conllu2json():
assert [t["ner"] for t in tokens] == ["O", "B-PER", "L-PER", "O"]
+def test_cli_converters_conllu2json():
+ # https://raw.githubusercontent.com/ohenrik/nb_news_ud_sm/master/original_data/no-ud-dev-ner.conllu
+ lines = [
+ "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tname=O",
+ "2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tSpaceAfter=No|name=B-PER",
+ "3\tEilertsen\tEilertsen\tPROPN\t_\t_\t2\tname\t_\tname=I-PER",
+ "4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tSpaceAfter=No|name=O",
+ "5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=O",
+ ]
+ input_data = "\n".join(lines)
+ converted = conllu2json(input_data, n_sents=1)
+ assert len(converted) == 1
+ assert converted[0]["id"] == 0
+ assert len(converted[0]["paragraphs"]) == 1
+ assert converted[0]["paragraphs"][0]["raw"] == "Dommer FinnEilertsen avstår."
+ assert len(converted[0]["paragraphs"][0]["sentences"]) == 1
+ sent = converted[0]["paragraphs"][0]["sentences"][0]
+ assert len(sent["tokens"]) == 5
+ tokens = sent["tokens"]
+ assert [t["orth"] for t in tokens] == ["Dommer", "Finn", "Eilertsen", "avstår", "."]
+ assert [t["tag"] for t in tokens] == ["NOUN", "PROPN", "PROPN", "VERB", "PUNCT"]
+ assert [t["head"] for t in tokens] == [1, 2, -1, 0, -1]
+ assert [t["dep"] for t in tokens] == ["appos", "nsubj", "name", "ROOT", "punct"]
+ assert [t["ner"] for t in tokens] == ["O", "B-PER", "L-PER", "O", "O"]
+
+
def test_cli_converters_iob2json():
lines = [
"I|O like|O London|I-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O",
@@ -106,7 +132,6 @@ def test_cli_converters_conll_ner2json():
]
input_data = "\n".join(lines)
converted = conll_ner2json(input_data, n_sents=10)
- print(converted)
assert len(converted) == 1
assert converted[0]["id"] == 0
assert len(converted[0]["paragraphs"]) == 1
From b841d3fe75f099b4bceba512c748f335211cab52 Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Thu, 28 Nov 2019 11:10:07 +0100
Subject: [PATCH 028/496] Add a tagger-based SentenceRecognizer (#4713)
* Add sent_starts to GoldParse
* Add SentTagger pipeline component
Add `SentTagger` pipeline component as a subclass of `Tagger` (a usage sketch follows the list of changes below).
* Model reduces default parameters from `Tagger` to be small and fast
* Hard-coded set of two labels:
* S (1): token at beginning of sentence
* I (0): all other sentence positions
* Sets `token.sent_start` values
* Add sentence segmentation to Scorer
Report `sent_p/r/f` for sentence boundaries, which may be provided by
various pipeline components.
* Add sentence segmentation to CLI evaluate
* Add senttagger metrics/scoring to train CLI
* Rename SentTagger to SentenceRecognizer
* Add SentenceRecognizer to spacy.pipes imports
* Add SentenceRecognizer serialization test
* Shorten component name to sentrec
* Remove duplicates from train CLI output metrics
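A minimal usage sketch, assuming a blank English pipeline and the `sentrec` factory registered as in pipes.pyx below; the component needs training before it predicts useful boundaries:

    import spacy

    nlp = spacy.blank("en")
    sentrec = nlp.create_pipe("sentrec")  # factory name registered via @component("sentrec", ...)
    nlp.add_pipe(sentrec)
    # Initializes the model (with random weights here); real training goes
    # through nlp.update() or the train CLI. Once trained, the component
    # sets token.is_sent_start without clobbering existing boundaries.
    optimizer = nlp.begin_training()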
---
spacy/cli/evaluate.py | 3 +
spacy/cli/train.py | 20 ++-
spacy/gold.pxd | 1 +
spacy/gold.pyx | 41 ++---
spacy/pipeline/__init__.py | 2 +
spacy/pipeline/pipes.pyx | 165 +++++++++++++++++-
spacy/scorer.py | 34 +++-
.../serialize/test_serialize_pipeline.py | 9 +-
8 files changed, 245 insertions(+), 30 deletions(-)
diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py
index a3193a5cf..da8a714a7 100644
--- a/spacy/cli/evaluate.py
+++ b/spacy/cli/evaluate.py
@@ -61,6 +61,9 @@ def evaluate(
"NER R": "%.2f" % scorer.ents_r,
"NER F": "%.2f" % scorer.ents_f,
"Textcat": "%.2f" % scorer.textcat_score,
+ "Sent P": "%.2f" % scorer.sent_p,
+ "Sent R": "%.2f" % scorer.sent_r,
+ "Sent F": "%.2f" % scorer.sent_f,
}
msg.table(results, title="Results")
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 645d1e4d4..8d37254a5 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -11,6 +11,7 @@ import srsly
from wasabi import msg
import contextlib
import random
+from collections import OrderedDict
from .._ml import create_default_optimizer
from ..attrs import PROB, IS_OOV, CLUSTER, LANG
@@ -585,11 +586,13 @@ def _find_best(experiment_dir, component):
def _get_metrics(component):
if component == "parser":
- return ("las", "uas", "token_acc")
+ return ("las", "uas", "token_acc", "sent_f")
elif component == "tagger":
return ("tags_acc",)
elif component == "ner":
return ("ents_f", "ents_p", "ents_r")
+ elif component == "sentrec":
+ return ("sent_p", "sent_r", "sent_f",)
return ("token_acc",)
@@ -601,14 +604,17 @@ def _configure_training_output(pipeline, use_gpu, has_beam_widths):
row_head.extend(["Tag Loss ", " Tag % "])
output_stats.extend(["tag_loss", "tags_acc"])
elif pipe == "parser":
- row_head.extend(["Dep Loss ", " UAS ", " LAS "])
- output_stats.extend(["dep_loss", "uas", "las"])
+ row_head.extend(["Dep Loss ", " UAS ", " LAS ", "Sent P", "Sent R", "Sent F"])
+ output_stats.extend(["dep_loss", "uas", "las", "sent_p", "sent_r", "sent_f"])
elif pipe == "ner":
row_head.extend(["NER Loss ", "NER P ", "NER R ", "NER F "])
output_stats.extend(["ner_loss", "ents_p", "ents_r", "ents_f"])
elif pipe == "textcat":
row_head.extend(["Textcat Loss", "Textcat"])
output_stats.extend(["textcat_loss", "textcat_score"])
+ elif pipe == "sentrec":
+ row_head.extend(["Sentrec Loss", "Sent P", "Sent R", "Sent F"])
+ output_stats.extend(["sentrec_loss", "sent_p", "sent_r", "sent_f"])
row_head.extend(["Token %", "CPU WPS"])
output_stats.extend(["token_acc", "cpu_wps"])
@@ -618,7 +624,12 @@ def _configure_training_output(pipeline, use_gpu, has_beam_widths):
if has_beam_widths:
row_head.insert(1, "Beam W.")
- return row_head, output_stats
+ # remove duplicates
+ row_head_dict = OrderedDict()
+ row_head_dict.update({k: 1 for k in row_head})
+ output_stats_dict = OrderedDict()
+ output_stats_dict.update({k: 1 for k in output_stats})
+ return row_head_dict.keys(), output_stats_dict.keys()
def _get_progress(
@@ -631,6 +642,7 @@ def _get_progress(
scores["ner_loss"] = losses.get("ner", 0.0)
scores["tag_loss"] = losses.get("tagger", 0.0)
scores["textcat_loss"] = losses.get("textcat", 0.0)
+ scores["sentrec_loss"] = losses.get("sentrec", 0.0)
scores["cpu_wps"] = cpu_wps
scores["gpu_wps"] = gpu_wps or 0.0
scores.update(dev_scores)
diff --git a/spacy/gold.pxd b/spacy/gold.pxd
index 247ff8aa1..525aa2473 100644
--- a/spacy/gold.pxd
+++ b/spacy/gold.pxd
@@ -26,6 +26,7 @@ cdef class GoldParse:
cdef public list words
cdef public list tags
cdef public list morphs
+ cdef public list sent_starts
cdef public list heads
cdef public list labels
cdef public dict orths
diff --git a/spacy/gold.pyx b/spacy/gold.pyx
index f2f127438..a7c0f1d8d 100644
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -497,9 +497,9 @@ def json_to_examples(doc):
ner.append(token.get("ner", "-"))
morphs.append(token.get("morph", {}))
if i == 0:
- sent_starts.append(True)
+ sent_starts.append(1)
else:
- sent_starts.append(False)
+ sent_starts.append(0)
if "brackets" in sent:
brackets.extend((b["first"] + sent_start_i,
b["last"] + sent_start_i, b["label"])
@@ -759,7 +759,7 @@ cdef class Example:
t = self.token_annotation
split_examples = []
for i in range(len(t.words)):
- if i > 0 and t.sent_starts[i] == True:
+ if i > 0 and t.sent_starts[i] == 1:
s_example.set_token_annotation(ids=s_ids,
words=s_words, tags=s_tags, heads=s_heads, deps=s_deps,
entities=s_ents, morphs=s_morphs,
@@ -892,6 +892,7 @@ cdef class GoldParse:
deps=token_annotation.deps,
entities=token_annotation.entities,
morphs=token_annotation.morphs,
+ sent_starts=token_annotation.sent_starts,
cats=doc_annotation.cats,
links=doc_annotation.links,
make_projective=make_projective)
@@ -902,12 +903,13 @@ cdef class GoldParse:
ids = list(range(len(self.words)))
return TokenAnnotation(ids=ids, words=self.words, tags=self.tags,
- heads=self.heads, deps=self.labels, entities=self.ner,
- morphs=self.morphs)
+ heads=self.heads, deps=self.labels,
+ entities=self.ner, morphs=self.morphs,
+ sent_starts=self.sent_starts)
def __init__(self, doc, words=None, tags=None, morphs=None,
- heads=None, deps=None, entities=None, make_projective=False,
- cats=None, links=None):
+ heads=None, deps=None, entities=None, sent_starts=None,
+ make_projective=False, cats=None, links=None):
"""Create a GoldParse. The fields will not be initialized if len(doc) is zero.
doc (Doc): The document the annotations refer to.
@@ -920,6 +922,8 @@ cdef class GoldParse:
entities (iterable): A sequence of named entity annotations, either as
BILUO tag strings, or as `(start_char, end_char, label)` tuples,
representing the entity positions.
+ sent_starts (iterable): A sequence of sentence position tags, 1 for
+ the first word in a sentence, 0 for all others.
cats (dict): Labels for text classification. Each key in the dictionary
may be a string or an int, or a `(start_char, end_char, label)`
tuple, indicating that the label is applied to only part of the
@@ -956,6 +960,8 @@ cdef class GoldParse:
deps = [None for _ in words]
if not morphs:
morphs = [None for _ in words]
+ if not sent_starts:
+ sent_starts = [None for _ in words]
if entities is None:
entities = ["-" for _ in words]
elif len(entities) == 0:
@@ -982,6 +988,7 @@ cdef class GoldParse:
self.labels = [None] * len(doc)
self.ner = [None] * len(doc)
self.morphs = [None] * len(doc)
+ self.sent_starts = [None] * len(doc)
# This needs to be done before we align the words
if make_projective and heads is not None and deps is not None:
@@ -1000,7 +1007,7 @@ cdef class GoldParse:
self.gold_to_cand = [(i if i >= 0 else None) for i in j2i]
self.orig = TokenAnnotation(ids=list(range(len(words))), words=words, tags=tags,
- heads=heads, deps=deps, entities=entities, morphs=morphs,
+ heads=heads, deps=deps, entities=entities, morphs=morphs, sent_starts=sent_starts,
brackets=[])
for i, gold_i in enumerate(self.cand_to_gold):
@@ -1011,11 +1018,13 @@ cdef class GoldParse:
self.labels[i] = None
self.ner[i] = None
self.morphs[i] = set()
+ self.sent_starts[i] = 0
if gold_i is None:
if i in i2j_multi:
self.words[i] = words[i2j_multi[i]]
self.tags[i] = tags[i2j_multi[i]]
self.morphs[i] = morphs[i2j_multi[i]]
+ self.sent_starts[i] = sent_starts[i2j_multi[i]]
is_last = i2j_multi[i] != i2j_multi.get(i+1)
is_first = i2j_multi[i] != i2j_multi.get(i-1)
# Set next word in multi-token span as head, until last
@@ -1055,6 +1064,7 @@ cdef class GoldParse:
self.words[i] = words[gold_i]
self.tags[i] = tags[gold_i]
self.morphs[i] = morphs[gold_i]
+ self.sent_starts[i] = sent_starts[gold_i]
if heads[gold_i] is None:
self.heads[i] = None
else:
@@ -1091,21 +1101,6 @@ cdef class GoldParse:
"""
return not nonproj.is_nonproj_tree(self.heads)
- property sent_starts:
- def __get__(self):
- return [self.c.sent_start[i] for i in range(self.length)]
-
- def __set__(self, sent_starts):
- for gold_i, is_sent_start in enumerate(sent_starts):
- i = self.gold_to_cand[gold_i]
- if i is not None:
- if is_sent_start in (1, True):
- self.c.sent_start[i] = 1
- elif is_sent_start in (-1, False):
- self.c.sent_start[i] = -1
- else:
- self.c.sent_start[i] = 0
-
def docs_to_json(docs, id=0):
"""Convert a list of Doc objects into the JSON-serializable format used by
diff --git a/spacy/pipeline/__init__.py b/spacy/pipeline/__init__.py
index 2f30fbbee..de8403152 100644
--- a/spacy/pipeline/__init__.py
+++ b/spacy/pipeline/__init__.py
@@ -3,6 +3,7 @@ from __future__ import unicode_literals
from .pipes import Tagger, DependencyParser, EntityRecognizer, EntityLinker
from .pipes import TextCategorizer, Tensorizer, Pipe, Sentencizer
+from .pipes import SentenceRecognizer
from .morphologizer import Morphologizer
from .entityruler import EntityRuler
from .hooks import SentenceSegmenter, SimilarityHook
@@ -20,6 +21,7 @@ __all__ = [
"EntityRuler",
"Sentencizer",
"SentenceSegmenter",
+ "SentenceRecognizer",
"SimilarityHook",
"merge_entities",
"merge_noun_chunks",
diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index 56a00e33b..110839acd 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -705,6 +705,169 @@ class Tagger(Pipe):
return self
+@component("sentrec", assigns=["token.is_sent_start"])
+class SentenceRecognizer(Tagger):
+ """Pipeline component for sentence segmentation.
+
+ DOCS: https://spacy.io/api/sentencerecognizer
+ """
+
+ def __init__(self, vocab, model=True, **cfg):
+ self.vocab = vocab
+ self.model = model
+ self._rehearsal_model = None
+ self.cfg = OrderedDict(sorted(cfg.items()))
+ self.cfg.setdefault("cnn_maxout_pieces", 2)
+ self.cfg.setdefault("subword_features", True)
+ self.cfg.setdefault("token_vector_width", 12)
+ self.cfg.setdefault("conv_depth", 1)
+ self.cfg.setdefault("pretrained_vectors", None)
+
+ @property
+ def labels(self):
+ # labels are numbered by index internally, so this matches GoldParse
+ # and Example where the sentence-initial tag is 1 and other positions
+ # are 0
+ return tuple(["I", "S"])
+
+ def set_annotations(self, docs, batch_tag_ids, **_):
+ if isinstance(docs, Doc):
+ docs = [docs]
+ cdef Doc doc
+ for i, doc in enumerate(docs):
+ doc_tag_ids = batch_tag_ids[i]
+ if hasattr(doc_tag_ids, "get"):
+ doc_tag_ids = doc_tag_ids.get()
+ for j, tag_id in enumerate(doc_tag_ids):
+ # Don't clobber existing sentence boundaries
+ if doc.c[j].sent_start == 0:
+ if tag_id == 1:
+ doc.c[j].sent_start = 1
+ else:
+ doc.c[j].sent_start = -1
+
+ def update(self, examples, drop=0., sgd=None, losses=None):
+ self.require_model()
+ examples = Example.to_example_objects(examples)
+ if losses is not None and self.name not in losses:
+ losses[self.name] = 0.
+
+ if not any(len(ex.doc) if ex.doc else 0 for ex in examples):
+ # Handle cases where there are no tokens in any docs.
+ return
+
+ tag_scores, bp_tag_scores = self.model.begin_update([ex.doc for ex in examples], drop=drop)
+ loss, d_tag_scores = self.get_loss(examples, tag_scores)
+ bp_tag_scores(d_tag_scores, sgd=sgd)
+
+ if losses is not None:
+ losses[self.name] += loss
+
+ def get_loss(self, examples, scores):
+ scores = self.model.ops.flatten(scores)
+ tag_index = range(len(self.labels))
+ cdef int idx = 0
+ correct = numpy.zeros((scores.shape[0],), dtype="i")
+ guesses = scores.argmax(axis=1)
+ known_labels = numpy.ones((scores.shape[0], 1), dtype="f")
+ for ex in examples:
+ gold = ex.gold
+ for sent_start in gold.sent_starts:
+ if sent_start is None:
+ correct[idx] = guesses[idx]
+ elif sent_start in tag_index:
+ correct[idx] = sent_start
+ else:
+ correct[idx] = 0
+ known_labels[idx] = 0.
+ idx += 1
+ correct = self.model.ops.xp.array(correct, dtype="i")
+ d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])
+ d_scores *= self.model.ops.asarray(known_labels)
+ loss = (d_scores**2).sum()
+ docs = [ex.doc for ex in examples]
+ d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
+ return float(loss), d_scores
+
+ def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None,
+ **kwargs):
+ cdef Vocab vocab = self.vocab
+ if self.model is True:
+ for hp in ["token_vector_width", "conv_depth"]:
+ if hp in kwargs:
+ self.cfg[hp] = kwargs[hp]
+ self.model = self.Model(len(self.labels), **self.cfg)
+ if sgd is None:
+ sgd = self.create_optimizer()
+ return sgd
+
+ @classmethod
+ def Model(cls, n_tags, **cfg):
+ return build_tagger_model(n_tags, **cfg)
+
+ def add_label(self, label, values=None):
+ raise NotImplementedError
+
+ def use_params(self, params):
+ with self.model.use_params(params):
+ yield
+
+ def to_bytes(self, exclude=tuple(), **kwargs):
+ serialize = OrderedDict()
+ if self.model not in (None, True, False):
+ serialize["model"] = self.model.to_bytes
+ serialize["vocab"] = self.vocab.to_bytes
+ serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
+ exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
+ return util.to_bytes(serialize, exclude)
+
+ def from_bytes(self, bytes_data, exclude=tuple(), **kwargs):
+ def load_model(b):
+ if self.model is True:
+ self.model = self.Model(len(self.labels), **self.cfg)
+ try:
+ self.model.from_bytes(b)
+ except AttributeError:
+ raise ValueError(Errors.E149)
+
+ deserialize = OrderedDict((
+ ("vocab", lambda b: self.vocab.from_bytes(b)),
+ ("cfg", lambda b: self.cfg.update(srsly.json_loads(b))),
+ ("model", lambda b: load_model(b)),
+ ))
+ exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
+ util.from_bytes(bytes_data, deserialize, exclude)
+ return self
+
+ def to_disk(self, path, exclude=tuple(), **kwargs):
+ serialize = OrderedDict((
+ ("vocab", lambda p: self.vocab.to_disk(p)),
+ ("model", lambda p: p.open("wb").write(self.model.to_bytes())),
+ ("cfg", lambda p: srsly.write_json(p, self.cfg))
+ ))
+ exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
+ util.to_disk(path, serialize, exclude)
+
+ def from_disk(self, path, exclude=tuple(), **kwargs):
+ def load_model(p):
+ if self.model is True:
+ self.model = self.Model(len(self.labels), **self.cfg)
+ with p.open("rb") as file_:
+ try:
+ self.model.from_bytes(file_.read())
+ except AttributeError:
+ raise ValueError(Errors.E149)
+
+ deserialize = OrderedDict((
+ ("cfg", lambda p: self.cfg.update(_load_cfg(p))),
+ ("vocab", lambda p: self.vocab.from_disk(p)),
+ ("model", load_model),
+ ))
+ exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
+ util.from_disk(path, deserialize, exclude)
+ return self
+
+
@component("nn_labeller")
class MultitaskObjective(Tagger):
"""Experimental: Assist training of a parser or tagger, by training a
@@ -1589,4 +1752,4 @@ Language.factories["parser"] = lambda nlp, **cfg: DependencyParser.from_nlp(nlp,
Language.factories["ner"] = lambda nlp, **cfg: EntityRecognizer.from_nlp(nlp, **cfg)
-__all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "Tensorizer", "TextCategorizer", "EntityLinker", "Sentencizer"]
+__all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "Tensorizer", "TextCategorizer", "EntityLinker", "Sentencizer", "SentenceRecognizer"]
diff --git a/spacy/scorer.py b/spacy/scorer.py
index 723259acd..d2878da1a 100644
--- a/spacy/scorer.py
+++ b/spacy/scorer.py
@@ -84,6 +84,7 @@ class Scorer(object):
self.labelled = PRFScore()
self.labelled_per_dep = dict()
self.tags = PRFScore()
+ self.sent_starts = PRFScore()
self.ner = PRFScore()
self.ner_per_ents = dict()
self.eval_punct = eval_punct
@@ -113,6 +114,27 @@ class Scorer(object):
"""
return self.tags.fscore * 100
+ @property
+ def sent_p(self):
+ """RETURNS (float): F-score for identification of sentence starts.
+ i.e. `Token.is_sent_start`).
+ """
+ return self.sent_starts.precision * 100
+
+ @property
+ def sent_r(self):
+ """RETURNS (float): F-score for identification of sentence starts.
+ i.e. `Token.is_sent_start`).
+ """
+ return self.sent_starts.recall * 100
+
+ @property
+ def sent_f(self):
+ """RETURNS (float): F-score for identification of sentence starts.
+ i.e. `Token.is_sent_start`).
+ """
+ return self.sent_starts.fscore * 100
+
@property
def token_acc(self):
"""RETURNS (float): Tokenization accuracy."""
@@ -212,6 +234,9 @@ class Scorer(object):
"ents_f": self.ents_f,
"ents_per_type": self.ents_per_type,
"tags_acc": self.tags_acc,
+ "sent_p": self.sent_p,
+ "sent_r": self.sent_r,
+ "sent_f": self.sent_f,
"token_acc": self.token_acc,
"textcat_score": self.textcat_score,
"textcats_per_cat": self.textcats_per_cat,
@@ -242,9 +267,12 @@ class Scorer(object):
gold_deps = set()
gold_deps_per_dep = {}
gold_tags = set()
+ gold_sent_starts = set()
gold_ents = set(tags_to_entities(orig.entities))
- for id_, tag, head, dep in zip(orig.ids, orig.tags, orig.heads, orig.deps):
+ for id_, tag, head, dep, sent_start in zip(orig.ids, orig.tags, orig.heads, orig.deps, orig.sent_starts):
gold_tags.add((id_, tag))
+ if sent_start:
+ gold_sent_starts.add(id_)
if dep not in (None, "") and dep.lower() not in punct_labels:
gold_deps.add((id_, head, dep.lower()))
if dep.lower() not in self.labelled_per_dep:
@@ -255,6 +283,7 @@ class Scorer(object):
cand_deps = set()
cand_deps_per_dep = {}
cand_tags = set()
+ cand_sent_starts = set()
for token in doc:
if token.orth_.isspace():
continue
@@ -264,6 +293,8 @@ class Scorer(object):
else:
self.tokens.tp += 1
cand_tags.add((gold_i, token.tag_))
+ if token.is_sent_start:
+ cand_sent_starts.add(gold_i)
if token.dep_.lower() not in punct_labels and token.orth_.strip():
gold_head = gold.cand_to_gold[token.head.i]
# None is indistinct, so we can't just add it to the set
@@ -308,6 +339,7 @@ class Scorer(object):
# Score for all ents
self.ner.score_set(cand_ents, gold_ents)
self.tags.score_set(cand_tags, gold_tags)
+ self.sent_starts.score_set(cand_sent_starts, gold_sent_starts)
self.labelled.score_set(cand_deps, gold_deps)
for dep in self.labelled_per_dep:
self.labelled_per_dep[dep].score_set(cand_deps_per_dep.get(dep, set()), gold_deps_per_dep.get(dep, set()))
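For reference (not part of the patch), the new metrics are reachable both as properties and through the `scores` dict; assuming the default constructor arguments, a freshly built `Scorer` simply reports 0.0 until `score()` has been fed evaluation data:

```python
from spacy.scorer import Scorer

scorer = Scorer()
print(scorer.sent_p, scorer.sent_r, scorer.sent_f)  # 0.0 0.0 0.0 before scoring
print({k: scorer.scores[k] for k in ("sent_p", "sent_r", "sent_f")})
```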
diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py
index efa7ef625..797fa95f8 100644
--- a/spacy/tests/serialize/test_serialize_pipeline.py
+++ b/spacy/tests/serialize/test_serialize_pipeline.py
@@ -3,7 +3,7 @@ from __future__ import unicode_literals
import pytest
from spacy.pipeline import Tagger, DependencyParser, EntityRecognizer
-from spacy.pipeline import Tensorizer, TextCategorizer
+from spacy.pipeline import Tensorizer, TextCategorizer, SentenceRecognizer
from ..util import make_tempdir
@@ -144,3 +144,10 @@ def test_serialize_pipe_exclude(en_vocab, Parser):
parser.to_bytes(cfg=False, exclude=["vocab"])
with pytest.raises(ValueError):
get_new_parser().from_bytes(parser.to_bytes(exclude=["vocab"]), cfg=False)
+
+
+def test_serialize_sentencerecognizer(en_vocab):
+ sr = SentenceRecognizer(en_vocab)
+ sr_b = sr.to_bytes()
+ sr_d = SentenceRecognizer(en_vocab).from_bytes(sr_b)
+ assert sr.to_bytes() == sr_d.to_bytes()
From 79ba1a3b921f76c80dadc77a8d0a01bc630b3721 Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Thu, 28 Nov 2019 14:53:44 +0100
Subject: [PATCH 029/496] Add lemmas to GoldParse / Example / docs_to_json
(#4726)
---
spacy/gold.pxd | 2 ++
spacy/gold.pyx | 52 +++++++++++++++++++++++++++-------------
spacy/tests/test_gold.py | 6 +++++
3 files changed, 44 insertions(+), 16 deletions(-)
diff --git a/spacy/gold.pxd b/spacy/gold.pxd
index 525aa2473..5f0b49c9f 100644
--- a/spacy/gold.pxd
+++ b/spacy/gold.pxd
@@ -26,6 +26,7 @@ cdef class GoldParse:
cdef public list words
cdef public list tags
cdef public list morphs
+ cdef public list lemmas
cdef public list sent_starts
cdef public list heads
cdef public list labels
@@ -47,6 +48,7 @@ cdef class TokenAnnotation:
cdef public list deps
cdef public list entities
cdef public list morphs
+ cdef public list lemmas
cdef public list sent_starts
cdef public list brackets
diff --git a/spacy/gold.pyx b/spacy/gold.pyx
index a7c0f1d8d..00ae7c5e8 100644
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -481,6 +481,7 @@ def json_to_examples(doc):
labels = []
ner = []
morphs = []
+ lemmas = []
sent_starts = []
brackets = []
for sent in paragraph["sentences"]:
@@ -496,6 +497,7 @@ def json_to_examples(doc):
labels[-1] = "ROOT"
ner.append(token.get("ner", "-"))
morphs.append(token.get("morph", {}))
+ lemmas.append(token.get("lemma", ""))
if i == 0:
sent_starts.append(1)
else:
@@ -509,7 +511,7 @@ def json_to_examples(doc):
cats[cat["label"]] = cat["value"]
example.set_token_annotation(ids=ids, words=words, tags=tags,
heads=heads, deps=labels, entities=ner, morphs=morphs,
- sent_starts=sent_starts, brackets=brackets)
+ lemmas=lemmas, sent_starts=sent_starts, brackets=brackets)
example.set_doc_annotation(cats=cats)
yield example
@@ -618,7 +620,9 @@ def _consume_ent(tags):
cdef class TokenAnnotation:
- def __init__(self, ids=None, words=None, tags=None, heads=None, deps=None, entities=None, morphs=None, sent_starts=None, brackets=None):
+ def __init__(self, ids=None, words=None, tags=None, heads=None, deps=None,
+ entities=None, morphs=None, lemmas=None, sent_starts=None,
+ brackets=None):
self.ids = ids if ids else []
self.words = words if words else []
self.tags = tags if tags else []
@@ -626,6 +630,7 @@ cdef class TokenAnnotation:
self.deps = deps if deps else []
self.entities = entities if entities else []
self.morphs = morphs if morphs else []
+ self.lemmas = lemmas if lemmas else []
self.sent_starts = sent_starts if sent_starts else []
self.brackets = brackets if brackets else []
@@ -638,6 +643,7 @@ cdef class TokenAnnotation:
deps=token_dict.get("deps", None),
entities=token_dict.get("entities", None),
morphs=token_dict.get("morphs", None),
+ lemmas=token_dict.get("lemmas", None),
sent_starts=token_dict.get("sent_starts", None),
brackets=token_dict.get("brackets", None))
@@ -649,6 +655,7 @@ cdef class TokenAnnotation:
"deps": self.deps,
"entities": self.entities,
"morphs": self.morphs,
+ "lemmas": self.lemmas,
"sent_starts": self.sent_starts,
"brackets": self.brackets}
@@ -673,6 +680,9 @@ cdef class TokenAnnotation:
def get_morph(self, i):
return self.morphs[i] if i < len(self.morphs) else set()
+ def get_lemma(self, i):
+ return self.lemmas[i] if i < len(self.lemmas) else ""
+
def get_sent_start(self, i):
return self.sent_starts[i] if i < len(self.sent_starts) else None
@@ -735,12 +745,12 @@ cdef class Example:
return self.goldparse
def set_token_annotation(self, ids=None, words=None, tags=None, heads=None,
- deps=None, entities=None, morphs=None,
+ deps=None, entities=None, morphs=None, lemmas=None,
sent_starts=None, brackets=None):
self.token_annotation = TokenAnnotation(ids=ids, words=words, tags=tags,
heads=heads, deps=deps, entities=entities,
- morphs=morphs, sent_starts=sent_starts,
- brackets=brackets)
+ morphs=morphs, lemmas=lemmas,
+ sent_starts=sent_starts, brackets=brackets)
def set_doc_annotation(self, cats=None, links=None):
if cats:
@@ -753,7 +763,7 @@ cdef class Example:
sent_starts and return a list of the new Examples"""
s_example = Example(doc=None, doc_annotation=self.doc_annotation)
s_ids, s_words, s_tags, s_heads = [], [], [], []
- s_deps, s_ents, s_morphs, s_sent_starts = [], [], [], []
+ s_deps, s_ents, s_morphs, s_lemmas, s_sent_starts = [], [], [], [], []
s_brackets = []
sent_start_i = 0
t = self.token_annotation
@@ -762,13 +772,13 @@ cdef class Example:
if i > 0 and t.sent_starts[i] == 1:
s_example.set_token_annotation(ids=s_ids,
words=s_words, tags=s_tags, heads=s_heads, deps=s_deps,
- entities=s_ents, morphs=s_morphs,
+ entities=s_ents, morphs=s_morphs, lemmas=s_lemmas,
sent_starts=s_sent_starts, brackets=s_brackets)
split_examples.append(s_example)
s_example = Example(doc=None, doc_annotation=self.doc_annotation)
s_ids, s_words, s_tags, s_heads = [], [], [], []
- s_deps, s_ents, s_morphs, s_sent_starts = [], [], [], []
- s_brackets = []
+ s_deps, s_ents, s_morphs, s_lemmas = [], [], [], []
+ s_sent_starts, s_brackets = [], []
sent_start_i = i
s_ids.append(t.get_id(i))
s_words.append(t.get_word(i))
@@ -777,6 +787,7 @@ cdef class Example:
s_deps.append(t.get_dep(i))
s_ents.append(t.get_entity(i))
s_morphs.append(t.get_morph(i))
+ s_lemmas.append(t.get_lemma(i))
s_sent_starts.append(t.get_sent_start(i))
s_brackets.extend((b[0] - sent_start_i,
b[1] - sent_start_i, b[2])
@@ -784,7 +795,7 @@ cdef class Example:
i += 1
s_example.set_token_annotation(ids=s_ids, words=s_words, tags=s_tags,
heads=s_heads, deps=s_deps, entities=s_ents,
- morphs=s_morphs, sent_starts=s_sent_starts,
+ morphs=s_morphs, lemmas=s_lemmas, sent_starts=s_sent_starts,
brackets=s_brackets)
split_examples.append(s_example)
return split_examples
@@ -892,6 +903,7 @@ cdef class GoldParse:
deps=token_annotation.deps,
entities=token_annotation.entities,
morphs=token_annotation.morphs,
+ lemmas=token_annotation.lemmas,
sent_starts=token_annotation.sent_starts,
cats=doc_annotation.cats,
links=doc_annotation.links,
@@ -905,10 +917,10 @@ cdef class GoldParse:
return TokenAnnotation(ids=ids, words=self.words, tags=self.tags,
heads=self.heads, deps=self.labels,
entities=self.ner, morphs=self.morphs,
- sent_starts=self.sent_starts)
+ sent_starts=self.sent_starts, lemmas=self.lemmas)
- def __init__(self, doc, words=None, tags=None, morphs=None,
- heads=None, deps=None, entities=None, sent_starts=None,
+ def __init__(self, doc, words=None, tags=None, morphs=None, lemmas=None,
+ sent_starts=None, heads=None, deps=None, entities=None,
make_projective=False, cats=None, links=None):
"""Create a GoldParse. The fields will not be initialized if len(doc) is zero.
@@ -960,6 +972,8 @@ cdef class GoldParse:
deps = [None for _ in words]
if not morphs:
morphs = [None for _ in words]
+ if not lemmas:
+ lemmas = [None for _ in words]
if not sent_starts:
sent_starts = [None for _ in words]
if entities is None:
@@ -988,6 +1002,7 @@ cdef class GoldParse:
self.labels = [None] * len(doc)
self.ner = [None] * len(doc)
self.morphs = [None] * len(doc)
+ self.lemmas = [None] * len(doc)
self.sent_starts = [None] * len(doc)
# This needs to be done before we align the words
@@ -1006,9 +1021,10 @@ cdef class GoldParse:
self.cand_to_gold = [(j if j >= 0 else None) for j in i2j]
self.gold_to_cand = [(i if i >= 0 else None) for i in j2i]
- self.orig = TokenAnnotation(ids=list(range(len(words))), words=words, tags=tags,
- heads=heads, deps=deps, entities=entities, morphs=morphs, sent_starts=sent_starts,
- brackets=[])
+ self.orig = TokenAnnotation(ids=list(range(len(words))),
+ words=words, tags=tags, heads=heads, deps=deps,
+ entities=entities, morphs=morphs, lemmas=lemmas,
+ sent_starts=sent_starts, brackets=[])
for i, gold_i in enumerate(self.cand_to_gold):
if doc[i].text.isspace():
@@ -1018,12 +1034,14 @@ cdef class GoldParse:
self.labels[i] = None
self.ner[i] = None
self.morphs[i] = set()
+ self.lemmas[i] = None
self.sent_starts[i] = 0
if gold_i is None:
if i in i2j_multi:
self.words[i] = words[i2j_multi[i]]
self.tags[i] = tags[i2j_multi[i]]
self.morphs[i] = morphs[i2j_multi[i]]
+ self.lemmas[i] = lemmas[i2j_multi[i]]
self.sent_starts[i] = sent_starts[i2j_multi[i]]
is_last = i2j_multi[i] != i2j_multi.get(i+1)
is_first = i2j_multi[i] != i2j_multi.get(i-1)
@@ -1064,6 +1082,7 @@ cdef class GoldParse:
self.words[i] = words[gold_i]
self.tags[i] = tags[gold_i]
self.morphs[i] = morphs[gold_i]
+ self.lemmas[i] = lemmas[gold_i]
self.sent_starts[i] = sent_starts[gold_i]
if heads[gold_i] is None:
self.heads[i] = None
@@ -1125,6 +1144,7 @@ def docs_to_json(docs, id=0):
json_sent = {"tokens": [], "brackets": []}
for token in sent:
json_token = {"id": token.i, "orth": token.text}
+ json_token["lemma"] = token.lemma_
if doc.is_tagged:
json_token["tag"] = token.tag_
if doc.is_parsed:
diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py
index 639d98859..9d644d062 100644
--- a/spacy/tests/test_gold.py
+++ b/spacy/tests/test_gold.py
@@ -21,6 +21,7 @@ def doc():
# head of '.' is intentionally nonprojective for testing
heads = [2, 0, 3, 3, 3, 6, 4, 3, 7, 5]
deps = ['poss', 'case', 'nsubj', 'ROOT', 'prep', 'compound', 'pobj', 'prep', 'pobj', 'punct']
+ lemmas = ['Sarah', "'s", 'sister', 'fly', 'to', 'Silicon', 'Valley', 'via', 'London', '.']
biluo_tags = ["U-PERSON", "O", "O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
cats = {"TRAVEL": 1.0, "BAKING": 0.0}
nlp = English()
@@ -29,6 +30,7 @@ def doc():
doc[i].tag_ = tags[i]
doc[i].dep_ = deps[i]
doc[i].head = doc[heads[i]]
+ doc[i].lemma_ = lemmas[i]
doc.ents = spans_from_biluo_tags(doc, biluo_tags)
doc.cats = cats
doc.is_tagged = True
@@ -138,6 +140,7 @@ def test_roundtrip_docs_to_json(doc):
tags = [t.tag_ for t in doc]
deps = [t.dep_ for t in doc]
heads = [t.head.i for t in doc]
+ lemmas = [t.lemma_ for t in doc]
biluo_tags = iob_to_biluo([t.ent_iob_ + "-" + t.ent_type_ if t.ent_type_ else "O" for t in doc])
cats = doc.cats
@@ -155,6 +158,7 @@ def test_roundtrip_docs_to_json(doc):
assert tags == goldparse.tags
assert deps == goldparse.labels
assert heads == goldparse.heads
+ assert lemmas == goldparse.lemmas
assert biluo_tags == goldparse.ner
assert "TRAVEL" in goldparse.cats
assert "BAKING" in goldparse.cats
@@ -175,6 +179,7 @@ def test_roundtrip_docs_to_json(doc):
assert tags == goldparse.tags
assert deps == goldparse.labels
assert heads == goldparse.heads
+ assert lemmas == goldparse.lemmas
assert biluo_tags == goldparse.ner
assert "TRAVEL" in goldparse.cats
assert "BAKING" in goldparse.cats
@@ -199,6 +204,7 @@ def test_roundtrip_docs_to_json(doc):
assert tags == goldparse.tags
assert deps == goldparse.labels
assert heads == goldparse.heads
+ assert lemmas == goldparse.lemmas
assert biluo_tags == goldparse.ner
assert "TRAVEL" in goldparse.cats
assert "BAKING" in goldparse.cats
From 68f711b4097d027513380421565225b73bfcf907 Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Fri, 29 Nov 2019 10:22:03 +0100
Subject: [PATCH 030/496] Fix conllu2json n_sents and raw text (#4728)
Update conllu2json converter to include raw text in final batch.
---
spacy/cli/converters/conllu2json.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/spacy/cli/converters/conllu2json.py b/spacy/cli/converters/conllu2json.py
index 7fa491b9d..dc68efef4 100644
--- a/spacy/cli/converters/conllu2json.py
+++ b/spacy/cli/converters/conllu2json.py
@@ -42,7 +42,7 @@ def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None, **_):
raw = ""
sentences = []
if sentences:
- doc = create_doc(sentences, i)
+ doc = create_doc(raw, sentences, i)
docs.append(doc)
return docs
From eb9b1858c4c218a74c58a806334b6b237e144bb8 Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Wed, 11 Dec 2019 18:20:49 +0100
Subject: [PATCH 031/496] Add NER map option to convert CLI (#4763)
Instead of a hard-coded NER tag simplification function that was only
intended for NorNE, map NER tags in the CoNLL-U converter using a dict
provided as JSON via a command-line option.
Map NER entity types to new types, or to "" for 'O', e.g.:
```
{"PER": "PERSON", "BAD": ""}
=>
B-PER -> B-PERSON
B-BAD -> O
```
---
spacy/cli/convert.py | 6 ++++++
spacy/cli/converters/conllu2json.py | 30 ++++++++++++++++++-----------
spacy/tests/test_cli.py | 11 +++++------
3 files changed, 30 insertions(+), 17 deletions(-)
diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py
index fa867fa04..0cc0693a8 100644
--- a/spacy/cli/convert.py
+++ b/spacy/cli/convert.py
@@ -39,6 +39,7 @@ FILE_TYPES_STDOUT = ("json", "jsonl")
converter=("Converter: {}".format(tuple(CONVERTERS.keys())), "option", "c", str),
lang=("Language (if tokenizer required)", "option", "l", str),
morphology=("Enable appending morphology to tags", "flag", "m", bool),
+ ner_map_path=("NER tag mapping (as JSON-encoded dict of entity types)", "option", "N", Path),
)
def convert(
input_file,
@@ -49,6 +50,7 @@ def convert(
model=None,
morphology=False,
converter="auto",
+ ner_map_path=None,
lang=None,
):
"""
@@ -94,6 +96,9 @@ def convert(
)
if converter not in CONVERTERS:
msg.fail("Can't find converter for {}".format(converter), exits=1)
+ ner_map = None
+ if ner_map_path is not None:
+ ner_map = srsly.read_json(ner_map_path)
# Use converter function to convert data
func = CONVERTERS[converter]
data = func(
@@ -104,6 +109,7 @@ def convert(
lang=lang,
model=model,
no_print=no_print,
+ ner_map=ner_map,
)
if output_dir != "-":
# Export data to a file
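As a sketch (not part of the patch), the file read by `srsly.read_json()` above is just a flat dict of entity type mappings, with the empty string collapsing a type to `O`; the file name here is hypothetical and the values mirror the test case further down:

```python
import srsly

srsly.write_json("ner_map.json", {"PER": "PERSON", "BAD": ""})
print(srsly.read_json("ner_map.json"))  # passed through to the converter as ner_map
```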
diff --git a/spacy/cli/converters/conllu2json.py b/spacy/cli/converters/conllu2json.py
index dc68efef4..0699bb5c1 100644
--- a/spacy/cli/converters/conllu2json.py
+++ b/spacy/cli/converters/conllu2json.py
@@ -7,7 +7,8 @@ from spacy.gold import Example
from ...gold import iob_to_biluo
-def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None, **_):
+def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None,
+ ner_map=None, **_):
"""
Convert conllu files into JSON format for use with train cli.
use_morphology parameter enables appending morphology to tags, which is
@@ -33,7 +34,8 @@ def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None, **_):
checked_for_ner = True
raw += example.text
sentences.append(generate_sentence(example.token_annotation,
- has_ner_tags, MISC_NER_PATTERN))
+ has_ner_tags, MISC_NER_PATTERN,
+ ner_map=ner_map))
# Real-sized documents could be extracted using the comments on the
# conllu document
if len(sentences) % n_sents == 0:
@@ -111,8 +113,12 @@ def read_conllx(input_data, use_morphology=False, n=0):
break
-def simplify_tags(iob, tag_pattern):
+def extract_tags(iob, tag_pattern, ner_map=None):
"""
+ Extract tags from the MISC column according to `tag_pattern` and map them
+ to the final entity type with `ner_map` if a mapping is present.
+
+ For NorNE:
Simplify tags obtained from the dataset in order to follow Wikipedia
scheme (PER, LOC, ORG, MISC). 'PER', 'LOC' and 'ORG' keep their tags, while
'GPE_LOC' is simplified to 'LOC', 'GPE_ORG' to 'ORG' and all remaining tags to
@@ -126,22 +132,24 @@ def simplify_tags(iob, tag_pattern):
prefix = tag_match.group(2)
suffix = tag_match.group(3)
if prefix and suffix:
- if suffix == "GPE_LOC":
- suffix = "LOC"
- elif suffix == "GPE_ORG":
- suffix = "ORG"
- elif suffix != "PER" and suffix != "LOC" and suffix != "ORG":
- suffix = "MISC"
new_tag = prefix + "-" + suffix
+ if ner_map:
+ suffix = ner_map.get(suffix, suffix)
+ if suffix == "":
+ new_tag = "O"
+ else:
+ new_tag = prefix + "-" + suffix
new_iob.append(new_tag)
return new_iob
-def generate_sentence(token_annotation, has_ner_tags, tag_pattern):
+def generate_sentence(token_annotation, has_ner_tags, tag_pattern,
+ ner_map=None):
sentence = {}
tokens = []
if has_ner_tags:
- iob = simplify_tags(token_annotation.entities, tag_pattern)
+ iob = extract_tags(token_annotation.entities, tag_pattern,
+ ner_map=ner_map)
biluo = iob_to_biluo(iob)
for i, id in enumerate(token_annotation.ids):
token = {}
diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py
index 2ce76b9ba..3b75e760a 100644
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@@ -9,7 +9,7 @@ from spacy.cli.pretrain import make_docs
def test_cli_converters_conllu2json():
- # https://raw.githubusercontent.com/ohenrik/nb_news_ud_sm/master/original_data/no-ud-dev-ner.conllu
+ # from NorNE: https://github.com/ltgoslo/norne/blob/3d23274965f513f23aa48455b28b1878dad23c05/ud/nob/no_bokmaal-ud-dev.conllu
lines = [
"1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tO",
"2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tB-PER",
@@ -32,17 +32,16 @@ def test_cli_converters_conllu2json():
assert [t["ner"] for t in tokens] == ["O", "B-PER", "L-PER", "O"]
-def test_cli_converters_conllu2json():
- # https://raw.githubusercontent.com/ohenrik/nb_news_ud_sm/master/original_data/no-ud-dev-ner.conllu
+def test_cli_converters_conllu2json_name_ner_map():
lines = [
"1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tname=O",
"2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tSpaceAfter=No|name=B-PER",
"3\tEilertsen\tEilertsen\tPROPN\t_\t_\t2\tname\t_\tname=I-PER",
"4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tSpaceAfter=No|name=O",
- "5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=O",
+ "5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=B-BAD",
]
input_data = "\n".join(lines)
- converted = conllu2json(input_data, n_sents=1)
+ converted = conllu2json(input_data, n_sents=1, ner_map={"PER": "PERSON", "BAD": ""})
assert len(converted) == 1
assert converted[0]["id"] == 0
assert len(converted[0]["paragraphs"]) == 1
@@ -55,7 +54,7 @@ def test_cli_converters_conllu2json():
assert [t["tag"] for t in tokens] == ["NOUN", "PROPN", "PROPN", "VERB", "PUNCT"]
assert [t["head"] for t in tokens] == [1, 2, -1, 0, -1]
assert [t["dep"] for t in tokens] == ["appos", "nsubj", "name", "ROOT", "punct"]
- assert [t["ner"] for t in tokens] == ["O", "B-PER", "L-PER", "O", "O"]
+ assert [t["ner"] for t in tokens] == ["O", "B-PERSON", "L-PERSON", "O", "O"]
def test_cli_converters_iob2json():
From a4cacd3402848299444c477cb2f1a292425b29af Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Fri, 13 Dec 2019 10:46:18 +0100
Subject: [PATCH 032/496] Add tag_map argument to CLI debug-data and train
(#4750)
Add an argument for a path to a JSON-formatted tag map, which is used to
update and extend the default language tag map.
---
spacy/cli/debug_data.py | 10 +++++++++-
spacy/cli/train.py | 8 ++++++++
2 files changed, 17 insertions(+), 1 deletion(-)
diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index ed19703ac..c2af5bff0 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -26,6 +26,7 @@ BLANK_MODEL_THRESHOLD = 2000
lang=("model language", "positional", None, str),
train_path=("location of JSON-formatted training data", "positional", None, Path),
dev_path=("location of JSON-formatted development data", "positional", None, Path),
+ tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path),
base_model=("name of model to update (optional)", "option", "b", str),
pipeline=(
"Comma-separated names of pipeline components to train",
@@ -41,6 +42,7 @@ def debug_data(
lang,
train_path,
dev_path,
+ tag_map_path=None,
base_model=None,
pipeline="tagger,parser,ner",
ignore_warnings=False,
@@ -60,6 +62,10 @@ def debug_data(
if not dev_path.exists():
msg.fail("Development data not found", dev_path, exits=1)
+ tag_map = {}
+ if tag_map_path is not None:
+ tag_map = srsly.read_json(tag_map_path)
+
# Initialize the model and pipeline
pipeline = [p.strip() for p in pipeline.split(",")]
if base_model:
@@ -67,6 +73,8 @@ def debug_data(
else:
lang_cls = get_lang_class(lang)
nlp = lang_cls()
+ # Update tag map with provided mapping
+ nlp.vocab.morphology.tag_map.update(tag_map)
msg.divider("Data format validation")
@@ -329,7 +337,7 @@ def debug_data(
if "tagger" in pipeline:
msg.divider("Part-of-speech Tagging")
labels = [label for label in gold_train_data["tags"]]
- tag_map = nlp.Defaults.tag_map
+ tag_map = nlp.vocab.morphology.tag_map
msg.info(
"{} {} in data ({} {} in tag map)".format(
len(labels),
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 8d37254a5..cdcbed0b3 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -48,6 +48,7 @@ from .. import about
textcat_multilabel=("Textcat classes aren't mutually exclusive (multilabel)", "flag", "TML", bool),
textcat_arch=("Textcat model architecture", "option", "ta", str),
textcat_positive_label=("Textcat positive label for binary classes with two labels", "option", "tpl", str),
+ tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path),
verbose=("Display more information for debug", "flag", "VV", bool),
debug=("Run data diagnostics before training", "flag", "D", bool),
# fmt: on
@@ -78,6 +79,7 @@ def train(
textcat_multilabel=False,
textcat_arch="bow",
textcat_positive_label=None,
+ tag_map_path=None,
verbose=False,
debug=False,
):
@@ -118,6 +120,9 @@ def train(
if not output_path.exists():
output_path.mkdir()
+ tag_map = {}
+ if tag_map_path is not None:
+ tag_map = srsly.read_json(tag_map_path)
# Take dropout and batch size as generators of values -- dropout
# starts high and decays sharply, to force the optimizer to explore.
# Batch size starts at 1 and grows, so that we make updates quickly
@@ -209,6 +214,9 @@ def train(
pipe_cfg = {}
nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))
+ # Update tag map with provided mapping
+ nlp.vocab.morphology.tag_map.update(tag_map)
+
if vectors:
msg.text("Loading vector from model '{}'".format(vectors))
_load_vectors(nlp, vectors)
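A sketch (not part of the patch) of what such a JSON tag map might contain and of the update the CLI performs. The specific tag entries and feature keys are an assumption; only the `tag_map.update()` call is taken from the diff:

```python
import srsly
import spacy

# Hypothetical tag map entries: fine-grained tag -> coarse POS plus features.
tag_map = {"NNP": {"pos": "PROPN"},
           "VBZ": {"pos": "VERB", "VerbForm": "fin", "Tense": "pres"}}
srsly.write_json("tag_map.json", tag_map)

nlp = spacy.blank("en")
nlp.vocab.morphology.tag_map.update(srsly.read_json("tag_map.json"))
```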
From d17e7dca9ee36a33c6d23bc8786b3c76ca9dc061 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Sat, 21 Dec 2019 19:57:41 +0100
Subject: [PATCH 033/496] Fix problems caused by merge conflict
---
spacy/scorer.py | 2 +-
spacy/tokenizer.pyx | 4 ++--
2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/spacy/scorer.py b/spacy/scorer.py
index 7fee4865a..6238b6ead 100644
--- a/spacy/scorer.py
+++ b/spacy/scorer.py
@@ -313,7 +313,7 @@ class Scorer(object):
cand_deps_per_dep[token.dep_.lower()].add(
(gold_i, gold_head, token.dep_.lower())
)
- if "-" not in [token[-1] for token in gold.orig_annot]:
+ if "-" not in [token[-1] for token in orig.entities]:
# Find all NER labels in gold and doc
ent_labels = set([x[0] for x in gold_ents] + [k.label_ for k in doc.ents])
# Set up all labels for per type scoring and prepare gold per type
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index c1ac3dd06..f0120c708 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -116,10 +116,10 @@ cdef class Tokenizer:
def __set__(self, rules):
self._rules = {}
self._reset_cache([key for key in self._cache])
- self._reset_specials()
+ self._flush_specials()
self._cache = PreshMap()
self._specials = PreshMap()
- self._load_special_tokenization(rules)
+ self._load_special_cases(rules)
def __reduce__(self):
args = (self.vocab,
From 21b6d6e0a8287e425b3fffd08309596d5dd1e6ca Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Sat, 21 Dec 2019 21:17:31 +0100
Subject: [PATCH 034/496] Fix typo
---
spacy/tests/regression/test_issue4674.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/spacy/tests/regression/test_issue4674.py b/spacy/tests/regression/test_issue4674.py
index 8d0c32eaa..8fa4f9259 100644
--- a/spacy/tests/regression/test_issue4674.py
+++ b/spacy/tests/regression/test_issue4674.py
@@ -6,7 +6,7 @@ from spacy.kb import KnowledgeBase
from spacy.util import ensure_path
from spacy.lang.en import English
-from ..tests.util import make_tempdir
+from ..util import make_tempdir
def test_issue4674():
From db55577c452cbb0e9c984dcc2bce7ecaf99ad3c8 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Sun, 22 Dec 2019 01:53:56 +0100
Subject: [PATCH 035/496] Drop Python 2.7 and 3.5 (#4828)
* Remove unicode declarations
* Remove Python 3.5 and 2.7 from CI
* Don't require pathlib
* Replace compat helpers
* Remove OrderedDict
* Use f-strings
* Set Cython compiler language level
* Fix typo
* Re-add OrderedDict for Table
* Update setup.cfg
* Revert CONTRIBUTING.md
* Revert lookups.md
* Revert top-level.md
* Small adjustments and docs [ci skip]
---
.travis.yml | 23 --
CONTRIBUTING.md | 18 +-
README.md | 7 +-
azure-pipelines.yml | 6 -
bin/cythonize.py | 6 +-
fabfile.py | 3 -
requirements.txt | 1 -
setup.cfg | 6 +-
setup.py | 1 -
spacy/__init__.py | 2 -
spacy/__main__.py | 10 +-
spacy/_ml.py | 3 -
spacy/analysis.py | 19 +-
spacy/attrs.pyx | 3 -
spacy/cli/_schemas.py | 3 -
spacy/cli/convert.py | 27 +-
spacy/cli/converters/conll_ner2json.py | 13 +-
spacy/cli/converters/conllu2json.py | 3 -
spacy/cli/converters/iob2json.py | 3 -
spacy/cli/converters/jsonl2json.py | 3 -
spacy/cli/debug_data.py | 283 ++++++------------
spacy/cli/download.py | 29 +-
spacy/cli/evaluate.py | 5 +-
spacy/cli/info.py | 20 +-
spacy/cli/init_model.py | 10 +-
spacy/cli/link.py | 25 +-
spacy/cli/package.py | 21 +-
spacy/cli/pretrain.py | 15 +-
spacy/cli/profile.py | 11 +-
spacy/cli/train.py | 83 +++--
spacy/cli/validate.py | 26 +-
spacy/compat.py | 102 +------
spacy/displacy/__init__.py | 14 +-
spacy/displacy/render.py | 5 +-
spacy/displacy/templates.py | 3 -
spacy/errors.py | 14 +-
spacy/glossary.py | 3 -
spacy/gold.pyx | 18 +-
spacy/kb.pyx | 13 +-
spacy/lang/af/__init__.py | 3 -
spacy/lang/af/stop_words.py | 3 -
spacy/lang/ar/__init__.py | 3 -
spacy/lang/ar/examples.py | 3 -
spacy/lang/ar/lex_attrs.py | 2 -
spacy/lang/ar/punctuation.py | 3 -
spacy/lang/ar/stop_words.py | 3 -
spacy/lang/ar/tokenizer_exceptions.py | 3 -
spacy/lang/bg/__init__.py | 3 -
spacy/lang/bg/examples.py | 3 -
spacy/lang/bg/stop_words.py | 3 -
spacy/lang/bn/__init__.py | 3 -
spacy/lang/bn/examples.py | 3 -
spacy/lang/bn/morph_rules.py | 3 -
spacy/lang/bn/punctuation.py | 3 -
spacy/lang/bn/stop_words.py | 3 -
spacy/lang/bn/tag_map.py | 3 -
spacy/lang/bn/tokenizer_exceptions.py | 3 -
spacy/lang/ca/__init__.py | 3 -
spacy/lang/ca/examples.py | 3 -
spacy/lang/ca/lex_attrs.py | 3 -
spacy/lang/ca/punctuation.py | 3 -
spacy/lang/ca/stop_words.py | 4 -
spacy/lang/ca/tag_map.py | 3 -
spacy/lang/ca/tokenizer_exceptions.py | 3 -
spacy/lang/char_classes.py | 3 -
spacy/lang/cs/__init__.py | 3 -
spacy/lang/cs/stop_words.py | 3 -
spacy/lang/da/__init__.py | 3 -
spacy/lang/da/examples.py | 3 -
spacy/lang/da/lex_attrs.py | 3 -
spacy/lang/da/morph_rules.py | 3 -
spacy/lang/da/norm_exceptions.py | 3 -
spacy/lang/da/punctuation.py | 3 -
spacy/lang/da/stop_words.py | 3 -
spacy/lang/da/tokenizer_exceptions.py | 4 -
spacy/lang/de/__init__.py | 3 -
spacy/lang/de/examples.py | 3 -
spacy/lang/de/norm_exceptions.py | 3 -
spacy/lang/de/punctuation.py | 3 -
spacy/lang/de/stop_words.py | 3 -
spacy/lang/de/syntax_iterators.py | 3 -
spacy/lang/de/tag_map.py | 3 -
spacy/lang/de/tokenizer_exceptions.py | 3 -
spacy/lang/el/__init__.py | 4 -
spacy/lang/el/examples.py | 4 -
spacy/lang/el/get_pos_from_wiktionary.py | 3 -
spacy/lang/el/lemmatizer.py | 3 -
spacy/lang/el/lex_attrs.py | 4 -
spacy/lang/el/norm_exceptions.py | 3 -
spacy/lang/el/punctuation.py | 4 -
spacy/lang/el/stop_words.py | 3 -
spacy/lang/el/syntax_iterators.py | 3 -
spacy/lang/el/tag_map.py | 3 -
spacy/lang/el/tag_map_general.py | 3 -
spacy/lang/el/tokenizer_exceptions.py | 3 -
spacy/lang/en/__init__.py | 3 -
spacy/lang/en/examples.py | 3 -
spacy/lang/en/lex_attrs.py | 3 -
spacy/lang/en/morph_rules.py | 3 -
spacy/lang/en/norm_exceptions.py | 3 -
spacy/lang/en/stop_words.py | 3 -
spacy/lang/en/syntax_iterators.py | 3 -
spacy/lang/en/tag_map.py | 3 -
spacy/lang/en/tokenizer_exceptions.py | 3 -
spacy/lang/es/__init__.py | 3 -
spacy/lang/es/examples.py | 3 -
spacy/lang/es/lex_attrs.py | 3 -
spacy/lang/es/stop_words.py | 3 -
spacy/lang/es/syntax_iterators.py | 3 -
spacy/lang/es/tag_map.py | 3 -
spacy/lang/es/tokenizer_exceptions.py | 3 -
spacy/lang/et/__init__.py | 3 -
spacy/lang/et/stop_words.py | 3 -
spacy/lang/fa/__init__.py | 3 -
spacy/lang/fa/examples.py | 3 -
spacy/lang/fa/generate_verbs_exc.py | 3 -
spacy/lang/fa/lex_attrs.py | 2 -
spacy/lang/fa/punctuation.py | 3 -
spacy/lang/fa/stop_words.py | 3 -
spacy/lang/fa/syntax_iterators.py | 3 -
spacy/lang/fa/tag_map.py | 3 -
spacy/lang/fa/tokenizer_exceptions.py | 3 -
spacy/lang/fi/__init__.py | 3 -
spacy/lang/fi/examples.py | 3 -
spacy/lang/fi/lex_attrs.py | 3 -
spacy/lang/fi/punctuation.py | 3 -
spacy/lang/fi/stop_words.py | 3 -
spacy/lang/fi/tokenizer_exceptions.py | 3 -
spacy/lang/fr/__init__.py | 3 -
spacy/lang/fr/_tokenizer_exceptions_list.py | 3 -
spacy/lang/fr/examples.py | 3 -
spacy/lang/fr/lemmatizer.py | 3 -
spacy/lang/fr/lex_attrs.py | 3 -
spacy/lang/fr/punctuation.py | 3 -
spacy/lang/fr/stop_words.py | 3 -
spacy/lang/fr/syntax_iterators.py | 3 -
spacy/lang/fr/tag_map.py | 3 -
spacy/lang/fr/tokenizer_exceptions.py | 7 +-
spacy/lang/ga/__init__.py | 3 -
spacy/lang/ga/irish_morphology_helpers.py | 3 -
spacy/lang/ga/stop_words.py | 4 -
spacy/lang/ga/tag_map.py | 3 -
spacy/lang/ga/tokenizer_exceptions.py | 3 -
spacy/lang/he/__init__.py | 3 -
spacy/lang/he/examples.py | 3 -
spacy/lang/he/stop_words.py | 4 -
spacy/lang/hi/__init__.py | 3 -
spacy/lang/hi/examples.py | 3 -
spacy/lang/hi/lex_attrs.py | 3 -
spacy/lang/hi/stop_words.py | 3 -
spacy/lang/hr/__init__.py | 3 -
spacy/lang/hr/examples.py | 3 -
spacy/lang/hr/stop_words.py | 4 -
spacy/lang/hu/__init__.py | 3 -
spacy/lang/hu/examples.py | 3 -
spacy/lang/hu/punctuation.py | 3 -
spacy/lang/hu/stop_words.py | 3 -
spacy/lang/hu/tokenizer_exceptions.py | 3 -
spacy/lang/id/__init__.py | 3 -
spacy/lang/id/_tokenizer_exceptions_list.py | 3 -
spacy/lang/id/examples.py | 3 -
spacy/lang/id/lex_attrs.py | 3 -
spacy/lang/id/norm_exceptions.py | 3 -
spacy/lang/id/punctuation.py | 3 -
spacy/lang/id/stop_words.py | 3 -
spacy/lang/id/syntax_iterators.py | 3 -
spacy/lang/id/tag_map.py | 3 -
spacy/lang/id/tokenizer_exceptions.py | 3 -
spacy/lang/is/__init__.py | 3 -
spacy/lang/is/stop_words.py | 3 -
spacy/lang/it/__init__.py | 3 -
spacy/lang/it/examples.py | 3 -
spacy/lang/it/punctuation.py | 3 -
spacy/lang/it/stop_words.py | 3 -
spacy/lang/it/tag_map.py | 3 -
spacy/lang/it/tokenizer_exceptions.py | 2 -
spacy/lang/ja/__init__.py | 3 -
spacy/lang/ja/examples.py | 3 -
spacy/lang/ja/stop_words.py | 3 -
spacy/lang/ja/tag_map.py | 3 -
spacy/lang/kn/__init__.py | 3 -
spacy/lang/kn/stop_words.py | 3 -
spacy/lang/ko/__init__.py | 3 -
spacy/lang/ko/examples.py | 3 -
spacy/lang/ko/lex_attrs.py | 3 -
spacy/lang/ko/stop_words.py | 3 -
spacy/lang/ko/tag_map.py | 3 -
spacy/lang/lb/__init__.py | 3 -
spacy/lang/lb/examples.py | 3 -
spacy/lang/lb/lex_attrs.py | 3 -
spacy/lang/lb/norm_exceptions.py | 3 -
spacy/lang/lb/punctuation.py | 3 -
spacy/lang/lb/stop_words.py | 3 -
spacy/lang/lb/tag_map.py | 3 -
spacy/lang/lb/tokenizer_exceptions.py | 3 -
spacy/lang/lex_attrs.py | 3 -
spacy/lang/lt/__init__.py | 3 -
spacy/lang/lt/examples.py | 3 -
spacy/lang/lt/lex_attrs.py | 3 -
spacy/lang/lt/morph_rules.py | 3 -
spacy/lang/lt/stop_words.py | 3 -
spacy/lang/lt/tag_map.py | 3 -
spacy/lang/lt/tokenizer_exceptions.py | 3 -
spacy/lang/lv/__init__.py | 3 -
spacy/lang/lv/stop_words.py | 3 -
spacy/lang/mr/__init__.py | 3 -
spacy/lang/mr/stop_words.py | 3 -
spacy/lang/nb/__init__.py | 3 -
spacy/lang/nb/examples.py | 3 -
spacy/lang/nb/morph_rules.py | 3 -
spacy/lang/nb/punctuation.py | 3 -
spacy/lang/nb/stop_words.py | 4 -
spacy/lang/nb/syntax_iterators.py | 3 -
spacy/lang/nb/tag_map.py | 3 -
spacy/lang/nb/tokenizer_exceptions.py | 3 -
spacy/lang/nl/__init__.py | 3 -
spacy/lang/nl/examples.py | 3 -
spacy/lang/nl/lemmatizer.py | 3 -
spacy/lang/nl/lex_attrs.py | 3 -
spacy/lang/nl/punctuation.py | 3 -
spacy/lang/nl/stop_words.py | 3 -
spacy/lang/nl/tag_map.py | 3 -
spacy/lang/nl/tokenizer_exceptions.py | 3 -
spacy/lang/norm_exceptions.py | 3 -
spacy/lang/pl/__init__.py | 3 -
spacy/lang/pl/_tokenizer_exceptions_list.py | 4 -
spacy/lang/pl/examples.py | 3 -
spacy/lang/pl/lex_attrs.py | 3 -
spacy/lang/pl/punctuation.py | 3 -
spacy/lang/pl/stop_words.py | 4 -
spacy/lang/pl/tag_map.py | 3 -
spacy/lang/pl/tokenizer_exceptions.py | 3 -
spacy/lang/pt/__init__.py | 3 -
spacy/lang/pt/examples.py | 3 -
spacy/lang/pt/lex_attrs.py | 3 -
spacy/lang/pt/norm_exceptions.py | 3 -
spacy/lang/pt/punctuation.py | 3 -
spacy/lang/pt/stop_words.py | 3 -
spacy/lang/pt/tag_map.py | 3 -
spacy/lang/pt/tokenizer_exceptions.py | 3 -
spacy/lang/punctuation.py | 3 -
spacy/lang/ro/__init__.py | 3 -
spacy/lang/ro/examples.py | 3 -
spacy/lang/ro/lex_attrs.py | 3 -
spacy/lang/ro/stop_words.py | 4 -
spacy/lang/ro/tag_map.py | 2 -
spacy/lang/ro/tokenizer_exceptions.py | 3 -
spacy/lang/ru/__init__.py | 3 -
spacy/lang/ru/examples.py | 3 -
spacy/lang/ru/lemmatizer.py | 6 +-
spacy/lang/ru/lex_attrs.py | 3 -
spacy/lang/ru/norm_exceptions.py | 3 -
spacy/lang/ru/stop_words.py | 4 -
spacy/lang/ru/tag_map.py | 3 -
spacy/lang/ru/tokenizer_exceptions.py | 3 -
spacy/lang/si/__init__.py | 3 -
spacy/lang/si/examples.py | 3 -
spacy/lang/si/lex_attrs.py | 3 -
spacy/lang/si/stop_words.py | 3 -
spacy/lang/sk/__init__.py | 3 -
spacy/lang/sk/stop_words.py | 3 -
spacy/lang/sl/__init__.py | 3 -
spacy/lang/sl/stop_words.py | 3 -
spacy/lang/sq/__init__.py | 3 -
spacy/lang/sq/examples.py | 3 -
spacy/lang/sq/stop_words.py | 3 -
spacy/lang/sr/__init__.py | 3 -
spacy/lang/sr/examples.py | 3 -
spacy/lang/sr/lex_attrs.py | 3 -
spacy/lang/sr/norm_exceptions.py | 3 -
spacy/lang/sr/stop_words.py | 3 -
spacy/lang/sr/tokenizer_exceptions.py | 3 -
spacy/lang/sv/__init__.py | 3 -
spacy/lang/sv/examples.py | 3 -
spacy/lang/sv/morph_rules.py | 3 -
spacy/lang/sv/stop_words.py | 3 -
spacy/lang/sv/syntax_iterators.py | 3 -
spacy/lang/sv/tag_map.py | 3 -
spacy/lang/sv/tokenizer_exceptions.py | 3 -
spacy/lang/ta/__init__.py | 3 -
spacy/lang/ta/examples.py | 3 -
spacy/lang/ta/lex_attrs.py | 3 -
spacy/lang/ta/norm_exceptions.py | 3 -
spacy/lang/ta/stop_words.py | 3 -
spacy/lang/tag_map.py | 3 -
spacy/lang/te/__init__.py | 3 -
spacy/lang/te/examples.py | 3 -
spacy/lang/te/lex_attrs.py | 3 -
spacy/lang/te/stop_words.py | 3 -
spacy/lang/th/__init__.py | 3 -
spacy/lang/th/lex_attrs.py | 3 -
spacy/lang/th/norm_exceptions.py | 3 -
spacy/lang/th/tag_map.py | 3 -
spacy/lang/th/tokenizer_exceptions.py | 3 -
spacy/lang/tl/__init__.py | 3 -
spacy/lang/tl/lex_attrs.py | 3 -
spacy/lang/tl/stop_words.py | 3 -
spacy/lang/tl/tokenizer_exceptions.py | 3 -
spacy/lang/tokenizer_exceptions.py | 3 -
spacy/lang/tr/__init__.py | 3 -
spacy/lang/tr/examples.py | 3 -
spacy/lang/tr/lex_attrs.py | 3 -
spacy/lang/tr/stop_words.py | 4 -
spacy/lang/tr/tokenizer_exceptions.py | 3 -
spacy/lang/tt/__init__.py | 3 -
spacy/lang/tt/examples.py | 3 -
spacy/lang/tt/lex_attrs.py | 3 -
spacy/lang/tt/punctuation.py | 3 -
spacy/lang/tt/stop_words.py | 3 -
spacy/lang/tt/tokenizer_exceptions.py | 3 -
spacy/lang/uk/__init__.py | 3 -
spacy/lang/uk/examples.py | 3 -
spacy/lang/uk/lemmatizer.py | 1 -
spacy/lang/uk/lex_attrs.py | 3 -
spacy/lang/uk/stop_words.py | 4 -
spacy/lang/uk/tag_map.py | 3 -
spacy/lang/uk/tokenizer_exceptions.py | 3 -
spacy/lang/ur/__init__.py | 3 -
spacy/lang/ur/examples.py | 3 -
spacy/lang/ur/lex_attrs.py | 3 -
spacy/lang/ur/punctuation.py | 3 -
spacy/lang/ur/stop_words.py | 3 -
spacy/lang/ur/tag_map.py | 3 -
spacy/lang/vi/__init__.py | 3 -
spacy/lang/vi/lex_attrs.py | 3 -
spacy/lang/vi/stop_words.py | 3 -
spacy/lang/vi/tag_map.py | 3 -
spacy/lang/xx/__init__.py | 3 -
spacy/lang/xx/examples.py | 3 -
spacy/lang/yo/__init__.py | 3 -
spacy/lang/yo/examples.py | 3 -
spacy/lang/yo/lex_attrs.py | 3 -
spacy/lang/yo/stop_words.py | 3 -
spacy/lang/zh/__init__.py | 3 -
spacy/lang/zh/examples.py | 3 -
spacy/lang/zh/lex_attrs.py | 4 +-
spacy/lang/zh/stop_words.py | 4 -
spacy/lang/zh/tag_map.py | 3 -
spacy/language.py | 38 +--
spacy/lemmatizer.py | 7 +-
spacy/lexeme.pyx | 3 -
spacy/lookups.py | 21 +-
spacy/matcher/__init__.py | 3 -
spacy/matcher/_schemas.py | 3 -
spacy/matcher/dependencymatcher.pyx | 2 -
spacy/matcher/matcher.pyx | 2 -
spacy/matcher/phrasematcher.pyx | 2 -
spacy/ml/__init__.py | 3 -
spacy/ml/_legacy_tok2vec.py | 2 -
spacy/ml/_wire.py | 1 -
spacy/ml/common.py | 2 -
spacy/ml/tok2vec.py | 2 -
spacy/morphology.pyx | 6 +-
spacy/parts_of_speech.pyx | 3 -
spacy/pipeline/__init__.py | 3 -
spacy/pipeline/entityruler.py | 27 +-
spacy/pipeline/functions.py | 3 -
spacy/pipeline/hooks.py | 3 -
spacy/pipeline/morphologizer.pyx | 6 +-
spacy/pipeline/pipes.pyx | 101 +++----
spacy/scorer.py | 3 -
spacy/strings.pyx | 8 +-
spacy/symbols.pyx | 6 +-
spacy/syntax/_parser_model.pyx | 8 +-
spacy/syntax/arc_eager.pyx | 7 +-
spacy/syntax/ner.pyx | 5 +-
spacy/syntax/nn_parser.pyx | 28 +-
spacy/syntax/nonproj.pyx | 3 -
spacy/syntax/stateclass.pyx | 3 -
spacy/syntax/transition_system.pyx | 5 +-
spacy/tests/conftest.py | 3 -
spacy/tests/doc/test_add_entities.py | 3 -
spacy/tests/doc/test_array.py | 3 -
spacy/tests/doc/test_creation.py | 3 -
spacy/tests/doc/test_doc_api.py | 3 -
spacy/tests/doc/test_morphanalysis.py | 3 -
spacy/tests/doc/test_pickle_doc.py | 9 +-
spacy/tests/doc/test_retokenize_merge.py | 3 -
spacy/tests/doc/test_retokenize_split.py | 3 -
spacy/tests/doc/test_span.py | 3 -
spacy/tests/doc/test_to_json.py | 3 -
spacy/tests/doc/test_token_api.py | 3 -
spacy/tests/doc/test_underscore.py | 3 -
spacy/tests/lang/ar/test_exceptions.py | 3 -
spacy/tests/lang/ar/test_text.py | 3 -
spacy/tests/lang/bn/test_tokenizer.py | 3 -
spacy/tests/lang/ca/test_exception.py | 4 -
.../tests/lang/ca/test_prefix_suffix_infix.py | 3 -
spacy/tests/lang/ca/test_text.py | 6 -
spacy/tests/lang/da/test_exceptions.py | 3 -
.../tests/lang/da/test_prefix_suffix_infix.py | 3 -
spacy/tests/lang/da/test_text.py | 3 -
spacy/tests/lang/de/test_exceptions.py | 3 -
spacy/tests/lang/de/test_parser.py | 3 -
.../tests/lang/de/test_prefix_suffix_infix.py | 3 -
spacy/tests/lang/de/test_text.py | 3 -
spacy/tests/lang/el/test_exception.py | 3 -
spacy/tests/lang/el/test_text.py | 3 -
.../lang/en/test_customized_tokenizer.py | 3 -
spacy/tests/lang/en/test_exceptions.py | 3 -
spacy/tests/lang/en/test_indices.py | 3 -
spacy/tests/lang/en/test_noun_chunks.py | 3 -
spacy/tests/lang/en/test_parser.py | 3 -
.../tests/lang/en/test_prefix_suffix_infix.py | 3 -
spacy/tests/lang/en/test_punct.py | 3 -
spacy/tests/lang/en/test_sbd.py | 3 -
spacy/tests/lang/en/test_tagger.py | 3 -
spacy/tests/lang/en/test_text.py | 3 -
spacy/tests/lang/es/test_exception.py | 3 -
spacy/tests/lang/es/test_text.py | 3 -
spacy/tests/lang/fi/test_text.py | 3 -
spacy/tests/lang/fi/test_tokenizer.py | 3 -
spacy/tests/lang/fr/test_exceptions.py | 3 -
.../tests/lang/fr/test_prefix_suffix_infix.py | 3 -
spacy/tests/lang/fr/test_text.py | 3 -
spacy/tests/lang/ga/test_tokenizer.py | 3 -
spacy/tests/lang/he/test_tokenizer.py | 3 -
spacy/tests/lang/hu/test_tokenizer.py | 3 -
.../tests/lang/id/test_prefix_suffix_infix.py | 3 -
spacy/tests/lang/id/test_text.py | 3 -
.../tests/lang/it/test_prefix_suffix_infix.py | 3 -
spacy/tests/lang/ja/test_lemmatization.py | 3 -
spacy/tests/lang/ja/test_tokenizer.py | 3 -
spacy/tests/lang/ko/test_lemmatization.py | 3 -
spacy/tests/lang/ko/test_tokenizer.py | 3 -
spacy/tests/lang/lb/test_exceptions.py | 3 -
.../tests/lang/lb/test_prefix_suffix_infix.py | 3 -
spacy/tests/lang/lb/test_text.py | 3 -
spacy/tests/lang/lt/test_text.py | 3 -
spacy/tests/lang/nb/test_tokenizer.py | 3 -
spacy/tests/lang/nl/test_text.py | 3 -
spacy/tests/lang/pl/test_text.py | 5 -
spacy/tests/lang/pl/test_tokenizer.py | 3 -
spacy/tests/lang/pt/test_text.py | 3 -
spacy/tests/lang/ro/test_tokenizer.py | 3 -
spacy/tests/lang/ru/test_exceptions.py | 3 -
spacy/tests/lang/ru/test_lemmatizer.py | 3 -
spacy/tests/lang/ru/test_text.py | 3 -
spacy/tests/lang/ru/test_tokenizer.py | 3 -
spacy/tests/lang/sr/test_exceptions.py | 3 -
spacy/tests/lang/sr/test_tokenizer.py | 3 -
spacy/tests/lang/sv/test_exceptions.py | 3 -
spacy/tests/lang/sv/test_noun_chunks.py | 3 -
.../tests/lang/sv/test_prefix_suffix_infix.py | 3 -
spacy/tests/lang/sv/test_text.py | 3 -
spacy/tests/lang/sv/test_tokenizer.py | 3 -
spacy/tests/lang/test_attrs.py | 3 -
spacy/tests/lang/test_initialize.py | 3 -
spacy/tests/lang/th/test_tokenizer.py | 3 -
spacy/tests/lang/tt/test_tokenizer.py | 3 -
spacy/tests/lang/uk/test_tokenizer.py | 3 -
spacy/tests/lang/uk/test_tokenizer_exc.py | 3 -
.../tests/lang/ur/test_prefix_suffix_infix.py | 3 -
spacy/tests/lang/ur/test_text.py | 3 -
spacy/tests/lang/yo/test_text.py | 3 -
spacy/tests/lang/zh/test_text.py | 3 -
spacy/tests/lang/zh/test_tokenizer.py | 3 -
spacy/tests/matcher/test_matcher_api.py | 3 -
spacy/tests/matcher/test_matcher_logic.py | 3 -
.../tests/matcher/test_pattern_validation.py | 3 -
spacy/tests/matcher/test_phrase_matcher.py | 3 -
spacy/tests/morphology/test_morph_features.py | 3 -
spacy/tests/parser/test_add_label.py | 3 -
spacy/tests/parser/test_arc_eager_oracle.py | 3 -
spacy/tests/parser/test_ner.py | 3 -
spacy/tests/parser/test_neural_parser.py | 3 -
spacy/tests/parser/test_nn_beam.py | 3 -
spacy/tests/parser/test_nonproj.py | 3 -
spacy/tests/parser/test_parse.py | 3 -
spacy/tests/parser/test_parse_navigate.py | 3 -
spacy/tests/parser/test_preset_sbd.py | 3 -
spacy/tests/parser/test_space_attachment.py | 3 -
spacy/tests/pipeline/test_analysis.py | 20 +-
spacy/tests/pipeline/test_entity_linker.py | 3 -
spacy/tests/pipeline/test_entity_ruler.py | 3 -
spacy/tests/pipeline/test_factories.py | 3 -
spacy/tests/pipeline/test_functions.py | 3 -
spacy/tests/pipeline/test_pipe_methods.py | 3 -
spacy/tests/pipeline/test_sentencizer.py | 3 -
spacy/tests/pipeline/test_tagger.py | 3 -
spacy/tests/pipeline/test_textcat.py | 3 -
spacy/tests/regression/test_issue1-1000.py | 3 -
spacy/tests/regression/test_issue1001-1500.py | 3 -
spacy/tests/regression/test_issue1501-2000.py | 3 -
spacy/tests/regression/test_issue2001-2500.py | 3 -
spacy/tests/regression/test_issue2501-3000.py | 3 -
spacy/tests/regression/test_issue3001-3500.py | 73 +----
spacy/tests/regression/test_issue3521.py | 3 -
spacy/tests/regression/test_issue3526.py | 3 -
spacy/tests/regression/test_issue3531.py | 3 -
spacy/tests/regression/test_issue3540.py | 3 -
spacy/tests/regression/test_issue3549.py | 3 -
spacy/tests/regression/test_issue3555.py | 3 -
spacy/tests/regression/test_issue3611.py | 3 -
spacy/tests/regression/test_issue3625.py | 3 -
spacy/tests/regression/test_issue3803.py | 3 -
spacy/tests/regression/test_issue3839.py | 3 -
spacy/tests/regression/test_issue3869.py | 3 -
spacy/tests/regression/test_issue3879.py | 3 -
spacy/tests/regression/test_issue3880.py | 3 -
spacy/tests/regression/test_issue3882.py | 3 -
spacy/tests/regression/test_issue3951.py | 3 -
spacy/tests/regression/test_issue3959.py | 3 -
spacy/tests/regression/test_issue3962.py | 3 -
spacy/tests/regression/test_issue3972.py | 3 -
spacy/tests/regression/test_issue4002.py | 3 -
spacy/tests/regression/test_issue4030.py | 3 -
spacy/tests/regression/test_issue4042.py | 3 -
spacy/tests/regression/test_issue4054.py | 3 -
spacy/tests/regression/test_issue4120.py | 3 -
spacy/tests/regression/test_issue4133.py | 3 -
spacy/tests/regression/test_issue4190.py | 3 -
spacy/tests/regression/test_issue4267.py | 3 -
spacy/tests/regression/test_issue4272.py | 3 -
spacy/tests/regression/test_issue4278.py | 3 -
spacy/tests/regression/test_issue4313.py | 3 -
spacy/tests/regression/test_issue4348.py | 3 -
spacy/tests/regression/test_issue4367.py | 3 -
spacy/tests/regression/test_issue4373.py | 3 -
spacy/tests/regression/test_issue4402.py | 3 -
spacy/tests/regression/test_issue4528.py | 3 -
spacy/tests/regression/test_issue4529.py | 3 -
spacy/tests/regression/test_issue4590.py | 3 -
spacy/tests/regression/test_issue4651.py | 3 -
spacy/tests/regression/test_issue4674.py | 3 -
spacy/tests/regression/test_issue4707.py | 3 -
spacy/tests/serialize/test_serialize_doc.py | 8 +-
.../test_serialize_extension_attrs.py | 9 +-
spacy/tests/serialize/test_serialize_kb.py | 3 -
.../serialize/test_serialize_language.py | 3 -
.../serialize/test_serialize_pipeline.py | 3 -
.../serialize/test_serialize_tokenizer.py | 3 -
.../serialize/test_serialize_vocab_strings.py | 3 -
spacy/tests/test_architectures.py | 3 -
spacy/tests/test_cli.py | 3 -
spacy/tests/test_displacy.py | 7 +-
spacy/tests/test_gold.py | 62 ++--
spacy/tests/test_json_schemas.py | 3 -
spacy/tests/test_language.py | 8 -
spacy/tests/test_lemmatizer.py | 3 -
spacy/tests/test_misc.py | 9 +-
spacy/tests/test_pickles.py | 3 -
spacy/tests/test_scorer.py | 3 -
spacy/tests/test_tok2vec.py | 6 +-
spacy/tests/tokenizer/test_exceptions.py | 3 -
spacy/tests/tokenizer/test_explain.py | 5 +-
spacy/tests/tokenizer/test_naughty_strings.py | 3 -
spacy/tests/tokenizer/test_tokenizer.py | 3 -
spacy/tests/tokenizer/test_urls.py | 3 -
spacy/tests/tokenizer/test_whitespace.py | 3 -
spacy/tests/util.py | 6 +-
spacy/tests/vocab_vectors/test_lexeme.py | 3 -
spacy/tests/vocab_vectors/test_lookups.py | 3 -
spacy/tests/vocab_vectors/test_similarity.py | 3 -
spacy/tests/vocab_vectors/test_stringstore.py | 3 -
spacy/tests/vocab_vectors/test_vectors.py | 3 -
spacy/tests/vocab_vectors/test_vocab_api.py | 3 -
spacy/tokenizer.pyx | 44 ++-
spacy/tokens/__init__.py | 3 -
spacy/tokens/_retokenize.pyx | 3 -
spacy/tokens/_serialize.py | 3 -
spacy/tokens/doc.pyx | 14 +-
spacy/tokens/span.pyx | 14 +-
spacy/tokens/token.pyx | 8 +-
spacy/tokens/underscore.py | 3 -
spacy/util.py | 42 ++-
spacy/vectors.pyx | 47 ++-
spacy/vocab.pyx | 43 ++-
website/docs/api/top-level.md | 47 ---
website/docs/usage/index.md | 15 +-
website/docs/usage/processing-pipelines.md | 4 +-
website/docs/usage/spacy-101.md | 13 +-
572 files changed, 526 insertions(+), 2625 deletions(-)
delete mode 100644 .travis.yml
diff --git a/.travis.yml b/.travis.yml
deleted file mode 100644
index e3ce53024..000000000
--- a/.travis.yml
+++ /dev/null
@@ -1,23 +0,0 @@
-language: python
-sudo: false
-cache: pip
-dist: trusty
-group: edge
-python:
- - "2.7"
-os:
- - linux
-install:
- - "pip install -r requirements.txt"
- - "python setup.py build_ext --inplace"
- - "pip install -e ."
-script:
- - "cat /proc/cpuinfo | grep flags | head -n 1"
- - "python -m pytest --tb=native spacy"
-branches:
- except:
- - spacy.io
-notifications:
- slack:
- secure: F8GvqnweSdzImuLL64TpfG0i5rYl89liyr9tmFVsHl4c0DNiDuGhZivUz0M1broS8svE3OPOllLfQbACG/4KxD890qfF9MoHzvRDlp7U+RtwMV/YAkYn8MGWjPIbRbX0HpGdY7O2Rc9Qy4Kk0T8ZgiqXYIqAz2Eva9/9BlSmsJQ=
- email: false
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 3c2b56cd3..6b7881dd2 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -280,23 +280,7 @@ except: # noqa: E722
### Python conventions
-All Python code must be written in an **intersection of Python 2 and Python 3**.
-This is easy in Cython, but somewhat ugly in Python. Logic that deals with
-Python or platform compatibility should only live in
-[`spacy.compat`](spacy/compat.py). To distinguish them from the builtin
-functions, replacement functions are suffixed with an underscore, for example
-`unicode_`. If you need to access the user's version or platform information,
-for example to show more specific error messages, you can use the `is_config()`
-helper function.
-
-```python
-from .compat import unicode_, is_config
-
-compatible_unicode = unicode_('hello world')
-if is_config(windows=True, python2=True):
- print("You are using Python 2 on Windows.")
-```
-
+All Python code must be written to be **compatible with Python 3.6+**.
Code that interacts with the file-system should accept objects that follow the
`pathlib.Path` API, without assuming that the object inherits from `pathlib.Path`.
If the function is user-facing and takes a path as an argument, it should check
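As a minimal sketch of the path-handling convention described in the CONTRIBUTING hunk above (not part of this patch; `load_meta` is a hypothetical helper), a user-facing function would coerce plain strings to `pathlib.Path` but otherwise rely only on the `Path` API:

```python
from pathlib import Path


def load_meta(path):
    # Accept str or any Path-like object; normalize strings to Path
    # instead of requiring a pathlib.Path subclass.
    if isinstance(path, str):
        path = Path(path)
    if not path.exists():
        raise IOError(f"Can't find meta file: {path}")
    return path.read_text(encoding="utf8")
```

spaCy's own `util.ensure_path` helper performs this kind of string-to-`Path` coercion for user-supplied paths.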
diff --git a/README.md b/README.md
index 74d2d2166..500431b9f 100644
--- a/README.md
+++ b/README.md
@@ -15,7 +15,6 @@ It's commercial open-source software, released under the MIT license.
[Check out the release notes here.](https://github.com/explosion/spaCy/releases)
[![Azure Pipelines]()](https://dev.azure.com/explosion-ai/public/_build?definitionId=8)
-[![Travis Build Status]()](https://travis-ci.org/explosion/spaCy)
[![Current Release Version](https://img.shields.io/github/release/explosion/spacy.svg?style=flat-square&logo=github)](https://github.com/explosion/spaCy/releases)
[![pypi Version](https://img.shields.io/pypi/v/spacy.svg?style=flat-square&logo=pypi&logoColor=white)](https://pypi.org/project/spacy/)
[![conda Version](https://img.shields.io/conda/vn/conda-forge/spacy.svg?style=flat-square&logo=conda-forge&logoColor=white)](https://anaconda.org/conda-forge/spacy)
@@ -98,7 +97,7 @@ For detailed installation instructions, see the
- **Operating system**: macOS / OS X · Linux · Windows (Cygwin, MinGW, Visual
Studio)
-- **Python version**: Python 2.7, 3.5+ (only 64 bit)
+- **Python version**: Python 3.6+ (only 64 bit)
- **Package managers**: [pip] · [conda] (via `conda-forge`)
[pip]: https://pypi.org/project/spacy/
@@ -269,9 +268,7 @@ and git preinstalled.
Install a version of the
[Visual C++ Build Tools](https://visualstudio.microsoft.com/visual-cpp-build-tools/)
or [Visual Studio Express](https://visualstudio.microsoft.com/vs/express/) that
-matches the version that was used to compile your Python interpreter. For
-official distributions these are VS 2008 (Python 2.7), VS 2010 (Python 3.4) and
-VS 2015 (Python 3.5).
+matches the version that was used to compile your Python interpreter.
## Run tests
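As a quick aside to the Windows build-tools note in the README hunk above (not part of this patch), you can check which MSVC toolchain your interpreter was built with, and therefore which build tools to install, from Python itself:

```python
import platform
import sys

# The compiler string identifies the MSVC toolchain used to build CPython,
# e.g. "MSC v.1916 64 bit (AMD64)" for the VS 2017 toolset.
print(sys.version)
print(platform.python_compiler())
```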
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 054365336..d34da39f7 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -35,12 +35,6 @@ jobs:
dependsOn: 'Validate'
strategy:
matrix:
- Python35Linux:
- imageName: 'ubuntu-16.04'
- python.version: '3.5'
- Python35Windows:
- imageName: 'vs2017-win2016'
- python.version: '3.5'
Python36Linux:
imageName: 'ubuntu-16.04'
python.version: '3.6'
diff --git a/bin/cythonize.py b/bin/cythonize.py
index 4814f8df0..554252294 100755
--- a/bin/cythonize.py
+++ b/bin/cythonize.py
@@ -38,14 +38,14 @@ import argparse
HASH_FILE = "cythonize.json"
-def process_pyx(fromfile, tofile, language_level="-2"):
+def process_pyx(fromfile, tofile, language_level="-3"):
print("Processing %s" % fromfile)
try:
from Cython.Compiler.Version import version as cython_version
from distutils.version import LooseVersion
- if LooseVersion(cython_version) < LooseVersion("0.19"):
- raise Exception("Require Cython >= 0.19")
+ if LooseVersion(cython_version) < LooseVersion("0.25"):
+ raise Exception("Require Cython >= 0.25")
except ImportError:
pass
diff --git a/fabfile.py b/fabfile.py
index fcab493f5..460471747 100644
--- a/fabfile.py
+++ b/fabfile.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals, print_function
-
import contextlib
from pathlib import Path
from fabric.api import local, lcd, env, settings, prefix
diff --git a/requirements.txt b/requirements.txt
index 1786ee186..188459c67 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,7 +11,6 @@ catalogue>=0.0.7,<1.1.0
numpy>=1.15.0
requests>=2.13.0,<3.0.0
plac>=0.9.6,<1.2.0
-pathlib==1.0.1; python_version < "3.4"
tqdm>=4.38.0,<5.0.0
# Optional dependencies
jsonschema>=2.6.0,<3.1.0
diff --git a/setup.cfg b/setup.cfg
index a0103c5a2..28259c989 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -16,10 +16,7 @@ classifiers =
Operating System :: MacOS :: MacOS X
Operating System :: Microsoft :: Windows
Programming Language :: Cython
- Programming Language :: Python :: 2
- Programming Language :: Python :: 2.7
Programming Language :: Python :: 3
- Programming Language :: Python :: 3.5
Programming Language :: Python :: 3.6
Programming Language :: Python :: 3.7
Programming Language :: Python :: 3.8
@@ -30,7 +27,7 @@ zip_safe = false
include_package_data = true
scripts =
bin/spacy
-python_requires = >=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*
+python_requires = >=3.6
setup_requires =
wheel
cython>=0.25
@@ -54,7 +51,6 @@ install_requires =
numpy>=1.15.0
plac>=0.9.6,<1.2.0
requests>=2.13.0,<3.0.0
- pathlib==1.0.1; python_version < "3.4"
[options.extras_require]
lookups =
diff --git a/setup.py b/setup.py
index 62a09aa73..1afdc7ae4 100755
--- a/setup.py
+++ b/setup.py
@@ -1,5 +1,4 @@
#!/usr/bin/env python
-from __future__ import print_function
import io
import os
import subprocess
diff --git a/spacy/__init__.py b/spacy/__init__.py
index 4a0d16a49..49db0e3b5 100644
--- a/spacy/__init__.py
+++ b/spacy/__init__.py
@@ -1,5 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
import warnings
import sys
diff --git a/spacy/__main__.py b/spacy/__main__.py
index 2c285095e..06ba5704d 100644
--- a/spacy/__main__.py
+++ b/spacy/__main__.py
@@ -1,9 +1,3 @@
-# coding: utf8
-from __future__ import print_function
-
-# NB! This breaks in plac on Python 2!!
-# from __future__ import unicode_literals
-
if __name__ == "__main__":
import plac
import sys
@@ -32,5 +26,5 @@ if __name__ == "__main__":
if command in commands:
plac.call(commands[command], sys.argv[1:])
else:
- available = "Available: {}".format(", ".join(commands))
- msg.fail("Unknown command: {}".format(command), available, exits=1)
+ available = f"Available: {', '.join(commands)}"
+ msg.fail(f"Unknown command: {command}", available, exits=1)
diff --git a/spacy/_ml.py b/spacy/_ml.py
index 8695a88cc..a1d2b6b77 100644
--- a/spacy/_ml.py
+++ b/spacy/_ml.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import numpy
from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu
from thinc.t2t import ExtractWindow, ParametricAttention
diff --git a/spacy/analysis.py b/spacy/analysis.py
index 761be3de9..ed6d6b18e 100644
--- a/spacy/analysis.py
+++ b/spacy/analysis.py
@@ -1,7 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-from collections import OrderedDict
from wasabi import Printer
from .tokens import Doc, Token, Span
@@ -23,7 +19,7 @@ def analyze_pipes(pipeline, name, pipe, index, warn=True):
assert pipeline[index][0] == name
prev_pipes = pipeline[:index]
pipe_requires = getattr(pipe, "requires", [])
- requires = OrderedDict([(annot, False) for annot in pipe_requires])
+ requires = {annot: False for annot in pipe_requires}
if requires:
for prev_name, prev_pipe in prev_pipes:
prev_assigns = getattr(prev_pipe, "assigns", [])
@@ -98,15 +94,15 @@ def validate_attrs(values):
for ext_attr, ext_value in value.items():
# We don't check whether the attribute actually exists
if ext_value is not True: # attr is something like doc._.x.y
- good = "{}._.{}".format(obj_key, ext_attr)
- bad = "{}.{}".format(good, ".".join(ext_value))
+ good = f"{obj_key}._.{ext_attr}"
+ bad = f"{good}.{'.'.join(ext_value)}"
raise ValueError(Errors.E183.format(attr=bad, solution=good))
continue # we can't validate those further
if attr.endswith("_"): # attr is something like "token.pos_"
raise ValueError(Errors.E184.format(attr=attr, solution=attr[:-1]))
if value is not True: # attr is something like doc.x.y
- good = "{}.{}".format(obj_key, attr)
- bad = "{}.{}".format(good, ".".join(value))
+ good = f"{obj_key}.{attr}"
+ bad = f"{good}.{'.'.join(value)}"
raise ValueError(Errors.E183.format(attr=bad, solution=good))
obj = objs[obj_key]
if not hasattr(obj, attr):
@@ -168,11 +164,10 @@ def print_summary(nlp, pretty=True, no_print=False):
msg.table(overview, header=header, divider=True, multiline=True)
n_problems = sum(len(p) for p in problems.values())
if any(p for p in problems.values()):
- msg.divider("Problems ({})".format(n_problems))
+ msg.divider(f"Problems ({n_problems})")
for name, problem in problems.items():
if problem:
- problem = ", ".join(problem)
- msg.warn("'{}' requirements not met: {}".format(name, problem))
+ msg.warn(f"'{name}' requirements not met: {', '.join(problem)}")
else:
msg.good("No problems found.")
if no_print:
diff --git a/spacy/attrs.pyx b/spacy/attrs.pyx
index 6d1c18eb9..a601a7a66 100644
--- a/spacy/attrs.pyx
+++ b/spacy/attrs.pyx
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
IDS = {
"": NULL_ATTR,
diff --git a/spacy/cli/_schemas.py b/spacy/cli/_schemas.py
index 3fb2c8979..42e5e04dd 100644
--- a/spacy/cli/_schemas.py
+++ b/spacy/cli/_schemas.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
# NB: This schema describes the new format of the training data, see #2928
TRAINING_SCHEMA = {
diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py
index 0cc0693a8..d8c8a7a18 100644
--- a/spacy/cli/convert.py
+++ b/spacy/cli/convert.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import plac
from pathlib import Path
from wasabi import Printer
@@ -30,16 +27,18 @@ FILE_TYPES_STDOUT = ("json", "jsonl")
@plac.annotations(
+ # fmt: off
input_file=("Input file", "positional", None, str),
output_dir=("Output directory. '-' for stdout.", "positional", None, str),
- file_type=("Type of data to produce: {}".format(FILE_TYPES), "option", "t", str),
+ file_type=(f"Type of data to produce: {FILE_TYPES}", "option", "t", str, FILE_TYPES),
n_sents=("Number of sentences per doc (0 to disable)", "option", "n", int),
seg_sents=("Segment sentences (for -c ner)", "flag", "s"),
model=("Model for sentence segmentation (for -s)", "option", "b", str),
- converter=("Converter: {}".format(tuple(CONVERTERS.keys())), "option", "c", str),
+ converter=(f"Converter: {tuple(CONVERTERS.keys())}", "option", "c", str),
lang=("Language (if tokenizer required)", "option", "l", str),
morphology=("Enable appending morphology to tags", "flag", "m", bool),
- ner_map_path=("NER tag mapping (as JSON-encoded dict of entity types)", "option", "N", Path),
+ ner_map_path=("NER tag mapping (as JSON-encoded dict of entity types)", "option", "N", Path,),
+ # fmt: on
)
def convert(
input_file,
@@ -62,16 +61,10 @@ def convert(
no_print = output_dir == "-"
msg = Printer(no_print=no_print)
input_path = Path(input_file)
- if file_type not in FILE_TYPES:
- msg.fail(
- "Unknown file type: '{}'".format(file_type),
- "Supported file types: '{}'".format(", ".join(FILE_TYPES)),
- exits=1,
- )
if file_type not in FILE_TYPES_STDOUT and output_dir == "-":
# TODO: support msgpack via stdout in srsly?
msg.fail(
- "Can't write .{} data to stdout.".format(file_type),
+ f"Can't write .{file_type} data to stdout",
"Please specify an output directory.",
exits=1,
)
@@ -95,7 +88,7 @@ def convert(
"Can't automatically detect NER format. Conversion may not succeed. See https://spacy.io/api/cli#convert"
)
if converter not in CONVERTERS:
- msg.fail("Can't find converter for {}".format(converter), exits=1)
+ msg.fail(f"Can't find converter for {converter}", exits=1)
ner_map = None
if ner_map_path is not None:
ner_map = srsly.read_json(ner_map_path)
@@ -113,7 +106,7 @@ def convert(
)
if output_dir != "-":
# Export data to a file
- suffix = ".{}".format(file_type)
+ suffix = f".{file_type}"
output_file = Path(output_dir) / Path(input_path.parts[-1]).with_suffix(suffix)
if file_type == "json":
srsly.write_json(output_file, data)
@@ -121,9 +114,7 @@ def convert(
srsly.write_jsonl(output_file, data)
elif file_type == "msg":
srsly.write_msgpack(output_file, data)
- msg.good(
- "Generated output file ({} documents): {}".format(len(data), output_file)
- )
+ msg.good(f"Generated output file ({len(data)} documents): {output_file}")
else:
# Print to stdout
if file_type == "json":
diff --git a/spacy/cli/converters/conll_ner2json.py b/spacy/cli/converters/conll_ner2json.py
index 46489ad7c..b607d5913 100644
--- a/spacy/cli/converters/conll_ner2json.py
+++ b/spacy/cli/converters/conll_ner2json.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from wasabi import Printer
from ...gold import iob_to_biluo
@@ -64,9 +61,9 @@ def conll_ner2json(
# sentence segmentation required for document segmentation
if n_sents > 0 and not seg_sents:
msg.warn(
- "No sentence boundaries found to use with option `-n {}`. "
- "Use `-s` to automatically segment sentences or `-n 0` "
- "to disable.".format(n_sents)
+ f"No sentence boundaries found to use with option `-n {n_sents}`. "
+ f"Use `-s` to automatically segment sentences or `-n 0` "
+ f"to disable."
)
else:
n_sents_info(msg, n_sents)
@@ -129,7 +126,7 @@ def segment_sents_and_docs(doc, n_sents, doc_delimiter, model=None, msg=None):
if model:
nlp = load_model(model)
if "parser" in nlp.pipe_names:
- msg.info("Segmenting sentences with parser from model '{}'.".format(model))
+ msg.info(f"Segmenting sentences with parser from model '{model}'.")
sentencizer = nlp.get_pipe("parser")
if not sentencizer:
msg.info(
@@ -166,7 +163,7 @@ def segment_docs(input_data, n_sents, doc_delimiter):
def n_sents_info(msg, n_sents):
- msg.info("Grouping every {} sentences into a document.".format(n_sents))
+ msg.info(f"Grouping every {n_sents} sentences into a document.")
if n_sents == 1:
msg.warn(
"To generate better training data, you may want to group "
diff --git a/spacy/cli/converters/conllu2json.py b/spacy/cli/converters/conllu2json.py
index 3febd07d1..12b1103d4 100644
--- a/spacy/cli/converters/conllu2json.py
+++ b/spacy/cli/converters/conllu2json.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import re
from spacy.gold import Example
diff --git a/spacy/cli/converters/iob2json.py b/spacy/cli/converters/iob2json.py
index 61c398f8d..b6ac234fc 100644
--- a/spacy/cli/converters/iob2json.py
+++ b/spacy/cli/converters/iob2json.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from wasabi import Printer
from ...gold import iob_to_biluo
diff --git a/spacy/cli/converters/jsonl2json.py b/spacy/cli/converters/jsonl2json.py
index 1c1bc45c7..525063b22 100644
--- a/spacy/cli/converters/jsonl2json.py
+++ b/spacy/cli/converters/jsonl2json.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import srsly
from ...gold import docs_to_json
diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index c2af5bff0..2e780f53c 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals, print_function
-
from pathlib import Path
from collections import Counter
import plac
@@ -23,20 +20,17 @@ BLANK_MODEL_THRESHOLD = 2000
@plac.annotations(
+ # fmt: off
lang=("model language", "positional", None, str),
train_path=("location of JSON-formatted training data", "positional", None, Path),
dev_path=("location of JSON-formatted development data", "positional", None, Path),
tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path),
base_model=("name of model to update (optional)", "option", "b", str),
- pipeline=(
- "Comma-separated names of pipeline components to train",
- "option",
- "p",
- str,
- ),
+ pipeline=("Comma-separated names of pipeline components to train", "option", "p", str),
ignore_warnings=("Ignore warnings, only show stats and errors", "flag", "IW", bool),
verbose=("Print additional information and explanations", "flag", "V", bool),
no_format=("Don't pretty-print the results", "flag", "NF", bool),
+ # fmt: on
)
def debug_data(
lang,
@@ -93,15 +87,11 @@ def debug_data(
corpus.train_dataset_without_preprocessing(nlp)
)
except ValueError as e:
- loading_train_error_message = "Training data cannot be loaded: {}".format(
- str(e)
- )
+ loading_train_error_message = f"Training data cannot be loaded: {e}"
try:
dev_dataset = list(corpus.dev_dataset(nlp))
except ValueError as e:
- loading_dev_error_message = "Development data cannot be loaded: {}".format(
- str(e)
- )
+ loading_dev_error_message = f"Development data cannot be loaded: {e}"
if loading_train_error_message or loading_dev_error_message:
if loading_train_error_message:
msg.fail(loading_train_error_message)
@@ -112,78 +102,66 @@ def debug_data(
# Create all gold data here to avoid iterating over the train_dataset constantly
gold_train_data = _compile_gold(train_dataset, pipeline)
- gold_train_unpreprocessed_data = _compile_gold(train_dataset_unpreprocessed, pipeline)
+ gold_train_unpreprocessed_data = _compile_gold(
+ train_dataset_unpreprocessed, pipeline
+ )
gold_dev_data = _compile_gold(dev_dataset, pipeline)
train_texts = gold_train_data["texts"]
dev_texts = gold_dev_data["texts"]
msg.divider("Training stats")
- msg.text("Training pipeline: {}".format(", ".join(pipeline)))
+ msg.text(f"Training pipeline: {', '.join(pipeline)}")
for pipe in [p for p in pipeline if p not in nlp.factories]:
- msg.fail("Pipeline component '{}' not available in factories".format(pipe))
+ msg.fail(f"Pipeline component '{pipe}' not available in factories")
if base_model:
- msg.text("Starting with base model '{}'".format(base_model))
+ msg.text(f"Starting with base model '{base_model}'")
else:
- msg.text("Starting with blank model '{}'".format(lang))
- msg.text("{} training docs".format(len(train_dataset)))
- msg.text("{} evaluation docs".format(len(gold_dev_data)))
+ msg.text(f"Starting with blank model '{lang}'")
+ msg.text(f"{len(train_dataset)} training docs")
+ msg.text(f"{len(gold_dev_data)} evaluation docs")
if not len(gold_dev_data):
msg.fail("No evaluation docs")
overlap = len(train_texts.intersection(dev_texts))
if overlap:
- msg.warn("{} training examples also in evaluation data".format(overlap))
+ msg.warn(f"{overlap} training examples also in evaluation data")
else:
msg.good("No overlap between training and evaluation data")
if not base_model and len(train_dataset) < BLANK_MODEL_THRESHOLD:
- text = "Low number of examples to train from a blank model ({})".format(
- len(train_dataset)
+ text = (
+ f"Low number of examples to train from a blank model ({len(train_dataset)})"
)
if len(train_dataset) < BLANK_MODEL_MIN_THRESHOLD:
msg.fail(text)
else:
msg.warn(text)
msg.text(
- "It's recommended to use at least {} examples (minimum {})".format(
- BLANK_MODEL_THRESHOLD, BLANK_MODEL_MIN_THRESHOLD
- ),
+ f"It's recommended to use at least {BLANK_MODEL_THRESHOLD} examples "
+ f"(minimum {BLANK_MODEL_MIN_THRESHOLD})",
show=verbose,
)
msg.divider("Vocab & Vectors")
n_words = gold_train_data["n_words"]
msg.info(
- "{} total {} in the data ({} unique)".format(
- n_words, "word" if n_words == 1 else "words", len(gold_train_data["words"])
- )
+ f"{n_words} total word(s) in the data ({len(gold_train_data['words'])} unique)"
)
if gold_train_data["n_misaligned_words"] > 0:
- msg.warn(
- "{} misaligned tokens in the training data".format(
- gold_train_data["n_misaligned_words"]
- )
- )
+ n_misaligned = gold_train_data["n_misaligned_words"]
+ msg.warn(f"{n_misaligned} misaligned tokens in the training data")
if gold_dev_data["n_misaligned_words"] > 0:
- msg.warn(
- "{} misaligned tokens in the dev data".format(
- gold_dev_data["n_misaligned_words"]
- )
- )
+ n_misaligned = gold_dev_data["n_misaligned_words"]
+ msg.warn(f"{n_misaligned} misaligned tokens in the dev data")
most_common_words = gold_train_data["words"].most_common(10)
msg.text(
- "10 most common words: {}".format(
- _format_labels(most_common_words, counts=True)
- ),
+ f"10 most common words: {_format_labels(most_common_words, counts=True)}",
show=verbose,
)
if len(nlp.vocab.vectors):
msg.info(
- "{} vectors ({} unique keys, {} dimensions)".format(
- len(nlp.vocab.vectors),
- nlp.vocab.vectors.n_keys,
- nlp.vocab.vectors_length,
- )
+ f"{len(nlp.vocab.vectors)} vectors ({nlp.vocab.vectors.n_keys} "
+ f"unique keys, {nlp.vocab.vectors_length} dimensions)"
)
else:
msg.info("No word vectors present in the model")
@@ -203,19 +181,10 @@ def debug_data(
msg.divider("Named Entity Recognition")
msg.info(
- "{} new {}, {} existing {}".format(
- len(new_labels),
- "label" if len(new_labels) == 1 else "labels",
- len(existing_labels),
- "label" if len(existing_labels) == 1 else "labels",
- )
+ f"{len(new_labels)} new label(s), {len(existing_labels)} existing label(s)"
)
missing_values = label_counts["-"]
- msg.text(
- "{} missing {} (tokens with '-' label)".format(
- missing_values, "value" if missing_values == 1 else "values"
- )
- )
+ msg.text(f"{missing_values} missing value(s) (tokens with '-' label)")
for label in new_labels:
if len(label) == 0:
msg.fail("Empty label found in new labels")
@@ -226,33 +195,24 @@ def debug_data(
if label != "-"
]
labels_with_counts = _format_labels(labels_with_counts, counts=True)
- msg.text("New: {}".format(labels_with_counts), show=verbose)
+ msg.text(f"New: {labels_with_counts}", show=verbose)
if existing_labels:
- msg.text(
- "Existing: {}".format(_format_labels(existing_labels)), show=verbose
- )
-
+ msg.text(f"Existing: {_format_labels(existing_labels)}", show=verbose)
if gold_train_data["ws_ents"]:
- msg.fail(
- "{} invalid whitespace entity spans".format(gold_train_data["ws_ents"])
- )
+ msg.fail(f"{gold_train_data['ws_ents']} invalid whitespace entity spans")
has_ws_ents_error = True
for label in new_labels:
if label_counts[label] <= NEW_LABEL_THRESHOLD:
msg.warn(
- "Low number of examples for new label '{}' ({})".format(
- label, label_counts[label]
- )
+ f"Low number of examples for new label '{label}' ({label_counts[label]})"
)
has_low_data_warning = True
with msg.loading("Analyzing label distribution..."):
neg_docs = _get_examples_without_label(train_dataset, label)
if neg_docs == 0:
- msg.warn(
- "No examples for texts WITHOUT new label '{}'".format(label)
- )
+ msg.warn(f"No examples for texts WITHOUT new label '{label}'")
has_no_neg_warning = True
if not has_low_data_warning:
@@ -264,8 +224,8 @@ def debug_data(
if has_low_data_warning:
msg.text(
- "To train a new entity type, your data should include at "
- "least {} instances of the new label".format(NEW_LABEL_THRESHOLD),
+ f"To train a new entity type, your data should include at "
+ f"least {NEW_LABEL_THRESHOLD} instances of the new label",
show=verbose,
)
if has_no_neg_warning:
@@ -288,27 +248,21 @@ def debug_data(
new_labels = [l for l in labels if l not in model_labels]
existing_labels = [l for l in labels if l in model_labels]
msg.info(
- "Text Classification: {} new label(s), {} existing label(s)".format(
- len(new_labels), len(existing_labels)
- )
+ f"Text Classification: {len(new_labels)} new label(s), "
+ f"{len(existing_labels)} existing label(s)"
)
if new_labels:
labels_with_counts = _format_labels(
gold_train_data["cats"].most_common(), counts=True
)
- msg.text("New: {}".format(labels_with_counts), show=verbose)
+ msg.text(f"New: {labels_with_counts}", show=verbose)
if existing_labels:
- msg.text(
- "Existing: {}".format(_format_labels(existing_labels)), show=verbose
- )
+ msg.text(f"Existing: {_format_labels(existing_labels)}", show=verbose)
if set(gold_train_data["cats"]) != set(gold_dev_data["cats"]):
msg.fail(
- "The train and dev labels are not the same. "
- "Train labels: {}. "
- "Dev labels: {}.".format(
- _format_labels(gold_train_data["cats"]),
- _format_labels(gold_dev_data["cats"]),
- )
+ f"The train and dev labels are not the same. "
+ f"Train labels: {_format_labels(gold_train_data['cats'])}. "
+ f"Dev labels: {_format_labels(gold_dev_data['cats'])}."
)
if gold_train_data["n_cats_multilabel"] > 0:
msg.info(
@@ -338,27 +292,16 @@ def debug_data(
msg.divider("Part-of-speech Tagging")
labels = [label for label in gold_train_data["tags"]]
tag_map = nlp.vocab.morphology.tag_map
- msg.info(
- "{} {} in data ({} {} in tag map)".format(
- len(labels),
- "label" if len(labels) == 1 else "labels",
- len(tag_map),
- "label" if len(tag_map) == 1 else "labels",
- )
- )
+ msg.info(f"{len(labels)} label(s) in data ({len(tag_map)} label(s) in tag map)")
labels_with_counts = _format_labels(
gold_train_data["tags"].most_common(), counts=True
)
msg.text(labels_with_counts, show=verbose)
non_tagmap = [l for l in labels if l not in tag_map]
if not non_tagmap:
- msg.good("All labels present in tag map for language '{}'".format(nlp.lang))
+ msg.good(f"All labels present in tag map for language '{nlp.lang}'")
for label in non_tagmap:
- msg.fail(
- "Label '{}' not found in tag map for language '{}'".format(
- label, nlp.lang
- )
- )
+ msg.fail(f"Label '{label}' not found in tag map for language '{nlp.lang}'")
if "parser" in pipeline:
has_low_data_warning = False
@@ -366,21 +309,18 @@ def debug_data(
# profile sentence length
msg.info(
- "Found {} sentence{} with an average length of {:.1f} words.".format(
- gold_train_data["n_sents"],
- "s" if len(train_dataset) > 1 else "",
- gold_train_data["n_words"] / gold_train_data["n_sents"],
- )
+ f"Found {gold_train_data['n_sents']} sentence(s) with an average "
+ f"length of {gold_train_data['n_words'] / gold_train_data['n_sents']:.1f} words."
)
# check for documents with multiple sentences
sents_per_doc = gold_train_data["n_sents"] / len(gold_train_data["texts"])
if sents_per_doc < 1.1:
msg.warn(
- "The training data contains {:.2f} sentences per "
- "document. When there are very few documents containing more "
- "than one sentence, the parser will not learn how to segment "
- "longer texts into sentences.".format(sents_per_doc)
+ f"The training data contains {sents_per_doc:.2f} sentences per "
+ f"document. When there are very few documents containing more "
+ f"than one sentence, the parser will not learn how to segment "
+ f"longer texts into sentences."
)
# profile labels
@@ -391,32 +331,13 @@ def debug_data(
labels_dev = [label for label in gold_dev_data["deps"]]
if gold_train_unpreprocessed_data["n_nonproj"] > 0:
- msg.info(
- "Found {} nonprojective train sentence{}".format(
- gold_train_unpreprocessed_data["n_nonproj"],
- "s" if gold_train_unpreprocessed_data["n_nonproj"] > 1 else "",
- )
- )
+ n_nonproj = gold_train_unpreprocessed_data["n_nonproj"]
+ msg.info(f"Found {n_nonproj} nonprojective train sentence(s)")
if gold_dev_data["n_nonproj"] > 0:
- msg.info(
- "Found {} nonprojective dev sentence{}".format(
- gold_dev_data["n_nonproj"],
- "s" if gold_dev_data["n_nonproj"] > 1 else "",
- )
- )
-
- msg.info(
- "{} {} in train data".format(
- len(labels_train_unpreprocessed),
- "label" if len(labels_train) == 1 else "labels",
- )
- )
- msg.info(
- "{} {} in projectivized train data".format(
- len(labels_train), "label" if len(labels_train) == 1 else "labels"
- )
- )
-
+ n_nonproj = gold_dev_data["n_nonproj"]
+ msg.info(f"Found {n_nonproj} nonprojective dev sentence(s)")
+    msg.info(f"{len(labels_train_unpreprocessed)} label(s) in train data")
+ msg.info(f"{len(labels_train)} label(s) in projectivized train data")
labels_with_counts = _format_labels(
gold_train_unpreprocessed_data["deps"].most_common(), counts=True
)
@@ -426,9 +347,8 @@ def debug_data(
for label in gold_train_unpreprocessed_data["deps"]:
if gold_train_unpreprocessed_data["deps"][label] <= DEP_LABEL_THRESHOLD:
msg.warn(
- "Low number of examples for label '{}' ({})".format(
- label, gold_train_unpreprocessed_data["deps"][label]
- )
+ f"Low number of examples for label '{label}' "
+ f"({gold_train_unpreprocessed_data['deps'][label]})"
)
has_low_data_warning = True
@@ -437,22 +357,19 @@ def debug_data(
for label in gold_train_data["deps"]:
if gold_train_data["deps"][label] <= DEP_LABEL_THRESHOLD and "||" in label:
rare_projectivized_labels.append(
- "{}: {}".format(label, str(gold_train_data["deps"][label]))
+ f"{label}: {gold_train_data['deps'][label]}"
)
if len(rare_projectivized_labels) > 0:
msg.warn(
- "Low number of examples for {} label{} in the "
- "projectivized dependency trees used for training. You may "
- "want to projectivize labels such as punct before "
- "training in order to improve parser performance.".format(
- len(rare_projectivized_labels),
- "s" if len(rare_projectivized_labels) > 1 else "",
- )
+ f"Low number of examples for {len(rare_projectivized_labels)} "
+ "label(s) in the projectivized dependency trees used for "
+ "training. You may want to projectivize labels such as punct "
+ "before training in order to improve parser performance."
)
msg.warn(
- "Projectivized labels with low numbers of examples: "
- "{}".format("\n".join(rare_projectivized_labels)),
+                "Projectivized labels with low numbers of examples:",
+ ", ".join(rare_projectivized_labels),
show=verbose,
)
has_low_data_warning = True
@@ -460,50 +377,44 @@ def debug_data(
# labels only in train
if set(labels_train) - set(labels_dev):
msg.warn(
- "The following labels were found only in the train data: "
- "{}".format(", ".join(set(labels_train) - set(labels_dev))),
+ "The following labels were found only in the train data:",
+ ", ".join(set(labels_train) - set(labels_dev)),
show=verbose,
)
# labels only in dev
if set(labels_dev) - set(labels_train):
msg.warn(
- "The following labels were found only in the dev data: "
- + ", ".join(set(labels_dev) - set(labels_train)),
+ "The following labels were found only in the dev data:",
+ ", ".join(set(labels_dev) - set(labels_train)),
show=verbose,
)
if has_low_data_warning:
msg.text(
- "To train a parser, your data should include at "
- "least {} instances of each label.".format(DEP_LABEL_THRESHOLD),
+ f"To train a parser, your data should include at "
+ f"least {DEP_LABEL_THRESHOLD} instances of each label.",
show=verbose,
)
# multiple root labels
if len(gold_train_unpreprocessed_data["roots"]) > 1:
msg.warn(
- "Multiple root labels ({}) ".format(
- ", ".join(gold_train_unpreprocessed_data["roots"])
- )
- + "found in training data. spaCy's parser uses a single root "
- "label ROOT so this distinction will not be available."
+ f"Multiple root labels "
+ f"({', '.join(gold_train_unpreprocessed_data['roots'])}) "
+ f"found in training data. spaCy's parser uses a single root "
+ f"label ROOT so this distinction will not be available."
)
# these should not happen, but just in case
if gold_train_data["n_nonproj"] > 0:
msg.fail(
- "Found {} nonprojective projectivized train sentence{}".format(
- gold_train_data["n_nonproj"],
- "s" if gold_train_data["n_nonproj"] > 1 else "",
- )
+ f"Found {gold_train_data['n_nonproj']} nonprojective "
+ f"projectivized train sentence(s)"
)
if gold_train_data["n_cycles"] > 0:
msg.fail(
- "Found {} projectivized train sentence{} with cycles".format(
- gold_train_data["n_cycles"],
- "s" if gold_train_data["n_cycles"] > 1 else "",
- )
+ f"Found {gold_train_data['n_cycles']} projectivized train sentence(s) with cycles"
)
msg.divider("Summary")
@@ -511,36 +422,28 @@ def debug_data(
warn_counts = msg.counts[MESSAGES.WARN]
fail_counts = msg.counts[MESSAGES.FAIL]
if good_counts:
- msg.good(
- "{} {} passed".format(
- good_counts, "check" if good_counts == 1 else "checks"
- )
- )
+ msg.good(f"{good_counts} {'check' if good_counts == 1 else 'checks'} passed")
if warn_counts:
- msg.warn(
- "{} {}".format(warn_counts, "warning" if warn_counts == 1 else "warnings")
- )
- if fail_counts:
- msg.fail("{} {}".format(fail_counts, "error" if fail_counts == 1 else "errors"))
-
+ msg.warn(f"{warn_counts} {'warning' if warn_counts == 1 else 'warnings'}")
if fail_counts:
+ msg.fail(f"{fail_counts} {'error' if fail_counts == 1 else 'errors'}")
sys.exit(1)
def _load_file(file_path, msg):
file_name = file_path.parts[-1]
if file_path.suffix == ".json":
- with msg.loading("Loading {}...".format(file_name)):
+ with msg.loading(f"Loading {file_name}..."):
data = srsly.read_json(file_path)
- msg.good("Loaded {}".format(file_name))
+ msg.good(f"Loaded {file_name}")
return data
elif file_path.suffix == ".jsonl":
- with msg.loading("Loading {}...".format(file_name)):
+ with msg.loading(f"Loading {file_name}..."):
data = srsly.read_jsonl(file_path)
- msg.good("Loaded {}".format(file_name))
+ msg.good(f"Loaded {file_name}")
return data
msg.fail(
- "Can't load file extension {}".format(file_path.suffix),
+ f"Can't load file extension {file_path.suffix}",
"Expected .json or .jsonl",
exits=1,
)
@@ -604,14 +507,18 @@ def _compile_gold(examples, pipeline):
def _format_labels(labels, counts=False):
if counts:
- return ", ".join(["'{}' ({})".format(l, c) for l, c in labels])
- return ", ".join(["'{}'".format(l) for l in labels])
+ return ", ".join([f"'{l}' ({c})" for l, c in labels])
+ return ", ".join([f"'{l}'" for l in labels])
def _get_examples_without_label(data, label):
count = 0
for ex in data:
- labels = [label.split("-")[1] for label in ex.gold.ner if label not in ("O", "-", None)]
+ labels = [
+ label.split("-")[1]
+ for label in ex.gold.ner
+ if label not in ("O", "-", None)
+ ]
if label not in labels:
count += 1
return count
diff --git a/spacy/cli/download.py b/spacy/cli/download.py
index 19f3e7860..7c87a582a 100644
--- a/spacy/cli/download.py
+++ b/spacy/cli/download.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import plac
import requests
import os
@@ -50,7 +47,7 @@ def download(model, direct=False, *pip_args):
sys.exit(dl)
msg.good(
"Download and installation successful",
- "You can now load the model via spacy.load('{}')".format(model_name),
+ f"You can now load the model via spacy.load('{model_name}')",
)
# Only create symlink if the model is installed via a shortcut like 'en'.
# There's no real advantage over an additional symlink for en_core_web_sm
@@ -69,10 +66,10 @@ def download(model, direct=False, *pip_args):
# message and loading instructions, even if linking fails.
msg.warn(
"Download successful but linking failed",
- "Creating a shortcut link for '{}' didn't work (maybe you "
- "don't have admin permissions?), but you can still load "
- "the model via its full package name: "
- "nlp = spacy.load('{}')".format(model, model_name),
+ f"Creating a shortcut link for '{model}' didn't work (maybe you "
+ f"don't have admin permissions?), but you can still load "
+ f"the model via its full package name: "
+ f"nlp = spacy.load('{model_name}')",
)
# If a model is downloaded and then loaded within the same process, our
# is_package check currently fails, because pkg_resources.working_set
@@ -95,11 +92,11 @@ def get_json(url, desc):
r = requests.get(url)
if r.status_code != 200:
msg.fail(
- "Server error ({})".format(r.status_code),
- "Couldn't fetch {}. Please find a model for your spaCy "
- "installation (v{}), and download it manually. For more "
- "details, see the documentation: "
- "https://spacy.io/usage/models".format(desc, about.__version__),
+ f"Server error ({r.status_code})",
+ f"Couldn't fetch {desc}. Please find a model for your spaCy "
+ f"installation (v{about.__version__}), and download it manually. "
+ f"For more details, see the documentation: "
+ f"https://spacy.io/usage/models",
exits=1,
)
return r.json()
@@ -111,7 +108,7 @@ def get_compatibility():
comp_table = get_json(about.__compatibility__, "compatibility table")
comp = comp_table["spacy"]
if version not in comp:
- msg.fail("No compatible models found for v{} of spaCy".format(version), exits=1)
+ msg.fail(f"No compatible models found for v{version} of spaCy", exits=1)
return comp[version]
@@ -119,8 +116,8 @@ def get_version(model, comp):
model = model.rsplit(".dev", 1)[0]
if model not in comp:
msg.fail(
- "No compatible model found for '{}' "
- "(spaCy v{}).".format(model, about.__version__),
+ f"No compatible model found for '{model}' "
+ f"(spaCy v{about.__version__}).",
exits=1,
)
return comp[model][0]
diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py
index da8a714a7..de2cb4d09 100644
--- a/spacy/cli/evaluate.py
+++ b/spacy/cli/evaluate.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals, division, print_function
-
import plac
from timeit import default_timer as timer
from wasabi import msg
@@ -79,7 +76,7 @@ def evaluate(
deps=render_deps,
ents=render_ents,
)
- msg.good("Generated {} parses as HTML".format(displacy_limit), displacy_path)
+ msg.good(f"Generated {displacy_limit} parses as HTML", displacy_path)
if return_scores:
return scorer.scores
diff --git a/spacy/cli/info.py b/spacy/cli/info.py
index 080d0dc77..060a38e78 100644
--- a/spacy/cli/info.py
+++ b/spacy/cli/info.py
@@ -1,13 +1,9 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import plac
import platform
from pathlib import Path
from wasabi import msg
import srsly
-from ..compat import path2str, basestring_, unicode_
from .. import util
from .. import about
@@ -33,12 +29,12 @@ def info(model=None, markdown=False, silent=False):
msg.fail("Can't find model meta.json", meta_path, exits=1)
meta = srsly.read_json(meta_path)
if model_path.resolve() != model_path:
- meta["link"] = path2str(model_path)
- meta["source"] = path2str(model_path.resolve())
+ meta["link"] = str(model_path)
+ meta["source"] = str(model_path.resolve())
else:
- meta["source"] = path2str(model_path)
+ meta["source"] = str(model_path)
if not silent:
- title = "Info about model '{}'".format(model)
+ title = f"Info about model '{model}'"
model_meta = {
k: v for k, v in meta.items() if k not in ("accuracy", "speed")
}
@@ -49,7 +45,7 @@ def info(model=None, markdown=False, silent=False):
return meta
data = {
"spaCy version": about.__version__,
- "Location": path2str(Path(__file__).parent.parent),
+ "Location": str(Path(__file__).parent.parent),
"Platform": platform.platform(),
"Python version": platform.python_version(),
"Models": list_models(),
@@ -84,9 +80,9 @@ def print_markdown(data, title=None):
"""
markdown = []
for key, value in data.items():
- if isinstance(value, basestring_) and Path(value).exists():
+ if isinstance(value, str) and Path(value).exists():
continue
- markdown.append("* **{}:** {}".format(key, unicode_(value)))
+ markdown.append(f"* **{key}:** {value}")
if title:
- print("\n## {}".format(title))
+ print(f"\n## {title}")
print("\n{}\n".format("\n".join(markdown)))
diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py
index 3fa0cc890..c3ef5267c 100644
--- a/spacy/cli/init_model.py
+++ b/spacy/cli/init_model.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import plac
import math
from tqdm import tqdm
@@ -91,8 +88,7 @@ def init_model(
vec_added = len(nlp.vocab.vectors)
lex_added = len(nlp.vocab)
msg.good(
- "Sucessfully compiled vocab",
- "{} entries, {} vectors".format(lex_added, vec_added),
+        "Successfully compiled vocab", f"{lex_added} entries, {vec_added} vectors",
)
if not output_dir.exists():
output_dir.mkdir()
@@ -177,9 +173,9 @@ def add_vectors(nlp, vectors_loc, prune_vectors, name=None):
nlp.vocab.vectors.add(lex.orth, row=lex.rank)
else:
if vectors_loc:
- with msg.loading("Reading vectors from {}".format(vectors_loc)):
+ with msg.loading(f"Reading vectors from {vectors_loc}"):
vectors_data, vector_keys = read_vectors(vectors_loc)
- msg.good("Loaded vectors from {}".format(vectors_loc))
+ msg.good(f"Loaded vectors from {vectors_loc}")
else:
vectors_data, vector_keys = (None, None)
if vector_keys is not None:
diff --git a/spacy/cli/link.py b/spacy/cli/link.py
index 8117829b5..df24adc23 100644
--- a/spacy/cli/link.py
+++ b/spacy/cli/link.py
@@ -1,11 +1,8 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import plac
from pathlib import Path
from wasabi import msg
-from ..compat import symlink_to, path2str
+from ..compat import symlink_to
from .. import util
@@ -27,23 +24,23 @@ def link(origin, link_name, force=False, model_path=None):
if not model_path.exists():
msg.fail(
"Can't locate model data",
- "The data should be located in {}".format(path2str(model_path)),
+ f"The data should be located in {model_path}",
exits=1,
)
data_path = util.get_data_path()
if not data_path or not data_path.exists():
spacy_loc = Path(__file__).parent.parent
msg.fail(
- "Can't find the spaCy data path to create model symlink",
- "Make sure a directory `/data` exists within your spaCy "
- "installation and try again. The data directory should be located "
- "here:".format(path=spacy_loc),
+ f"Can't find the spaCy data path to create model symlink",
+ f"Make sure a directory `/data` exists within your spaCy "
+ f"installation and try again. The data directory should be located "
+ f"here: {spacy_loc}",
exits=1,
)
link_path = util.get_data_path() / link_name
if link_path.is_symlink() and not force:
msg.fail(
- "Link '{}' already exists".format(link_name),
+ f"Link '{link_name}' already exists",
"To overwrite an existing link, use the --force flag",
exits=1,
)
@@ -54,18 +51,18 @@ def link(origin, link_name, force=False, model_path=None):
elif link_path.exists(): # does it exist otherwise?
# NB: Check this last because valid symlinks also "exist".
msg.fail(
- "Can't overwrite symlink '{}'".format(link_name),
+ f"Can't overwrite symlink '{link_name}'",
"This can happen if your data directory contains a directory or "
"file of the same name.",
exits=1,
)
- details = "%s --> %s" % (path2str(model_path), path2str(link_path))
+ details = f"{model_path} --> {link_path}"
try:
symlink_to(link_path, model_path)
except: # noqa: E722
# This is quite dirty, but just making sure other errors are caught.
msg.fail(
- "Couldn't link model to '{}'".format(link_name),
+ f"Couldn't link model to '{link_name}'",
"Creating a symlink in spacy/data failed. Make sure you have the "
"required permissions and try re-running the command as admin, or "
"use a virtualenv. You can still import the model as a module and "
@@ -74,4 +71,4 @@ def link(origin, link_name, force=False, model_path=None):
msg.text(details)
raise
msg.good("Linking successful", details)
- msg.text("You can now load the model via spacy.load('{}')".format(link_name))
+ msg.text(f"You can now load the model via spacy.load('{link_name}')")
diff --git a/spacy/cli/package.py b/spacy/cli/package.py
index 8ed92259c..8830a0ca2 100644
--- a/spacy/cli/package.py
+++ b/spacy/cli/package.py
@@ -1,13 +1,9 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import plac
import shutil
from pathlib import Path
from wasabi import msg, get_raw_input
import srsly
-from ..compat import path2str
from .. import util
from .. import about
@@ -47,7 +43,7 @@ def package(input_dir, output_dir, meta_path=None, create_meta=False, force=Fals
for key in ("lang", "name", "version"):
if key not in meta or meta[key] == "":
msg.fail(
- "No '{}' setting found in meta.json".format(key),
+ f"No '{key}' setting found in meta.json",
"This setting is required to build your package.",
exits=1,
)
@@ -58,22 +54,21 @@ def package(input_dir, output_dir, meta_path=None, create_meta=False, force=Fals
if package_path.exists():
if force:
- shutil.rmtree(path2str(package_path))
+ shutil.rmtree(str(package_path))
else:
msg.fail(
"Package directory already exists",
"Please delete the directory and try again, or use the "
- "`--force` flag to overwrite existing "
- "directories.".format(path=path2str(package_path)),
+ "`--force` flag to overwrite existing directories.",
exits=1,
)
Path.mkdir(package_path, parents=True)
- shutil.copytree(path2str(input_path), path2str(package_path / model_name_v))
+ shutil.copytree(str(input_path), str(package_path / model_name_v))
create_file(main_path / "meta.json", srsly.json_dumps(meta, indent=2))
create_file(main_path / "setup.py", TEMPLATE_SETUP)
create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST)
create_file(package_path / "__init__.py", TEMPLATE_INIT)
- msg.good("Successfully created package '{}'".format(model_name_v), main_path)
+ msg.good(f"Successfully created package '{model_name_v}'", main_path)
msg.text("To build the package, run `python setup.py sdist` in this directory.")
@@ -118,9 +113,6 @@ def generate_meta(model_path, existing_meta, msg):
TEMPLATE_SETUP = """
#!/usr/bin/env python
-# coding: utf8
-from __future__ import unicode_literals
-
import io
import json
from os import path, walk
@@ -190,9 +182,6 @@ include meta.json
TEMPLATE_INIT = """
-# coding: utf8
-from __future__ import unicode_literals
-
from pathlib import Path
from spacy.util import load_model_from_init_py, get_model_meta
diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py
index 68038bc5c..75840923e 100644
--- a/spacy/cli/pretrain.py
+++ b/spacy/cli/pretrain.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import print_function, unicode_literals
-
import plac
import random
import numpy
@@ -154,9 +151,9 @@ def pretrain(
msg.text("Reading input text from stdin...")
texts = srsly.read_jsonl("-")
- with msg.loading("Loading model '{}'...".format(vectors_model)):
+ with msg.loading(f"Loading model '{vectors_model}'..."):
nlp = util.load_model(vectors_model)
- msg.good("Loaded model '{}'".format(vectors_model))
+ msg.good(f"Loaded model '{vectors_model}'")
pretrained_vectors = None if not use_vectors else nlp.vocab.vectors.name
model = create_pretraining_model(
nlp,
@@ -173,7 +170,7 @@ def pretrain(
# Load in pretrained weights
if init_tok2vec is not None:
components = _load_pretrained_tok2vec(nlp, init_tok2vec)
- msg.text("Loaded pretrained tok2vec for: {}".format(components))
+ msg.text(f"Loaded pretrained tok2vec for: {components}")
# Parse the epoch number from the given weight file
model_name = re.search(r"model\d+\.bin", str(init_tok2vec))
if model_name:
@@ -221,7 +218,9 @@ def pretrain(
skip_counter = 0
for epoch in range(epoch_start, n_iter + epoch_start):
for batch_id, batch in enumerate(
- util.minibatch_by_words((Example(doc=text) for text in texts), size=batch_size)
+ util.minibatch_by_words(
+ (Example(doc=text) for text in texts), size=batch_size
+ )
):
docs, count = make_docs(
nlp,
@@ -246,7 +245,7 @@ def pretrain(
# Reshuffle the texts if texts were loaded from a file
random.shuffle(texts)
if skip_counter > 0:
- msg.warn("Skipped {count} empty values".format(count=str(skip_counter)))
+ msg.warn(f"Skipped {skip_counter} empty values")
msg.good("Successfully finished pretrain")
diff --git a/spacy/cli/profile.py b/spacy/cli/profile.py
index 4ee72fc23..f3df0817d 100644
--- a/spacy/cli/profile.py
+++ b/spacy/cli/profile.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals, division, print_function
-
import plac
import tqdm
from pathlib import Path
@@ -34,11 +31,11 @@ def profile(model, inputs=None, n_texts=10000):
with msg.loading("Loading IMDB dataset via Thinc..."):
imdb_train, _ = thinc.extra.datasets.imdb()
inputs, _ = zip(*imdb_train)
- msg.info("Loaded IMDB dataset and using {} examples".format(n_inputs))
+ msg.info(f"Loaded IMDB dataset and using {n_inputs} examples")
inputs = inputs[:n_inputs]
- with msg.loading("Loading model '{}'...".format(model)):
+ with msg.loading(f"Loading model '{model}'..."):
nlp = load_model(model)
- msg.good("Loaded model '{}'".format(model))
+ msg.good(f"Loaded model '{model}'")
texts = list(itertools.islice(inputs, n_texts))
cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof")
s = pstats.Stats("Profile.prof")
@@ -60,7 +57,7 @@ def _read_inputs(loc, msg):
input_path = Path(loc)
if not input_path.exists() or not input_path.is_file():
msg.fail("Not a valid input data file", loc, exits=1)
- msg.info("Using data from {}".format(input_path.parts[-1]))
+ msg.info(f"Using data from {input_path.parts[-1]}")
file_ = input_path.open()
for line in file_:
data = srsly.json_loads(line)
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index daa90f022..e8662a101 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals, division, print_function
-
import plac
import os
import tqdm
@@ -12,12 +9,10 @@ import srsly
from wasabi import msg
import contextlib
import random
-from collections import OrderedDict
from .._ml import create_default_optimizer
from ..attrs import PROB, IS_OOV, CLUSTER, LANG
from ..gold import GoldCorpus
-from ..compat import path2str
from .. import util
from .. import about
@@ -148,14 +143,14 @@ def train(
# the model and make sure the pipeline matches the pipeline setting. If
# training starts from a blank model, intitalize the language class.
pipeline = [p.strip() for p in pipeline.split(",")]
- msg.text("Training pipeline: {}".format(pipeline))
+ msg.text(f"Training pipeline: {pipeline}")
if base_model:
- msg.text("Starting with base model '{}'".format(base_model))
+ msg.text(f"Starting with base model '{base_model}'")
nlp = util.load_model(base_model)
if nlp.lang != lang:
msg.fail(
- "Model language ('{}') doesn't match language specified as "
- "`lang` argument ('{}') ".format(nlp.lang, lang),
+ f"Model language ('{nlp.lang}') doesn't match language "
+ f"specified as `lang` argument ('{lang}') ",
exits=1,
)
nlp.disable_pipes([p for p in nlp.pipe_names if p not in pipeline])
@@ -187,15 +182,13 @@ def train(
}
if base_cfg != pipe_cfg:
msg.fail(
- "The base textcat model configuration does"
- "not match the provided training options. "
- "Existing cfg: {}, provided cfg: {}".format(
- base_cfg, pipe_cfg
- ),
+                    f"The base textcat model configuration does "
+ f"not match the provided training options. "
+ f"Existing cfg: {base_cfg}, provided cfg: {pipe_cfg}",
exits=1,
)
else:
- msg.text("Starting with blank model '{}'".format(lang))
+ msg.text(f"Starting with blank model '{lang}'")
lang_cls = util.get_lang_class(lang)
nlp = lang_cls()
for pipe in pipeline:
@@ -215,7 +208,7 @@ def train(
nlp.vocab.morphology.tag_map.update(tag_map)
if vectors:
- msg.text("Loading vector from model '{}'".format(vectors))
+ msg.text(f"Loading vector from model '{vectors}'")
_load_vectors(nlp, vectors)
# Multitask objectives
@@ -224,15 +217,15 @@ def train(
if multitasks:
if pipe_name not in pipeline:
msg.fail(
- "Can't use multitask objective without '{}' in the "
- "pipeline".format(pipe_name)
+ f"Can't use multitask objective without '{pipe_name}' in "
+ f"the pipeline"
)
pipe = nlp.get_pipe(pipe_name)
for objective in multitasks.split(","):
pipe.add_multitask_objective(objective)
# Prepare training corpus
- msg.text("Counting training words (limit={})".format(n_examples))
+ msg.text(f"Counting training words (limit={n_examples})")
corpus = GoldCorpus(train_path, dev_path, limit=n_examples)
n_train_words = corpus.count_train()
@@ -248,22 +241,22 @@ def train(
# Load in pretrained weights
if init_tok2vec is not None:
components = _load_pretrained_tok2vec(nlp, init_tok2vec)
- msg.text("Loaded pretrained tok2vec for: {}".format(components))
+ msg.text(f"Loaded pretrained tok2vec for: {components}")
# Verify textcat config
if "textcat" in pipeline:
textcat_labels = nlp.get_pipe("textcat").cfg["labels"]
if textcat_positive_label and textcat_positive_label not in textcat_labels:
msg.fail(
- "The textcat_positive_label (tpl) '{}' does not match any "
- "label in the training data.".format(textcat_positive_label),
+ f"The textcat_positive_label (tpl) '{textcat_positive_label}' "
+ f"does not match any label in the training data.",
exits=1,
)
if textcat_positive_label and len(textcat_labels) != 2:
msg.fail(
- "A textcat_positive_label (tpl) '{}' was provided for training "
- "data that does not appear to be a binary classification "
- "problem with two labels.".format(textcat_positive_label),
+                f"A textcat_positive_label (tpl) '{textcat_positive_label}' was "
+ "provided for training data that does not appear to be a "
+ "binary classification problem with two labels.",
exits=1,
)
train_data = corpus.train_data(
@@ -302,20 +295,20 @@ def train(
break
if base_model and set(textcat_labels) != train_labels:
msg.fail(
- "Cannot extend textcat model using data with different "
- "labels. Base model labels: {}, training data labels: "
- "{}.".format(textcat_labels, list(train_labels)),
+ f"Cannot extend textcat model using data with different "
+ f"labels. Base model labels: {textcat_labels}, training data "
+ f"labels: {list(train_labels)}",
exits=1,
)
if textcat_multilabel:
msg.text(
- "Textcat evaluation score: ROC AUC score macro-averaged across "
- "the labels '{}'".format(", ".join(textcat_labels))
+ f"Textcat evaluation score: ROC AUC score macro-averaged across "
+ f"the labels '{', '.join(textcat_labels)}'"
)
elif textcat_positive_label and len(textcat_labels) == 2:
msg.text(
- "Textcat evaluation score: F1-score for the "
- "label '{}'".format(textcat_positive_label)
+ f"Textcat evaluation score: F1-score for the "
+ f"label '{textcat_positive_label}'"
)
elif len(textcat_labels) > 1:
if len(textcat_labels) == 2:
@@ -325,8 +318,8 @@ def train(
"an evaluation on the positive class."
)
msg.text(
- "Textcat evaluation score: F1-score macro-averaged across "
- "the labels '{}'".format(", ".join(textcat_labels))
+ f"Textcat evaluation score: F1-score macro-averaged across "
+ f"the labels '{', '.join(textcat_labels)}'"
)
else:
msg.fail(
@@ -471,8 +464,8 @@ def train(
for cat, cat_score in textcats_per_cat.items():
if cat_score.get("roc_auc_score", 0) < 0:
msg.warn(
- "Textcat ROC AUC score is undefined due to "
- "only one value in label '{}'.".format(cat)
+ f"Textcat ROC AUC score is undefined due to "
+ f"only one value in label '{cat}'."
)
msg.row(progress, **row_settings)
# Early stopping
@@ -485,12 +478,10 @@ def train(
best_score = current_score
if iter_since_best >= n_early_stopping:
msg.text(
- "Early stopping, best iteration "
- "is: {}".format(i - iter_since_best)
+ f"Early stopping, best iteration is: {i - iter_since_best}"
)
msg.text(
- "Best score = {}; Final iteration "
- "score = {}".format(best_score, current_score)
+ f"Best score = {best_score}; Final iteration score = {current_score}"
)
break
finally:
@@ -560,11 +551,11 @@ def _collate_best_model(meta, output_path, components):
for component in components:
bests[component] = _find_best(output_path, component)
best_dest = output_path / "model-best"
- shutil.copytree(path2str(output_path / "model-final"), path2str(best_dest))
+ shutil.copytree(str(output_path / "model-final"), str(best_dest))
for component, best_component_src in bests.items():
- shutil.rmtree(path2str(best_dest / component))
+ shutil.rmtree(str(best_dest / component))
shutil.copytree(
- path2str(best_component_src / component), path2str(best_dest / component)
+ str(best_component_src / component), str(best_dest / component)
)
accs = srsly.read_json(best_component_src / "accuracy.json")
for metric in _get_metrics(component):
@@ -627,10 +618,8 @@ def _configure_training_output(pipeline, use_gpu, has_beam_widths):
if has_beam_widths:
row_head.insert(1, "Beam W.")
# remove duplicates
- row_head_dict = OrderedDict()
- row_head_dict.update({k: 1 for k in row_head})
- output_stats_dict = OrderedDict()
- output_stats_dict.update({k: 1 for k in output_stats})
+ row_head_dict = {k: 1 for k in row_head}
+ output_stats_dict = {k: 1 for k in output_stats}
return row_head_dict.keys(), output_stats_dict.keys()
diff --git a/spacy/cli/validate.py b/spacy/cli/validate.py
index 93abad6f6..b4d217f2f 100644
--- a/spacy/cli/validate.py
+++ b/spacy/cli/validate.py
@@ -1,13 +1,9 @@
-# coding: utf8
-from __future__ import unicode_literals, print_function
-
from pathlib import Path
import sys
import requests
import srsly
from wasabi import msg
-from ..compat import path2str
from ..util import get_data_path
from .. import about
@@ -21,7 +17,7 @@ def validate():
r = requests.get(about.__compatibility__)
if r.status_code != 200:
msg.fail(
- "Server error ({})".format(r.status_code),
+ f"Server error ({r.status_code})",
"Couldn't fetch compatibility table.",
exits=1,
)
@@ -32,7 +28,7 @@ def validate():
current_compat = compat.get(version)
if not current_compat:
msg.fail(
- "Can't find spaCy v{} in compatibility table".format(version),
+ f"Can't find spaCy v{version} in compatibility table",
about.__compatibility__,
exits=1,
)
@@ -52,8 +48,8 @@ def validate():
update_models = [m for m in incompat_models if m in current_compat]
spacy_dir = Path(__file__).parent.parent
- msg.divider("Installed models (spaCy v{})".format(about.__version__))
- msg.info("spaCy installation: {}".format(path2str(spacy_dir)))
+ msg.divider(f"Installed models (spaCy v{about.__version__})")
+ msg.info(f"spaCy installation: {spacy_dir}")
if model_links or model_pkgs:
header = ("TYPE", "NAME", "MODEL", "VERSION", "")
@@ -72,15 +68,15 @@ def validate():
print("\n".join([cmd.format(pkg) for pkg in update_models]) + "\n")
if na_models:
msg.text(
- "The following models are not available for spaCy "
- "v{}: {}".format(about.__version__, ", ".join(na_models))
+ f"The following models are not available for spaCy "
+ f"v{about.__version__}: {', '.join(na_models)}"
)
if incompat_links:
msg.text(
- "You may also want to overwrite the incompatible links using the "
- "`python -m spacy link` command with `--force`, or remove them "
- "from the data directory. "
- "Data path: {path}".format(path=path2str(get_data_path()))
+ f"You may also want to overwrite the incompatible links using the "
+ f"`python -m spacy link` command with `--force`, or remove them "
+ f"from the data directory. "
+ f"Data path: {get_data_path()}"
)
if incompat_models or incompat_links:
sys.exit(1)
@@ -128,7 +124,7 @@ def get_model_row(compat, name, data, msg, model_type="package"):
version = msg.text(data["version"], color="green", no_print=True)
else:
version = msg.text(data["version"], color="red", no_print=True)
- comp = "--> {}".format(compat.get(data["name"], ["n/a"])[0])
+ comp = f"--> {compat.get(data['name'], ['n/a'])[0]}"
return (model_type, name, data["name"], version, comp)
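As a side note on dropping path2str in favour of f-strings here: interpolating a pathlib.Path calls str() on it, so the resulting message is identical. A minimal sketch with an invented path:

    from pathlib import Path

    spacy_dir = Path("site-packages") / "spacy"  # hypothetical install location
    assert f"spaCy installation: {spacy_dir}" == "spaCy installation: " + str(spacy_dir)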
diff --git a/spacy/compat.py b/spacy/compat.py
index 0ea31c6b3..8cb08ae09 100644
--- a/spacy/compat.py
+++ b/spacy/compat.py
@@ -1,4 +1,3 @@
-# coding: utf8
"""
Helpers for Python and platform compatibility. To distinguish them from
the builtin functions, replacement functions are suffixed with an underscore,
@@ -6,13 +5,8 @@ e.g. `unicode_`.
DOCS: https://spacy.io/api/top-level#compat
"""
-from __future__ import unicode_literals
-
import os
import sys
-import itertools
-import ast
-import types
from thinc.neural.util import copy_array
@@ -46,45 +40,11 @@ copy_reg = copy_reg
CudaStream = CudaStream
cupy = cupy
copy_array = copy_array
-izip = getattr(itertools, "izip", zip)
is_windows = sys.platform.startswith("win")
is_linux = sys.platform.startswith("linux")
is_osx = sys.platform == "darwin"
-# See: https://github.com/benjaminp/six/blob/master/six.py
-is_python2 = sys.version_info[0] == 2
-is_python3 = sys.version_info[0] == 3
-is_python_pre_3_5 = is_python2 or (is_python3 and sys.version_info[1] < 5)
-
-if is_python2:
- bytes_ = str
- unicode_ = unicode # noqa: F821
- basestring_ = basestring # noqa: F821
- input_ = raw_input # noqa: F821
- path2str = lambda path: str(path).decode("utf8")
- class_types = (type, types.ClassType)
-
-elif is_python3:
- bytes_ = bytes
- unicode_ = str
- basestring_ = str
- input_ = input
- path2str = lambda path: str(path)
- class_types = (type, types.ClassType) if is_python_pre_3_5 else type
-
-
-def b_to_str(b_str):
- """Convert a bytes object to a string.
-
- b_str (bytes): The object to convert.
- RETURNS (unicode): The converted string.
- """
- if is_python2:
- return b_str
- # Important: if no encoding is set, string becomes "b'...'"
- return str(b_str, encoding="utf8")
-
def symlink_to(orig, dest):
"""Create a symlink. Used for model shortcut links.
@@ -95,9 +55,7 @@ def symlink_to(orig, dest):
if is_windows:
import subprocess
- subprocess.check_call(
- ["mklink", "/d", path2str(orig), path2str(dest)], shell=True
- )
+ subprocess.check_call(["mklink", "/d", str(orig), str(dest)], shell=True)
else:
orig.symlink_to(dest)
@@ -108,19 +66,17 @@ def symlink_remove(link):
link (unicode / Path): The path to the symlink.
"""
# https://stackoverflow.com/q/26554135/6400719
- if os.path.isdir(path2str(link)) and is_windows:
+ if os.path.isdir(str(link)) and is_windows:
-        # this should only be on Py2.7 and windows
+        # On Windows, directory symlinks are removed with rmdir
- os.rmdir(path2str(link))
+ os.rmdir(str(link))
else:
- os.unlink(path2str(link))
+ os.unlink(str(link))
-def is_config(python2=None, python3=None, windows=None, linux=None, osx=None):
+def is_config(windows=None, linux=None, osx=None, **kwargs):
"""Check if a specific configuration of Python version and operating system
matches the user's setup. Mostly used to display targeted error messages.
- python2 (bool): spaCy is executed with Python 2.x.
- python3 (bool): spaCy is executed with Python 3.x.
windows (bool): spaCy is executed on Windows.
linux (bool): spaCy is executed on Linux.
osx (bool): spaCy is executed on OS X or macOS.
@@ -129,53 +85,7 @@ def is_config(python2=None, python3=None, windows=None, linux=None, osx=None):
DOCS: https://spacy.io/api/top-level#compat.is_config
"""
return (
- python2 in (None, is_python2)
- and python3 in (None, is_python3)
- and windows in (None, is_windows)
+ windows in (None, is_windows)
and linux in (None, is_linux)
and osx in (None, is_osx)
)
-
-
-def import_file(name, loc):
- """Import module from a file. Used to load models from a directory.
-
- name (unicode): Name of module to load.
- loc (unicode / Path): Path to the file.
- RETURNS: The loaded module.
- """
- loc = path2str(loc)
- if is_python_pre_3_5:
- import imp
-
- return imp.load_source(name, loc)
- else:
- import importlib.util
-
- spec = importlib.util.spec_from_file_location(name, str(loc))
- module = importlib.util.module_from_spec(spec)
- spec.loader.exec_module(module)
- return module
-
-
-def unescape_unicode(string):
- """Python2.7's re module chokes when compiling patterns that have ranges
- between escaped unicode codepoints if the two codepoints are unrecognised
- in the unicode database. For instance:
-
- re.compile('[\\uAA77-\\uAA79]').findall("hello")
-
- Ends up matching every character (on Python 2). This problem doesn't occur
- if we're dealing with unicode literals.
- """
- if string is None:
- return string
- # We only want to unescape the unicode, so we first must protect the other
- # backslashes.
- string = string.replace("\\", "\\\\")
- # Now we remove that protection for the unicode.
- string = string.replace("\\\\u", "\\u")
- string = string.replace("\\\\U", "\\U")
- # Now we unescape by evaling the string with the AST. This can't execute
- # code -- it only does the representational level.
- return ast.literal_eval("u'''" + string + "'''")
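To illustrate the effect on call sites, a hedged sketch of typical replacements (the values below are invented, not taken from this patch): the removed helpers collapse to Python 3 built-ins, and is_config() now silently ignores the old python2/python3 keywords via **kwargs instead of raising a TypeError.

    from pathlib import Path

    model_path = Path("models") / "en_core_web_sm"   # illustrative path
    as_text = str(model_path)                        # previously path2str(model_path)
    is_name = isinstance("en_core_web_sm", str)      # previously isinstance(..., basestring_)
    status = "200 OK"                                # previously b_to_str(b"200 OK")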
diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py
index d2ef21dbd..d804757ef 100644
--- a/spacy/displacy/__init__.py
+++ b/spacy/displacy/__init__.py
@@ -1,15 +1,11 @@
-# coding: utf8
"""
spaCy's built in visualization suite for dependencies and named entities.
DOCS: https://spacy.io/api/top-level#displacy
USAGE: https://spacy.io/usage/visualizers
"""
-from __future__ import unicode_literals
-
from .render import DependencyRenderer, EntityRenderer
from ..tokens import Doc, Span
-from ..compat import b_to_str
from ..errors import Errors, Warnings, user_warning
from ..util import is_in_jupyter
@@ -92,20 +88,20 @@ def serve(
render(docs, style=style, page=page, minify=minify, options=options, manual=manual)
httpd = simple_server.make_server(host, port, app)
- print("\nUsing the '{}' visualizer".format(style))
- print("Serving on http://{}:{} ...\n".format(host, port))
+ print(f"\nUsing the '{style}' visualizer")
+ print(f"Serving on http://{host}:{port} ...\n")
try:
httpd.serve_forever()
except KeyboardInterrupt:
- print("Shutting down server on port {}.".format(port))
+ print(f"Shutting down server on port {port}.")
finally:
httpd.server_close()
def app(environ, start_response):
-    # Headers and status need to be bytes in Python 2, see #1227
+    # Headers and status are native strings in Python 3 (PEP 3333), see #1227
- headers = [(b_to_str(b"Content-type"), b_to_str(b"text/html; charset=utf-8"))]
- start_response(b_to_str(b"200 OK"), headers)
+ headers = [("Content-type", "text/html; charset=utf-8")]
+ start_response("200 OK", headers)
res = _html["parsed"].encode(encoding="utf-8")
return [res]
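For reference, a self-contained sketch of the WSGI convention the simplified app relies on: under PEP 3333 on Python 3 the status line and header names/values are native str, while the response body stays bytes. This is an illustrative handler, not the displaCy server itself.

    from wsgiref.simple_server import make_server

    def demo_app(environ, start_response):
        headers = [("Content-type", "text/html; charset=utf-8")]
        start_response("200 OK", headers)
        return ["<p>hello</p>".encode("utf-8")]  # the body itself must still be bytes

    # httpd = make_server("localhost", 5000, demo_app)
    # httpd.serve_forever()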
diff --git a/spacy/displacy/render.py b/spacy/displacy/render.py
index d6e33437b..7ca1eebb7 100644
--- a/spacy/displacy/render.py
+++ b/spacy/displacy/render.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import uuid
from .templates import TPL_DEP_SVG, TPL_DEP_WORDS, TPL_DEP_ARCS, TPL_ENTS
@@ -55,7 +52,7 @@ class DependencyRenderer(object):
settings = p.get("settings", {})
self.direction = settings.get("direction", DEFAULT_DIR)
self.lang = settings.get("lang", DEFAULT_LANG)
- render_id = "{}-{}".format(id_prefix, i)
+ render_id = f"{id_prefix}-{i}"
svg = self.render_svg(render_id, p["words"], p["arcs"])
rendered.append(svg)
if page:
diff --git a/spacy/displacy/templates.py b/spacy/displacy/templates.py
index ade75d1d6..d6970aa2f 100644
--- a/spacy/displacy/templates.py
+++ b/spacy/displacy/templates.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
# Setting explicit height and max-width: none on the SVG is required for
# Jupyter to render it properly in a cell
diff --git a/spacy/errors.py b/spacy/errors.py
index 3dab4e1fb..81747b33b 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import os
import warnings
import inspect
@@ -12,7 +9,7 @@ def add_codes(err_cls):
class ErrorsWithCodes(object):
def __getattribute__(self, code):
msg = getattr(err_cls, code)
- return "[{code}] {msg}".format(code=code, msg=msg)
+ return f"[{code}] {msg}"
return ErrorsWithCodes()
@@ -98,8 +95,6 @@ class Warnings(object):
"you can ignore this warning by setting SPACY_WARNING_IGNORE=W022. "
"If this is surprising, make sure you have the spacy-lookups-data "
"package installed.")
- W023 = ("Multiprocessing of Language.pipe is not supported in Python 2. "
- "'n_process' will be set to 1.")
W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in "
"the Knowledge Base.")
W025 = ("'{name}' requires '{attr}' to be assigned, but none of the "
@@ -550,6 +545,7 @@ class Errors(object):
E999 = ("Encountered an unexpected format for the dictionary holding "
"gold annotations: {gold_dict}")
+
@add_codes
class TempErrors(object):
T003 = ("Resizing pretrained Tagger models is not currently supported.")
@@ -573,10 +569,10 @@ class MatchPatternError(ValueError):
errors (dict): Validation errors (sequence of strings) mapped to pattern
ID, i.e. the index of the added pattern.
"""
- msg = "Invalid token patterns for matcher rule '{}'\n".format(key)
+ msg = f"Invalid token patterns for matcher rule '{key}'\n"
for pattern_idx, error_msgs in errors.items():
- pattern_errors = "\n".join(["- {}".format(e) for e in error_msgs])
- msg += "\nPattern {}:\n{}\n".format(pattern_idx, pattern_errors)
+ pattern_errors = "\n".join([f"- {e}" for e in error_msgs])
+ msg += f"\nPattern {pattern_idx}:\n{pattern_errors}\n"
ValueError.__init__(self, msg)
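To show what the f-string in add_codes produces, a stand-alone sketch that mirrors the decorator's behaviour (the message class and error text are invented for illustration):

    class _Messages:
        E901 = "Something went wrong with {name}."   # hypothetical error text

    class ErrorsWithCodes:
        def __getattribute__(self, code):
            msg = getattr(_Messages, code)
            return f"[{code}] {msg}"

    Errors = ErrorsWithCodes()
    assert Errors.E901.format(name="tagger") == "[E901] Something went wrong with tagger."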
diff --git a/spacy/glossary.py b/spacy/glossary.py
index 44a8277da..5e7e531a9 100644
--- a/spacy/glossary.py
+++ b/spacy/glossary.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
def explain(term):
"""Get a description for a given POS tag, dependency label or entity type.
diff --git a/spacy/gold.pyx b/spacy/gold.pyx
index 0374825dc..e3af40d4d 100644
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -1,7 +1,4 @@
# cython: profile=True
-# coding: utf8
-from __future__ import unicode_literals, print_function
-
import re
import random
import numpy
@@ -14,7 +11,6 @@ import srsly
from .syntax import nonproj
from .tokens import Doc, Span
from .errors import Errors, AlignmentError, user_warning, Warnings
-from .compat import path2str, basestring_
from . import util
@@ -157,7 +153,7 @@ class GoldCorpus(object):
self.write_msgpack(self.tmp_dir / "dev", dev, limit=self.limit)
def __del__(self):
- shutil.rmtree(path2str(self.tmp_dir))
+ shutil.rmtree(self.tmp_dir)
@staticmethod
def write_msgpack(directory, examples, limit=0):
@@ -167,7 +163,7 @@ class GoldCorpus(object):
for i, example in enumerate(examples):
ex_dict = example.to_dict()
text = example.text
- srsly.write_msgpack(directory / "{}.msg".format(i), (text, ex_dict))
+ srsly.write_msgpack(directory / f"{i}.msg", (text, ex_dict))
n += 1
if limit and n >= limit:
break
@@ -221,7 +217,7 @@ class GoldCorpus(object):
examples = [Example.from_dict(ex_dict, doc=text)]
else:
supported = ("json", "jsonl", "msg")
- raise ValueError(Errors.E124.format(path=path2str(loc), formats=supported))
+ raise ValueError(Errors.E124.format(path=loc, formats=supported))
for example in examples:
yield example
i += 1
@@ -862,7 +858,7 @@ cdef class Example:
converted_examples = []
for ex in examples:
# convert string to Doc to Example
- if isinstance(ex, basestring_):
+ if isinstance(ex, str):
if keep_raw_text:
converted_examples.append(Example(doc=ex))
else:
@@ -876,7 +872,7 @@ cdef class Example:
doc, gold = ex
gold_dict = {}
# convert string to Doc
- if isinstance(doc, basestring_) and not keep_raw_text:
+ if isinstance(doc, str) and not keep_raw_text:
doc = make_doc(doc)
# convert dict to GoldParse
if isinstance(gold, dict):
@@ -988,7 +984,7 @@ cdef class GoldParse:
# Translate the None values to '-', to make processing easier.
# See Issue #2603
entities = [(ent if ent is not None else "-") for ent in entities]
- if not isinstance(entities[0], basestring_):
+ if not isinstance(entities[0], str):
# Assume we have entities specified by character offset.
entities = biluo_tags_from_offsets(doc, entities)
@@ -1107,7 +1103,7 @@ cdef class GoldParse:
cycle = nonproj.contains_cycle(self.heads)
if cycle is not None:
raise ValueError(Errors.E069.format(cycle=cycle,
- cycle_tokens=" ".join(["'{}'".format(self.words[tok_id]) for tok_id in cycle]),
+ cycle_tokens=" ".join([f"'{self.words[tok_id]}'" for tok_id in cycle]),
doc_tokens=" ".join(words[:50])))
def __len__(self):
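A brief sketch of the isinstance change above: with Python 2 gone there is no separate unicode type, so a plain str check is enough to tell raw text apart from already-built objects. The inputs below are invented:

    examples = ["A raw text example.", ("A doc/gold tuple", {"entities": []})]
    for ex in examples:
        if isinstance(ex, str):              # previously isinstance(ex, basestring_)
            print("raw text:", ex)
        else:
            doc, gold_dict = ex
            print("annotated:", doc, gold_dict)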
diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index 63eb41b42..1129fa860 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -1,22 +1,17 @@
# cython: infer_types=True
# cython: profile=True
-# coding: utf8
-from spacy.errors import Errors, Warnings, user_warning
-
from pathlib import Path
from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
-
from cpython.exc cimport PyErr_SetFromErrno
-
from libc.stdio cimport fopen, fclose, fread, fwrite, feof, fseek
from libc.stdint cimport int32_t, int64_t
-
-from .typedefs cimport hash_t
-
from os import path
from libcpp.vector cimport vector
+from .typedefs cimport hash_t
+from .errors import Errors, Warnings, user_warning
+
cdef class Candidate:
"""A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved
@@ -584,5 +579,3 @@ cdef class Reader:
cdef int _read(self, void* value, size_t size) except -1:
status = fread(value, size, 1, self._fp)
return status
-
-
diff --git a/spacy/lang/af/__init__.py b/spacy/lang/af/__init__.py
index 90ea324f0..0da123419 100644
--- a/spacy/lang/af/__init__.py
+++ b/spacy/lang/af/__init__.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from .stop_words import STOP_WORDS
from ...language import Language
from ...attrs import LANG
diff --git a/spacy/lang/af/stop_words.py b/spacy/lang/af/stop_words.py
index 2b3bcc019..dfd144de9 100644
--- a/spacy/lang/af/stop_words.py
+++ b/spacy/lang/af/stop_words.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
# Source: https://github.com/stopwords-iso/stopwords-af
diff --git a/spacy/lang/ar/__init__.py b/spacy/lang/ar/__init__.py
index c120703f6..6a1a8af3a 100644
--- a/spacy/lang/ar/__init__.py
+++ b/spacy/lang/ar/__init__.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_SUFFIXES
diff --git a/spacy/lang/ar/examples.py b/spacy/lang/ar/examples.py
index 2a10f4fcc..a51bb9ded 100644
--- a/spacy/lang/ar/examples.py
+++ b/spacy/lang/ar/examples.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/ar/lex_attrs.py b/spacy/lang/ar/lex_attrs.py
index 19e7aef8a..54ad7a8c3 100644
--- a/spacy/lang/ar/lex_attrs.py
+++ b/spacy/lang/ar/lex_attrs.py
@@ -1,5 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
from ...attrs import LIKE_NUM
_num_words = set(
diff --git a/spacy/lang/ar/punctuation.py b/spacy/lang/ar/punctuation.py
index 6625c5475..f30204c02 100644
--- a/spacy/lang/ar/punctuation.py
+++ b/spacy/lang/ar/punctuation.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
from ..char_classes import UNITS, ALPHA_UPPER
diff --git a/spacy/lang/ar/stop_words.py b/spacy/lang/ar/stop_words.py
index de2fc7443..f4da54dda 100644
--- a/spacy/lang/ar/stop_words.py
+++ b/spacy/lang/ar/stop_words.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
STOP_WORDS = set(
"""
من
diff --git a/spacy/lang/ar/tokenizer_exceptions.py b/spacy/lang/ar/tokenizer_exceptions.py
index 030daecd5..a11f3b43a 100644
--- a/spacy/lang/ar/tokenizer_exceptions.py
+++ b/spacy/lang/ar/tokenizer_exceptions.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...symbols import ORTH, LEMMA
diff --git a/spacy/lang/bg/__init__.py b/spacy/lang/bg/__init__.py
index 9b4c647e3..437feb9ed 100644
--- a/spacy/lang/bg/__init__.py
+++ b/spacy/lang/bg/__init__.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from .stop_words import STOP_WORDS
from ...language import Language
from ...attrs import LANG
diff --git a/spacy/lang/bg/examples.py b/spacy/lang/bg/examples.py
index b08b8926d..a6d40da1a 100644
--- a/spacy/lang/bg/examples.py
+++ b/spacy/lang/bg/examples.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/bg/stop_words.py b/spacy/lang/bg/stop_words.py
index e7c65cbc2..45a252bc9 100644
--- a/spacy/lang/bg/stop_words.py
+++ b/spacy/lang/bg/stop_words.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
# Source: https://github.com/Alir3z4/stop-words
diff --git a/spacy/lang/bn/__init__.py b/spacy/lang/bn/__init__.py
index e70232552..901676554 100644
--- a/spacy/lang/bn/__init__.py
+++ b/spacy/lang/bn/__init__.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from .tag_map import TAG_MAP
diff --git a/spacy/lang/bn/examples.py b/spacy/lang/bn/examples.py
index 2d5bdb238..051e59d84 100644
--- a/spacy/lang/bn/examples.py
+++ b/spacy/lang/bn/examples.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/bn/morph_rules.py b/spacy/lang/bn/morph_rules.py
index 21a76c7e6..44d6108e9 100644
--- a/spacy/lang/bn/morph_rules.py
+++ b/spacy/lang/bn/morph_rules.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...symbols import LEMMA, PRON_LEMMA
diff --git a/spacy/lang/bn/punctuation.py b/spacy/lang/bn/punctuation.py
index f624b4ba4..becfe8d2a 100644
--- a/spacy/lang/bn/punctuation.py
+++ b/spacy/lang/bn/punctuation.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
from ..char_classes import ALPHA_LOWER, ALPHA, HYPHENS, CONCAT_QUOTES, UNITS
diff --git a/spacy/lang/bn/stop_words.py b/spacy/lang/bn/stop_words.py
index 6c9967df8..6bcd06b37 100644
--- a/spacy/lang/bn/stop_words.py
+++ b/spacy/lang/bn/stop_words.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
STOP_WORDS = set(
"""
diff --git a/spacy/lang/bn/tag_map.py b/spacy/lang/bn/tag_map.py
index 1efb35858..36d69ccf9 100644
--- a/spacy/lang/bn/tag_map.py
+++ b/spacy/lang/bn/tag_map.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...symbols import POS, PUNCT, ADJ, CONJ, SCONJ, NUM, DET, ADV, ADP, X, VERB
from ...symbols import CCONJ, NOUN, PROPN, PART, INTJ, SPACE, PRON, AUX, SYM
diff --git a/spacy/lang/bn/tokenizer_exceptions.py b/spacy/lang/bn/tokenizer_exceptions.py
index 32acb1730..18e313a25 100644
--- a/spacy/lang/bn/tokenizer_exceptions.py
+++ b/spacy/lang/bn/tokenizer_exceptions.py
@@ -1,6 +1,3 @@
-# coding=utf-8
-from __future__ import unicode_literals
-
from ...symbols import ORTH, LEMMA
diff --git a/spacy/lang/ca/__init__.py b/spacy/lang/ca/__init__.py
index 6d4c00a6b..a1ff2f2df 100644
--- a/spacy/lang/ca/__init__.py
+++ b/spacy/lang/ca/__init__.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
diff --git a/spacy/lang/ca/examples.py b/spacy/lang/ca/examples.py
index 3020ee707..3fbf1fb0a 100644
--- a/spacy/lang/ca/examples.py
+++ b/spacy/lang/ca/examples.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/ca/lex_attrs.py b/spacy/lang/ca/lex_attrs.py
index 6314efa92..be8b7a6ea 100644
--- a/spacy/lang/ca/lex_attrs.py
+++ b/spacy/lang/ca/lex_attrs.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...attrs import LIKE_NUM
diff --git a/spacy/lang/ca/punctuation.py b/spacy/lang/ca/punctuation.py
index 4439376c8..d50b75589 100644
--- a/spacy/lang/ca/punctuation.py
+++ b/spacy/lang/ca/punctuation.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ..punctuation import TOKENIZER_INFIXES
from ..char_classes import ALPHA
diff --git a/spacy/lang/ca/stop_words.py b/spacy/lang/ca/stop_words.py
index a803db2a5..1a87b2f9d 100644
--- a/spacy/lang/ca/stop_words.py
+++ b/spacy/lang/ca/stop_words.py
@@ -1,7 +1,3 @@
-# encoding: utf8
-from __future__ import unicode_literals
-
-
STOP_WORDS = set(
"""
a abans ací ah així això al aleshores algun alguna algunes alguns alhora allà allí allò
diff --git a/spacy/lang/ca/tag_map.py b/spacy/lang/ca/tag_map.py
index 472e772ef..1ecbddc49 100644
--- a/spacy/lang/ca/tag_map.py
+++ b/spacy/lang/ca/tag_map.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ..symbols import POS, ADV, NOUN, ADP, PRON, SCONJ, PROPN, DET, SYM, INTJ
from ..symbols import PUNCT, NUM, AUX, X, CONJ, ADJ, VERB, PART, SPACE, CCONJ
diff --git a/spacy/lang/ca/tokenizer_exceptions.py b/spacy/lang/ca/tokenizer_exceptions.py
index d95e5e626..5a9d9055a 100644
--- a/spacy/lang/ca/tokenizer_exceptions.py
+++ b/spacy/lang/ca/tokenizer_exceptions.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...symbols import ORTH, LEMMA
diff --git a/spacy/lang/char_classes.py b/spacy/lang/char_classes.py
index 2c8823867..73f48e49a 100644
--- a/spacy/lang/char_classes.py
+++ b/spacy/lang/char_classes.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
split_chars = lambda char: list(char.strip().split(" "))
merge_chars = lambda char: char.strip().replace(" ", "|")
group_chars = lambda char: char.strip().replace(" ", "")
diff --git a/spacy/lang/cs/__init__.py b/spacy/lang/cs/__init__.py
index 5b1397ba2..a27e3339d 100644
--- a/spacy/lang/cs/__init__.py
+++ b/spacy/lang/cs/__init__.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from .stop_words import STOP_WORDS
from ...language import Language
from ...attrs import LANG
diff --git a/spacy/lang/cs/stop_words.py b/spacy/lang/cs/stop_words.py
index 59d3c102e..e8171a7e5 100644
--- a/spacy/lang/cs/stop_words.py
+++ b/spacy/lang/cs/stop_words.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
# Source: https://github.com/Alir3z4/stop-words
diff --git a/spacy/lang/da/__init__.py b/spacy/lang/da/__init__.py
index ac8c04954..2828c014b 100644
--- a/spacy/lang/da/__init__.py
+++ b/spacy/lang/da/__init__.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .norm_exceptions import NORM_EXCEPTIONS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
diff --git a/spacy/lang/da/examples.py b/spacy/lang/da/examples.py
index b535191a1..e5c6448f0 100644
--- a/spacy/lang/da/examples.py
+++ b/spacy/lang/da/examples.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/da/lex_attrs.py b/spacy/lang/da/lex_attrs.py
index 9fefc1eba..403af686c 100644
--- a/spacy/lang/da/lex_attrs.py
+++ b/spacy/lang/da/lex_attrs.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...attrs import LIKE_NUM
diff --git a/spacy/lang/da/morph_rules.py b/spacy/lang/da/morph_rules.py
index 7ffe2ac6f..06704f482 100644
--- a/spacy/lang/da/morph_rules.py
+++ b/spacy/lang/da/morph_rules.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...symbols import LEMMA, PRON_LEMMA
# Source: Danish Universal Dependencies and http://fjern-uv.dk/pronom.php
diff --git a/spacy/lang/da/norm_exceptions.py b/spacy/lang/da/norm_exceptions.py
index dbffdb88b..c689500f4 100644
--- a/spacy/lang/da/norm_exceptions.py
+++ b/spacy/lang/da/norm_exceptions.py
@@ -1,10 +1,7 @@
-# coding: utf8
"""
Special-case rules for normalizing tokens to improve the model's predictions.
For example 'mysterium' vs 'mysterie' and similar.
"""
-from __future__ import unicode_literals
-
# Sources:
# 1: https://dsn.dk/retskrivning/om-retskrivningsordbogen/mere-om-retskrivningsordbogen-2012/endrede-stave-og-ordformer/
diff --git a/spacy/lang/da/punctuation.py b/spacy/lang/da/punctuation.py
index b6b852c55..e050ab7aa 100644
--- a/spacy/lang/da/punctuation.py
+++ b/spacy/lang/da/punctuation.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ..char_classes import LIST_ELLIPSES, LIST_ICONS
from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
from ..punctuation import TOKENIZER_SUFFIXES
diff --git a/spacy/lang/da/stop_words.py b/spacy/lang/da/stop_words.py
index 48de0c7ca..05b2084dd 100644
--- a/spacy/lang/da/stop_words.py
+++ b/spacy/lang/da/stop_words.py
@@ -1,6 +1,3 @@
-# encoding: utf8
-from __future__ import unicode_literals
-
# Source: Handpicked by Jens Dahl Møllerhøj.
STOP_WORDS = set(
diff --git a/spacy/lang/da/tokenizer_exceptions.py b/spacy/lang/da/tokenizer_exceptions.py
index d669fb981..64eba819f 100644
--- a/spacy/lang/da/tokenizer_exceptions.py
+++ b/spacy/lang/da/tokenizer_exceptions.py
@@ -1,11 +1,7 @@
-# encoding: utf8
"""
Tokenizer Exceptions.
Source: https://forkortelse.dk/ and various others.
"""
-
-from __future__ import unicode_literals
-
from ...symbols import ORTH, LEMMA, NORM, TAG, PUNCT
diff --git a/spacy/lang/de/__init__.py b/spacy/lang/de/__init__.py
index 1412f033a..8478b6f23 100644
--- a/spacy/lang/de/__init__.py
+++ b/spacy/lang/de/__init__.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .norm_exceptions import NORM_EXCEPTIONS
from .punctuation import TOKENIZER_INFIXES
diff --git a/spacy/lang/de/examples.py b/spacy/lang/de/examples.py
index 0c64a693a..530ece629 100644
--- a/spacy/lang/de/examples.py
+++ b/spacy/lang/de/examples.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/de/norm_exceptions.py b/spacy/lang/de/norm_exceptions.py
index 3dbd4c7e3..6ad5b62a7 100644
--- a/spacy/lang/de/norm_exceptions.py
+++ b/spacy/lang/de/norm_exceptions.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
# Here we only want to include the absolute most common words. Otherwise,
# this list would get impossibly long for German – especially considering the
# old vs. new spelling rules, and all possible cases.
diff --git a/spacy/lang/de/punctuation.py b/spacy/lang/de/punctuation.py
index 7dfa61bd4..72f7e1022 100644
--- a/spacy/lang/de/punctuation.py
+++ b/spacy/lang/de/punctuation.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ..char_classes import LIST_ELLIPSES, LIST_ICONS
from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
diff --git a/spacy/lang/de/stop_words.py b/spacy/lang/de/stop_words.py
index cf3204d5e..df708e22e 100644
--- a/spacy/lang/de/stop_words.py
+++ b/spacy/lang/de/stop_words.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
STOP_WORDS = set(
"""
diff --git a/spacy/lang/de/syntax_iterators.py b/spacy/lang/de/syntax_iterators.py
index 89d784a0c..410d2f0b4 100644
--- a/spacy/lang/de/syntax_iterators.py
+++ b/spacy/lang/de/syntax_iterators.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...symbols import NOUN, PROPN, PRON
diff --git a/spacy/lang/de/tag_map.py b/spacy/lang/de/tag_map.py
index c169501a9..ca7ec61f1 100644
--- a/spacy/lang/de/tag_map.py
+++ b/spacy/lang/de/tag_map.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...symbols import POS, PUNCT, ADJ, CCONJ, SCONJ, NUM, DET, ADV, ADP, X
from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON, AUX, VERB
diff --git a/spacy/lang/de/tokenizer_exceptions.py b/spacy/lang/de/tokenizer_exceptions.py
index 5b09a0b89..3dd8507bc 100644
--- a/spacy/lang/de/tokenizer_exceptions.py
+++ b/spacy/lang/de/tokenizer_exceptions.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA
diff --git a/spacy/lang/el/__init__.py b/spacy/lang/el/__init__.py
index 16863e6d7..1ef7c503f 100644
--- a/spacy/lang/el/__init__.py
+++ b/spacy/lang/el/__init__.py
@@ -1,7 +1,3 @@
-# -*- coding: utf-8 -*-
-
-from __future__ import unicode_literals
-
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .tag_map_general import TAG_MAP
from .stop_words import STOP_WORDS
diff --git a/spacy/lang/el/examples.py b/spacy/lang/el/examples.py
index 521e7b30d..62515c07a 100644
--- a/spacy/lang/el/examples.py
+++ b/spacy/lang/el/examples.py
@@ -1,7 +1,3 @@
-# -*- coding: utf-8 -*-
-
-from __future__ import unicode_literals
-
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.el.examples import sentences
diff --git a/spacy/lang/el/get_pos_from_wiktionary.py b/spacy/lang/el/get_pos_from_wiktionary.py
index f41833974..01deb23a2 100644
--- a/spacy/lang/el/get_pos_from_wiktionary.py
+++ b/spacy/lang/el/get_pos_from_wiktionary.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
def get_pos_from_wiktionary():
import re
diff --git a/spacy/lang/el/lemmatizer.py b/spacy/lang/el/lemmatizer.py
index 6f5b3999b..cf3a7fe97 100644
--- a/spacy/lang/el/lemmatizer.py
+++ b/spacy/lang/el/lemmatizer.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...lemmatizer import Lemmatizer
diff --git a/spacy/lang/el/lex_attrs.py b/spacy/lang/el/lex_attrs.py
index cf32fe12c..5c8f96848 100644
--- a/spacy/lang/el/lex_attrs.py
+++ b/spacy/lang/el/lex_attrs.py
@@ -1,7 +1,3 @@
-# -*- coding: utf-8 -*-
-
-from __future__ import unicode_literals
-
from ...attrs import LIKE_NUM
_num_words = [
diff --git a/spacy/lang/el/norm_exceptions.py b/spacy/lang/el/norm_exceptions.py
index d4384ff3c..d540aae2c 100644
--- a/spacy/lang/el/norm_exceptions.py
+++ b/spacy/lang/el/norm_exceptions.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
# These exceptions are used to add NORM values based on a token's ORTH value.
# Norms are only set if no alternative is provided in the tokenizer exceptions.
diff --git a/spacy/lang/el/punctuation.py b/spacy/lang/el/punctuation.py
index fbf773f4d..2d5690407 100644
--- a/spacy/lang/el/punctuation.py
+++ b/spacy/lang/el/punctuation.py
@@ -1,7 +1,3 @@
-# -*- coding: utf-8 -*-
-
-from __future__ import unicode_literals
-
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
from ..char_classes import LIST_ICONS, ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS
from ..char_classes import CONCAT_QUOTES, CURRENCY
diff --git a/spacy/lang/el/stop_words.py b/spacy/lang/el/stop_words.py
index f13c47ec2..8484826d1 100644
--- a/spacy/lang/el/stop_words.py
+++ b/spacy/lang/el/stop_words.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
# Stop words
# Link to greek stop words: https://www.translatum.gr/forum/index.php?topic=3550.0?topic=3550.0
diff --git a/spacy/lang/el/syntax_iterators.py b/spacy/lang/el/syntax_iterators.py
index 5dfd44f07..988a36c80 100644
--- a/spacy/lang/el/syntax_iterators.py
+++ b/spacy/lang/el/syntax_iterators.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...symbols import NOUN, PROPN, PRON
diff --git a/spacy/lang/el/tag_map.py b/spacy/lang/el/tag_map.py
index b346299bc..adfacd025 100644
--- a/spacy/lang/el/tag_map.py
+++ b/spacy/lang/el/tag_map.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...symbols import POS, PUNCT, SYM, ADJ, CCONJ, SCONJ, NUM, DET, ADV, ADP, X, VERB
from ...symbols import NOUN, PROPN, PART, INTJ, PRON, AUX
diff --git a/spacy/lang/el/tag_map_general.py b/spacy/lang/el/tag_map_general.py
index 42e64a013..d7e89d43a 100644
--- a/spacy/lang/el/tag_map_general.py
+++ b/spacy/lang/el/tag_map_general.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...symbols import POS, ADV, NOUN, ADP, PRON, SCONJ, PROPN, DET, SYM, INTJ
from ...symbols import PUNCT, NUM, AUX, X, ADJ, VERB, PART, SPACE, CCONJ
diff --git a/spacy/lang/el/tokenizer_exceptions.py b/spacy/lang/el/tokenizer_exceptions.py
index a3c36542e..27ae1fe3a 100644
--- a/spacy/lang/el/tokenizer_exceptions.py
+++ b/spacy/lang/el/tokenizer_exceptions.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...symbols import ORTH, LEMMA, NORM
diff --git a/spacy/lang/en/__init__.py b/spacy/lang/en/__init__.py
index fca4e01e7..fa01e2b60 100644
--- a/spacy/lang/en/__init__.py
+++ b/spacy/lang/en/__init__.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .norm_exceptions import NORM_EXCEPTIONS
from .tag_map import TAG_MAP
diff --git a/spacy/lang/en/examples.py b/spacy/lang/en/examples.py
index 946289c7c..0363a45e7 100644
--- a/spacy/lang/en/examples.py
+++ b/spacy/lang/en/examples.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/en/lex_attrs.py b/spacy/lang/en/lex_attrs.py
index f92d41139..96fb4c9fa 100644
--- a/spacy/lang/en/lex_attrs.py
+++ b/spacy/lang/en/lex_attrs.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...attrs import LIKE_NUM
diff --git a/spacy/lang/en/morph_rules.py b/spacy/lang/en/morph_rules.py
index 5ed4eac59..aa3e6ce57 100644
--- a/spacy/lang/en/morph_rules.py
+++ b/spacy/lang/en/morph_rules.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...symbols import LEMMA, PRON_LEMMA
# Several entries here look pretty suspicious. These will get the POS SCONJ
diff --git a/spacy/lang/en/norm_exceptions.py b/spacy/lang/en/norm_exceptions.py
index a2cf58b8a..431d9c049 100644
--- a/spacy/lang/en/norm_exceptions.py
+++ b/spacy/lang/en/norm_exceptions.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
_exc = {
# Slang and abbreviations
diff --git a/spacy/lang/en/stop_words.py b/spacy/lang/en/stop_words.py
index 3505b13bf..4573c9411 100644
--- a/spacy/lang/en/stop_words.py
+++ b/spacy/lang/en/stop_words.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
# Stop words
STOP_WORDS = set(
diff --git a/spacy/lang/en/syntax_iterators.py b/spacy/lang/en/syntax_iterators.py
index ed665ef29..86695cf6f 100644
--- a/spacy/lang/en/syntax_iterators.py
+++ b/spacy/lang/en/syntax_iterators.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...symbols import NOUN, PROPN, PRON
diff --git a/spacy/lang/en/tag_map.py b/spacy/lang/en/tag_map.py
index ecb3103cc..2078798f7 100644
--- a/spacy/lang/en/tag_map.py
+++ b/spacy/lang/en/tag_map.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...symbols import POS, PUNCT, SYM, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB
from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON
diff --git a/spacy/lang/en/tokenizer_exceptions.py b/spacy/lang/en/tokenizer_exceptions.py
index c45197771..776948c28 100644
--- a/spacy/lang/en/tokenizer_exceptions.py
+++ b/spacy/lang/en/tokenizer_exceptions.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA
diff --git a/spacy/lang/es/__init__.py b/spacy/lang/es/__init__.py
index 80cc1727c..060bd8fc6 100644
--- a/spacy/lang/es/__init__.py
+++ b/spacy/lang/es/__init__.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS
diff --git a/spacy/lang/es/examples.py b/spacy/lang/es/examples.py
index 0e31b56af..1c1ad631b 100644
--- a/spacy/lang/es/examples.py
+++ b/spacy/lang/es/examples.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/es/lex_attrs.py b/spacy/lang/es/lex_attrs.py
index 03ada1f43..d2a3c891a 100644
--- a/spacy/lang/es/lex_attrs.py
+++ b/spacy/lang/es/lex_attrs.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...attrs import LIKE_NUM
diff --git a/spacy/lang/es/stop_words.py b/spacy/lang/es/stop_words.py
index 20e929b48..3d46a88cb 100644
--- a/spacy/lang/es/stop_words.py
+++ b/spacy/lang/es/stop_words.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
STOP_WORDS = set(
"""
diff --git a/spacy/lang/es/syntax_iterators.py b/spacy/lang/es/syntax_iterators.py
index 6a78d86f7..e998cd1d6 100644
--- a/spacy/lang/es/syntax_iterators.py
+++ b/spacy/lang/es/syntax_iterators.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...symbols import NOUN, PROPN, PRON, VERB, AUX
diff --git a/spacy/lang/es/tag_map.py b/spacy/lang/es/tag_map.py
index 7a7c9d549..1748162c0 100644
--- a/spacy/lang/es/tag_map.py
+++ b/spacy/lang/es/tag_map.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...symbols import POS, PUNCT, SYM, ADJ, NUM, DET, ADV, ADP, X, VERB
from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON, SCONJ, AUX, CONJ
diff --git a/spacy/lang/es/tokenizer_exceptions.py b/spacy/lang/es/tokenizer_exceptions.py
index 9109d658b..1cd5941be 100644
--- a/spacy/lang/es/tokenizer_exceptions.py
+++ b/spacy/lang/es/tokenizer_exceptions.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...symbols import ORTH, LEMMA, NORM, PRON_LEMMA
diff --git a/spacy/lang/et/__init__.py b/spacy/lang/et/__init__.py
index d84c081ef..e0b0a8a87 100644
--- a/spacy/lang/et/__init__.py
+++ b/spacy/lang/et/__init__.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from .stop_words import STOP_WORDS
from ...language import Language
from ...attrs import LANG
diff --git a/spacy/lang/et/stop_words.py b/spacy/lang/et/stop_words.py
index 15070db5f..3b600a158 100644
--- a/spacy/lang/et/stop_words.py
+++ b/spacy/lang/et/stop_words.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
# Source: https://github.com/stopwords-iso/stopwords-et
diff --git a/spacy/lang/fa/__init__.py b/spacy/lang/fa/__init__.py
index 9d85f814a..aa02855e9 100644
--- a/spacy/lang/fa/__init__.py
+++ b/spacy/lang/fa/__init__.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
diff --git a/spacy/lang/fa/examples.py b/spacy/lang/fa/examples.py
index 3f65a366d..d89feb6c8 100644
--- a/spacy/lang/fa/examples.py
+++ b/spacy/lang/fa/examples.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/fa/generate_verbs_exc.py b/spacy/lang/fa/generate_verbs_exc.py
index 5d0ff944d..61586dc3f 100644
--- a/spacy/lang/fa/generate_verbs_exc.py
+++ b/spacy/lang/fa/generate_verbs_exc.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
verb_roots = """
#هست
diff --git a/spacy/lang/fa/lex_attrs.py b/spacy/lang/fa/lex_attrs.py
index dbea66b68..99b8e2787 100644
--- a/spacy/lang/fa/lex_attrs.py
+++ b/spacy/lang/fa/lex_attrs.py
@@ -1,5 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
from ...attrs import LIKE_NUM
diff --git a/spacy/lang/fa/punctuation.py b/spacy/lang/fa/punctuation.py
index 33aa46ae2..4b258c13d 100644
--- a/spacy/lang/fa/punctuation.py
+++ b/spacy/lang/fa/punctuation.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
from ..char_classes import UNITS, ALPHA_UPPER
diff --git a/spacy/lang/fa/stop_words.py b/spacy/lang/fa/stop_words.py
index 682fb7a71..372422b67 100644
--- a/spacy/lang/fa/stop_words.py
+++ b/spacy/lang/fa/stop_words.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
# Stop words from HAZM package
STOP_WORDS = set(
diff --git a/spacy/lang/fa/syntax_iterators.py b/spacy/lang/fa/syntax_iterators.py
index ed665ef29..86695cf6f 100644
--- a/spacy/lang/fa/syntax_iterators.py
+++ b/spacy/lang/fa/syntax_iterators.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...symbols import NOUN, PROPN, PRON
diff --git a/spacy/lang/fa/tag_map.py b/spacy/lang/fa/tag_map.py
index b9043adf0..f1f106915 100644
--- a/spacy/lang/fa/tag_map.py
+++ b/spacy/lang/fa/tag_map.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...symbols import POS, PUNCT, ADJ, CONJ, NUM, DET, ADV, ADP, X, VERB
from ...symbols import PRON, NOUN, PART, INTJ, AUX
diff --git a/spacy/lang/fa/tokenizer_exceptions.py b/spacy/lang/fa/tokenizer_exceptions.py
index b3f8dcbf5..db9e3f6fc 100644
--- a/spacy/lang/fa/tokenizer_exceptions.py
+++ b/spacy/lang/fa/tokenizer_exceptions.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...symbols import ORTH, LEMMA, TAG, NORM
diff --git a/spacy/lang/fi/__init__.py b/spacy/lang/fi/__init__.py
index 45d2f886f..db58ad3ba 100644
--- a/spacy/lang/fi/__init__.py
+++ b/spacy/lang/fi/__init__.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
diff --git a/spacy/lang/fi/examples.py b/spacy/lang/fi/examples.py
index 88be248a6..930fac273 100644
--- a/spacy/lang/fi/examples.py
+++ b/spacy/lang/fi/examples.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.fi.examples import sentences
diff --git a/spacy/lang/fi/lex_attrs.py b/spacy/lang/fi/lex_attrs.py
index e960b55eb..4d500cead 100644
--- a/spacy/lang/fi/lex_attrs.py
+++ b/spacy/lang/fi/lex_attrs.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...attrs import LIKE_NUM
diff --git a/spacy/lang/fi/punctuation.py b/spacy/lang/fi/punctuation.py
index 02eb1b200..878c8e250 100644
--- a/spacy/lang/fi/punctuation.py
+++ b/spacy/lang/fi/punctuation.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ..char_classes import LIST_ELLIPSES, LIST_ICONS
from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
from ..punctuation import TOKENIZER_SUFFIXES
diff --git a/spacy/lang/fi/stop_words.py b/spacy/lang/fi/stop_words.py
index e8e39ec6f..642cfc369 100644
--- a/spacy/lang/fi/stop_words.py
+++ b/spacy/lang/fi/stop_words.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
# Source https://github.com/stopwords-iso/stopwords-fi/blob/master/stopwords-fi.txt
# Reformatted with some minor corrections
diff --git a/spacy/lang/fi/tokenizer_exceptions.py b/spacy/lang/fi/tokenizer_exceptions.py
index d74deb22b..44360e969 100644
--- a/spacy/lang/fi/tokenizer_exceptions.py
+++ b/spacy/lang/fi/tokenizer_exceptions.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...symbols import ORTH, LEMMA
diff --git a/spacy/lang/fr/__init__.py b/spacy/lang/fr/__init__.py
index f56c8688a..dc45e538c 100644
--- a/spacy/lang/fr/__init__.py
+++ b/spacy/lang/fr/__init__.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from .tag_map import TAG_MAP
diff --git a/spacy/lang/fr/_tokenizer_exceptions_list.py b/spacy/lang/fr/_tokenizer_exceptions_list.py
index c9fcfff2d..7f908dac8 100644
--- a/spacy/lang/fr/_tokenizer_exceptions_list.py
+++ b/spacy/lang/fr/_tokenizer_exceptions_list.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
FR_BASE_EXCEPTIONS = [
"(+)-amphétamine",
"(5R,6S)-7,8-didehydro-4,5-époxy-3-méthoxy-N-méthylmorphinan-6-ol",
diff --git a/spacy/lang/fr/examples.py b/spacy/lang/fr/examples.py
index a874c22fc..57d57f4a6 100644
--- a/spacy/lang/fr/examples.py
+++ b/spacy/lang/fr/examples.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/fr/lemmatizer.py b/spacy/lang/fr/lemmatizer.py
index 79f4dd28d..84e55d509 100644
--- a/spacy/lang/fr/lemmatizer.py
+++ b/spacy/lang/fr/lemmatizer.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...lemmatizer import Lemmatizer
from ...symbols import POS, NOUN, VERB, ADJ, ADV, PRON, DET, AUX, PUNCT, ADP
from ...symbols import SCONJ, CCONJ
diff --git a/spacy/lang/fr/lex_attrs.py b/spacy/lang/fr/lex_attrs.py
index e3ccd9fdd..da98c6e37 100644
--- a/spacy/lang/fr/lex_attrs.py
+++ b/spacy/lang/fr/lex_attrs.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...attrs import LIKE_NUM
diff --git a/spacy/lang/fr/punctuation.py b/spacy/lang/fr/punctuation.py
index 1422b4194..5f42e7f25 100644
--- a/spacy/lang/fr/punctuation.py
+++ b/spacy/lang/fr/punctuation.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ..punctuation import TOKENIZER_INFIXES
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
from ..char_classes import CONCAT_QUOTES, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER
diff --git a/spacy/lang/fr/stop_words.py b/spacy/lang/fr/stop_words.py
index ae8432043..9c12e49a3 100644
--- a/spacy/lang/fr/stop_words.py
+++ b/spacy/lang/fr/stop_words.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
STOP_WORDS = set(
"""
diff --git a/spacy/lang/fr/syntax_iterators.py b/spacy/lang/fr/syntax_iterators.py
index 4712d34d9..96636b0b7 100644
--- a/spacy/lang/fr/syntax_iterators.py
+++ b/spacy/lang/fr/syntax_iterators.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...symbols import NOUN, PROPN, PRON
diff --git a/spacy/lang/fr/tag_map.py b/spacy/lang/fr/tag_map.py
index 93b43c2ec..2b1b20c52 100644
--- a/spacy/lang/fr/tag_map.py
+++ b/spacy/lang/fr/tag_map.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...symbols import POS, PUNCT, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB
from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON, AUX, SCONJ
diff --git a/spacy/lang/fr/tokenizer_exceptions.py b/spacy/lang/fr/tokenizer_exceptions.py
index 4b3b2c908..b1c0a53af 100644
--- a/spacy/lang/fr/tokenizer_exceptions.py
+++ b/spacy/lang/fr/tokenizer_exceptions.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import re
from .punctuation import ELISION, HYPHENS
@@ -70,7 +67,7 @@ for verb, verb_lemma in [
]:
for orth in [verb, verb.title()]:
for pronoun in ["elle", "il", "on"]:
- token = "{}-t-{}".format(orth, pronoun)
+ token = f"{orth}-t-{pronoun}"
_exc[token] = [
{LEMMA: verb_lemma, ORTH: orth, TAG: "VERB"},
{LEMMA: "t", ORTH: "-t"},
@@ -79,7 +76,7 @@ for verb, verb_lemma in [
for verb, verb_lemma in [("est", "être")]:
for orth in [verb, verb.title()]:
- token = "{}-ce".format(orth)
+ token = f"{orth}-ce"
_exc[token] = [
{LEMMA: verb_lemma, ORTH: orth, TAG: "VERB"},
{LEMMA: "ce", ORTH: "-ce"},
diff --git a/spacy/lang/ga/__init__.py b/spacy/lang/ga/__init__.py
index 42b4d0d18..cea7c0e94 100644
--- a/spacy/lang/ga/__init__.py
+++ b/spacy/lang/ga/__init__.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
diff --git a/spacy/lang/ga/irish_morphology_helpers.py b/spacy/lang/ga/irish_morphology_helpers.py
index 2133f0d22..c8cd36835 100644
--- a/spacy/lang/ga/irish_morphology_helpers.py
+++ b/spacy/lang/ga/irish_morphology_helpers.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
# fmt: off
consonants = ["b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "z"]
diff --git a/spacy/lang/ga/stop_words.py b/spacy/lang/ga/stop_words.py
index d8f705b59..4ef052ca5 100644
--- a/spacy/lang/ga/stop_words.py
+++ b/spacy/lang/ga/stop_words.py
@@ -1,7 +1,3 @@
-# encoding: utf8
-from __future__ import unicode_literals
-
-
STOP_WORDS = set(
"""
a ach ag agus an aon ar arna as
diff --git a/spacy/lang/ga/tag_map.py b/spacy/lang/ga/tag_map.py
index 1d8284014..baf64c1b8 100644
--- a/spacy/lang/ga/tag_map.py
+++ b/spacy/lang/ga/tag_map.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
# fmt: off
TAG_MAP = {
"ADJ__Case=Gen|Form=Len|Gender=Masc|Number=Sing": {"pos": "ADJ", "Case": "gen", "Gender": "masc", "Number": "sing", "Other": {"Form": "len"}},
diff --git a/spacy/lang/ga/tokenizer_exceptions.py b/spacy/lang/ga/tokenizer_exceptions.py
index c0e53f522..0c587c67e 100644
--- a/spacy/lang/ga/tokenizer_exceptions.py
+++ b/spacy/lang/ga/tokenizer_exceptions.py
@@ -1,6 +1,3 @@
-# encoding: utf8
-from __future__ import unicode_literals
-
from ...symbols import POS, DET, ADP, CCONJ, ADV, NOUN, X, AUX
from ...symbols import ORTH, LEMMA, NORM
diff --git a/spacy/lang/he/__init__.py b/spacy/lang/he/__init__.py
index 411cdf107..0d324f64c 100644
--- a/spacy/lang/he/__init__.py
+++ b/spacy/lang/he/__init__.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
diff --git a/spacy/lang/he/examples.py b/spacy/lang/he/examples.py
index 34cd157ae..29075c7d4 100644
--- a/spacy/lang/he/examples.py
+++ b/spacy/lang/he/examples.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/he/stop_words.py b/spacy/lang/he/stop_words.py
index a01ec4246..2745460a7 100644
--- a/spacy/lang/he/stop_words.py
+++ b/spacy/lang/he/stop_words.py
@@ -1,7 +1,3 @@
-# encoding: utf8
-from __future__ import unicode_literals
-
-
STOP_WORDS = set(
"""
אני
diff --git a/spacy/lang/hi/__init__.py b/spacy/lang/hi/__init__.py
index b0d45ddf3..9a96de95c 100644
--- a/spacy/lang/hi/__init__.py
+++ b/spacy/lang/hi/__init__.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
diff --git a/spacy/lang/hi/examples.py b/spacy/lang/hi/examples.py
index 1dd182532..7639ff940 100644
--- a/spacy/lang/hi/examples.py
+++ b/spacy/lang/hi/examples.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/hi/lex_attrs.py b/spacy/lang/hi/lex_attrs.py
index 12666d96a..20a8c2975 100644
--- a/spacy/lang/hi/lex_attrs.py
+++ b/spacy/lang/hi/lex_attrs.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ..norm_exceptions import BASE_NORMS
from ...attrs import NORM, LIKE_NUM
diff --git a/spacy/lang/hi/stop_words.py b/spacy/lang/hi/stop_words.py
index efad18c84..142fc6f47 100644
--- a/spacy/lang/hi/stop_words.py
+++ b/spacy/lang/hi/stop_words.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
# Source: https://github.com/taranjeet/hindi-tokenizer/blob/master/stopwords.txt, https://data.mendeley.com/datasets/bsr3frvvjc/1#file-a21d5092-99d7-45d8-b044-3ae9edd391c6
diff --git a/spacy/lang/hr/__init__.py b/spacy/lang/hr/__init__.py
index 539b164d7..fbc66ece0 100644
--- a/spacy/lang/hr/__init__.py
+++ b/spacy/lang/hr/__init__.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
diff --git a/spacy/lang/hr/examples.py b/spacy/lang/hr/examples.py
index dc52ce4f0..b28fb63c2 100644
--- a/spacy/lang/hr/examples.py
+++ b/spacy/lang/hr/examples.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/hr/stop_words.py b/spacy/lang/hr/stop_words.py
index 408b802c5..dd10f792d 100644
--- a/spacy/lang/hr/stop_words.py
+++ b/spacy/lang/hr/stop_words.py
@@ -1,7 +1,3 @@
-# encoding: utf8
-from __future__ import unicode_literals
-
-
# Source: https://github.com/stopwords-iso/stopwords-hr
STOP_WORDS = set(
"""
diff --git a/spacy/lang/hu/__init__.py b/spacy/lang/hu/__init__.py
index a331adc5b..df3fe4a44 100644
--- a/spacy/lang/hu/__init__.py
+++ b/spacy/lang/hu/__init__.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from .stop_words import STOP_WORDS
diff --git a/spacy/lang/hu/examples.py b/spacy/lang/hu/examples.py
index 3267887fe..b60f752ec 100644
--- a/spacy/lang/hu/examples.py
+++ b/spacy/lang/hu/examples.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/hu/punctuation.py b/spacy/lang/hu/punctuation.py
index bc043486f..1fea6d510 100644
--- a/spacy/lang/hu/punctuation.py
+++ b/spacy/lang/hu/punctuation.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CONCAT_QUOTES
from ..char_classes import CONCAT_ICONS, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER
diff --git a/spacy/lang/hu/stop_words.py b/spacy/lang/hu/stop_words.py
index c9a217dd6..024af68f4 100644
--- a/spacy/lang/hu/stop_words.py
+++ b/spacy/lang/hu/stop_words.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
STOP_WORDS = set(
"""
diff --git a/spacy/lang/hu/tokenizer_exceptions.py b/spacy/lang/hu/tokenizer_exceptions.py
index c18a2cec2..cc5eede17 100644
--- a/spacy/lang/hu/tokenizer_exceptions.py
+++ b/spacy/lang/hu/tokenizer_exceptions.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import re
from ..punctuation import ALPHA_LOWER, CURRENCY
diff --git a/spacy/lang/id/__init__.py b/spacy/lang/id/__init__.py
index ea8e355ac..89f874abe 100644
--- a/spacy/lang/id/__init__.py
+++ b/spacy/lang/id/__init__.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from .stop_words import STOP_WORDS
from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
diff --git a/spacy/lang/id/_tokenizer_exceptions_list.py b/spacy/lang/id/_tokenizer_exceptions_list.py
index fec878d5a..a0b35fa1a 100644
--- a/spacy/lang/id/_tokenizer_exceptions_list.py
+++ b/spacy/lang/id/_tokenizer_exceptions_list.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
ID_BASE_EXCEPTIONS = set(
"""
aba-aba
diff --git a/spacy/lang/id/examples.py b/spacy/lang/id/examples.py
index 56ac9165e..2ce46ce5a 100644
--- a/spacy/lang/id/examples.py
+++ b/spacy/lang/id/examples.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/id/lex_attrs.py b/spacy/lang/id/lex_attrs.py
index 1d4584ae3..3167f4659 100644
--- a/spacy/lang/id/lex_attrs.py
+++ b/spacy/lang/id/lex_attrs.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import unicodedata
from .punctuation import LIST_CURRENCY
diff --git a/spacy/lang/id/norm_exceptions.py b/spacy/lang/id/norm_exceptions.py
index 09ac6a6d3..63d2081e9 100644
--- a/spacy/lang/id/norm_exceptions.py
+++ b/spacy/lang/id/norm_exceptions.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
# Daftar kosakata yang sering salah dieja
# https://id.wikipedia.org/wiki/Wikipedia:Daftar_kosakata_bahasa_Indonesia_yang_sering_salah_dieja
_exc = {
diff --git a/spacy/lang/id/punctuation.py b/spacy/lang/id/punctuation.py
index e4794d42b..f6c2387d8 100644
--- a/spacy/lang/id/punctuation.py
+++ b/spacy/lang/id/punctuation.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ..punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from ..char_classes import ALPHA, merge_chars, split_chars, _currency, _units
diff --git a/spacy/lang/id/stop_words.py b/spacy/lang/id/stop_words.py
index 0a9f91947..b1bfaea79 100644
--- a/spacy/lang/id/stop_words.py
+++ b/spacy/lang/id/stop_words.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
STOP_WORDS = set(
"""
ada adalah adanya adapun agak agaknya agar akan akankah akhir akhiri akhirnya
diff --git a/spacy/lang/id/syntax_iterators.py b/spacy/lang/id/syntax_iterators.py
index 4712d34d9..96636b0b7 100644
--- a/spacy/lang/id/syntax_iterators.py
+++ b/spacy/lang/id/syntax_iterators.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...symbols import NOUN, PROPN, PRON
diff --git a/spacy/lang/id/tag_map.py b/spacy/lang/id/tag_map.py
index 16391a840..3bd08e96a 100644
--- a/spacy/lang/id/tag_map.py
+++ b/spacy/lang/id/tag_map.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...symbols import POS, PUNCT, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB
from ...symbols import NOUN, PRON, AUX, SCONJ, INTJ, PART, PROPN
diff --git a/spacy/lang/id/tokenizer_exceptions.py b/spacy/lang/id/tokenizer_exceptions.py
index 86fe611bf..5259bddf8 100644
--- a/spacy/lang/id/tokenizer_exceptions.py
+++ b/spacy/lang/id/tokenizer_exceptions.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ._tokenizer_exceptions_list import ID_BASE_EXCEPTIONS
from ...symbols import ORTH, LEMMA, NORM
diff --git a/spacy/lang/is/__init__.py b/spacy/lang/is/__init__.py
index 18e41432d..cdcfd6e71 100644
--- a/spacy/lang/is/__init__.py
+++ b/spacy/lang/is/__init__.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from .stop_words import STOP_WORDS
from ...language import Language
from ...attrs import LANG
diff --git a/spacy/lang/is/stop_words.py b/spacy/lang/is/stop_words.py
index e4ae0498b..5b3ff2f5a 100644
--- a/spacy/lang/is/stop_words.py
+++ b/spacy/lang/is/stop_words.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
# Source: https://github.com/Xangis/extra-stopwords
diff --git a/spacy/lang/it/__init__.py b/spacy/lang/it/__init__.py
index 90763eda5..4b223582b 100644
--- a/spacy/lang/it/__init__.py
+++ b/spacy/lang/it/__init__.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from .stop_words import STOP_WORDS
from .tag_map import TAG_MAP
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
diff --git a/spacy/lang/it/examples.py b/spacy/lang/it/examples.py
index af66b7eca..30327bd14 100644
--- a/spacy/lang/it/examples.py
+++ b/spacy/lang/it/examples.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/it/punctuation.py b/spacy/lang/it/punctuation.py
index 4fa931fde..0b8405cc0 100644
--- a/spacy/lang/it/punctuation.py
+++ b/spacy/lang/it/punctuation.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ..punctuation import TOKENIZER_INFIXES
from ..char_classes import ALPHA
diff --git a/spacy/lang/it/stop_words.py b/spacy/lang/it/stop_words.py
index 84233d381..5cd1af137 100644
--- a/spacy/lang/it/stop_words.py
+++ b/spacy/lang/it/stop_words.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
STOP_WORDS = set(
"""
diff --git a/spacy/lang/it/tag_map.py b/spacy/lang/it/tag_map.py
index 798c45d80..ce0e1d9ee 100644
--- a/spacy/lang/it/tag_map.py
+++ b/spacy/lang/it/tag_map.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...symbols import POS, PUNCT, SYM, ADJ, NUM, DET, ADV, ADP, X, VERB
from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON, SCONJ, AUX, CONJ
diff --git a/spacy/lang/it/tokenizer_exceptions.py b/spacy/lang/it/tokenizer_exceptions.py
index 62f568c5c..f1cfba2c0 100644
--- a/spacy/lang/it/tokenizer_exceptions.py
+++ b/spacy/lang/it/tokenizer_exceptions.py
@@ -1,5 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
from ...symbols import ORTH, LEMMA
_exc = {"po'": [{ORTH: "po'", LEMMA: "poco"}]}
diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py
index 22590043f..d1ce651d7 100644
--- a/spacy/lang/ja/__init__.py
+++ b/spacy/lang/ja/__init__.py
@@ -1,6 +1,3 @@
-# encoding: utf8
-from __future__ import unicode_literals, print_function
-
import re
from collections import namedtuple
diff --git a/spacy/lang/ja/examples.py b/spacy/lang/ja/examples.py
index e00001ed5..1d532ad77 100644
--- a/spacy/lang/ja/examples.py
+++ b/spacy/lang/ja/examples.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/ja/stop_words.py b/spacy/lang/ja/stop_words.py
index bb232a2d2..98560d7e2 100644
--- a/spacy/lang/ja/stop_words.py
+++ b/spacy/lang/ja/stop_words.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
# This list was created by taking the top 2000 words from a Wikipedia dump and
# filtering out everything that wasn't hiragana. ー (one) was also added.
# Considered keeping some non-hiragana words but too many place names were
diff --git a/spacy/lang/ja/tag_map.py b/spacy/lang/ja/tag_map.py
index 4ff0a35ee..d922cd22b 100644
--- a/spacy/lang/ja/tag_map.py
+++ b/spacy/lang/ja/tag_map.py
@@ -1,6 +1,3 @@
-# encoding: utf8
-from __future__ import unicode_literals
-
from ...symbols import POS, PUNCT, INTJ, X, ADJ, AUX, ADP, PART, SCONJ, NOUN
from ...symbols import SYM, PRON, VERB, ADV, PROPN, NUM, DET, SPACE
diff --git a/spacy/lang/kn/__init__.py b/spacy/lang/kn/__init__.py
index c86354248..ef3b10f81 100644
--- a/spacy/lang/kn/__init__.py
+++ b/spacy/lang/kn/__init__.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from .stop_words import STOP_WORDS
from ...language import Language
from ...attrs import LANG
diff --git a/spacy/lang/kn/stop_words.py b/spacy/lang/kn/stop_words.py
index 652341e73..cfeb0e69d 100644
--- a/spacy/lang/kn/stop_words.py
+++ b/spacy/lang/kn/stop_words.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
STOP_WORDS = set(
"""
diff --git a/spacy/lang/ko/__init__.py b/spacy/lang/ko/__init__.py
index ec79a95ab..4ecdfbc58 100644
--- a/spacy/lang/ko/__init__.py
+++ b/spacy/lang/ko/__init__.py
@@ -1,6 +1,3 @@
-# encoding: utf8
-from __future__ import unicode_literals, print_function
-
from .stop_words import STOP_WORDS
from .tag_map import TAG_MAP
from ...attrs import LANG
diff --git a/spacy/lang/ko/examples.py b/spacy/lang/ko/examples.py
index 7885ad801..cc0a66c0a 100644
--- a/spacy/lang/ko/examples.py
+++ b/spacy/lang/ko/examples.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/ko/lex_attrs.py b/spacy/lang/ko/lex_attrs.py
index 1904a0ece..ac5bc7e48 100644
--- a/spacy/lang/ko/lex_attrs.py
+++ b/spacy/lang/ko/lex_attrs.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...attrs import LIKE_NUM
diff --git a/spacy/lang/ko/stop_words.py b/spacy/lang/ko/stop_words.py
index 676dca1b4..3eba9fc82 100644
--- a/spacy/lang/ko/stop_words.py
+++ b/spacy/lang/ko/stop_words.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
STOP_WORDS = set(
"""
이
diff --git a/spacy/lang/ko/tag_map.py b/spacy/lang/ko/tag_map.py
index 57317c969..26a8c56b9 100644
--- a/spacy/lang/ko/tag_map.py
+++ b/spacy/lang/ko/tag_map.py
@@ -1,6 +1,3 @@
-# encoding: utf8
-from __future__ import unicode_literals
-
from ...symbols import POS, PUNCT, INTJ, X, SYM, ADJ, AUX, ADP, CONJ, NOUN, PRON
from ...symbols import VERB, ADV, PROPN, NUM, DET
diff --git a/spacy/lang/lb/__init__.py b/spacy/lang/lb/__init__.py
index 4fcfaddb4..afcf77f33 100644
--- a/spacy/lang/lb/__init__.py
+++ b/spacy/lang/lb/__init__.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .norm_exceptions import NORM_EXCEPTIONS
from .punctuation import TOKENIZER_INFIXES
diff --git a/spacy/lang/lb/examples.py b/spacy/lang/lb/examples.py
index 3cbba31d9..a7a10489c 100644
--- a/spacy/lang/lb/examples.py
+++ b/spacy/lang/lb/examples.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/lb/lex_attrs.py b/spacy/lang/lb/lex_attrs.py
index e38c74974..d2d50d9dc 100644
--- a/spacy/lang/lb/lex_attrs.py
+++ b/spacy/lang/lb/lex_attrs.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...attrs import LIKE_NUM
diff --git a/spacy/lang/lb/norm_exceptions.py b/spacy/lang/lb/norm_exceptions.py
index 7063e6863..afc384228 100644
--- a/spacy/lang/lb/norm_exceptions.py
+++ b/spacy/lang/lb/norm_exceptions.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
# TODO
# norm exceptions: find a possibility to deal with the zillions of spelling
# variants (vläicht = vlaicht, vleicht, viläicht, viläischt, etc. etc.)
diff --git a/spacy/lang/lb/punctuation.py b/spacy/lang/lb/punctuation.py
index 1571e13d7..4886b316c 100644
--- a/spacy/lang/lb/punctuation.py
+++ b/spacy/lang/lb/punctuation.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ..char_classes import LIST_ELLIPSES, LIST_ICONS, ALPHA, ALPHA_LOWER, ALPHA_UPPER
ELISION = " ' ’ ".strip().replace(" ", "")
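
As an aside, the ELISION expression visible in this hunk's context simply collapses the two apostrophe variants into a single two-character string; a quick standalone check (illustrative only, outside spaCy) confirms the result:

# Same expression as the context line above; prints the straight and
# typographic apostrophes with the spaces removed.
ELISION = " ' ’ ".strip().replace(" ", "")
print(ELISION)  # '’
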
diff --git a/spacy/lang/lb/stop_words.py b/spacy/lang/lb/stop_words.py
index 41e6f79d2..8f22ea6e6 100644
--- a/spacy/lang/lb/stop_words.py
+++ b/spacy/lang/lb/stop_words.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
STOP_WORDS = set(
"""
a
diff --git a/spacy/lang/lb/tag_map.py b/spacy/lang/lb/tag_map.py
index 424a83bb4..cd2e8b93c 100644
--- a/spacy/lang/lb/tag_map.py
+++ b/spacy/lang/lb/tag_map.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...symbols import POS, PUNCT, ADJ, CONJ, NUM, DET, ADV, ADP, X, VERB
from ...symbols import NOUN, PART, SPACE, AUX
diff --git a/spacy/lang/lb/tokenizer_exceptions.py b/spacy/lang/lb/tokenizer_exceptions.py
index b32daa58c..ebf624281 100644
--- a/spacy/lang/lb/tokenizer_exceptions.py
+++ b/spacy/lang/lb/tokenizer_exceptions.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...symbols import ORTH, LEMMA, NORM
# TODO
diff --git a/spacy/lang/lex_attrs.py b/spacy/lang/lex_attrs.py
index 7c0ed8a04..339290d4a 100644
--- a/spacy/lang/lex_attrs.py
+++ b/spacy/lang/lex_attrs.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import unicodedata
import re
diff --git a/spacy/lang/lt/__init__.py b/spacy/lang/lt/__init__.py
index 7919a4858..0f096a5b7 100644
--- a/spacy/lang/lt/__init__.py
+++ b/spacy/lang/lt/__init__.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
diff --git a/spacy/lang/lt/examples.py b/spacy/lang/lt/examples.py
index 99dbe9d4d..b2889114c 100644
--- a/spacy/lang/lt/examples.py
+++ b/spacy/lang/lt/examples.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/lt/lex_attrs.py b/spacy/lang/lt/lex_attrs.py
index 81879948f..28894a59b 100644
--- a/spacy/lang/lt/lex_attrs.py
+++ b/spacy/lang/lt/lex_attrs.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...attrs import LIKE_NUM
_num_words = {
diff --git a/spacy/lang/lt/morph_rules.py b/spacy/lang/lt/morph_rules.py
index 3bf26d9d8..f7bfd3cc6 100644
--- a/spacy/lang/lt/morph_rules.py
+++ b/spacy/lang/lt/morph_rules.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...symbols import LEMMA, PRON_LEMMA
diff --git a/spacy/lang/lt/stop_words.py b/spacy/lang/lt/stop_words.py
index fed05d80d..8c11b3f7b 100644
--- a/spacy/lang/lt/stop_words.py
+++ b/spacy/lang/lt/stop_words.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
STOP_WORDS = {
"a",
"abejais",
diff --git a/spacy/lang/lt/tag_map.py b/spacy/lang/lt/tag_map.py
index 6ea4f8ae0..f08db535f 100644
--- a/spacy/lang/lt/tag_map.py
+++ b/spacy/lang/lt/tag_map.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...symbols import POS, ADJ, ADP, ADV, CONJ, INTJ, NOUN, NUM, PART
from ...symbols import PRON, PROPN, PUNCT, SYM, VERB, X
diff --git a/spacy/lang/lt/tokenizer_exceptions.py b/spacy/lang/lt/tokenizer_exceptions.py
index fcf807278..e4b53e5b7 100644
--- a/spacy/lang/lt/tokenizer_exceptions.py
+++ b/spacy/lang/lt/tokenizer_exceptions.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...symbols import ORTH
_exc = {}
diff --git a/spacy/lang/lv/__init__.py b/spacy/lang/lv/__init__.py
index bb8c0763b..dd8919b73 100644
--- a/spacy/lang/lv/__init__.py
+++ b/spacy/lang/lv/__init__.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from .stop_words import STOP_WORDS
from ...language import Language
from ...attrs import LANG
diff --git a/spacy/lang/lv/stop_words.py b/spacy/lang/lv/stop_words.py
index 075ad6347..a9612f949 100644
--- a/spacy/lang/lv/stop_words.py
+++ b/spacy/lang/lv/stop_words.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
# Source: https://github.com/stopwords-iso/stopwords-lv
diff --git a/spacy/lang/mr/__init__.py b/spacy/lang/mr/__init__.py
index fd95f9354..eb52a3935 100644
--- a/spacy/lang/mr/__init__.py
+++ b/spacy/lang/mr/__init__.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from .stop_words import STOP_WORDS
from ...language import Language
from ...attrs import LANG
diff --git a/spacy/lang/mr/stop_words.py b/spacy/lang/mr/stop_words.py
index 0b0cd035d..0d7501461 100644
--- a/spacy/lang/mr/stop_words.py
+++ b/spacy/lang/mr/stop_words.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
# Source: https://github.com/stopwords-iso/stopwords-mr/blob/master/stopwords-mr.txt, https://github.com/6/stopwords-json/edit/master/dist/mr.json
STOP_WORDS = set(
diff --git a/spacy/lang/nb/__init__.py b/spacy/lang/nb/__init__.py
index 086761f82..3120951a2 100644
--- a/spacy/lang/nb/__init__.py
+++ b/spacy/lang/nb/__init__.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .morph_rules import MORPH_RULES
diff --git a/spacy/lang/nb/examples.py b/spacy/lang/nb/examples.py
index c15426ded..89e265951 100644
--- a/spacy/lang/nb/examples.py
+++ b/spacy/lang/nb/examples.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/nb/morph_rules.py b/spacy/lang/nb/morph_rules.py
index e20814535..b1799fca8 100644
--- a/spacy/lang/nb/morph_rules.py
+++ b/spacy/lang/nb/morph_rules.py
@@ -1,6 +1,3 @@
-# encoding: utf8
-from __future__ import unicode_literals
-
from ...symbols import LEMMA, PRON_LEMMA
# This dict includes all the PRON and DET tag combinations found in the
diff --git a/spacy/lang/nb/punctuation.py b/spacy/lang/nb/punctuation.py
index b49aa9838..5d5800ae3 100644
--- a/spacy/lang/nb/punctuation.py
+++ b/spacy/lang/nb/punctuation.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ..char_classes import LIST_ELLIPSES, LIST_ICONS
from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
from ..punctuation import TOKENIZER_SUFFIXES
diff --git a/spacy/lang/nb/stop_words.py b/spacy/lang/nb/stop_words.py
index caa2012e7..fd65dd788 100644
--- a/spacy/lang/nb/stop_words.py
+++ b/spacy/lang/nb/stop_words.py
@@ -1,7 +1,3 @@
-# encoding: utf8
-from __future__ import unicode_literals
-
-
STOP_WORDS = set(
"""
alle allerede alt and andre annen annet at av
diff --git a/spacy/lang/nb/syntax_iterators.py b/spacy/lang/nb/syntax_iterators.py
index 4712d34d9..96636b0b7 100644
--- a/spacy/lang/nb/syntax_iterators.py
+++ b/spacy/lang/nb/syntax_iterators.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...symbols import NOUN, PROPN, PRON
diff --git a/spacy/lang/nb/tag_map.py b/spacy/lang/nb/tag_map.py
index ca0ece265..a67586ed9 100644
--- a/spacy/lang/nb/tag_map.py
+++ b/spacy/lang/nb/tag_map.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...symbols import POS, PUNCT, ADJ, CONJ, CCONJ, SCONJ, SYM, NUM, DET, ADV, ADP, X
from ...symbols import VERB, NOUN, PROPN, PART, INTJ, PRON, AUX
diff --git a/spacy/lang/nb/tokenizer_exceptions.py b/spacy/lang/nb/tokenizer_exceptions.py
index 92ac09841..ef6dcf264 100644
--- a/spacy/lang/nb/tokenizer_exceptions.py
+++ b/spacy/lang/nb/tokenizer_exceptions.py
@@ -1,6 +1,3 @@
-# encoding: utf8
-from __future__ import unicode_literals
-
from ...symbols import ORTH, LEMMA
diff --git a/spacy/lang/nl/__init__.py b/spacy/lang/nl/__init__.py
index 074fd9133..c12b08d77 100644
--- a/spacy/lang/nl/__init__.py
+++ b/spacy/lang/nl/__init__.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .tag_map import TAG_MAP
diff --git a/spacy/lang/nl/examples.py b/spacy/lang/nl/examples.py
index a459760f4..fcefa9d62 100644
--- a/spacy/lang/nl/examples.py
+++ b/spacy/lang/nl/examples.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/nl/lemmatizer.py b/spacy/lang/nl/lemmatizer.py
index 9a92bee44..e7501ec52 100644
--- a/spacy/lang/nl/lemmatizer.py
+++ b/spacy/lang/nl/lemmatizer.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...lemmatizer import Lemmatizer
from ...symbols import NOUN, VERB, ADJ, NUM, DET, PRON, ADP, AUX, ADV
diff --git a/spacy/lang/nl/lex_attrs.py b/spacy/lang/nl/lex_attrs.py
index 69343b589..f1acaefeb 100644
--- a/spacy/lang/nl/lex_attrs.py
+++ b/spacy/lang/nl/lex_attrs.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...attrs import LIKE_NUM
diff --git a/spacy/lang/nl/punctuation.py b/spacy/lang/nl/punctuation.py
index a48ecc044..3f3be61f8 100644
--- a/spacy/lang/nl/punctuation.py
+++ b/spacy/lang/nl/punctuation.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ..char_classes import LIST_ELLIPSES, LIST_ICONS
from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
diff --git a/spacy/lang/nl/stop_words.py b/spacy/lang/nl/stop_words.py
index 44551f2d4..a2c6198e7 100644
--- a/spacy/lang/nl/stop_words.py
+++ b/spacy/lang/nl/stop_words.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
# The original stop words list (added in f46ffe3) was taken from
# http://www.damienvanholten.com/downloads/dutch-stop-words.txt
# and consisted of about 100 tokens.
diff --git a/spacy/lang/nl/tag_map.py b/spacy/lang/nl/tag_map.py
index 4fde5d39f..5bd7747c6 100644
--- a/spacy/lang/nl/tag_map.py
+++ b/spacy/lang/nl/tag_map.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...symbols import POS, PUNCT, ADJ, NUM, DET, ADV, ADP, X, VERB
from ...symbols import NOUN, PROPN, SPACE, PRON, CONJ
diff --git a/spacy/lang/nl/tokenizer_exceptions.py b/spacy/lang/nl/tokenizer_exceptions.py
index dbdd104f3..12ab8aef5 100644
--- a/spacy/lang/nl/tokenizer_exceptions.py
+++ b/spacy/lang/nl/tokenizer_exceptions.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...symbols import ORTH
# Extensive list of both common and uncommon dutch abbreviations copied from
diff --git a/spacy/lang/norm_exceptions.py b/spacy/lang/norm_exceptions.py
index 341967a78..c194f05c7 100644
--- a/spacy/lang/norm_exceptions.py
+++ b/spacy/lang/norm_exceptions.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
# These exceptions are used to add NORM values based on a token's ORTH value.
# Individual languages can also add their own exceptions and overwrite them -
diff --git a/spacy/lang/pl/__init__.py b/spacy/lang/pl/__init__.py
index 702a19063..a03ead1ff 100644
--- a/spacy/lang/pl/__init__.py
+++ b/spacy/lang/pl/__init__.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_INFIXES
from .tag_map import TAG_MAP
diff --git a/spacy/lang/pl/_tokenizer_exceptions_list.py b/spacy/lang/pl/_tokenizer_exceptions_list.py
index 839eccb83..965318442 100644
--- a/spacy/lang/pl/_tokenizer_exceptions_list.py
+++ b/spacy/lang/pl/_tokenizer_exceptions_list.py
@@ -1,7 +1,3 @@
-# -*- coding: utf-8 -*-
-
-from __future__ import unicode_literals
-
# The following list consists of:
# - exceptions generated from polish_srx_rules [1]
# (https://github.com/milekpl/polish_srx_rules)
diff --git a/spacy/lang/pl/examples.py b/spacy/lang/pl/examples.py
index 14b6c7030..6eabe1843 100644
--- a/spacy/lang/pl/examples.py
+++ b/spacy/lang/pl/examples.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/pl/lex_attrs.py b/spacy/lang/pl/lex_attrs.py
index f1379aa50..ce56e28a8 100644
--- a/spacy/lang/pl/lex_attrs.py
+++ b/spacy/lang/pl/lex_attrs.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...attrs import LIKE_NUM
diff --git a/spacy/lang/pl/punctuation.py b/spacy/lang/pl/punctuation.py
index 4e69a3912..eea28de11 100644
--- a/spacy/lang/pl/punctuation.py
+++ b/spacy/lang/pl/punctuation.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ..char_classes import LIST_ELLIPSES, CONCAT_ICONS
from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
diff --git a/spacy/lang/pl/stop_words.py b/spacy/lang/pl/stop_words.py
index 11df67328..075aec391 100644
--- a/spacy/lang/pl/stop_words.py
+++ b/spacy/lang/pl/stop_words.py
@@ -1,7 +1,3 @@
-# encoding: utf8
-
-from __future__ import unicode_literals
-
# sources: https://github.com/bieli/stopwords/blob/master/polish.stopwords.txt and https://github.com/stopwords-iso/stopwords-pl
STOP_WORDS = set(
diff --git a/spacy/lang/pl/tag_map.py b/spacy/lang/pl/tag_map.py
index 5356c26cb..b83ee4d4c 100644
--- a/spacy/lang/pl/tag_map.py
+++ b/spacy/lang/pl/tag_map.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...symbols import (
POS,
ADJ,
diff --git a/spacy/lang/pl/tokenizer_exceptions.py b/spacy/lang/pl/tokenizer_exceptions.py
index 9e4814b0f..39f3017ed 100644
--- a/spacy/lang/pl/tokenizer_exceptions.py
+++ b/spacy/lang/pl/tokenizer_exceptions.py
@@ -1,6 +1,3 @@
-# encoding: utf8
-from __future__ import unicode_literals
-
from ._tokenizer_exceptions_list import PL_BASE_EXCEPTIONS
from ...symbols import POS, ADV, NOUN, ORTH, LEMMA, ADJ
diff --git a/spacy/lang/pt/__init__.py b/spacy/lang/pt/__init__.py
index f786d6542..0557e8b31 100644
--- a/spacy/lang/pt/__init__.py
+++ b/spacy/lang/pt/__init__.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
diff --git a/spacy/lang/pt/examples.py b/spacy/lang/pt/examples.py
index b7206ffd7..7427f8b25 100644
--- a/spacy/lang/pt/examples.py
+++ b/spacy/lang/pt/examples.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/pt/lex_attrs.py b/spacy/lang/pt/lex_attrs.py
index 4ad0eeecb..3c6979ab4 100644
--- a/spacy/lang/pt/lex_attrs.py
+++ b/spacy/lang/pt/lex_attrs.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...attrs import LIKE_NUM
diff --git a/spacy/lang/pt/norm_exceptions.py b/spacy/lang/pt/norm_exceptions.py
index ea650cb31..e115b0385 100644
--- a/spacy/lang/pt/norm_exceptions.py
+++ b/spacy/lang/pt/norm_exceptions.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
# These exceptions are used to add NORM values based on a token's ORTH value.
# Individual languages can also add their own exceptions and overwrite them -
# for example, British vs. American spelling in English.
diff --git a/spacy/lang/pt/punctuation.py b/spacy/lang/pt/punctuation.py
index 370e6aaad..08e31f9d0 100644
--- a/spacy/lang/pt/punctuation.py
+++ b/spacy/lang/pt/punctuation.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES
from ..punctuation import TOKENIZER_SUFFIXES as BASE_TOKENIZER_SUFFIXES
from ..punctuation import TOKENIZER_INFIXES as BASE_TOKENIZER_INFIXES
diff --git a/spacy/lang/pt/stop_words.py b/spacy/lang/pt/stop_words.py
index 774b06809..8065fcda7 100644
--- a/spacy/lang/pt/stop_words.py
+++ b/spacy/lang/pt/stop_words.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
STOP_WORDS = set(
"""
diff --git a/spacy/lang/pt/tag_map.py b/spacy/lang/pt/tag_map.py
index cdc7de57e..dc65998a4 100644
--- a/spacy/lang/pt/tag_map.py
+++ b/spacy/lang/pt/tag_map.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...symbols import POS, PUNCT, SYM, ADJ, NUM, DET, ADV, ADP, X, VERB, CCONJ
from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON, SCONJ, AUX
diff --git a/spacy/lang/pt/tokenizer_exceptions.py b/spacy/lang/pt/tokenizer_exceptions.py
index 5169780e6..2089ea8fa 100644
--- a/spacy/lang/pt/tokenizer_exceptions.py
+++ b/spacy/lang/pt/tokenizer_exceptions.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...symbols import ORTH, NORM
diff --git a/spacy/lang/punctuation.py b/spacy/lang/punctuation.py
index ccb72de28..bf7357e48 100644
--- a/spacy/lang/punctuation.py
+++ b/spacy/lang/punctuation.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
from .char_classes import LIST_ICONS, HYPHENS, CURRENCY, UNITS
from .char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT
diff --git a/spacy/lang/ro/__init__.py b/spacy/lang/ro/__init__.py
index 6c325b74d..e32ae19cb 100644
--- a/spacy/lang/ro/__init__.py
+++ b/spacy/lang/ro/__init__.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
diff --git a/spacy/lang/ro/examples.py b/spacy/lang/ro/examples.py
index a372d7cb2..d472f0d6d 100644
--- a/spacy/lang/ro/examples.py
+++ b/spacy/lang/ro/examples.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/ro/lex_attrs.py b/spacy/lang/ro/lex_attrs.py
index bb8391ad1..0f86f53cd 100644
--- a/spacy/lang/ro/lex_attrs.py
+++ b/spacy/lang/ro/lex_attrs.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...attrs import LIKE_NUM
diff --git a/spacy/lang/ro/stop_words.py b/spacy/lang/ro/stop_words.py
index b5ba73458..1d90be85d 100644
--- a/spacy/lang/ro/stop_words.py
+++ b/spacy/lang/ro/stop_words.py
@@ -1,7 +1,3 @@
-# encoding: utf8
-from __future__ import unicode_literals
-
-
# Source: https://github.com/stopwords-iso/stopwords-ro
STOP_WORDS = set(
"""
diff --git a/spacy/lang/ro/tag_map.py b/spacy/lang/ro/tag_map.py
index cb5239809..d6820b4f2 100644
--- a/spacy/lang/ro/tag_map.py
+++ b/spacy/lang/ro/tag_map.py
@@ -1,5 +1,3 @@
-from __future__ import unicode_literals
-
from ...symbols import POS, ADJ, ADP, ADV, INTJ, NOUN, NUM, PART
from ...symbols import PRON, PROPN, PUNCT, SYM, VERB, X, CCONJ, SCONJ, DET, AUX
diff --git a/spacy/lang/ro/tokenizer_exceptions.py b/spacy/lang/ro/tokenizer_exceptions.py
index a7fb38453..8408ef987 100644
--- a/spacy/lang/ro/tokenizer_exceptions.py
+++ b/spacy/lang/ro/tokenizer_exceptions.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...symbols import ORTH
diff --git a/spacy/lang/ru/__init__.py b/spacy/lang/ru/__init__.py
index f34fc5435..d25e8048b 100644
--- a/spacy/lang/ru/__init__.py
+++ b/spacy/lang/ru/__init__.py
@@ -1,6 +1,3 @@
-# encoding: utf8
-from __future__ import unicode_literals, print_function
-
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .norm_exceptions import NORM_EXCEPTIONS
diff --git a/spacy/lang/ru/examples.py b/spacy/lang/ru/examples.py
index 2db621dac..34cf5a1eb 100644
--- a/spacy/lang/ru/examples.py
+++ b/spacy/lang/ru/examples.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/ru/lemmatizer.py b/spacy/lang/ru/lemmatizer.py
index 96d32f59c..ed0e858f5 100644
--- a/spacy/lang/ru/lemmatizer.py
+++ b/spacy/lang/ru/lemmatizer.py
@@ -1,9 +1,5 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...symbols import ADJ, DET, NOUN, NUM, PRON, PROPN, PUNCT, VERB, POS
from ...lemmatizer import Lemmatizer
-from ...compat import unicode_
class RussianLemmatizer(Lemmatizer):
@@ -85,7 +81,7 @@ class RussianLemmatizer(Lemmatizer):
@staticmethod
def normalize_univ_pos(univ_pos):
- if isinstance(univ_pos, unicode_):
+ if isinstance(univ_pos, str):
return univ_pos.upper()
symbols_to_str = {
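
Besides the header removal, this hunk drops the old spacy.compat unicode_ alias in favour of the built-in str. A minimal standalone sketch of the resulting check, with a made-up symbol table rather than spaCy's real mapping:

# Sketch only: on Python 3 all text is `str`, so no compat alias is needed.
def normalize_univ_pos(univ_pos):
    if isinstance(univ_pos, str):  # plain string such as "noun"
        return univ_pos.upper()
    # Hypothetical fallback for internal symbol IDs (illustrative values only).
    symbols_to_str = {92: "NOUN", 100: "VERB"}
    return symbols_to_str.get(univ_pos)
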
diff --git a/spacy/lang/ru/lex_attrs.py b/spacy/lang/ru/lex_attrs.py
index 448c5b285..7979c7ea6 100644
--- a/spacy/lang/ru/lex_attrs.py
+++ b/spacy/lang/ru/lex_attrs.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...attrs import LIKE_NUM
diff --git a/spacy/lang/ru/norm_exceptions.py b/spacy/lang/ru/norm_exceptions.py
index 43e08948c..c5d725031 100644
--- a/spacy/lang/ru/norm_exceptions.py
+++ b/spacy/lang/ru/norm_exceptions.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
_exc = {
# Slang
diff --git a/spacy/lang/ru/stop_words.py b/spacy/lang/ru/stop_words.py
index 89069b3cf..16cb55ef9 100644
--- a/spacy/lang/ru/stop_words.py
+++ b/spacy/lang/ru/stop_words.py
@@ -1,7 +1,3 @@
-# encoding: utf8
-from __future__ import unicode_literals
-
-
STOP_WORDS = set(
"""
а
diff --git a/spacy/lang/ru/tag_map.py b/spacy/lang/ru/tag_map.py
index baf065588..294919811 100644
--- a/spacy/lang/ru/tag_map.py
+++ b/spacy/lang/ru/tag_map.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...symbols import POS, PUNCT, SYM, ADJ, NUM, DET, ADV, ADP, X, VERB, NOUN
from ...symbols import PROPN, PART, INTJ, PRON, SCONJ, AUX, CCONJ
diff --git a/spacy/lang/ru/tokenizer_exceptions.py b/spacy/lang/ru/tokenizer_exceptions.py
index ea7b5b20d..df3169baf 100644
--- a/spacy/lang/ru/tokenizer_exceptions.py
+++ b/spacy/lang/ru/tokenizer_exceptions.py
@@ -1,6 +1,3 @@
-# encoding: utf8
-from __future__ import unicode_literals
-
from ...symbols import ORTH, LEMMA, NORM
diff --git a/spacy/lang/si/__init__.py b/spacy/lang/si/__init__.py
index a58a63f03..3b065860c 100644
--- a/spacy/lang/si/__init__.py
+++ b/spacy/lang/si/__init__.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
diff --git a/spacy/lang/si/examples.py b/spacy/lang/si/examples.py
index 842dfdd7e..0ff00e76e 100644
--- a/spacy/lang/si/examples.py
+++ b/spacy/lang/si/examples.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/si/lex_attrs.py b/spacy/lang/si/lex_attrs.py
index 5d5f06187..aa061852d 100644
--- a/spacy/lang/si/lex_attrs.py
+++ b/spacy/lang/si/lex_attrs.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...attrs import LIKE_NUM
_num_words = [
diff --git a/spacy/lang/si/stop_words.py b/spacy/lang/si/stop_words.py
index 8bbdec6b7..49723c860 100644
--- a/spacy/lang/si/stop_words.py
+++ b/spacy/lang/si/stop_words.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
STOP_WORDS = set(
"""
diff --git a/spacy/lang/sk/__init__.py b/spacy/lang/sk/__init__.py
index e7704196a..77a07e504 100644
--- a/spacy/lang/sk/__init__.py
+++ b/spacy/lang/sk/__init__.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from .stop_words import STOP_WORDS
from ...language import Language
from ...attrs import LANG
diff --git a/spacy/lang/sk/stop_words.py b/spacy/lang/sk/stop_words.py
index f6994d33f..bd39b22f2 100644
--- a/spacy/lang/sk/stop_words.py
+++ b/spacy/lang/sk/stop_words.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
# Source: https://github.com/stopwords-iso/stopwords-sk
diff --git a/spacy/lang/sl/__init__.py b/spacy/lang/sl/__init__.py
index 2d4977bdf..ce46e92dc 100644
--- a/spacy/lang/sl/__init__.py
+++ b/spacy/lang/sl/__init__.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from .stop_words import STOP_WORDS
from ...language import Language
from ...attrs import LANG
diff --git a/spacy/lang/sl/stop_words.py b/spacy/lang/sl/stop_words.py
index 187e95876..c8596ad0b 100644
--- a/spacy/lang/sl/stop_words.py
+++ b/spacy/lang/sl/stop_words.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
# Source: https://github.com/stopwords-iso/stopwords-sl
# TODO: probably needs to be tidied up – the list seems to have month names in
diff --git a/spacy/lang/sq/__init__.py b/spacy/lang/sq/__init__.py
index 6f33b37c2..034604838 100644
--- a/spacy/lang/sq/__init__.py
+++ b/spacy/lang/sq/__init__.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from .stop_words import STOP_WORDS
from ...language import Language
from ...attrs import LANG
diff --git a/spacy/lang/sq/examples.py b/spacy/lang/sq/examples.py
index c51a0da39..e1075f70a 100644
--- a/spacy/lang/sq/examples.py
+++ b/spacy/lang/sq/examples.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/sq/stop_words.py b/spacy/lang/sq/stop_words.py
index f91861ca1..58ee87d05 100644
--- a/spacy/lang/sq/stop_words.py
+++ b/spacy/lang/sq/stop_words.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
# Source: https://github.com/andrixh/index-albanian
diff --git a/spacy/lang/sr/__init__.py b/spacy/lang/sr/__init__.py
index f27b87102..151cc231c 100644
--- a/spacy/lang/sr/__init__.py
+++ b/spacy/lang/sr/__init__.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .norm_exceptions import NORM_EXCEPTIONS
diff --git a/spacy/lang/sr/examples.py b/spacy/lang/sr/examples.py
index d636220c3..1ac867f4c 100644
--- a/spacy/lang/sr/examples.py
+++ b/spacy/lang/sr/examples.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/sr/lex_attrs.py b/spacy/lang/sr/lex_attrs.py
index c90dc0da7..dc48909bc 100644
--- a/spacy/lang/sr/lex_attrs.py
+++ b/spacy/lang/sr/lex_attrs.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...attrs import LIKE_NUM
diff --git a/spacy/lang/sr/norm_exceptions.py b/spacy/lang/sr/norm_exceptions.py
index 69f2c3173..add8350a0 100644
--- a/spacy/lang/sr/norm_exceptions.py
+++ b/spacy/lang/sr/norm_exceptions.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
_exc = {
# Slang
diff --git a/spacy/lang/sr/stop_words.py b/spacy/lang/sr/stop_words.py
index 9712327f8..488c82a75 100644
--- a/spacy/lang/sr/stop_words.py
+++ b/spacy/lang/sr/stop_words.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
STOP_WORDS = set(
"""
diff --git a/spacy/lang/sr/tokenizer_exceptions.py b/spacy/lang/sr/tokenizer_exceptions.py
index 8fca346a3..82df15186 100755
--- a/spacy/lang/sr/tokenizer_exceptions.py
+++ b/spacy/lang/sr/tokenizer_exceptions.py
@@ -1,6 +1,3 @@
-# encoding: utf8
-from __future__ import unicode_literals
-
from ...symbols import ORTH, LEMMA, NORM
diff --git a/spacy/lang/sv/__init__.py b/spacy/lang/sv/__init__.py
index 671eefca0..d400eae4d 100644
--- a/spacy/lang/sv/__init__.py
+++ b/spacy/lang/sv/__init__.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS
diff --git a/spacy/lang/sv/examples.py b/spacy/lang/sv/examples.py
index 58e095195..98eee700b 100644
--- a/spacy/lang/sv/examples.py
+++ b/spacy/lang/sv/examples.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/sv/morph_rules.py b/spacy/lang/sv/morph_rules.py
index 77744813f..8fca20a49 100644
--- a/spacy/lang/sv/morph_rules.py
+++ b/spacy/lang/sv/morph_rules.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...symbols import LEMMA, PRON_LEMMA
diff --git a/spacy/lang/sv/stop_words.py b/spacy/lang/sv/stop_words.py
index 206abce5a..4d933a76d 100644
--- a/spacy/lang/sv/stop_words.py
+++ b/spacy/lang/sv/stop_words.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
STOP_WORDS = set(
"""
diff --git a/spacy/lang/sv/syntax_iterators.py b/spacy/lang/sv/syntax_iterators.py
index 7a82e6b59..021d5d2f5 100644
--- a/spacy/lang/sv/syntax_iterators.py
+++ b/spacy/lang/sv/syntax_iterators.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...symbols import NOUN, PROPN, PRON
diff --git a/spacy/lang/sv/tag_map.py b/spacy/lang/sv/tag_map.py
index 7d4e29030..d4f5b6291 100644
--- a/spacy/lang/sv/tag_map.py
+++ b/spacy/lang/sv/tag_map.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...symbols import POS, PUNCT, ADJ, CCONJ, SCONJ, NUM, DET, ADV
from ...symbols import ADP, X, VERB, NOUN, PROPN, PART, INTJ, PRON
diff --git a/spacy/lang/sv/tokenizer_exceptions.py b/spacy/lang/sv/tokenizer_exceptions.py
index dd0976aa6..834a088ad 100644
--- a/spacy/lang/sv/tokenizer_exceptions.py
+++ b/spacy/lang/sv/tokenizer_exceptions.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...symbols import LEMMA, NORM, ORTH, PRON_LEMMA, PUNCT, TAG
_exc = {}
diff --git a/spacy/lang/ta/__init__.py b/spacy/lang/ta/__init__.py
index cb23339e6..d7a04afea 100644
--- a/spacy/lang/ta/__init__.py
+++ b/spacy/lang/ta/__init__.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
diff --git a/spacy/lang/ta/examples.py b/spacy/lang/ta/examples.py
index 3ce3c3544..2590163cb 100644
--- a/spacy/lang/ta/examples.py
+++ b/spacy/lang/ta/examples.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/ta/lex_attrs.py b/spacy/lang/ta/lex_attrs.py
index 40158ad7a..f830f4ac9 100644
--- a/spacy/lang/ta/lex_attrs.py
+++ b/spacy/lang/ta/lex_attrs.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...attrs import LIKE_NUM
diff --git a/spacy/lang/ta/norm_exceptions.py b/spacy/lang/ta/norm_exceptions.py
index fbdceb98c..8eaf0aa74 100644
--- a/spacy/lang/ta/norm_exceptions.py
+++ b/spacy/lang/ta/norm_exceptions.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
_exc = {
# Regional words normal
# Sri Lanka - wikipedia
diff --git a/spacy/lang/ta/stop_words.py b/spacy/lang/ta/stop_words.py
index 91ebe8fd8..83410d65e 100644
--- a/spacy/lang/ta/stop_words.py
+++ b/spacy/lang/ta/stop_words.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
# Stop words
diff --git a/spacy/lang/tag_map.py b/spacy/lang/tag_map.py
index 3a744f180..5bff905bd 100644
--- a/spacy/lang/tag_map.py
+++ b/spacy/lang/tag_map.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ..symbols import POS, ADV, NOUN, ADP, PRON, SCONJ, PROPN, DET, SYM, INTJ
from ..symbols import PUNCT, NUM, AUX, X, CONJ, ADJ, VERB, PART, SPACE, CCONJ
diff --git a/spacy/lang/te/__init__.py b/spacy/lang/te/__init__.py
index a4709177d..424164cc7 100644
--- a/spacy/lang/te/__init__.py
+++ b/spacy/lang/te/__init__.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
diff --git a/spacy/lang/te/examples.py b/spacy/lang/te/examples.py
index 815ec8227..6162b231e 100644
--- a/spacy/lang/te/examples.py
+++ b/spacy/lang/te/examples.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/te/lex_attrs.py b/spacy/lang/te/lex_attrs.py
index 6da766dca..ae11827f6 100644
--- a/spacy/lang/te/lex_attrs.py
+++ b/spacy/lang/te/lex_attrs.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...attrs import LIKE_NUM
_num_words = [
diff --git a/spacy/lang/te/stop_words.py b/spacy/lang/te/stop_words.py
index 11e157177..b18dab697 100644
--- a/spacy/lang/te/stop_words.py
+++ b/spacy/lang/te/stop_words.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
# Source: https://github.com/Xangis/extra-stopwords (MIT License)
STOP_WORDS = set(
diff --git a/spacy/lang/th/__init__.py b/spacy/lang/th/__init__.py
index 06970fbd7..950a77818 100644
--- a/spacy/lang/th/__init__.py
+++ b/spacy/lang/th/__init__.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS
diff --git a/spacy/lang/th/lex_attrs.py b/spacy/lang/th/lex_attrs.py
index 047d046c2..bc4e5293e 100644
--- a/spacy/lang/th/lex_attrs.py
+++ b/spacy/lang/th/lex_attrs.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...attrs import LIKE_NUM
diff --git a/spacy/lang/th/norm_exceptions.py b/spacy/lang/th/norm_exceptions.py
index ed1b3e760..98b878308 100644
--- a/spacy/lang/th/norm_exceptions.py
+++ b/spacy/lang/th/norm_exceptions.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
_exc = {
# Conjugation and Diversion invalid to Tonal form (ผันอักษรและเสียงไม่ตรงกับรูปวรรณยุกต์)
diff --git a/spacy/lang/th/tag_map.py b/spacy/lang/th/tag_map.py
index 119a2f6a0..7fb12d538 100644
--- a/spacy/lang/th/tag_map.py
+++ b/spacy/lang/th/tag_map.py
@@ -1,6 +1,3 @@
-# encoding: utf8
-from __future__ import unicode_literals
-
from ...symbols import POS, NOUN, PRON, ADJ, ADV, INTJ, PROPN, DET, NUM, AUX, VERB
from ...symbols import ADP, CCONJ, PART, PUNCT, SPACE, SCONJ
diff --git a/spacy/lang/th/tokenizer_exceptions.py b/spacy/lang/th/tokenizer_exceptions.py
index 4de0f1195..0529b3a99 100644
--- a/spacy/lang/th/tokenizer_exceptions.py
+++ b/spacy/lang/th/tokenizer_exceptions.py
@@ -1,6 +1,3 @@
-# encoding: utf8
-from __future__ import unicode_literals
-
from ...symbols import ORTH, LEMMA
diff --git a/spacy/lang/tl/__init__.py b/spacy/lang/tl/__init__.py
index 30ad93139..f477029f7 100644
--- a/spacy/lang/tl/__init__.py
+++ b/spacy/lang/tl/__init__.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
diff --git a/spacy/lang/tl/lex_attrs.py b/spacy/lang/tl/lex_attrs.py
index 61dc9d4f3..60bdc923b 100644
--- a/spacy/lang/tl/lex_attrs.py
+++ b/spacy/lang/tl/lex_attrs.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...attrs import LIKE_NUM
diff --git a/spacy/lang/tl/stop_words.py b/spacy/lang/tl/stop_words.py
index 510b3a418..2560cdaed 100644
--- a/spacy/lang/tl/stop_words.py
+++ b/spacy/lang/tl/stop_words.py
@@ -1,6 +1,3 @@
-# encoding: utf8
-from __future__ import unicode_literals
-
STOP_WORDS = set(
"""
akin
diff --git a/spacy/lang/tl/tokenizer_exceptions.py b/spacy/lang/tl/tokenizer_exceptions.py
index 77e1fb0c6..ea14746c4 100644
--- a/spacy/lang/tl/tokenizer_exceptions.py
+++ b/spacy/lang/tl/tokenizer_exceptions.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...symbols import ORTH, LEMMA
diff --git a/spacy/lang/tokenizer_exceptions.py b/spacy/lang/tokenizer_exceptions.py
index 3ea2bc3e9..13a1033a6 100644
--- a/spacy/lang/tokenizer_exceptions.py
+++ b/spacy/lang/tokenizer_exceptions.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import re
from ..symbols import ORTH, POS, TAG, LEMMA, SPACE
diff --git a/spacy/lang/tr/__init__.py b/spacy/lang/tr/__init__.py
index 2553e7c0f..a29d78261 100644
--- a/spacy/lang/tr/__init__.py
+++ b/spacy/lang/tr/__init__.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
diff --git a/spacy/lang/tr/examples.py b/spacy/lang/tr/examples.py
index a0464dfe3..a14d87a46 100644
--- a/spacy/lang/tr/examples.py
+++ b/spacy/lang/tr/examples.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/tr/lex_attrs.py b/spacy/lang/tr/lex_attrs.py
index 93f26fc8e..3dbc1833a 100644
--- a/spacy/lang/tr/lex_attrs.py
+++ b/spacy/lang/tr/lex_attrs.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...attrs import LIKE_NUM
diff --git a/spacy/lang/tr/stop_words.py b/spacy/lang/tr/stop_words.py
index 65905499a..85dcff6a5 100644
--- a/spacy/lang/tr/stop_words.py
+++ b/spacy/lang/tr/stop_words.py
@@ -1,7 +1,3 @@
-# encoding: utf8
-from __future__ import unicode_literals
-
-
# Source: https://github.com/stopwords-iso/stopwords-tr
STOP_WORDS = set(
"""
diff --git a/spacy/lang/tr/tokenizer_exceptions.py b/spacy/lang/tr/tokenizer_exceptions.py
index f48e035d4..97f524a87 100644
--- a/spacy/lang/tr/tokenizer_exceptions.py
+++ b/spacy/lang/tr/tokenizer_exceptions.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...symbols import ORTH, NORM
_exc = {"sağol": [{ORTH: "sağ"}, {ORTH: "ol", NORM: "olun"}]}
diff --git a/spacy/lang/tt/__init__.py b/spacy/lang/tt/__init__.py
index 3655e6264..80574a70d 100644
--- a/spacy/lang/tt/__init__.py
+++ b/spacy/lang/tt/__init__.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES
from .stop_words import STOP_WORDS
diff --git a/spacy/lang/tt/examples.py b/spacy/lang/tt/examples.py
index ac668a0c2..723fcdd15 100644
--- a/spacy/lang/tt/examples.py
+++ b/spacy/lang/tt/examples.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.tt.examples import sentences
diff --git a/spacy/lang/tt/lex_attrs.py b/spacy/lang/tt/lex_attrs.py
index ad3d6b9eb..a2ae03061 100644
--- a/spacy/lang/tt/lex_attrs.py
+++ b/spacy/lang/tt/lex_attrs.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...attrs import LIKE_NUM
_num_words = [
diff --git a/spacy/lang/tt/punctuation.py b/spacy/lang/tt/punctuation.py
index 9ee66a59e..f644a8ccb 100644
--- a/spacy/lang/tt/punctuation.py
+++ b/spacy/lang/tt/punctuation.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ..char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER, CONCAT_QUOTES, HYPHENS
from ..char_classes import LIST_ELLIPSES, LIST_ICONS
diff --git a/spacy/lang/tt/stop_words.py b/spacy/lang/tt/stop_words.py
index 9f6e9bb86..44169b757 100644
--- a/spacy/lang/tt/stop_words.py
+++ b/spacy/lang/tt/stop_words.py
@@ -1,6 +1,3 @@
-# encoding: utf8
-from __future__ import unicode_literals
-
# Tatar stopwords are from https://github.com/aliiae/stopwords-tt
STOP_WORDS = set(
diff --git a/spacy/lang/tt/tokenizer_exceptions.py b/spacy/lang/tt/tokenizer_exceptions.py
index 89f7a990b..efe9e1fc0 100644
--- a/spacy/lang/tt/tokenizer_exceptions.py
+++ b/spacy/lang/tt/tokenizer_exceptions.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...symbols import ORTH, LEMMA, NORM
_exc = {}
diff --git a/spacy/lang/uk/__init__.py b/spacy/lang/uk/__init__.py
index e74ff2d86..51165112a 100644
--- a/spacy/lang/uk/__init__.py
+++ b/spacy/lang/uk/__init__.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
diff --git a/spacy/lang/uk/examples.py b/spacy/lang/uk/examples.py
index 4f2b034eb..d17768ea6 100644
--- a/spacy/lang/uk/examples.py
+++ b/spacy/lang/uk/examples.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/uk/lemmatizer.py b/spacy/lang/uk/lemmatizer.py
index 3eeed5dd4..ff61d711f 100644
--- a/spacy/lang/uk/lemmatizer.py
+++ b/spacy/lang/uk/lemmatizer.py
@@ -1,4 +1,3 @@
-# coding: utf8
from ...symbols import ADJ, DET, NOUN, NUM, PRON, PROPN, PUNCT, VERB, POS
from ...lemmatizer import Lemmatizer
diff --git a/spacy/lang/uk/lex_attrs.py b/spacy/lang/uk/lex_attrs.py
index 0ade751d6..510e5b85d 100644
--- a/spacy/lang/uk/lex_attrs.py
+++ b/spacy/lang/uk/lex_attrs.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...attrs import LIKE_NUM
_num_words = [
diff --git a/spacy/lang/uk/stop_words.py b/spacy/lang/uk/stop_words.py
index cdf24dd70..b11d7a044 100644
--- a/spacy/lang/uk/stop_words.py
+++ b/spacy/lang/uk/stop_words.py
@@ -1,7 +1,3 @@
-# encoding: utf8
-from __future__ import unicode_literals
-
-
STOP_WORDS = set(
"""а
або
diff --git a/spacy/lang/uk/tag_map.py b/spacy/lang/uk/tag_map.py
index 472e772ef..1ecbddc49 100644
--- a/spacy/lang/uk/tag_map.py
+++ b/spacy/lang/uk/tag_map.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ..symbols import POS, ADV, NOUN, ADP, PRON, SCONJ, PROPN, DET, SYM, INTJ
from ..symbols import PUNCT, NUM, AUX, X, CONJ, ADJ, VERB, PART, SPACE, CCONJ
diff --git a/spacy/lang/uk/tokenizer_exceptions.py b/spacy/lang/uk/tokenizer_exceptions.py
index a94d77af3..36f0b2e72 100644
--- a/spacy/lang/uk/tokenizer_exceptions.py
+++ b/spacy/lang/uk/tokenizer_exceptions.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...symbols import ORTH, LEMMA, POS, NORM, NOUN
diff --git a/spacy/lang/ur/__init__.py b/spacy/lang/ur/__init__.py
index 6eea0cf3b..c7f65adc3 100644
--- a/spacy/lang/ur/__init__.py
+++ b/spacy/lang/ur/__init__.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_SUFFIXES
diff --git a/spacy/lang/ur/examples.py b/spacy/lang/ur/examples.py
index f47c11600..7024483b5 100644
--- a/spacy/lang/ur/examples.py
+++ b/spacy/lang/ur/examples.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/ur/lex_attrs.py b/spacy/lang/ur/lex_attrs.py
index 12d85be4b..e590ed3e3 100644
--- a/spacy/lang/ur/lex_attrs.py
+++ b/spacy/lang/ur/lex_attrs.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...attrs import LIKE_NUM
# Source https://quizlet.com/4271889/1-100-urdu-number-wordsurdu-numerals-flash-cards/
diff --git a/spacy/lang/ur/punctuation.py b/spacy/lang/ur/punctuation.py
index b8b1a1c83..5d35d0a25 100644
--- a/spacy/lang/ur/punctuation.py
+++ b/spacy/lang/ur/punctuation.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ..punctuation import TOKENIZER_SUFFIXES
diff --git a/spacy/lang/ur/stop_words.py b/spacy/lang/ur/stop_words.py
index 73c159d5c..abfa36497 100644
--- a/spacy/lang/ur/stop_words.py
+++ b/spacy/lang/ur/stop_words.py
@@ -1,6 +1,3 @@
-# encoding: utf8
-from __future__ import unicode_literals
-
# Source: collected from different resource on internet
STOP_WORDS = set(
"""
diff --git a/spacy/lang/ur/tag_map.py b/spacy/lang/ur/tag_map.py
index 2499d7e3e..e0940edb7 100644
--- a/spacy/lang/ur/tag_map.py
+++ b/spacy/lang/ur/tag_map.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...symbols import POS, PUNCT, SYM, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB
from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON
diff --git a/spacy/lang/vi/__init__.py b/spacy/lang/vi/__init__.py
index 425f84e3d..7496763ee 100644
--- a/spacy/lang/vi/__init__.py
+++ b/spacy/lang/vi/__init__.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...attrs import LANG, NORM
from ..norm_exceptions import BASE_NORMS
from ...language import Language
diff --git a/spacy/lang/vi/lex_attrs.py b/spacy/lang/vi/lex_attrs.py
index b6cd1188a..b3dbf2192 100644
--- a/spacy/lang/vi/lex_attrs.py
+++ b/spacy/lang/vi/lex_attrs.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...attrs import LIKE_NUM
diff --git a/spacy/lang/vi/stop_words.py b/spacy/lang/vi/stop_words.py
index 13284dc59..1d2ecdf8d 100644
--- a/spacy/lang/vi/stop_words.py
+++ b/spacy/lang/vi/stop_words.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
# Source: https://github.com/stopwords/vietnamese-stopwords
STOP_WORDS = set(
"""
diff --git a/spacy/lang/vi/tag_map.py b/spacy/lang/vi/tag_map.py
index 472e772ef..1ecbddc49 100644
--- a/spacy/lang/vi/tag_map.py
+++ b/spacy/lang/vi/tag_map.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ..symbols import POS, ADV, NOUN, ADP, PRON, SCONJ, PROPN, DET, SYM, INTJ
from ..symbols import PUNCT, NUM, AUX, X, CONJ, ADJ, VERB, PART, SPACE, CCONJ
diff --git a/spacy/lang/xx/__init__.py b/spacy/lang/xx/__init__.py
index 66d8c7917..2af650703 100644
--- a/spacy/lang/xx/__init__.py
+++ b/spacy/lang/xx/__init__.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
diff --git a/spacy/lang/xx/examples.py b/spacy/lang/xx/examples.py
index 38cd5e0cd..15f5c4ff8 100644
--- a/spacy/lang/xx/examples.py
+++ b/spacy/lang/xx/examples.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/yo/__init__.py b/spacy/lang/yo/__init__.py
index f227203cc..08e3166e1 100644
--- a/spacy/lang/yo/__init__.py
+++ b/spacy/lang/yo/__init__.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
diff --git a/spacy/lang/yo/examples.py b/spacy/lang/yo/examples.py
index 170ddc803..9b875d09e 100644
--- a/spacy/lang/yo/examples.py
+++ b/spacy/lang/yo/examples.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/yo/lex_attrs.py b/spacy/lang/yo/lex_attrs.py
index a9f1b85f6..ead68ced2 100644
--- a/spacy/lang/yo/lex_attrs.py
+++ b/spacy/lang/yo/lex_attrs.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import unicodedata
from ...attrs import LIKE_NUM
diff --git a/spacy/lang/yo/stop_words.py b/spacy/lang/yo/stop_words.py
index 53d382ad3..5c7a7fc45 100644
--- a/spacy/lang/yo/stop_words.py
+++ b/spacy/lang/yo/stop_words.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
# stop words as whitespace-separated list.
# Source: https://raw.githubusercontent.com/dohliam/more-stoplists/master/yo/yo.txt
diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py
index 8179b4551..e427dc6d2 100644
--- a/spacy/lang/zh/__init__.py
+++ b/spacy/lang/zh/__init__.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...attrs import LANG
from ...language import Language
from ...tokens import Doc
diff --git a/spacy/lang/zh/examples.py b/spacy/lang/zh/examples.py
index b28215741..d0715eb0d 100644
--- a/spacy/lang/zh/examples.py
+++ b/spacy/lang/zh/examples.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/zh/lex_attrs.py b/spacy/lang/zh/lex_attrs.py
index 0b29c226e..08c8e3160 100644
--- a/spacy/lang/zh/lex_attrs.py
+++ b/spacy/lang/zh/lex_attrs.py
@@ -1,8 +1,8 @@
-# coding: utf8
-from __future__ import unicode_literals
import re
+
from ...attrs import LIKE_NUM
+
_single_num_words = [
"〇",
"一",
diff --git a/spacy/lang/zh/stop_words.py b/spacy/lang/zh/stop_words.py
index 0af4c1859..42ae4a1de 100644
--- a/spacy/lang/zh/stop_words.py
+++ b/spacy/lang/zh/stop_words.py
@@ -1,7 +1,3 @@
-# encoding: utf8
-from __future__ import unicode_literals
-
-
# stop words as whitespace-separated list
# Chinese stop words,maybe not enough
STOP_WORDS = set(
diff --git a/spacy/lang/zh/tag_map.py b/spacy/lang/zh/tag_map.py
index 41e2d2158..1ff0827be 100644
--- a/spacy/lang/zh/tag_map.py
+++ b/spacy/lang/zh/tag_map.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...symbols import POS, PUNCT, ADJ, SCONJ, CCONJ, NUM, DET, ADV, ADP, X
from ...symbols import NOUN, PART, INTJ, PRON, VERB, SPACE
diff --git a/spacy/language.py b/spacy/language.py
index 008b5559f..4a553bcaf 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1,14 +1,7 @@
-# coding: utf8
-from __future__ import absolute_import, unicode_literals
-
import random
import itertools
-
-from spacy.gold import Example
-from spacy.util import minibatch
import weakref
import functools
-from collections import OrderedDict
from contextlib import contextmanager
from copy import copy, deepcopy
from thinc.neural import Model
@@ -21,8 +14,7 @@ from .vocab import Vocab
from .lemmatizer import Lemmatizer
from .lookups import Lookups
from .analysis import analyze_pipes, analyze_all_pipes, validate_attrs
-from .compat import izip, basestring_, is_python2, class_types
-from .gold import GoldParse
+from .gold import Example
from .scorer import Scorer
from ._ml import link_vectors_to_models, create_default_optimizer
from .attrs import IS_STOP, LANG
@@ -32,7 +24,7 @@ from .lang.tokenizer_exceptions import TOKEN_MATCH
from .lang.tag_map import TAG_MAP
from .tokens import Doc
from .lang.lex_attrs import LEX_ATTRS, is_stop
-from .errors import Errors, Warnings, deprecation_warning, user_warning
+from .errors import Errors, Warnings, deprecation_warning
from . import util
from . import about
@@ -190,7 +182,7 @@ class Language(object):
self._meta.setdefault("lang", self.lang)
self._meta.setdefault("name", "model")
self._meta.setdefault("version", "0.0.0")
- self._meta.setdefault("spacy_version", ">={}".format(about.__version__))
+ self._meta.setdefault("spacy_version", f">={about.__version__}")
self._meta.setdefault("description", "")
self._meta.setdefault("author", "")
self._meta.setdefault("email", "")
@@ -263,7 +255,7 @@ class Language(object):
RETURNS (dict): Labels keyed by component name.
"""
- labels = OrderedDict()
+ labels = {}
for name, pipe in self.pipeline:
if hasattr(pipe, "labels"):
labels[name] = list(pipe.labels)
@@ -320,7 +312,7 @@ class Language(object):
"""
if not hasattr(component, "__call__"):
msg = Errors.E003.format(component=repr(component), name=name)
- if isinstance(component, basestring_) and component in self.factories:
+ if isinstance(component, str) and component in self.factories:
msg += Errors.E004.format(component=component)
raise ValueError(msg)
if name is None:
@@ -372,7 +364,7 @@ class Language(object):
raise ValueError(Errors.E001.format(name=name, opts=self.pipe_names))
if not hasattr(component, "__call__"):
msg = Errors.E003.format(component=repr(component), name=name)
- if isinstance(component, basestring_) and component in self.factories:
+ if isinstance(component, str) and component in self.factories:
msg += Errors.E135.format(name=name)
raise ValueError(msg)
self.pipeline[self.pipe_names.index(name)] = (name, component)
@@ -476,6 +468,7 @@ class Language(object):
sgd = self._optimizer
grads = {}
+
def get_grads(W, dW, key=None):
grads[key] = (W, dW)
@@ -725,9 +718,6 @@ class Language(object):
"""
# raw_texts will be used later to stop iterator.
texts, raw_texts = itertools.tee(texts)
- if is_python2 and n_process != 1:
- user_warning(Warnings.W023)
- n_process = 1
if n_threads != -1:
deprecation_warning(Warnings.W016)
if n_process == -1:
@@ -744,7 +734,7 @@ class Language(object):
component_cfg=component_cfg,
as_example=False
)
- for doc, context in izip(docs, contexts):
+ for doc, context in zip(docs, contexts):
yield (doc, context)
return
if component_cfg is None:
@@ -814,7 +804,7 @@ class Language(object):
*[mp.Pipe(False) for _ in range(n_process)]
)
- batch_texts = minibatch(texts, batch_size)
+ batch_texts = util.minibatch(texts, batch_size)
# Sender sends texts to the workers.
# This is necessary to properly handle infinite length of texts.
# (In this case, all data cannot be sent to the workers at once)
@@ -858,7 +848,7 @@ class Language(object):
deprecation_warning(Warnings.W014)
exclude = disable
path = util.ensure_path(path)
- serializers = OrderedDict()
+ serializers = {}
serializers["tokenizer"] = lambda p: self.tokenizer.to_disk(
p, exclude=["vocab"]
)
@@ -891,7 +881,7 @@ class Language(object):
deprecation_warning(Warnings.W014)
exclude = disable
path = util.ensure_path(path)
- deserializers = OrderedDict()
+ deserializers = {}
deserializers["meta.json"] = lambda p: self.meta.update(srsly.read_json(p))
deserializers["vocab"] = lambda p: self.vocab.from_disk(
p
@@ -925,7 +915,7 @@ class Language(object):
if disable is not None:
deprecation_warning(Warnings.W014)
exclude = disable
- serializers = OrderedDict()
+ serializers = {}
serializers["vocab"] = lambda: self.vocab.to_bytes()
serializers["tokenizer"] = lambda: self.tokenizer.to_bytes(exclude=["vocab"])
serializers["meta.json"] = lambda: srsly.json_dumps(self.meta)
@@ -950,7 +940,7 @@ class Language(object):
if disable is not None:
deprecation_warning(Warnings.W014)
exclude = disable
- deserializers = OrderedDict()
+ deserializers = {}
deserializers["meta.json"] = lambda b: self.meta.update(srsly.json_loads(b))
deserializers["vocab"] = lambda b: self.vocab.from_bytes(
b
@@ -1009,7 +999,7 @@ class component(object):
def factory(nlp, **cfg):
if hasattr(obj, "from_nlp"):
return obj.from_nlp(nlp, **cfg)
- elif isinstance(obj, class_types):
+ elif isinstance(obj, type):
return obj()
return obj
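
The spacy/language.py hunks above replace the old Python 2 compat shims with built-ins. A minimal sketch of the equivalent Python 3 idioms, with placeholder values standing in for the real version string and pipeline names:

    # Hedged sketch of the replacements applied above; values are illustrative.
    version = "2.2.0"  # placeholder, not the real about.__version__
    meta = {}
    meta.setdefault("spacy_version", f">={version}")  # f-string replaces str.format
    assert meta["spacy_version"] == ">=2.2.0"

    labels = {}  # plain dicts preserve insertion order on Python 3.7+
    for name in ("tagger", "parser"):
        labels[name] = []
    assert list(labels) == ["tagger", "parser"]

    docs, contexts = ["doc0", "doc1"], ["ctx0", "ctx1"]
    pairs = list(zip(docs, contexts))   # built-in zip replaces compat.izip
    assert pairs[0] == ("doc0", "ctx0")

    assert isinstance("textcat", str)   # str replaces compat.basestring_
    assert isinstance(dict, type)       # type replaces compat.class_types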
diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py
index d70e4cfc4..3ba86c169 100644
--- a/spacy/lemmatizer.py
+++ b/spacy/lemmatizer.py
@@ -1,8 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-from collections import OrderedDict
-
from .symbols import NOUN, VERB, ADJ, PUNCT, PROPN
from .errors import Errors
from .lookups import Lookups
@@ -160,7 +155,7 @@ class Lemmatizer(object):
else:
oov_forms.append(form)
# Remove duplicates but preserve the ordering of applied "rules"
- forms = list(OrderedDict.fromkeys(forms))
+ forms = list(dict.fromkeys(forms))
# Put exceptions at the front of the list, so they get priority.
# This is a dodgy heuristic -- but it's the best we can do until we get
# frequencies on this. We can at least prune out problematic exceptions,
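
The lemmatizer change relies on plain dicts preserving insertion order (guaranteed since Python 3.7), so dict.fromkeys can de-duplicate while keeping the order in which rules were applied. A small sketch with made-up forms:

    # Order-preserving de-duplication, as in the hunk above.
    forms = ["foci", "focus", "foci", "focuses"]
    deduped = list(dict.fromkeys(forms))  # keeps first occurrence of each form
    assert deduped == ["foci", "focus", "focuses"]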
diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx
index 5c981bc25..497e20516 100644
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@@ -1,7 +1,4 @@
# cython: embedsignature=True
-# coding: utf8
-from __future__ import unicode_literals, print_function
-
# Compiler crashes on memory view coercion without this. Should report bug.
from cython.view cimport array as cvarray
from libc.string cimport memset
diff --git a/spacy/lookups.py b/spacy/lookups.py
index bf250b4b4..a9d371b79 100644
--- a/spacy/lookups.py
+++ b/spacy/lookups.py
@@ -1,9 +1,6 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import srsly
-from collections import OrderedDict
from preshed.bloom import BloomFilter
+from collections import OrderedDict
from .errors import Errors
from .util import SimpleFrozenDict, ensure_path
@@ -28,7 +25,7 @@ class Lookups(object):
DOCS: https://spacy.io/api/lookups#init
"""
- self._tables = OrderedDict()
+ self._tables = {}
def __contains__(self, name):
"""Check if the lookups contain a table of a given name. Delegates to
@@ -118,7 +115,7 @@ class Lookups(object):
DOCS: https://spacy.io/api/lookups#from_bytes
"""
- self._tables = OrderedDict()
+ self._tables = {}
for key, value in srsly.msgpack_loads(bytes_data).items():
self._tables[key] = Table(key)
self._tables[key].update(value)
@@ -254,12 +251,12 @@ class Table(OrderedDict):
DOCS: https://spacy.io/api/lookups#table.to_bytes
"""
- data = [
- ("name", self.name),
- ("dict", dict(self.items())),
- ("bloom", self.bloom.to_bytes()),
- ]
- return srsly.msgpack_dumps(OrderedDict(data))
+ data = {
+ "name": self.name,
+ "dict": dict(self.items()),
+ "bloom": self.bloom.to_bytes(),
+ }
+ return srsly.msgpack_dumps(data)
def from_bytes(self, bytes_data):
"""Load a table from a bytestring.
diff --git a/spacy/matcher/__init__.py b/spacy/matcher/__init__.py
index 91874ed43..286844787 100644
--- a/spacy/matcher/__init__.py
+++ b/spacy/matcher/__init__.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from .matcher import Matcher
from .phrasematcher import PhraseMatcher
from .dependencymatcher import DependencyMatcher
diff --git a/spacy/matcher/_schemas.py b/spacy/matcher/_schemas.py
index 1b10f0dd5..ce6379c45 100644
--- a/spacy/matcher/_schemas.py
+++ b/spacy/matcher/_schemas.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
TOKEN_PATTERN_SCHEMA = {
"$schema": "http://json-schema.org/draft-06/schema",
diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx
index 56d27024d..46cff0d0c 100644
--- a/spacy/matcher/dependencymatcher.pyx
+++ b/spacy/matcher/dependencymatcher.pyx
@@ -1,7 +1,5 @@
# cython: infer_types=True
# cython: profile=True
-from __future__ import unicode_literals
-
from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx
index 30ef3dd36..2908ab0c2 100644
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@@ -1,7 +1,5 @@
# cython: infer_types=True
# cython: profile=True
-from __future__ import unicode_literals
-
from libcpp.vector cimport vector
from libc.stdint cimport int32_t
from cymem.cymem cimport Pool
diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx
index 4de5782f9..20f45b9e4 100644
--- a/spacy/matcher/phrasematcher.pyx
+++ b/spacy/matcher/phrasematcher.pyx
@@ -1,7 +1,5 @@
# cython: infer_types=True
# cython: profile=True
-from __future__ import unicode_literals
-
from libc.stdint cimport uintptr_t
from preshed.maps cimport map_init, map_set, map_get, map_clear, map_iter
diff --git a/spacy/ml/__init__.py b/spacy/ml/__init__.py
index 57e7ef571..8eebf0564 100644
--- a/spacy/ml/__init__.py
+++ b/spacy/ml/__init__.py
@@ -1,5 +1,2 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from .tok2vec import Tok2Vec # noqa: F401
from .common import FeedForward, LayerNormalizedMaxout # noqa: F401
diff --git a/spacy/ml/_legacy_tok2vec.py b/spacy/ml/_legacy_tok2vec.py
index b077a46b7..e7baae380 100644
--- a/spacy/ml/_legacy_tok2vec.py
+++ b/spacy/ml/_legacy_tok2vec.py
@@ -1,5 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
from thinc.v2v import Model, Maxout
from thinc.i2v import HashEmbed, StaticVectors
from thinc.t2t import ExtractWindow
diff --git a/spacy/ml/_wire.py b/spacy/ml/_wire.py
index fa271b37c..2b1144fcb 100644
--- a/spacy/ml/_wire.py
+++ b/spacy/ml/_wire.py
@@ -1,4 +1,3 @@
-from __future__ import unicode_literals
from thinc.api import layerize, wrap, noop, chain, concatenate
from thinc.v2v import Model
diff --git a/spacy/ml/common.py b/spacy/ml/common.py
index f90b53a15..4ecb00e4e 100644
--- a/spacy/ml/common.py
+++ b/spacy/ml/common.py
@@ -1,5 +1,3 @@
-from __future__ import unicode_literals
-
from thinc.api import chain
from thinc.v2v import Maxout
from thinc.misc import LayerNorm
diff --git a/spacy/ml/tok2vec.py b/spacy/ml/tok2vec.py
index 8f86475ef..9a0ed6bf5 100644
--- a/spacy/ml/tok2vec.py
+++ b/spacy/ml/tok2vec.py
@@ -1,5 +1,3 @@
-from __future__ import unicode_literals
-
from thinc.api import chain, layerize, clone, concatenate, with_flatten, uniqued
from thinc.api import noop, with_square_sequences
from thinc.v2v import Maxout, Model
diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
index c146094a9..f12691170 100644
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -1,12 +1,8 @@
# cython: infer_types
-# coding: utf8
-from __future__ import unicode_literals
-
from libc.string cimport memset
import srsly
from collections import Counter
-from .compat import basestring_
from .strings import get_string_id
from . import symbols
from .attrs cimport POS, IS_SPACE
@@ -190,7 +186,7 @@ cdef class Morphology:
present. Returns the hash of the new analysis.
"""
for f in features:
- if isinstance(f, basestring_):
+ if isinstance(f, str):
self.strings.add(f)
string_features = features
features = intify_features(features)
diff --git a/spacy/parts_of_speech.pyx b/spacy/parts_of_speech.pyx
index 3925a6738..e71fb917f 100644
--- a/spacy/parts_of_speech.pyx
+++ b/spacy/parts_of_speech.pyx
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
IDS = {
"": NO_TAG,
diff --git a/spacy/pipeline/__init__.py b/spacy/pipeline/__init__.py
index de8403152..2f9824eda 100644
--- a/spacy/pipeline/__init__.py
+++ b/spacy/pipeline/__init__.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from .pipes import Tagger, DependencyParser, EntityRecognizer, EntityLinker
from .pipes import TextCategorizer, Tensorizer, Pipe, Sentencizer
from .pipes import SentenceRecognizer
diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py
index 2db312d64..75120dfe6 100644
--- a/spacy/pipeline/entityruler.py
+++ b/spacy/pipeline/entityruler.py
@@ -1,12 +1,8 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-from collections import defaultdict, OrderedDict
+from collections import defaultdict
import srsly
from ..language import component
from ..errors import Errors
-from ..compat import basestring_
from ..util import ensure_path, to_disk, from_disk
from ..tokens import Span
from ..matcher import Matcher, PhraseMatcher
@@ -201,7 +197,7 @@ class EntityRuler(object):
self._ent_ids[key] = (ent_label, entry["id"])
pattern = entry["pattern"]
- if isinstance(pattern, basestring_):
+ if isinstance(pattern, str):
self.phrase_patterns[label].append(self.nlp(pattern))
elif isinstance(pattern, list):
self.token_patterns[label].append(pattern)
@@ -230,8 +226,8 @@ class EntityRuler(object):
RETURNS (str): The ent_label joined with configured `ent_id_sep`
"""
- if isinstance(ent_id, basestring_):
- label = "{}{}{}".format(label, self.ent_id_sep, ent_id)
+ if isinstance(ent_id, str):
+ label = f"{label}{self.ent_id_sep}{ent_id}"
return label
def from_bytes(self, patterns_bytes, **kwargs):
@@ -264,15 +260,12 @@ class EntityRuler(object):
DOCS: https://spacy.io/api/entityruler#to_bytes
"""
-
- serial = OrderedDict(
- (
- ("overwrite", self.overwrite),
- ("ent_id_sep", self.ent_id_sep),
- ("phrase_matcher_attr", self.phrase_matcher_attr),
- ("patterns", self.patterns),
- )
- )
+ serial = {
+ "overwrite": self.overwrite,
+ "ent_id_sep": self.ent_id_sep,
+ "phrase_matcher_attr": self.phrase_matcher_attr,
+ "patterns": self.patterns,
+ }
return srsly.msgpack_dumps(serial)
def from_disk(self, path, **kwargs):
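
The EntityRuler hunk swaps str.format for an f-string when joining a label with its entity ID. A minimal sketch, with an illustrative separator and ID:

    ent_id_sep = "||"          # illustrative separator value
    label, ent_id = "ORG", "Q95"
    if isinstance(ent_id, str):  # str replaces compat.basestring_
        label = f"{label}{ent_id_sep}{ent_id}"
    assert label == "ORG||Q95"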
diff --git a/spacy/pipeline/functions.py b/spacy/pipeline/functions.py
index 69e638da2..6e9d4197c 100644
--- a/spacy/pipeline/functions.py
+++ b/spacy/pipeline/functions.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ..language import component
from ..matcher import Matcher
from ..util import filter_spans
diff --git a/spacy/pipeline/hooks.py b/spacy/pipeline/hooks.py
index b61a34c0e..68385c5a9 100644
--- a/spacy/pipeline/hooks.py
+++ b/spacy/pipeline/hooks.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from thinc.t2v import Pooling, max_pool, mean_pool
from thinc.neural._classes.difference import Siamese, CauchySimilarity
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index adcff9280..10038d410 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -1,5 +1,4 @@
-from __future__ import unicode_literals
-from collections import OrderedDict, defaultdict
+from collections import defaultdict
import numpy
cimport numpy as np
@@ -13,7 +12,6 @@ from .._ml import Tok2Vec, build_morphologizer_model
from .._ml import link_vectors_to_models, zero_init, flatten
from .._ml import create_default_optimizer
from ..errors import Errors, TempErrors
-from ..compat import basestring_
from ..tokens.doc cimport Doc
from ..vocab cimport Vocab
from ..morphology cimport Morphology
@@ -32,7 +30,7 @@ class Morphologizer(Pipe):
def __init__(self, vocab, model=True, **cfg):
self.vocab = vocab
self.model = model
- self.cfg = OrderedDict(sorted(cfg.items()))
+ self.cfg = dict(sorted(cfg.items()))
self.cfg.setdefault('cnn_maxout_pieces', 2)
self._class_map = self.vocab.morphology.create_class_map()
diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index b041e2441..ff88340cd 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -1,12 +1,8 @@
# cython: infer_types=True
# cython: profile=True
-# coding: utf8
-from __future__ import unicode_literals
-
import numpy
import srsly
import random
-from collections import OrderedDict
from thinc.api import chain
from thinc.v2v import Affine, Maxout, Softmax
from thinc.misc import LayerNorm
@@ -24,7 +20,6 @@ from .functions import merge_subtokens
from ..language import Language, component
from ..syntax import nonproj
from ..gold import Example
-from ..compat import basestring_
from ..attrs import POS, ID
from ..parts_of_speech import X
from ..kb import KnowledgeBase
@@ -183,7 +178,7 @@ class Pipe(object):
exclude (list): String names of serialization fields to exclude.
RETURNS (bytes): The serialized object.
"""
- serialize = OrderedDict()
+ serialize = {}
serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
if self.model not in (True, False, None):
serialize["model"] = self.model.to_bytes
@@ -206,7 +201,7 @@ class Pipe(object):
except AttributeError:
raise ValueError(Errors.E149)
- deserialize = OrderedDict()
+ deserialize = {}
deserialize["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b))
if hasattr(self, "vocab"):
deserialize["vocab"] = lambda b: self.vocab.from_bytes(b)
@@ -217,7 +212,7 @@ class Pipe(object):
def to_disk(self, path, exclude=tuple(), **kwargs):
"""Serialize the pipe to disk."""
- serialize = OrderedDict()
+ serialize = {}
serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
serialize["vocab"] = lambda p: self.vocab.to_disk(p)
if self.model not in (None, True, False):
@@ -239,7 +234,7 @@ class Pipe(object):
except AttributeError:
raise ValueError(Errors.E149)
- deserialize = OrderedDict()
+ deserialize = {}
deserialize["cfg"] = lambda p: self.cfg.update(_load_cfg(p))
deserialize["vocab"] = lambda p: self.vocab.from_disk(p)
deserialize["model"] = load_model
@@ -409,7 +404,7 @@ class Tagger(Pipe):
self.vocab = vocab
self.model = model
self._rehearsal_model = None
- self.cfg = OrderedDict(sorted(cfg.items()))
+ self.cfg = dict(sorted(cfg.items()))
self.cfg.setdefault("cnn_maxout_pieces", 2)
@property
@@ -564,7 +559,7 @@ class Tagger(Pipe):
if not any(table in self.vocab.lookups for table in lemma_tables):
user_warning(Warnings.W022)
orig_tag_map = dict(self.vocab.morphology.tag_map)
- new_tag_map = OrderedDict()
+ new_tag_map = {}
for example in get_examples():
for tag in example.token_annotation.tags:
if tag in orig_tag_map:
@@ -594,7 +589,7 @@ class Tagger(Pipe):
return build_tagger_model(n_tags, **cfg)
def add_label(self, label, values=None):
- if not isinstance(label, basestring_):
+ if not isinstance(label, str):
raise ValueError(Errors.E187)
if label in self.labels:
return 0
@@ -624,12 +619,12 @@ class Tagger(Pipe):
yield
def to_bytes(self, exclude=tuple(), **kwargs):
- serialize = OrderedDict()
+ serialize = {}
if self.model not in (None, True, False):
serialize["model"] = self.model.to_bytes
serialize["vocab"] = self.vocab.to_bytes
serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
- tag_map = OrderedDict(sorted(self.vocab.morphology.tag_map.items()))
+ tag_map = dict(sorted(self.vocab.morphology.tag_map.items()))
serialize["tag_map"] = lambda: srsly.msgpack_dumps(tag_map)
exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
return util.to_bytes(serialize, exclude)
@@ -656,24 +651,24 @@ class Tagger(Pipe):
lemmatizer=self.vocab.morphology.lemmatizer,
exc=self.vocab.morphology.exc)
- deserialize = OrderedDict((
- ("vocab", lambda b: self.vocab.from_bytes(b)),
- ("tag_map", load_tag_map),
- ("cfg", lambda b: self.cfg.update(srsly.json_loads(b))),
- ("model", lambda b: load_model(b)),
- ))
+ deserialize = {
+ "vocab": lambda b: self.vocab.from_bytes(b),
+ "tag_map": load_tag_map,
+ "cfg": lambda b: self.cfg.update(srsly.json_loads(b)),
+ "model": lambda b: load_model(b),
+ }
exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
util.from_bytes(bytes_data, deserialize, exclude)
return self
def to_disk(self, path, exclude=tuple(), **kwargs):
- tag_map = OrderedDict(sorted(self.vocab.morphology.tag_map.items()))
- serialize = OrderedDict((
- ("vocab", lambda p: self.vocab.to_disk(p)),
- ("tag_map", lambda p: srsly.write_msgpack(p, tag_map)),
- ("model", lambda p: p.open("wb").write(self.model.to_bytes())),
- ("cfg", lambda p: srsly.write_json(p, self.cfg))
- ))
+ tag_map = dict(sorted(self.vocab.morphology.tag_map.items()))
+ serialize = {
+ "vocab": lambda p: self.vocab.to_disk(p),
+ "tag_map": lambda p: srsly.write_msgpack(p, tag_map),
+ "model": lambda p: p.open("wb").write(self.model.to_bytes()),
+ "cfg": lambda p: srsly.write_json(p, self.cfg)
+ }
exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
util.to_disk(path, serialize, exclude)
@@ -697,12 +692,12 @@ class Tagger(Pipe):
lemmatizer=self.vocab.morphology.lemmatizer,
exc=self.vocab.morphology.exc)
- deserialize = OrderedDict((
- ("cfg", lambda p: self.cfg.update(_load_cfg(p))),
- ("vocab", lambda p: self.vocab.from_disk(p)),
- ("tag_map", load_tag_map),
- ("model", load_model),
- ))
+ deserialize = {
+ "cfg": lambda p: self.cfg.update(_load_cfg(p)),
+ "vocab": lambda p: self.vocab.from_disk(p),
+ "tag_map": load_tag_map,
+ "model": load_model,
+ }
exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
util.from_disk(path, deserialize, exclude)
return self
@@ -719,7 +714,7 @@ class SentenceRecognizer(Tagger):
self.vocab = vocab
self.model = model
self._rehearsal_model = None
- self.cfg = OrderedDict(sorted(cfg.items()))
+ self.cfg = dict(sorted(cfg.items()))
self.cfg.setdefault("cnn_maxout_pieces", 2)
self.cfg.setdefault("subword_features", True)
self.cfg.setdefault("token_vector_width", 12)
@@ -816,7 +811,7 @@ class SentenceRecognizer(Tagger):
yield
def to_bytes(self, exclude=tuple(), **kwargs):
- serialize = OrderedDict()
+ serialize = {}
if self.model not in (None, True, False):
serialize["model"] = self.model.to_bytes
serialize["vocab"] = self.vocab.to_bytes
@@ -833,21 +828,21 @@ class SentenceRecognizer(Tagger):
except AttributeError:
raise ValueError(Errors.E149)
- deserialize = OrderedDict((
- ("vocab", lambda b: self.vocab.from_bytes(b)),
- ("cfg", lambda b: self.cfg.update(srsly.json_loads(b))),
- ("model", lambda b: load_model(b)),
- ))
+ deserialize = {
+ "vocab": lambda b: self.vocab.from_bytes(b),
+ "cfg": lambda b: self.cfg.update(srsly.json_loads(b)),
+ "model": lambda b: load_model(b),
+ }
exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
util.from_bytes(bytes_data, deserialize, exclude)
return self
def to_disk(self, path, exclude=tuple(), **kwargs):
- serialize = OrderedDict((
- ("vocab", lambda p: self.vocab.to_disk(p)),
- ("model", lambda p: p.open("wb").write(self.model.to_bytes())),
- ("cfg", lambda p: srsly.write_json(p, self.cfg))
- ))
+ serialize = {
+ "vocab": lambda p: self.vocab.to_disk(p),
+ "model": lambda p: p.open("wb").write(self.model.to_bytes()),
+ "cfg": lambda p: srsly.write_json(p, self.cfg)
+ }
exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
util.to_disk(path, serialize, exclude)
@@ -861,11 +856,11 @@ class SentenceRecognizer(Tagger):
except AttributeError:
raise ValueError(Errors.E149)
- deserialize = OrderedDict((
- ("cfg", lambda p: self.cfg.update(_load_cfg(p))),
- ("vocab", lambda p: self.vocab.from_disk(p)),
- ("model", load_model),
- ))
+ deserialize = {
+ "cfg": lambda p: self.cfg.update(_load_cfg(p)),
+ "vocab": lambda p: self.vocab.from_disk(p),
+ "model": load_model,
+ }
exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
util.from_disk(path, deserialize, exclude)
return self
@@ -1241,7 +1236,7 @@ class TextCategorizer(Pipe):
return float(mean_square_error), d_scores
def add_label(self, label):
- if not isinstance(label, basestring_):
+ if not isinstance(label, str):
raise ValueError(Errors.E187)
if label in self.labels:
return 0
@@ -1614,7 +1609,7 @@ class EntityLinker(Pipe):
token.ent_kb_id_ = kb_id
def to_disk(self, path, exclude=tuple(), **kwargs):
- serialize = OrderedDict()
+ serialize = {}
serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
serialize["vocab"] = lambda p: self.vocab.to_disk(p)
serialize["kb"] = lambda p: self.kb.dump(p)
@@ -1637,7 +1632,7 @@ class EntityLinker(Pipe):
kb.load_bulk(p)
self.set_kb(kb)
- deserialize = OrderedDict()
+ deserialize = {}
deserialize["cfg"] = lambda p: self.cfg.update(_load_cfg(p))
deserialize["vocab"] = lambda p: self.vocab.from_disk(p)
deserialize["kb"] = load_kb
diff --git a/spacy/scorer.py b/spacy/scorer.py
index 6238b6ead..82b10a77d 100644
--- a/spacy/scorer.py
+++ b/spacy/scorer.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import division, print_function, unicode_literals
-
import numpy as np
from .gold import tags_to_entities, GoldParse, DocAnnotation
diff --git a/spacy/strings.pyx b/spacy/strings.pyx
index f3457e1a5..0605de96c 100644
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@@ -1,7 +1,4 @@
# cython: infer_types=True
-# coding: utf8
-from __future__ import unicode_literals, absolute_import
-
cimport cython
from libc.string cimport memcpy
from libcpp.set cimport set
@@ -9,7 +6,6 @@ from libc.stdint cimport uint32_t
from murmurhash.mrmr cimport hash64, hash32
import srsly
-from .compat import basestring_
from .symbols import IDS as SYMBOLS_BY_STR
from .symbols import NAMES as SYMBOLS_BY_INT
from .typedefs cimport hash_t
@@ -24,7 +20,7 @@ def get_string_id(key):
This function optimises for convenience over performance, so shouldn't be
used in tight loops.
"""
- if not isinstance(key, basestring_):
+ if not isinstance(key, str):
return key
elif key in SYMBOLS_BY_STR:
return SYMBOLS_BY_STR[key]
@@ -150,7 +146,7 @@ cdef class StringStore:
return key
else:
return self[key]
-
+
def add(self, string):
"""Add a string to the StringStore.
diff --git a/spacy/symbols.pyx b/spacy/symbols.pyx
index b65ae9628..85f23ccbc 100644
--- a/spacy/symbols.pyx
+++ b/spacy/symbols.pyx
@@ -1,8 +1,4 @@
-# coding: utf8
-#cython: optimize.unpack_method_calls=False
-from __future__ import unicode_literals
-
-
+# cython: optimize.unpack_method_calls=False
IDS = {
"": NIL,
"IS_ALPHA": IS_ALPHA,
diff --git a/spacy/syntax/_parser_model.pyx b/spacy/syntax/_parser_model.pyx
index 8b6448a46..19d05e77f 100644
--- a/spacy/syntax/_parser_model.pyx
+++ b/spacy/syntax/_parser_model.pyx
@@ -1,10 +1,6 @@
# cython: infer_types=True
# cython: cdivision=True
# cython: boundscheck=False
-# coding: utf-8
-from __future__ import unicode_literals, print_function
-
-from collections import OrderedDict
import numpy
cimport cython.parallel
import numpy.random
@@ -249,7 +245,7 @@ class ParserModel(Model):
def resize_output(self, new_output):
if len(self._layers) == 2:
- return
+ return
if new_output == self.upper.nO:
return
smaller = self.upper
@@ -485,7 +481,7 @@ cdef class precompute_hiddens:
ops = NumpyOps()
else:
ops = CupyOps()
-
+
if self.activation == "maxout":
state_vector, mask = ops.maxout(state_vector)
else:
diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx
index 45fd1170b..5ec169428 100644
--- a/spacy/syntax/arc_eager.pyx
+++ b/spacy/syntax/arc_eager.pyx
@@ -1,12 +1,9 @@
# cython: profile=True
# cython: cdivision=True
# cython: infer_types=True
-# coding: utf-8
-from __future__ import unicode_literals
-
from cpython.ref cimport Py_INCREF
from cymem.cymem cimport Pool
-from collections import OrderedDict, defaultdict, Counter
+from collections import defaultdict, Counter
from thinc.extra.search cimport Beam
import json
@@ -25,7 +22,7 @@ from ..tokens.doc cimport Doc, set_children_from_heads
# Calculate cost as gold/not gold. We don't use scalar value anyway.
cdef int BINARY_COSTS = 1
cdef weight_t MIN_SCORE = -90000
-cdef attr_t SUBTOK_LABEL = hash_string('subtok')
+cdef attr_t SUBTOK_LABEL = hash_string(u'subtok')
DEF NON_MONOTONIC = True
DEF USE_BREAK = True
diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx
index 7467aa342..5dfa20b7d 100644
--- a/spacy/syntax/ner.pyx
+++ b/spacy/syntax/ner.pyx
@@ -1,9 +1,6 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
from thinc.typedefs cimport weight_t
from thinc.extra.search cimport Beam
-from collections import OrderedDict, Counter
+from collections import Counter
from .stateclass cimport StateClass
from ._state cimport StateC
diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index c98baf6fd..14d9e54d4 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -1,10 +1,6 @@
# cython: infer_types=True
# cython: cdivision=True
# cython: boundscheck=False
-# coding: utf-8
-from __future__ import unicode_literals, print_function
-
-from collections import OrderedDict
import numpy
cimport cython.parallel
import numpy.random
@@ -692,22 +688,22 @@ cdef class Parser:
return self
def to_bytes(self, exclude=tuple(), **kwargs):
- serializers = OrderedDict((
- ('model', lambda: (self.model.to_bytes() if self.model is not True else True)),
- ('vocab', lambda: self.vocab.to_bytes()),
- ('moves', lambda: self.moves.to_bytes(exclude=["strings"])),
- ('cfg', lambda: srsly.json_dumps(self.cfg, indent=2, sort_keys=True))
- ))
+ serializers = {
+ "model": lambda: (self.model.to_bytes() if self.model is not True else True),
+ "vocab": lambda: self.vocab.to_bytes(),
+ "moves": lambda: self.moves.to_bytes(exclude=["strings"]),
+ "cfg": lambda: srsly.json_dumps(self.cfg, indent=2, sort_keys=True)
+ }
exclude = util.get_serialization_exclude(serializers, exclude, kwargs)
return util.to_bytes(serializers, exclude)
def from_bytes(self, bytes_data, exclude=tuple(), **kwargs):
- deserializers = OrderedDict((
- ('vocab', lambda b: self.vocab.from_bytes(b)),
- ('moves', lambda b: self.moves.from_bytes(b, exclude=["strings"])),
- ('cfg', lambda b: self.cfg.update(srsly.json_loads(b))),
- ('model', lambda b: None)
- ))
+ deserializers = {
+ "vocab": lambda b: self.vocab.from_bytes(b),
+ "moves": lambda b: self.moves.from_bytes(b, exclude=["strings"]),
+ "cfg": lambda b: self.cfg.update(srsly.json_loads(b)),
+ "model": lambda b: None
+ }
exclude = util.get_serialization_exclude(deserializers, exclude, kwargs)
msg = util.from_bytes(bytes_data, deserializers, exclude)
if 'model' not in exclude:
diff --git a/spacy/syntax/nonproj.pyx b/spacy/syntax/nonproj.pyx
index 2ec6b61ac..0f738f99f 100644
--- a/spacy/syntax/nonproj.pyx
+++ b/spacy/syntax/nonproj.pyx
@@ -1,12 +1,9 @@
-# coding: utf-8
# cython: profile=True
# cython: infer_types=True
"""Implements the projectivize/deprojectivize mechanism in Nivre & Nilsson 2005
for doing pseudo-projective parsing implementation uses the HEAD decoration
scheme.
"""
-from __future__ import unicode_literals
-
from copy import copy
from spacy.gold import Example
diff --git a/spacy/syntax/stateclass.pyx b/spacy/syntax/stateclass.pyx
index 2a15a2de1..47b37946c 100644
--- a/spacy/syntax/stateclass.pyx
+++ b/spacy/syntax/stateclass.pyx
@@ -1,7 +1,4 @@
-# coding: utf-8
# cython: infer_types=True
-from __future__ import unicode_literals
-
import numpy
from ..tokens.doc cimport Doc
diff --git a/spacy/syntax/transition_system.pyx b/spacy/syntax/transition_system.pyx
index 65097f114..62e369091 100644
--- a/spacy/syntax/transition_system.pyx
+++ b/spacy/syntax/transition_system.pyx
@@ -1,12 +1,9 @@
# cython: infer_types=True
-# coding: utf-8
-from __future__ import unicode_literals
-
from cpython.ref cimport Py_INCREF
from cymem.cymem cimport Pool
from thinc.typedefs cimport weight_t
from thinc.extra.search cimport Beam
-from collections import OrderedDict, Counter
+from collections import Counter
import srsly
from . cimport _beam_utils
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index 816970e61..ba7b67e25 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
from spacy.util import get_lang_class
diff --git a/spacy/tests/doc/test_add_entities.py b/spacy/tests/doc/test_add_entities.py
index 6c69e699a..766dcb739 100644
--- a/spacy/tests/doc/test_add_entities.py
+++ b/spacy/tests/doc/test_add_entities.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
from spacy.pipeline import EntityRecognizer
from spacy.tokens import Span
import pytest
diff --git a/spacy/tests/doc/test_array.py b/spacy/tests/doc/test_array.py
index 7b513cfab..6be6e3867 100644
--- a/spacy/tests/doc/test_array.py
+++ b/spacy/tests/doc/test_array.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
from spacy.tokens import Doc
from spacy.attrs import ORTH, SHAPE, POS, DEP
diff --git a/spacy/tests/doc/test_creation.py b/spacy/tests/doc/test_creation.py
index 120fb6e28..d986d160c 100644
--- a/spacy/tests/doc/test_creation.py
+++ b/spacy/tests/doc/test_creation.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
from spacy.vocab import Vocab
from spacy.tokens import Doc
diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py
index 86c7fbf72..41a060b7b 100644
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
import numpy
diff --git a/spacy/tests/doc/test_morphanalysis.py b/spacy/tests/doc/test_morphanalysis.py
index 5d570af53..67ebc06d6 100644
--- a/spacy/tests/doc/test_morphanalysis.py
+++ b/spacy/tests/doc/test_morphanalysis.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
diff --git a/spacy/tests/doc/test_pickle_doc.py b/spacy/tests/doc/test_pickle_doc.py
index 2b6970a38..28cb66714 100644
--- a/spacy/tests/doc/test_pickle_doc.py
+++ b/spacy/tests/doc/test_pickle_doc.py
@@ -1,8 +1,5 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
from spacy.language import Language
-from spacy.compat import pickle, unicode_
+from spacy.compat import pickle
def test_pickle_single_doc():
@@ -16,9 +13,9 @@ def test_pickle_single_doc():
def test_list_of_docs_pickles_efficiently():
nlp = Language()
for i in range(10000):
- _ = nlp.vocab[unicode_(i)] # noqa: F841
+ _ = nlp.vocab[str(i)] # noqa: F841
one_pickled = pickle.dumps(nlp("0"), -1)
- docs = list(nlp.pipe(unicode_(i) for i in range(100)))
+ docs = list(nlp.pipe(str(i) for i in range(100)))
many_pickled = pickle.dumps(docs, -1)
assert len(many_pickled) < (len(one_pickled) * 2)
many_unpickled = pickle.loads(many_pickled)
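
The pickling test now builds its throwaway vocab strings with the built-in str rather than the old compat.unicode_ alias; a trivial sketch:

    ids = [str(i) for i in range(3)]  # built-in str replaces compat.unicode_
    assert ids == ["0", "1", "2"]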
diff --git a/spacy/tests/doc/test_retokenize_merge.py b/spacy/tests/doc/test_retokenize_merge.py
index 5bdf78f39..c82c04eeb 100644
--- a/spacy/tests/doc/test_retokenize_merge.py
+++ b/spacy/tests/doc/test_retokenize_merge.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
from spacy.attrs import LEMMA
from spacy.vocab import Vocab
diff --git a/spacy/tests/doc/test_retokenize_split.py b/spacy/tests/doc/test_retokenize_split.py
index d074fddc6..33b6fbe81 100644
--- a/spacy/tests/doc/test_retokenize_split.py
+++ b/spacy/tests/doc/test_retokenize_split.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
from spacy.vocab import Vocab
from spacy.tokens import Doc, Token
diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py
index 01bb93c50..9fb552d44 100644
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
from spacy.attrs import ORTH, LENGTH
from spacy.tokens import Doc, Span
diff --git a/spacy/tests/doc/test_to_json.py b/spacy/tests/doc/test_to_json.py
index a063a6569..18243c306 100644
--- a/spacy/tests/doc/test_to_json.py
+++ b/spacy/tests/doc/test_to_json.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
from spacy.cli._schemas import TRAINING_SCHEMA
from spacy.util import get_json_validator, validate_json
diff --git a/spacy/tests/doc/test_token_api.py b/spacy/tests/doc/test_token_api.py
index bff2a95c6..cff1d3327 100644
--- a/spacy/tests/doc/test_token_api.py
+++ b/spacy/tests/doc/test_token_api.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
import numpy
from spacy.attrs import IS_ALPHA, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_TITLE, IS_STOP
diff --git a/spacy/tests/doc/test_underscore.py b/spacy/tests/doc/test_underscore.py
index 2877bfeea..352460581 100644
--- a/spacy/tests/doc/test_underscore.py
+++ b/spacy/tests/doc/test_underscore.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
from mock import Mock
from spacy.tokens import Doc, Span, Token
diff --git a/spacy/tests/lang/ar/test_exceptions.py b/spacy/tests/lang/ar/test_exceptions.py
index 3cfc380d2..125220caf 100644
--- a/spacy/tests/lang/ar/test_exceptions.py
+++ b/spacy/tests/lang/ar/test_exceptions.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
diff --git a/spacy/tests/lang/ar/test_text.py b/spacy/tests/lang/ar/test_text.py
index 109c3721a..f4a8cc1e3 100644
--- a/spacy/tests/lang/ar/test_text.py
+++ b/spacy/tests/lang/ar/test_text.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
def test_ar_tokenizer_handles_long_text(ar_tokenizer):
text = """نجيب محفوظ مؤلف و كاتب روائي عربي، يعد من أهم الأدباء العرب خلال القرن العشرين.
diff --git a/spacy/tests/lang/bn/test_tokenizer.py b/spacy/tests/lang/bn/test_tokenizer.py
index 62dd52778..5b18c5269 100644
--- a/spacy/tests/lang/bn/test_tokenizer.py
+++ b/spacy/tests/lang/bn/test_tokenizer.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import pytest
diff --git a/spacy/tests/lang/ca/test_exception.py b/spacy/tests/lang/ca/test_exception.py
index 56156c328..71098f094 100644
--- a/spacy/tests/lang/ca/test_exception.py
+++ b/spacy/tests/lang/ca/test_exception.py
@@ -1,7 +1,3 @@
-# coding: utf-8
-
-from __future__ import unicode_literals
-
import pytest
diff --git a/spacy/tests/lang/ca/test_prefix_suffix_infix.py b/spacy/tests/lang/ca/test_prefix_suffix_infix.py
index 4583a62b9..83a75f056 100644
--- a/spacy/tests/lang/ca/test_prefix_suffix_infix.py
+++ b/spacy/tests/lang/ca/test_prefix_suffix_infix.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
diff --git a/spacy/tests/lang/ca/test_text.py b/spacy/tests/lang/ca/test_text.py
index 1506016d4..38f5fc708 100644
--- a/spacy/tests/lang/ca/test_text.py
+++ b/spacy/tests/lang/ca/test_text.py
@@ -1,10 +1,4 @@
-# coding: utf-8
-
"""Test that longer and mixed texts are tokenized correctly."""
-
-
-from __future__ import unicode_literals
-
import pytest
diff --git a/spacy/tests/lang/da/test_exceptions.py b/spacy/tests/lang/da/test_exceptions.py
index a522ab5e8..603378ea7 100644
--- a/spacy/tests/lang/da/test_exceptions.py
+++ b/spacy/tests/lang/da/test_exceptions.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
diff --git a/spacy/tests/lang/da/test_prefix_suffix_infix.py b/spacy/tests/lang/da/test_prefix_suffix_infix.py
index 8b43bf360..e36b3cdb9 100644
--- a/spacy/tests/lang/da/test_prefix_suffix_infix.py
+++ b/spacy/tests/lang/da/test_prefix_suffix_infix.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
diff --git a/spacy/tests/lang/da/test_text.py b/spacy/tests/lang/da/test_text.py
index 07b134e2d..3c6cca5ac 100644
--- a/spacy/tests/lang/da/test_text.py
+++ b/spacy/tests/lang/da/test_text.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
from spacy.lang.da.lex_attrs import like_num
diff --git a/spacy/tests/lang/de/test_exceptions.py b/spacy/tests/lang/de/test_exceptions.py
index 2e065870e..a4614f6c4 100644
--- a/spacy/tests/lang/de/test_exceptions.py
+++ b/spacy/tests/lang/de/test_exceptions.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
diff --git a/spacy/tests/lang/de/test_parser.py b/spacy/tests/lang/de/test_parser.py
index 5c8694da3..c897dcf2f 100644
--- a/spacy/tests/lang/de/test_parser.py
+++ b/spacy/tests/lang/de/test_parser.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
from ...util import get_doc
diff --git a/spacy/tests/lang/de/test_prefix_suffix_infix.py b/spacy/tests/lang/de/test_prefix_suffix_infix.py
index 13e109395..82bd8ed69 100644
--- a/spacy/tests/lang/de/test_prefix_suffix_infix.py
+++ b/spacy/tests/lang/de/test_prefix_suffix_infix.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
diff --git a/spacy/tests/lang/de/test_text.py b/spacy/tests/lang/de/test_text.py
index b3fb1eaa5..22711763e 100644
--- a/spacy/tests/lang/de/test_text.py
+++ b/spacy/tests/lang/de/test_text.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
diff --git a/spacy/tests/lang/el/test_exception.py b/spacy/tests/lang/el/test_exception.py
index b8d10fb69..a4656ea98 100644
--- a/spacy/tests/lang/el/test_exception.py
+++ b/spacy/tests/lang/el/test_exception.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import pytest
diff --git a/spacy/tests/lang/el/test_text.py b/spacy/tests/lang/el/test_text.py
index a6395ab4a..1b3ef6182 100644
--- a/spacy/tests/lang/el/test_text.py
+++ b/spacy/tests/lang/el/test_text.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import pytest
diff --git a/spacy/tests/lang/en/test_customized_tokenizer.py b/spacy/tests/lang/en/test_customized_tokenizer.py
index 7f939011f..f5302cb31 100644
--- a/spacy/tests/lang/en/test_customized_tokenizer.py
+++ b/spacy/tests/lang/en/test_customized_tokenizer.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
import re
from spacy.lang.en import English
diff --git a/spacy/tests/lang/en/test_exceptions.py b/spacy/tests/lang/en/test_exceptions.py
index 6285a9408..b2e941dab 100644
--- a/spacy/tests/lang/en/test_exceptions.py
+++ b/spacy/tests/lang/en/test_exceptions.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
diff --git a/spacy/tests/lang/en/test_indices.py b/spacy/tests/lang/en/test_indices.py
index 8a7bc0323..d50c75fc5 100644
--- a/spacy/tests/lang/en/test_indices.py
+++ b/spacy/tests/lang/en/test_indices.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
def test_en_simple_punct(en_tokenizer):
text = "to walk, do foo"
diff --git a/spacy/tests/lang/en/test_noun_chunks.py b/spacy/tests/lang/en/test_noun_chunks.py
index 7dc47f9cc..6739b5137 100644
--- a/spacy/tests/lang/en/test_noun_chunks.py
+++ b/spacy/tests/lang/en/test_noun_chunks.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import numpy
from spacy.attrs import HEAD, DEP
from spacy.symbols import nsubj, dobj, amod, nmod, conj, cc, root
diff --git a/spacy/tests/lang/en/test_parser.py b/spacy/tests/lang/en/test_parser.py
index ce696bc25..057143696 100644
--- a/spacy/tests/lang/en/test_parser.py
+++ b/spacy/tests/lang/en/test_parser.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
from ...util import get_doc
diff --git a/spacy/tests/lang/en/test_prefix_suffix_infix.py b/spacy/tests/lang/en/test_prefix_suffix_infix.py
index 3dccd6bcf..8c9c58fea 100644
--- a/spacy/tests/lang/en/test_prefix_suffix_infix.py
+++ b/spacy/tests/lang/en/test_prefix_suffix_infix.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
diff --git a/spacy/tests/lang/en/test_punct.py b/spacy/tests/lang/en/test_punct.py
index 61274cf14..4dc6ddfe4 100644
--- a/spacy/tests/lang/en/test_punct.py
+++ b/spacy/tests/lang/en/test_punct.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
from spacy.util import compile_prefix_regex
from spacy.lang.punctuation import TOKENIZER_PREFIXES
diff --git a/spacy/tests/lang/en/test_sbd.py b/spacy/tests/lang/en/test_sbd.py
index 40bd110e8..ba7b2f2cf 100644
--- a/spacy/tests/lang/en/test_sbd.py
+++ b/spacy/tests/lang/en/test_sbd.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
from ...util import get_doc, apply_transition_sequence
diff --git a/spacy/tests/lang/en/test_tagger.py b/spacy/tests/lang/en/test_tagger.py
index 567fd5a44..d9eced2ff 100644
--- a/spacy/tests/lang/en/test_tagger.py
+++ b/spacy/tests/lang/en/test_tagger.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
from ...util import get_doc
diff --git a/spacy/tests/lang/en/test_text.py b/spacy/tests/lang/en/test_text.py
index a7ebde989..c5d56d885 100644
--- a/spacy/tests/lang/en/test_text.py
+++ b/spacy/tests/lang/en/test_text.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
from spacy.lang.en.lex_attrs import like_num
diff --git a/spacy/tests/lang/es/test_exception.py b/spacy/tests/lang/es/test_exception.py
index 8d6164058..90d897a4c 100644
--- a/spacy/tests/lang/es/test_exception.py
+++ b/spacy/tests/lang/es/test_exception.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
diff --git a/spacy/tests/lang/es/test_text.py b/spacy/tests/lang/es/test_text.py
index acd572b48..af7b0212d 100644
--- a/spacy/tests/lang/es/test_text.py
+++ b/spacy/tests/lang/es/test_text.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
diff --git a/spacy/tests/lang/fi/test_text.py b/spacy/tests/lang/fi/test_text.py
index 2dd92597e..dbb67ad7a 100644
--- a/spacy/tests/lang/fi/test_text.py
+++ b/spacy/tests/lang/fi/test_text.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
diff --git a/spacy/tests/lang/fi/test_tokenizer.py b/spacy/tests/lang/fi/test_tokenizer.py
index 17f6f0ccc..6d5a14e6e 100644
--- a/spacy/tests/lang/fi/test_tokenizer.py
+++ b/spacy/tests/lang/fi/test_tokenizer.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import pytest
diff --git a/spacy/tests/lang/fr/test_exceptions.py b/spacy/tests/lang/fr/test_exceptions.py
index 93dbf0993..98d318f6e 100644
--- a/spacy/tests/lang/fr/test_exceptions.py
+++ b/spacy/tests/lang/fr/test_exceptions.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
diff --git a/spacy/tests/lang/fr/test_prefix_suffix_infix.py b/spacy/tests/lang/fr/test_prefix_suffix_infix.py
index ca6bdbd87..01d50b0a6 100644
--- a/spacy/tests/lang/fr/test_prefix_suffix_infix.py
+++ b/spacy/tests/lang/fr/test_prefix_suffix_infix.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
from spacy.language import Language
from spacy.lang.punctuation import TOKENIZER_INFIXES
diff --git a/spacy/tests/lang/fr/test_text.py b/spacy/tests/lang/fr/test_text.py
index 24b4c4532..01231f593 100644
--- a/spacy/tests/lang/fr/test_text.py
+++ b/spacy/tests/lang/fr/test_text.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import pytest
from spacy.lang.fr.lex_attrs import like_num
diff --git a/spacy/tests/lang/ga/test_tokenizer.py b/spacy/tests/lang/ga/test_tokenizer.py
index 29bc1c759..78127ef7c 100644
--- a/spacy/tests/lang/ga/test_tokenizer.py
+++ b/spacy/tests/lang/ga/test_tokenizer.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import pytest
diff --git a/spacy/tests/lang/he/test_tokenizer.py b/spacy/tests/lang/he/test_tokenizer.py
index f138ec6e7..3131014a3 100644
--- a/spacy/tests/lang/he/test_tokenizer.py
+++ b/spacy/tests/lang/he/test_tokenizer.py
@@ -1,6 +1,3 @@
-# encoding: utf8
-from __future__ import unicode_literals
-
import pytest
diff --git a/spacy/tests/lang/hu/test_tokenizer.py b/spacy/tests/lang/hu/test_tokenizer.py
index fa8e132c0..4ec720c60 100644
--- a/spacy/tests/lang/hu/test_tokenizer.py
+++ b/spacy/tests/lang/hu/test_tokenizer.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import pytest
diff --git a/spacy/tests/lang/id/test_prefix_suffix_infix.py b/spacy/tests/lang/id/test_prefix_suffix_infix.py
index e86a98ee3..2a81dab01 100644
--- a/spacy/tests/lang/id/test_prefix_suffix_infix.py
+++ b/spacy/tests/lang/id/test_prefix_suffix_infix.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
diff --git a/spacy/tests/lang/id/test_text.py b/spacy/tests/lang/id/test_text.py
index 915d268ae..ed6487b68 100644
--- a/spacy/tests/lang/id/test_text.py
+++ b/spacy/tests/lang/id/test_text.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
from spacy.lang.id.lex_attrs import like_num
diff --git a/spacy/tests/lang/it/test_prefix_suffix_infix.py b/spacy/tests/lang/it/test_prefix_suffix_infix.py
index f84351fd7..46f66b5e6 100644
--- a/spacy/tests/lang/it/test_prefix_suffix_infix.py
+++ b/spacy/tests/lang/it/test_prefix_suffix_infix.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
diff --git a/spacy/tests/lang/ja/test_lemmatization.py b/spacy/tests/lang/ja/test_lemmatization.py
index cfff0fcfe..4cb3110b3 100644
--- a/spacy/tests/lang/ja/test_lemmatization.py
+++ b/spacy/tests/lang/ja/test_lemmatization.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
diff --git a/spacy/tests/lang/ja/test_tokenizer.py b/spacy/tests/lang/ja/test_tokenizer.py
index ad8bfaa00..481f346bb 100644
--- a/spacy/tests/lang/ja/test_tokenizer.py
+++ b/spacy/tests/lang/ja/test_tokenizer.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
diff --git a/spacy/tests/lang/ko/test_lemmatization.py b/spacy/tests/lang/ko/test_lemmatization.py
index 42c306c11..7782ca4bc 100644
--- a/spacy/tests/lang/ko/test_lemmatization.py
+++ b/spacy/tests/lang/ko/test_lemmatization.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
diff --git a/spacy/tests/lang/ko/test_tokenizer.py b/spacy/tests/lang/ko/test_tokenizer.py
index b8fe7959c..eac309857 100644
--- a/spacy/tests/lang/ko/test_tokenizer.py
+++ b/spacy/tests/lang/ko/test_tokenizer.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
# fmt: off
diff --git a/spacy/tests/lang/lb/test_exceptions.py b/spacy/tests/lang/lb/test_exceptions.py
index 7ca2394b7..5b5005ae7 100644
--- a/spacy/tests/lang/lb/test_exceptions.py
+++ b/spacy/tests/lang/lb/test_exceptions.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
diff --git a/spacy/tests/lang/lb/test_prefix_suffix_infix.py b/spacy/tests/lang/lb/test_prefix_suffix_infix.py
index d85f932be..3958d1543 100644
--- a/spacy/tests/lang/lb/test_prefix_suffix_infix.py
+++ b/spacy/tests/lang/lb/test_prefix_suffix_infix.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
diff --git a/spacy/tests/lang/lb/test_text.py b/spacy/tests/lang/lb/test_text.py
index 36464b379..b0ba76b6b 100644
--- a/spacy/tests/lang/lb/test_text.py
+++ b/spacy/tests/lang/lb/test_text.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
diff --git a/spacy/tests/lang/lt/test_text.py b/spacy/tests/lang/lt/test_text.py
index cac32aa4d..8d9201cd9 100644
--- a/spacy/tests/lang/lt/test_text.py
+++ b/spacy/tests/lang/lt/test_text.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
diff --git a/spacy/tests/lang/nb/test_tokenizer.py b/spacy/tests/lang/nb/test_tokenizer.py
index f72d310e8..2da6e8d40 100644
--- a/spacy/tests/lang/nb/test_tokenizer.py
+++ b/spacy/tests/lang/nb/test_tokenizer.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import pytest
diff --git a/spacy/tests/lang/nl/test_text.py b/spacy/tests/lang/nl/test_text.py
index 4045b1c39..8bc72cc6d 100644
--- a/spacy/tests/lang/nl/test_text.py
+++ b/spacy/tests/lang/nl/test_text.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
from spacy.lang.nl.lex_attrs import like_num
diff --git a/spacy/tests/lang/pl/test_text.py b/spacy/tests/lang/pl/test_text.py
index ec9b18084..e8654a498 100644
--- a/spacy/tests/lang/pl/test_text.py
+++ b/spacy/tests/lang/pl/test_text.py
@@ -1,9 +1,4 @@
-# coding: utf-8
"""Words like numbers are recognized correctly."""
-
-
-from __future__ import unicode_literals
-
import pytest
diff --git a/spacy/tests/lang/pl/test_tokenizer.py b/spacy/tests/lang/pl/test_tokenizer.py
index 9d0034589..a04b4fdcb 100644
--- a/spacy/tests/lang/pl/test_tokenizer.py
+++ b/spacy/tests/lang/pl/test_tokenizer.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import pytest
DOT_TESTS = [
diff --git a/spacy/tests/lang/pt/test_text.py b/spacy/tests/lang/pt/test_text.py
index 39dfff2c1..3a9162b80 100644
--- a/spacy/tests/lang/pt/test_text.py
+++ b/spacy/tests/lang/pt/test_text.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
from spacy.lang.pt.lex_attrs import like_num
diff --git a/spacy/tests/lang/ro/test_tokenizer.py b/spacy/tests/lang/ro/test_tokenizer.py
index a327174e5..64c072470 100644
--- a/spacy/tests/lang/ro/test_tokenizer.py
+++ b/spacy/tests/lang/ro/test_tokenizer.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import pytest
diff --git a/spacy/tests/lang/ru/test_exceptions.py b/spacy/tests/lang/ru/test_exceptions.py
index a8f0c3429..4fb417df8 100644
--- a/spacy/tests/lang/ru/test_exceptions.py
+++ b/spacy/tests/lang/ru/test_exceptions.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
diff --git a/spacy/tests/lang/ru/test_lemmatizer.py b/spacy/tests/lang/ru/test_lemmatizer.py
index b228fded8..40dcf4cf8 100644
--- a/spacy/tests/lang/ru/test_lemmatizer.py
+++ b/spacy/tests/lang/ru/test_lemmatizer.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
from ...util import get_doc
diff --git a/spacy/tests/lang/ru/test_text.py b/spacy/tests/lang/ru/test_text.py
index c5bff6973..b0eaf66bb 100644
--- a/spacy/tests/lang/ru/test_text.py
+++ b/spacy/tests/lang/ru/test_text.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
from spacy.lang.ru.lex_attrs import like_num
diff --git a/spacy/tests/lang/ru/test_tokenizer.py b/spacy/tests/lang/ru/test_tokenizer.py
index 5507f9f09..e05a479aa 100644
--- a/spacy/tests/lang/ru/test_tokenizer.py
+++ b/spacy/tests/lang/ru/test_tokenizer.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
diff --git a/spacy/tests/lang/sr/test_exceptions.py b/spacy/tests/lang/sr/test_exceptions.py
index 285e99996..fa92e5e2d 100644
--- a/spacy/tests/lang/sr/test_exceptions.py
+++ b/spacy/tests/lang/sr/test_exceptions.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
diff --git a/spacy/tests/lang/sr/test_tokenizer.py b/spacy/tests/lang/sr/test_tokenizer.py
index c4672b3ef..03a0470bd 100644
--- a/spacy/tests/lang/sr/test_tokenizer.py
+++ b/spacy/tests/lang/sr/test_tokenizer.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
diff --git a/spacy/tests/lang/sv/test_exceptions.py b/spacy/tests/lang/sv/test_exceptions.py
index c977a4183..5d3acf3d5 100644
--- a/spacy/tests/lang/sv/test_exceptions.py
+++ b/spacy/tests/lang/sv/test_exceptions.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import pytest
diff --git a/spacy/tests/lang/sv/test_noun_chunks.py b/spacy/tests/lang/sv/test_noun_chunks.py
index ac7c066ba..ad335c317 100644
--- a/spacy/tests/lang/sv/test_noun_chunks.py
+++ b/spacy/tests/lang/sv/test_noun_chunks.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
from ...util import get_doc
diff --git a/spacy/tests/lang/sv/test_prefix_suffix_infix.py b/spacy/tests/lang/sv/test_prefix_suffix_infix.py
index f3fdd9a9e..bbb0ff415 100644
--- a/spacy/tests/lang/sv/test_prefix_suffix_infix.py
+++ b/spacy/tests/lang/sv/test_prefix_suffix_infix.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
diff --git a/spacy/tests/lang/sv/test_text.py b/spacy/tests/lang/sv/test_text.py
index 9ea1851ae..dc4911ab6 100644
--- a/spacy/tests/lang/sv/test_text.py
+++ b/spacy/tests/lang/sv/test_text.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
def test_sv_tokenizer_handles_long_text(sv_tokenizer):
text = """Det var så härligt ute på landet. Det var sommar, majsen var gul, havren grön,
diff --git a/spacy/tests/lang/sv/test_tokenizer.py b/spacy/tests/lang/sv/test_tokenizer.py
index 894b5aa6a..8871f4414 100644
--- a/spacy/tests/lang/sv/test_tokenizer.py
+++ b/spacy/tests/lang/sv/test_tokenizer.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import pytest
diff --git a/spacy/tests/lang/test_attrs.py b/spacy/tests/lang/test_attrs.py
index 4bb5aac70..b39109455 100644
--- a/spacy/tests/lang/test_attrs.py
+++ b/spacy/tests/lang/test_attrs.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
from spacy.attrs import intify_attrs, ORTH, NORM, LEMMA, IS_ALPHA
from spacy.lang.lex_attrs import is_punct, is_ascii, is_currency, like_url, word_shape
diff --git a/spacy/tests/lang/test_initialize.py b/spacy/tests/lang/test_initialize.py
index 5c701fc22..de1871e64 100644
--- a/spacy/tests/lang/test_initialize.py
+++ b/spacy/tests/lang/test_initialize.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
from spacy.util import get_lang_class
diff --git a/spacy/tests/lang/th/test_tokenizer.py b/spacy/tests/lang/th/test_tokenizer.py
index 265c7753d..1e1ba52dc 100644
--- a/spacy/tests/lang/th/test_tokenizer.py
+++ b/spacy/tests/lang/th/test_tokenizer.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import pytest
diff --git a/spacy/tests/lang/tt/test_tokenizer.py b/spacy/tests/lang/tt/test_tokenizer.py
index 66ef9c181..7e0748931 100644
--- a/spacy/tests/lang/tt/test_tokenizer.py
+++ b/spacy/tests/lang/tt/test_tokenizer.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import pytest
diff --git a/spacy/tests/lang/uk/test_tokenizer.py b/spacy/tests/lang/uk/test_tokenizer.py
index f744b32b0..eb647a041 100644
--- a/spacy/tests/lang/uk/test_tokenizer.py
+++ b/spacy/tests/lang/uk/test_tokenizer.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
diff --git a/spacy/tests/lang/uk/test_tokenizer_exc.py b/spacy/tests/lang/uk/test_tokenizer_exc.py
index 328e1d287..4fb4a6b31 100644
--- a/spacy/tests/lang/uk/test_tokenizer_exc.py
+++ b/spacy/tests/lang/uk/test_tokenizer_exc.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
diff --git a/spacy/tests/lang/ur/test_prefix_suffix_infix.py b/spacy/tests/lang/ur/test_prefix_suffix_infix.py
index de11c9b34..e9f3272f4 100644
--- a/spacy/tests/lang/ur/test_prefix_suffix_infix.py
+++ b/spacy/tests/lang/ur/test_prefix_suffix_infix.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
diff --git a/spacy/tests/lang/ur/test_text.py b/spacy/tests/lang/ur/test_text.py
index 546e79182..5da831cf8 100644
--- a/spacy/tests/lang/ur/test_text.py
+++ b/spacy/tests/lang/ur/test_text.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
diff --git a/spacy/tests/lang/yo/test_text.py b/spacy/tests/lang/yo/test_text.py
index ce6408b67..48b689f3d 100644
--- a/spacy/tests/lang/yo/test_text.py
+++ b/spacy/tests/lang/yo/test_text.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import pytest
from spacy.lang.yo.lex_attrs import like_num
diff --git a/spacy/tests/lang/zh/test_text.py b/spacy/tests/lang/zh/test_text.py
index 235f597a5..d48feaee5 100644
--- a/spacy/tests/lang/zh/test_text.py
+++ b/spacy/tests/lang/zh/test_text.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
diff --git a/spacy/tests/lang/zh/test_tokenizer.py b/spacy/tests/lang/zh/test_tokenizer.py
index 36d94beb5..f71785337 100644
--- a/spacy/tests/lang/zh/test_tokenizer.py
+++ b/spacy/tests/lang/zh/test_tokenizer.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py
index e4584d03a..adeef834d 100644
--- a/spacy/tests/matcher/test_matcher_api.py
+++ b/spacy/tests/matcher/test_matcher_api.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
import re
from mock import Mock
diff --git a/spacy/tests/matcher/test_matcher_logic.py b/spacy/tests/matcher/test_matcher_logic.py
index 240ace537..a6a82f2e2 100644
--- a/spacy/tests/matcher/test_matcher_logic.py
+++ b/spacy/tests/matcher/test_matcher_logic.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
import re
diff --git a/spacy/tests/matcher/test_pattern_validation.py b/spacy/tests/matcher/test_pattern_validation.py
index 2db2f9eb3..c879cc0fe 100644
--- a/spacy/tests/matcher/test_pattern_validation.py
+++ b/spacy/tests/matcher/test_pattern_validation.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
from spacy.matcher import Matcher
from spacy.matcher._schemas import TOKEN_PATTERN_SCHEMA
diff --git a/spacy/tests/matcher/test_phrase_matcher.py b/spacy/tests/matcher/test_phrase_matcher.py
index 7a6585e06..23cd80d1d 100644
--- a/spacy/tests/matcher/test_phrase_matcher.py
+++ b/spacy/tests/matcher/test_phrase_matcher.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
from mock import Mock
from spacy.matcher import PhraseMatcher
diff --git a/spacy/tests/morphology/test_morph_features.py b/spacy/tests/morphology/test_morph_features.py
index 41f807143..4cf6b1206 100644
--- a/spacy/tests/morphology/test_morph_features.py
+++ b/spacy/tests/morphology/test_morph_features.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
from spacy.morphology import Morphology
from spacy.strings import StringStore, get_string_id
diff --git a/spacy/tests/parser/test_add_label.py b/spacy/tests/parser/test_add_label.py
index bee9db82e..a24fd143d 100644
--- a/spacy/tests/parser/test_add_label.py
+++ b/spacy/tests/parser/test_add_label.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import pytest
from thinc.neural.optimizers import Adam
from thinc.neural.ops import NumpyOps
diff --git a/spacy/tests/parser/test_arc_eager_oracle.py b/spacy/tests/parser/test_arc_eager_oracle.py
index 0d9bd1ad0..dd593f7d3 100644
--- a/spacy/tests/parser/test_arc_eager_oracle.py
+++ b/spacy/tests/parser/test_arc_eager_oracle.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import pytest
from spacy.vocab import Vocab
from spacy.pipeline import DependencyParser
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index 8329391ca..8d5043487 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
from spacy.lang.en import English
diff --git a/spacy/tests/parser/test_neural_parser.py b/spacy/tests/parser/test_neural_parser.py
index 468b3ff40..0906fbb94 100644
--- a/spacy/tests/parser/test_neural_parser.py
+++ b/spacy/tests/parser/test_neural_parser.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import pytest
from spacy._ml import Tok2Vec
from spacy.vocab import Vocab
diff --git a/spacy/tests/parser/test_nn_beam.py b/spacy/tests/parser/test_nn_beam.py
index 9dca99255..24997e47c 100644
--- a/spacy/tests/parser/test_nn_beam.py
+++ b/spacy/tests/parser/test_nn_beam.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import pytest
import numpy
from spacy.vocab import Vocab
diff --git a/spacy/tests/parser/test_nonproj.py b/spacy/tests/parser/test_nonproj.py
index 8bf8111c1..86d9a0180 100644
--- a/spacy/tests/parser/test_nonproj.py
+++ b/spacy/tests/parser/test_nonproj.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
from spacy.syntax.nonproj import ancestors, contains_cycle, is_nonproj_arc
from spacy.syntax.nonproj import is_nonproj_tree
diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py
index fb5301718..75091ec07 100644
--- a/spacy/tests/parser/test_parse.py
+++ b/spacy/tests/parser/test_parse.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
from ..util import get_doc, apply_transition_sequence
diff --git a/spacy/tests/parser/test_parse_navigate.py b/spacy/tests/parser/test_parse_navigate.py
index eb206458e..ed95718f1 100644
--- a/spacy/tests/parser/test_parse_navigate.py
+++ b/spacy/tests/parser/test_parse_navigate.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
from ..util import get_doc
diff --git a/spacy/tests/parser/test_preset_sbd.py b/spacy/tests/parser/test_preset_sbd.py
index d935494d6..ed6aef096 100644
--- a/spacy/tests/parser/test_preset_sbd.py
+++ b/spacy/tests/parser/test_preset_sbd.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import pytest
from thinc.neural.optimizers import Adam
from thinc.neural.ops import NumpyOps
diff --git a/spacy/tests/parser/test_space_attachment.py b/spacy/tests/parser/test_space_attachment.py
index 945173faf..59ae4e629 100644
--- a/spacy/tests/parser/test_space_attachment.py
+++ b/spacy/tests/parser/test_space_attachment.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
from spacy.tokens.doc import Doc
diff --git a/spacy/tests/pipeline/test_analysis.py b/spacy/tests/pipeline/test_analysis.py
index 198f11bcd..5c246538c 100644
--- a/spacy/tests/pipeline/test_analysis.py
+++ b/spacy/tests/pipeline/test_analysis.py
@@ -1,11 +1,7 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import spacy.language
from spacy.language import Language, component
from spacy.analysis import print_summary, validate_attrs
from spacy.analysis import get_assigns_for_attr, get_requires_for_attr
-from spacy.compat import is_python2
from mock import Mock, ANY
import pytest
@@ -17,8 +13,7 @@ def test_component_decorator_function():
return doc
assert test_component.name == "test"
- if not is_python2:
- assert test_component.__doc__ == "docstring"
+ assert test_component.__doc__ == "docstring"
assert test_component("foo") == "foo"
@@ -45,13 +40,12 @@ def test_component_decorator_class():
assert test_component("foo") == "foo"
assert hasattr(test_component, "custom")
assert test_component.custom("bar") == "bar"
- if not is_python2:
- assert TestComponent.__doc__ == "docstring1"
- assert TestComponent.__call__.__doc__ == "docstring2"
- assert TestComponent.custom.__doc__ == "docstring3"
- assert test_component.__doc__ == "docstring1"
- assert test_component.__call__.__doc__ == "docstring2"
- assert test_component.custom.__doc__ == "docstring3"
+ assert TestComponent.__doc__ == "docstring1"
+ assert TestComponent.__call__.__doc__ == "docstring2"
+ assert TestComponent.custom.__doc__ == "docstring3"
+ assert test_component.__doc__ == "docstring1"
+ assert test_component.__call__.__doc__ == "docstring2"
+ assert test_component.custom.__doc__ == "docstring3"
def test_component_decorator_assigns():
diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index 8023f72a6..9ff5f8194 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
from spacy.kb import KnowledgeBase
diff --git a/spacy/tests/pipeline/test_entity_ruler.py b/spacy/tests/pipeline/test_entity_ruler.py
index 660ad3b28..210a56cea 100644
--- a/spacy/tests/pipeline/test_entity_ruler.py
+++ b/spacy/tests/pipeline/test_entity_ruler.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import pytest
from spacy.tokens import Span
from spacy.language import Language
diff --git a/spacy/tests/pipeline/test_factories.py b/spacy/tests/pipeline/test_factories.py
index 5efcc319a..0a9a4d3c9 100644
--- a/spacy/tests/pipeline/test_factories.py
+++ b/spacy/tests/pipeline/test_factories.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import pytest
from spacy.language import Language
from spacy.tokens import Span
diff --git a/spacy/tests/pipeline/test_functions.py b/spacy/tests/pipeline/test_functions.py
index 5b5fcd2fd..ca983267f 100644
--- a/spacy/tests/pipeline/test_functions.py
+++ b/spacy/tests/pipeline/test_functions.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
from spacy.pipeline.functions import merge_subtokens
from ..util import get_doc
diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py
index 27fb57b18..3ec8b508d 100644
--- a/spacy/tests/pipeline/test_pipe_methods.py
+++ b/spacy/tests/pipeline/test_pipe_methods.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import pytest
from spacy.language import Language
diff --git a/spacy/tests/pipeline/test_sentencizer.py b/spacy/tests/pipeline/test_sentencizer.py
index 359552c5b..78ab6d2d1 100644
--- a/spacy/tests/pipeline/test_sentencizer.py
+++ b/spacy/tests/pipeline/test_sentencizer.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import pytest
import spacy
from spacy.pipeline import Sentencizer
diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py
index a5bda9090..ca9dab009 100644
--- a/spacy/tests/pipeline/test_tagger.py
+++ b/spacy/tests/pipeline/test_tagger.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import pytest
from spacy.language import Language
diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
index 44834c2a8..9e37e92e1 100644
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import pytest
import random
import numpy.random
diff --git a/spacy/tests/regression/test_issue1-1000.py b/spacy/tests/regression/test_issue1-1000.py
index 61d2c9cd2..a3148aa90 100644
--- a/spacy/tests/regression/test_issue1-1000.py
+++ b/spacy/tests/regression/test_issue1-1000.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
import random
from spacy.matcher import Matcher
diff --git a/spacy/tests/regression/test_issue1001-1500.py b/spacy/tests/regression/test_issue1001-1500.py
index 924c5aa3e..7d81c3148 100644
--- a/spacy/tests/regression/test_issue1001-1500.py
+++ b/spacy/tests/regression/test_issue1001-1500.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
import re
from spacy.tokens import Doc
diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py
index 4b27901ad..d9e1d663a 100644
--- a/spacy/tests/regression/test_issue1501-2000.py
+++ b/spacy/tests/regression/test_issue1501-2000.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import pytest
import gc
import numpy
diff --git a/spacy/tests/regression/test_issue2001-2500.py b/spacy/tests/regression/test_issue2001-2500.py
index e95c1a9b9..2c25b6d73 100644
--- a/spacy/tests/regression/test_issue2001-2500.py
+++ b/spacy/tests/regression/test_issue2001-2500.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import pytest
import numpy
from spacy.tokens import Doc
diff --git a/spacy/tests/regression/test_issue2501-3000.py b/spacy/tests/regression/test_issue2501-3000.py
index 0acb25e90..49e7de179 100644
--- a/spacy/tests/regression/test_issue2501-3000.py
+++ b/spacy/tests/regression/test_issue2501-3000.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import pytest
from spacy import displacy
from spacy.lang.en import English
diff --git a/spacy/tests/regression/test_issue3001-3500.py b/spacy/tests/regression/test_issue3001-3500.py
index d05759c31..cc893e472 100644
--- a/spacy/tests/regression/test_issue3001-3500.py
+++ b/spacy/tests/regression/test_issue3001-3500.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import pytest
from spacy.lang.en import English
from spacy.lang.de import German
@@ -9,11 +6,10 @@ from spacy.matcher import Matcher, PhraseMatcher
from spacy.tokens import Doc
from spacy.vocab import Vocab
from spacy.attrs import ENT_IOB, ENT_TYPE
-from spacy.compat import pickle, is_python2, unescape_unicode
+from spacy.compat import pickle
from spacy import displacy
from spacy.util import decaying
import numpy
-import re
from spacy.vectors import Vectors
from ..util import get_doc
@@ -211,73 +207,6 @@ def test_issue3345():
assert ner.moves.is_valid(state, "B-GPE")
-if is_python2:
- # If we have this test in Python 3, pytest chokes, as it can't print the
- # string above in the xpass message.
- prefix_search = (
- b"^\xc2\xa7|^%|^=|^\xe2\x80\x94|^\xe2\x80\x93|^\\+(?![0-9])"
- b"|^\xe2\x80\xa6|^\xe2\x80\xa6\xe2\x80\xa6|^,|^:|^;|^\\!|^\\?"
- b"|^\xc2\xbf|^\xd8\x9f|^\xc2\xa1|^\\(|^\\)|^\\[|^\\]|^\\{|^\\}"
- b"|^<|^>|^_|^#|^\\*|^&|^\xe3\x80\x82|^\xef\xbc\x9f|^\xef\xbc\x81|"
- b"^\xef\xbc\x8c|^\xe3\x80\x81|^\xef\xbc\x9b|^\xef\xbc\x9a|"
- b"^\xef\xbd\x9e|^\xc2\xb7|^\xe0\xa5\xa4|^\xd8\x8c|^\xd8\x9b|"
- b"^\xd9\xaa|^\\.\\.+|^\xe2\x80\xa6|^\\'|^\"|^\xe2\x80\x9d|"
- b"^\xe2\x80\x9c|^`|^\xe2\x80\x98|^\xc2\xb4|^\xe2\x80\x99|"
- b"^\xe2\x80\x9a|^,|^\xe2\x80\x9e|^\xc2\xbb|^\xc2\xab|^\xe3\x80\x8c|"
- b"^\xe3\x80\x8d|^\xe3\x80\x8e|^\xe3\x80\x8f|^\xef\xbc\x88|"
- b"^\xef\xbc\x89|^\xe3\x80\x94|^\xe3\x80\x95|^\xe3\x80\x90|"
- b"^\xe3\x80\x91|^\xe3\x80\x8a|^\xe3\x80\x8b|^\xe3\x80\x88|"
- b"^\xe3\x80\x89|^\\$|^\xc2\xa3|^\xe2\x82\xac|^\xc2\xa5|^\xe0\xb8\xbf|"
- b"^US\\$|^C\\$|^A\\$|^\xe2\x82\xbd|^\xef\xb7\xbc|^\xe2\x82\xb4|"
- b"^[\\u00A6\\u00A9\\u00AE\\u00B0\\u0482\\u058D\\u058E\\u060E\\u060F"
- b"\\u06DE\\u06E9\\u06FD\\u06FE\\u07F6\\u09FA\\u0B70\\u0BF3-\\u0BF8"
- b"\\u0BFA\\u0C7F\\u0D4F\\u0D79\\u0F01-\\u0F03\\u0F13\\u0F15-\\u0F17"
- b"\\u0F1A-\\u0F1F\\u0F34\\u0F36\\u0F38\\u0FBE-\\u0FC5\\u0FC7-\\u0FCC"
- b"\\u0FCE\\u0FCF\\u0FD5-\\u0FD8\\u109E\\u109F\\u1390-\\u1399\\u1940"
- b"\\u19DE-\\u19FF\\u1B61-\\u1B6A\\u1B74-\\u1B7C\\u2100\\u2101\\u2103"
- b"-\\u2106\\u2108\\u2109\\u2114\\u2116\\u2117\\u211E-\\u2123\\u2125"
- b"\\u2127\\u2129\\u212E\\u213A\\u213B\\u214A\\u214C\\u214D\\u214F"
- b"\\u218A\\u218B\\u2195-\\u2199\\u219C-\\u219F\\u21A1\\u21A2\\u21A4"
- b"\\u21A5\\u21A7-\\u21AD\\u21AF-\\u21CD\\u21D0\\u21D1\\u21D3\\u21D5"
- b"-\\u21F3\\u2300-\\u2307\\u230C-\\u231F\\u2322-\\u2328\\u232B"
- b"-\\u237B\\u237D-\\u239A\\u23B4-\\u23DB\\u23E2-\\u2426\\u2440"
- b"-\\u244A\\u249C-\\u24E9\\u2500-\\u25B6\\u25B8-\\u25C0\\u25C2"
- b"-\\u25F7\\u2600-\\u266E\\u2670-\\u2767\\u2794-\\u27BF\\u2800"
- b"-\\u28FF\\u2B00-\\u2B2F\\u2B45\\u2B46\\u2B4D-\\u2B73\\u2B76"
- b"-\\u2B95\\u2B98-\\u2BC8\\u2BCA-\\u2BFE\\u2CE5-\\u2CEA\\u2E80"
- b"-\\u2E99\\u2E9B-\\u2EF3\\u2F00-\\u2FD5\\u2FF0-\\u2FFB\\u3004"
- b"\\u3012\\u3013\\u3020\\u3036\\u3037\\u303E\\u303F\\u3190\\u3191"
- b"\\u3196-\\u319F\\u31C0-\\u31E3\\u3200-\\u321E\\u322A-\\u3247\\u3250"
- b"\\u3260-\\u327F\\u328A-\\u32B0\\u32C0-\\u32FE\\u3300-\\u33FF\\u4DC0"
- b"-\\u4DFF\\uA490-\\uA4C6\\uA828-\\uA82B\\uA836\\uA837\\uA839\\uAA77"
- b"-\\uAA79\\uFDFD\\uFFE4\\uFFE8\\uFFED\\uFFEE\\uFFFC\\uFFFD\\U00010137"
- b"-\\U0001013F\\U00010179-\\U00010189\\U0001018C-\\U0001018E"
- b"\\U00010190-\\U0001019B\\U000101A0\\U000101D0-\\U000101FC\\U00010877"
- b"\\U00010878\\U00010AC8\\U0001173F\\U00016B3C-\\U00016B3F\\U00016B45"
- b"\\U0001BC9C\\U0001D000-\\U0001D0F5\\U0001D100-\\U0001D126\\U0001D129"
- b"-\\U0001D164\\U0001D16A-\\U0001D16C\\U0001D183\\U0001D184\\U0001D18C"
- b"-\\U0001D1A9\\U0001D1AE-\\U0001D1E8\\U0001D200-\\U0001D241\\U0001D245"
- b"\\U0001D300-\\U0001D356\\U0001D800-\\U0001D9FF\\U0001DA37-\\U0001DA3A"
- b"\\U0001DA6D-\\U0001DA74\\U0001DA76-\\U0001DA83\\U0001DA85\\U0001DA86"
- b"\\U0001ECAC\\U0001F000-\\U0001F02B\\U0001F030-\\U0001F093\\U0001F0A0"
- b"-\\U0001F0AE\\U0001F0B1-\\U0001F0BF\\U0001F0C1-\\U0001F0CF\\U0001F0D1"
- b"-\\U0001F0F5\\U0001F110-\\U0001F16B\\U0001F170-\\U0001F1AC\\U0001F1E6"
- b"-\\U0001F202\\U0001F210-\\U0001F23B\\U0001F240-\\U0001F248\\U0001F250"
- b"\\U0001F251\\U0001F260-\\U0001F265\\U0001F300-\\U0001F3FA\\U0001F400"
- b"-\\U0001F6D4\\U0001F6E0-\\U0001F6EC\\U0001F6F0-\\U0001F6F9\\U0001F700"
- b"-\\U0001F773\\U0001F780-\\U0001F7D8\\U0001F800-\\U0001F80B\\U0001F810"
- b"-\\U0001F847\\U0001F850-\\U0001F859\\U0001F860-\\U0001F887\\U0001F890"
- b"-\\U0001F8AD\\U0001F900-\\U0001F90B\\U0001F910-\\U0001F93E\\U0001F940"
- b"-\\U0001F970\\U0001F973-\\U0001F976\\U0001F97A\\U0001F97C-\\U0001F9A2"
- b"\\U0001F9B0-\\U0001F9B9\\U0001F9C0-\\U0001F9C2\\U0001F9D0-\\U0001F9FF"
- b"\\U0001FA60-\\U0001FA6D]"
- )
-
- def test_issue3356():
- pattern = re.compile(unescape_unicode(prefix_search.decode("utf8")))
- assert not pattern.search("hello")
-
-
def test_issue3410():
texts = ["Hello world", "This is a test"]
nlp = English()
diff --git a/spacy/tests/regression/test_issue3521.py b/spacy/tests/regression/test_issue3521.py
index 35731ac12..3d8ee9922 100644
--- a/spacy/tests/regression/test_issue3521.py
+++ b/spacy/tests/regression/test_issue3521.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import pytest
diff --git a/spacy/tests/regression/test_issue3526.py b/spacy/tests/regression/test_issue3526.py
index c6f513730..aa77028fb 100644
--- a/spacy/tests/regression/test_issue3526.py
+++ b/spacy/tests/regression/test_issue3526.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import pytest
from spacy.tokens import Span
from spacy.language import Language
diff --git a/spacy/tests/regression/test_issue3531.py b/spacy/tests/regression/test_issue3531.py
index 7b9d0bd2a..4c65a5bfe 100644
--- a/spacy/tests/regression/test_issue3531.py
+++ b/spacy/tests/regression/test_issue3531.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from spacy import displacy
diff --git a/spacy/tests/regression/test_issue3540.py b/spacy/tests/regression/test_issue3540.py
index 19d89c797..be9e04b0b 100644
--- a/spacy/tests/regression/test_issue3540.py
+++ b/spacy/tests/regression/test_issue3540.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from spacy.tokens import Doc
import numpy as np
diff --git a/spacy/tests/regression/test_issue3549.py b/spacy/tests/regression/test_issue3549.py
index 587b3a857..b3af59c2e 100644
--- a/spacy/tests/regression/test_issue3549.py
+++ b/spacy/tests/regression/test_issue3549.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import pytest
from spacy.matcher import Matcher
from spacy.errors import MatchPatternError
diff --git a/spacy/tests/regression/test_issue3555.py b/spacy/tests/regression/test_issue3555.py
index 8444f11f2..de047bcbc 100644
--- a/spacy/tests/regression/test_issue3555.py
+++ b/spacy/tests/regression/test_issue3555.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import pytest
from spacy.tokens import Doc, Token
from spacy.matcher import Matcher
diff --git a/spacy/tests/regression/test_issue3611.py b/spacy/tests/regression/test_issue3611.py
index bc8603888..367961ab1 100644
--- a/spacy/tests/regression/test_issue3611.py
+++ b/spacy/tests/regression/test_issue3611.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import spacy
from spacy.util import minibatch, compounding
diff --git a/spacy/tests/regression/test_issue3625.py b/spacy/tests/regression/test_issue3625.py
index d935db17f..51561b3ac 100644
--- a/spacy/tests/regression/test_issue3625.py
+++ b/spacy/tests/regression/test_issue3625.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from spacy.lang.hi import Hindi
diff --git a/spacy/tests/regression/test_issue3803.py b/spacy/tests/regression/test_issue3803.py
index 37d15a5cf..ab5250edf 100644
--- a/spacy/tests/regression/test_issue3803.py
+++ b/spacy/tests/regression/test_issue3803.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from spacy.lang.es import Spanish
diff --git a/spacy/tests/regression/test_issue3839.py b/spacy/tests/regression/test_issue3839.py
index fe722a681..27b1f5f29 100644
--- a/spacy/tests/regression/test_issue3839.py
+++ b/spacy/tests/regression/test_issue3839.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from spacy.matcher import Matcher
from spacy.tokens import Doc
diff --git a/spacy/tests/regression/test_issue3869.py b/spacy/tests/regression/test_issue3869.py
index 62e8eabd6..0a851e869 100644
--- a/spacy/tests/regression/test_issue3869.py
+++ b/spacy/tests/regression/test_issue3869.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import pytest
from spacy.attrs import IS_ALPHA
from spacy.lang.en import English
diff --git a/spacy/tests/regression/test_issue3879.py b/spacy/tests/regression/test_issue3879.py
index 5cd245231..8500c09aa 100644
--- a/spacy/tests/regression/test_issue3879.py
+++ b/spacy/tests/regression/test_issue3879.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from spacy.matcher import Matcher
from spacy.tokens import Doc
diff --git a/spacy/tests/regression/test_issue3880.py b/spacy/tests/regression/test_issue3880.py
index c060473f5..6e8ab6f43 100644
--- a/spacy/tests/regression/test_issue3880.py
+++ b/spacy/tests/regression/test_issue3880.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from spacy.lang.en import English
import pytest
diff --git a/spacy/tests/regression/test_issue3882.py b/spacy/tests/regression/test_issue3882.py
index 1b2dcea25..fa616db1d 100644
--- a/spacy/tests/regression/test_issue3882.py
+++ b/spacy/tests/regression/test_issue3882.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from spacy.displacy import parse_deps
from spacy.tokens import Doc
diff --git a/spacy/tests/regression/test_issue3951.py b/spacy/tests/regression/test_issue3951.py
index 33230112f..6e4c9eeaa 100644
--- a/spacy/tests/regression/test_issue3951.py
+++ b/spacy/tests/regression/test_issue3951.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from spacy.matcher import Matcher
from spacy.tokens import Doc
diff --git a/spacy/tests/regression/test_issue3959.py b/spacy/tests/regression/test_issue3959.py
index c1f7fe100..7db28a31f 100644
--- a/spacy/tests/regression/test_issue3959.py
+++ b/spacy/tests/regression/test_issue3959.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from spacy.lang.en import English
from ..util import make_tempdir
diff --git a/spacy/tests/regression/test_issue3962.py b/spacy/tests/regression/test_issue3962.py
index ae60fa0fa..971c9b08e 100644
--- a/spacy/tests/regression/test_issue3962.py
+++ b/spacy/tests/regression/test_issue3962.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import pytest
from ..util import get_doc
diff --git a/spacy/tests/regression/test_issue3972.py b/spacy/tests/regression/test_issue3972.py
index 22b8d486e..fe5388950 100644
--- a/spacy/tests/regression/test_issue3972.py
+++ b/spacy/tests/regression/test_issue3972.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc
diff --git a/spacy/tests/regression/test_issue4002.py b/spacy/tests/regression/test_issue4002.py
index d075128aa..3ac26d3ab 100644
--- a/spacy/tests/regression/test_issue4002.py
+++ b/spacy/tests/regression/test_issue4002.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc
diff --git a/spacy/tests/regression/test_issue4030.py b/spacy/tests/regression/test_issue4030.py
index e774feb2d..7153594db 100644
--- a/spacy/tests/regression/test_issue4030.py
+++ b/spacy/tests/regression/test_issue4030.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import spacy
from spacy.util import minibatch, compounding
diff --git a/spacy/tests/regression/test_issue4042.py b/spacy/tests/regression/test_issue4042.py
index 00a8882d3..6644a8eda 100644
--- a/spacy/tests/regression/test_issue4042.py
+++ b/spacy/tests/regression/test_issue4042.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import spacy
from spacy.pipeline import EntityRecognizer, EntityRuler
from spacy.lang.en import English
diff --git a/spacy/tests/regression/test_issue4054.py b/spacy/tests/regression/test_issue4054.py
index cc84cebf8..c52ded395 100644
--- a/spacy/tests/regression/test_issue4054.py
+++ b/spacy/tests/regression/test_issue4054.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from spacy.vocab import Vocab
import spacy
from spacy.lang.en import English
diff --git a/spacy/tests/regression/test_issue4120.py b/spacy/tests/regression/test_issue4120.py
index d288f46c4..4849aa238 100644
--- a/spacy/tests/regression/test_issue4120.py
+++ b/spacy/tests/regression/test_issue4120.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from spacy.matcher import Matcher
from spacy.tokens import Doc
diff --git a/spacy/tests/regression/test_issue4133.py b/spacy/tests/regression/test_issue4133.py
index 93262f8cf..a726806d7 100644
--- a/spacy/tests/regression/test_issue4133.py
+++ b/spacy/tests/regression/test_issue4133.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from spacy.lang.en import English
from spacy.tokens import Doc
from spacy.vocab import Vocab
diff --git a/spacy/tests/regression/test_issue4190.py b/spacy/tests/regression/test_issue4190.py
index eb4eb8648..97d532d2a 100644
--- a/spacy/tests/regression/test_issue4190.py
+++ b/spacy/tests/regression/test_issue4190.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from spacy.lang.en import English
from spacy.tokenizer import Tokenizer
from spacy import util
diff --git a/spacy/tests/regression/test_issue4267.py b/spacy/tests/regression/test_issue4267.py
index ef871bf9f..891f03b30 100644
--- a/spacy/tests/regression/test_issue4267.py
+++ b/spacy/tests/regression/test_issue4267.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from spacy.lang.en import English
from spacy.pipeline import EntityRuler
diff --git a/spacy/tests/regression/test_issue4272.py b/spacy/tests/regression/test_issue4272.py
index c57704d71..4bac97a44 100644
--- a/spacy/tests/regression/test_issue4272.py
+++ b/spacy/tests/regression/test_issue4272.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from spacy.lang.el import Greek
diff --git a/spacy/tests/regression/test_issue4278.py b/spacy/tests/regression/test_issue4278.py
index cb09340ff..ffbc41226 100644
--- a/spacy/tests/regression/test_issue4278.py
+++ b/spacy/tests/regression/test_issue4278.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import pytest
from spacy.language import Language
from spacy.pipeline import Pipe
diff --git a/spacy/tests/regression/test_issue4313.py b/spacy/tests/regression/test_issue4313.py
index c68f745a7..a3f6f69df 100644
--- a/spacy/tests/regression/test_issue4313.py
+++ b/spacy/tests/regression/test_issue4313.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from collections import defaultdict
from spacy.pipeline import EntityRecognizer
diff --git a/spacy/tests/regression/test_issue4348.py b/spacy/tests/regression/test_issue4348.py
index 484d5d280..4978e0c8e 100644
--- a/spacy/tests/regression/test_issue4348.py
+++ b/spacy/tests/regression/test_issue4348.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from spacy.lang.en import English
from spacy.util import minibatch, compounding
import pytest
diff --git a/spacy/tests/regression/test_issue4367.py b/spacy/tests/regression/test_issue4367.py
index ab6192744..917847a05 100644
--- a/spacy/tests/regression/test_issue4367.py
+++ b/spacy/tests/regression/test_issue4367.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from spacy.tokens import DocBin
diff --git a/spacy/tests/regression/test_issue4373.py b/spacy/tests/regression/test_issue4373.py
index 57d7547da..dbde1624e 100644
--- a/spacy/tests/regression/test_issue4373.py
+++ b/spacy/tests/regression/test_issue4373.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from spacy.matcher import Matcher, PhraseMatcher
from spacy.vocab import Vocab
diff --git a/spacy/tests/regression/test_issue4402.py b/spacy/tests/regression/test_issue4402.py
index 89332ca2f..80d37b1e6 100644
--- a/spacy/tests/regression/test_issue4402.py
+++ b/spacy/tests/regression/test_issue4402.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import srsly
from spacy.gold import GoldCorpus
from spacy.lang.en import English
diff --git a/spacy/tests/regression/test_issue4528.py b/spacy/tests/regression/test_issue4528.py
index 460449003..6f96c9f2d 100644
--- a/spacy/tests/regression/test_issue4528.py
+++ b/spacy/tests/regression/test_issue4528.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from spacy.tokens import Doc, DocBin
diff --git a/spacy/tests/regression/test_issue4529.py b/spacy/tests/regression/test_issue4529.py
index 381957be6..fa962c053 100644
--- a/spacy/tests/regression/test_issue4529.py
+++ b/spacy/tests/regression/test_issue4529.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import pytest
from spacy.gold import GoldParse
diff --git a/spacy/tests/regression/test_issue4590.py b/spacy/tests/regression/test_issue4590.py
index 8ec9a0bd1..74bb5de10 100644
--- a/spacy/tests/regression/test_issue4590.py
+++ b/spacy/tests/regression/test_issue4590.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
from mock import Mock
from spacy.matcher import DependencyMatcher
from ..util import get_doc
diff --git a/spacy/tests/regression/test_issue4651.py b/spacy/tests/regression/test_issue4651.py
index eb49f4a38..3f6c1a57c 100644
--- a/spacy/tests/regression/test_issue4651.py
+++ b/spacy/tests/regression/test_issue4651.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
from spacy.lang.en import English
from spacy.pipeline import EntityRuler
diff --git a/spacy/tests/regression/test_issue4674.py b/spacy/tests/regression/test_issue4674.py
index 8fa4f9259..149e1431b 100644
--- a/spacy/tests/regression/test_issue4674.py
+++ b/spacy/tests/regression/test_issue4674.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
from spacy.kb import KnowledgeBase
from spacy.util import ensure_path
diff --git a/spacy/tests/regression/test_issue4707.py b/spacy/tests/regression/test_issue4707.py
index e710881d7..d9798ef84 100644
--- a/spacy/tests/regression/test_issue4707.py
+++ b/spacy/tests/regression/test_issue4707.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from spacy.util import load_model_from_path
from spacy.lang.en import English
diff --git a/spacy/tests/serialize/test_serialize_doc.py b/spacy/tests/serialize/test_serialize_doc.py
index ef2b1ee89..615bb1cd9 100644
--- a/spacy/tests/serialize/test_serialize_doc.py
+++ b/spacy/tests/serialize/test_serialize_doc.py
@@ -1,13 +1,7 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import spacy
-
import pytest
-
from spacy.lang.en import English
from spacy.tokens import Doc, DocBin
-from spacy.compat import path2str
from ..util import make_tempdir
@@ -43,7 +37,7 @@ def test_serialize_doc_roundtrip_disk_str_path(en_vocab):
doc = Doc(en_vocab, words=["hello", "world"])
with make_tempdir() as d:
file_path = d / "doc"
- file_path = path2str(file_path)
+ file_path = str(file_path)
doc.to_disk(file_path)
doc_d = Doc(en_vocab).from_disk(file_path)
assert doc.to_bytes() == doc_d.to_bytes()
diff --git a/spacy/tests/serialize/test_serialize_extension_attrs.py b/spacy/tests/serialize/test_serialize_extension_attrs.py
index 1881b7d0c..b8a31ab5e 100644
--- a/spacy/tests/serialize/test_serialize_extension_attrs.py
+++ b/spacy/tests/serialize/test_serialize_extension_attrs.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
from spacy.tokens import Doc
from spacy.vocab import Vocab
@@ -10,9 +7,7 @@ from spacy.vocab import Vocab
def doc_w_attrs(en_tokenizer):
Doc.set_extension("_test_attr", default=False)
Doc.set_extension("_test_prop", getter=lambda doc: len(doc.text))
- Doc.set_extension(
- "_test_method", method=lambda doc, arg: "{}{}".format(len(doc.text), arg)
- )
+ Doc.set_extension("_test_method", method=lambda doc, arg: f"{len(doc.text)}{arg}")
doc = en_tokenizer("This is a test.")
doc._._test_attr = "test"
return doc
@@ -24,4 +19,4 @@ def test_serialize_ext_attrs_from_bytes(doc_w_attrs):
assert doc._.has("_test_attr")
assert doc._._test_attr == "test"
assert doc._._test_prop == len(doc.text)
- assert doc._._test_method("test") == "{}{}".format(len(doc.text), "test")
+ assert doc._._test_method("test") == f"{len(doc.text)}test"
diff --git a/spacy/tests/serialize/test_serialize_kb.py b/spacy/tests/serialize/test_serialize_kb.py
index b19c11864..91036a496 100644
--- a/spacy/tests/serialize/test_serialize_kb.py
+++ b/spacy/tests/serialize/test_serialize_kb.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
from spacy.util import ensure_path
from spacy.kb import KnowledgeBase
diff --git a/spacy/tests/serialize/test_serialize_language.py b/spacy/tests/serialize/test_serialize_language.py
index efc5d181c..4089a0d07 100644
--- a/spacy/tests/serialize/test_serialize_language.py
+++ b/spacy/tests/serialize/test_serialize_language.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
import re
from spacy.language import Language
diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py
index 797fa95f8..0ad9bc4d4 100644
--- a/spacy/tests/serialize/test_serialize_pipeline.py
+++ b/spacy/tests/serialize/test_serialize_pipeline.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
from spacy.pipeline import Tagger, DependencyParser, EntityRecognizer
from spacy.pipeline import Tensorizer, TextCategorizer, SentenceRecognizer
diff --git a/spacy/tests/serialize/test_serialize_tokenizer.py b/spacy/tests/serialize/test_serialize_tokenizer.py
index 9a273980c..f504ed048 100644
--- a/spacy/tests/serialize/test_serialize_tokenizer.py
+++ b/spacy/tests/serialize/test_serialize_tokenizer.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
from spacy.util import get_lang_class
from spacy.tokenizer import Tokenizer
diff --git a/spacy/tests/serialize/test_serialize_vocab_strings.py b/spacy/tests/serialize/test_serialize_vocab_strings.py
index 1671845ee..359a0657f 100644
--- a/spacy/tests/serialize/test_serialize_vocab_strings.py
+++ b/spacy/tests/serialize/test_serialize_vocab_strings.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
from spacy.vocab import Vocab
from spacy.strings import StringStore
diff --git a/spacy/tests/test_architectures.py b/spacy/tests/test_architectures.py
index 77f1af020..ad56e4c54 100644
--- a/spacy/tests/test_architectures.py
+++ b/spacy/tests/test_architectures.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import pytest
from spacy import registry
from thinc.v2v import Affine
diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py
index 3b75e760a..b4aebe521 100644
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
from spacy.lang.en import English
diff --git a/spacy/tests/test_displacy.py b/spacy/tests/test_displacy.py
index 2d1f1bd8f..4436b437f 100644
--- a/spacy/tests/test_displacy.py
+++ b/spacy/tests/test_displacy.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
from spacy import displacy
from spacy.displacy.render import DependencyRenderer
@@ -80,10 +77,10 @@ def test_displacy_rtl():
html = displacy.render(doc, page=True, style="dep")
assert "direction: rtl" in html
assert 'direction="rtl"' in html
- assert 'lang="{}"'.format(nlp.lang) in html
+ assert f'lang="{nlp.lang}"' in html
html = displacy.render(doc, page=True, style="ent")
assert "direction: rtl" in html
- assert 'lang="{}"'.format(nlp.lang) in html
+ assert f'lang="{nlp.lang}"' in html
def test_displacy_render_wrapper(en_vocab):
diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py
index 9d644d062..46c54b879 100644
--- a/spacy/tests/test_gold.py
+++ b/spacy/tests/test_gold.py
@@ -1,9 +1,10 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import spacy
from spacy.errors import AlignmentError
-from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags, Example, DocAnnotation
+from spacy.gold import (
+ biluo_tags_from_offsets,
+ offsets_from_biluo_tags,
+ Example,
+ DocAnnotation,
+)
from spacy.gold import spans_from_biluo_tags, GoldParse, iob_to_biluo
from spacy.gold import GoldCorpus, docs_to_json, align
from spacy.lang.en import English
@@ -14,14 +15,37 @@ from .util import make_tempdir
import pytest
import srsly
+
@pytest.fixture
def doc():
text = "Sarah's sister flew to Silicon Valley via London."
- tags = ['NNP', 'POS', 'NN', 'VBD', 'IN', 'NNP', 'NNP', 'IN', 'NNP', '.']
+ tags = ["NNP", "POS", "NN", "VBD", "IN", "NNP", "NNP", "IN", "NNP", "."]
# head of '.' is intentionally nonprojective for testing
heads = [2, 0, 3, 3, 3, 6, 4, 3, 7, 5]
- deps = ['poss', 'case', 'nsubj', 'ROOT', 'prep', 'compound', 'pobj', 'prep', 'pobj', 'punct']
- lemmas = ['Sarah', "'s", 'sister', 'fly', 'to', 'Silicon', 'Valley', 'via', 'London', '.']
+ deps = [
+ "poss",
+ "case",
+ "nsubj",
+ "ROOT",
+ "prep",
+ "compound",
+ "pobj",
+ "prep",
+ "pobj",
+ "punct",
+ ]
+ lemmas = [
+ "Sarah",
+ "'s",
+ "sister",
+ "fly",
+ "to",
+ "Silicon",
+ "Valley",
+ "via",
+ "London",
+ ".",
+ ]
biluo_tags = ["U-PERSON", "O", "O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
cats = {"TRAVEL": 1.0, "BAKING": 0.0}
nlp = English()
@@ -45,7 +69,7 @@ def merged_dict():
"words": ["Hi", "there", "everyone", "It", "is", "just", "me"],
"tags": ["INTJ", "ADV", "PRON", "PRON", "AUX", "ADV", "PRON"],
"sent_starts": [1, 0, 0, 1, 0, 0, 0, 0],
- }
+ }
def test_gold_biluo_U(en_vocab):
@@ -141,7 +165,9 @@ def test_roundtrip_docs_to_json(doc):
deps = [t.dep_ for t in doc]
heads = [t.head.i for t in doc]
lemmas = [t.lemma_ for t in doc]
- biluo_tags = iob_to_biluo([t.ent_iob_ + "-" + t.ent_type_ if t.ent_type_ else "O" for t in doc])
+ biluo_tags = iob_to_biluo(
+ [t.ent_iob_ + "-" + t.ent_type_ if t.ent_type_ else "O" for t in doc]
+ )
cats = doc.cats
# roundtrip to JSON
@@ -214,7 +240,6 @@ def test_roundtrip_docs_to_json(doc):
def test_projective_train_vs_nonprojective_dev(doc):
nlp = English()
- text = doc.text
deps = [t.dep_ for t in doc]
heads = [t.head.i for t in doc]
@@ -244,9 +269,6 @@ def test_projective_train_vs_nonprojective_dev(doc):
def test_ignore_misaligned(doc):
nlp = English()
text = doc.text
- deps = [t.dep_ for t in doc]
- heads = [t.head.i for t in doc]
-
with make_tempdir() as tmpdir:
jsonl_file = tmpdir / "test.jsonl"
data = [docs_to_json(doc)]
@@ -268,17 +290,12 @@ def test_ignore_misaligned(doc):
# doesn't raise an AlignmentError, but there is nothing to iterate over
# because the only example can't be aligned
- train_reloaded_example = list(goldcorpus.train_dataset(nlp,
- ignore_misaligned=True))
+ train_reloaded_example = list(goldcorpus.train_dataset(nlp, ignore_misaligned=True))
assert len(train_reloaded_example) == 0
def test_make_orth_variants(doc):
nlp = English()
- text = doc.text
- deps = [t.dep_ for t in doc]
- heads = [t.head.i for t in doc]
-
with make_tempdir() as tmpdir:
jsonl_file = tmpdir / "test.jsonl"
# write to JSONL train dicts
@@ -286,9 +303,8 @@ def test_make_orth_variants(doc):
goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))
# due to randomness, test only that this runs with no errors for now
- train_reloaded_example = next(goldcorpus.train_dataset(nlp,
- orth_variant_level=0.2))
- train_goldparse = train_reloaded_example.gold
+ train_reloaded_example = next(goldcorpus.train_dataset(nlp, orth_variant_level=0.2))
+ train_goldparse = train_reloaded_example.gold # noqa: F841
@pytest.mark.parametrize(
diff --git a/spacy/tests/test_json_schemas.py b/spacy/tests/test_json_schemas.py
index 89e797c1a..1330d3a65 100644
--- a/spacy/tests/test_json_schemas.py
+++ b/spacy/tests/test_json_schemas.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
from spacy.util import get_json_validator, validate_json, validate_schema
from spacy.cli._schemas import META_SCHEMA, TRAINING_SCHEMA
from spacy.matcher._schemas import TOKEN_PATTERN_SCHEMA
diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py
index 102b87142..58db0a040 100644
--- a/spacy/tests/test_language.py
+++ b/spacy/tests/test_language.py
@@ -1,10 +1,5 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import itertools
-
import pytest
-from spacy.compat import is_python2
from spacy.gold import GoldParse
from spacy.language import Language
from spacy.tokens import Doc, Span
@@ -134,9 +129,6 @@ def test_language_pipe(nlp2, n_process, texts):
assert_docs_equal(doc, expected_doc)
-@pytest.mark.skipif(
- is_python2, reason="python2 seems to be unable to handle iterator properly"
-)
@pytest.mark.parametrize("n_process", [1, 2])
def test_language_pipe_stream(nlp2, n_process, texts):
# check if nlp.pipe can handle infinite length iterator properly.
diff --git a/spacy/tests/test_lemmatizer.py b/spacy/tests/test_lemmatizer.py
index 701222afc..c2534ca22 100644
--- a/spacy/tests/test_lemmatizer.py
+++ b/spacy/tests/test_lemmatizer.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import pytest
from spacy.tokens import Doc
from spacy.language import Language
diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py
index 4075ccf64..09e0fb561 100644
--- a/spacy/tests/test_misc.py
+++ b/spacy/tests/test_misc.py
@@ -1,13 +1,10 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
import os
import ctypes
from pathlib import Path
from spacy import util
from spacy import prefer_gpu, require_gpu
-from spacy.compat import symlink_to, symlink_remove, path2str, is_windows
+from spacy.compat import symlink_to, symlink_remove, is_windows
from spacy._ml import PrecomputableAffine
from subprocess import CalledProcessError
@@ -25,7 +22,7 @@ def symlink():
@pytest.fixture(scope="function")
def symlink_setup_target(request, symlink_target, symlink):
if not symlink_target.exists():
- os.mkdir(path2str(symlink_target))
+ os.mkdir(str(symlink_target))
# yield -- need to cleanup even if assertion fails
# https://github.com/pytest-dev/pytest/issues/2508#issuecomment-309934240
@@ -33,7 +30,7 @@ def symlink_setup_target(request, symlink_target, symlink):
# Remove symlink only if it was created
if symlink.exists():
symlink_remove(symlink)
- os.rmdir(path2str(symlink_target))
+ os.rmdir(str(symlink_target))
request.addfinalizer(cleanup)
diff --git a/spacy/tests/test_pickles.py b/spacy/tests/test_pickles.py
index 65288527a..e4c67b672 100644
--- a/spacy/tests/test_pickles.py
+++ b/spacy/tests/test_pickles.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
import numpy
import srsly
diff --git a/spacy/tests/test_scorer.py b/spacy/tests/test_scorer.py
index 888028b6c..efaf80b4f 100644
--- a/spacy/tests/test_scorer.py
+++ b/spacy/tests/test_scorer.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
from numpy.testing import assert_almost_equal, assert_array_almost_equal
import pytest
from pytest import approx
diff --git a/spacy/tests/test_tok2vec.py b/spacy/tests/test_tok2vec.py
index ddaa71059..473d5017d 100644
--- a/spacy/tests/test_tok2vec.py
+++ b/spacy/tests/test_tok2vec.py
@@ -1,12 +1,8 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
from spacy._ml import Tok2Vec
from spacy.vocab import Vocab
from spacy.tokens import Doc
-from spacy.compat import unicode_
def get_batch(batch_size):
@@ -16,7 +12,7 @@ def get_batch(batch_size):
for size in range(1, batch_size + 1):
# Make the words numbers, so that they're distinct
# across the batch, and easy to track.
- numbers = [unicode_(i) for i in range(start, start + size)]
+ numbers = [str(i) for i in range(start, start + size)]
docs.append(Doc(vocab, words=numbers))
start += size
return docs
diff --git a/spacy/tests/tokenizer/test_exceptions.py b/spacy/tests/tokenizer/test_exceptions.py
index c2011487e..8276d7aea 100644
--- a/spacy/tests/tokenizer/test_exceptions.py
+++ b/spacy/tests/tokenizer/test_exceptions.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import sys
import pytest
diff --git a/spacy/tests/tokenizer/test_explain.py b/spacy/tests/tokenizer/test_explain.py
index 2d71588cc..3e7681234 100644
--- a/spacy/tests/tokenizer/test_explain.py
+++ b/spacy/tests/tokenizer/test_explain.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
from spacy.util import get_lang_class
@@ -58,7 +55,7 @@ LANGUAGES = [
@pytest.mark.parametrize("lang", LANGUAGES)
def test_tokenizer_explain(lang):
tokenizer = get_lang_class(lang).Defaults.create_tokenizer()
- examples = pytest.importorskip("spacy.lang.{}.examples".format(lang))
+ examples = pytest.importorskip(f"spacy.lang.{lang}.examples")
for sentence in examples.sentences:
tokens = [t.text for t in tokenizer(sentence) if not t.is_space]
debug_tokens = [t[1] for t in tokenizer.explain(sentence)]
diff --git a/spacy/tests/tokenizer/test_naughty_strings.py b/spacy/tests/tokenizer/test_naughty_strings.py
index 36c69611e..e93d5654f 100644
--- a/spacy/tests/tokenizer/test_naughty_strings.py
+++ b/spacy/tests/tokenizer/test_naughty_strings.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
# Examples taken from the "Big List of Naughty Strings"
diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/tokenizer/test_tokenizer.py
index 5ac681c5e..3dce1ae31 100644
--- a/spacy/tests/tokenizer/test_tokenizer.py
+++ b/spacy/tests/tokenizer/test_tokenizer.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
from spacy.vocab import Vocab
from spacy.tokenizer import Tokenizer
diff --git a/spacy/tests/tokenizer/test_urls.py b/spacy/tests/tokenizer/test_urls.py
index e2c0e3de8..9f673d5d8 100644
--- a/spacy/tests/tokenizer/test_urls.py
+++ b/spacy/tests/tokenizer/test_urls.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
from spacy.lang.tokenizer_exceptions import BASE_EXCEPTIONS
diff --git a/spacy/tests/tokenizer/test_whitespace.py b/spacy/tests/tokenizer/test_whitespace.py
index 74c9b369b..c7b9d7c6d 100644
--- a/spacy/tests/tokenizer/test_whitespace.py
+++ b/spacy/tests/tokenizer/test_whitespace.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
diff --git a/spacy/tests/util.py b/spacy/tests/util.py
index 175480fe7..0516e9272 100644
--- a/spacy/tests/util.py
+++ b/spacy/tests/util.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import numpy
import tempfile
import shutil
@@ -9,7 +6,6 @@ import srsly
from pathlib import Path
from spacy.tokens import Doc, Span
from spacy.attrs import POS, HEAD, DEP
-from spacy.compat import path2str
@contextlib.contextmanager
@@ -23,7 +19,7 @@ def make_tempfile(mode="r"):
def make_tempdir():
d = Path(tempfile.mkdtemp())
yield d
- shutil.rmtree(path2str(d))
+ shutil.rmtree(str(d))
def get_doc(vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None):
diff --git a/spacy/tests/vocab_vectors/test_lexeme.py b/spacy/tests/vocab_vectors/test_lexeme.py
index d84a56981..e033aa7c6 100644
--- a/spacy/tests/vocab_vectors/test_lexeme.py
+++ b/spacy/tests/vocab_vectors/test_lexeme.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
from spacy.attrs import IS_ALPHA, IS_DIGIT
diff --git a/spacy/tests/vocab_vectors/test_lookups.py b/spacy/tests/vocab_vectors/test_lookups.py
index f78dd33c4..fff3d24ef 100644
--- a/spacy/tests/vocab_vectors/test_lookups.py
+++ b/spacy/tests/vocab_vectors/test_lookups.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
from spacy.lookups import Lookups, Table
from spacy.strings import get_string_id
diff --git a/spacy/tests/vocab_vectors/test_similarity.py b/spacy/tests/vocab_vectors/test_similarity.py
index f98f0e6e0..b5f7303b5 100644
--- a/spacy/tests/vocab_vectors/test_similarity.py
+++ b/spacy/tests/vocab_vectors/test_similarity.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
import numpy
from spacy.tokens import Doc
diff --git a/spacy/tests/vocab_vectors/test_stringstore.py b/spacy/tests/vocab_vectors/test_stringstore.py
index 75b1116dd..c71d5f3f2 100644
--- a/spacy/tests/vocab_vectors/test_stringstore.py
+++ b/spacy/tests/vocab_vectors/test_stringstore.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
from spacy.strings import StringStore
diff --git a/spacy/tests/vocab_vectors/test_vectors.py b/spacy/tests/vocab_vectors/test_vectors.py
index b688ab9dd..8684ad018 100644
--- a/spacy/tests/vocab_vectors/test_vectors.py
+++ b/spacy/tests/vocab_vectors/test_vectors.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
import numpy
from numpy.testing import assert_allclose
diff --git a/spacy/tests/vocab_vectors/test_vocab_api.py b/spacy/tests/vocab_vectors/test_vocab_api.py
index d22db2d8b..a687059be 100644
--- a/spacy/tests/vocab_vectors/test_vocab_api.py
+++ b/spacy/tests/vocab_vectors/test_vocab_api.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
from spacy.attrs import LEMMA, ORTH, PROB, IS_ALPHA
from spacy.parts_of_speech import NOUN, VERB
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index f0120c708..7491a11fc 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -1,8 +1,5 @@
# cython: embedsignature=True
# cython: profile=True
-# coding: utf8
-from __future__ import unicode_literals
-
from cython.operator cimport dereference as deref
from cython.operator cimport preincrement as preinc
from libc.string cimport memcpy, memset
@@ -11,22 +8,20 @@ from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
cimport cython
-from collections import OrderedDict
import re
from .tokens.doc cimport Doc
from .strings cimport hash_string
-from .compat import unescape_unicode
from .attrs import intify_attrs
from .symbols import ORTH
from .errors import Errors, Warnings, deprecation_warning
from . import util
-
from .attrs import intify_attrs
from .lexeme cimport EMPTY_LEXEME
from .symbols import ORTH
+
cdef class Tokenizer:
"""Segment text, and create Doc objects with the discovered segment
boundaries.
@@ -728,14 +723,14 @@ cdef class Tokenizer:
DOCS: https://spacy.io/api/tokenizer#to_bytes
"""
- serializers = OrderedDict((
- ("vocab", lambda: self.vocab.to_bytes()),
- ("prefix_search", lambda: _get_regex_pattern(self.prefix_search)),
- ("suffix_search", lambda: _get_regex_pattern(self.suffix_search)),
- ("infix_finditer", lambda: _get_regex_pattern(self.infix_finditer)),
- ("token_match", lambda: _get_regex_pattern(self.token_match)),
- ("exceptions", lambda: OrderedDict(sorted(self._rules.items())))
- ))
+ serializers = {
+ "vocab": lambda: self.vocab.to_bytes(),
+ "prefix_search": lambda: _get_regex_pattern(self.prefix_search),
+ "suffix_search": lambda: _get_regex_pattern(self.suffix_search),
+ "infix_finditer": lambda: _get_regex_pattern(self.infix_finditer),
+ "token_match": lambda: _get_regex_pattern(self.token_match),
+ "exceptions": lambda: dict(sorted(self._rules.items()))
+ }
exclude = util.get_serialization_exclude(serializers, exclude, kwargs)
return util.to_bytes(serializers, exclude)
@@ -748,20 +743,17 @@ cdef class Tokenizer:
DOCS: https://spacy.io/api/tokenizer#from_bytes
"""
- data = OrderedDict()
- deserializers = OrderedDict((
- ("vocab", lambda b: self.vocab.from_bytes(b)),
- ("prefix_search", lambda b: data.setdefault("prefix_search", b)),
- ("suffix_search", lambda b: data.setdefault("suffix_search", b)),
- ("infix_finditer", lambda b: data.setdefault("infix_finditer", b)),
- ("token_match", lambda b: data.setdefault("token_match", b)),
- ("exceptions", lambda b: data.setdefault("rules", b))
- ))
+ data = {}
+ deserializers = {
+ "vocab": lambda b: self.vocab.from_bytes(b),
+ "prefix_search": lambda b: data.setdefault("prefix_search", b),
+ "suffix_search": lambda b: data.setdefault("suffix_search", b),
+ "infix_finditer": lambda b: data.setdefault("infix_finditer", b),
+ "token_match": lambda b: data.setdefault("token_match", b),
+ "exceptions": lambda b: data.setdefault("rules", b)
+ }
exclude = util.get_serialization_exclude(deserializers, exclude, kwargs)
msg = util.from_bytes(bytes_data, deserializers, exclude)
- for key in ["prefix_search", "suffix_search", "infix_finditer"]:
- if key in data:
- data[key] = unescape_unicode(data[key])
if data.get("prefix_search"):
self.prefix_search = re.compile(data["prefix_search"]).search
if data.get("suffix_search"):
diff --git a/spacy/tokens/__init__.py b/spacy/tokens/__init__.py
index 536ec8349..88428709b 100644
--- a/spacy/tokens/__init__.py
+++ b/spacy/tokens/__init__.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from .doc import Doc
from .token import Token
from .span import Span
diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx
index a5d06491a..12690ba50 100644
--- a/spacy/tokens/_retokenize.pyx
+++ b/spacy/tokens/_retokenize.pyx
@@ -1,9 +1,6 @@
-# coding: utf8
# cython: infer_types=True
# cython: bounds_check=False
# cython: profile=True
-from __future__ import unicode_literals
-
from libc.string cimport memcpy, memset
from libc.stdlib cimport malloc, free
from cymem.cymem cimport Pool
diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py
index b60a6d7b3..d7348659d 100644
--- a/spacy/tokens/_serialize.py
+++ b/spacy/tokens/_serialize.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import numpy
import zlib
import srsly
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 716df1087..58423c420 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -1,10 +1,6 @@
-
-# coding: utf8
# cython: infer_types=True
# cython: bounds_check=False
# cython: profile=True
-from __future__ import unicode_literals
-
cimport cython
cimport numpy as np
from libc.string cimport memcpy, memset
@@ -28,7 +24,7 @@ from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t
from ..attrs import intify_attrs, IDS
from ..util import normalize_slice
-from ..compat import is_config, copy_reg, pickle, basestring_
+from ..compat import copy_reg, pickle
from ..errors import deprecation_warning, models_warning, user_warning
from ..errors import Errors, Warnings
from .. import util
@@ -327,9 +323,7 @@ cdef class Doc:
return "".join([t.text_with_ws for t in self]).encode("utf-8")
def __str__(self):
- if is_config(python3=True):
- return self.__unicode__()
- return self.__bytes__()
+ return self.__unicode__()
def __repr__(self):
return self.__str__()
@@ -683,7 +677,7 @@ cdef class Doc:
cdef np.ndarray[attr_t, ndim=2] output
# Handle scalar/list inputs of strings/ints for py_attr_ids
# See also #3064
- if isinstance(py_attr_ids, basestring_):
+ if isinstance(py_attr_ids, str):
# Handle inputs like doc.to_array('ORTH')
py_attr_ids = [py_attr_ids]
elif not hasattr(py_attr_ids, "__iter__"):
@@ -772,7 +766,7 @@ cdef class Doc:
"""
# Handle scalar/list inputs of strings/ints for py_attr_ids
# See also #3064
- if isinstance(attrs, basestring_):
+ if isinstance(attrs, str):
# Handle inputs like doc.to_array('ORTH')
attrs = [attrs]
elif not hasattr(attrs, "__iter__"):
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 957e853ca..9e9322d65 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
cimport numpy as np
from libc.math cimport sqrt
@@ -20,7 +17,6 @@ from ..lexeme cimport Lexeme
from ..symbols cimport dep
from ..util import normalize_slice
-from ..compat import is_config, basestring_
from ..errors import Errors, TempErrors, Warnings, user_warning, models_warning
from ..errors import deprecation_warning
from .underscore import Underscore, get_ext_args
@@ -110,9 +106,9 @@ cdef class Span:
self.end_char = self.doc[end - 1].idx + len(self.doc[end - 1])
else:
self.end_char = 0
- if isinstance(label, basestring_):
+ if isinstance(label, str):
label = doc.vocab.strings.add(label)
- if isinstance(kb_id, basestring_):
+ if isinstance(kb_id, str):
kb_id = doc.vocab.strings.add(kb_id)
if label not in doc.vocab.strings:
raise ValueError(Errors.E084.format(label=label))
@@ -157,9 +153,7 @@ cdef class Span:
return self.end - self.start
def __repr__(self):
- if is_config(python3=True):
- return self.text
- return self.text.encode("utf-8")
+ return self.text
def __getitem__(self, object i):
"""Get a `Token` or a `Span` object
@@ -478,7 +472,7 @@ cdef class Span:
@property
def tensor(self):
"""The span's slice of the doc's tensor.
-
+
RETURNS (ndarray[ndim=2, dtype='float32']): A 2D numpy or cupy array
representing the span's semantics.
"""
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index 8b15a4223..8e6290187 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -1,7 +1,4 @@
# cython: infer_types=True
-# coding: utf8
-from __future__ import unicode_literals
-
from libc.string cimport memcpy
from cpython.mem cimport PyMem_Malloc, PyMem_Free
# Compiler crashes on memory view coercion without this. Should report bug.
@@ -23,7 +20,6 @@ from ..symbols cimport conj
from .. import parts_of_speech
from .. import util
-from ..compat import is_config
from ..errors import Errors, Warnings, user_warning, models_warning
from .underscore import Underscore, get_ext_args
from .morphanalysis cimport MorphAnalysis
@@ -122,9 +118,7 @@ cdef class Token:
return self.text.encode('utf8')
def __str__(self):
- if is_config(python3=True):
- return self.__unicode__()
- return self.__bytes__()
+ return self.__unicode__()
def __repr__(self):
return self.__str__()
diff --git a/spacy/tokens/underscore.py b/spacy/tokens/underscore.py
index b36fe9294..328851945 100644
--- a/spacy/tokens/underscore.py
+++ b/spacy/tokens/underscore.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import functools
import copy
diff --git a/spacy/util.py b/spacy/util.py
index 693136bc1..4e6c10e2b 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -1,12 +1,9 @@
-# coding: utf8
-from __future__ import unicode_literals, print_function
-
import os
import importlib
+import importlib.util
import re
from pathlib import Path
import random
-from collections import OrderedDict
from thinc.neural._classes.model import Model
from thinc.neural.ops import NumpyOps
import functools
@@ -27,8 +24,7 @@ except ImportError:
cupy = None
from .symbols import ORTH
-from .compat import cupy, CudaStream, path2str, basestring_, unicode_
-from .compat import import_file
+from .compat import cupy, CudaStream
from .errors import Errors, Warnings, deprecation_warning
@@ -119,7 +115,7 @@ def ensure_path(path):
path: Anything. If string, it's converted to Path.
RETURNS: Path or original argument.
"""
- if isinstance(path, basestring_):
+ if isinstance(path, str):
return Path(path)
else:
return path
@@ -138,7 +134,7 @@ def load_language_data(path):
path = path.with_suffix(path.suffix + ".gz")
if path.exists():
return srsly.read_gzip_json(path)
- raise ValueError(Errors.E160.format(path=path2str(path)))
+ raise ValueError(Errors.E160.format(path=path))
def get_module_path(module):
@@ -156,8 +152,8 @@ def load_model(name, **overrides):
"""
data_path = get_data_path()
if not data_path or not data_path.exists():
- raise IOError(Errors.E049.format(path=path2str(data_path)))
- if isinstance(name, basestring_): # in data dir / shortcut
+ raise IOError(Errors.E049.format(path=data_path))
+ if isinstance(name, str): # in data dir / shortcut
if name in set([d.name for d in data_path.iterdir()]):
return load_model_from_link(name, **overrides)
if is_package(name): # installed as package
@@ -224,7 +220,7 @@ def load_model_from_init_py(init_file, **overrides):
data_dir = "%s_%s-%s" % (meta["lang"], meta["name"], meta["version"])
data_path = model_path / data_dir
if not model_path.exists():
- raise IOError(Errors.E052.format(path=path2str(data_path)))
+ raise IOError(Errors.E052.format(path=data_path))
return load_model_from_path(data_path, meta, **overrides)
@@ -236,7 +232,7 @@ def get_model_meta(path):
"""
model_path = ensure_path(path)
if not model_path.exists():
- raise IOError(Errors.E052.format(path=path2str(model_path)))
+ raise IOError(Errors.E052.format(path=model_path))
meta_path = model_path / "meta.json"
if not meta_path.is_file():
raise IOError(Errors.E053.format(path=meta_path))
@@ -417,7 +413,7 @@ def update_exc(base_exceptions, *addition_dicts):
exc = dict(base_exceptions)
for additions in addition_dicts:
for orth, token_attrs in additions.items():
- if not all(isinstance(attr[ORTH], unicode_) for attr in token_attrs):
+ if not all(isinstance(attr[ORTH], str) for attr in token_attrs):
raise ValueError(Errors.E055.format(key=orth, orths=token_attrs))
described_orth = "".join(attr[ORTH] for attr in token_attrs)
if orth != described_orth:
@@ -612,7 +608,7 @@ def filter_spans(spans):
def to_bytes(getters, exclude):
- serialized = OrderedDict()
+ serialized = {}
for key, getter in getters.items():
# Split to support file names like meta.json
if key.split(".")[0] not in exclude:
@@ -649,6 +645,20 @@ def from_disk(path, readers, exclude):
return path
+def import_file(name, loc):
+ """Import module from a file. Used to load models from a directory.
+
+ name (unicode): Name of module to load.
+ loc (unicode / Path): Path to the file.
+ RETURNS: The loaded module.
+ """
+ loc = str(loc)
+ spec = importlib.util.spec_from_file_location(name, str(loc))
+ module = importlib.util.module_from_spec(spec)
+ spec.loader.exec_module(module)
+ return module
+
+
def minify_html(html):
"""Perform a template-specific, rudimentary HTML minification for displaCy.
Disclaimer: NOT a general-purpose solution, only removes indentation and
@@ -726,8 +736,8 @@ def validate_json(data, validator):
err_path = ""
msg = err.message + " " + err_path
if err.context: # Error has suberrors, e.g. if schema uses anyOf
- suberrs = [" - {}".format(suberr.message) for suberr in err.context]
- msg += ":\n{}".format("".join(suberrs))
+ suberrs = [f" - {suberr.message}" for suberr in err.context]
+ msg += f":\n{''.join(suberrs)}"
errors.append(msg)
return errors
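The new `util.import_file` above is a thin wrapper around the stdlib; a standalone sketch of the same pattern (module name and path are purely illustrative):

```python
import importlib.util

def load_module_from_path(name, loc):
    # Stdlib pattern: build a spec from a file location, create the
    # module from the spec, then execute it.
    spec = importlib.util.spec_from_file_location(name, str(loc))
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module

# e.g. mod = load_module_from_path("my_model", "/path/to/my_model/__init__.py")
```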
diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx
index 6b26bf123..b12c8d833 100644
--- a/spacy/vectors.pyx
+++ b/spacy/vectors.pyx
@@ -1,13 +1,9 @@
-# coding: utf8
-from __future__ import unicode_literals
-
cimport numpy as np
from cython.operator cimport dereference as deref
from libcpp.set cimport set as cppset
import functools
import numpy
-from collections import OrderedDict
import srsly
from thinc.neural.util import get_array_module
from thinc.neural._classes.model import Model
@@ -15,7 +11,6 @@ from thinc.neural._classes.model import Model
from .strings cimport StringStore
from .strings import get_string_id
-from .compat import basestring_, path2str
from .errors import Errors
from . import util
@@ -74,7 +69,7 @@ cdef class Vectors:
shape = (0,0)
data = numpy.zeros(shape, dtype="f")
self.data = data
- self.key2row = OrderedDict()
+ self.key2row = {}
if self.data is not None:
self._unset = cppset[int]({i for i in range(self.data.shape[0])})
else:
@@ -339,7 +334,7 @@ cdef class Vectors:
sorted_index = xp.arange(scores.shape[0])[:,None][i:i+batch_size],xp.argsort(scores[i:i+batch_size], axis=1)[:,::-1]
scores[i:i+batch_size] = scores[sorted_index]
best_rows[i:i+batch_size] = best_rows[sorted_index]
-
+
xp = get_array_module(self.data)
# Round values really close to 1 or -1
scores = xp.around(scores, decimals=4, out=scores)
@@ -347,7 +342,7 @@ cdef class Vectors:
scores = xp.clip(scores, a_min=-1, a_max=1, out=scores)
row2key = {row: key for key, row in self.key2row.items()}
keys = xp.asarray(
- [[row2key[row] for row in best_rows[i] if row in row2key]
+ [[row2key[row] for row in best_rows[i] if row in row2key]
for i in range(len(queries)) ], dtype="uint64")
return (keys, best_rows, scores)
@@ -372,7 +367,7 @@ cdef class Vectors:
break
else:
raise IOError(Errors.E061.format(filename=path))
- bin_loc = path / "vectors.{dims}.{dtype}.bin".format(dims=dims, dtype=dtype)
+ bin_loc = path / f"vectors.{dims}.{dtype}.bin"
xp = get_array_module(self.data)
self.data = None
with bin_loc.open("rb") as file_:
@@ -402,10 +397,10 @@ cdef class Vectors:
save_array = lambda arr, file_: xp.save(file_, arr, allow_pickle=False)
else:
save_array = lambda arr, file_: xp.save(file_, arr)
- serializers = OrderedDict((
- ("vectors", lambda p: save_array(self.data, p.open("wb"))),
- ("key2row", lambda p: srsly.write_msgpack(p, self.key2row))
- ))
+ serializers = {
+ "vectors": lambda p: save_array(self.data, p.open("wb")),
+ "key2row": lambda p: srsly.write_msgpack(p, self.key2row)
+ }
return util.to_disk(path, serializers, [])
def from_disk(self, path, **kwargs):
@@ -435,11 +430,11 @@ cdef class Vectors:
if path.exists():
self.data = xp.load(str(path))
- serializers = OrderedDict((
- ("key2row", load_key2row),
- ("keys", load_keys),
- ("vectors", load_vectors),
- ))
+ serializers = {
+ "key2row": load_key2row,
+ "keys": load_keys,
+ "vectors": load_vectors,
+ }
util.from_disk(path, serializers, [])
return self
@@ -457,10 +452,10 @@ cdef class Vectors:
else:
return srsly.msgpack_dumps(self.data)
- serializers = OrderedDict((
- ("key2row", lambda: srsly.msgpack_dumps(self.key2row)),
- ("vectors", serialize_weights)
- ))
+ serializers = {
+ "key2row": lambda: srsly.msgpack_dumps(self.key2row),
+ "vectors": serialize_weights
+ }
return util.to_bytes(serializers, [])
def from_bytes(self, data, **kwargs):
@@ -478,9 +473,9 @@ cdef class Vectors:
else:
self.data = srsly.msgpack_loads(b)
- deserializers = OrderedDict((
- ("key2row", lambda b: self.key2row.update(srsly.msgpack_loads(b))),
- ("vectors", deserialize_weights)
- ))
+ deserializers = {
+ "key2row": lambda b: self.key2row.update(srsly.msgpack_loads(b)),
+ "vectors": deserialize_weights
+ }
util.from_bytes(data, deserializers, [])
return self
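The public `Vectors` serialization API is unchanged by the dict conversion; a quick round-trip sketch under that assumption (the target directory is just a temp path):

```python
import tempfile
import numpy
from spacy.vectors import Vectors

data = numpy.zeros((3, 5), dtype="f")
vectors = Vectors(data=data, keys=["cat", "dog", "rat"])
tmp_dir = tempfile.mkdtemp()            # any writable directory
vectors.to_disk(tmp_dir)                # writes the vectors data and key2row
loaded = Vectors().from_disk(tmp_dir)
assert loaded.shape == (3, 5)
```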
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 3cf0095ee..c7e74f36c 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -1,10 +1,7 @@
-# coding: utf8
# cython: profile=True
-from __future__ import unicode_literals
from libc.string cimport memcpy
import srsly
-from collections import OrderedDict
from thinc.neural.util import get_array_module
from .lexeme cimport EMPTY_LEXEME
@@ -14,7 +11,7 @@ from .tokens.token cimport Token
from .attrs cimport PROB, LANG, ORTH, TAG, POS
from .structs cimport SerializedLexemeC
-from .compat import copy_reg, basestring_
+from .compat import copy_reg
from .errors import Errors
from .lemmatizer import Lemmatizer
from .attrs import intify_attrs, NORM
@@ -335,14 +332,14 @@ cdef class Vocab:
"""Retrieve a vector for a word in the vocabulary. Words can be looked
up by string or int ID. If no vectors data is loaded, ValueError is
raised.
-
- If `minn` is defined, then the resulting vector uses Fasttext's
+
+ If `minn` is defined, then the resulting vector uses Fasttext's
subword features by average over ngrams of `orth`.
orth (int / unicode): The hash value of a word, or its unicode string.
- minn (int): Minimum n-gram length used for Fasttext's ngram computation.
+ minn (int): Minimum n-gram length used for Fasttext's ngram computation.
Defaults to the length of `orth`.
- maxn (int): Maximum n-gram length used for Fasttext's ngram computation.
+ maxn (int): Maximum n-gram length used for Fasttext's ngram computation.
Defaults to the length of `orth`.
RETURNS (numpy.ndarray): A word vector. Size
and shape determined by the `vocab.vectors` instance. Usually, a
@@ -350,7 +347,7 @@ cdef class Vocab:
DOCS: https://spacy.io/api/vocab#get_vector
"""
- if isinstance(orth, basestring_):
+ if isinstance(orth, str):
orth = self.strings.add(orth)
word = self[orth].orth_
if orth in self.vectors.key2row:
@@ -397,7 +394,7 @@ cdef class Vocab:
DOCS: https://spacy.io/api/vocab#set_vector
"""
- if isinstance(orth, basestring_):
+ if isinstance(orth, str):
orth = self.strings.add(orth)
if self.vectors.is_full and orth not in self.vectors:
new_rows = max(100, int(self.vectors.shape[0]*1.3))
@@ -419,7 +416,7 @@ cdef class Vocab:
DOCS: https://spacy.io/api/vocab#has_vector
"""
- if isinstance(orth, basestring_):
+ if isinstance(orth, str):
orth = self.strings.add(orth)
return orth in self.vectors
@@ -488,12 +485,12 @@ cdef class Vocab:
else:
return self.vectors.to_bytes()
- getters = OrderedDict((
- ("strings", lambda: self.strings.to_bytes()),
- ("lexemes", lambda: self.lexemes_to_bytes()),
- ("vectors", deserialize_vectors),
- ("lookups", lambda: self.lookups.to_bytes())
- ))
+ getters = {
+ "strings": lambda: self.strings.to_bytes(),
+ "lexemes": lambda: self.lexemes_to_bytes(),
+ "vectors": deserialize_vectors,
+ "lookups": lambda: self.lookups.to_bytes()
+ }
exclude = util.get_serialization_exclude(getters, exclude, kwargs)
return util.to_bytes(getters, exclude)
@@ -512,12 +509,12 @@ cdef class Vocab:
else:
return self.vectors.from_bytes(b)
- setters = OrderedDict((
- ("strings", lambda b: self.strings.from_bytes(b)),
- ("lexemes", lambda b: self.lexemes_from_bytes(b)),
- ("vectors", lambda b: serialize_vectors(b)),
- ("lookups", lambda b: self.lookups.from_bytes(b))
- ))
+ setters = {
+ "strings": lambda b: self.strings.from_bytes(b),
+ "lexemes": lambda b: self.lexemes_from_bytes(b),
+ "vectors": lambda b: serialize_vectors(b),
+ "lookups": lambda b: self.lookups.from_bytes(b)
+ }
exclude = util.get_serialization_exclude(setters, exclude, kwargs)
util.from_bytes(bytes_data, setters, exclude)
if self.vectors.name is not None:
diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md
index 50ba0e3d9..c9c7a010c 100644
--- a/website/docs/api/top-level.md
+++ b/website/docs/api/top-level.md
@@ -677,50 +677,3 @@ of one entity) or when merging spans with
| ----------- | -------- | -------------------- |
| `spans` | iterable | The spans to filter. |
| **RETURNS** | list | The filtered spans. |
-
-## Compatibility functions {#compat source="spacy/compaty.py"}
-
-All Python code is written in an **intersection of Python 2 and Python 3**. This
-is easy in Cython, but somewhat ugly in Python. Logic that deals with Python or
-platform compatibility only lives in `spacy.compat`. To distinguish them from
-the builtin functions, replacement functions are suffixed with an underscore,
-e.g. `unicode_`.
-
-> #### Example
->
-> ```python
-> from spacy.compat import unicode_
->
-> compatible_unicode = unicode_("hello world")
-> ```
-
-| Name | Python 2 | Python 3 |
-| -------------------- | ---------------------------------- | ----------- |
-| `compat.bytes_` | `str` | `bytes` |
-| `compat.unicode_` | `unicode` | `str` |
-| `compat.basestring_` | `basestring` | `str` |
-| `compat.input_` | `raw_input` | `input` |
-| `compat.path2str` | `str(path)` with `.decode('utf8')` | `str(path)` |
-
-### compat.is_config {#compat.is_config tag="function"}
-
-Check if a specific configuration of Python version and operating system matches
-the user's setup. Mostly used to display targeted error messages.
-
-> #### Example
->
-> ```python
-> from spacy.compat import is_config
->
-> if is_config(python2=True, windows=True):
-> print("You are using Python 2 on Windows.")
-> ```
-
-| Name | Type | Description |
-| ----------- | ---- | ---------------------------------------------------------------- |
-| `python2` | bool | spaCy is executed with Python 2.x. |
-| `python3` | bool | spaCy is executed with Python 3.x. |
-| `windows` | bool | spaCy is executed on Windows. |
-| `linux` | bool | spaCy is executed on Linux. |
-| `osx` | bool | spaCy is executed on OS X or macOS. |
-| **RETURNS** | bool | Whether the specified configuration matches the user's platform. |
diff --git a/website/docs/usage/index.md b/website/docs/usage/index.md
index 2b0045bc3..6c398d584 100644
--- a/website/docs/usage/index.md
+++ b/website/docs/usage/index.md
@@ -8,9 +8,9 @@ menu:
- ['Changelog', 'changelog']
---
-spaCy is compatible with **64-bit CPython 2.7 / 3.5+** and runs on
-**Unix/Linux**, **macOS/OS X** and **Windows**. The latest spaCy releases are
-available over [pip](https://pypi.python.org/pypi/spacy) and
+spaCy is compatible with **64-bit CPython 3.6+** and runs on **Unix/Linux**,
+**macOS/OS X** and **Windows**. The latest spaCy releases are available over
+[pip](https://pypi.python.org/pypi/spacy) and
[conda](https://anaconda.org/conda-forge/spacy).
> #### 📖 Looking for the old docs?
@@ -207,14 +207,7 @@ Install a version of the
[Visual C++ Build Tools](https://visualstudio.microsoft.com/visual-cpp-build-tools/)
or
[Visual Studio Express](https://www.visualstudio.com/vs/visual-studio-express/)
-that matches the version that was used to compile your Python interpreter. For
-official distributions these are:
-
-| Distribution | Version |
-| ------------ | ------------------ |
-| Python 2.7 | Visual Studio 2008 |
-| Python 3.4 | Visual Studio 2010 |
-| Python 3.5+ | Visual Studio 2015 |
+that matches the version that was used to compile your Python interpreter.
### Run tests {#run-tests}
diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md
index b7b840999..7382f2b8c 100644
--- a/website/docs/usage/processing-pipelines.md
+++ b/website/docs/usage/processing-pipelines.md
@@ -367,7 +367,7 @@ tokens and a conditional message based on the document length.
import spacy
def my_component(doc):
- print("After tokenization, this doc has {} tokens.".format(len(doc)))
+ print(f"After tokenization, this doc has {len(doc)} tokens.")
print("The part-of-speech tags are:", [token.pos_ for token in doc])
if len(doc) < 10:
print("This is a pretty short document.")
@@ -602,7 +602,7 @@ There are three main types of extensions, which can be defined using the
[these examples](/usage/examples#custom-components-attr-methods).
```python
- Doc.set_extension("hello", method=lambda doc, name: "Hi {}!".format(name))
+ Doc.set_extension("hello", method=lambda doc, name: f"Hi {name}!")
assert doc._.hello("Bob") == "Hi Bob!"
```
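The updated snippet stays runnable end to end; a self-contained version of the same extension (assumes any pipeline works here, using a blank English one):

```python
import spacy
from spacy.tokens import Doc

Doc.set_extension("hello", method=lambda doc, name: f"Hi {name}!")
nlp = spacy.blank("en")   # assumption: any pipeline will do for this example
doc = nlp("This is a text.")
assert doc._.hello("Bob") == "Hi Bob!"
```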
diff --git a/website/docs/usage/spacy-101.md b/website/docs/usage/spacy-101.md
index 5a3a95a53..479bdd264 100644
--- a/website/docs/usage/spacy-101.md
+++ b/website/docs/usage/spacy-101.md
@@ -304,12 +304,6 @@ print(doc.vocab.strings["coffee"]) # 3197928453018144401
print(doc.vocab.strings[3197928453018144401]) # 'coffee'
```
-> #### What does 'L' at the end of a hash mean?
->
-> If you return a hash value in the **Python 2 interpreter**, it'll show up as
-> `3197928453018144401L`. The `L` just means "long integer" – it's **not**
-> actually a part of the hash value.
-
Now that all strings are encoded, the entries in the vocabulary **don't need to
include the word text** themselves. Instead, they can look it up in the
`StringStore` via its hash value. Each entry in the vocabulary, also called
@@ -857,17 +851,16 @@ def put_spans_around_tokens(doc):
    and you can calculate what you need, e.g. <br />, <strong> etc.)
"""
output = []
-    html = '<span class="{classes}">{word}</span>{space}'
for token in doc:
if token.is_space:
output.append(token.text)
else:
- classes = "pos-{} dep-{}".format(token.pos_, token.dep_)
- output.append(html.format(classes=classes, word=token.text, space=token.whitespace_))
+ classes = f"pos-{token.pos_} dep-{token.dep_}"
+            output.append(f'<span class="{classes}">{token.text}</span>{token.whitespace_}')
string = "".join(output)
string = string.replace("\\n", "")
string = string.replace("\\t", " ")
- return "
{} ".format(string)
+ return f"{string} "
nlp = spacy.load("en_core_web_sm")
From 33a2682d60c753469d78cf68b6065a284e774f40 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Wed, 25 Dec 2019 12:39:49 +0100
Subject: [PATCH 036/496] Add better schemas and validation using Pydantic
(#4831)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
* Remove unicode declarations
* Remove Python 3.5 and 2.7 from CI
* Don't require pathlib
* Replace compat helpers
* Remove OrderedDict
* Use f-strings
* Set Cython compiler language level
* Fix typo
* Re-add OrderedDict for Table
* Update setup.cfg
* Revert CONTRIBUTING.md
* Add better schemas and validation using Pydantic
* Revert lookups.md
* Remove unused import
* Update spacy/schemas.py
Co-Authored-By: Sebastián Ramírez
* Various small fixes
* Fix docstring
Co-authored-by: Sebastián Ramírez
---
requirements.txt | 3 +-
setup.cfg | 1 +
spacy/cli/_schemas.py | 217 ------------------
spacy/errors.py | 3 -
spacy/matcher/_schemas.py | 197 ----------------
spacy/matcher/dependencymatcher.pyx | 5 +-
spacy/matcher/matcher.pxd | 2 +-
spacy/matcher/matcher.pyx | 18 +-
spacy/matcher/phrasematcher.pyx | 4 +-
spacy/schemas.py | 188 +++++++++++++++
spacy/tests/doc/test_to_json.py | 9 -
spacy/tests/matcher/test_matcher_api.py | 2 +-
spacy/tests/matcher/test_matcher_logic.py | 12 +-
.../tests/matcher/test_pattern_validation.py | 30 +--
spacy/tests/test_json_schemas.py | 47 ----
spacy/util.py | 42 ----
16 files changed, 217 insertions(+), 563 deletions(-)
delete mode 100644 spacy/cli/_schemas.py
delete mode 100644 spacy/matcher/_schemas.py
create mode 100644 spacy/schemas.py
delete mode 100644 spacy/tests/test_json_schemas.py
diff --git a/requirements.txt b/requirements.txt
index 188459c67..79a05b2bd 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -12,8 +12,7 @@ numpy>=1.15.0
requests>=2.13.0,<3.0.0
plac>=0.9.6,<1.2.0
tqdm>=4.38.0,<5.0.0
-# Optional dependencies
-jsonschema>=2.6.0,<3.1.0
+pydantic>=1.0.0,<2.0.0
# Development dependencies
cython>=0.25
pytest>=4.6.5
diff --git a/setup.cfg b/setup.cfg
index 28259c989..755f522e7 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -51,6 +51,7 @@ install_requires =
numpy>=1.15.0
plac>=0.9.6,<1.2.0
requests>=2.13.0,<3.0.0
+ pydantic>=1.0.0,<2.0.0
[options.extras_require]
lookups =
diff --git a/spacy/cli/_schemas.py b/spacy/cli/_schemas.py
deleted file mode 100644
index 42e5e04dd..000000000
--- a/spacy/cli/_schemas.py
+++ /dev/null
@@ -1,217 +0,0 @@
-
-# NB: This schema describes the new format of the training data, see #2928
-TRAINING_SCHEMA = {
- "$schema": "http://json-schema.org/draft-06/schema",
- "title": "Training data for spaCy models",
- "type": "array",
- "items": {
- "type": "object",
- "properties": {
- "text": {
- "title": "The text of the training example",
- "type": "string",
- "minLength": 1,
- },
- "ents": {
- "title": "Named entity spans in the text",
- "type": "array",
- "items": {
- "type": "object",
- "properties": {
- "start": {
- "title": "Start character offset of the span",
- "type": "integer",
- "minimum": 0,
- },
- "end": {
- "title": "End character offset of the span",
- "type": "integer",
- "minimum": 0,
- },
- "label": {
- "title": "Entity label",
- "type": "string",
- "minLength": 1,
- "pattern": "^[A-Z0-9]*$",
- },
- },
- "required": ["start", "end", "label"],
- },
- },
- "sents": {
- "title": "Sentence spans in the text",
- "type": "array",
- "items": {
- "type": "object",
- "properties": {
- "start": {
- "title": "Start character offset of the span",
- "type": "integer",
- "minimum": 0,
- },
- "end": {
- "title": "End character offset of the span",
- "type": "integer",
- "minimum": 0,
- },
- },
- "required": ["start", "end"],
- },
- },
- "cats": {
- "title": "Text categories for the text classifier",
- "type": "object",
- "patternProperties": {
- "*": {
- "title": "A text category",
- "oneOf": [
- {"type": "boolean"},
- {"type": "number", "minimum": 0},
- ],
- }
- },
- "propertyNames": {"pattern": "^[A-Z0-9]*$", "minLength": 1},
- },
- "tokens": {
- "title": "The tokens in the text",
- "type": "array",
- "items": {
- "type": "object",
- "minProperties": 1,
- "properties": {
- "id": {
- "title": "Token ID, usually token index",
- "type": "integer",
- "minimum": 0,
- },
- "start": {
- "title": "Start character offset of the token",
- "type": "integer",
- "minimum": 0,
- },
- "end": {
- "title": "End character offset of the token",
- "type": "integer",
- "minimum": 0,
- },
- "pos": {
- "title": "Coarse-grained part-of-speech tag",
- "type": "string",
- "minLength": 1,
- },
- "tag": {
- "title": "Fine-grained part-of-speech tag",
- "type": "string",
- "minLength": 1,
- },
- "dep": {
- "title": "Dependency label",
- "type": "string",
- "minLength": 1,
- },
- "head": {
- "title": "Index of the token's head",
- "type": "integer",
- "minimum": 0,
- },
- },
- "required": ["start", "end"],
- },
- },
- "_": {"title": "Custom user space", "type": "object"},
- },
- "required": ["text"],
- },
-}
-
-META_SCHEMA = {
- "$schema": "http://json-schema.org/draft-06/schema",
- "type": "object",
- "properties": {
- "lang": {
- "title": "Two-letter language code, e.g. 'en'",
- "type": "string",
- "minLength": 2,
- "maxLength": 2,
- "pattern": "^[a-z]*$",
- },
- "name": {
- "title": "Model name",
- "type": "string",
- "minLength": 1,
- "pattern": "^[a-z_]*$",
- },
- "version": {
- "title": "Model version",
- "type": "string",
- "minLength": 1,
- "pattern": "^[0-9a-z.-]*$",
- },
- "spacy_version": {
- "title": "Compatible spaCy version identifier",
- "type": "string",
- "minLength": 1,
- "pattern": "^[0-9a-z.-><=]*$",
- },
- "parent_package": {
- "title": "Name of parent spaCy package, e.g. spacy or spacy-nightly",
- "type": "string",
- "minLength": 1,
- "default": "spacy",
- },
- "pipeline": {
- "title": "Names of pipeline components",
- "type": "array",
- "items": {"type": "string", "minLength": 1},
- },
- "description": {"title": "Model description", "type": "string"},
- "license": {"title": "Model license", "type": "string"},
- "author": {"title": "Model author name", "type": "string"},
- "email": {"title": "Model author email", "type": "string", "format": "email"},
- "url": {"title": "Model author URL", "type": "string", "format": "uri"},
- "sources": {
- "title": "Training data sources",
- "type": "array",
- "items": {"type": "string"},
- },
- "vectors": {
- "title": "Included word vectors",
- "type": "object",
- "properties": {
- "keys": {
- "title": "Number of unique keys",
- "type": "integer",
- "minimum": 0,
- },
- "vectors": {
- "title": "Number of unique vectors",
- "type": "integer",
- "minimum": 0,
- },
- "width": {
- "title": "Number of dimensions",
- "type": "integer",
- "minimum": 0,
- },
- },
- },
- "accuracy": {
- "title": "Accuracy numbers",
- "type": "object",
- "patternProperties": {"*": {"type": "number", "minimum": 0.0}},
- },
- "speed": {
- "title": "Speed evaluation numbers",
- "type": "object",
- "patternProperties": {
- "*": {
- "oneOf": [
- {"type": "number", "minimum": 0.0},
- {"type": "integer", "minimum": 0},
- ]
- }
- },
- },
- },
- "required": ["lang", "name", "version"],
-}
diff --git a/spacy/errors.py b/spacy/errors.py
index 81747b33b..3aa4bedea 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -105,7 +105,6 @@ class Warnings(object):
"smaller JSON files instead.")
-
@add_codes
class Errors(object):
E001 = ("No component '{name}' found in pipeline. Available names: {opts}")
@@ -419,8 +418,6 @@ class Errors(object):
E134 = ("Entity '{entity}' is not defined in the Knowledge Base.")
E135 = ("If you meant to replace a built-in component, use `create_pipe`: "
"`nlp.replace_pipe('{name}', nlp.create_pipe('{name}'))`")
- E136 = ("This additional feature requires the jsonschema library to be "
- "installed:\npip install jsonschema")
E137 = ("Expected 'dict' type, but got '{type}' from '{line}'. Make sure "
"to provide a valid JSON object as input with either the `text` "
"or `tokens` key. For more info, see the docs:\n"
diff --git a/spacy/matcher/_schemas.py b/spacy/matcher/_schemas.py
deleted file mode 100644
index ce6379c45..000000000
--- a/spacy/matcher/_schemas.py
+++ /dev/null
@@ -1,197 +0,0 @@
-
-TOKEN_PATTERN_SCHEMA = {
- "$schema": "http://json-schema.org/draft-06/schema",
- "definitions": {
- "string_value": {
- "anyOf": [
- {"type": "string"},
- {
- "type": "object",
- "properties": {
- "REGEX": {"type": "string"},
- "IN": {"type": "array", "items": {"type": "string"}},
- "NOT_IN": {"type": "array", "items": {"type": "string"}},
- },
- "additionalProperties": False,
- },
- ]
- },
- "integer_value": {
- "anyOf": [
- {"type": "integer"},
- {
- "type": "object",
- "properties": {
- "REGEX": {"type": "string"},
- "IN": {"type": "array", "items": {"type": "integer"}},
- "NOT_IN": {"type": "array", "items": {"type": "integer"}},
- "==": {"type": "integer"},
- ">=": {"type": "integer"},
- "<=": {"type": "integer"},
- ">": {"type": "integer"},
- "<": {"type": "integer"},
- },
- "additionalProperties": False,
- },
- ]
- },
- "boolean_value": {"type": "boolean"},
- "underscore_value": {
- "anyOf": [
- {"type": ["string", "integer", "number", "array", "boolean", "null"]},
- {
- "type": "object",
- "properties": {
- "REGEX": {"type": "string"},
- "IN": {
- "type": "array",
- "items": {"type": ["string", "integer"]},
- },
- "NOT_IN": {
- "type": "array",
- "items": {"type": ["string", "integer"]},
- },
- "==": {"type": "integer"},
- ">=": {"type": "integer"},
- "<=": {"type": "integer"},
- ">": {"type": "integer"},
- "<": {"type": "integer"},
- },
- "additionalProperties": False,
- },
- ]
- },
- },
- "type": "array",
- "items": {
- "type": "object",
- "properties": {
- "ORTH": {
- "title": "Verbatim token text",
- "$ref": "#/definitions/string_value",
- },
- "TEXT": {
- "title": "Verbatim token text (spaCy v2.1+)",
- "$ref": "#/definitions/string_value",
- },
- "LOWER": {
- "title": "Lowercase form of token text",
- "$ref": "#/definitions/string_value",
- },
- "POS": {
- "title": "Coarse-grained part-of-speech tag",
- "$ref": "#/definitions/string_value",
- },
- "TAG": {
- "title": "Fine-grained part-of-speech tag",
- "$ref": "#/definitions/string_value",
- },
- "DEP": {"title": "Dependency label", "$ref": "#/definitions/string_value"},
- "LEMMA": {
- "title": "Lemma (base form)",
- "$ref": "#/definitions/string_value",
- },
- "SHAPE": {
- "title": "Abstract token shape",
- "$ref": "#/definitions/string_value",
- },
- "ENT_TYPE": {
- "title": "Entity label of single token",
- "$ref": "#/definitions/string_value",
- },
- "NORM": {
- "title": "Normalized form of the token text",
- "$ref": "#/definitions/string_value",
- },
- "LENGTH": {
- "title": "Token character length",
- "$ref": "#/definitions/integer_value",
- },
- "IS_ALPHA": {
- "title": "Token consists of alphabetic characters",
- "$ref": "#/definitions/boolean_value",
- },
- "IS_ASCII": {
- "title": "Token consists of ASCII characters",
- "$ref": "#/definitions/boolean_value",
- },
- "IS_DIGIT": {
- "title": "Token consists of digits",
- "$ref": "#/definitions/boolean_value",
- },
- "IS_LOWER": {
- "title": "Token is lowercase",
- "$ref": "#/definitions/boolean_value",
- },
- "IS_UPPER": {
- "title": "Token is uppercase",
- "$ref": "#/definitions/boolean_value",
- },
- "IS_TITLE": {
- "title": "Token is titlecase",
- "$ref": "#/definitions/boolean_value",
- },
- "IS_PUNCT": {
- "title": "Token is punctuation",
- "$ref": "#/definitions/boolean_value",
- },
- "IS_SPACE": {
- "title": "Token is whitespace",
- "$ref": "#/definitions/boolean_value",
- },
- "IS_BRACKET": {
- "title": "Token is a bracket",
- "$ref": "#/definitions/boolean_value",
- },
- "IS_QUOTE": {
- "title": "Token is a quotation mark",
- "$ref": "#/definitions/boolean_value",
- },
- "IS_LEFT_PUNCT": {
- "title": "Token is a left punctuation mark",
- "$ref": "#/definitions/boolean_value",
- },
- "IS_RIGHT_PUNCT": {
- "title": "Token is a right punctuation mark",
- "$ref": "#/definitions/boolean_value",
- },
- "IS_CURRENCY": {
- "title": "Token is a currency symbol",
- "$ref": "#/definitions/boolean_value",
- },
- "IS_STOP": {
- "title": "Token is stop word",
- "$ref": "#/definitions/boolean_value",
- },
- "IS_SENT_START": {
- "title": "Token is the first in a sentence",
- "$ref": "#/definitions/boolean_value",
- },
- "LIKE_NUM": {
- "title": "Token resembles a number",
- "$ref": "#/definitions/boolean_value",
- },
- "LIKE_URL": {
- "title": "Token resembles a URL",
- "$ref": "#/definitions/boolean_value",
- },
- "LIKE_EMAIL": {
- "title": "Token resembles an email address",
- "$ref": "#/definitions/boolean_value",
- },
- "_": {
- "title": "Custom extension token attributes (token._.)",
- "type": "object",
- "patternProperties": {
- "^.*$": {"$ref": "#/definitions/underscore_value"}
- },
- },
- "OP": {
- "title": "Operators / quantifiers",
- "type": "string",
- "enum": ["+", "*", "?", "!"],
- },
- },
- "additionalProperties": False,
- },
-}
diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx
index 46cff0d0c..f94c66cb0 100644
--- a/spacy/matcher/dependencymatcher.pyx
+++ b/spacy/matcher/dependencymatcher.pyx
@@ -39,7 +39,8 @@ cdef class DependencyMatcher:
RETURNS (DependencyMatcher): The newly constructed object.
"""
size = 20
- self.token_matcher = Matcher(vocab)
+ # TODO: make matcher work with validation
+ self.token_matcher = Matcher(vocab, validate=False)
self._keys_to_token = {}
self._patterns = {}
self._root = {}
@@ -129,7 +130,7 @@ cdef class DependencyMatcher:
# TODO: Better ways to hash edges in pattern?
for j in range(len(_patterns[i])):
k = self._normalize_key(unicode(key) + DELIMITER + unicode(i) + DELIMITER + unicode(j))
- self.token_matcher.add(k, None, _patterns[i][j])
+ self.token_matcher.add(k, [_patterns[i][j]])
_keys_to_token[k] = j
_keys_to_token_list.append(_keys_to_token)
self._keys_to_token.setdefault(key, [])
diff --git a/spacy/matcher/matcher.pxd b/spacy/matcher/matcher.pxd
index dd04153bf..689734079 100644
--- a/spacy/matcher/matcher.pxd
+++ b/spacy/matcher/matcher.pxd
@@ -63,7 +63,7 @@ cdef class Matcher:
cdef Pool mem
cdef vector[TokenPatternC*] patterns
cdef readonly Vocab vocab
- cdef public object validator
+ cdef public object validate
cdef public object _patterns
cdef public object _callbacks
cdef public object _extensions
diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx
index 2908ab0c2..4258fdb6a 100644
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@@ -15,8 +15,7 @@ from ..tokens.doc cimport Doc, get_token_attr
from ..tokens.token cimport Token
from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA
-from ._schemas import TOKEN_PATTERN_SCHEMA
-from ..util import get_json_validator, validate_json
+from ..schemas import validate_token_pattern
from ..errors import Errors, MatchPatternError, Warnings, deprecation_warning
from ..strings import get_string_id
from ..attrs import IDS
@@ -32,7 +31,7 @@ cdef class Matcher:
USAGE: https://spacy.io/usage/rule-based-matching
"""
- def __init__(self, vocab, validate=False):
+ def __init__(self, vocab, validate=True):
"""Create the Matcher.
vocab (Vocab): The vocabulary object, which must be shared with the
@@ -46,10 +45,7 @@ cdef class Matcher:
self._seen_attrs = set()
self.vocab = vocab
self.mem = Pool()
- if validate:
- self.validator = get_json_validator(TOKEN_PATTERN_SCHEMA)
- else:
- self.validator = None
+ self.validate = validate
def __reduce__(self):
data = (self.vocab, self._patterns, self._callbacks)
@@ -119,8 +115,8 @@ cdef class Matcher:
raise ValueError(Errors.E012.format(key=key))
if not isinstance(pattern, list):
raise ValueError(Errors.E178.format(pat=pattern, key=key))
- if self.validator:
- errors[i] = validate_json(pattern, self.validator)
+ if self.validate:
+ errors[i] = validate_token_pattern(pattern)
if any(err for err in errors.values()):
raise MatchPatternError(key, errors)
key = self._normalize_key(key)
@@ -668,8 +664,6 @@ def _get_attr_values(spec, string_store):
continue
if attr == "TEXT":
attr = "ORTH"
- if attr not in TOKEN_PATTERN_SCHEMA["items"]["properties"]:
- raise ValueError(Errors.E152.format(attr=attr))
attr = IDS.get(attr)
if isinstance(value, basestring):
value = string_store.add(value)
@@ -684,7 +678,7 @@ def _get_attr_values(spec, string_store):
if attr is not None:
attr_values.append((attr, value))
else:
- # should be caught above using TOKEN_PATTERN_SCHEMA
+ # should be caught in validation
raise ValueError(Errors.E152.format(attr=attr))
return attr_values
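With `validate=True` now the default, invalid token patterns are rejected at `add()` time instead of failing later. A minimal sketch of the new behaviour (the pattern is deliberately invalid):

```python
import spacy
from spacy.matcher import Matcher
from spacy.errors import MatchPatternError

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)  # validate=True is the new default
try:
    matcher.add("BAD", [[{"TEXT": {"VALUE": "foo"}}]])  # "VALUE" is not a valid key
except MatchPatternError as err:
    print(err)  # lists the per-pattern validation errors
```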
diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx
index 20f45b9e4..961a318f6 100644
--- a/spacy/matcher/phrasematcher.pyx
+++ b/spacy/matcher/phrasematcher.pyx
@@ -9,7 +9,7 @@ from ..structs cimport TokenC
from ..tokens.token cimport Token
from ..typedefs cimport attr_t
-from ._schemas import TOKEN_PATTERN_SCHEMA
+from ..schemas import TokenPattern
from ..errors import Errors, Warnings, deprecation_warning, user_warning
@@ -54,7 +54,7 @@ cdef class PhraseMatcher:
attr = attr.upper()
if attr == "TEXT":
attr = "ORTH"
- if attr not in TOKEN_PATTERN_SCHEMA["items"]["properties"]:
+ if attr.lower() not in TokenPattern().dict():
raise ValueError(Errors.E152.format(attr=attr))
self.attr = self.vocab.strings[attr]
diff --git a/spacy/schemas.py b/spacy/schemas.py
new file mode 100644
index 000000000..4a5054125
--- /dev/null
+++ b/spacy/schemas.py
@@ -0,0 +1,188 @@
+from typing import Dict, List, Union, Optional
+from enum import Enum
+from pydantic import BaseModel, Field, ValidationError, validator
+from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool
+from collections import defaultdict
+
+from .attrs import NAMES
+
+
+def validate(schema, obj):
+ """Validate data against a given pydantic schema.
+
+ obj (dict): JSON-serializable data to validate.
+ schema (pydantic.BaseModel): The schema to validate against.
+ RETURNS (list): A list of error messages, if available.
+ """
+ try:
+ schema(**obj)
+ return []
+ except ValidationError as e:
+ errors = e.errors()
+ data = defaultdict(list)
+ for error in errors:
+ err_loc = " -> ".join([str(p) for p in error.get("loc", [])])
+ data[err_loc].append(error.get("msg"))
+ return [f"[{loc}] {', '.join(msg)}" for loc, msg in data.items()]
+
+
+# Matcher token patterns
+
+
+def validate_token_pattern(obj):
+ # Try to convert non-string keys (e.g. {ORTH: "foo"} -> {"ORTH": "foo"})
+ get_key = lambda k: NAMES[k] if isinstance(k, int) and k < len(NAMES) else k
+ if isinstance(obj, list):
+ converted = []
+ for pattern in obj:
+ if isinstance(pattern, dict):
+ pattern = {get_key(k): v for k, v in pattern.items()}
+ converted.append(pattern)
+ obj = converted
+ return validate(TokenPatternSchema, {"pattern": obj})
+
+
+class TokenPatternString(BaseModel):
+ REGEX: Optional[StrictStr]
+ IN: Optional[List[StrictStr]]
+ NOT_IN: Optional[List[StrictStr]]
+
+ class Config:
+ extra = "forbid"
+
+ @validator("*", pre=True, whole=True)
+ def raise_for_none(cls, v):
+ if v is None:
+ raise ValueError("None / null is not allowed")
+ return v
+
+
+class TokenPatternNumber(BaseModel):
+ REGEX: Optional[StrictStr] = None
+ IN: Optional[List[StrictInt]] = None
+ NOT_IN: Optional[List[StrictInt]] = None
+ EQ: Union[StrictInt, StrictFloat] = Field(None, alias="==")
+ GEQ: Union[StrictInt, StrictFloat] = Field(None, alias=">=")
+ LEQ: Union[StrictInt, StrictFloat] = Field(None, alias="<=")
+ GT: Union[StrictInt, StrictFloat] = Field(None, alias=">")
+ LT: Union[StrictInt, StrictFloat] = Field(None, alias="<")
+
+ class Config:
+ extra = "forbid"
+
+ @validator("*", pre=True, whole=True)
+ def raise_for_none(cls, v):
+ if v is None:
+ raise ValueError("None / null is not allowed")
+ return v
+
+
+class TokenPatternOperator(str, Enum):
+ plus: StrictStr = "+"
+ start: StrictStr = "*"
+ question: StrictStr = "?"
+ exclamation: StrictStr = "!"
+
+
+StringValue = Union[TokenPatternString, StrictStr]
+NumberValue = Union[TokenPatternNumber, StrictInt, StrictFloat]
+UnderscoreValue = Union[
+ TokenPatternString, TokenPatternNumber, str, int, float, list, bool,
+]
+
+
+class TokenPattern(BaseModel):
+ orth: Optional[StringValue] = None
+ text: Optional[StringValue] = None
+ lower: Optional[StringValue] = None
+ pos: Optional[StringValue] = None
+ tag: Optional[StringValue] = None
+ dep: Optional[StringValue] = None
+ lemma: Optional[StringValue] = None
+ shape: Optional[StringValue] = None
+ ent_type: Optional[StringValue] = None
+ norm: Optional[StringValue] = None
+ length: Optional[NumberValue] = None
+ is_alpha: Optional[StrictBool] = None
+ is_ascii: Optional[StrictBool] = None
+ is_digit: Optional[StrictBool] = None
+ is_lower: Optional[StrictBool] = None
+ is_upper: Optional[StrictBool] = None
+ is_title: Optional[StrictBool] = None
+ is_punct: Optional[StrictBool] = None
+ is_space: Optional[StrictBool] = None
+ is_bracket: Optional[StrictBool] = None
+ is_quote: Optional[StrictBool] = None
+ is_left_punct: Optional[StrictBool] = None
+ is_right_punct: Optional[StrictBool] = None
+ is_currency: Optional[StrictBool] = None
+ is_stop: Optional[StrictBool] = None
+ is_sent_start: Optional[StrictBool] = None
+ like_num: Optional[StrictBool] = None
+ like_url: Optional[StrictBool] = None
+ like_email: Optional[StrictBool] = None
+ op: Optional[TokenPatternOperator] = None
+ underscore: Optional[Dict[StrictStr, UnderscoreValue]] = Field(None, alias="_")
+
+ class Config:
+ extra = "forbid"
+ allow_population_by_field_name = True
+ alias_generator = lambda value: value.upper()
+
+ @validator("*", pre=True)
+ def raise_for_none(cls, v):
+ if v is None:
+ raise ValueError("None / null is not allowed")
+ return v
+
+
+class TokenPatternSchema(BaseModel):
+ pattern: List[TokenPattern] = Field(..., minItems=1)
+
+ class Config:
+ extra = "forbid"
+
+
+# Model meta
+
+
+class ModelMetaSchema(BaseModel):
+ # fmt: off
+ lang: StrictStr = Field(..., title="Two-letter language code, e.g. 'en'")
+ name: StrictStr = Field(..., title="Model name")
+ version: StrictStr = Field(..., title="Model version")
+ spacy_version: Optional[StrictStr] = Field(None, title="Compatible spaCy version identifier")
+ parent_package: Optional[StrictStr] = Field("spacy", title="Name of parent spaCy package, e.g. spacy or spacy-nightly")
+ pipeline: Optional[List[StrictStr]] = Field([], title="Names of pipeline components")
+ description: Optional[StrictStr] = Field(None, title="Model description")
+ license: Optional[StrictStr] = Field(None, title="Model license")
+ author: Optional[StrictStr] = Field(None, title="Model author name")
+ email: Optional[StrictStr] = Field(None, title="Model author email")
+ url: Optional[StrictStr] = Field(None, title="Model author URL")
+ sources: Optional[Union[List[StrictStr], Dict[str, str]]] = Field(None, title="Training data sources")
+ vectors: Optional[Dict[str, int]] = Field(None, title="Included word vectors")
+ accuracy: Optional[Dict[str, Union[float, int]]] = Field(None, title="Accuracy numbers")
+ speed: Optional[Dict[str, Union[float, int]]] = Field(None, title="Speed evaluation numbers")
+ # fmt: on
+
+
+# Training data object in "simple training style"
+
+
+class SimpleTrainingSchema(BaseModel):
+ # TODO: write
+
+ class Config:
+ title = "Schema for training data dict in passed to nlp.update"
+ extra = "forbid"
+
+
+# JSON training format
+
+
+class TrainingSchema(BaseModel):
+ # TODO: write
+
+ class Config:
+ title = "Schema for training data in spaCy's JSON format"
+ extra = "forbid"
diff --git a/spacy/tests/doc/test_to_json.py b/spacy/tests/doc/test_to_json.py
index 18243c306..da3bc7dbb 100644
--- a/spacy/tests/doc/test_to_json.py
+++ b/spacy/tests/doc/test_to_json.py
@@ -1,6 +1,4 @@
import pytest
-from spacy.cli._schemas import TRAINING_SCHEMA
-from spacy.util import get_json_validator, validate_json
from spacy.tokens import Doc
from ..util import get_doc
@@ -55,10 +53,3 @@ def test_doc_to_json_underscore_error_serialize(doc):
Doc.set_extension("json_test4", method=lambda doc: doc.text)
with pytest.raises(ValueError):
doc.to_json(underscore=["json_test4"])
-
-
-def test_doc_to_json_valid_training(doc):
- json_doc = doc.to_json()
- validator = get_json_validator(TRAINING_SCHEMA)
- errors = validate_json([json_doc], validator)
- assert not errors
diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py
index adeef834d..3900f1e68 100644
--- a/spacy/tests/matcher/test_matcher_api.py
+++ b/spacy/tests/matcher/test_matcher_api.py
@@ -179,7 +179,7 @@ def test_matcher_match_one_plus(matcher):
doc = Doc(control.vocab, words=["Philippe", "Philippe"])
m = control(doc)
assert len(m) == 2
- pattern = [{"ORTH": "Philippe", "OP": "1"}, {"ORTH": "Philippe", "OP": "+"}]
+ pattern = [{"ORTH": "Philippe"}, {"ORTH": "Philippe", "OP": "+"}]
matcher.add("KleenePhilippe", [pattern])
m = matcher(doc)
assert len(m) == 1
diff --git a/spacy/tests/matcher/test_matcher_logic.py b/spacy/tests/matcher/test_matcher_logic.py
index a6a82f2e2..a2b2cd83f 100644
--- a/spacy/tests/matcher/test_matcher_logic.py
+++ b/spacy/tests/matcher/test_matcher_logic.py
@@ -6,18 +6,18 @@ from spacy.matcher import Matcher
from spacy.tokens import Doc, Span
-pattern1 = [{"ORTH": "A", "OP": "1"}, {"ORTH": "A", "OP": "*"}]
-pattern2 = [{"ORTH": "A", "OP": "*"}, {"ORTH": "A", "OP": "1"}]
-pattern3 = [{"ORTH": "A", "OP": "1"}, {"ORTH": "A", "OP": "1"}]
+pattern1 = [{"ORTH": "A"}, {"ORTH": "A", "OP": "*"}]
+pattern2 = [{"ORTH": "A"}, {"ORTH": "A"}]
+pattern3 = [{"ORTH": "A"}, {"ORTH": "A"}]
pattern4 = [
- {"ORTH": "B", "OP": "1"},
+ {"ORTH": "B"},
{"ORTH": "A", "OP": "*"},
- {"ORTH": "B", "OP": "1"},
+ {"ORTH": "B"},
]
pattern5 = [
{"ORTH": "B", "OP": "*"},
{"ORTH": "A", "OP": "*"},
- {"ORTH": "B", "OP": "1"},
+ {"ORTH": "B"},
]
re_pattern1 = "AA*"
diff --git a/spacy/tests/matcher/test_pattern_validation.py b/spacy/tests/matcher/test_pattern_validation.py
index c879cc0fe..ade724d05 100644
--- a/spacy/tests/matcher/test_pattern_validation.py
+++ b/spacy/tests/matcher/test_pattern_validation.py
@@ -1,8 +1,7 @@
import pytest
from spacy.matcher import Matcher
-from spacy.matcher._schemas import TOKEN_PATTERN_SCHEMA
from spacy.errors import MatchPatternError
-from spacy.util import get_json_validator, validate_json
+from spacy.schemas import validate_token_pattern
# (pattern, num errors with validation, num errors identified with minimal
# checks)
@@ -15,12 +14,12 @@ TEST_PATTERNS = [
('[{"TEXT": "foo"}, {"LOWER": "bar"}]', 1, 1),
([1, 2, 3], 3, 1),
# Bad patterns flagged outside of Matcher
- ([{"_": {"foo": "bar", "baz": {"IN": "foo"}}}], 1, 0),
+ ([{"_": {"foo": "bar", "baz": {"IN": "foo"}}}], 2, 0), # prev: (1, 0)
# Bad patterns not flagged with minimal checks
([{"LENGTH": "2", "TEXT": 2}, {"LOWER": "test"}], 2, 0),
- ([{"LENGTH": {"IN": [1, 2, "3"]}}, {"POS": {"IN": "VERB"}}], 2, 0),
- ([{"LENGTH": {"VALUE": 5}}], 1, 0),
- ([{"TEXT": {"VALUE": "foo"}}], 1, 0),
+ ([{"LENGTH": {"IN": [1, 2, "3"]}}, {"POS": {"IN": "VERB"}}], 4, 0), # prev: (2, 0)
+ ([{"LENGTH": {"VALUE": 5}}], 2, 0), # prev: (1, 0)
+ ([{"TEXT": {"VALUE": "foo"}}], 2, 0), # prev: (1, 0)
([{"IS_DIGIT": -1}], 1, 0),
([{"ORTH": -1}], 1, 0),
# Good patterns
@@ -31,15 +30,9 @@ TEST_PATTERNS = [
([{"LOWER": {"REGEX": "^X", "NOT_IN": ["XXX", "XY"]}}], 0, 0),
([{"NORM": "a"}, {"POS": {"IN": ["NOUN"]}}], 0, 0),
([{"_": {"foo": {"NOT_IN": ["bar", "baz"]}, "a": 5, "b": {">": 10}}}], 0, 0),
+ ([{"orth": "foo"}], 0, 0), # prev: xfail
]
-XFAIL_TEST_PATTERNS = [([{"orth": "foo"}], 0, 0)]
-
-
-@pytest.fixture
-def validator():
- return get_json_validator(TOKEN_PATTERN_SCHEMA)
-
@pytest.mark.parametrize(
"pattern", [[{"XX": "y"}, {"LENGTH": "2"}, {"TEXT": {"IN": 5}}]]
@@ -51,15 +44,8 @@ def test_matcher_pattern_validation(en_vocab, pattern):
@pytest.mark.parametrize("pattern,n_errors,_", TEST_PATTERNS)
-def test_pattern_validation(validator, pattern, n_errors, _):
- errors = validate_json(pattern, validator)
- assert len(errors) == n_errors
-
-
-@pytest.mark.xfail
-@pytest.mark.parametrize("pattern,n_errors,_", XFAIL_TEST_PATTERNS)
-def test_xfail_pattern_validation(validator, pattern, n_errors, _):
- errors = validate_json(pattern, validator)
+def test_pattern_validation(pattern, n_errors, _):
+ errors = validate_token_pattern(pattern)
assert len(errors) == n_errors
diff --git a/spacy/tests/test_json_schemas.py b/spacy/tests/test_json_schemas.py
deleted file mode 100644
index 1330d3a65..000000000
--- a/spacy/tests/test_json_schemas.py
+++ /dev/null
@@ -1,47 +0,0 @@
-from spacy.util import get_json_validator, validate_json, validate_schema
-from spacy.cli._schemas import META_SCHEMA, TRAINING_SCHEMA
-from spacy.matcher._schemas import TOKEN_PATTERN_SCHEMA
-import pytest
-
-
-@pytest.fixture(scope="session")
-def training_schema_validator():
- return get_json_validator(TRAINING_SCHEMA)
-
-
-def test_validate_schema():
- validate_schema({"type": "object"})
- with pytest.raises(Exception):
- validate_schema({"type": lambda x: x})
-
-
-@pytest.mark.parametrize("schema", [TRAINING_SCHEMA, META_SCHEMA, TOKEN_PATTERN_SCHEMA])
-def test_schemas(schema):
- validate_schema(schema)
-
-
-@pytest.mark.parametrize(
- "data",
- [
- {"text": "Hello world"},
- {"text": "Hello", "ents": [{"start": 0, "end": 5, "label": "TEST"}]},
- ],
-)
-def test_json_schema_training_valid(data, training_schema_validator):
- errors = validate_json([data], training_schema_validator)
- assert not errors
-
-
-@pytest.mark.parametrize(
- "data,n_errors",
- [
- ({"spans": []}, 1),
- ({"text": "Hello", "ents": [{"start": "0", "end": "5", "label": "TEST"}]}, 2),
- ({"text": "Hello", "ents": [{"start": 0, "end": 5}]}, 1),
- ({"text": "Hello", "ents": [{"start": 0, "end": 5, "label": "test"}]}, 1),
- ({"text": "spaCy", "tokens": [{"pos": "PROPN"}]}, 2),
- ],
-)
-def test_json_schema_training_invalid(data, n_errors, training_schema_validator):
- errors = validate_json([data], training_schema_validator)
- assert len(errors) == n_errors
diff --git a/spacy/util.py b/spacy/util.py
index 4e6c10e2b..57bbee69f 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -13,11 +13,6 @@ import srsly
import catalogue
import sys
-try:
- import jsonschema
-except ImportError:
- jsonschema = None
-
try:
import cupy.random
except ImportError:
@@ -705,43 +700,6 @@ def fix_random_seed(seed=0):
cupy.random.seed(seed)
-def get_json_validator(schema):
- # We're using a helper function here to make it easier to change the
- # validator that's used (e.g. different draft implementation), without
- # having to change it all across the codebase.
- # TODO: replace with (stable) Draft6Validator, if available
- if jsonschema is None:
- raise ValueError(Errors.E136)
- return jsonschema.Draft4Validator(schema)
-
-
-def validate_schema(schema):
- """Validate a given schema. This just checks if the schema itself is valid."""
- validator = get_json_validator(schema)
- validator.check_schema(schema)
-
-
-def validate_json(data, validator):
- """Validate data against a given JSON schema (see https://json-schema.org).
-
- data: JSON-serializable data to validate.
- validator (jsonschema.DraftXValidator): The validator.
- RETURNS (list): A list of error messages, if available.
- """
- errors = []
- for err in sorted(validator.iter_errors(data), key=lambda e: e.path):
- if err.path:
- err_path = "[{}]".format(" -> ".join([str(p) for p in err.path]))
- else:
- err_path = ""
- msg = err.message + " " + err_path
- if err.context: # Error has suberrors, e.g. if schema uses anyOf
- suberrs = [f" - {suberr.message}" for suberr in err.context]
- msg += f":\n{''.join(suberrs)}"
- errors.append(msg)
- return errors
-
-
def get_serialization_exclude(serializers, exclude, kwargs):
"""Helper function to validate serialization args and manage transition from
keyword arguments (pre v2.1) to exclude argument.
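With the jsonschema helpers removed, token patterns are now checked by calling validate_token_pattern directly, as the updated test above does. A minimal usage sketch follows; the import location and the example patterns are assumptions for illustration (the TEST_PATTERNS fixtures are not shown in this hunk) and are not part of the patch.

    # Hypothetical usage sketch: the import path and the patterns are assumed, not taken from the patch.
    from spacy.schemas import validate_token_pattern  # assumed module for this branch

    pattern = [{"TEXT": "foo"}, {"OP": "+"}]     # plausible, well-formed token pattern
    errors = validate_token_pattern(pattern)     # returns a list of error messages
    print(errors)                                # expected to be [] for a valid pattern

    bad_pattern = [{"TEXT": "foo", "UNKNOWN_ATTR": "x"}]
    print(validate_token_pattern(bad_pattern))   # expected to contain at least one message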
From c22f0755098ba153d3617320e3c70fe64fdac2d1 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Wed, 25 Dec 2019 17:29:53 +0100
Subject: [PATCH 037/496] Update pydantic version pin [ci skip]
---
setup.cfg | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/setup.cfg b/setup.cfg
index 755f522e7..9516a3dda 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -51,7 +51,7 @@ install_requires =
numpy>=1.15.0
plac>=0.9.6,<1.2.0
requests>=2.13.0,<3.0.0
- pydantic>=1.0.0,<2.0.0
+ pydantic>=1.3.0,<2.0.0
[options.extras_require]
lookups =
From a892821c51ab61aa917cf8ed342867a0d3b31a35 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Wed, 25 Dec 2019 17:59:52 +0100
Subject: [PATCH 038/496] More formatting changes
---
fabfile.py | 6 ++----
spacy/__main__.py | 2 +-
spacy/_ml.py | 2 +-
spacy/cli/evaluate.py | 26 +++++++++++------------
spacy/cli/init_model.py | 4 ++--
spacy/cli/package.py | 2 +-
spacy/cli/pretrain.py | 12 ++++-------
spacy/cli/train.py | 10 ++++-----
spacy/gold.pyx | 10 ++++-----
spacy/kb.pyx | 2 +-
spacy/lang/ca/tokenizer_exceptions.py | 4 ++--
spacy/lang/da/tokenizer_exceptions.py | 2 +-
spacy/lang/el/tokenizer_exceptions.py | 8 +++----
spacy/lang/en/tokenizer_exceptions.py | 8 +++----
spacy/lang/es/tokenizer_exceptions.py | 4 ++--
spacy/lang/fr/tokenizer_exceptions.py | 2 +-
spacy/language.py | 2 +-
spacy/morphology.pyx | 2 +-
spacy/pipeline/pipes.pyx | 4 ++--
spacy/syntax/nonproj.pyx | 3 +--
spacy/syntax/stateclass.pyx | 6 +++---
spacy/tests/conftest.py | 4 ++--
spacy/tests/pipeline/test_pipe_methods.py | 2 +-
spacy/tokens/doc.pyx | 2 +-
spacy/util.py | 4 ++--
25 files changed, 63 insertions(+), 70 deletions(-)
diff --git a/fabfile.py b/fabfile.py
index 460471747..760c2c0e2 100644
--- a/fabfile.py
+++ b/fabfile.py
@@ -1,6 +1,6 @@
import contextlib
from pathlib import Path
-from fabric.api import local, lcd, env, settings, prefix
+from fabric.api import local, lcd
from os import path, environ
import shutil
import sys
@@ -79,9 +79,7 @@ def pex():
with virtualenv(VENV_DIR) as venv_local:
with lcd(path.dirname(__file__)):
sha = local("git rev-parse --short HEAD", capture=True)
- venv_local(
- "pex dist/*.whl -e spacy -o dist/spacy-%s.pex" % sha, direct=True
- )
+ venv_local(f"pex dist/*.whl -e spacy -o dist/spacy-{sha}.pex", direct=True)
def clean():
diff --git a/spacy/__main__.py b/spacy/__main__.py
index 06ba5704d..05e3d5e02 100644
--- a/spacy/__main__.py
+++ b/spacy/__main__.py
@@ -22,7 +22,7 @@ if __name__ == "__main__":
if len(sys.argv) == 1:
msg.info("Available commands", ", ".join(commands), exits=1)
command = sys.argv.pop(1)
- sys.argv[0] = "spacy %s" % command
+ sys.argv[0] = f"spacy {command}"
if command in commands:
plac.call(commands[command], sys.argv[1:])
else:
diff --git a/spacy/_ml.py b/spacy/_ml.py
index a1d2b6b77..37cfff0b7 100644
--- a/spacy/_ml.py
+++ b/spacy/_ml.py
@@ -296,7 +296,7 @@ def link_vectors_to_models(vocab):
# This is a hack to avoid the problem in #3853. Maybe we should
# print a warning as well?
old_name = vectors.name
- new_name = vectors.name + "_%d" % data.shape[0]
+ new_name = f"{vectors.name}_{data.shape[0]}"
user_warning(Warnings.W019.format(old=old_name, new=new_name))
vectors.name = new_name
key = (ops.device, vectors.name)
diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py
index de2cb4d09..a6b730d65 100644
--- a/spacy/cli/evaluate.py
+++ b/spacy/cli/evaluate.py
@@ -47,20 +47,20 @@ def evaluate(
end = timer()
nwords = sum(len(ex.doc) for ex in dev_dataset)
results = {
- "Time": "%.2f s" % (end - begin),
+ "Time": f"{end - begin:.2f} s",
"Words": nwords,
- "Words/s": "%.0f" % (nwords / (end - begin)),
- "TOK": "%.2f" % scorer.token_acc,
- "POS": "%.2f" % scorer.tags_acc,
- "UAS": "%.2f" % scorer.uas,
- "LAS": "%.2f" % scorer.las,
- "NER P": "%.2f" % scorer.ents_p,
- "NER R": "%.2f" % scorer.ents_r,
- "NER F": "%.2f" % scorer.ents_f,
- "Textcat": "%.2f" % scorer.textcat_score,
- "Sent P": "%.2f" % scorer.sent_p,
- "Sent R": "%.2f" % scorer.sent_r,
- "Sent F": "%.2f" % scorer.sent_f,
+ "Words/s": f"{nwords / (end - begin):.0f}",
+ "TOK": f"{scorer.token_acc:.2f}",
+ "POS": f"{scorer.tags_acc:.2f}",
+ "UAS": f"{scorer.uas:.2f}",
+ "LAS": f"{scorer.las:.2f}",
+ "NER P": f"{scorer.ents_p:.2f}",
+ "NER R": f"{scorer.ents_r:.2f}",
+ "NER F": f"{scorer.ents_f:.2f}",
+ "Textcat": f"{scorer.textcat_score:.2f}",
+ "Sent P": f"{scorer.sent_p:.2f}",
+ "Sent R": f"{scorer.sent_r:.2f}",
+ "Sent F": f"{scorer.sent_f:.2f}",
}
msg.table(results, title="Results")
diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py
index c3ef5267c..87583ba73 100644
--- a/spacy/cli/init_model.py
+++ b/spacy/cli/init_model.py
@@ -186,7 +186,7 @@ def add_vectors(nlp, vectors_loc, prune_vectors, name=None):
if vectors_data is not None:
nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
if name is None:
- nlp.vocab.vectors.name = "%s_model.vectors" % nlp.meta["lang"]
+ nlp.vocab.vectors.name = f"{nlp.meta['lang']}_model.vectors"
else:
nlp.vocab.vectors.name = name
nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name
@@ -232,7 +232,7 @@ def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
word = literal_eval(key)
except SyntaxError:
# Take odd strings literally.
- word = literal_eval("'%s'" % key)
+ word = literal_eval(f"'{key}'")
smooth_count = counts.smoother(int(freq))
probs[word] = math.log(smooth_count) - log_total
oov_prob = math.log(counts.smoother(0)) - log_total
diff --git a/spacy/cli/package.py b/spacy/cli/package.py
index 8830a0ca2..edd9117c5 100644
--- a/spacy/cli/package.py
+++ b/spacy/cli/package.py
@@ -83,7 +83,7 @@ def generate_meta(model_path, existing_meta, msg):
("lang", "Model language", meta.get("lang", "en")),
("name", "Model name", meta.get("name", "model")),
("version", "Model version", meta.get("version", "0.0.0")),
- ("spacy_version", "Required spaCy version", ">=%s,<3.0.0" % about.__version__),
+ ("spacy_version", "Required spaCy version", f">={about.__version__},<3.0.0"),
("description", "Model description", meta.get("description", False)),
("author", "Author", meta.get("author", False)),
("email", "Author email", meta.get("email", False)),
diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py
index 75840923e..12aa8b5c2 100644
--- a/spacy/cli/pretrain.py
+++ b/spacy/cli/pretrain.py
@@ -179,14 +179,12 @@ def pretrain(
else:
if not epoch_start:
msg.fail(
- "You have to use the '--epoch-start' argument when using a renamed weight file for "
- "'--init-tok2vec'",
+ "You have to use the --epoch-start argument when using a renamed weight file for --init-tok2vec",
exits=True,
)
elif epoch_start < 0:
msg.fail(
- "The argument '--epoch-start' has to be greater or equal to 0. '%d' is invalid"
- % epoch_start,
+ f"The argument --epoch-start has to be greater or equal to 0. {epoch_start} is invalid",
exits=True,
)
else:
@@ -195,16 +193,14 @@ def pretrain(
optimizer = create_default_optimizer(model.ops)
tracker = ProgressTracker(frequency=10000)
- msg.divider("Pre-training tok2vec layer - starting at epoch %d" % epoch_start)
+ msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_start}")
row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)
def _save_model(epoch, is_temp=False):
is_temp_str = ".temp" if is_temp else ""
with model.use_params(optimizer.averages):
- with (output_dir / ("model%d%s.bin" % (epoch, is_temp_str))).open(
- "wb"
- ) as file_:
+ with (output_dir / f"model{epoch}{is_temp_str}.bin").open("wb") as file_:
file_.write(model.tok2vec.to_bytes())
log = {
"nr_word": tracker.nr_word,
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index e8662a101..df5456df3 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -375,7 +375,7 @@ def train(
words_seen += sum(len(doc) for doc in docs)
with nlp.use_params(optimizer.averages):
util.set_env_log(False)
- epoch_model_path = output_path / ("model%d" % i)
+ epoch_model_path = output_path / f"model{i}"
nlp.to_disk(epoch_model_path)
nlp_loaded = util.load_model_from_path(epoch_model_path)
for beam_width in eval_beam_widths:
@@ -414,13 +414,13 @@ def train(
scorer = nlp_loaded.evaluate(dev_dataset, verbose=verbose)
end_time = timer()
cpu_wps = nwords / (end_time - start_time)
- acc_loc = output_path / ("model%d" % i) / "accuracy.json"
+ acc_loc = output_path / f"model{i}" / "accuracy.json"
srsly.write_json(acc_loc, scorer.scores)
# Update model meta.json
meta["lang"] = nlp.lang
meta["pipeline"] = nlp.pipe_names
- meta["spacy_version"] = ">=%s" % about.__version__
+ meta["spacy_version"] = f">={about.__version__}"
if beam_width == 1:
meta["speed"] = {
"nwords": nwords,
@@ -443,10 +443,10 @@ def train(
"keys": nlp.vocab.vectors.n_keys,
"name": nlp.vocab.vectors.name,
}
- meta.setdefault("name", "model%d" % i)
+ meta.setdefault("name", f"model{i}")
meta.setdefault("version", version)
meta["labels"] = nlp.meta["labels"]
- meta_loc = output_path / ("model%d" % i) / "meta.json"
+ meta_loc = output_path / f"model{i}" / "meta.json"
srsly.write_json(meta_loc, meta)
util.set_env_log(verbose)
diff --git a/spacy/gold.pyx b/spacy/gold.pyx
index e3af40d4d..1d3d8e034 100644
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -615,7 +615,7 @@ def _consume_ent(tags):
else:
start = "B-" + label
end = "L-" + label
- middle = ["I-%s" % label for _ in range(1, length - 1)]
+ middle = [f"I-{label}" for _ in range(1, length - 1)]
return [start] + middle + [end]
@@ -1204,12 +1204,12 @@ def biluo_tags_from_offsets(doc, entities, missing="O"):
# Only interested if the tokenization is correct
if start_token is not None and end_token is not None:
if start_token == end_token:
- biluo[start_token] = "U-%s" % label
+ biluo[start_token] = f"U-{label}"
else:
- biluo[start_token] = "B-%s" % label
+ biluo[start_token] = f"B-{label}"
for i in range(start_token+1, end_token):
- biluo[i] = "I-%s" % label
- biluo[end_token] = "L-%s" % label
+ biluo[i] = f"I-{label}"
+ biluo[end_token] = f"L-{label}"
# Now distinguish the O cases from ones where we miss the tokenization
entity_chars = set()
for start_char, end_char, label in entities:
diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index 1129fa860..64fbb1e29 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -442,7 +442,7 @@ cdef class KnowledgeBase:
cdef class Writer:
def __init__(self, object loc):
if path.exists(loc):
- assert not path.isdir(loc), "%s is directory." % loc
+ assert not path.isdir(loc), f"{loc} is directory"
if isinstance(loc, Path):
loc = bytes(loc)
cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
diff --git a/spacy/lang/ca/tokenizer_exceptions.py b/spacy/lang/ca/tokenizer_exceptions.py
index 5a9d9055a..b4ae61a2d 100644
--- a/spacy/lang/ca/tokenizer_exceptions.py
+++ b/spacy/lang/ca/tokenizer_exceptions.py
@@ -30,9 +30,9 @@ _exc["12m."] = [{ORTH: "12"}, {ORTH: "m.", LEMMA: "p.m."}]
for h in range(1, 12 + 1):
for period in ["a.m.", "am"]:
- _exc["%d%s" % (h, period)] = [{ORTH: "%d" % h}, {ORTH: period, LEMMA: "a.m."}]
+ _exc[f"{h}{period}"] = [{ORTH: f"{h}"}, {ORTH: period, LEMMA: "a.m."}]
for period in ["p.m.", "pm"]:
- _exc["%d%s" % (h, period)] = [{ORTH: "%d" % h}, {ORTH: period, LEMMA: "p.m."}]
+ _exc[f"{h}{period}"] = [{ORTH: f"{h}"}, {ORTH: period, LEMMA: "p.m."}]
TOKENIZER_EXCEPTIONS = _exc
diff --git a/spacy/lang/da/tokenizer_exceptions.py b/spacy/lang/da/tokenizer_exceptions.py
index 64eba819f..c8ea9cbf5 100644
--- a/spacy/lang/da/tokenizer_exceptions.py
+++ b/spacy/lang/da/tokenizer_exceptions.py
@@ -559,7 +559,7 @@ for exc_data in [
# Dates
for h in range(1, 31 + 1):
for period in ["."]:
- _exc["%d%s" % (h, period)] = [{ORTH: "%d." % h}]
+ _exc[f"{h}{period}"] = [{ORTH: f"{h}."}]
_custom_base_exc = {"i.": [{ORTH: "i", LEMMA: "i", NORM: "i"}, {ORTH: ".", TAG: PUNCT}]}
_exc.update(_custom_base_exc)
diff --git a/spacy/lang/el/tokenizer_exceptions.py b/spacy/lang/el/tokenizer_exceptions.py
index 27ae1fe3a..112fd991b 100644
--- a/spacy/lang/el/tokenizer_exceptions.py
+++ b/spacy/lang/el/tokenizer_exceptions.py
@@ -131,14 +131,14 @@ _exc.update(_other_exc)
for h in range(1, 12 + 1):
for period in ["π.μ.", "πμ"]:
- _exc["%d%s" % (h, period)] = [
- {ORTH: "%d" % h},
+ _exc[f"{h}{period}"] = [
+ {ORTH: f"{h}"},
{ORTH: period, LEMMA: "π.μ.", NORM: "π.μ."},
]
for period in ["μ.μ.", "μμ"]:
- _exc["%d%s" % (h, period)] = [
- {ORTH: "%d" % h},
+ _exc[f"{h}{period}"] = [
+ {ORTH: f"{h}"},
{ORTH: period, LEMMA: "μ.μ.", NORM: "μ.μ."},
]
diff --git a/spacy/lang/en/tokenizer_exceptions.py b/spacy/lang/en/tokenizer_exceptions.py
index 776948c28..3e8075ec4 100644
--- a/spacy/lang/en/tokenizer_exceptions.py
+++ b/spacy/lang/en/tokenizer_exceptions.py
@@ -328,13 +328,13 @@ for exc_data in [
for h in range(1, 12 + 1):
for period in ["a.m.", "am"]:
- _exc["%d%s" % (h, period)] = [
- {ORTH: "%d" % h},
+ _exc[f"{h}{period}"] = [
+ {ORTH: f"{h}"},
{ORTH: period, LEMMA: "a.m.", NORM: "a.m."},
]
for period in ["p.m.", "pm"]:
- _exc["%d%s" % (h, period)] = [
- {ORTH: "%d" % h},
+ _exc[f"{h}{period}"] = [
+ {ORTH: f"{h}"},
{ORTH: period, LEMMA: "p.m.", NORM: "p.m."},
]
diff --git a/spacy/lang/es/tokenizer_exceptions.py b/spacy/lang/es/tokenizer_exceptions.py
index 1cd5941be..5c7fcb15d 100644
--- a/spacy/lang/es/tokenizer_exceptions.py
+++ b/spacy/lang/es/tokenizer_exceptions.py
@@ -28,9 +28,9 @@ _exc["12m."] = [{ORTH: "12"}, {ORTH: "m.", LEMMA: "p.m."}]
for h in range(1, 12 + 1):
for period in ["a.m.", "am"]:
- _exc["%d%s" % (h, period)] = [{ORTH: "%d" % h}, {ORTH: period, LEMMA: "a.m."}]
+ _exc[f"{h}{period}"] = [{ORTH: f"{h}"}, {ORTH: period, LEMMA: "a.m."}]
for period in ["p.m.", "pm"]:
- _exc["%d%s" % (h, period)] = [{ORTH: "%d" % h}, {ORTH: period, LEMMA: "p.m."}]
+ _exc[f"{h}{period}"] = [{ORTH: f"{h}"}, {ORTH: period, LEMMA: "p.m."}]
for orth in [
diff --git a/spacy/lang/fr/tokenizer_exceptions.py b/spacy/lang/fr/tokenizer_exceptions.py
index b1c0a53af..4e2e7fb18 100644
--- a/spacy/lang/fr/tokenizer_exceptions.py
+++ b/spacy/lang/fr/tokenizer_exceptions.py
@@ -85,7 +85,7 @@ for verb, verb_lemma in [("est", "être")]:
for pre, pre_lemma in [("qu'", "que"), ("n'", "ne")]:
for orth in [pre, pre.title()]:
- _exc["%sest-ce" % orth] = [
+ _exc[f"{orth}est-ce"] = [
{LEMMA: pre_lemma, ORTH: orth, TAG: "ADV"},
{LEMMA: "être", ORTH: "est", TAG: "VERB"},
{LEMMA: "ce", ORTH: "-ce"},
diff --git a/spacy/language.py b/spacy/language.py
index 4a553bcaf..4ae729588 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1015,7 +1015,7 @@ def _fix_pretrained_vectors_name(nlp):
elif not nlp.vocab.vectors.size:
nlp.vocab.vectors.name = None
elif "name" in nlp.meta and "lang" in nlp.meta:
- vectors_name = "%s_%s.vectors" % (nlp.meta["lang"], nlp.meta["name"])
+ vectors_name = f"{nlp.meta['lang']}_{nlp.meta['name']}.vectors"
nlp.vocab.vectors.name = vectors_name
else:
raise ValueError(Errors.E092)
diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
index f12691170..8030a9a28 100644
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -72,7 +72,7 @@ def _normalize_props(props):
# just take the first one :(
if "|" in value:
value = value.split("|")[0]
- attr = '%s_%s' % (key, value)
+ attr = f"{key}_{value}"
if attr in FEATURES:
props.pop(key)
props[attr] = True
diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index ff88340cd..5ca651077 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -985,14 +985,14 @@ class MultitaskObjective(Tagger):
offset = token_annotation.heads[i] - i
offset = min(offset, 2)
offset = max(offset, -2)
- return "%s-%s:%d" % (token_annotation.deps[i], token_annotation.tags[i], offset)
+ return f"{token_annotation.deps[i]}-{token_annotation.tags[i]}:{offset}"
@staticmethod
def make_ent_tag(i, token_annotation):
if token_annotation.entities is None or token_annotation.entities[i] is None:
return None
else:
- return "%s-%s" % (token_annotation.tags[i], token_annotation.entities[i])
+ return f"{token_annotation.tags[i]}-{token_annotation.entities[i]}"
@staticmethod
def make_sent_start(target, token_annotation, cache=True, _cache={}):
diff --git a/spacy/syntax/nonproj.pyx b/spacy/syntax/nonproj.pyx
index 0f738f99f..f024c1f05 100644
--- a/spacy/syntax/nonproj.pyx
+++ b/spacy/syntax/nonproj.pyx
@@ -154,8 +154,7 @@ def _decorate(heads, proj_heads, labels):
deco_labels = []
for tokenid, head in enumerate(heads):
if head != proj_heads[tokenid]:
- deco_labels.append(
- '%s%s%s' % (labels[tokenid], DELIMITER, labels[head]))
+ deco_labels.append(f"{labels[tokenid]}{DELIMITER}{labels[head]}")
else:
deco_labels.append(labels[tokenid])
return deco_labels
diff --git a/spacy/syntax/stateclass.pyx b/spacy/syntax/stateclass.pyx
index 47b37946c..e472e9861 100644
--- a/spacy/syntax/stateclass.pyx
+++ b/spacy/syntax/stateclass.pyx
@@ -46,9 +46,9 @@ cdef class StateClass:
def print_state(self, words):
words = list(words) + ['_']
- top = words[self.S(0)] + '_%d' % self.S_(0).head
- second = words[self.S(1)] + '_%d' % self.S_(1).head
- third = words[self.S(2)] + '_%d' % self.S_(2).head
+ top = f"{words[self.S(0)]}_{self.S_(0).head}"
+ second = f"{words[self.S(1)]}_{self.S_(1).head}"
+ third = f"{words[self.S(2)]}_{self.S_(2).head}"
n0 = words[self.B(0)]
n1 = words[self.B(1)]
return ' '.join((third, second, top, '|', n0, n1))
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index ba7b67e25..b391dd88e 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -14,11 +14,11 @@ def pytest_runtest_setup(item):
# recognize the option we're asking about. To avoid this, we need to
# pass a default value. We default to False, i.e., we act like all the
# options weren't given.
- return item.config.getoption("--%s" % opt, False)
+ return item.config.getoption(f"--{opt}", False)
for opt in ["slow"]:
if opt in item.keywords and not getopt(opt):
- pytest.skip("need --%s option to run" % opt)
+ pytest.skip(f"need --{opt} option to run")
# Fixtures for language tokenizers (languages sorted alphabetically)
diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py
index 3ec8b508d..e2fb02a2a 100644
--- a/spacy/tests/pipeline/test_pipe_methods.py
+++ b/spacy/tests/pipeline/test_pipe_methods.py
@@ -115,7 +115,7 @@ def test_disable_pipes_list_arg(nlp):
@pytest.mark.parametrize("n_pipes", [100])
def test_add_lots_of_pipes(nlp, n_pipes):
for i in range(n_pipes):
- nlp.add_pipe(lambda doc: doc, name="pipe_%d" % i)
+ nlp.add_pipe(lambda doc: doc, name=f"pipe_{i}")
assert len(nlp.pipe_names) == n_pipes
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 58423c420..7e6473d56 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -499,7 +499,7 @@ cdef class Doc:
token = &self.c[i]
if token.ent_iob == 1:
if start == -1:
- seq = ["%s|%s" % (t.text, t.ent_iob_) for t in self[i-5:i+5]]
+ seq = [f"{t.text}|{t.ent_iob_}" for t in self[i-5:i+5]]
raise ValueError(Errors.E093.format(seq=" ".join(seq)))
elif token.ent_iob == 2 or token.ent_iob == 0:
if start != -1:
diff --git a/spacy/util.py b/spacy/util.py
index 57bbee69f..55e197eb2 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -62,7 +62,7 @@ def get_lang_class(lang):
return registry.languages.get(lang)
else:
try:
- module = importlib.import_module(".lang.%s" % lang, "spacy")
+ module = importlib.import_module(f".lang.{lang}", "spacy")
except ImportError as err:
raise ImportError(Errors.E048.format(lang=lang, err=err))
set_lang_class(lang, getattr(module, module.__all__[0]))
@@ -212,7 +212,7 @@ def load_model_from_init_py(init_file, **overrides):
"""
model_path = Path(init_file).parent
meta = get_model_meta(model_path)
- data_dir = "%s_%s-%s" % (meta["lang"], meta["name"], meta["version"])
+ data_dir = f"{meta['lang']}_{meta['name']}-{meta['version']}"
data_path = model_path / data_dir
if not model_path.exists():
raise IOError(Errors.E052.format(path=data_path))
From 401946d480d2841139c2b8986d900da0d5e12e40 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Wed, 25 Dec 2019 18:02:20 +0100
Subject: [PATCH 039/496] Un-xfail passing tests
---
spacy/tests/lang/en/test_prefix_suffix_infix.py | 1 -
spacy/tests/lang/en/test_punct.py | 1 -
spacy/tests/lang/ru/test_tokenizer.py | 1 -
spacy/tests/lang/sr/test_tokenizer.py | 1 -
4 files changed, 4 deletions(-)
diff --git a/spacy/tests/lang/en/test_prefix_suffix_infix.py b/spacy/tests/lang/en/test_prefix_suffix_infix.py
index 8c9c58fea..9efcc1015 100644
--- a/spacy/tests/lang/en/test_prefix_suffix_infix.py
+++ b/spacy/tests/lang/en/test_prefix_suffix_infix.py
@@ -108,7 +108,6 @@ def test_en_tokenizer_splits_double_hyphen_infix(en_tokenizer):
assert tokens[9].text == "people"
-@pytest.mark.xfail
def test_en_tokenizer_splits_period_abbr(en_tokenizer):
text = "Today is Tuesday.Mr."
tokens = en_tokenizer(text)
diff --git a/spacy/tests/lang/en/test_punct.py b/spacy/tests/lang/en/test_punct.py
index 4dc6ddfe4..1d10478a1 100644
--- a/spacy/tests/lang/en/test_punct.py
+++ b/spacy/tests/lang/en/test_punct.py
@@ -79,7 +79,6 @@ def test_en_tokenizer_splits_open_appostrophe(en_tokenizer, text):
assert tokens[0].text == "'"
-@pytest.mark.xfail
@pytest.mark.parametrize("text", ["Hello''"])
def test_en_tokenizer_splits_double_end_quote(en_tokenizer, text):
tokens = en_tokenizer(text)
diff --git a/spacy/tests/lang/ru/test_tokenizer.py b/spacy/tests/lang/ru/test_tokenizer.py
index e05a479aa..1cfdc50ee 100644
--- a/spacy/tests/lang/ru/test_tokenizer.py
+++ b/spacy/tests/lang/ru/test_tokenizer.py
@@ -77,7 +77,6 @@ def test_ru_tokenizer_splits_open_appostrophe(ru_tokenizer, text):
assert tokens[0].text == "'"
-@pytest.mark.xfail
@pytest.mark.parametrize("text", ["Тест''"])
def test_ru_tokenizer_splits_double_end_quote(ru_tokenizer, text):
tokens = ru_tokenizer(text)
diff --git a/spacy/tests/lang/sr/test_tokenizer.py b/spacy/tests/lang/sr/test_tokenizer.py
index 03a0470bd..fdcf790d8 100644
--- a/spacy/tests/lang/sr/test_tokenizer.py
+++ b/spacy/tests/lang/sr/test_tokenizer.py
@@ -77,7 +77,6 @@ def test_sr_tokenizer_splits_open_appostrophe(sr_tokenizer, text):
assert tokens[0].text == "'"
-@pytest.mark.xfail
@pytest.mark.parametrize("text", ["Тест''"])
def test_sr_tokenizer_splits_double_end_quote(sr_tokenizer, text):
tokens = sr_tokenizer(text)
From 83e0a6f3e3bc21e32d95cbe8fcf2f8dd4fa76c65 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Wed, 1 Jan 2020 13:15:46 +0100
Subject: [PATCH 040/496] Modernize plac commands for Python 3 (#4836)
---
spacy/cli/convert.py | 37 +++++----------
spacy/cli/debug_data.py | 34 +++++--------
spacy/cli/download.py | 12 ++---
spacy/cli/evaluate.py | 26 ++++------
spacy/cli/info.py | 12 ++---
spacy/cli/init_model.py | 37 +++++----------
spacy/cli/link.py | 13 +++--
spacy/cli/package.py | 18 +++----
spacy/cli/pretrain.py | 103 ++++++++++------------------------------
spacy/cli/profile.py | 14 +++---
spacy/cli/train.py | 91 ++++++++++++-----------------------
11 files changed, 129 insertions(+), 268 deletions(-)
diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py
index d8c8a7a18..31931db68 100644
--- a/spacy/cli/convert.py
+++ b/spacy/cli/convert.py
@@ -1,4 +1,3 @@
-import plac
from pathlib import Path
from wasabi import Printer
import srsly
@@ -26,31 +25,19 @@ FILE_TYPES = ("json", "jsonl", "msg")
FILE_TYPES_STDOUT = ("json", "jsonl")
-@plac.annotations(
- # fmt: off
- input_file=("Input file", "positional", None, str),
- output_dir=("Output directory. '-' for stdout.", "positional", None, str),
- file_type=(f"Type of data to produce: {FILE_TYPES}", "option", "t", str, FILE_TYPES),
- n_sents=("Number of sentences per doc (0 to disable)", "option", "n", int),
- seg_sents=("Segment sentences (for -c ner)", "flag", "s"),
- model=("Model for sentence segmentation (for -s)", "option", "b", str),
- converter=(f"Converter: {tuple(CONVERTERS.keys())}", "option", "c", str),
- lang=("Language (if tokenizer required)", "option", "l", str),
- morphology=("Enable appending morphology to tags", "flag", "m", bool),
- ner_map_path=("NER tag mapping (as JSON-encoded dict of entity types)", "option", "N", Path,),
- # fmt: on
-)
def convert(
- input_file,
- output_dir="-",
- file_type="json",
- n_sents=1,
- seg_sents=False,
- model=None,
- morphology=False,
- converter="auto",
- ner_map_path=None,
- lang=None,
+ # fmt: off
+ input_file: ("Input file", "positional", None, str),
+ output_dir: ("Output directory. '-' for stdout.", "positional", None, str) = "-",
+ file_type: (f"Type of data to produce: {FILE_TYPES}", "option", "t", str, FILE_TYPES) = "json",
+ n_sents: ("Number of sentences per doc (0 to disable)", "option", "n", int) = 1,
+ seg_sents: ("Segment sentences (for -c ner)", "flag", "s") = False,
+ model: ("Model for sentence segmentation (for -s)", "option", "b", str) = None,
+ morphology: ("Enable appending morphology to tags", "flag", "m", bool) = False,
+ converter: (f"Converter: {tuple(CONVERTERS.keys())}", "option", "c", str) = "auto",
+ ner_map_path: ("NER tag mapping (as JSON-encoded dict of entity types)", "option", "N", Path) = None,
+ lang: ("Language (if tokenizer required)", "option", "l", str) = None,
+ # fmt: on
):
"""
Convert files into JSON format for use with train command and other
diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index 2e780f53c..c894788cb 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -1,6 +1,5 @@
from pathlib import Path
from collections import Counter
-import plac
import sys
import srsly
from wasabi import Printer, MESSAGES
@@ -19,29 +18,18 @@ BLANK_MODEL_MIN_THRESHOLD = 100
BLANK_MODEL_THRESHOLD = 2000
-@plac.annotations(
- # fmt: off
- lang=("model language", "positional", None, str),
- train_path=("location of JSON-formatted training data", "positional", None, Path),
- dev_path=("location of JSON-formatted development data", "positional", None, Path),
- tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path),
- base_model=("name of model to update (optional)", "option", "b", str),
- pipeline=("Comma-separated names of pipeline components to train", "option", "p", str),
- ignore_warnings=("Ignore warnings, only show stats and errors", "flag", "IW", bool),
- verbose=("Print additional information and explanations", "flag", "V", bool),
- no_format=("Don't pretty-print the results", "flag", "NF", bool),
- # fmt: on
-)
def debug_data(
- lang,
- train_path,
- dev_path,
- tag_map_path=None,
- base_model=None,
- pipeline="tagger,parser,ner",
- ignore_warnings=False,
- verbose=False,
- no_format=False,
+ # fmt: off
+ lang: ("Model language", "positional", None, str),
+ train_path: ("Location of JSON-formatted training data", "positional", None, Path),
+ dev_path: ("Location of JSON-formatted development data", "positional", None, Path),
+ tag_map_path: ("Location of JSON-formatted tag map", "option", "tm", Path) = None,
+ base_model: ("Name of model to update (optional)", "option", "b", str) = None,
+ pipeline: ("Comma-separated names of pipeline components to train", "option", "p", str) = "tagger,parser,ner",
+ ignore_warnings: ("Ignore warnings, only show stats and errors", "flag", "IW", bool) = False,
+ verbose: ("Print additional information and explanations", "flag", "V", bool) = False,
+ no_format: ("Don't pretty-print the results", "flag", "NF", bool) = False,
+ # fmt: on
):
"""
Analyze, debug and validate your training and development data, get useful
diff --git a/spacy/cli/download.py b/spacy/cli/download.py
index 7c87a582a..7388bf615 100644
--- a/spacy/cli/download.py
+++ b/spacy/cli/download.py
@@ -1,4 +1,3 @@
-import plac
import requests
import os
import subprocess
@@ -10,12 +9,11 @@ from ..util import get_package_path
from .. import about
-@plac.annotations(
- model=("Model to download (shortcut or name)", "positional", None, str),
- direct=("Force direct download of name + version", "flag", "d", bool),
- pip_args=("Additional arguments to be passed to `pip install` on model install"),
-)
-def download(model, direct=False, *pip_args):
+def download(
+ model: ("Model to download (shortcut or name)", "positional", None, str),
+ direct: ("Force direct download of name + version", "flag", "d", bool) = False,
+ *pip_args: ("Additional arguments to be passed to `pip install` on model install"),
+):
"""
Download compatible model from default download path using pip. Model
can be shortcut, model name or, if --direct flag is set, full model name
diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py
index a6b730d65..e047f1283 100644
--- a/spacy/cli/evaluate.py
+++ b/spacy/cli/evaluate.py
@@ -1,4 +1,3 @@
-import plac
from timeit import default_timer as timer
from wasabi import msg
@@ -7,23 +6,16 @@ from .. import util
from .. import displacy
-@plac.annotations(
- model=("Model name or path", "positional", None, str),
- data_path=("Location of JSON-formatted evaluation data", "positional", None, str),
- gold_preproc=("Use gold preprocessing", "flag", "G", bool),
- gpu_id=("Use GPU", "option", "g", int),
- displacy_path=("Directory to output rendered parses as HTML", "option", "dp", str),
- displacy_limit=("Limit of parses to render as HTML", "option", "dl", int),
- return_scores=("Return dict containing model scores", "flag", "R", bool),
-)
def evaluate(
- model,
- data_path,
- gpu_id=-1,
- gold_preproc=False,
- displacy_path=None,
- displacy_limit=25,
- return_scores=False,
+ # fmt: off
+ model: ("Model name or path", "positional", None, str),
+ data_path: ("Location of JSON-formatted evaluation data", "positional", None, str),
+ gpu_id: ("Use GPU", "option", "g", int) = -1,
+ gold_preproc: ("Use gold preprocessing", "flag", "G", bool) = False,
+ displacy_path: ("Directory to output rendered parses as HTML", "option", "dp", str) = None,
+ displacy_limit: ("Limit of parses to render as HTML", "option", "dl", int) = 25,
+ return_scores: ("Return dict containing model scores", "flag", "R", bool) = False,
+ # fmt: on
):
"""
Evaluate a model. To render a sample of parses in a HTML file, set an
diff --git a/spacy/cli/info.py b/spacy/cli/info.py
index 060a38e78..fc8764ca8 100644
--- a/spacy/cli/info.py
+++ b/spacy/cli/info.py
@@ -1,4 +1,3 @@
-import plac
import platform
from pathlib import Path
from wasabi import msg
@@ -8,12 +7,11 @@ from .. import util
from .. import about
-@plac.annotations(
- model=("Optional shortcut link of model", "positional", None, str),
- markdown=("Generate Markdown for GitHub issues", "flag", "md", str),
- silent=("Don't print anything (just return)", "flag", "s"),
-)
-def info(model=None, markdown=False, silent=False):
+def info(
+ model: ("Optional shortcut link of model", "positional", None, str) = None,
+ markdown: ("Generate Markdown for GitHub issues", "flag", "md", str) = False,
+ silent: ("Don't print anything (just return)", "flag", "s") = False,
+):
"""
Print info about spaCy installation. If a model shortcut link is
specified as an argument, print model information. Flag --markdown
diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py
index 87583ba73..babef106c 100644
--- a/spacy/cli/init_model.py
+++ b/spacy/cli/init_model.py
@@ -1,4 +1,3 @@
-import plac
import math
from tqdm import tqdm
import numpy
@@ -24,32 +23,18 @@ except ImportError:
DEFAULT_OOV_PROB = -20
-@plac.annotations(
- lang=("Model language", "positional", None, str),
- output_dir=("Model output directory", "positional", None, Path),
- freqs_loc=("Location of words frequencies file", "option", "f", Path),
- jsonl_loc=("Location of JSONL-formatted attributes file", "option", "j", Path),
- clusters_loc=("Optional location of brown clusters data", "option", "c", str),
- vectors_loc=("Optional vectors file in Word2Vec format", "option", "v", str),
- prune_vectors=("Optional number of vectors to prune to", "option", "V", int),
- vectors_name=(
- "Optional name for the word vectors, e.g. en_core_web_lg.vectors",
- "option",
- "vn",
- str,
- ),
- model_name=("Optional name for the model meta", "option", "mn", str),
-)
def init_model(
- lang,
- output_dir,
- freqs_loc=None,
- clusters_loc=None,
- jsonl_loc=None,
- vectors_loc=None,
- prune_vectors=-1,
- vectors_name=None,
- model_name=None,
+ # fmt: off
+ lang: ("Model language", "positional", None, str),
+ output_dir: ("Model output directory", "positional", None, Path),
+ freqs_loc: ("Location of words frequencies file", "option", "f", Path) = None,
+ clusters_loc: ("Optional location of brown clusters data", "option", "c", str) = None,
+ jsonl_loc: ("Location of JSONL-formatted attributes file", "option", "j", Path) = None,
+ vectors_loc: ("Optional vectors file in Word2Vec format", "option", "v", str) = None,
+ prune_vectors: ("Optional number of vectors to prune to", "option", "V", int) = -1,
+ vectors_name: ("Optional name for the word vectors, e.g. en_core_web_lg.vectors", "option", "vn", str) = None,
+ model_name: ("Optional name for the model meta", "option", "mn", str) = None,
+ # fmt: on
):
"""
Create a new model from raw data, like word frequencies, Brown clusters
diff --git a/spacy/cli/link.py b/spacy/cli/link.py
index df24adc23..d8af469dc 100644
--- a/spacy/cli/link.py
+++ b/spacy/cli/link.py
@@ -1,4 +1,3 @@
-import plac
from pathlib import Path
from wasabi import msg
@@ -6,12 +5,12 @@ from ..compat import symlink_to
from .. import util
-@plac.annotations(
- origin=("package name or local path to model", "positional", None, str),
- link_name=("name of shortuct link to create", "positional", None, str),
- force=("force overwriting of existing link", "flag", "f", bool),
-)
-def link(origin, link_name, force=False, model_path=None):
+def link(
+ origin: ("package name or local path to model", "positional", None, str),
+ link_name: ("name of shortuct link to create", "positional", None, str),
+ force: ("force overwriting of existing link", "flag", "f", bool) = False,
+ model_path=None,
+):
"""
Create a symlink for models within the spacy/data directory. Accepts
either the name of a pip package, or the local path to the model data
diff --git a/spacy/cli/package.py b/spacy/cli/package.py
index edd9117c5..8e27e44d0 100644
--- a/spacy/cli/package.py
+++ b/spacy/cli/package.py
@@ -1,4 +1,3 @@
-import plac
import shutil
from pathlib import Path
from wasabi import msg, get_raw_input
@@ -8,14 +7,15 @@ from .. import util
from .. import about
-@plac.annotations(
- input_dir=("Directory with model data", "positional", None, str),
- output_dir=("Output parent directory", "positional", None, str),
- meta_path=("Path to meta.json", "option", "m", str),
- create_meta=("Create meta.json, even if one exists", "flag", "c", bool),
- force=("Force overwriting existing model in output directory", "flag", "f", bool),
-)
-def package(input_dir, output_dir, meta_path=None, create_meta=False, force=False):
+def package(
+ # fmt: off
+ input_dir: ("Directory with model data", "positional", None, str),
+ output_dir: ("Output parent directory", "positional", None, str),
+ meta_path: ("Path to meta.json", "option", "m", str) = None,
+ create_meta: ("Create meta.json, even if one exists", "flag", "c", bool) = False,
+ force: ("Force overwriting existing model in output directory", "flag", "f", bool) = False,
+ # fmt: on
+):
"""
Generate Python package for model data, including meta and required
installation files. A new directory will be created in the specified
diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py
index 12aa8b5c2..9e2fc5b1c 100644
--- a/spacy/cli/pretrain.py
+++ b/spacy/cli/pretrain.py
@@ -1,4 +1,3 @@
-import plac
import random
import numpy
import time
@@ -21,85 +20,31 @@ from .. import util
from .train import _load_pretrained_tok2vec
-@plac.annotations(
- texts_loc=(
- "Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the "
- "key 'tokens'",
- "positional",
- None,
- str,
- ),
- vectors_model=("Name or path to spaCy model with vectors to learn from"),
- output_dir=("Directory to write models to on each epoch", "positional", None, str),
- width=("Width of CNN layers", "option", "cw", int),
- depth=("Depth of CNN layers", "option", "cd", int),
- cnn_window=("Window size for CNN layers", "option", "cW", int),
- cnn_pieces=("Maxout size for CNN layers. 1 for Mish", "option", "cP", int),
- use_chars=("Whether to use character-based embedding", "flag", "chr", bool),
- sa_depth=("Depth of self-attention layers", "option", "sa", int),
- bilstm_depth=("Depth of BiLSTM layers (requires PyTorch)", "option", "lstm", int),
- embed_rows=("Number of embedding rows", "option", "er", int),
- loss_func=(
- "Loss function to use for the objective. Either 'L2' or 'cosine'",
- "option",
- "L",
- str,
- ),
- use_vectors=("Whether to use the static vectors as input features", "flag", "uv"),
- dropout=("Dropout rate", "option", "d", float),
- batch_size=("Number of words per training batch", "option", "bs", int),
- max_length=(
- "Max words per example. Longer examples are discarded",
- "option",
- "xw",
- int,
- ),
- min_length=(
- "Min words per example. Shorter examples are discarded",
- "option",
- "nw",
- int,
- ),
- seed=("Seed for random number generators", "option", "s", int),
- n_iter=("Number of iterations to pretrain", "option", "i", int),
- n_save_every=("Save model every X batches.", "option", "se", int),
- init_tok2vec=(
- "Path to pretrained weights for the token-to-vector parts of the models. See 'spacy pretrain'. Experimental.",
- "option",
- "t2v",
- Path,
- ),
- epoch_start=(
- "The epoch to start counting at. Only relevant when using '--init-tok2vec' and the given weight file has been "
- "renamed. Prevents unintended overwriting of existing weight files.",
- "option",
- "es",
- int,
- ),
-)
def pretrain(
- texts_loc,
- vectors_model,
- output_dir,
- width=96,
- depth=4,
- bilstm_depth=0,
- cnn_pieces=3,
- sa_depth=0,
- use_chars=False,
- cnn_window=1,
- embed_rows=2000,
- loss_func="cosine",
- use_vectors=False,
- dropout=0.2,
- n_iter=1000,
- batch_size=3000,
- max_length=500,
- min_length=5,
- seed=0,
- n_save_every=None,
- init_tok2vec=None,
- epoch_start=None,
+ # fmt: off
+ texts_loc: ("Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the key 'tokens'", "positional", None, str),
+ vectors_model: ("Name or path to spaCy model with vectors to learn from", "positional", None, str),
+ output_dir: ("Directory to write models to on each epoch", "positional", None, str),
+ width: ("Width of CNN layers", "option", "cw", int) = 96,
+ depth: ("Depth of CNN layers", "option", "cd", int) = 4,
+ bilstm_depth: ("Depth of BiLSTM layers (requires PyTorch)", "option", "lstm", int) = 0,
+ cnn_pieces: ("Maxout size for CNN layers. 1 for Mish", "option", "cP", int) = 3,
+ sa_depth: ("Depth of self-attention layers", "option", "sa", int) = 0,
+ use_chars: ("Whether to use character-based embedding", "flag", "chr", bool) = False,
+ cnn_window: ("Window size for CNN layers", "option", "cW", int) = 1,
+ embed_rows: ("Number of embedding rows", "option", "er", int) = 2000,
+ loss_func: ("Loss function to use for the objective. Either 'L2' or 'cosine'", "option", "L", str) = "cosine",
+ use_vectors: ("Whether to use the static vectors as input features", "flag", "uv") = False,
+ dropout: ("Dropout rate", "option", "d", float) = 0.2,
+ n_iter: ("Number of iterations to pretrain", "option", "i", int) = 1000,
+ batch_size: ("Number of words per training batch", "option", "bs", int) = 3000,
+ max_length: ("Max words per example. Longer examples are discarded", "option", "xw", int) = 500,
+ min_length: ("Min words per example. Shorter examples are discarded", "option", "nw", int) = 5,
+ seed: ("Seed for random number generators", "option", "s", int) = 0,
+ n_save_every: ("Save model every X batches.", "option", "se", int) = None,
+ init_tok2vec: ("Path to pretrained weights for the token-to-vector parts of the models. See 'spacy pretrain'. Experimental.", "option", "t2v", Path) = None,
+ epoch_start: ("The epoch to start counting at. Only relevant when using '--init-tok2vec' and the given weight file has been renamed. Prevents unintended overwriting of existing weight files.", "option", "es", int) = None,
+ # fmt: on
):
"""
Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
diff --git a/spacy/cli/profile.py b/spacy/cli/profile.py
index f3df0817d..44e59971a 100644
--- a/spacy/cli/profile.py
+++ b/spacy/cli/profile.py
@@ -1,4 +1,3 @@
-import plac
import tqdm
from pathlib import Path
import srsly
@@ -12,12 +11,13 @@ from wasabi import msg
from ..util import load_model
-@plac.annotations(
- model=("Model to load", "positional", None, str),
- inputs=("Location of input file. '-' for stdin.", "positional", None, str),
- n_texts=("Maximum number of texts to use if available", "option", "n", int),
-)
-def profile(model, inputs=None, n_texts=10000):
+def profile(
+ # fmt: off
+ model: ("Model to load", "positional", None, str),
+ inputs: ("Location of input file. '-' for stdin.", "positional", None, str) = None,
+ n_texts: ("Maximum number of texts to use if available", "option", "n", int) = 10000,
+ # fmt: on
+):
"""
Profile a spaCy pipeline, to find out which functions take the most time.
Input should be formatted as one JSON object per line with a key "text".
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index df5456df3..454403529 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -1,4 +1,3 @@
-import plac
import os
import tqdm
from pathlib import Path
@@ -17,67 +16,37 @@ from .. import util
from .. import about
-@plac.annotations(
- # fmt: off
- lang=("Model language", "positional", None, str),
- output_path=("Output directory to store model in", "positional", None, Path),
- train_path=("Location of JSON-formatted training data", "positional", None, Path),
- dev_path=("Location of JSON-formatted development data", "positional", None, Path),
- raw_text=("Path to jsonl file with unlabelled text documents.", "option", "rt", Path),
- base_model=("Name of model to update (optional)", "option", "b", str),
- pipeline=("Comma-separated names of pipeline components", "option", "p", str),
- vectors=("Model to load vectors from", "option", "v", str),
- n_iter=("Number of iterations", "option", "n", int),
- n_early_stopping=("Maximum number of training epochs without dev accuracy improvement", "option", "ne", int),
- n_examples=("Number of examples", "option", "ns", int),
- use_gpu=("Use GPU", "option", "g", int),
- version=("Model version", "option", "V", str),
- meta_path=("Optional path to meta.json to use as base.", "option", "m", Path),
- init_tok2vec=("Path to pretrained weights for the token-to-vector parts of the models. See 'spacy pretrain'. Experimental.", "option", "t2v", Path),
- parser_multitasks=("Side objectives for parser CNN, e.g. 'dep' or 'dep,tag'", "option", "pt", str),
- entity_multitasks=("Side objectives for NER CNN, e.g. 'dep' or 'dep,tag'", "option", "et", str),
- noise_level=("Amount of corruption for data augmentation", "option", "nl", float),
- orth_variant_level=("Amount of orthography variation for data augmentation", "option", "ovl", float),
- eval_beam_widths=("Beam widths to evaluate, e.g. 4,8", "option", "bw", str),
- gold_preproc=("Use gold preprocessing", "flag", "G", bool),
- learn_tokens=("Make parser learn gold-standard tokenization", "flag", "T", bool),
- textcat_multilabel=("Textcat classes aren't mutually exclusive (multilabel)", "flag", "TML", bool),
- textcat_arch=("Textcat model architecture", "option", "ta", str),
- textcat_positive_label=("Textcat positive label for binary classes with two labels", "option", "tpl", str),
- tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path),
- verbose=("Display more information for debug", "flag", "VV", bool),
- debug=("Run data diagnostics before training", "flag", "D", bool),
- # fmt: on
-)
def train(
- lang,
- output_path,
- train_path,
- dev_path,
- raw_text=None,
- base_model=None,
- pipeline="tagger,parser,ner",
- vectors=None,
- n_iter=30,
- n_early_stopping=None,
- n_examples=0,
- use_gpu=-1,
- version="0.0.0",
- meta_path=None,
- init_tok2vec=None,
- parser_multitasks="",
- entity_multitasks="",
- noise_level=0.0,
- orth_variant_level=0.0,
- eval_beam_widths="",
- gold_preproc=False,
- learn_tokens=False,
- textcat_multilabel=False,
- textcat_arch="bow",
- textcat_positive_label=None,
- tag_map_path=None,
- verbose=False,
- debug=False,
+ # fmt: off
+ lang: ("Model language", "positional", None, str),
+ output_path: ("Output directory to store model in", "positional", None, Path),
+ train_path: ("Location of JSON-formatted training data", "positional", None, Path),
+ dev_path: ("Location of JSON-formatted development data", "positional", None, Path),
+ raw_text: ("Path to jsonl file with unlabelled text documents.", "option", "rt", Path) = None,
+ base_model: ("Name of model to update (optional)", "option", "b", str) = None,
+ pipeline: ("Comma-separated names of pipeline components", "option", "p", str) = "tagger,parser,ner",
+ vectors: ("Model to load vectors from", "option", "v", str) = None,
+ n_iter: ("Number of iterations", "option", "n", int) = 30,
+ n_early_stopping: ("Maximum number of training epochs without dev accuracy improvement", "option", "ne", int) = None,
+ n_examples: ("Number of examples", "option", "ns", int) = 0,
+ use_gpu: ("Use GPU", "option", "g", int) = -1,
+ version: ("Model version", "option", "V", str) = "0.0.0",
+ meta_path: ("Optional path to meta.json to use as base.", "option", "m", Path) = None,
+ init_tok2vec: ("Path to pretrained weights for the token-to-vector parts of the models. See 'spacy pretrain'. Experimental.", "option", "t2v", Path) = None,
+ parser_multitasks: ("Side objectives for parser CNN, e.g. 'dep' or 'dep,tag'", "option", "pt", str) = "",
+ entity_multitasks: ("Side objectives for NER CNN, e.g. 'dep' or 'dep,tag'", "option", "et", str) = "",
+ noise_level: ("Amount of corruption for data augmentation", "option", "nl", float) = 0.0,
+ orth_variant_level: ("Amount of orthography variation for data augmentation", "option", "ovl", float) = 0.0,
+ eval_beam_widths: ("Beam widths to evaluate, e.g. 4,8", "option", "bw", str) = "",
+ gold_preproc: ("Use gold preprocessing", "flag", "G", bool) = False,
+ learn_tokens: ("Make parser learn gold-standard tokenization", "flag", "T", bool) = False,
+ textcat_multilabel: ("Textcat classes aren't mutually exclusive (multilabel)", "flag", "TML", bool) = False,
+ textcat_arch: ("Textcat model architecture", "option", "ta", str) = "bow",
+ textcat_positive_label: ("Textcat positive label for binary classes with two labels", "option", "tpl", str) = None,
+ tag_map_path: ("Location of JSON-formatted tag map", "option", "tm", Path) = None,
+ verbose: ("Display more information for debug", "flag", "VV", bool) = False,
+ debug: ("Run data diagnostics before training", "flag", "D", bool) = False,
+ # fmt: on
):
"""
Train or update a spaCy model. Requires data to be formatted in spaCy's
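The commit above applies one mechanical pattern across the CLI: the metadata that used to live in the @plac.annotations(...) decorator now sits in Python 3 parameter annotations, with defaults moving onto the signature. A minimal sketch with a hypothetical greet command (not part of the patch) shows the shape plac expects:

    import plac


    def greet(
        name: ("Name to greet", "positional", None, str),
        shout: ("Print the greeting in upper case", "flag", "s", bool) = False,
    ):
        """Print a greeting, optionally in upper case."""
        message = f"Hello, {name}!"
        print(message.upper() if shout else message)


    if __name__ == "__main__":
        plac.call(greet)

Positional arguments keep no default, options and flags get one, and the annotation tuple (description, kind, abbreviation, type) is read by plac exactly as the old decorator arguments were.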
From 581eeed98b7a5a4565ea8286ef78bfce01667535 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem
Date: Wed, 1 Jan 2020 13:16:48 +0100
Subject: [PATCH 041/496] Warning goldparse (#4851)
* label in span not writable anymore
* Revert "label in span not writable anymore"
This reverts commit ab442338c8c4ddd7dfbc15348f999b74f4928090.
* provide more friendly error msg for parsing file
---
spacy/errors.py | 1 +
spacy/gold.pyx | 24 ++++++++++++++++--------
2 files changed, 17 insertions(+), 8 deletions(-)
diff --git a/spacy/errors.py b/spacy/errors.py
index 3aa4bedea..7393ddc07 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -534,6 +534,7 @@ class Errors(object):
"make sure the gold EL data refers to valid results of the "
"named entity recognizer in the `nlp` pipeline.")
# TODO: fix numbering after merging develop into master
+ E996 = ("Could not parse {file}: {msg}")
E997 = ("Tokenizer special cases are not allowed to modify the text. "
"This would map '{chunk}' to '{orth}' given token attributes "
"'{token_attrs}'.")
diff --git a/spacy/gold.pyx b/spacy/gold.pyx
index 1d3d8e034..10b8bf0cf 100644
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -194,9 +194,10 @@ class GoldCorpus(object):
i = 0
for loc in locs:
loc = util.ensure_path(loc)
- if loc.parts[-1].endswith("json"):
+ file_name = loc.parts[-1]
+ if file_name.endswith("json"):
examples = read_json_file(loc)
- elif loc.parts[-1].endswith("jsonl"):
+ elif file_name.endswith("jsonl"):
gold_tuples = srsly.read_jsonl(loc)
first_gold_tuple = next(gold_tuples)
gold_tuples = itertools.chain([first_gold_tuple], gold_tuples)
@@ -212,17 +213,24 @@ class GoldCorpus(object):
doc = ex_dict.get("text", None)
examples.append(Example.from_dict(ex_dict, doc=doc))
- elif loc.parts[-1].endswith("msg"):
+ elif file_name.endswith("msg"):
text, ex_dict = srsly.read_msgpack(loc)
examples = [Example.from_dict(ex_dict, doc=text)]
else:
supported = ("json", "jsonl", "msg")
raise ValueError(Errors.E124.format(path=loc, formats=supported))
- for example in examples:
- yield example
- i += 1
- if limit and i >= limit:
- return
+ try:
+ for example in examples:
+ yield example
+ i += 1
+ if limit and i >= limit:
+ return
+ except KeyError as e:
+ msg = "Missing key {}".format(e)
+ raise KeyError(Errors.E996.format(file=file_name, msg=msg))
+ except UnboundLocalError as e:
+ msg = "Unexpected document structure"
+ raise ValueError(Errors.E996.format(file=file_name, msg=msg))
@property
def dev_examples(self):
From e1b493ae8521af36fd1f0dfeafe7d9eb0408fe75 Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Wed, 8 Jan 2020 16:51:24 +0100
Subject: [PATCH 042/496] Add sentrec shortcut to Language (#4890)
---
spacy/language.py | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/spacy/language.py b/spacy/language.py
index 4ae729588..b91903595 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -225,6 +225,10 @@ class Language(object):
def linker(self):
return self.get_pipe("entity_linker")
+ @property
+ def sentrec(self):
+ return self.get_pipe("sentrec")
+
@property
def matcher(self):
return self.get_pipe("matcher")
From e55fa1899aa8bae311064004d0edaed8b37979e5 Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Wed, 8 Jan 2020 16:51:51 +0100
Subject: [PATCH 043/496] Report length of dev dataset correctly (#4891)
---
spacy/cli/debug_data.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index c894788cb..8c77f7356 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -107,7 +107,7 @@ def debug_data(
else:
msg.text(f"Starting with blank model '{lang}'")
msg.text(f"{len(train_dataset)} training docs")
- msg.text(f"{len(gold_dev_data)} evaluation docs")
+ msg.text(f"{len(dev_dataset)} evaluation docs")
if not len(gold_dev_data):
msg.fail("No evaluation docs")
From d2f3a44b42bfff9773fdf3abaccdcc0e78d295f7 Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Wed, 8 Jan 2020 16:52:14 +0100
Subject: [PATCH 044/496] Improve train CLI sentrec scoring (#4892)
* reorder the metrics to prioritize F over P/R
* add sentrec to model metrics
---
spacy/cli/train.py | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 454403529..6ebf5d37d 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -476,6 +476,8 @@ def _score_for_model(meta):
mean_acc.append((acc["ents_p"] + acc["ents_r"] + acc["ents_f"]) / 3)
if "textcat" in pipes:
mean_acc.append(acc["textcat_score"])
+ if "sentrec" in pipes:
+ mean_acc.append((acc["sent_p"] + acc["sent_r"] + acc["sent_f"]) / 3)
return sum(mean_acc) / len(mean_acc)
@@ -554,7 +556,7 @@ def _get_metrics(component):
elif component == "ner":
return ("ents_f", "ents_p", "ents_r")
elif component == "sentrec":
- return ("sent_p", "sent_r", "sent_f",)
+ return ("sent_f", "sent_p", "sent_r")
return ("token_acc",)
From 199d89943e546eefb76656ed933ca6ab34296662 Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Wed, 22 Jan 2020 15:40:31 +0100
Subject: [PATCH 045/496] Add as_example to Sentencizer pipe() (#4933)
---
spacy/pipeline/pipes.pyx | 24 ++++++++++++++++++------
spacy/tests/pipeline/test_sentencizer.py | 6 ++++++
2 files changed, 24 insertions(+), 6 deletions(-)
diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index 5ca651077..4f0f2469e 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -1711,12 +1711,24 @@ class Sentencizer(Pipe):
return example
return doc
- def pipe(self, stream, batch_size=128, n_threads=-1):
- for docs in util.minibatch(stream, size=batch_size):
- docs = list(docs)
- tag_ids = self.predict(docs)
- self.set_annotations(docs, tag_ids)
- yield from docs
+ def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
+ for examples in util.minibatch(stream, size=batch_size):
+ docs = [self._get_doc(ex) for ex in examples]
+ predictions = self.predict(docs)
+ if isinstance(predictions, tuple) and len(predictions) == 2:
+ scores, tensors = predictions
+ self.set_annotations(docs, scores, tensors=tensors)
+ else:
+ self.set_annotations(docs, predictions)
+
+ if as_example:
+ annotated_examples = []
+ for ex, doc in zip(examples, docs):
+ ex.doc = doc
+ annotated_examples.append(ex)
+ yield from annotated_examples
+ else:
+ yield from docs
def predict(self, docs):
"""Apply the pipeline's model to a batch of docs, without
diff --git a/spacy/tests/pipeline/test_sentencizer.py b/spacy/tests/pipeline/test_sentencizer.py
index 78ab6d2d1..5f9c55dbb 100644
--- a/spacy/tests/pipeline/test_sentencizer.py
+++ b/spacy/tests/pipeline/test_sentencizer.py
@@ -24,6 +24,12 @@ def test_sentencizer_pipe():
sent_starts = [t.is_sent_start for t in doc]
assert sent_starts == [True, False, True, False, False, False, False]
assert len(list(doc.sents)) == 2
+ for ex in nlp.pipe(texts, as_example=True):
+ doc = ex.doc
+ assert doc.is_sentenced
+ sent_starts = [t.is_sent_start for t in doc]
+ assert sent_starts == [True, False, True, False, False, False, False]
+ assert len(list(doc.sents)) == 2
@pytest.mark.parametrize(
From 0a0de85409e796d37b3b74796e2475c37d131c1b Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem
Date: Thu, 23 Jan 2020 22:00:24 +0100
Subject: [PATCH 046/496] Fix gold training (#4938)
* label in span not writable anymore
* Revert "label in span not writable anymore"
This reverts commit ab442338c8c4ddd7dfbc15348f999b74f4928090.
* ensure doc is not None
---
spacy/gold.pyx | 10 +++++++---
1 file changed, 7 insertions(+), 3 deletions(-)
diff --git a/spacy/gold.pyx b/spacy/gold.pyx
index 10b8bf0cf..0dfa32c84 100644
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -326,14 +326,18 @@ class GoldCorpus(object):
@classmethod
def _make_golds(cls, examples, vocab=None, make_projective=False,
ignore_misaligned=False):
+ filtered_examples = []
for example in examples:
gold_parses = example.get_gold_parses(vocab=vocab,
make_projective=make_projective,
ignore_misaligned=ignore_misaligned)
assert len(gold_parses) == 1
- assert gold_parses[0][0] == example.doc
- example.goldparse = gold_parses[0][1]
- return examples
+ doc, gold = gold_parses[0]
+ if doc:
+ assert doc == example.doc
+ example.goldparse = gold
+ filtered_examples.append(example)
+ return filtered_examples
def make_orth_variants(nlp, example, orth_variant_level=0.0):
From adc974571803f984d73b27d70320c4856001a4fd Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Thu, 23 Jan 2020 22:01:54 +0100
Subject: [PATCH 047/496] Modify morphology to support arbitrary features
(#4932)
* Restructure tag maps for MorphAnalysis changes
Prepare tag maps for upcoming MorphAnalysis changes that allow
arbitrary features.
* Use default tag map rather than duplicating for ca / uk / vi
* Import tag map into defaults for ga
* Modify tag maps so all morphological fields and features are strings
* Move features from `"Other"` to the top level
* Rewrite tuples as strings separated by `","`
* Rewrite morph symbols for fr lemmatizer as strings
* Export MorphAnalysis under spacy.tokens
* Modify morphology to support arbitrary features
Modify `Morphology` and `MorphAnalysis` so that arbitrary features are
supported.
* Modify `MorphAnalysisC` so that it can support arbitrary features and
multiple values per field. `MorphAnalysisC` is redesigned to contain:
* key: hash of UD FEATS string of morphological features
* array of `MorphFeatureC` structs that each contain a hash of `Field`
and `Field=Value` for a given morphological feature, which makes it
possible to:
* find features by field
* represent multiple values for a given field
* `get_field()` is renamed to `get_by_field()` and is no longer `nogil`.
Instead a new helper function `get_n_by_field()` is `nogil` and returns
`n` features by field.
* `MorphAnalysis.get()` returns all possible values for a field as a
list of individual features such as `["Tense=Pres", "Tense=Past"]`.
* `MorphAnalysis`'s `str()` and `repr()` are the UD FEATS string.
* `Morphology.feats_to_dict()` converts a UD FEATS string to a dict
where:
* Each field has one entry in the dict
* Multiple values for a field stay joined in the value string
(e.g. `"dat,gen"`)
* `Token.morph_` returns the UD FEATS string and you can set
`Token.morph_` with a UD FEATS string or with a tag map dict.
* Modify get_by_field to use np.ndarray
Modify `get_by_field()` to use np.ndarray. Remove `max_results` from
`get_n_by_field()` and always iterate over all the fields.
* Rewrite without MorphFeatureC
* Add shortcut for existing feats strings as keys
Add shortcut for existing feats strings as keys in `Morphology.add()`.
* Check for '_' as empty analysis when adding morphs
* Extend helper converters in Morphology
Add and extend helper converters that convert and normalize between:
* UD FEATS strings (`"Case=dat,gen|Number=sing"`)
* per-field dict of feats (`{"Case": "dat,gen", "Number": "sing"}`)
* list of individual features (`["Case=dat", "Case=gen",
"Number=sing"]`)
All converters sort fields and values where applicable.
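
To make the three representations concrete, here is a minimal, self-contained
sketch of the conversions described above (UD FEATS string, per-field dict,
list of individual features). It is illustrative only and does not reuse
spaCy's internal `Morphology` helpers; the sorting of fields and values
mirrors the behavior stated above.

    def feats_to_dict(feats):
        # "Case=dat,gen|Number=sing" -> {"Case": "dat,gen", "Number": "sing"}
        if not feats or feats == "_":  # "_" is the empty analysis
            return {}
        fields = {}
        for feat in feats.split("|"):
            field, values = feat.split("=", 1)
            # sort values within a field, keep them comma-separated
            fields[field] = ",".join(sorted(values.split(",")))
        return dict(sorted(fields.items()))

    def dict_to_feats(fields):
        # {"Case": "dat,gen", "Number": "sing"} -> "Case=dat,gen|Number=sing"
        if not fields:
            return "_"
        return "|".join(f"{field}={values}" for field, values in sorted(fields.items()))

    def list_to_feats(features):
        # ["Case=dat", "Case=gen", "Number=sing"] -> "Case=dat,gen|Number=sing"
        fields = {}
        for feature in features:
            field, value = feature.split("=", 1)
            fields.setdefault(field, []).append(value)
        return dict_to_feats({f: ",".join(sorted(v)) for f, v in fields.items()})

    assert feats_to_dict("Case=dat,gen|Number=sing") == {"Case": "dat,gen", "Number": "sing"}
    assert list_to_feats(["Case=dat", "Case=gen", "Number=sing"]) == "Case=dat,gen|Number=sing"

With round-tripping like this, setting `Token.morph_` from either a FEATS
string or a tag map dict and reading the FEATS string back, as described
above, stays consistent.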
---
spacy/errors.py | 3 +
spacy/lang/bn/tag_map.py | 4 +-
spacy/lang/ca/tag_map.py | 25 -
spacy/lang/da/__init__.py | 2 -
spacy/lang/el/tag_map.py | 64 +-
spacy/lang/fr/lemmatizer.py | 9 +-
spacy/lang/ga/__init__.py | 2 +
spacy/lang/ga/tag_map.py | 400 +++----
spacy/lang/nb/morph_rules.py | 18 +-
spacy/lang/sv/morph_rules.py | 10 +-
spacy/lang/uk/tag_map.py | 25 -
spacy/lang/ur/tag_map.py | 6 +-
spacy/lang/vi/tag_map.py | 25 -
spacy/morphology.pxd | 15 +-
spacy/morphology.pyx | 1056 +++--------------
spacy/structs.pxd | 49 +-
spacy/symbols.pxd | 552 ++++-----
spacy/symbols.pyx | 552 ++++-----
spacy/tests/doc/test_morphanalysis.py | 50 +-
.../tests/morphology/test_morph_converters.py | 26 +
spacy/tests/morphology/test_morph_features.py | 28 +-
spacy/tests/regression/test_issue1-1000.py | 4 +-
spacy/tests/regression/test_issue1001-1500.py | 4 +-
spacy/tokens/__init__.py | 3 +-
spacy/tokens/morphanalysis.pxd | 2 +-
spacy/tokens/morphanalysis.pyx | 392 +-----
spacy/tokens/token.pyx | 8 +
27 files changed, 1080 insertions(+), 2254 deletions(-)
delete mode 100644 spacy/lang/ca/tag_map.py
delete mode 100644 spacy/lang/uk/tag_map.py
delete mode 100644 spacy/lang/vi/tag_map.py
create mode 100644 spacy/tests/morphology/test_morph_converters.py
diff --git a/spacy/errors.py b/spacy/errors.py
index 7393ddc07..e00df2c51 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -103,6 +103,9 @@ class Warnings(object):
W027 = ("Found a large training file of {size} bytes. Note that it may "
"be more efficient to split your training data into multiple "
"smaller JSON files instead.")
+ W028 = ("Skipping unsupported morphological feature(s): {feature}. "
+ "Provide features as a dict {{\"Field1\": \"Value1,Value2\"}} or "
+ "string \"Field1=Value1,Value2|Field2=Value3\".")
@add_codes
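For reference, the two shapes that the new W028 message names can be checked
with a small, hypothetical helper. This is not spaCy's actual validation code,
only an illustration of the dict and string formats quoted in the warning.

    def is_supported(features):
        # dict form: {"Field1": "Value1,Value2"} with string keys and values
        if isinstance(features, dict):
            return all(isinstance(k, str) and isinstance(v, str) for k, v in features.items())
        # string form: "Field1=Value1,Value2|Field2=Value3" ("_" means no features)
        if isinstance(features, str):
            return features == "_" or all(f.count("=") == 1 for f in features.split("|"))
        return False

    assert is_supported({"Field1": "Value1,Value2"})
    assert is_supported("Field1=Value1,Value2|Field2=Value3")
    assert not is_supported({"Field1": {"nested": "dict"}})  # a shape the warning would flag (illustrative)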
diff --git a/spacy/lang/bn/tag_map.py b/spacy/lang/bn/tag_map.py
index 36d69ccf9..bc4c5ef6b 100644
--- a/spacy/lang/bn/tag_map.py
+++ b/spacy/lang/bn/tag_map.py
@@ -11,8 +11,8 @@ TAG_MAP = {
'""': {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"},
"''": {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"},
":": {POS: PUNCT},
- "৳": {POS: SYM, "Other": {"SymType": "currency"}},
- "#": {POS: SYM, "Other": {"SymType": "numbersign"}},
+ "৳": {POS: SYM, "SymType": "currency"},
+ "#": {POS: SYM, "SymType": "numbersign"},
"AFX": {POS: ADJ, "Hyph": "yes"},
"CC": {POS: CONJ, "ConjType": "coor"},
"CD": {POS: NUM, "NumType": "card"},
diff --git a/spacy/lang/ca/tag_map.py b/spacy/lang/ca/tag_map.py
deleted file mode 100644
index 1ecbddc49..000000000
--- a/spacy/lang/ca/tag_map.py
+++ /dev/null
@@ -1,25 +0,0 @@
-from ..symbols import POS, ADV, NOUN, ADP, PRON, SCONJ, PROPN, DET, SYM, INTJ
-from ..symbols import PUNCT, NUM, AUX, X, CONJ, ADJ, VERB, PART, SPACE, CCONJ
-
-
-TAG_MAP = {
- "ADV": {POS: ADV},
- "NOUN": {POS: NOUN},
- "ADP": {POS: ADP},
- "PRON": {POS: PRON},
- "SCONJ": {POS: SCONJ},
- "PROPN": {POS: PROPN},
- "DET": {POS: DET},
- "SYM": {POS: SYM},
- "INTJ": {POS: INTJ},
- "PUNCT": {POS: PUNCT},
- "NUM": {POS: NUM},
- "AUX": {POS: AUX},
- "X": {POS: X},
- "CONJ": {POS: CONJ},
- "CCONJ": {POS: CCONJ},
- "ADJ": {POS: ADJ},
- "VERB": {POS: VERB},
- "PART": {POS: PART},
- "SP": {POS: SPACE},
-}
diff --git a/spacy/lang/da/__init__.py b/spacy/lang/da/__init__.py
index 2828c014b..6d1e33986 100644
--- a/spacy/lang/da/__init__.py
+++ b/spacy/lang/da/__init__.py
@@ -4,7 +4,6 @@ from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .morph_rules import MORPH_RULES
-from ..tag_map import TAG_MAP
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
@@ -24,7 +23,6 @@ class DanishDefaults(Language.Defaults):
morph_rules = MORPH_RULES
infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES
- tag_map = TAG_MAP
stop_words = STOP_WORDS
diff --git a/spacy/lang/el/tag_map.py b/spacy/lang/el/tag_map.py
index adfacd025..f37f84c57 100644
--- a/spacy/lang/el/tag_map.py
+++ b/spacy/lang/el/tag_map.py
@@ -656,7 +656,7 @@ TAG_MAP = {
"Gender": "Fem",
"Number": "Plur",
"Case": "Acc",
- "Other": {"Definite": "Def"},
+ "Definite": "Def",
},
"AtDfFePlGe": {
POS: DET,
@@ -664,7 +664,7 @@ TAG_MAP = {
"Gender": "Fem",
"Number": "Plur",
"Case": "Gen",
- "Other": {"Definite": "Def"},
+ "Definite": "Def",
},
"AtDfFePlNm": {
POS: DET,
@@ -672,7 +672,7 @@ TAG_MAP = {
"Gender": "Fem",
"Number": "Plur",
"Case": "Nom",
- "Other": {"Definite": "Def"},
+ "Definite": "Def",
},
"AtDfFeSgAc": {
POS: DET,
@@ -680,7 +680,7 @@ TAG_MAP = {
"Gender": "Fem",
"Number": "Sing",
"Case": "Acc",
- "Other": {"Definite": "Def"},
+ "Definite": "Def",
},
"AtDfFeSgDa": {
POS: DET,
@@ -688,7 +688,7 @@ TAG_MAP = {
"Gender": "Fem",
"Number": "Sing",
"Case": "Dat",
- "Other": {"Definite": "Def"},
+ "Definite": "Def",
},
"AtDfFeSgGe": {
POS: DET,
@@ -696,7 +696,7 @@ TAG_MAP = {
"Gender": "Fem",
"Number": "Sing",
"Case": "Gen",
- "Other": {"Definite": "Def"},
+ "Definite": "Def",
},
"AtDfFeSgNm": {
POS: DET,
@@ -704,7 +704,7 @@ TAG_MAP = {
"Gender": "Fem",
"Number": "Sing",
"Case": "Nom",
- "Other": {"Definite": "Def"},
+ "Definite": "Def",
},
"AtDfMaPlAc": {
POS: DET,
@@ -712,7 +712,7 @@ TAG_MAP = {
"Gender": "Masc",
"Number": "Plur",
"Case": "Acc",
- "Other": {"Definite": "Def"},
+ "Definite": "Def",
},
"AtDfMaPlGe": {
POS: DET,
@@ -720,7 +720,7 @@ TAG_MAP = {
"Gender": "Masc",
"Number": "Plur",
"Case": "Gen",
- "Other": {"Definite": "Def"},
+ "Definite": "Def",
},
"AtDfMaPlNm": {
POS: DET,
@@ -728,7 +728,7 @@ TAG_MAP = {
"Gender": "Masc",
"Number": "Plur",
"Case": "Nom",
- "Other": {"Definite": "Def"},
+ "Definite": "Def",
},
"AtDfMaSgAc": {
POS: DET,
@@ -736,7 +736,7 @@ TAG_MAP = {
"Gender": "Masc",
"Number": "Sing",
"Case": "Acc",
- "Other": {"Definite": "Def"},
+ "Definite": "Def",
},
"AtDfMaSgDa": {
POS: DET,
@@ -744,7 +744,7 @@ TAG_MAP = {
"Gender": "Masc",
"Number": "Sing",
"Case": "Dat",
- "Other": {"Definite": "Def"},
+ "Definite": "Def",
},
"AtDfMaSgGe": {
POS: DET,
@@ -752,7 +752,7 @@ TAG_MAP = {
"Gender": "Masc",
"Number": "Sing",
"Case": "Gen",
- "Other": {"Definite": "Def"},
+ "Definite": "Def",
},
"AtDfMaSgNm": {
POS: DET,
@@ -760,7 +760,7 @@ TAG_MAP = {
"Gender": "Masc",
"Number": "Sing",
"Case": "Nom",
- "Other": {"Definite": "Def"},
+ "Definite": "Def",
},
"AtDfNePlAc": {
POS: DET,
@@ -768,7 +768,7 @@ TAG_MAP = {
"Gender": "Neut",
"Number": "Plur",
"Case": "Acc",
- "Other": {"Definite": "Def"},
+ "Definite": "Def",
},
"AtDfNePlDa": {
POS: DET,
@@ -776,7 +776,7 @@ TAG_MAP = {
"Gender": "Neut",
"Number": "Plur",
"Case": "Dat",
- "Other": {"Definite": "Def"},
+ "Definite": "Def",
},
"AtDfNePlGe": {
POS: DET,
@@ -784,7 +784,7 @@ TAG_MAP = {
"Gender": "Neut",
"Number": "Plur",
"Case": "Gen",
- "Other": {"Definite": "Def"},
+ "Definite": "Def",
},
"AtDfNePlNm": {
POS: DET,
@@ -792,7 +792,7 @@ TAG_MAP = {
"Gender": "Neut",
"Number": "Plur",
"Case": "Nom",
- "Other": {"Definite": "Def"},
+ "Definite": "Def",
},
"AtDfNeSgAc": {
POS: DET,
@@ -800,7 +800,7 @@ TAG_MAP = {
"Gender": "Neut",
"Number": "Sing",
"Case": "Acc",
- "Other": {"Definite": "Def"},
+ "Definite": "Def",
},
"AtDfNeSgDa": {
POS: DET,
@@ -808,7 +808,7 @@ TAG_MAP = {
"Gender": "Neut",
"Number": "Sing",
"Case": "Dat",
- "Other": {"Definite": "Def"},
+ "Definite": "Def",
},
"AtDfNeSgGe": {
POS: DET,
@@ -816,7 +816,7 @@ TAG_MAP = {
"Gender": "Neut",
"Number": "Sing",
"Case": "Gen",
- "Other": {"Definite": "Def"},
+ "Definite": "Def",
},
"AtDfNeSgNm": {
POS: DET,
@@ -824,7 +824,7 @@ TAG_MAP = {
"Gender": "Neut",
"Number": "Sing",
"Case": "Nom",
- "Other": {"Definite": "Def"},
+ "Definite": "Def",
},
"AtIdFeSgAc": {
POS: DET,
@@ -832,7 +832,7 @@ TAG_MAP = {
"Gender": "Fem",
"Number": "Sing",
"Case": "Acc",
- "Other": {"Definite": "Ind"},
+ "Definite": "Ind",
},
"AtIdFeSgDa": {
POS: DET,
@@ -840,7 +840,7 @@ TAG_MAP = {
"Gender": "Fem",
"Number": "Sing",
"Case": "Dat",
- "Other": {"Definite": "Ind"},
+ "Definite": "Ind",
},
"AtIdFeSgGe": {
POS: DET,
@@ -848,7 +848,7 @@ TAG_MAP = {
"Gender": "Fem",
"Number": "Sing",
"Case": "Gen",
- "Other": {"Definite": "Ind"},
+ "Definite": "Ind",
},
"AtIdFeSgNm": {
POS: DET,
@@ -856,7 +856,7 @@ TAG_MAP = {
"Gender": "Fem",
"Number": "Sing",
"Case": "Nom",
- "Other": {"Definite": "Ind"},
+ "Definite": "Ind",
},
"AtIdMaSgAc": {
POS: DET,
@@ -864,7 +864,7 @@ TAG_MAP = {
"Gender": "Masc",
"Number": "Sing",
"Case": "Acc",
- "Other": {"Definite": "Ind"},
+ "Definite": "Ind",
},
"AtIdMaSgGe": {
POS: DET,
@@ -872,7 +872,7 @@ TAG_MAP = {
"Gender": "Masc",
"Number": "Sing",
"Case": "Gen",
- "Other": {"Definite": "Ind"},
+ "Definite": "Ind",
},
"AtIdMaSgNm": {
POS: DET,
@@ -880,7 +880,7 @@ TAG_MAP = {
"Gender": "Masc",
"Number": "Sing",
"Case": "Nom",
- "Other": {"Definite": "Ind"},
+ "Definite": "Ind",
},
"AtIdNeSgAc": {
POS: DET,
@@ -888,7 +888,7 @@ TAG_MAP = {
"Gender": "Neut",
"Number": "Sing",
"Case": "Acc",
- "Other": {"Definite": "Ind"},
+ "Definite": "Ind",
},
"AtIdNeSgGe": {
POS: DET,
@@ -896,7 +896,7 @@ TAG_MAP = {
"Gender": "Neut",
"Number": "Sing",
"Case": "Gen",
- "Other": {"Definite": "Ind"},
+ "Definite": "Ind",
},
"AtIdNeSgNm": {
POS: DET,
@@ -904,7 +904,7 @@ TAG_MAP = {
"Gender": "Neut",
"Number": "Sing",
"Case": "Nom",
- "Other": {"Definite": "Ind"},
+ "Definite": "Ind",
},
"CjCo": {POS: CCONJ},
"CjSb": {POS: SCONJ},
diff --git a/spacy/lang/fr/lemmatizer.py b/spacy/lang/fr/lemmatizer.py
index 84e55d509..fe128df1f 100644
--- a/spacy/lang/fr/lemmatizer.py
+++ b/spacy/lang/fr/lemmatizer.py
@@ -1,7 +1,6 @@
from ...lemmatizer import Lemmatizer
from ...symbols import POS, NOUN, VERB, ADJ, ADV, PRON, DET, AUX, PUNCT, ADP
from ...symbols import SCONJ, CCONJ
-from ...symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos
class FrenchLemmatizer(Lemmatizer):
@@ -82,13 +81,13 @@ class FrenchLemmatizer(Lemmatizer):
return True
elif univ_pos == "adj" and morphology.get("Degree") == "pos":
return True
- elif VerbForm_inf in morphology:
+ elif "VerbForm=inf" in morphology:
return True
- elif VerbForm_none in morphology:
+ elif "VerbForm=none" in morphology:
return True
- elif Number_sing in morphology:
+ elif "Number=sing" in morphology:
return True
- elif Degree_pos in morphology:
+ elif "Degree=pos" in morphology:
return True
else:
return False
diff --git a/spacy/lang/ga/__init__.py b/spacy/lang/ga/__init__.py
index cea7c0e94..4c3d219c7 100644
--- a/spacy/lang/ga/__init__.py
+++ b/spacy/lang/ga/__init__.py
@@ -1,5 +1,6 @@
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
+from .tag_map import TAG_MAP
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
@@ -13,6 +14,7 @@ class IrishDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
+ tag_map = TAG_MAP
class Irish(Language):
diff --git a/spacy/lang/ga/tag_map.py b/spacy/lang/ga/tag_map.py
index baf64c1b8..efcaf5d1f 100644
--- a/spacy/lang/ga/tag_map.py
+++ b/spacy/lang/ga/tag_map.py
@@ -1,26 +1,26 @@
# fmt: off
TAG_MAP = {
- "ADJ__Case=Gen|Form=Len|Gender=Masc|Number=Sing": {"pos": "ADJ", "Case": "gen", "Gender": "masc", "Number": "sing", "Other": {"Form": "len"}},
+ "ADJ__Case=Gen|Form=Len|Gender=Masc|Number=Sing": {"pos": "ADJ", "Case": "gen", "Gender": "masc", "Number": "sing", "Form": "len"},
"ADJ__Case=Gen|Gender=Fem|Number=Sing": {"pos": "ADJ", "Case": "gen", "Gender": "fem", "Number": "sing"},
"ADJ__Case=Gen|Gender=Masc|Number=Sing": {"pos": "ADJ", "Case": "gen", "Gender": "masc", "Number": "sing"},
- "ADJ__Case=Gen|NounType=Strong|Number=Plur": {"pos": "ADJ", "Case": "gen", "Number": "plur", "Other": {"NounType": "strong"}},
- "ADJ__Case=Gen|NounType=Weak|Number=Plur": {"pos": "ADJ", "Case": "gen", "Number": "plur", "Other": {"NounType": "weak"}},
- "ADJ__Case=NomAcc|Form=Len|Gender=Fem|Number=Sing": {"pos": "ADJ", "Case": "nom|acc", "Gender": "fem", "Number": "sing", "Other": {"Form": "len"}},
- "ADJ__Case=NomAcc|Form=Len|Gender=Masc|Number=Sing": {"pos": "ADJ", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Other": {"Form": "len"}},
+ "ADJ__Case=Gen|NounType=Strong|Number=Plur": {"pos": "ADJ", "Case": "gen", "Number": "plur", "NounType": "strong"},
+ "ADJ__Case=Gen|NounType=Weak|Number=Plur": {"pos": "ADJ", "Case": "gen", "Number": "plur", "NounType": "weak"},
+ "ADJ__Case=NomAcc|Form=Len|Gender=Fem|Number=Sing": {"pos": "ADJ", "Case": "nom|acc", "Gender": "fem", "Number": "sing", "Form": "len"},
+ "ADJ__Case=NomAcc|Form=Len|Gender=Masc|Number=Sing": {"pos": "ADJ", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Form": "len"},
"ADJ__Case=NomAcc|Gender=Fem|Number=Plur": {"pos": "ADJ", "Case": "nom|acc", "Gender": "fem", "Number": "plur"},
"ADJ__Case=NomAcc|Gender=Fem|Number=Sing": {"pos": "ADJ", "Case": "nom|acc", "Gender": "fem", "Number": "sing"},
"ADJ__Case=NomAcc|Gender=Masc|Number=Plur": {"pos": "ADJ", "Case": "nom|acc", "Gender": "masc", "Number": "plur"},
"ADJ__Case=NomAcc|Gender=Masc|Number=Sing": {"pos": "ADJ", "Case": "nom|acc", "Gender": "masc", "Number": "sing"},
- "ADJ__Case=NomAcc|NounType=NotSlender|Number=Plur": {"pos": "ADJ", "Case": "nom|acc", "Number": "plur", "Other": {"NounType": "notslender"}},
- "ADJ__Case=NomAcc|NounType=Slender|Number=Plur": {"pos": "ADJ", "Case": "nom|acc", "Number": "plur", "Other": {"NounType": "slender"}},
- "ADJ__Degree=Cmp,Sup|Form=Len": {"pos": "ADJ", "Degree": "cmp|sup", "Other": {"Form": "len"}},
+ "ADJ__Case=NomAcc|NounType=NotSlender|Number=Plur": {"pos": "ADJ", "Case": "nom|acc", "Number": "plur", "NounType": "notslender"},
+ "ADJ__Case=NomAcc|NounType=Slender|Number=Plur": {"pos": "ADJ", "Case": "nom|acc", "Number": "plur", "NounType": "slender"},
+ "ADJ__Degree=Cmp,Sup|Form=Len": {"pos": "ADJ", "Degree": "cmp|sup", "Form": "len"},
"ADJ__Degree=Cmp,Sup": {"pos": "ADJ", "Degree": "cmp|sup"},
- "ADJ__Degree=Pos|Form=Ecl": {"pos": "ADJ", "Degree": "pos", "Other": {"Form": "ecl"}},
- "ADJ__Degree=Pos|Form=HPref": {"pos": "ADJ", "Degree": "pos", "Other": {"Form": "hpref"}},
- "ADJ__Degree=Pos|Form=Len": {"pos": "ADJ", "Degree": "pos", "Other": {"Form": "len"}},
+ "ADJ__Degree=Pos|Form=Ecl": {"pos": "ADJ", "Degree": "pos", "Form": "ecl"},
+ "ADJ__Degree=Pos|Form=HPref": {"pos": "ADJ", "Degree": "pos", "Form": "hpref"},
+ "ADJ__Degree=Pos|Form=Len": {"pos": "ADJ", "Degree": "pos", "Form": "len"},
"ADJ__Degree=Pos": {"pos": "ADJ", "Degree": "pos"},
"ADJ__Foreign=Yes": {"pos": "ADJ", "Foreign": "yes"},
- "ADJ__Form=Len|VerbForm=Part": {"pos": "ADJ", "VerbForm": "part", "Other": {"Form": "len"}},
+ "ADJ__Form=Len|VerbForm=Part": {"pos": "ADJ", "VerbForm": "part", "Form": "len"},
"ADJ__Gender=Masc|Number=Sing|PartType=Voc": {"pos": "ADJ", "Gender": "masc", "Number": "sing", "Case": "voc"},
"ADJ__Gender=Masc|Number=Sing|Case=Voc": {"pos": "ADJ", "Gender": "masc", "Number": "sing", "Case": "voc"},
"ADJ__Number=Plur|PartType=Voc": {"pos": "ADJ", "Number": "plur", "Case": "voc"},
@@ -29,9 +29,9 @@ TAG_MAP = {
"ADJ___": {"pos": "ADJ"},
"ADJ__VerbForm=Part": {"pos": "ADJ", "VerbForm": "part"},
"ADP__Foreign=Yes": {"pos": "ADP", "Foreign": "yes"},
- "ADP__Form=Len|Number=Plur|Person=1": {"pos": "ADP", "Number": "plur", "Person": 1, "Other": {"Form": "len"}},
- "ADP__Form=Len|Number=Plur|Person=3": {"pos": "ADP", "Number": "plur", "Person": 3, "Other": {"Form": "len"}},
- "ADP__Form=Len|Number=Sing|Person=1": {"pos": "ADP", "Number": "sing", "Person": 1, "Other": {"Form": "len"}},
+ "ADP__Form=Len|Number=Plur|Person=1": {"pos": "ADP", "Number": "plur", "Person": 1, "Form": "len"},
+ "ADP__Form=Len|Number=Plur|Person=3": {"pos": "ADP", "Number": "plur", "Person": 3, "Form": "len"},
+ "ADP__Form=Len|Number=Sing|Person=1": {"pos": "ADP", "Number": "sing", "Person": 1, "Form": "len"},
"ADP__Gender=Fem|Number=Sing|Person=3": {"pos": "ADP", "Gender": "fem", "Number": "sing", "Person": 3},
"ADP__Gender=Fem|Number=Sing|Person=3|Poss=Yes": {"pos": "ADP", "Gender": "fem", "Number": "sing", "Person": 3, "Poss": "yes"},
"ADP__Gender=Fem|Number=Sing|Person=3|Poss=Yes|PronType=Prs": {"pos": "ADP", "Gender": "fem", "Number": "sing", "Person": 3, "Poss": "yes", "PronType": "prs"},
@@ -57,41 +57,41 @@ TAG_MAP = {
"ADP__Person=3|Poss=Yes": {"pos": "ADP", "Person": 3, "Poss": "yes"},
"ADP___": {"pos": "ADP"},
"ADP__Poss=Yes": {"pos": "ADP", "Poss": "yes"},
- "ADP__PrepForm=Cmpd": {"pos": "ADP", "Other": {"PrepForm": "cmpd"}},
+ "ADP__PrepForm=Cmpd": {"pos": "ADP", "PrepForm": "cmpd"},
"ADP__PronType=Art": {"pos": "ADP", "PronType": "art"},
- "ADV__Form=Len": {"pos": "ADV", "Other": {"Form": "len"}},
+ "ADV__Form=Len": {"pos": "ADV", "Form": "len"},
"ADV___": {"pos": "ADV"},
"ADV__PronType=Int": {"pos": "ADV", "PronType": "int"},
- "AUX__Form=VF|Polarity=Neg|PronType=Rel|Tense=Past|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "PronType": "rel", "Tense": "past", "Other": {"Form": "vf", "VerbForm": "cop"}},
- "AUX__Form=VF|Polarity=Neg|Tense=Past|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "Tense": "past", "Other": {"Form": "vf", "VerbForm": "cop"}},
- "AUX__Form=VF|PronType=Rel|Tense=Past|VerbForm=Cop": {"pos": "AUX", "PronType": "rel", "Tense": "past", "Other": {"Form": "vf", "VerbForm": "cop"}},
- "AUX__Form=VF|Tense=Past|VerbForm=Cop": {"pos": "AUX", "Tense": "past", "Other": {"Form": "vf", "VerbForm": "cop"}},
- "AUX__Form=VF|Tense=Pres|VerbForm=Cop": {"pos": "AUX", "Tense": "pres", "Other": {"Form": "vf", "VerbForm": "cop"}},
- "AUX__Gender=Masc|Number=Sing|Person=3|VerbForm=Cop": {"pos": "AUX", "Gender": "masc", "Number": "sing", "Person": 3, "Other": {"VerbForm": "cop"}},
- "AUX__Mood=Int|Number=Sing|PronType=Art|VerbForm=Cop": {"pos": "AUX", "Number": "sing", "PronType": "art", "Other": {"Mood": "int", "VerbForm": "cop"}},
- "AUX__Mood=Int|Polarity=Neg|Tense=Past|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "Tense": "past", "Other": {"Mood": "int", "VerbForm": "cop"}},
- "AUX__Mood=Int|Polarity=Neg|Tense=Pres|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "Tense": "pres", "Other": {"Mood": "int", "VerbForm": "cop"}},
- "AUX__Mood=Int|Tense=Pres|VerbForm=Cop": {"pos": "AUX", "Tense": "pres", "Other": {"Mood": "int", "VerbForm": "cop"}},
- "AUX__PartType=Comp|Tense=Past|VerbForm=Cop": {"pos": "AUX", "Tense": "past", "Other": {"PartType": "comp", "VerbForm": "cop"}},
- "AUX__Polarity=Neg|PronType=Rel|Tense=Past|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "PronType": "rel", "Tense": "past", "Other": {"VerbForm": "cop"}},
- "AUX__Polarity=Neg|PronType=Rel|Tense=Pres|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "PronType": "rel", "Tense": "pres", "Other": {"VerbForm": "cop"}},
- "AUX__Polarity=Neg|Tense=Past|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "Tense": "past", "Other": {"VerbForm": "cop"}},
- "AUX__Polarity=Neg|Tense=Pres|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "Tense": "pres", "Other": {"VerbForm": "cop"}},
+ "AUX__Form=VF|Polarity=Neg|PronType=Rel|Tense=Past|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "PronType": "rel", "Tense": "past", "Form": "vf", "VerbForm": "cop"},
+ "AUX__Form=VF|Polarity=Neg|Tense=Past|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "Tense": "past", "Form": "vf", "VerbForm": "cop"},
+ "AUX__Form=VF|PronType=Rel|Tense=Past|VerbForm=Cop": {"pos": "AUX", "PronType": "rel", "Tense": "past", "Form": "vf", "VerbForm": "cop"},
+ "AUX__Form=VF|Tense=Past|VerbForm=Cop": {"pos": "AUX", "Tense": "past", "Form": "vf", "VerbForm": "cop"},
+ "AUX__Form=VF|Tense=Pres|VerbForm=Cop": {"pos": "AUX", "Tense": "pres", "Form": "vf", "VerbForm": "cop"},
+ "AUX__Gender=Masc|Number=Sing|Person=3|VerbForm=Cop": {"pos": "AUX", "Gender": "masc", "Number": "sing", "Person": 3, "VerbForm": "cop"},
+ "AUX__Mood=Int|Number=Sing|PronType=Art|VerbForm=Cop": {"pos": "AUX", "Number": "sing", "PronType": "art", "Mood": "int", "VerbForm": "cop"},
+ "AUX__Mood=Int|Polarity=Neg|Tense=Past|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "Tense": "past", "Mood": "int", "VerbForm": "cop"},
+ "AUX__Mood=Int|Polarity=Neg|Tense=Pres|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "Tense": "pres", "Mood": "int", "VerbForm": "cop"},
+ "AUX__Mood=Int|Tense=Pres|VerbForm=Cop": {"pos": "AUX", "Tense": "pres", "Mood": "int", "VerbForm": "cop"},
+ "AUX__PartType=Comp|Tense=Past|VerbForm=Cop": {"pos": "AUX", "Tense": "past", "PartType": "comp", "VerbForm": "cop"},
+ "AUX__Polarity=Neg|PronType=Rel|Tense=Past|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "PronType": "rel", "Tense": "past", "VerbForm": "cop"},
+ "AUX__Polarity=Neg|PronType=Rel|Tense=Pres|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "PronType": "rel", "Tense": "pres", "VerbForm": "cop"},
+ "AUX__Polarity=Neg|Tense=Past|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "Tense": "past", "VerbForm": "cop"},
+ "AUX__Polarity=Neg|Tense=Pres|VerbForm=Cop": {"pos": "AUX", "Polarity": "neg", "Tense": "pres", "VerbForm": "cop"},
"AUX___": {"pos": "AUX"},
- "AUX__PronType=Dem|VerbForm=Cop": {"pos": "AUX", "PronType": "dem", "Other": {"VerbForm": "cop"}},
- "AUX__PronType=Rel|Tense=Past|VerbForm=Cop": {"pos": "AUX", "PronType": "rel", "Tense": "past", "Other": {"VerbForm": "cop"}},
- "AUX__PronType=Rel|Tense=Pres|VerbForm=Cop": {"pos": "AUX", "PronType": "rel", "Tense": "pres", "Other": {"VerbForm": "cop"}},
- "AUX__Tense=Past|VerbForm=Cop": {"pos": "AUX", "Tense": "past", "Other": {"VerbForm": "cop"}},
- "AUX__Tense=Pres|VerbForm=Cop": {"pos": "AUX", "Tense": "pres", "Other": {"VerbForm": "cop"}},
- "AUX__VerbForm=Cop": {"pos": "AUX", "Other": {"VerbForm": "cop"}},
+ "AUX__PronType=Dem|VerbForm=Cop": {"pos": "AUX", "PronType": "dem", "VerbForm": "cop"},
+ "AUX__PronType=Rel|Tense=Past|VerbForm=Cop": {"pos": "AUX", "PronType": "rel", "Tense": "past", "VerbForm": "cop"},
+ "AUX__PronType=Rel|Tense=Pres|VerbForm=Cop": {"pos": "AUX", "PronType": "rel", "Tense": "pres", "VerbForm": "cop"},
+ "AUX__Tense=Past|VerbForm=Cop": {"pos": "AUX", "Tense": "past", "VerbForm": "cop"},
+ "AUX__Tense=Pres|VerbForm=Cop": {"pos": "AUX", "Tense": "pres", "VerbForm": "cop"},
+ "AUX__VerbForm=Cop": {"pos": "AUX", "VerbForm": "cop"},
"CCONJ___": {"pos": "CCONJ"},
"DET__Case=Gen|Definite=Def|Gender=Fem|Number=Sing|PronType=Art": {"pos": "DET", "Case": "gen", "Definite": "def", "Gender": "fem", "Number": "sing", "PronType": "art"},
- "DET__Definite=Def|Form=Ecl": {"pos": "DET", "Definite": "def", "Other": {"Form": "ecl"}},
+ "DET__Definite=Def|Form=Ecl": {"pos": "DET", "Definite": "def", "Form": "ecl"},
"DET__Definite=Def|Gender=Fem|Number=Sing|PronType=Art": {"pos": "DET", "Definite": "def", "Gender": "fem", "Number": "sing", "PronType": "art"},
"DET__Definite=Def|Number=Plur|PronType=Art": {"pos": "DET", "Definite": "def", "Number": "plur", "PronType": "art"},
"DET__Definite=Def|Number=Sing|PronType=Art": {"pos": "DET", "Definite": "def", "Number": "sing", "PronType": "art"},
"DET__Definite=Def": {"pos": "DET", "Definite": "def"},
- "DET__Form=HPref|PronType=Ind": {"pos": "DET", "PronType": "ind", "Other": {"Form": "hpref"}},
+ "DET__Form=HPref|PronType=Ind": {"pos": "DET", "PronType": "ind", "Form": "hpref"},
"DET__Gender=Fem|Number=Sing|Person=3|Poss=Yes": {"pos": "DET", "Gender": "fem", "Number": "sing", "Person": 3, "Poss": "yes"},
"DET__Gender=Masc|Number=Sing|Person=3|Poss=Yes": {"pos": "DET", "Gender": "masc", "Number": "sing", "Person": 3, "Poss": "yes"},
"DET__Number=Plur|Person=1|Poss=Yes": {"pos": "DET", "Number": "plur", "Person": 1, "Poss": "yes"},
@@ -103,33 +103,33 @@ TAG_MAP = {
"DET__PronType=Dem": {"pos": "DET", "PronType": "dem"},
"DET__PronType=Ind": {"pos": "DET", "PronType": "ind"},
"NOUN__Case=Dat|Definite=Ind|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "dat", "Definite": "ind", "Gender": "fem", "Number": "sing"},
- "NOUN__Case=Dat|Form=Ecl|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "dat", "Gender": "fem", "Number": "sing", "Other": {"Form": "ecl"}},
- "NOUN__Case=Dat|Form=Len|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "dat", "Gender": "fem", "Number": "sing", "Other": {"Form": "len"}},
+ "NOUN__Case=Dat|Form=Ecl|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "dat", "Gender": "fem", "Number": "sing", "Form": "ecl"},
+ "NOUN__Case=Dat|Form=Len|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "dat", "Gender": "fem", "Number": "sing", "Form": "len"},
"NOUN__Case=Dat|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "dat", "Gender": "fem", "Number": "sing"},
"NOUN__Case=Dat|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "dat", "Gender": "masc", "Number": "sing"},
- "NOUN__Case=Gen|Definite=Def|Gender=Fem|NounType=Strong|Number=Plur": {"pos": "NOUN", "Case": "gen", "Definite": "def", "Gender": "fem", "Number": "plur", "Other": {"NounType": "strong"}},
+ "NOUN__Case=Gen|Definite=Def|Gender=Fem|NounType=Strong|Number=Plur": {"pos": "NOUN", "Case": "gen", "Definite": "def", "Gender": "fem", "Number": "plur", "NounType": "strong"},
"NOUN__Case=Gen|Definite=Def|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "gen", "Definite": "def", "Gender": "fem", "Number": "sing"},
- "NOUN__Case=Gen|Definite=Def|Gender=Masc|NounType=Strong|Number=Plur": {"pos": "NOUN", "Case": "gen", "Definite": "def", "Gender": "masc", "Number": "plur", "Other": {"NounType": "strong"}},
- "NOUN__Case=Gen|Definite=Def|Gender=Masc|NounType=Weak|Number=Plur": {"pos": "NOUN", "Case": "gen", "Definite": "def", "Gender": "masc", "Number": "plur", "Other": {"NounType": "weak"}},
+ "NOUN__Case=Gen|Definite=Def|Gender=Masc|NounType=Strong|Number=Plur": {"pos": "NOUN", "Case": "gen", "Definite": "def", "Gender": "masc", "Number": "plur", "NounType": "strong"},
+ "NOUN__Case=Gen|Definite=Def|Gender=Masc|NounType=Weak|Number=Plur": {"pos": "NOUN", "Case": "gen", "Definite": "def", "Gender": "masc", "Number": "plur", "NounType": "weak"},
"NOUN__Case=Gen|Definite=Def|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "gen", "Definite": "def", "Gender": "masc", "Number": "sing"},
"NOUN__Case=Gen|Definite=Ind|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "gen", "Definite": "ind", "Gender": "fem", "Number": "sing"},
- "NOUN__Case=Gen|Form=Ecl|Gender=Fem|NounType=Strong|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "plur", "Other": {"Form": "ecl", "NounType": "strong"}},
- "NOUN__Case=Gen|Form=Ecl|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "sing", "Other": {"Form": "ecl"}},
- "NOUN__Case=Gen|Form=Ecl|Gender=Masc|NounType=Strong|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "plur", "Other": {"Form": "ecl", "NounType": "strong"}},
- "NOUN__Case=Gen|Form=Ecl|Gender=Masc|NounType=Weak|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "plur", "Other": {"Form": "ecl", "NounType": "weak"}},
- "NOUN__Case=Gen|Form=Ecl|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "sing", "Other": {"Form": "ecl"}},
- "NOUN__Case=Gen|Form=HPref|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "sing", "Other": {"Form": "hpref"}},
- "NOUN__Case=Gen|Form=Len|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "sing", "Other": {"Form": "len"}},
- "NOUN__Case=Gen|Form=Len|Gender=Masc|NounType=Strong|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "plur", "Other": {"Form": "len", "NounType": "strong"}},
- "NOUN__Case=Gen|Form=Len|Gender=Masc|NounType=Weak|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "plur", "Other": {"Form": "len", "NounType": "weak"}},
- "NOUN__Case=Gen|Form=Len|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "sing", "Other": {"Form": "len"}},
- "NOUN__Case=Gen|Form=Len|VerbForm=Inf": {"pos": "NOUN", "Case": "gen", "VerbForm": "inf", "Other": {"Form": "len"}},
- "NOUN__Case=Gen|Gender=Fem|NounType=Strong|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "plur", "Other": {"NounType": "strong"}},
- "NOUN__Case=Gen|Gender=Fem|NounType=Weak|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "plur", "Other": {"NounType": "weak"}},
+ "NOUN__Case=Gen|Form=Ecl|Gender=Fem|NounType=Strong|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "plur", "Form": "ecl", "NounType": "strong"},
+ "NOUN__Case=Gen|Form=Ecl|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "sing", "Form": "ecl"},
+ "NOUN__Case=Gen|Form=Ecl|Gender=Masc|NounType=Strong|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "plur", "Form": "ecl", "NounType": "strong"},
+ "NOUN__Case=Gen|Form=Ecl|Gender=Masc|NounType=Weak|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "plur", "Form": "ecl", "NounType": "weak"},
+ "NOUN__Case=Gen|Form=Ecl|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "sing", "Form": "ecl"},
+ "NOUN__Case=Gen|Form=HPref|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "sing", "Form": "hpref"},
+ "NOUN__Case=Gen|Form=Len|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "sing", "Form": "len"},
+ "NOUN__Case=Gen|Form=Len|Gender=Masc|NounType=Strong|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "plur", "Form": "len", "NounType": "strong"},
+ "NOUN__Case=Gen|Form=Len|Gender=Masc|NounType=Weak|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "plur", "Form": "len", "NounType": "weak"},
+ "NOUN__Case=Gen|Form=Len|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "sing", "Form": "len"},
+ "NOUN__Case=Gen|Form=Len|VerbForm=Inf": {"pos": "NOUN", "Case": "gen", "VerbForm": "inf", "Form": "len"},
+ "NOUN__Case=Gen|Gender=Fem|NounType=Strong|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "plur", "NounType": "strong"},
+ "NOUN__Case=Gen|Gender=Fem|NounType=Weak|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "plur", "NounType": "weak"},
"NOUN__Case=Gen|Gender=Fem|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "plur"},
"NOUN__Case=Gen|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "gen", "Gender": "fem", "Number": "sing"},
- "NOUN__Case=Gen|Gender=Masc|NounType=Strong|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "plur", "Other": {"NounType": "strong"}},
- "NOUN__Case=Gen|Gender=Masc|NounType=Weak|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "plur", "Other": {"NounType": "weak"}},
+ "NOUN__Case=Gen|Gender=Masc|NounType=Strong|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "plur", "NounType": "strong"},
+ "NOUN__Case=Gen|Gender=Masc|NounType=Weak|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "plur", "NounType": "weak"},
"NOUN__Case=Gen|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "plur"},
"NOUN__Case=Gen|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "gen", "Gender": "masc", "Number": "sing"},
"NOUN__Case=Gen|Number=Sing": {"pos": "NOUN", "Case": "gen", "Number": "sing"},
@@ -140,79 +140,79 @@ TAG_MAP = {
"NOUN__Case=NomAcc|Definite=Def|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Definite": "def", "Gender": "masc", "Number": "plur"},
"NOUN__Case=NomAcc|Definite=Def|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Definite": "def", "Gender": "masc", "Number": "sing"},
"NOUN__Case=NomAcc|Definite=Ind|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Definite": "ind", "Gender": "masc", "Number": "plur"},
- "NOUN__Case=NomAcc|Form=Ecl|Gender=Fem|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "plur", "Other": {"Form": "ecl"}},
- "NOUN__Case=NomAcc|Form=Ecl|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "sing", "Other": {"Form": "ecl"}},
- "NOUN__Case=NomAcc|Form=Ecl|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "plur", "Other": {"Form": "ecl"}},
- "NOUN__Case=NomAcc|Form=Ecl|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Other": {"Form": "ecl"}},
- "NOUN__Case=NomAcc|Form=Emp|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Other": {"Form": "emp"}},
- "NOUN__Case=NomAcc|Form=HPref|Gender=Fem|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "plur", "Other": {"Form": "hpref"}},
- "NOUN__Case=NomAcc|Form=HPref|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "sing", "Other": {"Form": "hpref"}},
- "NOUN__Case=NomAcc|Form=HPref|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "plur", "Other": {"Form": "hpref"}},
- "NOUN__Case=NomAcc|Form=HPref|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Other": {"Form": "hpref"}},
- "NOUN__Case=NomAcc|Form=Len|Gender=Fem|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "plur", "Other": {"Form": "len"}},
- "NOUN__Case=NomAcc|Form=Len|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "sing", "Other": {"Form": "len"}},
- "NOUN__Case=NomAcc|Form=Len|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "plur", "Other": {"Form": "len"}},
- "NOUN__Case=NomAcc|Form=Len|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Other": {"Form": "len"}},
+ "NOUN__Case=NomAcc|Form=Ecl|Gender=Fem|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "plur", "Form": "ecl"},
+ "NOUN__Case=NomAcc|Form=Ecl|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "sing", "Form": "ecl"},
+ "NOUN__Case=NomAcc|Form=Ecl|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "plur", "Form": "ecl"},
+ "NOUN__Case=NomAcc|Form=Ecl|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Form": "ecl"},
+ "NOUN__Case=NomAcc|Form=Emp|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Form": "emp"},
+ "NOUN__Case=NomAcc|Form=HPref|Gender=Fem|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "plur", "Form": "hpref"},
+ "NOUN__Case=NomAcc|Form=HPref|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "sing", "Form": "hpref"},
+ "NOUN__Case=NomAcc|Form=HPref|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "plur", "Form": "hpref"},
+ "NOUN__Case=NomAcc|Form=HPref|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Form": "hpref"},
+ "NOUN__Case=NomAcc|Form=Len|Gender=Fem|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "plur", "Form": "len"},
+ "NOUN__Case=NomAcc|Form=Len|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "sing", "Form": "len"},
+ "NOUN__Case=NomAcc|Form=Len|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "plur", "Form": "len"},
+ "NOUN__Case=NomAcc|Form=Len|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Form": "len"},
"NOUN__Case=NomAcc|Gender=Fem|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "plur"},
"NOUN__Case=NomAcc|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "fem", "Number": "sing"},
"NOUN__Case=NomAcc|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "plur"},
"NOUN__Case=NomAcc|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "nom|acc", "Gender": "masc", "Number": "sing"},
"NOUN__Case=Voc|Definite=Def|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "voc", "Definite": "def", "Gender": "masc", "Number": "plur"},
- "NOUN__Case=Voc|Form=Len|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "voc", "Gender": "fem", "Number": "sing", "Other": {"Form": "len"}},
- "NOUN__Case=Voc|Form=Len|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "voc", "Gender": "masc", "Number": "plur", "Other": {"Form": "len"}},
- "NOUN__Case=Voc|Form=Len|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "voc", "Gender": "masc", "Number": "sing", "Other": {"Form": "len"}},
+ "NOUN__Case=Voc|Form=Len|Gender=Fem|Number=Sing": {"pos": "NOUN", "Case": "voc", "Gender": "fem", "Number": "sing", "Form": "len"},
+ "NOUN__Case=Voc|Form=Len|Gender=Masc|Number=Plur": {"pos": "NOUN", "Case": "voc", "Gender": "masc", "Number": "plur", "Form": "len"},
+ "NOUN__Case=Voc|Form=Len|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "voc", "Gender": "masc", "Number": "sing", "Form": "len"},
"NOUN__Case=Voc|Gender=Masc|Number=Sing": {"pos": "NOUN", "Case": "voc", "Gender": "masc", "Number": "sing"},
"NOUN__Degree=Pos": {"pos": "NOUN", "Degree": "pos"},
"NOUN__Foreign=Yes": {"pos": "NOUN", "Foreign": "yes"},
- "NOUN__Form=Ecl|Number=Sing": {"pos": "NOUN", "Number": "sing", "Other": {"Form": "ecl"}},
- "NOUN__Form=Ecl|VerbForm=Inf": {"pos": "NOUN", "VerbForm": "inf", "Other": {"Form": "ecl"}},
- "NOUN__Form=Ecl|VerbForm=Vnoun": {"pos": "NOUN", "VerbForm": "vnoun", "Other": {"Form": "ecl"}},
- "NOUN__Form=HPref|VerbForm=Inf": {"pos": "NOUN", "VerbForm": "inf", "Other": {"Form": "hpref"}},
- "NOUN__Form=Len|Number=Sing": {"pos": "NOUN", "Number": "sing", "Other": {"Form": "len"}},
- "NOUN__Form=Len|VerbForm=Inf": {"pos": "NOUN", "VerbForm": "inf", "Other": {"Form": "len"}},
+ "NOUN__Form=Ecl|Number=Sing": {"pos": "NOUN", "Number": "sing", "Form": "ecl"},
+ "NOUN__Form=Ecl|VerbForm=Inf": {"pos": "NOUN", "VerbForm": "inf", "Form": "ecl"},
+ "NOUN__Form=Ecl|VerbForm=Vnoun": {"pos": "NOUN", "VerbForm": "vnoun", "Form": "ecl"},
+ "NOUN__Form=HPref|VerbForm=Inf": {"pos": "NOUN", "VerbForm": "inf", "Form": "hpref"},
+ "NOUN__Form=Len|Number=Sing": {"pos": "NOUN", "Number": "sing", "Form": "len"},
+ "NOUN__Form=Len|VerbForm=Inf": {"pos": "NOUN", "VerbForm": "inf", "Form": "len"},
"NOUN__Gender=Fem|Number=Sing": {"pos": "NOUN", "Gender": "fem", "Number": "sing"},
- "NOUN__Number=Sing|PartType=Comp": {"pos": "NOUN", "Number": "sing", "Other": {"PartType": "comp"}},
+ "NOUN__Number=Sing|PartType=Comp": {"pos": "NOUN", "Number": "sing", "PartType": "comp"},
"NOUN__Number=Sing": {"pos": "NOUN", "Number": "sing"},
"NOUN___": {"pos": "NOUN"},
"NOUN__Reflex=Yes": {"pos": "NOUN", "Reflex": "yes"},
"NOUN__VerbForm=Inf": {"pos": "NOUN", "VerbForm": "inf"},
"NOUN__VerbForm=Vnoun": {"pos": "NOUN", "VerbForm": "vnoun"},
"NUM__Definite=Def|NumType=Card": {"pos": "NUM", "Definite": "def", "NumType": "card"},
- "NUM__Form=Ecl|NumType=Card": {"pos": "NUM", "NumType": "card", "Other": {"Form": "ecl"}},
- "NUM__Form=Ecl|NumType=Ord": {"pos": "NUM", "NumType": "ord", "Other": {"Form": "ecl"}},
- "NUM__Form=HPref|NumType=Card": {"pos": "NUM", "NumType": "card", "Other": {"Form": "hpref"}},
- "NUM__Form=Len|NumType=Card": {"pos": "NUM", "NumType": "card", "Other": {"Form": "len"}},
- "NUM__Form=Len|NumType=Ord": {"pos": "NUM", "NumType": "ord", "Other": {"Form": "len"}},
+ "NUM__Form=Ecl|NumType=Card": {"pos": "NUM", "NumType": "card", "Form": "ecl"},
+ "NUM__Form=Ecl|NumType=Ord": {"pos": "NUM", "NumType": "ord", "Form": "ecl"},
+ "NUM__Form=HPref|NumType=Card": {"pos": "NUM", "NumType": "card", "Form": "hpref"},
+ "NUM__Form=Len|NumType=Card": {"pos": "NUM", "NumType": "card", "Form": "len"},
+ "NUM__Form=Len|NumType=Ord": {"pos": "NUM", "NumType": "ord", "Form": "len"},
"NUM__NumType=Card": {"pos": "NUM", "NumType": "card"},
"NUM__NumType=Ord": {"pos": "NUM", "NumType": "ord"},
"NUM___": {"pos": "NUM"},
- "PART__Form=Ecl|PartType=Vb|PronType=Rel": {"pos": "PART", "PronType": "rel", "Other": {"Form": "ecl", "PartType": "vb"}},
- "PART__Mood=Imp|PartType=Vb|Polarity=Neg": {"pos": "PART", "Mood": "imp", "Polarity": "neg", "Other": {"PartType": "vb"}},
- "PART__Mood=Imp|PartType=Vb": {"pos": "PART", "Mood": "imp", "Other": {"PartType": "vb"}},
- "PART__Mood=Int|PartType=Vb|Polarity=Neg": {"pos": "PART", "Polarity": "neg", "Other": {"Mood": "int", "PartType": "vb"}},
- "PART__PartType=Ad": {"pos": "PART", "Other": {"PartType": "ad"}},
- "PART__PartType=Cmpl|Polarity=Neg": {"pos": "PART", "Polarity": "neg", "Other": {"PartType": "cmpl"}},
- "PART__PartType=Cmpl|Polarity=Neg|Tense=Past": {"pos": "PART", "Polarity": "neg", "Tense": "past", "Other": {"PartType": "cmpl"}},
- "PART__PartType=Cmpl": {"pos": "PART", "Other": {"PartType": "cmpl"}},
- "PART__PartType=Comp": {"pos": "PART", "Other": {"PartType": "comp"}},
- "PART__PartType=Cop|PronType=Rel": {"pos": "PART", "PronType": "rel", "Other": {"PartType": "cop"}},
- "PART__PartType=Deg": {"pos": "PART", "Other": {"PartType": "deg"}},
+ "PART__Form=Ecl|PartType=Vb|PronType=Rel": {"pos": "PART", "PronType": "rel", "Form": "ecl", "PartType": "vb"},
+ "PART__Mood=Imp|PartType=Vb|Polarity=Neg": {"pos": "PART", "Mood": "imp", "Polarity": "neg", "PartType": "vb"},
+ "PART__Mood=Imp|PartType=Vb": {"pos": "PART", "Mood": "imp", "PartType": "vb"},
+ "PART__Mood=Int|PartType=Vb|Polarity=Neg": {"pos": "PART", "Polarity": "neg", "Mood": "int", "PartType": "vb"},
+ "PART__PartType=Ad": {"pos": "PART", "PartType": "ad"},
+ "PART__PartType=Cmpl|Polarity=Neg": {"pos": "PART", "Polarity": "neg", "PartType": "cmpl"},
+ "PART__PartType=Cmpl|Polarity=Neg|Tense=Past": {"pos": "PART", "Polarity": "neg", "Tense": "past", "PartType": "cmpl"},
+ "PART__PartType=Cmpl": {"pos": "PART", "PartType": "cmpl"},
+ "PART__PartType=Comp": {"pos": "PART", "PartType": "comp"},
+ "PART__PartType=Cop|PronType=Rel": {"pos": "PART", "PronType": "rel", "PartType": "cop"},
+ "PART__PartType=Deg": {"pos": "PART", "PartType": "deg"},
"PART__PartType=Inf": {"pos": "PART", "PartType": "inf"},
- "PART__PartType=Num": {"pos": "PART", "Other": {"PartType": "num"}},
- "PART__PartType=Pat": {"pos": "PART", "Other": {"PartType": "pat"}},
- "PART__PartType=Vb|Polarity=Neg": {"pos": "PART", "Polarity": "neg", "Other": {"PartType": "vb"}},
- "PART__PartType=Vb|Polarity=Neg|PronType=Rel": {"pos": "PART", "Polarity": "neg", "PronType": "rel", "Other": {"PartType": "vb"}},
- "PART__PartType=Vb|Polarity=Neg|PronType=Rel|Tense=Past": {"pos": "PART", "Polarity": "neg", "PronType": "rel", "Tense": "past", "Other": {"PartType": "vb"}},
- "PART__PartType=Vb|Polarity=Neg|Tense=Past": {"pos": "PART", "Polarity": "neg", "Tense": "past", "Other": {"PartType": "vb"}},
- "PART__PartType=Vb": {"pos": "PART", "Other": {"PartType": "vb"}},
- "PART__PartType=Vb|PronType=Rel": {"pos": "PART", "PronType": "rel", "Other": {"PartType": "vb"}},
- "PART__PartType=Vb|PronType=Rel|Tense=Past": {"pos": "PART", "PronType": "rel", "Tense": "past", "Other": {"PartType": "vb"}},
- "PART__PartType=Vb|Tense=Past": {"pos": "PART", "Tense": "past", "Other": {"PartType": "vb"}},
- "PART__PartType=Voc": {"pos": "PART", "Other": {"PartType": "voc"}},
+ "PART__PartType=Num": {"pos": "PART", "PartType": "num"},
+ "PART__PartType=Pat": {"pos": "PART", "PartType": "pat"},
+ "PART__PartType=Vb|Polarity=Neg": {"pos": "PART", "Polarity": "neg", "PartType": "vb"},
+ "PART__PartType=Vb|Polarity=Neg|PronType=Rel": {"pos": "PART", "Polarity": "neg", "PronType": "rel", "PartType": "vb"},
+ "PART__PartType=Vb|Polarity=Neg|PronType=Rel|Tense=Past": {"pos": "PART", "Polarity": "neg", "PronType": "rel", "Tense": "past", "PartType": "vb"},
+ "PART__PartType=Vb|Polarity=Neg|Tense=Past": {"pos": "PART", "Polarity": "neg", "Tense": "past", "PartType": "vb"},
+ "PART__PartType=Vb": {"pos": "PART", "PartType": "vb"},
+ "PART__PartType=Vb|PronType=Rel": {"pos": "PART", "PronType": "rel", "PartType": "vb"},
+ "PART__PartType=Vb|PronType=Rel|Tense=Past": {"pos": "PART", "PronType": "rel", "Tense": "past", "PartType": "vb"},
+ "PART__PartType=Vb|Tense=Past": {"pos": "PART", "Tense": "past", "PartType": "vb"},
+ "PART__PartType=Voc": {"pos": "PART", "PartType": "voc"},
"PART___": {"pos": "PART"},
"PART__PronType=Rel": {"pos": "PART", "PronType": "rel"},
- "PRON__Form=Len|Number=Sing|Person=2": {"pos": "PRON", "Number": "sing", "Person": 2, "Other": {"Form": "len"}},
- "PRON__Form=Len|PronType=Ind": {"pos": "PRON", "PronType": "ind", "Other": {"Form": "len"}},
+ "PRON__Form=Len|Number=Sing|Person=2": {"pos": "PRON", "Number": "sing", "Person": 2, "Form": "len"},
+ "PRON__Form=Len|PronType=Ind": {"pos": "PRON", "PronType": "ind", "Form": "len"},
"PRON__Gender=Fem|Number=Sing|Person=3": {"pos": "PRON", "Gender": "fem", "Number": "sing", "Person": 3},
"PRON__Gender=Masc|Number=Sing|Person=3": {"pos": "PRON", "Gender": "masc", "Number": "sing", "Person": 3},
"PRON__Gender=Masc|Number=Sing|Person=3|PronType=Emp": {"pos": "PRON", "Gender": "masc", "Number": "sing", "Person": 3, "PronType": "emp"},
@@ -232,103 +232,103 @@ TAG_MAP = {
"PRON__PronType=Ind": {"pos": "PRON", "PronType": "ind"},
"PRON__PronType=Int": {"pos": "PRON", "PronType": "int"},
"PRON__Reflex=Yes": {"pos": "PRON", "Reflex": "yes"},
- "PROPN__Abbr=Yes": {"pos": "PROPN", "Other": {"Abbr": "yes"}},
+ "PROPN__Abbr=Yes": {"pos": "PROPN", "Abbr": "yes"},
"PROPN__Case=Dat|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "dat", "Gender": "fem", "Number": "sing"},
"PROPN__Case=Gen|Definite=Def|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "gen", "Definite": "def", "Gender": "fem", "Number": "sing"},
- "PROPN__Case=Gen|Form=Ecl|Gender=Fem|Number=Plur": {"pos": "PROPN", "Case": "gen", "Gender": "fem", "Number": "plur", "Other": {"Form": "ecl"}},
- "PROPN__Case=Gen|Form=Ecl|Gender=Masc|Number=Plur": {"pos": "PROPN", "Case": "gen", "Gender": "masc", "Number": "plur", "Other": {"Form": "ecl"}},
- "PROPN__Case=Gen|Form=HPref|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "gen", "Gender": "fem", "Number": "sing", "Other": {"Form": "hpref"}},
- "PROPN__Case=Gen|Form=Len|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "gen", "Gender": "fem", "Number": "sing", "Other": {"Form": "len"}},
- "PROPN__Case=Gen|Form=Len|Gender=Fem": {"pos": "PROPN", "Case": "gen", "Gender": "fem", "Other": {"Form": "len"}},
- "PROPN__Case=Gen|Form=Len|Gender=Masc|Number=Sing": {"pos": "PROPN", "Case": "gen", "Gender": "masc", "Number": "sing", "Other": {"Form": "len"}},
- "PROPN__Case=Gen|Form=Len|Gender=Masc": {"pos": "PROPN", "Case": "gen", "Gender": "masc", "Other": {"Form": "len"}},
+ "PROPN__Case=Gen|Form=Ecl|Gender=Fem|Number=Plur": {"pos": "PROPN", "Case": "gen", "Gender": "fem", "Number": "plur", "Form": "ecl"},
+ "PROPN__Case=Gen|Form=Ecl|Gender=Masc|Number=Plur": {"pos": "PROPN", "Case": "gen", "Gender": "masc", "Number": "plur", "Form": "ecl"},
+ "PROPN__Case=Gen|Form=HPref|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "gen", "Gender": "fem", "Number": "sing", "Form": "hpref"},
+ "PROPN__Case=Gen|Form=Len|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "gen", "Gender": "fem", "Number": "sing", "Form": "len"},
+ "PROPN__Case=Gen|Form=Len|Gender=Fem": {"pos": "PROPN", "Case": "gen", "Gender": "fem", "Form": "len"},
+ "PROPN__Case=Gen|Form=Len|Gender=Masc|Number=Sing": {"pos": "PROPN", "Case": "gen", "Gender": "masc", "Number": "sing", "Form": "len"},
+ "PROPN__Case=Gen|Form=Len|Gender=Masc": {"pos": "PROPN", "Case": "gen", "Gender": "masc", "Form": "len"},
"PROPN__Case=Gen|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "gen", "Gender": "fem", "Number": "sing"},
"PROPN__Case=Gen|Gender=Fem": {"pos": "PROPN", "Case": "gen", "Gender": "fem"},
- "PROPN__Case=Gen|Gender=Masc|NounType=Weak|Number=Plur": {"pos": "PROPN", "Case": "gen", "Gender": "masc", "Number": "plur", "Other": {"NounType": "weak"}},
+ "PROPN__Case=Gen|Gender=Masc|NounType=Weak|Number=Plur": {"pos": "PROPN", "Case": "gen", "Gender": "masc", "Number": "plur", "NounType": "weak"},
"PROPN__Case=Gen|Gender=Masc|Number=Sing": {"pos": "PROPN", "Case": "gen", "Gender": "masc", "Number": "sing"},
"PROPN__Case=Gen|Gender=Masc": {"pos": "PROPN", "Case": "gen", "Gender": "masc"},
"PROPN__Case=NomAcc|Definite=Def|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Definite": "def", "Gender": "fem", "Number": "sing"},
"PROPN__Case=NomAcc|Definite=Def|Gender=Masc|Number=Plur": {"pos": "PROPN", "Case": "nom|acc", "Definite": "def", "Gender": "masc", "Number": "plur"},
"PROPN__Case=NomAcc|Definite=Def|Gender=Masc|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Definite": "def", "Gender": "masc", "Number": "sing"},
- "PROPN__Case=NomAcc|Form=Ecl|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Gender": "fem", "Number": "sing", "Other": {"Form": "ecl"}},
- "PROPN__Case=NomAcc|Form=Ecl|Gender=Masc|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Other": {"Form": "ecl"}},
- "PROPN__Case=NomAcc|Form=HPref|Gender=Masc|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Other": {"Form": "hpref"}},
- "PROPN__Case=NomAcc|Form=Len|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Gender": "fem", "Number": "sing", "Other": {"Form": "len"}},
- "PROPN__Case=NomAcc|Form=Len|Gender=Masc|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Other": {"Form": "len"}},
+ "PROPN__Case=NomAcc|Form=Ecl|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Gender": "fem", "Number": "sing", "Form": "ecl"},
+ "PROPN__Case=NomAcc|Form=Ecl|Gender=Masc|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Form": "ecl"},
+ "PROPN__Case=NomAcc|Form=HPref|Gender=Masc|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Form": "hpref"},
+ "PROPN__Case=NomAcc|Form=Len|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Gender": "fem", "Number": "sing", "Form": "len"},
+ "PROPN__Case=NomAcc|Form=Len|Gender=Masc|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Gender": "masc", "Number": "sing", "Form": "len"},
"PROPN__Case=NomAcc|Gender=Fem|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Gender": "fem", "Number": "sing"},
"PROPN__Case=NomAcc|Gender=Masc|Number=Plur": {"pos": "PROPN", "Case": "nom|acc", "Gender": "masc", "Number": "plur"},
"PROPN__Case=NomAcc|Gender=Masc|Number=Sing": {"pos": "PROPN", "Case": "nom|acc", "Gender": "masc", "Number": "sing"},
"PROPN__Case=NomAcc|Gender=Masc": {"pos": "PROPN", "Case": "nom|acc", "Gender": "masc"},
- "PROPN__Case=Voc|Form=Len|Gender=Fem": {"pos": "PROPN", "Case": "voc", "Gender": "fem", "Other": {"Form": "len"}},
+ "PROPN__Case=Voc|Form=Len|Gender=Fem": {"pos": "PROPN", "Case": "voc", "Gender": "fem", "Form": "len"},
"PROPN__Case=Voc|Gender=Masc|Number=Sing": {"pos": "PROPN", "Case": "voc", "Gender": "masc", "Number": "sing"},
"PROPN__Gender=Masc|Number=Sing": {"pos": "PROPN", "Gender": "masc", "Number": "sing"},
"PROPN___": {"pos": "PROPN"},
"PUNCT___": {"pos": "PUNCT"},
"SCONJ___": {"pos": "SCONJ"},
- "SCONJ__Tense=Past|VerbForm=Cop": {"pos": "SCONJ", "Tense": "past", "Other": {"VerbForm": "cop"}},
- "SCONJ__VerbForm=Cop": {"pos": "SCONJ", "Other": {"VerbForm": "cop"}},
- "SYM__Abbr=Yes": {"pos": "SYM", "Other": {"Abbr": "yes"}},
+ "SCONJ__Tense=Past|VerbForm=Cop": {"pos": "SCONJ", "Tense": "past", "VerbForm": "cop"},
+ "SCONJ__VerbForm=Cop": {"pos": "SCONJ", "VerbForm": "cop"},
+ "SYM__Abbr=Yes": {"pos": "SYM", "Abbr": "yes"},
"VERB__Case=NomAcc|Gender=Masc|Mood=Ind|Number=Sing|Tense=Pres": {"pos": "VERB", "Case": "nom|acc", "Gender": "masc", "Mood": "ind", "Number": "sing", "Tense": "pres"},
- "VERB__Dialect=Munster|Form=Len|Mood=Ind|Tense=Past": {"pos": "VERB", "Mood": "ind", "Tense": "past", "Other": {"Dialect": "munster", "Form": "len"}},
+ "VERB__Dialect=Munster|Form=Len|Mood=Ind|Tense=Past": {"pos": "VERB", "Mood": "ind", "Tense": "past", "Dialect": "munster", "Form": "len"},
"VERB__Foreign=Yes": {"pos": "VERB", "Foreign": "yes"},
- "VERB__Form=Ecl|Mood=Cnd|Number=Sing|Person=1": {"pos": "VERB", "Mood": "cnd", "Number": "sing", "Person": 1, "Other": {"Form": "ecl"}},
- "VERB__Form=Ecl|Mood=Cnd|Polarity=Neg": {"pos": "VERB", "Mood": "cnd", "Polarity": "neg", "Other": {"Form": "ecl"}},
- "VERB__Form=Ecl|Mood=Cnd": {"pos": "VERB", "Mood": "cnd", "Other": {"Form": "ecl"}},
- "VERB__Form=Ecl|Mood=Cnd|Voice=Auto": {"pos": "VERB", "Mood": "cnd", "Other": {"Form": "ecl", "Voice": "auto"}},
- "VERB__Form=Ecl|Mood=Imp|Number=Sing|Person=1|Tense=Past": {"pos": "VERB", "Mood": "imp", "Number": "sing", "Person": 1, "Tense": "past", "Other": {"Form": "ecl"}},
- "VERB__Form=Ecl|Mood=Imp|Tense=Past": {"pos": "VERB", "Mood": "imp", "Tense": "past", "Other": {"Form": "ecl"}},
- "VERB__Form=Ecl|Mood=Ind|Number=Plur|Person=1|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "plur", "Person": 1, "Tense": "pres", "Other": {"Form": "ecl"}},
- "VERB__Form=Ecl|Mood=Ind|Number=Sing|Person=1|Tense=Past": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Tense": "past", "Other": {"Form": "ecl"}},
- "VERB__Form=Ecl|Mood=Ind|Number=Sing|Person=1|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Tense": "pres", "Other": {"Form": "ecl"}},
- "VERB__Form=Ecl|Mood=Ind|Polarity=Neg|Tense=Fut": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "fut", "Other": {"Form": "ecl"}},
- "VERB__Form=Ecl|Mood=Ind|Polarity=Neg|Tense=Fut|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "fut", "Other": {"Form": "ecl", "Voice": "auto"}},
- "VERB__Form=Ecl|Mood=Ind|Polarity=Neg|Tense=Past": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "past", "Other": {"Form": "ecl"}},
- "VERB__Form=Ecl|Mood=Ind|Polarity=Neg|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "pres", "Other": {"Form": "ecl"}},
- "VERB__Form=Ecl|Mood=Ind|Tense=Fut": {"pos": "VERB", "Mood": "ind", "Tense": "fut", "Other": {"Form": "ecl"}},
- "VERB__Form=Ecl|Mood=Ind|Tense=Fut|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "fut", "Other": {"Form": "ecl", "Voice": "auto"}},
- "VERB__Form=Ecl|Mood=Ind|Tense=Past": {"pos": "VERB", "Mood": "ind", "Tense": "past", "Other": {"Form": "ecl"}},
- "VERB__Form=Ecl|Mood=Ind|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Tense": "pres", "Other": {"Form": "ecl"}},
- "VERB__Form=Ecl|Mood=Ind|Tense=Pres|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "pres", "Other": {"Form": "ecl", "Voice": "auto"}},
- "VERB__Form=Ecl|Mood=Sub|Tense=Pres": {"pos": "VERB", "Mood": "sub", "Tense": "pres", "Other": {"Form": "ecl"}},
- "VERB__Form=Ecl": {"pos": "VERB", "Other": {"Form": "ecl"}},
- "VERB__Form=Emp|Mood=Ind|Number=Plur|Person=1|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "plur", "Person": 1, "Tense": "pres", "Other": {"Form": "emp"}},
- "VERB__Form=Emp|Mood=Ind|Number=Sing|Person=1|PronType=Rel|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "PronType": "rel", "Tense": "pres", "Other": {"Form": "emp"}},
- "VERB__Form=Emp|Mood=Ind|Number=Sing|Person=1|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Tense": "pres", "Other": {"Form": "emp"}},
- "VERB__Form=Len|Mood=Cnd|Number=Plur|Person=3": {"pos": "VERB", "Mood": "cnd", "Number": "plur", "Person": 3, "Other": {"Form": "len"}},
- "VERB__Form=Len|Mood=Cnd|Number=Sing|Person=1": {"pos": "VERB", "Mood": "cnd", "Number": "sing", "Person": 1, "Other": {"Form": "len"}},
- "VERB__Form=Len|Mood=Cnd|Number=Sing|Person=2": {"pos": "VERB", "Mood": "cnd", "Number": "sing", "Person": 2, "Other": {"Form": "len"}},
- "VERB__Form=Len|Mood=Cnd|Polarity=Neg": {"pos": "VERB", "Mood": "cnd", "Polarity": "neg", "Other": {"Form": "len"}},
- "VERB__Form=Len|Mood=Cnd": {"pos": "VERB", "Mood": "cnd", "Other": {"Form": "len"}},
- "VERB__Form=Len|Mood=Cnd|Voice=Auto": {"pos": "VERB", "Mood": "cnd", "Other": {"Form": "len", "Voice": "auto"}},
- "VERB__Form=Len|Mood=Imp|Number=Plur|Person=3|Tense=Past": {"pos": "VERB", "Mood": "imp", "Number": "plur", "Person": 3, "Tense": "past", "Other": {"Form": "len"}},
- "VERB__Form=Len|Mood=Imp|Tense=Past": {"pos": "VERB", "Mood": "imp", "Tense": "past", "Other": {"Form": "len"}},
- "VERB__Form=Len|Mood=Imp|Tense=Past|Voice=Auto": {"pos": "VERB", "Mood": "imp", "Tense": "past", "Other": {"Form": "len", "Voice": "auto"}},
- "VERB__Form=Len|Mood=Imp|Voice=Auto": {"pos": "VERB", "Mood": "imp", "Other": {"Form": "len", "Voice": "auto"}},
- "VERB__Form=Len|Mood=Ind|Number=Plur|Person=1|Tense=Fut": {"pos": "VERB", "Mood": "ind", "Number": "plur", "Person": 1, "Tense": "fut", "Other": {"Form": "len"}},
- "VERB__Form=Len|Mood=Ind|Number=Plur|Person=1|Tense=Past": {"pos": "VERB", "Mood": "ind", "Number": "plur", "Person": 1, "Tense": "past", "Other": {"Form": "len"}},
- "VERB__Form=Len|Mood=Ind|Number=Plur|Person=3|Tense=Past": {"pos": "VERB", "Mood": "ind", "Number": "plur", "Person": 3, "Tense": "past", "Other": {"Form": "len"}},
- "VERB__Form=Len|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Past": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Polarity": "neg", "Tense": "past", "Other": {"Form": "len"}},
- "VERB__Form=Len|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Polarity": "neg", "Tense": "pres", "Other": {"Form": "len"}},
- "VERB__Form=Len|Mood=Ind|Number=Sing|Person=1|Tense=Past": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Tense": "past", "Other": {"Form": "len"}},
- "VERB__Form=Len|Mood=Ind|Polarity=Neg|Tense=Fut": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "fut", "Other": {"Form": "len"}},
- "VERB__Form=Len|Mood=Ind|Polarity=Neg|Tense=Fut|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "fut", "Other": {"Form": "len", "Voice": "auto"}},
- "VERB__Form=Len|Mood=Ind|Polarity=Neg|Tense=Past": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "past", "Other": {"Form": "len"}},
- "VERB__Form=Len|Mood=Ind|Polarity=Neg|Tense=Past|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "past", "Other": {"Form": "len", "Voice": "auto"}},
- "VERB__Form=Len|Mood=Ind|Polarity=Neg|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "pres", "Other": {"Form": "len"}},
- "VERB__Form=Len|Mood=Ind|Polarity=Neg|Tense=Pres|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "pres", "Other": {"Form": "len", "Voice": "auto"}},
- "VERB__Form=Len|Mood=Ind|Tense=Fut": {"pos": "VERB", "Mood": "ind", "Tense": "fut", "Other": {"Form": "len"}},
- "VERB__Form=Len|Mood=Ind|Tense=Fut|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "fut", "Other": {"Form": "len", "Voice": "auto"}},
- "VERB__Form=Len|Mood=Ind|Tense=Past": {"pos": "VERB", "Mood": "ind", "Tense": "past", "Other": {"Form": "len"}},
- "VERB__Form=Len|Mood=Ind|Tense=Past|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "past", "Other": {"Form": "len", "Voice": "auto"}},
- "VERB__Form=Len|Mood=Ind|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Tense": "pres", "Other": {"Form": "len"}},
- "VERB__Form=Len|Mood=Ind|Tense=Pres|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "pres", "Other": {"Form": "len", "Voice": "auto"}},
- "VERB__Form=Len|Mood=Sub|Polarity=Neg|Tense=Pres": {"pos": "VERB", "Mood": "sub", "Polarity": "neg", "Tense": "pres", "Other": {"Form": "len"}},
- "VERB__Form=Len|Polarity=Neg": {"pos": "VERB", "Polarity": "neg", "Other": {"Form": "len"}},
- "VERB__Form=Len": {"pos": "VERB", "Other": {"Form": "len"}},
+ "VERB__Form=Ecl|Mood=Cnd|Number=Sing|Person=1": {"pos": "VERB", "Mood": "cnd", "Number": "sing", "Person": 1, "Form": "ecl"},
+ "VERB__Form=Ecl|Mood=Cnd|Polarity=Neg": {"pos": "VERB", "Mood": "cnd", "Polarity": "neg", "Form": "ecl"},
+ "VERB__Form=Ecl|Mood=Cnd": {"pos": "VERB", "Mood": "cnd", "Form": "ecl"},
+ "VERB__Form=Ecl|Mood=Cnd|Voice=Auto": {"pos": "VERB", "Mood": "cnd", "Form": "ecl", "Voice": "auto"},
+ "VERB__Form=Ecl|Mood=Imp|Number=Sing|Person=1|Tense=Past": {"pos": "VERB", "Mood": "imp", "Number": "sing", "Person": 1, "Tense": "past", "Form": "ecl"},
+ "VERB__Form=Ecl|Mood=Imp|Tense=Past": {"pos": "VERB", "Mood": "imp", "Tense": "past", "Form": "ecl"},
+ "VERB__Form=Ecl|Mood=Ind|Number=Plur|Person=1|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "plur", "Person": 1, "Tense": "pres", "Form": "ecl"},
+ "VERB__Form=Ecl|Mood=Ind|Number=Sing|Person=1|Tense=Past": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Tense": "past", "Form": "ecl"},
+ "VERB__Form=Ecl|Mood=Ind|Number=Sing|Person=1|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Tense": "pres", "Form": "ecl"},
+ "VERB__Form=Ecl|Mood=Ind|Polarity=Neg|Tense=Fut": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "fut", "Form": "ecl"},
+ "VERB__Form=Ecl|Mood=Ind|Polarity=Neg|Tense=Fut|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "fut", "Form": "ecl", "Voice": "auto"},
+ "VERB__Form=Ecl|Mood=Ind|Polarity=Neg|Tense=Past": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "past", "Form": "ecl"},
+ "VERB__Form=Ecl|Mood=Ind|Polarity=Neg|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "pres", "Form": "ecl"},
+ "VERB__Form=Ecl|Mood=Ind|Tense=Fut": {"pos": "VERB", "Mood": "ind", "Tense": "fut", "Form": "ecl"},
+ "VERB__Form=Ecl|Mood=Ind|Tense=Fut|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "fut", "Form": "ecl", "Voice": "auto"},
+ "VERB__Form=Ecl|Mood=Ind|Tense=Past": {"pos": "VERB", "Mood": "ind", "Tense": "past", "Form": "ecl"},
+ "VERB__Form=Ecl|Mood=Ind|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Tense": "pres", "Form": "ecl"},
+ "VERB__Form=Ecl|Mood=Ind|Tense=Pres|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "pres", "Form": "ecl", "Voice": "auto"},
+ "VERB__Form=Ecl|Mood=Sub|Tense=Pres": {"pos": "VERB", "Mood": "sub", "Tense": "pres", "Form": "ecl"},
+ "VERB__Form=Ecl": {"pos": "VERB", "Form": "ecl"},
+ "VERB__Form=Emp|Mood=Ind|Number=Plur|Person=1|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "plur", "Person": 1, "Tense": "pres", "Form": "emp"},
+ "VERB__Form=Emp|Mood=Ind|Number=Sing|Person=1|PronType=Rel|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "PronType": "rel", "Tense": "pres", "Form": "emp"},
+ "VERB__Form=Emp|Mood=Ind|Number=Sing|Person=1|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Tense": "pres", "Form": "emp"},
+ "VERB__Form=Len|Mood=Cnd|Number=Plur|Person=3": {"pos": "VERB", "Mood": "cnd", "Number": "plur", "Person": 3, "Form": "len"},
+ "VERB__Form=Len|Mood=Cnd|Number=Sing|Person=1": {"pos": "VERB", "Mood": "cnd", "Number": "sing", "Person": 1, "Form": "len"},
+ "VERB__Form=Len|Mood=Cnd|Number=Sing|Person=2": {"pos": "VERB", "Mood": "cnd", "Number": "sing", "Person": 2, "Form": "len"},
+ "VERB__Form=Len|Mood=Cnd|Polarity=Neg": {"pos": "VERB", "Mood": "cnd", "Polarity": "neg", "Form": "len"},
+ "VERB__Form=Len|Mood=Cnd": {"pos": "VERB", "Mood": "cnd", "Form": "len"},
+ "VERB__Form=Len|Mood=Cnd|Voice=Auto": {"pos": "VERB", "Mood": "cnd", "Form": "len", "Voice": "auto"},
+ "VERB__Form=Len|Mood=Imp|Number=Plur|Person=3|Tense=Past": {"pos": "VERB", "Mood": "imp", "Number": "plur", "Person": 3, "Tense": "past", "Form": "len"},
+ "VERB__Form=Len|Mood=Imp|Tense=Past": {"pos": "VERB", "Mood": "imp", "Tense": "past", "Form": "len"},
+ "VERB__Form=Len|Mood=Imp|Tense=Past|Voice=Auto": {"pos": "VERB", "Mood": "imp", "Tense": "past", "Form": "len", "Voice": "auto"},
+ "VERB__Form=Len|Mood=Imp|Voice=Auto": {"pos": "VERB", "Mood": "imp", "Form": "len", "Voice": "auto"},
+ "VERB__Form=Len|Mood=Ind|Number=Plur|Person=1|Tense=Fut": {"pos": "VERB", "Mood": "ind", "Number": "plur", "Person": 1, "Tense": "fut", "Form": "len"},
+ "VERB__Form=Len|Mood=Ind|Number=Plur|Person=1|Tense=Past": {"pos": "VERB", "Mood": "ind", "Number": "plur", "Person": 1, "Tense": "past", "Form": "len"},
+ "VERB__Form=Len|Mood=Ind|Number=Plur|Person=3|Tense=Past": {"pos": "VERB", "Mood": "ind", "Number": "plur", "Person": 3, "Tense": "past", "Form": "len"},
+ "VERB__Form=Len|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Past": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Polarity": "neg", "Tense": "past", "Form": "len"},
+ "VERB__Form=Len|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Polarity": "neg", "Tense": "pres", "Form": "len"},
+ "VERB__Form=Len|Mood=Ind|Number=Sing|Person=1|Tense=Past": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Tense": "past", "Form": "len"},
+ "VERB__Form=Len|Mood=Ind|Polarity=Neg|Tense=Fut": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "fut", "Form": "len"},
+ "VERB__Form=Len|Mood=Ind|Polarity=Neg|Tense=Fut|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "fut", "Form": "len", "Voice": "auto"},
+ "VERB__Form=Len|Mood=Ind|Polarity=Neg|Tense=Past": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "past", "Form": "len"},
+ "VERB__Form=Len|Mood=Ind|Polarity=Neg|Tense=Past|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "past", "Form": "len", "Voice": "auto"},
+ "VERB__Form=Len|Mood=Ind|Polarity=Neg|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "pres", "Form": "len"},
+ "VERB__Form=Len|Mood=Ind|Polarity=Neg|Tense=Pres|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "pres", "Form": "len", "Voice": "auto"},
+ "VERB__Form=Len|Mood=Ind|Tense=Fut": {"pos": "VERB", "Mood": "ind", "Tense": "fut", "Form": "len"},
+ "VERB__Form=Len|Mood=Ind|Tense=Fut|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "fut", "Form": "len", "Voice": "auto"},
+ "VERB__Form=Len|Mood=Ind|Tense=Past": {"pos": "VERB", "Mood": "ind", "Tense": "past", "Form": "len"},
+ "VERB__Form=Len|Mood=Ind|Tense=Past|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "past", "Form": "len", "Voice": "auto"},
+ "VERB__Form=Len|Mood=Ind|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Tense": "pres", "Form": "len"},
+ "VERB__Form=Len|Mood=Ind|Tense=Pres|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "pres", "Form": "len", "Voice": "auto"},
+ "VERB__Form=Len|Mood=Sub|Polarity=Neg|Tense=Pres": {"pos": "VERB", "Mood": "sub", "Polarity": "neg", "Tense": "pres", "Form": "len"},
+ "VERB__Form=Len|Polarity=Neg": {"pos": "VERB", "Polarity": "neg", "Form": "len"},
+ "VERB__Form=Len": {"pos": "VERB", "Form": "len"},
"VERB__Mood=Cnd|Number=Plur|Person=3": {"pos": "VERB", "Mood": "cnd", "Number": "plur", "Person": 3},
"VERB__Mood=Cnd|Number=Sing|Person=1": {"pos": "VERB", "Mood": "cnd", "Number": "sing", "Person": 1},
"VERB__Mood=Cnd": {"pos": "VERB", "Mood": "cnd"},
- "VERB__Mood=Cnd|Voice=Auto": {"pos": "VERB", "Mood": "cnd", "Other": {"Voice": "auto"}},
+ "VERB__Mood=Cnd|Voice=Auto": {"pos": "VERB", "Mood": "cnd", "Voice": "auto"},
"VERB__Mood=Imp|Number=Plur|Person=1|Polarity=Neg": {"pos": "VERB", "Mood": "imp", "Number": "plur", "Person": 1, "Polarity": "neg"},
"VERB__Mood=Imp|Number=Plur|Person=1": {"pos": "VERB", "Mood": "imp", "Number": "plur", "Person": 1},
"VERB__Mood=Imp|Number=Plur|Person=2": {"pos": "VERB", "Mood": "imp", "Number": "plur", "Person": 2},
@@ -338,28 +338,28 @@ TAG_MAP = {
"VERB__Mood=Ind|Number=Plur|Person=1|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "plur", "Person": 1, "Tense": "pres"},
"VERB__Mood=Ind|Number=Sing|Person=1|Tense=Past": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Tense": "past"},
"VERB__Mood=Ind|Number=Sing|Person=1|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Number": "sing", "Person": 1, "Tense": "pres"},
- "VERB__Mood=Ind|Polarity=Neg|Tense=Past|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "past", "Other": {"Voice": "auto"}},
+ "VERB__Mood=Ind|Polarity=Neg|Tense=Past|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "past", "Voice": "auto"},
"VERB__Mood=Ind|Polarity=Neg|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Polarity": "neg", "Tense": "pres"},
"VERB__Mood=Ind|PronType=Rel|Tense=Fut": {"pos": "VERB", "Mood": "ind", "PronType": "rel", "Tense": "fut"},
"VERB__Mood=Ind|PronType=Rel|Tense=Pres": {"pos": "VERB", "Mood": "ind", "PronType": "rel", "Tense": "pres"},
"VERB__Mood=Ind|Tense=Fut": {"pos": "VERB", "Mood": "ind", "Tense": "fut"},
- "VERB__Mood=Ind|Tense=Fut|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "fut", "Other": {"Voice": "auto"}},
+ "VERB__Mood=Ind|Tense=Fut|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "fut", "Voice": "auto"},
"VERB__Mood=Ind|Tense=Past": {"pos": "VERB", "Mood": "ind", "Tense": "past"},
- "VERB__Mood=Ind|Tense=Past|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "past", "Other": {"Voice": "auto"}},
+ "VERB__Mood=Ind|Tense=Past|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "past", "Voice": "auto"},
"VERB__Mood=Ind|Tense=Pres": {"pos": "VERB", "Mood": "ind", "Tense": "pres"},
- "VERB__Mood=Ind|Tense=Pres|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "pres", "Other": {"Voice": "auto"}},
+ "VERB__Mood=Ind|Tense=Pres|Voice=Auto": {"pos": "VERB", "Mood": "ind", "Tense": "pres", "Voice": "auto"},
"VERB___": {"pos": "VERB"},
- "X__Abbr=Yes": {"pos": "X", "Other": {"Abbr": "yes"}},
+ "X__Abbr=Yes": {"pos": "X", "Abbr": "yes"},
"X__Case=NomAcc|Foreign=Yes|Gender=Fem|Number=Sing": {"pos": "X", "Case": "nom|acc", "Gender": "fem", "Number": "sing", "Foreign": "yes"},
- "X__Definite=Def|Dialect=Ulster": {"pos": "X", "Definite": "def", "Other": {"Dialect": "ulster"}},
- "X__Dialect=Munster|Form=Len|Mood=Ind|Number=Sing|Person=1|Tense=Past": {"pos": "X", "Mood": "ind", "Number": "sing", "Person": 1, "Tense": "past", "Other": {"Dialect": "munster", "Form": "len"}},
- "X__Dialect=Munster|Mood=Imp|Number=Sing|Person=2|Polarity=Neg": {"pos": "X", "Mood": "imp", "Number": "sing", "Person": 2, "Polarity": "neg", "Other": {"Dialect": "munster"}},
- "X__Dialect=Munster|Mood=Ind|Tense=Past|Voice=Auto": {"pos": "X", "Mood": "ind", "Tense": "past", "Other": {"Dialect": "munster", "Voice": "auto"}},
- "X__Dialect=Munster": {"pos": "X", "Other": {"Dialect": "munster"}},
- "X__Dialect=Munster|PronType=Dem": {"pos": "X", "PronType": "dem", "Other": {"Dialect": "munster"}},
- "X__Dialect=Ulster|Gender=Masc|Number=Sing|Person=3": {"pos": "X", "Gender": "masc", "Number": "sing", "Person": 3, "Other": {"Dialect": "ulster"}},
- "X__Dialect=Ulster|PartType=Vb|Polarity=Neg": {"pos": "X", "Polarity": "neg", "Other": {"Dialect": "ulster", "PartType": "vb"}},
- "X__Dialect=Ulster|VerbForm=Cop": {"pos": "X", "Other": {"Dialect": "ulster", "VerbForm": "cop"}},
+ "X__Definite=Def|Dialect=Ulster": {"pos": "X", "Definite": "def", "Dialect": "ulster"},
+ "X__Dialect=Munster|Form=Len|Mood=Ind|Number=Sing|Person=1|Tense=Past": {"pos": "X", "Mood": "ind", "Number": "sing", "Person": 1, "Tense": "past", "Dialect": "munster", "Form": "len"},
+ "X__Dialect=Munster|Mood=Imp|Number=Sing|Person=2|Polarity=Neg": {"pos": "X", "Mood": "imp", "Number": "sing", "Person": 2, "Polarity": "neg", "Dialect": "munster"},
+ "X__Dialect=Munster|Mood=Ind|Tense=Past|Voice=Auto": {"pos": "X", "Mood": "ind", "Tense": "past", "Dialect": "munster", "Voice": "auto"},
+ "X__Dialect=Munster": {"pos": "X", "Dialect": "munster"},
+ "X__Dialect=Munster|PronType=Dem": {"pos": "X", "PronType": "dem", "Dialect": "munster"},
+ "X__Dialect=Ulster|Gender=Masc|Number=Sing|Person=3": {"pos": "X", "Gender": "masc", "Number": "sing", "Person": 3, "Dialect": "ulster"},
+ "X__Dialect=Ulster|PartType=Vb|Polarity=Neg": {"pos": "X", "Polarity": "neg", "Dialect": "ulster", "PartType": "vb"},
+ "X__Dialect=Ulster|VerbForm=Cop": {"pos": "X", "Dialect": "ulster", "VerbForm": "cop"},
"X__Foreign=Yes": {"pos": "X", "Foreign": "yes"},
"X___": {"pos": "X"}
}
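Note (illustrative only, not part of the applied patch): the Irish TAG_MAP hunks above flatten features that were previously nested under the "Other" key into top-level attributes. A minimal Python sketch of the shape change, with the tag values copied from the diff:

    # Entry shape before this patch (removed lines above)
    old_entry = {"pos": "VERB", "Mood": "cnd", "Other": {"Form": "ecl"}}
    # Entry shape after this patch (added lines above)
    new_entry = {"pos": "VERB", "Mood": "cnd", "Form": "ecl"}
    # The new form is the old form with "Other" merged into the top level.
    merged = {k: v for k, v in old_entry.items() if k != "Other"}
    merged.update(old_entry["Other"])
    assert merged == new_entry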
diff --git a/spacy/lang/nb/morph_rules.py b/spacy/lang/nb/morph_rules.py
index b1799fca8..e96b9fd6b 100644
--- a/spacy/lang/nb/morph_rules.py
+++ b/spacy/lang/nb/morph_rules.py
@@ -195,7 +195,7 @@ MORPH_RULES = {
"seg": {
LEMMA: PRON_LEMMA,
"Person": "Three",
- "Number": ("Sing", "Plur"),
+ "Number": "Sing,Plur",
"Reflex": "Yes",
}
},
@@ -248,7 +248,7 @@ MORPH_RULES = {
},
"deres": {
LEMMA: "deres",
- "Person": ("Two", "Three"),
+ "Person": "Two,Three",
"Number": "Sing",
"Poss": "Yes",
"Gender": "Masc",
@@ -309,7 +309,7 @@ MORPH_RULES = {
},
"deres": {
LEMMA: "deres",
- "Person": ("Two", "Three"),
+ "Person": "Two,Three",
"Number": "Sing",
"Poss": "Yes",
"Gender": "Fem",
@@ -370,7 +370,7 @@ MORPH_RULES = {
},
"deres": {
LEMMA: "deres",
- "Person": ("Two", "Three"),
+ "Person": "Two,Three",
"Number": "Sing",
"Poss": "Yes",
"Gender": "Neut",
@@ -400,7 +400,7 @@ MORPH_RULES = {
"våre": {LEMMA: "vår", "Person": "One", "Number": "Plur", "Poss": "Yes"},
"deres": {
LEMMA: "deres",
- "Person": ("Two", "Three"),
+ "Person": "Two,Three",
"Number": "Plur",
"Poss": "Yes",
},
@@ -448,21 +448,21 @@ MORPH_RULES = {
"PronType": "Prs",
"Number": "Sing",
"Person": "Three",
- "Gender": ("Fem", "Masc"),
+ "Gender": "Fem,Masc",
},
"den": {
LEMMA: PRON_LEMMA,
"PronType": "Prs",
"Number": "Sing",
"Person": "Three",
- "Gender": ("Fem", "Masc"),
+ "Gender": "Fem,Masc",
},
"ingen": {
LEMMA: PRON_LEMMA,
"PronType": "Prs",
"Number": "Sing",
"Person": "Three",
- "Gender": ("Fem", "Masc"),
+ "Gender": "Fem,Masc",
"Polarity": "Neg",
},
},
@@ -475,7 +475,7 @@ MORPH_RULES = {
LEMMA: PRON_LEMMA,
"PronType": "Prs",
"Number": "Sing",
- "Case": ("Gen", "Nom"),
+ "Case": "Gen,Nom",
}
},
"PRON__Animacy=Anim|Case=Gen|Number=Sing|PronType=Prs": {
diff --git a/spacy/lang/sv/morph_rules.py b/spacy/lang/sv/morph_rules.py
index 8fca20a49..3ef6aedc5 100644
--- a/spacy/lang/sv/morph_rules.py
+++ b/spacy/lang/sv/morph_rules.py
@@ -105,7 +105,7 @@ MORPH_RULES = {
"PronType": "Prs",
"Person": "Three",
"Number": "Plur",
- "Case": ("Nom", "Acc"),
+ "Case": "Nom,Acc",
},
"dem": {
LEMMA: PRON_LEMMA,
@@ -166,7 +166,7 @@ MORPH_RULES = {
LEMMA: PRON_LEMMA,
"PronType": "Prs",
"Person": "Two",
- "Number": ("Sing", "Plur"),
+ "Number": "Sing,Plur",
"Gender": "Masc",
"Poss": "Yes",
"Reflex": "Yes",
@@ -175,7 +175,7 @@ MORPH_RULES = {
LEMMA: PRON_LEMMA,
"PronType": "Prs",
"Person": "Two",
- "Number": ("Sing", "Plur"),
+ "Number": "Sing,Plur",
"Gender": "Fem",
"Poss": "Yes",
"Reflex": "Yes",
@@ -184,7 +184,7 @@ MORPH_RULES = {
LEMMA: PRON_LEMMA,
"PronType": "Prs",
"Person": "Two",
- "Number": ("Sing", "Plur"),
+ "Number": "Sing,Plur",
"Poss": "Yes",
"Reflex": "Yes",
},
@@ -272,7 +272,7 @@ MORPH_RULES = {
"VBZ": {
"är": {
"VerbForm": "Fin",
- "Person": ("One", "Two", "Three"),
+ "Person": "One,Two,Three",
"Tense": "Pres",
"Mood": "Ind",
}
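Note (illustrative only): the nb and sv morph_rules hunks above replace tuple-valued features with comma-joined strings. A minimal sketch, assuming the "," separator matches the VALUE_SEP constant introduced later in this patch in spacy/morphology.pyx:

    # Multi-valued feature before and after this change
    old_value = ("Sing", "Plur")   # tuple form (removed)
    new_value = "Sing,Plur"        # comma-joined string form (added)
    assert new_value == ",".join(old_value)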
diff --git a/spacy/lang/uk/tag_map.py b/spacy/lang/uk/tag_map.py
deleted file mode 100644
index 1ecbddc49..000000000
--- a/spacy/lang/uk/tag_map.py
+++ /dev/null
@@ -1,25 +0,0 @@
-from ..symbols import POS, ADV, NOUN, ADP, PRON, SCONJ, PROPN, DET, SYM, INTJ
-from ..symbols import PUNCT, NUM, AUX, X, CONJ, ADJ, VERB, PART, SPACE, CCONJ
-
-
-TAG_MAP = {
- "ADV": {POS: ADV},
- "NOUN": {POS: NOUN},
- "ADP": {POS: ADP},
- "PRON": {POS: PRON},
- "SCONJ": {POS: SCONJ},
- "PROPN": {POS: PROPN},
- "DET": {POS: DET},
- "SYM": {POS: SYM},
- "INTJ": {POS: INTJ},
- "PUNCT": {POS: PUNCT},
- "NUM": {POS: NUM},
- "AUX": {POS: AUX},
- "X": {POS: X},
- "CONJ": {POS: CONJ},
- "CCONJ": {POS: CCONJ},
- "ADJ": {POS: ADJ},
- "VERB": {POS: VERB},
- "PART": {POS: PART},
- "SP": {POS: SPACE},
-}
diff --git a/spacy/lang/ur/tag_map.py b/spacy/lang/ur/tag_map.py
index e0940edb7..d990fd46a 100644
--- a/spacy/lang/ur/tag_map.py
+++ b/spacy/lang/ur/tag_map.py
@@ -10,8 +10,8 @@ TAG_MAP = {
'""': {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"},
"''": {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"},
":": {POS: PUNCT},
- "$": {POS: SYM, "Other": {"SymType": "currency"}},
- "#": {POS: SYM, "Other": {"SymType": "numbersign"}},
+ "$": {POS: SYM, "SymType": "currency"},
+ "#": {POS: SYM, "SymType": "numbersign"},
"AFX": {POS: ADJ, "Hyph": "yes"},
"CC": {POS: CCONJ, "ConjType": "coor"},
"CD": {POS: NUM, "NumType": "card"},
@@ -52,7 +52,7 @@ TAG_MAP = {
"VerbForm": "fin",
"Tense": "pres",
"Number": "sing",
- "Person": 3,
+ "Person": "3",
},
"WDT": {POS: ADJ, "PronType": "int|rel"},
"WP": {POS: NOUN, "PronType": "int|rel"},
diff --git a/spacy/lang/vi/tag_map.py b/spacy/lang/vi/tag_map.py
deleted file mode 100644
index 1ecbddc49..000000000
--- a/spacy/lang/vi/tag_map.py
+++ /dev/null
@@ -1,25 +0,0 @@
-from ..symbols import POS, ADV, NOUN, ADP, PRON, SCONJ, PROPN, DET, SYM, INTJ
-from ..symbols import PUNCT, NUM, AUX, X, CONJ, ADJ, VERB, PART, SPACE, CCONJ
-
-
-TAG_MAP = {
- "ADV": {POS: ADV},
- "NOUN": {POS: NOUN},
- "ADP": {POS: ADP},
- "PRON": {POS: PRON},
- "SCONJ": {POS: SCONJ},
- "PROPN": {POS: PROPN},
- "DET": {POS: DET},
- "SYM": {POS: SYM},
- "INTJ": {POS: INTJ},
- "PUNCT": {POS: PUNCT},
- "NUM": {POS: NUM},
- "AUX": {POS: AUX},
- "X": {POS: X},
- "CONJ": {POS: CONJ},
- "CCONJ": {POS: CCONJ},
- "ADJ": {POS: ADJ},
- "VERB": {POS: VERB},
- "PART": {POS: PART},
- "SP": {POS: SPACE},
-}
diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd
index 1a3cedf97..1e8c255b8 100644
--- a/spacy/morphology.pxd
+++ b/spacy/morphology.pxd
@@ -2,6 +2,7 @@ from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap, PreshMapArray
from libc.stdint cimport uint64_t
from murmurhash cimport mrmr
+cimport numpy as np
from .structs cimport TokenC, MorphAnalysisC
from .strings cimport StringStore
@@ -20,12 +21,11 @@ cdef class Morphology:
cdef readonly object tag_names
cdef readonly object reverse_index
cdef readonly object exc
- cdef readonly object _feat_map
cdef readonly PreshMapArray _cache
cdef readonly int n_tags
- cpdef update(self, hash_t morph, features)
- cdef hash_t insert(self, MorphAnalysisC tag) except 0
+ cdef MorphAnalysisC create_morph_tag(self, field_feature_pairs) except *
+ cdef int insert(self, MorphAnalysisC tag) except -1
cdef int assign_untagged(self, TokenC* token) except -1
cdef int assign_tag(self, TokenC* token, tag) except -1
@@ -34,8 +34,7 @@ cdef class Morphology:
cdef int _assign_tag_from_exceptions(self, TokenC* token, int tag_id) except -1
-cdef int check_feature(const MorphAnalysisC* tag, attr_t feature) nogil
-cdef attr_t get_field(const MorphAnalysisC* tag, int field) nogil
-cdef list list_features(const MorphAnalysisC* tag)
-
-cdef tag_to_json(const MorphAnalysisC* tag)
+cdef int check_feature(const MorphAnalysisC* morph, attr_t feature) nogil
+cdef list list_features(const MorphAnalysisC* morph)
+cdef np.ndarray get_by_field(const MorphAnalysisC* morph, attr_t field)
+cdef int get_n_by_field(attr_t* results, const MorphAnalysisC* morph, attr_t field) nogil
diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
index 8030a9a28..3003d118f 100644
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -2,6 +2,7 @@
from libc.string cimport memset
import srsly
from collections import Counter
+import numpy
from .strings import get_string_id
from . import symbols
@@ -10,130 +11,38 @@ from .attrs import LEMMA, intify_attrs
from .parts_of_speech cimport SPACE
from .parts_of_speech import IDS as POS_IDS
from .lexeme cimport Lexeme
-from .errors import Errors
+from .errors import Errors, Warnings, user_warning
from .util import ensure_path
-cdef enum univ_field_t:
- Field_POS
- Field_Abbr
- Field_AdpType
- Field_AdvType
- Field_Animacy
- Field_Aspect
- Field_Case
- Field_ConjType
- Field_Connegative
- Field_Definite
- Field_Degree
- Field_Derivation
- Field_Echo
- Field_Foreign
- Field_Gender
- Field_Hyph
- Field_InfForm
- Field_Mood
- Field_NameType
- Field_Negative
- Field_NounType
- Field_Number
- Field_NumForm
- Field_NumType
- Field_NumValue
- Field_PartForm
- Field_PartType
- Field_Person
- Field_Polarity
- Field_Polite
- Field_Poss
- Field_Prefix
- Field_PrepCase
- Field_PronType
- Field_PunctSide
- Field_PunctType
- Field_Reflex
- Field_Style
- Field_StyleVariant
- Field_Tense
- Field_Typo
- Field_VerbForm
- Field_VerbType
- Field_Voice
-
-
def _normalize_props(props):
- """Transform deprecated string keys to correct names."""
+    """Convert the attrs dict so that POS is always given by ID; other
+    features are left as-is as long as they are strings or IDs.
+ """
out = {}
props = dict(props)
- for key in FIELDS:
- if key in props:
- value = str(props[key]).lower()
- # We don't have support for disjunctive int|rel features, so
- # just take the first one :(
- if "|" in value:
- value = value.split("|")[0]
- attr = f"{key}_{value}"
- if attr in FEATURES:
- props.pop(key)
- props[attr] = True
for key, value in props.items():
+ # convert POS value to ID
if key == POS:
if hasattr(value, 'upper'):
value = value.upper()
if value in POS_IDS:
value = POS_IDS[value]
out[key] = value
- elif isinstance(key, int):
- out[key] = value
- elif value is True:
- out[key] = value
- elif key.lower() == 'pos':
+ elif isinstance(key, str) and key.lower() == 'pos':
out[POS] = POS_IDS[value.upper()]
- elif key.lower() != 'morph':
+ # sort values
+ elif isinstance(value, str) and Morphology.VALUE_SEP in value:
+ out[key] = Morphology.VALUE_SEP.join(
+ sorted(value.split(Morphology.VALUE_SEP)))
+ # accept any string or ID fields and values
+ elif isinstance(key, (int, str)) and isinstance(value, (int, str)):
out[key] = value
+ else:
+ user_warning(Warnings.W028.format(feature={key: value}))
return out
-class MorphologyClassMap(object):
- def __init__(self, features):
- self.features = tuple(features)
- self.fields = []
- self.feat2field = {}
- seen_fields = set()
- for feature in features:
- field = feature.split("_", 1)[0]
- if field not in seen_fields:
- self.fields.append(field)
- seen_fields.add(field)
- self.feat2field[feature] = FIELDS[field]
- self.id2feat = {get_string_id(name): name for name in features}
- self.field2feats = {"POS": []}
- self.col2info = []
- self.attr2field = dict(LOWER_FIELDS.items())
- self.feat2offset = {}
- self.field2col = {}
- self.field2id = dict(FIELDS.items())
- self.fieldid2field = {field_id: field for field, field_id in FIELDS.items()}
- for feature in features:
- field = self.fields[self.feat2field[feature]]
- if field not in self.field2col:
- self.field2col[field] = len(self.col2info)
- if field != "POS" and field not in self.field2feats:
- self.col2info.append((field, 0, "NIL"))
- self.field2feats.setdefault(field, ["NIL"])
- offset = len(self.field2feats[field])
- self.field2feats[field].append(feature)
- self.col2info.append((field, offset, feature))
- self.feat2offset[feature] = offset
-
- @property
- def field_sizes(self):
- return [len(self.field2feats[field]) for field in self.fields]
-
- def get_field_offset(self, field):
- return self.field2col[field]
-
-
cdef class Morphology:
'''Store the possible morphological analyses for a language, and index them
by hash.
@@ -142,9 +51,15 @@ cdef class Morphology:
analysis, so queries of morphological attributes are delegated
to this class.
'''
- def __init__(self, StringStore string_store, tag_map, lemmatizer, exc=None):
+
+ FEATURE_SEP = "|"
+ FIELD_SEP = "="
+ VALUE_SEP = ","
+ EMPTY_MORPH = "_"
+
+ def __init__(self, StringStore strings, tag_map, lemmatizer, exc=None):
self.mem = Pool()
- self.strings = string_store
+ self.strings = strings
self.tags = PreshMap()
# Add special space symbol. We prefix with underscore, to make sure it
# always sorts to the end.
@@ -158,7 +73,6 @@ cdef class Morphology:
self.lemmatizer = lemmatizer
self.n_tags = len(tag_map)
self.reverse_index = {}
- self._feat_map = MorphologyClassMap(FEATURES)
self._load_from_tag_map(tag_map)
self._cache = PreshMapArray(self.n_tags)
@@ -172,8 +86,7 @@ cdef class Morphology:
def _load_from_tag_map(self, tag_map):
for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
attrs = _normalize_props(attrs)
- self.add({self._feat_map.id2feat[feat] for feat in attrs
- if feat in self._feat_map.id2feat})
+ self.add(attrs)
self.tag_map[tag_str] = dict(attrs)
self.reverse_index[self.strings.add(tag_str)] = i
@@ -182,40 +95,78 @@ cdef class Morphology:
self.exc), None, None)
def add(self, features):
- """Insert a morphological analysis in the morphology table, if not already
- present. Returns the hash of the new analysis.
+ """Insert a morphological analysis in the morphology table, if not
+ already present. The morphological analysis may be provided in the UD
+ FEATS format as a string or in the tag map dict format.
+ Returns the hash of the new analysis.
+ """
+ cdef MorphAnalysisC* tag_ptr
+ if features == self.EMPTY_MORPH:
+ features = ""
+ if isinstance(features, str):
+ tag_ptr = self.tags.get(self.strings[features])
+ if tag_ptr != NULL:
+ return tag_ptr.key
+ features = self.feats_to_dict(features)
+ if not isinstance(features, dict):
+ user_warning(Warnings.W028.format(feature=features))
+ features = {}
+ features = _normalize_props(features)
+ string_features = {self.strings.as_string(field): self.strings.as_string(values) for field, values in features.items()}
+ # normalized UFEATS string with sorted fields and values
+ norm_feats_string = self.FEATURE_SEP.join(sorted([
+ self.FIELD_SEP.join([field, values])
+ for field, values in string_features.items()
+ ]))
+ # intified ("Field", "Field=Value") pairs
+ field_feature_pairs = []
+ for field in sorted(string_features):
+ values = string_features[field]
+ for value in values.split(self.VALUE_SEP):
+ field_feature_pairs.append((
+ self.strings.add(field),
+ self.strings.add(field + self.FIELD_SEP + value),
+ ))
+ cdef MorphAnalysisC tag = self.create_morph_tag(field_feature_pairs)
+ # the hash key for the tag is either the hash of the normalized UFEATS
+ # string or the hash of an empty placeholder (using the empty string
+ # would give a hash key of 0, which is not good for PreshMap)
+ if norm_feats_string:
+ tag.key = self.strings.add(norm_feats_string)
+ else:
+ tag.key = self.strings.add(self.EMPTY_MORPH)
+ self.insert(tag)
+ return tag.key
+
+ cdef MorphAnalysisC create_morph_tag(self, field_feature_pairs) except *:
+ """Creates a MorphAnalysisC from a list of intified
+ ("Field", "Field=Value") tuples where fields with multiple values have
+ been split into individual tuples, e.g.:
+ [("Field1", "Field1=Value1"), ("Field1", "Field1=Value2"),
+ ("Field2", "Field2=Value3")]
"""
- for f in features:
- if isinstance(f, str):
- self.strings.add(f)
- string_features = features
- features = intify_features(features)
- cdef attr_t feature
- for feature in features:
- if feature != 0 and feature not in self._feat_map.id2feat:
- raise ValueError(Errors.E167.format(feat=self.strings[feature], feat_id=feature))
cdef MorphAnalysisC tag
- tag = create_rich_tag(features)
- cdef hash_t key = self.insert(tag)
- return key
+ tag.length = len(field_feature_pairs)
+ tag.fields = self.mem.alloc(tag.length, sizeof(attr_t))
+ tag.features = self.mem.alloc(tag.length, sizeof(attr_t))
+ for i, (field, feature) in enumerate(field_feature_pairs):
+ tag.fields[i] = field
+ tag.features[i] = feature
+ return tag
+
+ cdef int insert(self, MorphAnalysisC tag) except -1:
+ cdef hash_t key = tag.key
+ if self.tags.get(key) == NULL:
+ tag_ptr = self.mem.alloc(1, sizeof(MorphAnalysisC))
+ tag_ptr[0] = tag
+ self.tags.set(key, tag_ptr)
def get(self, hash_t morph):
tag = self.tags.get(morph)
if tag == NULL:
return []
else:
- return tag_to_json(tag)
-
- cpdef update(self, hash_t morph, features):
- """Update a morphological analysis with new feature values."""
- tag = (self.tags.get(morph))[0]
- features = intify_features(features)
- cdef attr_t feature
- for feature in features:
- field = FEATURE_FIELDS[FEATURE_NAMES[feature]]
- set_feature(&tag, field, feature, 1)
- morph = self.insert(tag)
- return morph
+ return self.strings[tag.key]
def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, morphology):
if orth not in self.strings:
@@ -249,19 +200,10 @@ cdef class Morphology:
"""
attrs = dict(attrs)
attrs = _normalize_props(attrs)
- self.add({self._feat_map.id2feat[feat] for feat in attrs
- if feat in self._feat_map.id2feat})
+ self.add(attrs)
attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
self.exc[(tag_str, self.strings.add(orth_str))] = attrs
- cdef hash_t insert(self, MorphAnalysisC tag) except 0:
- cdef hash_t key = hash_tag(tag)
- if self.tags.get(key) == NULL:
- tag_ptr = self.mem.alloc(1, sizeof(MorphAnalysisC))
- tag_ptr[0] = tag
- self.tags.set(key, tag_ptr)
- return key
-
cdef int assign_untagged(self, TokenC* token) except -1:
"""Set morphological attributes on a token without a POS tag. Uses
the lemmatizer's lookup() method, which looks up the string in the
@@ -322,782 +264,60 @@ cdef class Morphology:
for form_str, attrs in entries.items():
self.add_special_case(tag_str, form_str, attrs)
- @classmethod
- def create_class_map(cls):
- return MorphologyClassMap(FEATURES)
+ @staticmethod
+ def feats_to_dict(feats):
+ if not feats:
+ return {}
+ return {field: Morphology.VALUE_SEP.join(sorted(values.split(Morphology.VALUE_SEP))) for field, values in
+ [feat.split(Morphology.FIELD_SEP) for feat in feats.split(Morphology.FEATURE_SEP)]}
+
+ @staticmethod
+ def dict_to_feats(feats_dict):
+ if len(feats_dict) == 0:
+ return ""
+ return Morphology.FEATURE_SEP.join(sorted([Morphology.FIELD_SEP.join([field, Morphology.VALUE_SEP.join(sorted(values.split(Morphology.VALUE_SEP)))]) for field, values in feats_dict.items()]))
+
+ @staticmethod
+ def list_to_feats(feats_list):
+ if len(feats_list) == 0:
+ return ""
+ feats_dict = {}
+ for feat in feats_list:
+ field, value = feat.split(Morphology.FIELD_SEP)
+ if field not in feats_dict:
+ feats_dict[field] = set()
+ feats_dict[field].add(value)
+ feats_dict = {field: Morphology.VALUE_SEP.join(sorted(values)) for field, values in feats_dict.items()}
+ return Morphology.dict_to_feats(feats_dict)
-cpdef univ_pos_t get_int_tag(pos_):
- return 0
-
-cpdef intify_features(features):
- return {get_string_id(feature) for feature in features}
-
-cdef hash_t hash_tag(MorphAnalysisC tag) nogil:
- return mrmr.hash64(&tag, sizeof(tag), 0)
+cdef int check_feature(const MorphAnalysisC* morph, attr_t feature) nogil:
+ cdef int i
+ for i in range(morph.length):
+ if morph.features[i] == feature:
+ return True
+ return False
-cdef MorphAnalysisC create_rich_tag(features) except *:
- cdef MorphAnalysisC tag
- cdef attr_t feature
- memset(&tag, 0, sizeof(tag))
- for feature in features:
- field = FEATURE_FIELDS[FEATURE_NAMES[feature]]
- set_feature(&tag, field, feature, 1)
- return tag
+cdef list list_features(const MorphAnalysisC* morph):
+ cdef int i
+ features = []
+ for i in range(morph.length):
+ features.append(morph.features[i])
+ return features
-cdef tag_to_json(const MorphAnalysisC* tag):
- return [FEATURE_NAMES[f] for f in list_features(tag)]
+cdef np.ndarray get_by_field(const MorphAnalysisC* morph, attr_t field):
+ cdef np.ndarray results = numpy.zeros((morph.length,), dtype="uint64")
+ n = get_n_by_field(results.data, morph, field)
+ return results[:n]
-cdef MorphAnalysisC tag_from_json(json_tag):
- raise NotImplementedError
-
-
-cdef list list_features(const MorphAnalysisC* tag):
- output = []
- if tag.abbr != 0:
- output.append(tag.abbr)
- if tag.adp_type != 0:
- output.append(tag.adp_type)
- if tag.adv_type != 0:
- output.append(tag.adv_type)
- if tag.animacy != 0:
- output.append(tag.animacy)
- if tag.aspect != 0:
- output.append(tag.aspect)
- if tag.case != 0:
- output.append(tag.case)
- if tag.conj_type != 0:
- output.append(tag.conj_type)
- if tag.connegative != 0:
- output.append(tag.connegative)
- if tag.definite != 0:
- output.append(tag.definite)
- if tag.degree != 0:
- output.append(tag.degree)
- if tag.derivation != 0:
- output.append(tag.derivation)
- if tag.echo != 0:
- output.append(tag.echo)
- if tag.foreign != 0:
- output.append(tag.foreign)
- if tag.gender != 0:
- output.append(tag.gender)
- if tag.hyph != 0:
- output.append(tag.hyph)
- if tag.inf_form != 0:
- output.append(tag.inf_form)
- if tag.mood != 0:
- output.append(tag.mood)
- if tag.negative != 0:
- output.append(tag.negative)
- if tag.number != 0:
- output.append(tag.number)
- if tag.name_type != 0:
- output.append(tag.name_type)
- if tag.noun_type != 0:
- output.append(tag.noun_type)
- if tag.part_form != 0:
- output.append(tag.part_form)
- if tag.part_type != 0:
- output.append(tag.part_type)
- if tag.person != 0:
- output.append(tag.person)
- if tag.polite != 0:
- output.append(tag.polite)
- if tag.polarity != 0:
- output.append(tag.polarity)
- if tag.poss != 0:
- output.append(tag.poss)
- if tag.prefix != 0:
- output.append(tag.prefix)
- if tag.prep_case != 0:
- output.append(tag.prep_case)
- if tag.pron_type != 0:
- output.append(tag.pron_type)
- if tag.punct_type != 0:
- output.append(tag.punct_type)
- if tag.reflex != 0:
- output.append(tag.reflex)
- if tag.style != 0:
- output.append(tag.style)
- if tag.style_variant != 0:
- output.append(tag.style_variant)
- if tag.typo != 0:
- output.append(tag.typo)
- if tag.verb_form != 0:
- output.append(tag.verb_form)
- if tag.voice != 0:
- output.append(tag.voice)
- if tag.verb_type != 0:
- output.append(tag.verb_type)
- return output
-
-
-cdef attr_t get_field(const MorphAnalysisC* tag, int field_id) nogil:
- field = field_id
- if field == Field_POS:
- return tag.pos
- if field == Field_Abbr:
- return tag.abbr
- elif field == Field_AdpType:
- return tag.adp_type
- elif field == Field_AdvType:
- return tag.adv_type
- elif field == Field_Animacy:
- return tag.animacy
- elif field == Field_Aspect:
- return tag.aspect
- elif field == Field_Case:
- return tag.case
- elif field == Field_ConjType:
- return tag.conj_type
- elif field == Field_Connegative:
- return tag.connegative
- elif field == Field_Definite:
- return tag.definite
- elif field == Field_Degree:
- return tag.degree
- elif field == Field_Derivation:
- return tag.derivation
- elif field == Field_Echo:
- return tag.echo
- elif field == Field_Foreign:
- return tag.foreign
- elif field == Field_Gender:
- return tag.gender
- elif field == Field_Hyph:
- return tag.hyph
- elif field == Field_InfForm:
- return tag.inf_form
- elif field == Field_Mood:
- return tag.mood
- elif field == Field_Negative:
- return tag.negative
- elif field == Field_Number:
- return tag.number
- elif field == Field_NameType:
- return tag.name_type
- elif field == Field_NounType:
- return tag.noun_type
- elif field == Field_NumForm:
- return tag.num_form
- elif field == Field_NumType:
- return tag.num_type
- elif field == Field_NumValue:
- return tag.num_value
- elif field == Field_PartForm:
- return tag.part_form
- elif field == Field_PartType:
- return tag.part_type
- elif field == Field_Person:
- return tag.person
- elif field == Field_Polite:
- return tag.polite
- elif field == Field_Polarity:
- return tag.polarity
- elif field == Field_Poss:
- return tag.poss
- elif field == Field_Prefix:
- return tag.prefix
- elif field == Field_PrepCase:
- return tag.prep_case
- elif field == Field_PronType:
- return tag.pron_type
- elif field == Field_PunctSide:
- return tag.punct_side
- elif field == Field_PunctType:
- return tag.punct_type
- elif field == Field_Reflex:
- return tag.reflex
- elif field == Field_Style:
- return tag.style
- elif field == Field_StyleVariant:
- return tag.style_variant
- elif field == Field_Tense:
- return tag.tense
- elif field == Field_Typo:
- return tag.typo
- elif field == Field_VerbForm:
- return tag.verb_form
- elif field == Field_Voice:
- return tag.voice
- elif field == Field_VerbType:
- return tag.verb_type
- else:
- raise ValueError(Errors.E168.format(field=field_id))
-
-
-cdef int check_feature(const MorphAnalysisC* tag, attr_t feature) nogil:
- if tag.abbr == feature:
- return 1
- elif tag.adp_type == feature:
- return 1
- elif tag.adv_type == feature:
- return 1
- elif tag.animacy == feature:
- return 1
- elif tag.aspect == feature:
- return 1
- elif tag.case == feature:
- return 1
- elif tag.conj_type == feature:
- return 1
- elif tag.connegative == feature:
- return 1
- elif tag.definite == feature:
- return 1
- elif tag.degree == feature:
- return 1
- elif tag.derivation == feature:
- return 1
- elif tag.echo == feature:
- return 1
- elif tag.foreign == feature:
- return 1
- elif tag.gender == feature:
- return 1
- elif tag.hyph == feature:
- return 1
- elif tag.inf_form == feature:
- return 1
- elif tag.mood == feature:
- return 1
- elif tag.negative == feature:
- return 1
- elif tag.number == feature:
- return 1
- elif tag.name_type == feature:
- return 1
- elif tag.noun_type == feature:
- return 1
- elif tag.num_form == feature:
- return 1
- elif tag.num_type == feature:
- return 1
- elif tag.num_value == feature:
- return 1
- elif tag.part_form == feature:
- return 1
- elif tag.part_type == feature:
- return 1
- elif tag.person == feature:
- return 1
- elif tag.polite == feature:
- return 1
- elif tag.polarity == feature:
- return 1
- elif tag.poss == feature:
- return 1
- elif tag.prefix == feature:
- return 1
- elif tag.prep_case == feature:
- return 1
- elif tag.pron_type == feature:
- return 1
- elif tag.punct_side == feature:
- return 1
- elif tag.punct_type == feature:
- return 1
- elif tag.reflex == feature:
- return 1
- elif tag.style == feature:
- return 1
- elif tag.style_variant == feature:
- return 1
- elif tag.tense == feature:
- return 1
- elif tag.typo == feature:
- return 1
- elif tag.verb_form == feature:
- return 1
- elif tag.voice == feature:
- return 1
- elif tag.verb_type == feature:
- return 1
- else:
- return 0
-
-cdef int set_feature(MorphAnalysisC* tag,
- univ_field_t field, attr_t feature, int value) except -1:
- if value == True:
- value_ = feature
- else:
- value_ = 0
- prev_value = get_field(tag, field)
- if prev_value != 0 and value_ == 0 and field != Field_POS:
- tag.length -= 1
- elif prev_value == 0 and value_ != 0 and field != Field_POS:
- tag.length += 1
- if feature == 0:
- pass
- elif field == Field_POS:
- tag.pos = get_string_id(FEATURE_NAMES[value_].split('_')[1])
- elif field == Field_Abbr:
- tag.abbr = value_
- elif field == Field_AdpType:
- tag.adp_type = value_
- elif field == Field_AdvType:
- tag.adv_type = value_
- elif field == Field_Animacy:
- tag.animacy = value_
- elif field == Field_Aspect:
- tag.aspect = value_
- elif field == Field_Case:
- tag.case = value_
- elif field == Field_ConjType:
- tag.conj_type = value_
- elif field == Field_Connegative:
- tag.connegative = value_
- elif field == Field_Definite:
- tag.definite = value_
- elif field == Field_Degree:
- tag.degree = value_
- elif field == Field_Derivation:
- tag.derivation = value_
- elif field == Field_Echo:
- tag.echo = value_
- elif field == Field_Foreign:
- tag.foreign = value_
- elif field == Field_Gender:
- tag.gender = value_
- elif field == Field_Hyph:
- tag.hyph = value_
- elif field == Field_InfForm:
- tag.inf_form = value_
- elif field == Field_Mood:
- tag.mood = value_
- elif field == Field_Negative:
- tag.negative = value_
- elif field == Field_Number:
- tag.number = value_
- elif field == Field_NameType:
- tag.name_type = value_
- elif field == Field_NounType:
- tag.noun_type = value_
- elif field == Field_NumForm:
- tag.num_form = value_
- elif field == Field_NumType:
- tag.num_type = value_
- elif field == Field_NumValue:
- tag.num_value = value_
- elif field == Field_PartForm:
- tag.part_form = value_
- elif field == Field_PartType:
- tag.part_type = value_
- elif field == Field_Person:
- tag.person = value_
- elif field == Field_Polite:
- tag.polite = value_
- elif field == Field_Polarity:
- tag.polarity = value_
- elif field == Field_Poss:
- tag.poss = value_
- elif field == Field_Prefix:
- tag.prefix = value_
- elif field == Field_PrepCase:
- tag.prep_case = value_
- elif field == Field_PronType:
- tag.pron_type = value_
- elif field == Field_PunctSide:
- tag.punct_side = value_
- elif field == Field_PunctType:
- tag.punct_type = value_
- elif field == Field_Reflex:
- tag.reflex = value_
- elif field == Field_Style:
- tag.style = value_
- elif field == Field_StyleVariant:
- tag.style_variant = value_
- elif field == Field_Tense:
- tag.tense = value_
- elif field == Field_Typo:
- tag.typo = value_
- elif field == Field_VerbForm:
- tag.verb_form = value_
- elif field == Field_Voice:
- tag.voice = value_
- elif field == Field_VerbType:
- tag.verb_type = value_
- else:
- raise ValueError(Errors.E167.format(field=FEATURE_NAMES.get(feature), field_id=feature))
-
-
-FIELDS = {
- 'POS': Field_POS,
- 'Abbr': Field_Abbr,
- 'AdpType': Field_AdpType,
- 'AdvType': Field_AdvType,
- 'Animacy': Field_Animacy,
- 'Aspect': Field_Aspect,
- 'Case': Field_Case,
- 'ConjType': Field_ConjType,
- 'Connegative': Field_Connegative,
- 'Definite': Field_Definite,
- 'Degree': Field_Degree,
- 'Derivation': Field_Derivation,
- 'Echo': Field_Echo,
- 'Foreign': Field_Foreign,
- 'Gender': Field_Gender,
- 'Hyph': Field_Hyph,
- 'InfForm': Field_InfForm,
- 'Mood': Field_Mood,
- 'NameType': Field_NameType,
- 'Negative': Field_Negative,
- 'NounType': Field_NounType,
- 'Number': Field_Number,
- 'NumForm': Field_NumForm,
- 'NumType': Field_NumType,
- 'NumValue': Field_NumValue,
- 'PartForm': Field_PartForm,
- 'PartType': Field_PartType,
- 'Person': Field_Person,
- 'Polite': Field_Polite,
- 'Polarity': Field_Polarity,
- 'Poss': Field_Poss,
- 'Prefix': Field_Prefix,
- 'PrepCase': Field_PrepCase,
- 'PronType': Field_PronType,
- 'PunctSide': Field_PunctSide,
- 'PunctType': Field_PunctType,
- 'Reflex': Field_Reflex,
- 'Style': Field_Style,
- 'StyleVariant': Field_StyleVariant,
- 'Tense': Field_Tense,
- 'Typo': Field_Typo,
- 'VerbForm': Field_VerbForm,
- 'VerbType': Field_VerbType,
- 'Voice': Field_Voice,
-}
-
-LOWER_FIELDS = {
- 'pos': Field_POS,
- 'abbr': Field_Abbr,
- 'adp_type': Field_AdpType,
- 'adv_type': Field_AdvType,
- 'animacy': Field_Animacy,
- 'aspect': Field_Aspect,
- 'case': Field_Case,
- 'conj_type': Field_ConjType,
- 'connegative': Field_Connegative,
- 'definite': Field_Definite,
- 'degree': Field_Degree,
- 'derivation': Field_Derivation,
- 'echo': Field_Echo,
- 'foreign': Field_Foreign,
- 'gender': Field_Gender,
- 'hyph': Field_Hyph,
- 'inf_form': Field_InfForm,
- 'mood': Field_Mood,
- 'name_type': Field_NameType,
- 'negative': Field_Negative,
- 'noun_type': Field_NounType,
- 'number': Field_Number,
- 'num_form': Field_NumForm,
- 'num_type': Field_NumType,
- 'num_value': Field_NumValue,
- 'part_form': Field_PartForm,
- 'part_type': Field_PartType,
- 'person': Field_Person,
- 'polarity': Field_Polarity,
- 'polite': Field_Polite,
- 'poss': Field_Poss,
- 'prefix': Field_Prefix,
- 'prep_case': Field_PrepCase,
- 'pron_type': Field_PronType,
- 'punct_side': Field_PunctSide,
- 'punct_type': Field_PunctType,
- 'reflex': Field_Reflex,
- 'style': Field_Style,
- 'style_variant': Field_StyleVariant,
- 'tense': Field_Tense,
- 'typo': Field_Typo,
- 'verb_form': Field_VerbForm,
- 'verb_type': Field_VerbType,
- 'voice': Field_Voice,
-}
-
-
-FEATURES = [
- "POS_ADJ",
- "POS_ADP",
- "POS_ADV",
- "POS_AUX",
- "POS_CONJ",
- "POS_CCONJ",
- "POS_DET",
- "POS_INTJ",
- "POS_NOUN",
- "POS_NUM",
- "POS_PART",
- "POS_PRON",
- "POS_PROPN",
- "POS_PUNCT",
- "POS_SCONJ",
- "POS_SYM",
- "POS_VERB",
- "POS_X",
- "POS_EOL",
- "POS_SPACE",
- "Abbr_yes",
- "AdpType_circ",
- "AdpType_comprep",
- "AdpType_prep",
- "AdpType_post",
- "AdpType_voc",
- "AdvType_adadj",
- "AdvType_cau",
- "AdvType_deg",
- "AdvType_ex",
- "AdvType_loc",
- "AdvType_man",
- "AdvType_mod",
- "AdvType_sta",
- "AdvType_tim",
- "Animacy_anim",
- "Animacy_hum",
- "Animacy_inan",
- "Animacy_nhum",
- "Aspect_hab",
- "Aspect_imp",
- "Aspect_iter",
- "Aspect_perf",
- "Aspect_prog",
- "Aspect_prosp",
- "Aspect_none",
- "Case_abe",
- "Case_abl",
- "Case_abs",
- "Case_acc",
- "Case_ade",
- "Case_all",
- "Case_cau",
- "Case_com",
- "Case_dat",
- "Case_del",
- "Case_dis",
- "Case_ela",
- "Case_ess",
- "Case_gen",
- "Case_ill",
- "Case_ine",
- "Case_ins",
- "Case_loc",
- "Case_lat",
- "Case_nom",
- "Case_par",
- "Case_sub",
- "Case_sup",
- "Case_tem",
- "Case_ter",
- "Case_tra",
- "Case_voc",
- "ConjType_comp",
- "ConjType_oper",
- "Connegative_yes",
- "Definite_cons",
- "Definite_def",
- "Definite_ind",
- "Definite_red",
- "Definite_two",
- "Degree_abs",
- "Degree_cmp",
- "Degree_comp",
- "Degree_none",
- "Degree_pos",
- "Degree_sup",
- "Degree_com",
- "Degree_dim",
- "Derivation_minen",
- "Derivation_sti",
- "Derivation_inen",
- "Derivation_lainen",
- "Derivation_ja",
- "Derivation_ton",
- "Derivation_vs",
- "Derivation_ttain",
- "Derivation_ttaa",
- "Echo_rdp",
- "Echo_ech",
- "Foreign_foreign",
- "Foreign_fscript",
- "Foreign_tscript",
- "Foreign_yes",
- "Gender_com",
- "Gender_fem",
- "Gender_masc",
- "Gender_neut",
- "Gender_dat_masc",
- "Gender_dat_fem",
- "Gender_erg_masc",
- "Gender_erg_fem",
- "Gender_psor_masc",
- "Gender_psor_fem",
- "Gender_psor_neut",
- "Hyph_yes",
- "InfForm_one",
- "InfForm_two",
- "InfForm_three",
- "Mood_cnd",
- "Mood_imp",
- "Mood_ind",
- "Mood_n",
- "Mood_pot",
- "Mood_sub",
- "Mood_opt",
- "NameType_geo",
- "NameType_prs",
- "NameType_giv",
- "NameType_sur",
- "NameType_nat",
- "NameType_com",
- "NameType_pro",
- "NameType_oth",
- "Negative_neg",
- "Negative_pos",
- "Negative_yes",
- "NounType_com",
- "NounType_prop",
- "NounType_class",
- "Number_com",
- "Number_dual",
- "Number_none",
- "Number_plur",
- "Number_sing",
- "Number_ptan",
- "Number_count",
- "Number_abs_sing",
- "Number_abs_plur",
- "Number_dat_sing",
- "Number_dat_plur",
- "Number_erg_sing",
- "Number_erg_plur",
- "Number_psee_sing",
- "Number_psee_plur",
- "Number_psor_sing",
- "Number_psor_plur",
- "NumForm_digit",
- "NumForm_roman",
- "NumForm_word",
- "NumForm_combi",
- "NumType_card",
- "NumType_dist",
- "NumType_frac",
- "NumType_gen",
- "NumType_mult",
- "NumType_none",
- "NumType_ord",
- "NumType_sets",
- "NumType_dual",
- "NumValue_one",
- "NumValue_two",
- "NumValue_three",
- "PartForm_pres",
- "PartForm_past",
- "PartForm_agt",
- "PartForm_neg",
- "PartType_mod",
- "PartType_emp",
- "PartType_res",
- "PartType_inf",
- "PartType_vbp",
- "Person_one",
- "Person_two",
- "Person_three",
- "Person_none",
- "Person_abs_one",
- "Person_abs_two",
- "Person_abs_three",
- "Person_dat_one",
- "Person_dat_two",
- "Person_dat_three",
- "Person_erg_one",
- "Person_erg_two",
- "Person_erg_three",
- "Person_psor_one",
- "Person_psor_two",
- "Person_psor_three",
- "Polarity_neg",
- "Polarity_pos",
- "Polite_inf",
- "Polite_pol",
- "Polite_abs_inf",
- "Polite_abs_pol",
- "Polite_erg_inf",
- "Polite_erg_pol",
- "Polite_dat_inf",
- "Polite_dat_pol",
- "Poss_yes",
- "Prefix_yes",
- "PrepCase_npr",
- "PrepCase_pre",
- "PronType_advPart",
- "PronType_art",
- "PronType_default",
- "PronType_dem",
- "PronType_ind",
- "PronType_int",
- "PronType_neg",
- "PronType_prs",
- "PronType_rcp",
- "PronType_rel",
- "PronType_tot",
- "PronType_clit",
- "PronType_exc",
- "PunctSide_ini",
- "PunctSide_fin",
- "PunctType_peri",
- "PunctType_qest",
- "PunctType_excl",
- "PunctType_quot",
- "PunctType_brck",
- "PunctType_comm",
- "PunctType_colo",
- "PunctType_semi",
- "PunctType_dash",
- "Reflex_yes",
- "Style_arch",
- "Style_rare",
- "Style_poet",
- "Style_norm",
- "Style_coll",
- "Style_vrnc",
- "Style_sing",
- "Style_expr",
- "Style_derg",
- "Style_vulg",
- "Style_yes",
- "StyleVariant_styleShort",
- "StyleVariant_styleBound",
- "Tense_fut",
- "Tense_imp",
- "Tense_past",
- "Tense_pres",
- "Typo_yes",
- "VerbForm_fin",
- "VerbForm_ger",
- "VerbForm_inf",
- "VerbForm_none",
- "VerbForm_part",
- "VerbForm_partFut",
- "VerbForm_partPast",
- "VerbForm_partPres",
- "VerbForm_sup",
- "VerbForm_trans",
- "VerbForm_conv",
- "VerbForm_gdv",
- "VerbType_aux",
- "VerbType_cop",
- "VerbType_mod",
- "VerbType_light",
- "Voice_act",
- "Voice_cau",
- "Voice_pass",
- "Voice_mid",
- "Voice_int",
-]
-
-FEATURE_NAMES = {get_string_id(f): f for f in FEATURES}
-FEATURE_FIELDS = {f: FIELDS[f.split('_', 1)[0]] for f in FEATURES}
+cdef int get_n_by_field(attr_t* results, const MorphAnalysisC* morph, attr_t field) nogil:
+ cdef int n_results = 0
+ cdef int i
+ for i in range(morph.length):
+ if morph.fields[i] == field:
+ results[n_results] = morph.features[i]
+ n_results += 1
+ return n_results
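Note (illustrative only): the staticmethods added to Morphology above convert between the UD FEATS string, a dict of fields to comma-joined values, and a list of Field=Value strings. A rough usage sketch, assuming a built checkout so that spacy.morphology is importable:

    from spacy.morphology import Morphology

    feats = "Case=Nom|Number=Plur,Sing"
    as_dict = Morphology.feats_to_dict(feats)   # {"Case": "Nom", "Number": "Plur,Sing"}
    assert Morphology.dict_to_feats(as_dict) == feats
    assert Morphology.list_to_feats(["Number=Sing", "Number=Plur", "Case=Nom"]) == feats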
diff --git a/spacy/structs.pxd b/spacy/structs.pxd
index b3878db3f..259fd657d 100644
--- a/spacy/structs.pxd
+++ b/spacy/structs.pxd
@@ -82,52 +82,11 @@ cdef struct TokenC:
cdef struct MorphAnalysisC:
- univ_pos_t pos
+ hash_t key
int length
-
- attr_t abbr
- attr_t adp_type
- attr_t adv_type
- attr_t animacy
- attr_t aspect
- attr_t case
- attr_t conj_type
- attr_t connegative
- attr_t definite
- attr_t degree
- attr_t derivation
- attr_t echo
- attr_t foreign
- attr_t gender
- attr_t hyph
- attr_t inf_form
- attr_t mood
- attr_t negative
- attr_t number
- attr_t name_type
- attr_t noun_type
- attr_t num_form
- attr_t num_type
- attr_t num_value
- attr_t part_form
- attr_t part_type
- attr_t person
- attr_t polite
- attr_t polarity
- attr_t poss
- attr_t prefix
- attr_t prep_case
- attr_t pron_type
- attr_t punct_side
- attr_t punct_type
- attr_t reflex
- attr_t style
- attr_t style_variant
- attr_t tense
- attr_t typo
- attr_t verb_form
- attr_t voice
- attr_t verb_type
+ attr_t* fields
+ attr_t* features
+
# Internal struct, for storage and disambiguation of entities.
cdef struct KBEntryC:
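Note (conceptual sketch only): the slimmed-down MorphAnalysisC above stores a hash key plus two parallel arrays instead of one attr slot per feature. In Python terms, an analysis such as "Mood=Ind|Number=Plur,Sing" is laid out roughly as below; the real arrays hold StringStore hashes rather than the strings shown here (see create_morph_tag in the morphology.pyx hunk):

    field_feature_pairs = [
        ("Mood", "Mood=Ind"),
        ("Number", "Number=Plur"),
        ("Number", "Number=Sing"),
    ]
    fields = [f for f, _ in field_feature_pairs]       # -> MorphAnalysisC.fields
    features = [ff for _, ff in field_feature_pairs]   # -> MorphAnalysisC.features
    assert len(fields) == len(features) == 3           # -> MorphAnalysisC.length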
diff --git a/spacy/symbols.pxd b/spacy/symbols.pxd
index 5922ee588..b95b4b805 100644
--- a/spacy/symbols.pxd
+++ b/spacy/symbols.pxd
@@ -108,282 +108,282 @@ cdef enum symbol_t:
EOL
SPACE
- Animacy_anim
- Animacy_inan
- Animacy_hum # U20
- Animacy_nhum
- Aspect_freq
- Aspect_imp
- Aspect_mod
- Aspect_none
- Aspect_perf
- Aspect_iter # U20
- Aspect_hab # U20
- Case_abe
- Case_abl
- Case_abs
- Case_acc
- Case_ade
- Case_all
- Case_cau
- Case_com
- Case_cmp # U20
- Case_dat
- Case_del
- Case_dis
- Case_ela
- Case_equ # U20
- Case_ess
- Case_gen
- Case_ill
- Case_ine
- Case_ins
- Case_loc
- Case_lat
- Case_nom
- Case_par
- Case_sub
- Case_sup
- Case_tem
- Case_ter
- Case_tra
- Case_voc
- Definite_two
- Definite_def
- Definite_red
- Definite_cons # U20
- Definite_ind
- Definite_spec # U20
- Degree_cmp
- Degree_comp
- Degree_none
- Degree_pos
- Degree_sup
- Degree_abs
- Degree_com
- Degree_dim # du
- Degree_equ # U20
- Evident_nfh # U20
- Gender_com
- Gender_fem
- Gender_masc
- Gender_neut
- Mood_cnd
- Mood_imp
- Mood_ind
- Mood_n
- Mood_pot
- Mood_sub
- Mood_opt
- Mood_prp # U20
- Mood_adm # U20
- Negative_neg
- Negative_pos
- Negative_yes
- Polarity_neg # U20
- Polarity_pos # U20
- Number_com
- Number_dual
- Number_none
- Number_plur
- Number_sing
- Number_ptan # bg
- Number_count # bg, U20
- Number_tri # U20
- NumType_card
- NumType_dist
- NumType_frac
- NumType_gen
- NumType_mult
- NumType_none
- NumType_ord
- NumType_sets
- Person_one
- Person_two
- Person_three
- Person_none
- Poss_yes
- PronType_advPart
- PronType_art
- PronType_default
- PronType_dem
- PronType_ind
- PronType_int
- PronType_neg
- PronType_prs
- PronType_rcp
- PronType_rel
- PronType_tot
- PronType_clit
- PronType_exc # es, ca, it, fa, U20
- PronType_emp # U20
- Reflex_yes
- Tense_fut
- Tense_imp
- Tense_past
- Tense_pres
- VerbForm_fin
- VerbForm_ger
- VerbForm_inf
- VerbForm_none
- VerbForm_part
- VerbForm_partFut
- VerbForm_partPast
- VerbForm_partPres
- VerbForm_sup
- VerbForm_trans
- VerbForm_conv # U20
- VerbForm_gdv # la
- VerbForm_vnoun # U20
- Voice_act
- Voice_cau
- Voice_pass
- Voice_mid # gkc, U20
- Voice_int # hb
- Voice_antip # U20
- Voice_dir # U20
- Voice_inv # U20
- Abbr_yes # cz, fi, sl, U
- AdpType_prep # cz, U
- AdpType_post # U
- AdpType_voc # cz
- AdpType_comprep # cz
- AdpType_circ # U
- AdvType_man
- AdvType_loc
- AdvType_tim
- AdvType_deg
- AdvType_cau
- AdvType_mod
- AdvType_sta
- AdvType_ex
- AdvType_adadj
- ConjType_oper # cz, U
- ConjType_comp # cz, U
- Connegative_yes # fi
- Derivation_minen # fi
- Derivation_sti # fi
- Derivation_inen # fi
- Derivation_lainen # fi
- Derivation_ja # fi
- Derivation_ton # fi
- Derivation_vs # fi
- Derivation_ttain # fi
- Derivation_ttaa # fi
- Echo_rdp # U
- Echo_ech # U
- Foreign_foreign # cz, fi, U
- Foreign_fscript # cz, fi, U
- Foreign_tscript # cz, U
- Foreign_yes # sl
- Gender_dat_masc # bq, U
- Gender_dat_fem # bq, U
- Gender_erg_masc # bq
- Gender_erg_fem # bq
- Gender_psor_masc # cz, sl, U
- Gender_psor_fem # cz, sl, U
- Gender_psor_neut # sl
- Hyph_yes # cz, U
- InfForm_one # fi
- InfForm_two # fi
- InfForm_three # fi
- NameType_geo # U, cz
- NameType_prs # U, cz
- NameType_giv # U, cz
- NameType_sur # U, cz
- NameType_nat # U, cz
- NameType_com # U, cz
- NameType_pro # U, cz
- NameType_oth # U, cz
- NounType_com # U
- NounType_prop # U
- NounType_class # U
- Number_abs_sing # bq, U
- Number_abs_plur # bq, U
- Number_dat_sing # bq, U
- Number_dat_plur # bq, U
- Number_erg_sing # bq, U
- Number_erg_plur # bq, U
- Number_psee_sing # U
- Number_psee_plur # U
- Number_psor_sing # cz, fi, sl, U
- Number_psor_plur # cz, fi, sl, U
- Number_pauc # U20
- Number_grpa # U20
- Number_grpl # U20
- Number_inv # U20
- NumForm_digit # cz, sl, U
- NumForm_roman # cz, sl, U
- NumForm_word # cz, sl, U
- NumValue_one # cz, U
- NumValue_two # cz, U
- NumValue_three # cz, U
- PartForm_pres # fi
- PartForm_past # fi
- PartForm_agt # fi
- PartForm_neg # fi
- PartType_mod # U
- PartType_emp # U
- PartType_res # U
- PartType_inf # U
- PartType_vbp # U
- Person_abs_one # bq, U
- Person_abs_two # bq, U
- Person_abs_three # bq, U
- Person_dat_one # bq, U
- Person_dat_two # bq, U
- Person_dat_three # bq, U
- Person_erg_one # bq, U
- Person_erg_two # bq, U
- Person_erg_three # bq, U
- Person_psor_one # fi, U
- Person_psor_two # fi, U
- Person_psor_three # fi, U
- Person_zero # U20
- Person_four # U20
- Polite_inf # bq, U
- Polite_pol # bq, U
- Polite_abs_inf # bq, U
- Polite_abs_pol # bq, U
- Polite_erg_inf # bq, U
- Polite_erg_pol # bq, U
- Polite_dat_inf # bq, U
- Polite_dat_pol # bq, U
- Polite_infm # U20
- Polite_form # U20
- Polite_form_elev # U20
- Polite_form_humb # U20
- Prefix_yes # U
- PrepCase_npr # cz
- PrepCase_pre # U
- PunctSide_ini # U
- PunctSide_fin # U
- PunctType_peri # U
- PunctType_qest # U
- PunctType_excl # U
- PunctType_quot # U
- PunctType_brck # U
- PunctType_comm # U
- PunctType_colo # U
- PunctType_semi # U
- PunctType_dash # U
- Style_arch # cz, fi, U
- Style_rare # cz, fi, U
- Style_poet # cz, U
- Style_norm # cz, U
- Style_coll # cz, U
- Style_vrnc # cz, U
- Style_sing # cz, U
- Style_expr # cz, U
- Style_derg # cz, U
- Style_vulg # cz, U
- Style_yes # fi, U
- StyleVariant_styleShort # cz
- StyleVariant_styleBound # cz, sl
- VerbType_aux # U
- VerbType_cop # U
- VerbType_mod # U
- VerbType_light # U
+ DEPRECATED001
+ DEPRECATED002
+ DEPRECATED003
+ DEPRECATED004
+ DEPRECATED005
+ DEPRECATED006
+ DEPRECATED007
+ DEPRECATED008
+ DEPRECATED009
+ DEPRECATED010
+ DEPRECATED011
+ DEPRECATED012
+ DEPRECATED013
+ DEPRECATED014
+ DEPRECATED015
+ DEPRECATED016
+ DEPRECATED017
+ DEPRECATED018
+ DEPRECATED019
+ DEPRECATED020
+ DEPRECATED021
+ DEPRECATED022
+ DEPRECATED023
+ DEPRECATED024
+ DEPRECATED025
+ DEPRECATED026
+ DEPRECATED027
+ DEPRECATED028
+ DEPRECATED029
+ DEPRECATED030
+ DEPRECATED031
+ DEPRECATED032
+ DEPRECATED033
+ DEPRECATED034
+ DEPRECATED035
+ DEPRECATED036
+ DEPRECATED037
+ DEPRECATED038
+ DEPRECATED039
+ DEPRECATED040
+ DEPRECATED041
+ DEPRECATED042
+ DEPRECATED043
+ DEPRECATED044
+ DEPRECATED045
+ DEPRECATED046
+ DEPRECATED047
+ DEPRECATED048
+ DEPRECATED049
+ DEPRECATED050
+ DEPRECATED051
+ DEPRECATED052
+ DEPRECATED053
+ DEPRECATED054
+ DEPRECATED055
+ DEPRECATED056
+ DEPRECATED057
+ DEPRECATED058
+ DEPRECATED059
+ DEPRECATED060
+ DEPRECATED061
+ DEPRECATED062
+ DEPRECATED063
+ DEPRECATED064
+ DEPRECATED065
+ DEPRECATED066
+ DEPRECATED067
+ DEPRECATED068
+ DEPRECATED069
+ DEPRECATED070
+ DEPRECATED071
+ DEPRECATED072
+ DEPRECATED073
+ DEPRECATED074
+ DEPRECATED075
+ DEPRECATED076
+ DEPRECATED077
+ DEPRECATED078
+ DEPRECATED079
+ DEPRECATED080
+ DEPRECATED081
+ DEPRECATED082
+ DEPRECATED083
+ DEPRECATED084
+ DEPRECATED085
+ DEPRECATED086
+ DEPRECATED087
+ DEPRECATED088
+ DEPRECATED089
+ DEPRECATED090
+ DEPRECATED091
+ DEPRECATED092
+ DEPRECATED093
+ DEPRECATED094
+ DEPRECATED095
+ DEPRECATED096
+ DEPRECATED097
+ DEPRECATED098
+ DEPRECATED099
+ DEPRECATED100
+ DEPRECATED101
+ DEPRECATED102
+ DEPRECATED103
+ DEPRECATED104
+ DEPRECATED105
+ DEPRECATED106
+ DEPRECATED107
+ DEPRECATED108
+ DEPRECATED109
+ DEPRECATED110
+ DEPRECATED111
+ DEPRECATED112
+ DEPRECATED113
+ DEPRECATED114
+ DEPRECATED115
+ DEPRECATED116
+ DEPRECATED117
+ DEPRECATED118
+ DEPRECATED119
+ DEPRECATED120
+ DEPRECATED121
+ DEPRECATED122
+ DEPRECATED123
+ DEPRECATED124
+ DEPRECATED125
+ DEPRECATED126
+ DEPRECATED127
+ DEPRECATED128
+ DEPRECATED129
+ DEPRECATED130
+ DEPRECATED131
+ DEPRECATED132
+ DEPRECATED133
+ DEPRECATED134
+ DEPRECATED135
+ DEPRECATED136
+ DEPRECATED137
+ DEPRECATED138
+ DEPRECATED139
+ DEPRECATED140
+ DEPRECATED141
+ DEPRECATED142
+ DEPRECATED143
+ DEPRECATED144
+ DEPRECATED145
+ DEPRECATED146
+ DEPRECATED147
+ DEPRECATED148
+ DEPRECATED149
+ DEPRECATED150
+ DEPRECATED151
+ DEPRECATED152
+ DEPRECATED153
+ DEPRECATED154
+ DEPRECATED155
+ DEPRECATED156
+ DEPRECATED157
+ DEPRECATED158
+ DEPRECATED159
+ DEPRECATED160
+ DEPRECATED161
+ DEPRECATED162
+ DEPRECATED163
+ DEPRECATED164
+ DEPRECATED165
+ DEPRECATED166
+ DEPRECATED167
+ DEPRECATED168
+ DEPRECATED169
+ DEPRECATED170
+ DEPRECATED171
+ DEPRECATED172
+ DEPRECATED173
+ DEPRECATED174
+ DEPRECATED175
+ DEPRECATED176
+ DEPRECATED177
+ DEPRECATED178
+ DEPRECATED179
+ DEPRECATED180
+ DEPRECATED181
+ DEPRECATED182
+ DEPRECATED183
+ DEPRECATED184
+ DEPRECATED185
+ DEPRECATED186
+ DEPRECATED187
+ DEPRECATED188
+ DEPRECATED189
+ DEPRECATED190
+ DEPRECATED191
+ DEPRECATED192
+ DEPRECATED193
+ DEPRECATED194
+ DEPRECATED195
+ DEPRECATED196
+ DEPRECATED197
+ DEPRECATED198
+ DEPRECATED199
+ DEPRECATED200
+ DEPRECATED201
+ DEPRECATED202
+ DEPRECATED203
+ DEPRECATED204
+ DEPRECATED205
+ DEPRECATED206
+ DEPRECATED207
+ DEPRECATED208
+ DEPRECATED209
+ DEPRECATED210
+ DEPRECATED211
+ DEPRECATED212
+ DEPRECATED213
+ DEPRECATED214
+ DEPRECATED215
+ DEPRECATED216
+ DEPRECATED217
+ DEPRECATED218
+ DEPRECATED219
+ DEPRECATED220
+ DEPRECATED221
+ DEPRECATED222
+ DEPRECATED223
+ DEPRECATED224
+ DEPRECATED225
+ DEPRECATED226
+ DEPRECATED227
+ DEPRECATED228
+ DEPRECATED229
+ DEPRECATED230
+ DEPRECATED231
+ DEPRECATED232
+ DEPRECATED233
+ DEPRECATED234
+ DEPRECATED235
+ DEPRECATED236
+ DEPRECATED237
+ DEPRECATED238
+ DEPRECATED239
+ DEPRECATED240
+ DEPRECATED241
+ DEPRECATED242
+ DEPRECATED243
+ DEPRECATED244
+ DEPRECATED245
+ DEPRECATED246
+ DEPRECATED247
+ DEPRECATED248
+ DEPRECATED249
+ DEPRECATED250
+ DEPRECATED251
+ DEPRECATED252
+ DEPRECATED253
+ DEPRECATED254
+ DEPRECATED255
+ DEPRECATED256
+ DEPRECATED257
+ DEPRECATED258
+ DEPRECATED259
+ DEPRECATED260
+ DEPRECATED261
+ DEPRECATED262
+ DEPRECATED263
+ DEPRECATED264
+ DEPRECATED265
+ DEPRECATED266
+ DEPRECATED267
+ DEPRECATED268
+ DEPRECATED269
+ DEPRECATED270
+ DEPRECATED271
+ DEPRECATED272
+ DEPRECATED273
+ DEPRECATED274
+ DEPRECATED275
+ DEPRECATED276
PERSON
NORP
diff --git a/spacy/symbols.pyx b/spacy/symbols.pyx
index 85f23ccbc..36b9ffa67 100644
--- a/spacy/symbols.pyx
+++ b/spacy/symbols.pyx
@@ -110,282 +110,282 @@ IDS = {
"EOL": EOL,
"SPACE": SPACE,
- "Animacy_anim": Animacy_anim,
- "Animacy_inam": Animacy_inan,
- "Animacy_hum": Animacy_hum, # U20
- "Animacy_nhum": Animacy_nhum,
- "Aspect_freq": Aspect_freq,
- "Aspect_imp": Aspect_imp,
- "Aspect_mod": Aspect_mod,
- "Aspect_none": Aspect_none,
- "Aspect_perf": Aspect_perf,
- "Aspect_iter": Aspect_iter, # U20
- "Aspect_hab": Aspect_hab, # U20
- "Case_abe": Case_abe,
- "Case_abl": Case_abl,
- "Case_abs": Case_abs,
- "Case_acc": Case_acc,
- "Case_ade": Case_ade,
- "Case_all": Case_all,
- "Case_cau": Case_cau,
- "Case_com": Case_com,
- "Case_cmp": Case_cmp, # U20
- "Case_dat": Case_dat,
- "Case_del": Case_del,
- "Case_dis": Case_dis,
- "Case_ela": Case_ela,
- "Case_equ": Case_equ, # U20
- "Case_ess": Case_ess,
- "Case_gen": Case_gen,
- "Case_ill": Case_ill,
- "Case_ine": Case_ine,
- "Case_ins": Case_ins,
- "Case_loc": Case_loc,
- "Case_lat": Case_lat,
- "Case_nom": Case_nom,
- "Case_par": Case_par,
- "Case_sub": Case_sub,
- "Case_sup": Case_sup,
- "Case_tem": Case_tem,
- "Case_ter": Case_ter,
- "Case_tra": Case_tra,
- "Case_voc": Case_voc,
- "Definite_two": Definite_two,
- "Definite_def": Definite_def,
- "Definite_red": Definite_red,
- "Definite_cons": Definite_cons, # U20
- "Definite_ind": Definite_ind,
- "Definite_spec": Definite_spec, # U20
- "Degree_cmp": Degree_cmp,
- "Degree_comp": Degree_comp,
- "Degree_none": Degree_none,
- "Degree_pos": Degree_pos,
- "Degree_sup": Degree_sup,
- "Degree_abs": Degree_abs,
- "Degree_com": Degree_com,
- "Degree_dim": Degree_dim, # du
- "Degree_equ": Degree_equ, # U20
- "Evident_nfh": Evident_nfh, # U20
- "Gender_com": Gender_com,
- "Gender_fem": Gender_fem,
- "Gender_masc": Gender_masc,
- "Gender_neut": Gender_neut,
- "Mood_cnd": Mood_cnd,
- "Mood_imp": Mood_imp,
- "Mood_ind": Mood_ind,
- "Mood_n": Mood_n,
- "Mood_pot": Mood_pot,
- "Mood_sub": Mood_sub,
- "Mood_opt": Mood_opt,
- "Mood_prp": Mood_prp, # U20
- "Mood_adm": Mood_adm, # U20
- "Negative_neg": Negative_neg,
- "Negative_pos": Negative_pos,
- "Negative_yes": Negative_yes,
- "Polarity_neg": Polarity_neg, # U20
- "Polarity_pos": Polarity_pos, # U20
- "Number_com": Number_com,
- "Number_dual": Number_dual,
- "Number_none": Number_none,
- "Number_plur": Number_plur,
- "Number_sing": Number_sing,
- "Number_ptan": Number_ptan, # bg
- "Number_count": Number_count, # bg, U20
- "Number_tri": Number_tri, # U20
- "NumType_card": NumType_card,
- "NumType_dist": NumType_dist,
- "NumType_frac": NumType_frac,
- "NumType_gen": NumType_gen,
- "NumType_mult": NumType_mult,
- "NumType_none": NumType_none,
- "NumType_ord": NumType_ord,
- "NumType_sets": NumType_sets,
- "Person_one": Person_one,
- "Person_two": Person_two,
- "Person_three": Person_three,
- "Person_none": Person_none,
- "Poss_yes": Poss_yes,
- "PronType_advPart": PronType_advPart,
- "PronType_art": PronType_art,
- "PronType_default": PronType_default,
- "PronType_dem": PronType_dem,
- "PronType_ind": PronType_ind,
- "PronType_int": PronType_int,
- "PronType_neg": PronType_neg,
- "PronType_prs": PronType_prs,
- "PronType_rcp": PronType_rcp,
- "PronType_rel": PronType_rel,
- "PronType_tot": PronType_tot,
- "PronType_clit": PronType_clit,
- "PronType_exc": PronType_exc, # es, ca, it, fa, U20
- "PronType_emp": PronType_emp, # U20
- "Reflex_yes": Reflex_yes,
- "Tense_fut": Tense_fut,
- "Tense_imp": Tense_imp,
- "Tense_past": Tense_past,
- "Tense_pres": Tense_pres,
- "VerbForm_fin": VerbForm_fin,
- "VerbForm_ger": VerbForm_ger,
- "VerbForm_inf": VerbForm_inf,
- "VerbForm_none": VerbForm_none,
- "VerbForm_part": VerbForm_part,
- "VerbForm_partFut": VerbForm_partFut,
- "VerbForm_partPast": VerbForm_partPast,
- "VerbForm_partPres": VerbForm_partPres,
- "VerbForm_sup": VerbForm_sup,
- "VerbForm_trans": VerbForm_trans,
- "VerbForm_conv": VerbForm_conv, # U20
- "VerbForm_gdv": VerbForm_gdv, # la,
- "VerbForm_vnoun": VerbForm_vnoun, # U20
- "Voice_act": Voice_act,
- "Voice_cau": Voice_cau,
- "Voice_pass": Voice_pass,
- "Voice_mid": Voice_mid, # gkc, U20
- "Voice_int": Voice_int, # hb,
- "Voice_antip": Voice_antip, # U20
- "Voice_dir": Voice_dir, # U20
- "Voice_inv": Voice_inv, # U20
- "Abbr_yes": Abbr_yes, # cz, fi, sl, U,
- "AdpType_prep": AdpType_prep, # cz, U,
- "AdpType_post": AdpType_post, # U,
- "AdpType_voc": AdpType_voc, # cz,
- "AdpType_comprep": AdpType_comprep, # cz,
- "AdpType_circ": AdpType_circ, # U,
- "AdvType_man": AdvType_man,
- "AdvType_loc": AdvType_loc,
- "AdvType_tim": AdvType_tim,
- "AdvType_deg": AdvType_deg,
- "AdvType_cau": AdvType_cau,
- "AdvType_mod": AdvType_mod,
- "AdvType_sta": AdvType_sta,
- "AdvType_ex": AdvType_ex,
- "AdvType_adadj": AdvType_adadj,
- "ConjType_oper": ConjType_oper, # cz, U,
- "ConjType_comp": ConjType_comp, # cz, U,
- "Connegative_yes": Connegative_yes, # fi,
- "Derivation_minen": Derivation_minen, # fi,
- "Derivation_sti": Derivation_sti, # fi,
- "Derivation_inen": Derivation_inen, # fi,
- "Derivation_lainen": Derivation_lainen, # fi,
- "Derivation_ja": Derivation_ja, # fi,
- "Derivation_ton": Derivation_ton, # fi,
- "Derivation_vs": Derivation_vs, # fi,
- "Derivation_ttain": Derivation_ttain, # fi,
- "Derivation_ttaa": Derivation_ttaa, # fi,
- "Echo_rdp": Echo_rdp, # U,
- "Echo_ech": Echo_ech, # U,
- "Foreign_foreign": Foreign_foreign, # cz, fi, U,
- "Foreign_fscript": Foreign_fscript, # cz, fi, U,
- "Foreign_tscript": Foreign_tscript, # cz, U,
- "Foreign_yes": Foreign_yes, # sl,
- "Gender_dat_masc": Gender_dat_masc, # bq, U,
- "Gender_dat_fem": Gender_dat_fem, # bq, U,
- "Gender_erg_masc": Gender_erg_masc, # bq,
- "Gender_erg_fem": Gender_erg_fem, # bq,
- "Gender_psor_masc": Gender_psor_masc, # cz, sl, U,
- "Gender_psor_fem": Gender_psor_fem, # cz, sl, U,
- "Gender_psor_neut": Gender_psor_neut, # sl,
- "Hyph_yes": Hyph_yes, # cz, U,
- "InfForm_one": InfForm_one, # fi,
- "InfForm_two": InfForm_two, # fi,
- "InfForm_three": InfForm_three, # fi,
- "NameType_geo": NameType_geo, # U, cz,
- "NameType_prs": NameType_prs, # U, cz,
- "NameType_giv": NameType_giv, # U, cz,
- "NameType_sur": NameType_sur, # U, cz,
- "NameType_nat": NameType_nat, # U, cz,
- "NameType_com": NameType_com, # U, cz,
- "NameType_pro": NameType_pro, # U, cz,
- "NameType_oth": NameType_oth, # U, cz,
- "NounType_com": NounType_com, # U,
- "NounType_prop": NounType_prop, # U,
- "NounType_class": NounType_class, # U,
- "Number_abs_sing": Number_abs_sing, # bq, U,
- "Number_abs_plur": Number_abs_plur, # bq, U,
- "Number_dat_sing": Number_dat_sing, # bq, U,
- "Number_dat_plur": Number_dat_plur, # bq, U,
- "Number_erg_sing": Number_erg_sing, # bq, U,
- "Number_erg_plur": Number_erg_plur, # bq, U,
- "Number_psee_sing": Number_psee_sing, # U,
- "Number_psee_plur": Number_psee_plur, # U,
- "Number_psor_sing": Number_psor_sing, # cz, fi, sl, U,
- "Number_psor_plur": Number_psor_plur, # cz, fi, sl, U,
- "Number_pauc": Number_pauc, # U20
- "Number_grpa": Number_grpa, # U20
- "Number_grpl": Number_grpl, # U20
- "Number_inv": Number_inv, # U20
- "NumForm_digit": NumForm_digit, # cz, sl, U,
- "NumForm_roman": NumForm_roman, # cz, sl, U,
- "NumForm_word": NumForm_word, # cz, sl, U,
- "NumValue_one": NumValue_one, # cz, U,
- "NumValue_two": NumValue_two, # cz, U,
- "NumValue_three": NumValue_three, # cz, U,
- "PartForm_pres": PartForm_pres, # fi,
- "PartForm_past": PartForm_past, # fi,
- "PartForm_agt": PartForm_agt, # fi,
- "PartForm_neg": PartForm_neg, # fi,
- "PartType_mod": PartType_mod, # U,
- "PartType_emp": PartType_emp, # U,
- "PartType_res": PartType_res, # U,
- "PartType_inf": PartType_inf, # U,
- "PartType_vbp": PartType_vbp, # U,
- "Person_abs_one": Person_abs_one, # bq, U,
- "Person_abs_two": Person_abs_two, # bq, U,
- "Person_abs_three": Person_abs_three, # bq, U,
- "Person_dat_one": Person_dat_one, # bq, U,
- "Person_dat_two": Person_dat_two, # bq, U,
- "Person_dat_three": Person_dat_three, # bq, U,
- "Person_erg_one": Person_erg_one, # bq, U,
- "Person_erg_two": Person_erg_two, # bq, U,
- "Person_erg_three": Person_erg_three, # bq, U,
- "Person_psor_one": Person_psor_one, # fi, U,
- "Person_psor_two": Person_psor_two, # fi, U,
- "Person_psor_three": Person_psor_three, # fi, U,
- "Person_zero": Person_zero, # U20
- "Person_four": Person_four, # U20
- "Polite_inf": Polite_inf, # bq, U,
- "Polite_pol": Polite_pol, # bq, U,
- "Polite_abs_inf": Polite_abs_inf, # bq, U,
- "Polite_abs_pol": Polite_abs_pol, # bq, U,
- "Polite_erg_inf": Polite_erg_inf, # bq, U,
- "Polite_erg_pol": Polite_erg_pol, # bq, U,
- "Polite_dat_inf": Polite_dat_inf, # bq, U,
- "Polite_dat_pol": Polite_dat_pol, # bq, U,
- "Polite_infm": Polite_infm, # U20
- "Polite_form": Polite_form, # U20
- "Polite_form_elev": Polite_form_elev, # U20
- "Polite_form_humb": Polite_form_humb, # U20
- "Prefix_yes": Prefix_yes, # U,
- "PrepCase_npr": PrepCase_npr, # cz,
- "PrepCase_pre": PrepCase_pre, # U,
- "PunctSide_ini": PunctSide_ini, # U,
- "PunctSide_fin": PunctSide_fin, # U,
- "PunctType_peri": PunctType_peri, # U,
- "PunctType_qest": PunctType_qest, # U,
- "PunctType_excl": PunctType_excl, # U,
- "PunctType_quot": PunctType_quot, # U,
- "PunctType_brck": PunctType_brck, # U,
- "PunctType_comm": PunctType_comm, # U,
- "PunctType_colo": PunctType_colo, # U,
- "PunctType_semi": PunctType_semi, # U,
- "PunctType_dash": PunctType_dash, # U,
- "Style_arch": Style_arch, # cz, fi, U,
- "Style_rare": Style_rare, # cz, fi, U,
- "Style_poet": Style_poet, # cz, U,
- "Style_norm": Style_norm, # cz, U,
- "Style_coll": Style_coll, # cz, U,
- "Style_vrnc": Style_vrnc, # cz, U,
- "Style_sing": Style_sing, # cz, U,
- "Style_expr": Style_expr, # cz, U,
- "Style_derg": Style_derg, # cz, U,
- "Style_vulg": Style_vulg, # cz, U,
- "Style_yes": Style_yes, # fi, U,
- "StyleVariant_styleShort": StyleVariant_styleShort, # cz,
- "StyleVariant_styleBound": StyleVariant_styleBound, # cz, sl,
- "VerbType_aux": VerbType_aux, # U,
- "VerbType_cop": VerbType_cop, # U,
- "VerbType_mod": VerbType_mod, # U,
- "VerbType_light": VerbType_light, # U,
+ "DEPRECATED001": DEPRECATED001,
+ "DEPRECATED002": DEPRECATED002,
+ "DEPRECATED003": DEPRECATED003,
+ "DEPRECATED004": DEPRECATED004,
+ "DEPRECATED005": DEPRECATED005,
+ "DEPRECATED006": DEPRECATED006,
+ "DEPRECATED007": DEPRECATED007,
+ "DEPRECATED008": DEPRECATED008,
+ "DEPRECATED009": DEPRECATED009,
+ "DEPRECATED010": DEPRECATED010,
+ "DEPRECATED011": DEPRECATED011,
+ "DEPRECATED012": DEPRECATED012,
+ "DEPRECATED013": DEPRECATED013,
+ "DEPRECATED014": DEPRECATED014,
+ "DEPRECATED015": DEPRECATED015,
+ "DEPRECATED016": DEPRECATED016,
+ "DEPRECATED017": DEPRECATED017,
+ "DEPRECATED018": DEPRECATED018,
+ "DEPRECATED019": DEPRECATED019,
+ "DEPRECATED020": DEPRECATED020,
+ "DEPRECATED021": DEPRECATED021,
+ "DEPRECATED022": DEPRECATED022,
+ "DEPRECATED023": DEPRECATED023,
+ "DEPRECATED024": DEPRECATED024,
+ "DEPRECATED025": DEPRECATED025,
+ "DEPRECATED026": DEPRECATED026,
+ "DEPRECATED027": DEPRECATED027,
+ "DEPRECATED028": DEPRECATED028,
+ "DEPRECATED029": DEPRECATED029,
+ "DEPRECATED030": DEPRECATED030,
+ "DEPRECATED031": DEPRECATED031,
+ "DEPRECATED032": DEPRECATED032,
+ "DEPRECATED033": DEPRECATED033,
+ "DEPRECATED034": DEPRECATED034,
+ "DEPRECATED035": DEPRECATED035,
+ "DEPRECATED036": DEPRECATED036,
+ "DEPRECATED037": DEPRECATED037,
+ "DEPRECATED038": DEPRECATED038,
+ "DEPRECATED039": DEPRECATED039,
+ "DEPRECATED040": DEPRECATED040,
+ "DEPRECATED041": DEPRECATED041,
+ "DEPRECATED042": DEPRECATED042,
+ "DEPRECATED043": DEPRECATED043,
+ "DEPRECATED044": DEPRECATED044,
+ "DEPRECATED045": DEPRECATED045,
+ "DEPRECATED046": DEPRECATED046,
+ "DEPRECATED047": DEPRECATED047,
+ "DEPRECATED048": DEPRECATED048,
+ "DEPRECATED049": DEPRECATED049,
+ "DEPRECATED050": DEPRECATED050,
+ "DEPRECATED051": DEPRECATED051,
+ "DEPRECATED052": DEPRECATED052,
+ "DEPRECATED053": DEPRECATED053,
+ "DEPRECATED054": DEPRECATED054,
+ "DEPRECATED055": DEPRECATED055,
+ "DEPRECATED056": DEPRECATED056,
+ "DEPRECATED057": DEPRECATED057,
+ "DEPRECATED058": DEPRECATED058,
+ "DEPRECATED059": DEPRECATED059,
+ "DEPRECATED060": DEPRECATED060,
+ "DEPRECATED061": DEPRECATED061,
+ "DEPRECATED062": DEPRECATED062,
+ "DEPRECATED063": DEPRECATED063,
+ "DEPRECATED064": DEPRECATED064,
+ "DEPRECATED065": DEPRECATED065,
+ "DEPRECATED066": DEPRECATED066,
+ "DEPRECATED067": DEPRECATED067,
+ "DEPRECATED068": DEPRECATED068,
+ "DEPRECATED069": DEPRECATED069,
+ "DEPRECATED070": DEPRECATED070,
+ "DEPRECATED071": DEPRECATED071,
+ "DEPRECATED072": DEPRECATED072,
+ "DEPRECATED073": DEPRECATED073,
+ "DEPRECATED074": DEPRECATED074,
+ "DEPRECATED075": DEPRECATED075,
+ "DEPRECATED076": DEPRECATED076,
+ "DEPRECATED077": DEPRECATED077,
+ "DEPRECATED078": DEPRECATED078,
+ "DEPRECATED079": DEPRECATED079,
+ "DEPRECATED080": DEPRECATED080,
+ "DEPRECATED081": DEPRECATED081,
+ "DEPRECATED082": DEPRECATED082,
+ "DEPRECATED083": DEPRECATED083,
+ "DEPRECATED084": DEPRECATED084,
+ "DEPRECATED085": DEPRECATED085,
+ "DEPRECATED086": DEPRECATED086,
+ "DEPRECATED087": DEPRECATED087,
+ "DEPRECATED088": DEPRECATED088,
+ "DEPRECATED089": DEPRECATED089,
+ "DEPRECATED090": DEPRECATED090,
+ "DEPRECATED091": DEPRECATED091,
+ "DEPRECATED092": DEPRECATED092,
+ "DEPRECATED093": DEPRECATED093,
+ "DEPRECATED094": DEPRECATED094,
+ "DEPRECATED095": DEPRECATED095,
+ "DEPRECATED096": DEPRECATED096,
+ "DEPRECATED097": DEPRECATED097,
+ "DEPRECATED098": DEPRECATED098,
+ "DEPRECATED099": DEPRECATED099,
+ "DEPRECATED100": DEPRECATED100,
+ "DEPRECATED101": DEPRECATED101,
+ "DEPRECATED102": DEPRECATED102,
+ "DEPRECATED103": DEPRECATED103,
+ "DEPRECATED104": DEPRECATED104,
+ "DEPRECATED105": DEPRECATED105,
+ "DEPRECATED106": DEPRECATED106,
+ "DEPRECATED107": DEPRECATED107,
+ "DEPRECATED108": DEPRECATED108,
+ "DEPRECATED109": DEPRECATED109,
+ "DEPRECATED110": DEPRECATED110,
+ "DEPRECATED111": DEPRECATED111,
+ "DEPRECATED112": DEPRECATED112,
+ "DEPRECATED113": DEPRECATED113,
+ "DEPRECATED114": DEPRECATED114,
+ "DEPRECATED115": DEPRECATED115,
+ "DEPRECATED116": DEPRECATED116,
+ "DEPRECATED117": DEPRECATED117,
+ "DEPRECATED118": DEPRECATED118,
+ "DEPRECATED119": DEPRECATED119,
+ "DEPRECATED120": DEPRECATED120,
+ "DEPRECATED121": DEPRECATED121,
+ "DEPRECATED122": DEPRECATED122,
+ "DEPRECATED123": DEPRECATED123,
+ "DEPRECATED124": DEPRECATED124,
+ "DEPRECATED125": DEPRECATED125,
+ "DEPRECATED126": DEPRECATED126,
+ "DEPRECATED127": DEPRECATED127,
+ "DEPRECATED128": DEPRECATED128,
+ "DEPRECATED129": DEPRECATED129,
+ "DEPRECATED130": DEPRECATED130,
+ "DEPRECATED131": DEPRECATED131,
+ "DEPRECATED132": DEPRECATED132,
+ "DEPRECATED133": DEPRECATED133,
+ "DEPRECATED134": DEPRECATED134,
+ "DEPRECATED135": DEPRECATED135,
+ "DEPRECATED136": DEPRECATED136,
+ "DEPRECATED137": DEPRECATED137,
+ "DEPRECATED138": DEPRECATED138,
+ "DEPRECATED139": DEPRECATED139,
+ "DEPRECATED140": DEPRECATED140,
+ "DEPRECATED141": DEPRECATED141,
+ "DEPRECATED142": DEPRECATED142,
+ "DEPRECATED143": DEPRECATED143,
+ "DEPRECATED144": DEPRECATED144,
+ "DEPRECATED145": DEPRECATED145,
+ "DEPRECATED146": DEPRECATED146,
+ "DEPRECATED147": DEPRECATED147,
+ "DEPRECATED148": DEPRECATED148,
+ "DEPRECATED149": DEPRECATED149,
+ "DEPRECATED150": DEPRECATED150,
+ "DEPRECATED151": DEPRECATED151,
+ "DEPRECATED152": DEPRECATED152,
+ "DEPRECATED153": DEPRECATED153,
+ "DEPRECATED154": DEPRECATED154,
+ "DEPRECATED155": DEPRECATED155,
+ "DEPRECATED156": DEPRECATED156,
+ "DEPRECATED157": DEPRECATED157,
+ "DEPRECATED158": DEPRECATED158,
+ "DEPRECATED159": DEPRECATED159,
+ "DEPRECATED160": DEPRECATED160,
+ "DEPRECATED161": DEPRECATED161,
+ "DEPRECATED162": DEPRECATED162,
+ "DEPRECATED163": DEPRECATED163,
+ "DEPRECATED164": DEPRECATED164,
+ "DEPRECATED165": DEPRECATED165,
+ "DEPRECATED166": DEPRECATED166,
+ "DEPRECATED167": DEPRECATED167,
+ "DEPRECATED168": DEPRECATED168,
+ "DEPRECATED169": DEPRECATED169,
+ "DEPRECATED170": DEPRECATED170,
+ "DEPRECATED171": DEPRECATED171,
+ "DEPRECATED172": DEPRECATED172,
+ "DEPRECATED173": DEPRECATED173,
+ "DEPRECATED174": DEPRECATED174,
+ "DEPRECATED175": DEPRECATED175,
+ "DEPRECATED176": DEPRECATED176,
+ "DEPRECATED177": DEPRECATED177,
+ "DEPRECATED178": DEPRECATED178,
+ "DEPRECATED179": DEPRECATED179,
+ "DEPRECATED180": DEPRECATED180,
+ "DEPRECATED181": DEPRECATED181,
+ "DEPRECATED182": DEPRECATED182,
+ "DEPRECATED183": DEPRECATED183,
+ "DEPRECATED184": DEPRECATED184,
+ "DEPRECATED185": DEPRECATED185,
+ "DEPRECATED186": DEPRECATED186,
+ "DEPRECATED187": DEPRECATED187,
+ "DEPRECATED188": DEPRECATED188,
+ "DEPRECATED189": DEPRECATED189,
+ "DEPRECATED190": DEPRECATED190,
+ "DEPRECATED191": DEPRECATED191,
+ "DEPRECATED192": DEPRECATED192,
+ "DEPRECATED193": DEPRECATED193,
+ "DEPRECATED194": DEPRECATED194,
+ "DEPRECATED195": DEPRECATED195,
+ "DEPRECATED196": DEPRECATED196,
+ "DEPRECATED197": DEPRECATED197,
+ "DEPRECATED198": DEPRECATED198,
+ "DEPRECATED199": DEPRECATED199,
+ "DEPRECATED200": DEPRECATED200,
+ "DEPRECATED201": DEPRECATED201,
+ "DEPRECATED202": DEPRECATED202,
+ "DEPRECATED203": DEPRECATED203,
+ "DEPRECATED204": DEPRECATED204,
+ "DEPRECATED205": DEPRECATED205,
+ "DEPRECATED206": DEPRECATED206,
+ "DEPRECATED207": DEPRECATED207,
+ "DEPRECATED208": DEPRECATED208,
+ "DEPRECATED209": DEPRECATED209,
+ "DEPRECATED210": DEPRECATED210,
+ "DEPRECATED211": DEPRECATED211,
+ "DEPRECATED212": DEPRECATED212,
+ "DEPRECATED213": DEPRECATED213,
+ "DEPRECATED214": DEPRECATED214,
+ "DEPRECATED215": DEPRECATED215,
+ "DEPRECATED216": DEPRECATED216,
+ "DEPRECATED217": DEPRECATED217,
+ "DEPRECATED218": DEPRECATED218,
+ "DEPRECATED219": DEPRECATED219,
+ "DEPRECATED220": DEPRECATED220,
+ "DEPRECATED221": DEPRECATED221,
+ "DEPRECATED222": DEPRECATED222,
+ "DEPRECATED223": DEPRECATED223,
+ "DEPRECATED224": DEPRECATED224,
+ "DEPRECATED225": DEPRECATED225,
+ "DEPRECATED226": DEPRECATED226,
+ "DEPRECATED227": DEPRECATED227,
+ "DEPRECATED228": DEPRECATED228,
+ "DEPRECATED229": DEPRECATED229,
+ "DEPRECATED230": DEPRECATED230,
+ "DEPRECATED231": DEPRECATED231,
+ "DEPRECATED232": DEPRECATED232,
+ "DEPRECATED233": DEPRECATED233,
+ "DEPRECATED234": DEPRECATED234,
+ "DEPRECATED235": DEPRECATED235,
+ "DEPRECATED236": DEPRECATED236,
+ "DEPRECATED237": DEPRECATED237,
+ "DEPRECATED238": DEPRECATED238,
+ "DEPRECATED239": DEPRECATED239,
+ "DEPRECATED240": DEPRECATED240,
+ "DEPRECATED241": DEPRECATED241,
+ "DEPRECATED242": DEPRECATED242,
+ "DEPRECATED243": DEPRECATED243,
+ "DEPRECATED244": DEPRECATED244,
+ "DEPRECATED245": DEPRECATED245,
+ "DEPRECATED246": DEPRECATED246,
+ "DEPRECATED247": DEPRECATED247,
+ "DEPRECATED248": DEPRECATED248,
+ "DEPRECATED249": DEPRECATED249,
+ "DEPRECATED250": DEPRECATED250,
+ "DEPRECATED251": DEPRECATED251,
+ "DEPRECATED252": DEPRECATED252,
+ "DEPRECATED253": DEPRECATED253,
+ "DEPRECATED254": DEPRECATED254,
+ "DEPRECATED255": DEPRECATED255,
+ "DEPRECATED256": DEPRECATED256,
+ "DEPRECATED257": DEPRECATED257,
+ "DEPRECATED258": DEPRECATED258,
+ "DEPRECATED259": DEPRECATED259,
+ "DEPRECATED260": DEPRECATED260,
+ "DEPRECATED261": DEPRECATED261,
+ "DEPRECATED262": DEPRECATED262,
+ "DEPRECATED263": DEPRECATED263,
+ "DEPRECATED264": DEPRECATED264,
+ "DEPRECATED265": DEPRECATED265,
+ "DEPRECATED266": DEPRECATED266,
+ "DEPRECATED267": DEPRECATED267,
+ "DEPRECATED268": DEPRECATED268,
+ "DEPRECATED269": DEPRECATED269,
+ "DEPRECATED270": DEPRECATED270,
+ "DEPRECATED271": DEPRECATED271,
+ "DEPRECATED272": DEPRECATED272,
+ "DEPRECATED273": DEPRECATED273,
+ "DEPRECATED274": DEPRECATED274,
+ "DEPRECATED275": DEPRECATED275,
+ "DEPRECATED276": DEPRECATED276,
"PERSON": PERSON,
"NORP": NORP,
diff --git a/spacy/tests/doc/test_morphanalysis.py b/spacy/tests/doc/test_morphanalysis.py
index 67ebc06d6..82fb549ba 100644
--- a/spacy/tests/doc/test_morphanalysis.py
+++ b/spacy/tests/doc/test_morphanalysis.py
@@ -9,22 +9,52 @@ def i_has(en_tokenizer):
return doc
-def test_token_morph_id(i_has):
- assert i_has[0].morph.id
- assert i_has[1].morph.id != 0
- assert i_has[0].morph.id != i_has[1].morph.id
+def test_token_morph_eq(i_has):
+ assert i_has[0].morph is not i_has[0].morph
+ assert i_has[0].morph == i_has[0].morph
+ assert i_has[0].morph != i_has[1].morph
+
+
+def test_token_morph_key(i_has):
+ assert i_has[0].morph.key != 0
+ assert i_has[1].morph.key != 0
+ assert i_has[0].morph.key == i_has[0].morph.key
+ assert i_has[0].morph.key != i_has[1].morph.key
def test_morph_props(i_has):
- assert i_has[0].morph.pron_type == i_has.vocab.strings["PronType_prs"]
- assert i_has[0].morph.pron_type_ == "PronType_prs"
- assert i_has[1].morph.pron_type == 0
+ assert i_has[0].morph.get("PronType") == ["PronType=prs"]
+ assert i_has[1].morph.get("PronType") == []
def test_morph_iter(i_has):
- assert list(i_has[0].morph) == ["PronType_prs"]
- assert list(i_has[1].morph) == ["Number_sing", "Person_three", "VerbForm_fin"]
+ assert set(i_has[0].morph) == set(["PronType=prs"])
+ assert set(i_has[1].morph) == set(["Number=sing", "Person=three", "Tense=pres", "VerbForm=fin"])
def test_morph_get(i_has):
- assert i_has[0].morph.get("pron_type") == "PronType_prs"
+ assert i_has[0].morph.get("PronType") == ["PronType=prs"]
+
+
+def test_morph_set(i_has):
+ assert i_has[0].morph.get("PronType") == ["PronType=prs"]
+ # set by string
+ i_has[0].morph_ = "PronType=unk"
+ assert i_has[0].morph.get("PronType") == ["PronType=unk"]
+ # set by string, fields are alphabetized
+ i_has[0].morph_ = "PronType=123|NounType=unk"
+ assert i_has[0].morph_ == "NounType=unk|PronType=123"
+ # set by dict
+ i_has[0].morph_ = {"AType": "123", "BType": "unk", "POS": "ADJ"}
+ assert i_has[0].morph_ == "AType=123|BType=unk|POS=ADJ"
+ # set by string with multiple values, fields and values are alphabetized
+ i_has[0].morph_ = "BType=c|AType=b,a"
+ assert i_has[0].morph_ == "AType=a,b|BType=c"
+ # set by dict with multiple values, fields and values are alphabetized
+ i_has[0].morph_ = {"AType": "b,a", "BType": "c"}
+ assert i_has[0].morph_ == "AType=a,b|BType=c"
+
+
+def test_morph_str(i_has):
+ assert str(i_has[0].morph) == "PronType=prs"
+ assert str(i_has[1].morph) == "Number=sing|Person=three|Tense=pres|VerbForm=fin"
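
Not part of the patch: a minimal usage sketch of the UD-FEATS-style morph API exercised by the tests above. It assumes a blank English pipeline and that `Token.morph_` accepts both FEATS strings and dicts, as added in the `token.pyx` hunk further below; return formats follow the assertions in this test file.

import spacy

nlp = spacy.blank("en")
doc = nlp("I has")

# Set features by FEATS string or by dict; fields and values are stored alphabetized.
doc[0].morph_ = "PronType=prs"
doc[1].morph_ = {"VerbForm": "fin", "Number": "sing", "Person": "three"}

assert doc[0].morph.get("PronType") == ["PronType=prs"]
assert doc[1].morph_ == "Number=sing|Person=three|VerbForm=fin"
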
diff --git a/spacy/tests/morphology/test_morph_converters.py b/spacy/tests/morphology/test_morph_converters.py
new file mode 100644
index 000000000..3bff4f924
--- /dev/null
+++ b/spacy/tests/morphology/test_morph_converters.py
@@ -0,0 +1,26 @@
+import pytest
+from spacy.morphology import Morphology
+
+
+def test_feats_converters():
+ feats = "Case=dat,gen|Number=sing"
+ feats_dict = {"Case": "dat,gen", "Number": "sing"}
+ feats_list = feats.split(Morphology.FEATURE_SEP)
+
+ # simple conversions
+ assert Morphology.list_to_feats(feats_list) == feats
+ assert Morphology.dict_to_feats(feats_dict) == feats
+ assert Morphology.feats_to_dict(feats) == feats_dict
+
+ # roundtrips
+ assert Morphology.dict_to_feats(Morphology.feats_to_dict(feats)) == feats
+ assert Morphology.feats_to_dict(Morphology.dict_to_feats(feats_dict)) == feats_dict
+
+ # unsorted input is normalized
+ unsorted_feats = "Number=sing|Case=gen,dat"
+ unsorted_feats_dict = {"Case": "gen,dat", "Number": "sing"}
+    unsorted_feats_list = unsorted_feats.split(Morphology.FEATURE_SEP)
+ assert Morphology.feats_to_dict(unsorted_feats) == feats_dict
+ assert Morphology.dict_to_feats(unsorted_feats_dict) == feats
+ assert Morphology.list_to_feats(unsorted_feats_list) == feats
+ assert Morphology.dict_to_feats(Morphology.feats_to_dict(unsorted_feats)) == feats
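
Not part of the patch: a short sketch of using the converters above to normalize an unsorted UD FEATS string, assuming the `Morphology` classmethods behave as asserted in this test.

from spacy.morphology import Morphology

raw = "Number=sing|Case=gen,dat"   # unsorted fields and values
normalized = Morphology.dict_to_feats(Morphology.feats_to_dict(raw))
print(normalized)                  # "Case=dat,gen|Number=sing"
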
diff --git a/spacy/tests/morphology/test_morph_features.py b/spacy/tests/morphology/test_morph_features.py
index 4cf6b1206..0d8d7dea9 100644
--- a/spacy/tests/morphology/test_morph_features.py
+++ b/spacy/tests/morphology/test_morph_features.py
@@ -16,32 +16,30 @@ def test_init(morphology):
def test_add_morphology_with_string_names(morphology):
- morphology.add({"Case_gen", "Number_sing"})
+ morphology.add({"Case": "gen", "Number": "sing"})
def test_add_morphology_with_int_ids(morphology):
- morphology.add({get_string_id("Case_gen"), get_string_id("Number_sing")})
+ morphology.strings.add("Case")
+ morphology.strings.add("gen")
+ morphology.strings.add("Number")
+ morphology.strings.add("sing")
+ morphology.add({get_string_id("Case"): get_string_id("gen"), get_string_id("Number"): get_string_id("sing")})
def test_add_morphology_with_mix_strings_and_ints(morphology):
- morphology.add({get_string_id("PunctSide_ini"), "VerbType_aux"})
+ morphology.strings.add("PunctSide")
+ morphology.strings.add("ini")
+ morphology.add({get_string_id("PunctSide"): get_string_id("ini"), "VerbType": "aux"})
def test_morphology_tags_hash_distinctly(morphology):
- tag1 = morphology.add({"PunctSide_ini", "VerbType_aux"})
- tag2 = morphology.add({"Case_gen", "Number_sing"})
+ tag1 = morphology.add({"PunctSide": "ini", "VerbType": "aux"})
+ tag2 = morphology.add({"Case": "gen", "Number": "sing"})
assert tag1 != tag2
def test_morphology_tags_hash_independent_of_order(morphology):
- tag1 = morphology.add({"Case_gen", "Number_sing"})
- tag2 = morphology.add({"Number_sing", "Case_gen"})
+ tag1 = morphology.add({"Case": "gen", "Number": "sing"})
+ tag2 = morphology.add({"Number": "sing", "Case": "gen"})
assert tag1 == tag2
-
-
-def test_update_morphology_tag(morphology):
- tag1 = morphology.add({"Case_gen"})
- tag2 = morphology.update(tag1, {"Number_sing"})
- assert tag1 != tag2
- tag3 = morphology.add({"Number_sing", "Case_gen"})
- assert tag2 == tag3
diff --git a/spacy/tests/regression/test_issue1-1000.py b/spacy/tests/regression/test_issue1-1000.py
index a3148aa90..bfca72853 100644
--- a/spacy/tests/regression/test_issue1-1000.py
+++ b/spacy/tests/regression/test_issue1-1000.py
@@ -2,7 +2,7 @@ import pytest
import random
from spacy.matcher import Matcher
from spacy.attrs import IS_PUNCT, ORTH, LOWER
-from spacy.symbols import POS, VERB, VerbForm_inf
+from spacy.symbols import POS, VERB
from spacy.vocab import Vocab
from spacy.language import Language
from spacy.lemmatizer import Lemmatizer
@@ -164,7 +164,7 @@ def test_issue590(en_vocab):
def test_issue595():
"""Test lemmatization of base forms"""
words = ["Do", "n't", "feed", "the", "dog"]
- tag_map = {"VB": {POS: VERB, VerbForm_inf: True}}
+ tag_map = {"VB": {POS: VERB, "VerbForm": "inf"}}
lookups = Lookups()
lookups.add_table("lemma_rules", {"verb": [["ed", "e"]]})
lookups.add_table("lemma_index", {"verb": {}})
diff --git a/spacy/tests/regression/test_issue1001-1500.py b/spacy/tests/regression/test_issue1001-1500.py
index 7d81c3148..aaff951e5 100644
--- a/spacy/tests/regression/test_issue1001-1500.py
+++ b/spacy/tests/regression/test_issue1001-1500.py
@@ -8,7 +8,7 @@ from spacy.matcher import Matcher
from spacy.tokenizer import Tokenizer
from spacy.lemmatizer import Lemmatizer
from spacy.lookups import Lookups
-from spacy.symbols import ORTH, LEMMA, POS, VERB, VerbForm_part
+from spacy.symbols import ORTH, LEMMA, POS, VERB
def test_issue1061():
@@ -88,7 +88,7 @@ def test_issue1375():
def test_issue1387():
- tag_map = {"VBG": {POS: VERB, VerbForm_part: True}}
+ tag_map = {"VBG": {POS: VERB, "VerbForm": "part"}}
lookups = Lookups()
lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
diff --git a/spacy/tokens/__init__.py b/spacy/tokens/__init__.py
index 88428709b..1aefa2b7c 100644
--- a/spacy/tokens/__init__.py
+++ b/spacy/tokens/__init__.py
@@ -2,5 +2,6 @@ from .doc import Doc
from .token import Token
from .span import Span
from ._serialize import DocBin
+from .morphanalysis import MorphAnalysis
-__all__ = ["Doc", "Token", "Span", "DocBin"]
+__all__ = ["Doc", "Token", "Span", "DocBin", "MorphAnalysis"]
diff --git a/spacy/tokens/morphanalysis.pxd b/spacy/tokens/morphanalysis.pxd
index 22844454a..9510875c9 100644
--- a/spacy/tokens/morphanalysis.pxd
+++ b/spacy/tokens/morphanalysis.pxd
@@ -5,5 +5,5 @@ from ..structs cimport MorphAnalysisC
cdef class MorphAnalysis:
cdef readonly Vocab vocab
- cdef hash_t key
+ cdef readonly hash_t key
cdef MorphAnalysisC c
diff --git a/spacy/tokens/morphanalysis.pyx b/spacy/tokens/morphanalysis.pyx
index e09870741..ed987f4e4 100644
--- a/spacy/tokens/morphanalysis.pyx
+++ b/spacy/tokens/morphanalysis.pyx
@@ -1,15 +1,14 @@
from libc.string cimport memset
+cimport numpy as np
from ..vocab cimport Vocab
from ..typedefs cimport hash_t, attr_t
-from ..morphology cimport list_features, check_feature, get_field, tag_to_json
-
-from ..strings import get_string_id
+from ..morphology cimport list_features, check_feature, get_by_field
cdef class MorphAnalysis:
"""Control access to morphological features for a token."""
- def __init__(self, Vocab vocab, features=tuple()):
+ def __init__(self, Vocab vocab, features=dict()):
self.vocab = vocab
self.key = self.vocab.morphology.add(features)
analysis = self.vocab.morphology.tags.get(self.key)
@@ -33,7 +32,7 @@ cdef class MorphAnalysis:
def __contains__(self, feature):
"""Test whether the morphological analysis contains some feature."""
- cdef attr_t feat_id = get_string_id(feature)
+ cdef attr_t feat_id = self.vocab.strings.as_int(feature)
return check_feature(&self.c, feat_id)
def __iter__(self):
@@ -55,369 +54,28 @@ cdef class MorphAnalysis:
def __hash__(self):
return self.key
- def get(self, unicode field):
+ def __eq__(self, other):
+ return self.key == other.key
+
+ def __ne__(self, other):
+ return self.key != other.key
+
+ def get(self, field):
"""Retrieve a feature by field."""
- cdef int field_id = self.vocab.morphology._feat_map.attr2field[field]
- return self.vocab.strings[get_field(&self.c, field_id)]
+ cdef attr_t field_id = self.vocab.strings.as_int(field)
+ cdef np.ndarray results = get_by_field(&self.c, field_id)
+ return [self.vocab.strings[result] for result in results]
def to_json(self):
- """Produce a json serializable representation, which will be a list of
- strings.
+ """Produce a json serializable representation as a UD FEATS-style
+ string.
"""
- return tag_to_json(&self.c)
-
- @property
- def is_base_form(self):
- raise NotImplementedError
-
- @property
- def pos(self):
- return self.c.pos
-
- @property
- def pos_(self):
- return self.vocab.strings[self.c.pos]
-
- property id:
- def __get__(self):
- return self.key
-
- property abbr:
- def __get__(self):
- return self.c.abbr
-
- property adp_type:
- def __get__(self):
- return self.c.adp_type
-
- property adv_type:
- def __get__(self):
- return self.c.adv_type
-
- property animacy:
- def __get__(self):
- return self.c.animacy
-
- property aspect:
- def __get__(self):
- return self.c.aspect
-
- property case:
- def __get__(self):
- return self.c.case
-
- property conj_type:
- def __get__(self):
- return self.c.conj_type
-
- property connegative:
- def __get__(self):
- return self.c.connegative
-
- property definite:
- def __get__(self):
- return self.c.definite
-
- property degree:
- def __get__(self):
- return self.c.degree
-
- property derivation:
- def __get__(self):
- return self.c.derivation
-
- property echo:
- def __get__(self):
- return self.c.echo
-
- property foreign:
- def __get__(self):
- return self.c.foreign
-
- property gender:
- def __get__(self):
- return self.c.gender
-
- property hyph:
- def __get__(self):
- return self.c.hyph
-
- property inf_form:
- def __get__(self):
- return self.c.inf_form
-
- property mood:
- def __get__(self):
- return self.c.mood
-
- property name_type:
- def __get__(self):
- return self.c.name_type
-
- property negative:
- def __get__(self):
- return self.c.negative
-
- property noun_type:
- def __get__(self):
- return self.c.noun_type
-
- property number:
- def __get__(self):
- return self.c.number
-
- property num_form:
- def __get__(self):
- return self.c.num_form
-
- property num_type:
- def __get__(self):
- return self.c.num_type
-
- property num_value:
- def __get__(self):
- return self.c.num_value
-
- property part_form:
- def __get__(self):
- return self.c.part_form
-
- property part_type:
- def __get__(self):
- return self.c.part_type
-
- property person:
- def __get__(self):
- return self.c.person
-
- property polite:
- def __get__(self):
- return self.c.polite
-
- property polarity:
- def __get__(self):
- return self.c.polarity
-
- property poss:
- def __get__(self):
- return self.c.poss
-
- property prefix:
- def __get__(self):
- return self.c.prefix
-
- property prep_case:
- def __get__(self):
- return self.c.prep_case
-
- property pron_type:
- def __get__(self):
- return self.c.pron_type
-
- property punct_side:
- def __get__(self):
- return self.c.punct_side
-
- property punct_type:
- def __get__(self):
- return self.c.punct_type
-
- property reflex:
- def __get__(self):
- return self.c.reflex
-
- property style:
- def __get__(self):
- return self.c.style
-
- property style_variant:
- def __get__(self):
- return self.c.style_variant
-
- property tense:
- def __get__(self):
- return self.c.tense
-
- property typo:
- def __get__(self):
- return self.c.typo
-
- property verb_form:
- def __get__(self):
- return self.c.verb_form
-
- property voice:
- def __get__(self):
- return self.c.voice
-
- property verb_type:
- def __get__(self):
- return self.c.verb_type
-
- property abbr_:
- def __get__(self):
- return self.vocab.strings[self.c.abbr]
-
- property adp_type_:
- def __get__(self):
- return self.vocab.strings[self.c.adp_type]
-
- property adv_type_:
- def __get__(self):
- return self.vocab.strings[self.c.adv_type]
-
- property animacy_:
- def __get__(self):
- return self.vocab.strings[self.c.animacy]
-
- property aspect_:
- def __get__(self):
- return self.vocab.strings[self.c.aspect]
-
- property case_:
- def __get__(self):
- return self.vocab.strings[self.c.case]
-
- property conj_type_:
- def __get__(self):
- return self.vocab.strings[self.c.conj_type]
-
- property connegative_:
- def __get__(self):
- return self.vocab.strings[self.c.connegative]
-
- property definite_:
- def __get__(self):
- return self.vocab.strings[self.c.definite]
-
- property degree_:
- def __get__(self):
- return self.vocab.strings[self.c.degree]
-
- property derivation_:
- def __get__(self):
- return self.vocab.strings[self.c.derivation]
-
- property echo_:
- def __get__(self):
- return self.vocab.strings[self.c.echo]
-
- property foreign_:
- def __get__(self):
- return self.vocab.strings[self.c.foreign]
-
- property gender_:
- def __get__(self):
- return self.vocab.strings[self.c.gender]
-
- property hyph_:
- def __get__(self):
- return self.vocab.strings[self.c.hyph]
-
- property inf_form_:
- def __get__(self):
- return self.vocab.strings[self.c.inf_form]
-
- property name_type_:
- def __get__(self):
- return self.vocab.strings[self.c.name_type]
-
- property negative_:
- def __get__(self):
- return self.vocab.strings[self.c.negative]
-
- property mood_:
- def __get__(self):
- return self.vocab.strings[self.c.mood]
-
- property number_:
- def __get__(self):
- return self.vocab.strings[self.c.number]
-
- property num_form_:
- def __get__(self):
- return self.vocab.strings[self.c.num_form]
-
- property num_type_:
- def __get__(self):
- return self.vocab.strings[self.c.num_type]
-
- property num_value_:
- def __get__(self):
- return self.vocab.strings[self.c.num_value]
-
- property part_form_:
- def __get__(self):
- return self.vocab.strings[self.c.part_form]
-
- property part_type_:
- def __get__(self):
- return self.vocab.strings[self.c.part_type]
-
- property person_:
- def __get__(self):
- return self.vocab.strings[self.c.person]
-
- property polite_:
- def __get__(self):
- return self.vocab.strings[self.c.polite]
-
- property polarity_:
- def __get__(self):
- return self.vocab.strings[self.c.polarity]
-
- property poss_:
- def __get__(self):
- return self.vocab.strings[self.c.poss]
-
- property prefix_:
- def __get__(self):
- return self.vocab.strings[self.c.prefix]
-
- property prep_case_:
- def __get__(self):
- return self.vocab.strings[self.c.prep_case]
-
- property pron_type_:
- def __get__(self):
- return self.vocab.strings[self.c.pron_type]
-
- property punct_side_:
- def __get__(self):
- return self.vocab.strings[self.c.punct_side]
-
- property punct_type_:
- def __get__(self):
- return self.vocab.strings[self.c.punct_type]
-
- property reflex_:
- def __get__(self):
- return self.vocab.strings[self.c.reflex]
-
- property style_:
- def __get__(self):
- return self.vocab.strings[self.c.style]
-
- property style_variant_:
- def __get__(self):
- return self.vocab.strings[self.c.style_variant]
-
- property tense_:
- def __get__(self):
- return self.vocab.strings[self.c.tense]
-
- property typo_:
- def __get__(self):
- return self.vocab.strings[self.c.typo]
-
- property verb_form_:
- def __get__(self):
- return self.vocab.strings[self.c.verb_form]
-
- property voice_:
- def __get__(self):
- return self.vocab.strings[self.c.voice]
-
- property verb_type_:
- def __get__(self):
- return self.vocab.strings[self.c.verb_type]
+ morph_string = self.vocab.strings[self.c.key]
+ if morph_string == self.vocab.morphology.EMPTY_MORPH:
+ return ""
+ return morph_string
+
+ def to_dict(self):
+ """Produce a dict representation.
+ """
+ return self.vocab.morphology.feats_to_dict(self.to_json())
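
Not part of the patch: a sketch of the reworked serialization helpers above, assuming a blank English pipeline and a token whose morph was set via the new `morph_` setter.

import spacy

nlp = spacy.blank("en")
doc = nlp("cats")
doc[0].morph_ = "Number=sing|Case=gen"   # stored alphabetized as "Case=gen|Number=sing"

assert doc[0].morph.to_json() == "Case=gen|Number=sing"            # UD FEATS-style string
assert doc[0].morph.to_dict() == {"Case": "gen", "Number": "sing"}  # via feats_to_dict()
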
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index 8e6290187..b159fffc1 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -217,6 +217,14 @@ cdef class Token:
def morph(self):
return MorphAnalysis.from_id(self.vocab, self.c.morph)
+ property morph_:
+ def __get__(self):
+ return str(MorphAnalysis.from_id(self.vocab, self.c.morph))
+
+ def __set__(self, features):
+ cdef hash_t key = self.vocab.morphology.add(features)
+ self.c.morph = key
+
@property
def lex_id(self):
"""RETURNS (int): Sequential ID of the token's lexical type."""
From 06b251dd1e5aa5fa7c6025d11448ccea3b875d91 Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Tue, 28 Jan 2020 11:36:29 +0100
Subject: [PATCH 048/496] Add support for pos/morphs/lemmas in training data
(#4941)
Add support for pos/morphs/lemmas throughout `GoldParse`, `Example`, and
`docs_to_json()`.
---
spacy/gold.pxd | 6 +-
spacy/gold.pyx | 135 +++++++++++++++++++++++----------------
spacy/tests/test_gold.py | 51 +++++++++++----
3 files changed, 124 insertions(+), 68 deletions(-)
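
Not part of the patch: a sketch of one token entry in the JSON training format after this change, based on the `docs_to_json()` and `json_to_examples()` hunks below. The values are illustrative; the field names are the ones written by `docs_to_json()` and read back via `token.get(...)` in `json_to_examples()`.

# One token entry for a tagged, parsed doc ("flew" from the test fixture below):
json_token = {
    "id": 3,
    "orth": "flew",
    "tag": "VBD",
    "pos": "VERB",
    "morph": "Tense=past|VerbForm=fin",
    "lemma": "fly",   # illustrative value
    "head": 0,        # head offset relative to this token
    "dep": "ROOT",
    "ner": "O",
}
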
diff --git a/spacy/gold.pxd b/spacy/gold.pxd
index 5f0b49c9f..49dba16df 100644
--- a/spacy/gold.pxd
+++ b/spacy/gold.pxd
@@ -25,6 +25,7 @@ cdef class GoldParse:
cdef public int loss
cdef public list words
cdef public list tags
+ cdef public list pos
cdef public list morphs
cdef public list lemmas
cdef public list sent_starts
@@ -44,11 +45,12 @@ cdef class TokenAnnotation:
cdef public list ids
cdef public list words
cdef public list tags
+ cdef public list pos
+ cdef public list morphs
+ cdef public list lemmas
cdef public list heads
cdef public list deps
cdef public list entities
- cdef public list morphs
- cdef public list lemmas
cdef public list sent_starts
cdef public list brackets
diff --git a/spacy/gold.pyx b/spacy/gold.pyx
index 0dfa32c84..eca801176 100644
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -485,11 +485,12 @@ def json_to_examples(doc):
words = []
ids = []
tags = []
+ pos = []
+ morphs = []
+ lemmas = []
heads = []
labels = []
ner = []
- morphs = []
- lemmas = []
sent_starts = []
brackets = []
for sent in paragraph["sentences"]:
@@ -498,14 +499,15 @@ def json_to_examples(doc):
words.append(token["orth"])
ids.append(token.get('id', sent_start_i + i))
tags.append(token.get('tag', "-"))
+ pos.append(token.get("pos", ""))
+ morphs.append(token.get("morph", ""))
+ lemmas.append(token.get("lemma", ""))
heads.append(token.get("head", 0) + sent_start_i + i)
labels.append(token.get("dep", ""))
# Ensure ROOT label is case-insensitive
if labels[-1].lower() == "root":
labels[-1] = "ROOT"
ner.append(token.get("ner", "-"))
- morphs.append(token.get("morph", {}))
- lemmas.append(token.get("lemma", ""))
if i == 0:
sent_starts.append(1)
else:
@@ -518,8 +520,9 @@ def json_to_examples(doc):
for cat in paragraph.get("cats", {}):
cats[cat["label"]] = cat["value"]
example.set_token_annotation(ids=ids, words=words, tags=tags,
- heads=heads, deps=labels, entities=ner, morphs=morphs,
- lemmas=lemmas, sent_starts=sent_starts, brackets=brackets)
+ pos=pos, morphs=morphs, lemmas=lemmas, heads=heads,
+ deps=labels, entities=ner, sent_starts=sent_starts,
+ brackets=brackets)
example.set_doc_annotation(cats=cats)
yield example
@@ -632,17 +635,18 @@ def _consume_ent(tags):
cdef class TokenAnnotation:
- def __init__(self, ids=None, words=None, tags=None, heads=None, deps=None,
- entities=None, morphs=None, lemmas=None, sent_starts=None,
+ def __init__(self, ids=None, words=None, tags=None, pos=None, morphs=None,
+ lemmas=None, heads=None, deps=None, entities=None, sent_starts=None,
brackets=None):
self.ids = ids if ids else []
self.words = words if words else []
self.tags = tags if tags else []
+ self.pos = pos if pos else []
+ self.morphs = morphs if morphs else []
+ self.lemmas = lemmas if lemmas else []
self.heads = heads if heads else []
self.deps = deps if deps else []
self.entities = entities if entities else []
- self.morphs = morphs if morphs else []
- self.lemmas = lemmas if lemmas else []
self.sent_starts = sent_starts if sent_starts else []
self.brackets = brackets if brackets else []
@@ -651,11 +655,12 @@ cdef class TokenAnnotation:
return cls(ids=token_dict.get("ids", None),
words=token_dict.get("words", None),
tags=token_dict.get("tags", None),
+ pos=token_dict.get("pos", None),
+ morphs=token_dict.get("morphs", None),
+ lemmas=token_dict.get("lemmas", None),
heads=token_dict.get("heads", None),
deps=token_dict.get("deps", None),
entities=token_dict.get("entities", None),
- morphs=token_dict.get("morphs", None),
- lemmas=token_dict.get("lemmas", None),
sent_starts=token_dict.get("sent_starts", None),
brackets=token_dict.get("brackets", None))
@@ -663,11 +668,12 @@ cdef class TokenAnnotation:
return {"ids": self.ids,
"words": self.words,
"tags": self.tags,
+ "pos": self.pos,
+ "morphs": self.morphs,
+ "lemmas": self.lemmas,
"heads": self.heads,
"deps": self.deps,
"entities": self.entities,
- "morphs": self.morphs,
- "lemmas": self.lemmas,
"sent_starts": self.sent_starts,
"brackets": self.brackets}
@@ -680,6 +686,15 @@ cdef class TokenAnnotation:
def get_tag(self, i):
return self.tags[i] if i < len(self.tags) else "-"
+ def get_pos(self, i):
+ return self.pos[i] if i < len(self.pos) else ""
+
+ def get_morph(self, i):
+ return self.morphs[i] if i < len(self.morphs) else ""
+
+ def get_lemma(self, i):
+ return self.lemmas[i] if i < len(self.lemmas) else ""
+
def get_head(self, i):
return self.heads[i] if i < len(self.heads) else i
@@ -689,12 +704,6 @@ cdef class TokenAnnotation:
def get_entity(self, i):
return self.entities[i] if i < len(self.entities) else "-"
- def get_morph(self, i):
- return self.morphs[i] if i < len(self.morphs) else set()
-
- def get_lemma(self, i):
- return self.lemmas[i] if i < len(self.lemmas) else ""
-
def get_sent_start(self, i):
return self.sent_starts[i] if i < len(self.sent_starts) else None
@@ -756,12 +765,12 @@ cdef class Example:
self.goldparse = gold
return self.goldparse
- def set_token_annotation(self, ids=None, words=None, tags=None, heads=None,
- deps=None, entities=None, morphs=None, lemmas=None,
- sent_starts=None, brackets=None):
+ def set_token_annotation(self, ids=None, words=None, tags=None, pos=None,
+ morphs=None, lemmas=None, heads=None, deps=None,
+ entities=None, sent_starts=None, brackets=None):
self.token_annotation = TokenAnnotation(ids=ids, words=words, tags=tags,
- heads=heads, deps=deps, entities=entities,
- morphs=morphs, lemmas=lemmas,
+ pos=pos, morphs=morphs, lemmas=lemmas, heads=heads,
+ deps=deps, entities=entities,
sent_starts=sent_starts, brackets=brackets)
def set_doc_annotation(self, cats=None, links=None):
@@ -774,8 +783,8 @@ cdef class Example:
""" Split the token annotations into multiple Examples based on
sent_starts and return a list of the new Examples"""
s_example = Example(doc=None, doc_annotation=self.doc_annotation)
- s_ids, s_words, s_tags, s_heads = [], [], [], []
- s_deps, s_ents, s_morphs, s_lemmas, s_sent_starts = [], [], [], [], []
+ s_ids, s_words, s_tags, s_pos, s_morphs = [], [], [], [], []
+ s_lemmas, s_heads, s_deps, s_ents, s_sent_starts = [], [], [], [], []
s_brackets = []
sent_start_i = 0
t = self.token_annotation
@@ -783,31 +792,33 @@ cdef class Example:
for i in range(len(t.words)):
if i > 0 and t.sent_starts[i] == 1:
s_example.set_token_annotation(ids=s_ids,
- words=s_words, tags=s_tags, heads=s_heads, deps=s_deps,
- entities=s_ents, morphs=s_morphs, lemmas=s_lemmas,
- sent_starts=s_sent_starts, brackets=s_brackets)
+ words=s_words, tags=s_tags, pos=s_pos, morphs=s_morphs,
+ lemmas=s_lemmas, heads=s_heads, deps=s_deps,
+ entities=s_ents, sent_starts=s_sent_starts,
+ brackets=s_brackets)
split_examples.append(s_example)
s_example = Example(doc=None, doc_annotation=self.doc_annotation)
- s_ids, s_words, s_tags, s_heads = [], [], [], []
+ s_ids, s_words, s_tags, s_pos, s_heads = [], [], [], [], []
s_deps, s_ents, s_morphs, s_lemmas = [], [], [], []
s_sent_starts, s_brackets = [], []
sent_start_i = i
s_ids.append(t.get_id(i))
s_words.append(t.get_word(i))
s_tags.append(t.get_tag(i))
+ s_pos.append(t.get_pos(i))
+ s_morphs.append(t.get_morph(i))
+ s_lemmas.append(t.get_lemma(i))
s_heads.append(t.get_head(i) - sent_start_i)
s_deps.append(t.get_dep(i))
s_ents.append(t.get_entity(i))
- s_morphs.append(t.get_morph(i))
- s_lemmas.append(t.get_lemma(i))
s_sent_starts.append(t.get_sent_start(i))
s_brackets.extend((b[0] - sent_start_i,
b[1] - sent_start_i, b[2])
for b in t.brackets if b[0] == i)
i += 1
s_example.set_token_annotation(ids=s_ids, words=s_words, tags=s_tags,
- heads=s_heads, deps=s_deps, entities=s_ents,
- morphs=s_morphs, lemmas=s_lemmas, sent_starts=s_sent_starts,
+ pos=s_pos, morphs=s_morphs, lemmas=s_lemmas, heads=s_heads,
+ deps=s_deps, entities=s_ents, sent_starts=s_sent_starts,
brackets=s_brackets)
split_examples.append(s_example)
return split_examples
@@ -911,11 +922,12 @@ cdef class GoldParse:
def from_annotation(cls, doc, doc_annotation, token_annotation, make_projective=False):
return cls(doc, words=token_annotation.words,
tags=token_annotation.tags,
+ pos=token_annotation.pos,
+ morphs=token_annotation.morphs,
+ lemmas=token_annotation.lemmas,
heads=token_annotation.heads,
deps=token_annotation.deps,
entities=token_annotation.entities,
- morphs=token_annotation.morphs,
- lemmas=token_annotation.lemmas,
sent_starts=token_annotation.sent_starts,
cats=doc_annotation.cats,
links=doc_annotation.links,
@@ -927,18 +939,25 @@ cdef class GoldParse:
ids = list(range(len(self.words)))
return TokenAnnotation(ids=ids, words=self.words, tags=self.tags,
- heads=self.heads, deps=self.labels,
- entities=self.ner, morphs=self.morphs,
- sent_starts=self.sent_starts, lemmas=self.lemmas)
+ pos=self.pos, morphs=self.morphs,
+ lemmas=self.lemmas, heads=self.heads,
+ deps=self.labels, entities=self.ner,
+ sent_starts=self.sent_starts)
- def __init__(self, doc, words=None, tags=None, morphs=None, lemmas=None,
- sent_starts=None, heads=None, deps=None, entities=None,
- make_projective=False, cats=None, links=None):
+ def __init__(self, doc, words=None, tags=None, pos=None, morphs=None,
+ lemmas=None, heads=None, deps=None, entities=None,
+ sent_starts=None, make_projective=False, cats=None,
+ links=None):
"""Create a GoldParse. The fields will not be initialized if len(doc) is zero.
doc (Doc): The document the annotations refer to.
words (iterable): A sequence of unicode word strings.
tags (iterable): A sequence of strings, representing tag annotations.
+ pos (iterable): A sequence of strings, representing UPOS annotations.
+ morphs (iterable): A sequence of strings, representing morph
+ annotations.
+ lemmas (iterable): A sequence of strings, representing lemma
+ annotations.
heads (iterable): A sequence of integers, representing syntactic
head offsets.
deps (iterable): A sequence of strings, representing the syntactic
@@ -978,14 +997,16 @@ cdef class GoldParse:
words = [token.text for token in doc]
if not tags:
tags = [None for _ in words]
- if not heads:
- heads = [None for _ in words]
- if not deps:
- deps = [None for _ in words]
+ if not pos:
+ pos = [None for _ in words]
if not morphs:
morphs = [None for _ in words]
if not lemmas:
lemmas = [None for _ in words]
+ if not heads:
+ heads = [None for _ in words]
+ if not deps:
+ deps = [None for _ in words]
if not sent_starts:
sent_starts = [None for _ in words]
if entities is None:
@@ -1010,11 +1031,12 @@ cdef class GoldParse:
self.words = [None] * len(doc)
self.tags = [None] * len(doc)
+ self.pos = [None] * len(doc)
+ self.morphs = [None] * len(doc)
+ self.lemmas = [None] * len(doc)
self.heads = [None] * len(doc)
self.labels = [None] * len(doc)
self.ner = [None] * len(doc)
- self.morphs = [None] * len(doc)
- self.lemmas = [None] * len(doc)
self.sent_starts = [None] * len(doc)
# This needs to be done before we align the words
@@ -1034,24 +1056,26 @@ cdef class GoldParse:
self.gold_to_cand = [(i if i >= 0 else None) for i in j2i]
self.orig = TokenAnnotation(ids=list(range(len(words))),
- words=words, tags=tags, heads=heads, deps=deps,
- entities=entities, morphs=morphs, lemmas=lemmas,
+ words=words, tags=tags, pos=pos, morphs=morphs,
+ lemmas=lemmas, heads=heads, deps=deps, entities=entities,
sent_starts=sent_starts, brackets=[])
for i, gold_i in enumerate(self.cand_to_gold):
if doc[i].text.isspace():
self.words[i] = doc[i].text
self.tags[i] = "_SP"
+ self.pos[i] = "SPACE"
+ self.morphs[i] = None
+ self.lemmas[i] = None
self.heads[i] = None
self.labels[i] = None
self.ner[i] = None
- self.morphs[i] = set()
- self.lemmas[i] = None
self.sent_starts[i] = 0
if gold_i is None:
if i in i2j_multi:
self.words[i] = words[i2j_multi[i]]
self.tags[i] = tags[i2j_multi[i]]
+ self.pos[i] = pos[i2j_multi[i]]
self.morphs[i] = morphs[i2j_multi[i]]
self.lemmas[i] = lemmas[i2j_multi[i]]
self.sent_starts[i] = sent_starts[i2j_multi[i]]
@@ -1093,6 +1117,7 @@ cdef class GoldParse:
else:
self.words[i] = words[gold_i]
self.tags[i] = tags[gold_i]
+ self.pos[i] = pos[gold_i]
self.morphs[i] = morphs[gold_i]
self.lemmas[i] = lemmas[gold_i]
self.sent_starts[i] = sent_starts[gold_i]
@@ -1156,9 +1181,11 @@ def docs_to_json(docs, id=0, ner_missing_tag="O"):
json_sent = {"tokens": [], "brackets": []}
for token in sent:
json_token = {"id": token.i, "orth": token.text}
- json_token["lemma"] = token.lemma_
if doc.is_tagged:
json_token["tag"] = token.tag_
+ json_token["pos"] = token.pos_
+ json_token["morph"] = token.morph_
+ json_token["lemma"] = token.lemma_
if doc.is_parsed:
json_token["head"] = token.head.i-token.i
json_token["dep"] = token.dep_
diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py
index 46c54b879..7fe8aab73 100644
--- a/spacy/tests/test_gold.py
+++ b/spacy/tests/test_gold.py
@@ -1,12 +1,7 @@
from spacy.errors import AlignmentError
-from spacy.gold import (
- biluo_tags_from_offsets,
- offsets_from_biluo_tags,
- Example,
- DocAnnotation,
-)
-from spacy.gold import spans_from_biluo_tags, GoldParse, iob_to_biluo
-from spacy.gold import GoldCorpus, docs_to_json, align
+from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags
+from spacy.gold import spans_from_biluo_tags, GoldParse, iob_to_biluo, align
+from spacy.gold import GoldCorpus, docs_to_json, Example, DocAnnotation
from spacy.lang.en import English
from spacy.syntax.nonproj import is_nonproj_tree
from spacy.tokens import Doc
@@ -20,6 +15,30 @@ import srsly
def doc():
text = "Sarah's sister flew to Silicon Valley via London."
tags = ["NNP", "POS", "NN", "VBD", "IN", "NNP", "NNP", "IN", "NNP", "."]
+ pos = [
+ "PROPN",
+ "PART",
+ "NOUN",
+ "VERB",
+ "ADP",
+ "PROPN",
+ "PROPN",
+ "ADP",
+ "PROPN",
+ "PUNCT",
+ ]
+ morphs = [
+ "NounType=prop|Number=sing",
+ "Poss=yes",
+ "Number=sing",
+ "Tense=past|VerbForm=fin",
+ "",
+ "NounType=prop|Number=sing",
+ "NounType=prop|Number=sing",
+ "",
+ "NounType=prop|Number=sing",
+ "PunctType=peri",
+ ]
# head of '.' is intentionally nonprojective for testing
heads = [2, 0, 3, 3, 3, 6, 4, 3, 7, 5]
deps = [
@@ -52,9 +71,11 @@ def doc():
doc = nlp(text)
for i in range(len(tags)):
doc[i].tag_ = tags[i]
+ doc[i].pos_ = pos[i]
+ doc[i].morph_ = morphs[i]
+ doc[i].lemma_ = lemmas[i]
doc[i].dep_ = deps[i]
doc[i].head = doc[heads[i]]
- doc[i].lemma_ = lemmas[i]
doc.ents = spans_from_biluo_tags(doc, biluo_tags)
doc.cats = cats
doc.is_tagged = True
@@ -162,9 +183,11 @@ def test_roundtrip_docs_to_json(doc):
nlp = English()
text = doc.text
tags = [t.tag_ for t in doc]
+ pos = [t.pos_ for t in doc]
+ morphs = [t.morph_ for t in doc]
+ lemmas = [t.lemma_ for t in doc]
deps = [t.dep_ for t in doc]
heads = [t.head.i for t in doc]
- lemmas = [t.lemma_ for t in doc]
biluo_tags = iob_to_biluo(
[t.ent_iob_ + "-" + t.ent_type_ if t.ent_type_ else "O" for t in doc]
)
@@ -182,9 +205,11 @@ def test_roundtrip_docs_to_json(doc):
assert len(doc) == goldcorpus.count_train()
assert text == reloaded_example.text
assert tags == goldparse.tags
+ assert pos == goldparse.pos
+ assert morphs == goldparse.morphs
+ assert lemmas == goldparse.lemmas
assert deps == goldparse.labels
assert heads == goldparse.heads
- assert lemmas == goldparse.lemmas
assert biluo_tags == goldparse.ner
assert "TRAVEL" in goldparse.cats
assert "BAKING" in goldparse.cats
@@ -203,9 +228,11 @@ def test_roundtrip_docs_to_json(doc):
assert len(doc) == goldcorpus.count_train()
assert text == reloaded_example.text
assert tags == goldparse.tags
+ assert pos == goldparse.pos
+ assert morphs == goldparse.morphs
+ assert lemmas == goldparse.lemmas
assert deps == goldparse.labels
assert heads == goldparse.heads
- assert lemmas == goldparse.lemmas
assert biluo_tags == goldparse.ner
assert "TRAVEL" in goldparse.cats
assert "BAKING" in goldparse.cats
From 569cc9898200772cc894b1663eb03e9fd017c1c9 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem
Date: Wed, 29 Jan 2020 17:06:46 +0100
Subject: [PATCH 049/496] Update spaCy for thinc 8.0.0 (#4920)
* Add load_from_config function
* Add train_from_config script
* Merge configs and expose via spacy.config
* Fix script
* Suggest create_evaluation_callback
* Hard-code for NER
* Fix errors
* Register command
* Add TODO
* Update train-from-config todos
* Fix imports
* Allow delayed setting of parser model nr_class
* Get train-from-config working
* Tidy up and fix scores and printing
* Hide traceback if cancelled
* Fix weighted score formatting
* Fix score formatting
* Make output_path optional
* Add Tok2Vec component
* Tidy up and add tok2vec_tensors
* Add option to copy docs in nlp.update
* Copy docs in nlp.update
* Adjust nlp.update() for set_annotations
* Don't shuffle pipes in nlp.update, decruft
* Support set_annotations arg in component update
* Support set_annotations in parser update
* Add get_gradients method
* Add get_gradients to parser
* Update errors.py
* Fix problems caused by merge
* Add _link_components method in nlp
* Add concept of 'listeners' and ControlledModel
* Support optional attributes arg in ControlledModel
* Try having tok2vec component in pipeline
* Fix tok2vec component
* Fix config
* Fix tok2vec
* Update for Example
* Update for Example
* Update config
* Add eg2doc util
* Update and add schemas/types
* Update schemas
* Fix nlp.update
* Fix tagger
* Remove hacks from train-from-config
* Remove hard-coded config str
* Calculate loss in tok2vec component
* Tidy up and use function signatures instead of models
* Support union types for registry models
* Minor cleaning in Language.update
* Make ControlledModel specifically Tok2VecListener
* Fix train_from_config
* Fix tok2vec
* Tidy up
* Add function for bilstm tok2vec
* Fix type
* Fix syntax
* Fix pytorch optimizer
* Add example configs
* Update for thinc describe changes
* Update for Thinc changes
* Update for dropout/sgd changes
* Update for dropout/sgd changes
* Unhack gradient update
* Work on refactoring _ml
* Remove _ml.py module
* WIP upgrade cli scripts for thinc
* Move some _ml stuff to util
* Import link_vectors from util
* Update train_from_config
* Import from util
* Import from util
* Temporarily add ml.component_models module
* Move ml methods
* Move typedefs
* Update load vectors
* Update gitignore
* Move imports
* Add PrecomputableAffine
* Fix imports
* Fix imports
* Fix imports
* Fix missing imports
* Update CLI scripts
* Update spacy.language
* Add stubs for building the models
* Update model definition
* Update create_default_optimizer
* Fix import
* Fix comment
* Update imports in tests
* Update imports in spacy.cli
* Fix import
* fix obsolete thinc imports
* update srsly pin
* from thinc to ml_datasets for example data such as imdb
* update ml_datasets pin
* using STATE.vectors
* small fix
* fix Sentencizer.pipe
* black formatting
* rename Affine to Linear as in thinc
* set validate explicitly to True
* rename with_square_sequences to with_list2padded
* rename with_flatten to with_list2array
* chaining layernorm
* small fixes
* revert Optimizer import
* build_nel_encoder with new thinc style
* fixes using model's get and set methods
* Tok2Vec in component models, various fixes
* fix up legacy tok2vec code
* add model initialize calls
* add in build_tagger_model
* small fixes
* setting model dims
* fixes for ParserModel
* various small fixes
* initialize thinc Models
* fixes
* consistent naming of window_size
* fixes, removing set_dropout
* work around Iterable issue
* remove legacy tok2vec
* util fix
* fix forward function of tok2vec listener
* more fixes
* trying to fix PrecomputableAffine (not successful yet)
* alloc instead of allocate
* add morphologizer
* rename residual
* rename fixes
* Fix predict function
* Update parser and parser model
* fixing few more tests
* Fix precomputable affine
* Update component model
* Update parser model
* Move backprop padding to own function, for test
* Update test
* Fix p. affine
* Update NEL
* build_bow_text_classifier and extract_ngrams
* Fix parser init
* Fix test add label
* add build_simple_cnn_text_classifier
* Fix parser init
* Set gpu off by default in example
* Fix tok2vec listener
* Fix parser model
* Small fixes
* small fix for PyTorchLSTM parameters
* revert my_compounding hack (iterable fixed now)
* fix biLSTM
* Fix uniqued
* PyTorchRNNWrapper fix
* small fixes
* use helper function to calculate cosine loss
* small fixes for build_simple_cnn_text_classifier
* putting dropout default at 0.0 to ensure the layer gets built
* using thinc util's set_dropout_rate
* moving layer normalization inside of maxout definition to optimize dropout
* temp debugging in NEL
* fixed NEL model by using init defaults !
* fixing after set_dropout_rate refactor
* proper fix
* fix test_update_doc after refactoring optimizers in thinc
* Add CharacterEmbed layer
* Construct tagger Model
* Add missing import
* Remove unused stuff
* Work on textcat
* fix test (again :)) after optimizer refactor
* fixes to allow reading Tagger from_disk without overwriting dimensions
* don't build the tok2vec prematurely
* fix CharacterEmbed init
* CharacterEmbed fixes
* Fix CharacterEmbed architecture
* fix imports
* renames from latest thinc update
* one more rename
* add initialize calls where appropriate
* fix parser initialization
* Update Thinc version
* Fix errors, auto-format and tidy up imports
* Fix validation
* fix if bias is cupy array
* revert for now
* ensure it's a numpy array before running bp in ParserStepModel
* no reason to call require_gpu twice
* use CupyOps.to_numpy instead of cupy directly
* fix initialize of ParserModel
* remove unnecessary import
* fixes for CosineDistance
* fix device renaming
* use refactored loss functions (Thinc PR 251)
* overfitting test for tagger
* experimental settings for the tagger: avoid zero-init and subword normalization
* clean up tagger overfitting test
* use previous default value for nP
* remove toy config
* bringing layernorm back (had a bug - fixed in thinc)
* revert setting nP explicitly
* remove setting default in constructor
* restore values as they used to be
* add overfitting test for NER
* add overfitting test for dep parser
* add overfitting test for textcat
* fixing init for linear (previously affine)
* larger eps window for textcat
* ensure doc is not None
* Require newer thinc
* Make float check vaguer
* Slop the textcat overfit test more
* Fix textcat test
* Fix exclusive classes for textcat
* fix after renaming of alloc methods
* fixing renames and mandatory arguments (staticvectors WIP)
* upgrade to thinc==8.0.0.dev3
* refer to vocab.vectors directly instead of its name
* rename alpha to learn_rate
* adding hashembed and staticvectors dropout
* upgrade to thinc 8.0.0.dev4
* add name back to avoid warning W020
* thinc dev4
* update srsly
* using thinc 8.0.0a0 !
Co-authored-by: Matthew Honnibal
Co-authored-by: Ines Montani
---
.gitignore | 4 +
bin/wiki_entity_linking/train_descriptions.py | 29 +-
.../wikidata_train_entity_linker.py | 2 +-
examples/deep_learning_keras.py | 5 +-
.../ptb-joint-pos-dep/bilstm_tok2vec.cfg | 63 ++
.../ptb-joint-pos-dep/defaults.cfg | 65 ++
examples/pipeline/multi_processing.py | 5 +-
examples/training/pretrain_textcat.py | 24 +-
examples/training/rehearsal.py | 2 +-
examples/training/train_entity_linker.py | 7 +-
examples/training/train_textcat.py | 5 +-
requirements.txt | 7 +-
setup.cfg | 6 +-
spacy/__init__.py | 5 +-
spacy/__main__.py | 2 +
spacy/_ml.py | 982 ------------------
spacy/cli/__init__.py | 1 +
spacy/cli/pretrain.py | 33 +-
spacy/cli/profile.py | 4 +-
spacy/cli/train.py | 8 +-
spacy/cli/train_from_config.py | 445 ++++++++
spacy/compat.py | 7 +-
spacy/language.py | 75 +-
spacy/lexeme.pyx | 2 +-
spacy/ml/__init__.py | 2 -
spacy/ml/_character_embed.py | 52 +
spacy/ml/_layers.py | 165 +++
spacy/ml/_legacy_tok2vec.py | 129 ---
spacy/ml/_wire.py | 41 -
spacy/ml/common.py | 21 -
spacy/ml/component_models.py | 222 ++++
spacy/ml/extract_ngrams.py | 39 +
spacy/ml/tok2vec.py | 92 +-
spacy/pipeline/__init__.py | 2 +
spacy/pipeline/hooks.py | 14 +-
spacy/pipeline/morphologizer.pyx | 25 +-
spacy/pipeline/pipes.pyx | 324 ++++--
spacy/pipeline/tok2vec.py | 188 ++++
spacy/syntax/_beam_utils.pxd | 2 +-
spacy/syntax/_beam_utils.pyx | 2 +-
spacy/syntax/_parser_model.pxd | 2 +-
spacy/syntax/_parser_model.pyx | 228 ++--
spacy/syntax/arc_eager.pxd | 2 +-
spacy/syntax/ner.pyx | 2 +-
spacy/syntax/nn_parser.pxd | 2 -
spacy/syntax/nn_parser.pyx | 139 ++-
spacy/syntax/transition_system.pxd | 3 +-
spacy/syntax/transition_system.pyx | 2 +-
spacy/tests/parser/test_add_label.py | 10 +-
spacy/tests/parser/test_ner.py | 34 +-
spacy/tests/parser/test_neural_parser.py | 13 +-
spacy/tests/parser/test_parse.py | 42 +
spacy/tests/parser/test_preset_sbd.py | 6 +-
spacy/tests/pipeline/test_tagger.py | 33 +
spacy/tests/pipeline/test_textcat.py | 28 +
spacy/tests/regression/test_issue2501-3000.py | 2 +-
spacy/tests/regression/test_issue3611.py | 2 +-
spacy/tests/test_architectures.py | 4 +-
spacy/tests/test_misc.py | 31 +-
spacy/tests/test_tok2vec.py | 6 +-
spacy/tests/vocab_vectors/test_vectors.py | 5 +-
spacy/tokens/_retokenize.pyx | 2 +-
spacy/tokens/_serialize.py | 2 +-
spacy/tokens/doc.pyx | 2 +-
spacy/tokens/span.pyx | 2 +-
spacy/tokens/token.pyx | 2 +-
spacy/typedefs.pxd | 2 +
spacy/util.py | 88 +-
spacy/vectors.pyx | 8 +-
spacy/vocab.pyx | 4 +-
70 files changed, 2141 insertions(+), 1675 deletions(-)
create mode 100644 examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg
create mode 100644 examples/experiments/ptb-joint-pos-dep/defaults.cfg
create mode 100644 spacy/cli/train_from_config.py
create mode 100644 spacy/ml/_character_embed.py
create mode 100644 spacy/ml/_layers.py
delete mode 100644 spacy/ml/_legacy_tok2vec.py
delete mode 100644 spacy/ml/_wire.py
delete mode 100644 spacy/ml/common.py
create mode 100644 spacy/ml/component_models.py
create mode 100644 spacy/ml/extract_ngrams.py
create mode 100644 spacy/pipeline/tok2vec.py
diff --git a/.gitignore b/.gitignore
index c4ad59fc7..a0af6d4d2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -39,6 +39,7 @@ __pycache__/
.env*
.~env/
.venv
+env3.6/
venv/
.dev
.denv
@@ -111,3 +112,6 @@ Desktop.ini
# Pycharm project files
*.idea
+
+# IPython
+.ipynb_checkpoints/
diff --git a/bin/wiki_entity_linking/train_descriptions.py b/bin/wiki_entity_linking/train_descriptions.py
index af08d6b8f..d98bba565 100644
--- a/bin/wiki_entity_linking/train_descriptions.py
+++ b/bin/wiki_entity_linking/train_descriptions.py
@@ -4,12 +4,12 @@ from random import shuffle
import logging
import numpy as np
-from spacy._ml import zero_init, create_default_optimizer
-from spacy.cli.pretrain import get_cossim_loss
-
-from thinc.v2v import Model
+from thinc.model import Model
from thinc.api import chain
-from thinc.neural._classes.affine import Affine
+from thinc.loss import CosineDistance
+from thinc.layers import Linear
+
+from spacy.util import create_default_optimizer
logger = logging.getLogger(__name__)
@@ -34,6 +34,7 @@ class EntityEncoder:
self.input_dim = input_dim
self.desc_width = desc_width
self.epochs = epochs
+ self.distance = CosineDistance(ignore_zeros=True, normalize=False)
def apply_encoder(self, description_list):
if self.encoder is None:
@@ -132,21 +133,17 @@ class EntityEncoder:
def _build_network(self, orig_width, hidden_with):
with Model.define_operators({">>": chain}):
# very simple encoder-decoder model
- self.encoder = Affine(hidden_with, orig_width)
- self.model = self.encoder >> zero_init(
- Affine(orig_width, hidden_with, drop_factor=0.0)
- )
- self.sgd = create_default_optimizer(self.model.ops)
+ self.encoder = Linear(hidden_with, orig_width)
+ # TODO: removed the zero_init here - is that OK?
+ self.model = self.encoder >> Linear(orig_width, hidden_with)
+ self.sgd = create_default_optimizer()
def _update(self, vectors):
+ truths = self.model.ops.asarray(vectors)
predictions, bp_model = self.model.begin_update(
- np.asarray(vectors), drop=self.DROP
+ truths, drop=self.DROP
)
- loss, d_scores = self._get_loss(scores=predictions, golds=np.asarray(vectors))
+ d_scores, loss = self.distance(predictions, truths)
bp_model(d_scores, sgd=self.sgd)
return loss / len(vectors)
- @staticmethod
- def _get_loss(golds, scores):
- loss, gradients = get_cossim_loss(scores, golds)
- return loss, gradients
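A standalone sketch of the thinc loss object that the rewritten _update() relies on, using the same import path, keyword arguments and (gradient, loss) return order as the patch; the array shapes are arbitrary:

import numpy
from thinc.loss import CosineDistance

distance = CosineDistance(ignore_zeros=True, normalize=False)
predictions = numpy.random.uniform(-1, 1, (4, 64)).astype("f")
truths = numpy.random.uniform(-1, 1, (4, 64)).astype("f")
# Calling the loss returns the gradient first, then the scalar loss,
# which is why _update() can pass d_scores straight to bp_model().
d_scores, loss = distance(predictions, truths)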
diff --git a/bin/wiki_entity_linking/wikidata_train_entity_linker.py b/bin/wiki_entity_linking/wikidata_train_entity_linker.py
index 6b5f4c30d..f4a1b321d 100644
--- a/bin/wiki_entity_linking/wikidata_train_entity_linker.py
+++ b/bin/wiki_entity_linking/wikidata_train_entity_linker.py
@@ -103,7 +103,7 @@ def main(
logger.info("STEP 3: Creating and training an Entity Linking pipe")
el_pipe = nlp.create_pipe(
- name="entity_linker", config={"pretrained_vectors": nlp.vocab.vectors.name,
+ name="entity_linker", config={"pretrained_vectors": nlp.vocab.vectors,
"labels_discard": labels_discard}
)
el_pipe.set_kb(kb)
diff --git a/examples/deep_learning_keras.py b/examples/deep_learning_keras.py
index 049cc0be4..bf857b8b7 100644
--- a/examples/deep_learning_keras.py
+++ b/examples/deep_learning_keras.py
@@ -14,7 +14,7 @@ pip install keras==2.0.9
Compatible with: spaCy v2.0.0+
"""
-
+import ml_datasets
import plac
import random
import pathlib
@@ -24,7 +24,6 @@ from keras.models import Sequential, model_from_json
from keras.layers import LSTM, Dense, Embedding, Bidirectional
from keras.layers import TimeDistributed
from keras.optimizers import Adam
-import thinc.extra.datasets
from spacy.compat import pickle
import spacy
@@ -224,7 +223,7 @@ def main(
if model_dir is not None:
model_dir = pathlib.Path(model_dir)
if train_dir is None or dev_dir is None:
- imdb_data = thinc.extra.datasets.imdb()
+ imdb_data = ml_datasets.imdb()
if is_runtime:
if dev_dir is None:
dev_texts, dev_labels = zip(*imdb_data[1])
diff --git a/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg b/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg
new file mode 100644
index 000000000..8cd150868
--- /dev/null
+++ b/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg
@@ -0,0 +1,63 @@
+[training]
+patience = 10000
+eval_frequency = 200
+dropout = 0.2
+init_tok2vec = null
+vectors = null
+max_epochs = 100
+orth_variant_level = 0.0
+gold_preproc = true
+max_length = 0
+use_gpu = 0
+scores = ["tags_acc", "uas", "las"]
+score_weights = {"las": 0.8, "tags_acc": 0.2}
+limit = 0
+
+[training.batch_size]
+@schedules = "compounding.v1"
+start = 100
+stop = 1000
+compound = 1.001
+
+[optimizer]
+@optimizers = "Adam.v1"
+learn_rate = 0.001
+beta1 = 0.9
+beta2 = 0.999
+
+[nlp]
+lang = "en"
+vectors = ${training:vectors}
+
+[nlp.pipeline.tok2vec]
+factory = "tok2vec"
+
+[nlp.pipeline.tagger]
+factory = "tagger"
+
+[nlp.pipeline.parser]
+factory = "parser"
+
+[nlp.pipeline.tagger.model]
+@architectures = "tagger_model.v1"
+
+[nlp.pipeline.tagger.model.tok2vec]
+@architectures = "tok2vec_tensors.v1"
+width = ${nlp.pipeline.tok2vec.model:width}
+
+[nlp.pipeline.parser.model]
+@architectures = "transition_based_parser.v1"
+nr_feature_tokens = 8
+hidden_width = 64
+maxout_pieces = 3
+
+[nlp.pipeline.parser.model.tok2vec]
+@architectures = "tok2vec_tensors.v1"
+width = ${nlp.pipeline.tok2vec.model:width}
+
+[nlp.pipeline.tok2vec.model]
+@architectures = "hash_embed_bilstm.v1"
+pretrained_vectors = ${nlp:vectors}
+width = 96
+depth = 4
+embed_size = 2000
diff --git a/examples/experiments/ptb-joint-pos-dep/defaults.cfg b/examples/experiments/ptb-joint-pos-dep/defaults.cfg
new file mode 100644
index 000000000..6735284a7
--- /dev/null
+++ b/examples/experiments/ptb-joint-pos-dep/defaults.cfg
@@ -0,0 +1,65 @@
+[training]
+patience = 10000
+eval_frequency = 200
+dropout = 0.2
+init_tok2vec = null
+vectors = null
+max_epochs = 100
+orth_variant_level = 0.0
+gold_preproc = true
+max_length = 0
+use_gpu = -1
+scores = ["tags_acc", "uas", "las"]
+score_weights = {"las": 0.8, "tags_acc": 0.2}
+limit = 0
+
+[training.batch_size]
+@schedules = "compounding.v1"
+start = 100
+stop = 1000
+compound = 1.001
+
+[optimizer]
+@optimizers = "Adam.v1"
+learn_rate = 0.001
+beta1 = 0.9
+beta2 = 0.999
+
+[nlp]
+lang = "en"
+vectors = ${training:vectors}
+
+[nlp.pipeline.tok2vec]
+factory = "tok2vec"
+
+[nlp.pipeline.tagger]
+factory = "tagger"
+
+[nlp.pipeline.parser]
+factory = "parser"
+
+[nlp.pipeline.tagger.model]
+@architectures = "tagger_model.v1"
+
+[nlp.pipeline.tagger.model.tok2vec]
+@architectures = "tok2vec_tensors.v1"
+width = ${nlp.pipeline.tok2vec.model:width}
+
+[nlp.pipeline.parser.model]
+@architectures = "transition_based_parser.v1"
+nr_feature_tokens = 8
+hidden_width = 64
+maxout_pieces = 3
+
+[nlp.pipeline.parser.model.tok2vec]
+@architectures = "tok2vec_tensors.v1"
+width = ${nlp.pipeline.tok2vec.model:width}
+
+[nlp.pipeline.tok2vec.model]
+@architectures = "hash_embed_cnn.v1"
+pretrained_vectors = ${nlp:vectors}
+width = 96
+depth = 4
+window_size = 1
+embed_size = 2000
+maxout_pieces = 3
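The new config files use configparser-style sections whose values can refer back to other sections, which is how vectors = ${training:vectors} and width = ${nlp.pipeline.tok2vec.model:width} stay in sync. spaCy resolves these through thinc's registry-based config loader, but the cross-reference syntax itself is plain extended interpolation, so a minimal sketch with the standard library is enough to show the mechanism:

from configparser import ConfigParser, ExtendedInterpolation

cfg_text = """
[training]
vectors = null

[nlp]
vectors = ${training:vectors}

[nlp.pipeline.tok2vec.model]
width = 96

[nlp.pipeline.tagger.model.tok2vec]
width = ${nlp.pipeline.tok2vec.model:width}
"""

parser = ConfigParser(interpolation=ExtendedInterpolation())
parser.read_string(cfg_text)
assert parser["nlp"]["vectors"] == "null"                           # resolved by reference
assert parser["nlp.pipeline.tagger.model.tok2vec"]["width"] == "96"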
diff --git a/examples/pipeline/multi_processing.py b/examples/pipeline/multi_processing.py
index f0e437acf..e4aca7912 100644
--- a/examples/pipeline/multi_processing.py
+++ b/examples/pipeline/multi_processing.py
@@ -13,9 +13,10 @@ Prerequisites: pip install joblib
from __future__ import print_function, unicode_literals
from pathlib import Path
+
+import ml_datasets
from joblib import Parallel, delayed
from functools import partial
-import thinc.extra.datasets
import plac
import spacy
from spacy.util import minibatch
@@ -35,7 +36,7 @@ def main(output_dir, model="en_core_web_sm", n_jobs=4, batch_size=1000, limit=10
output_dir.mkdir()
# load and pre-process the IMBD dataset
print("Loading IMDB data...")
- data, _ = thinc.extra.datasets.imdb()
+ data, _ = ml_datasets.imdb()
texts, _ = zip(*data[-limit:])
print("Processing texts...")
partitions = minibatch(texts, size=batch_size)
diff --git a/examples/training/pretrain_textcat.py b/examples/training/pretrain_textcat.py
index 64f7002ef..f1cc2d3d2 100644
--- a/examples/training/pretrain_textcat.py
+++ b/examples/training/pretrain_textcat.py
@@ -16,16 +16,18 @@ the development labels, after all --- only the unlabelled text.
import plac
import tqdm
import random
+
+import ml_datasets
+
import spacy
-import thinc.extra.datasets
from spacy.util import minibatch, use_gpu, compounding
-from spacy._ml import Tok2Vec
from spacy.pipeline import TextCategorizer
+from spacy.ml.tok2vec import Tok2Vec
import numpy
def load_texts(limit=0):
- train, dev = thinc.extra.datasets.imdb()
+ train, dev = ml_datasets.imdb()
train_texts, train_labels = zip(*train)
dev_texts, dev_labels = zip(*train)
train_texts = list(train_texts)
@@ -41,7 +43,7 @@ def load_texts(limit=0):
def load_textcat_data(limit=0):
"""Load data from the IMDB dataset."""
# Partition off part of the train data for evaluation
- train_data, eval_data = thinc.extra.datasets.imdb()
+ train_data, eval_data = ml_datasets.imdb()
random.shuffle(train_data)
train_data = train_data[-limit:]
texts, labels = zip(*train_data)
@@ -63,17 +65,15 @@ def prefer_gpu():
def build_textcat_model(tok2vec, nr_class, width):
- from thinc.v2v import Model, Softmax, Maxout
- from thinc.api import flatten_add_lengths, chain
- from thinc.t2v import Pooling, sum_pool, mean_pool, max_pool
- from thinc.misc import Residual, LayerNorm
- from spacy._ml import logistic, zero_init
+ from thinc.model import Model
+ from thinc.layers import Softmax, chain, reduce_mean
+ from thinc.layers import list2ragged
with Model.define_operators({">>": chain}):
model = (
tok2vec
- >> flatten_add_lengths
- >> Pooling(mean_pool)
+ >> list2ragged()
+ >> reduce_mean()
>> Softmax(nr_class, width)
)
model.tok2vec = tok2vec
@@ -81,7 +81,7 @@ def build_textcat_model(tok2vec, nr_class, width):
def block_gradients(model):
- from thinc.api import wrap
+ from thinc.api import wrap # TODO FIX
def forward(X, drop=0.0):
Y, _ = model.begin_update(X, drop=drop)
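For orientation, a sketch of the thinc 8 combinator style the rewritten build_textcat_model uses, with the same import paths as the patch (later thinc releases expose these from thinc.api); the layer sizes and the tok2vec argument are placeholders:

from thinc.model import Model
from thinc.layers import Softmax, chain, reduce_mean, list2ragged

def toy_textcat_head(tok2vec, nr_class, width):
    # list of per-doc arrays -> Ragged -> mean-pooled doc vectors -> class scores
    with Model.define_operators({">>": chain}):
        return tok2vec >> list2ragged() >> reduce_mean() >> Softmax(nr_class, width)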
diff --git a/examples/training/rehearsal.py b/examples/training/rehearsal.py
index b08ba9f9a..98459cf03 100644
--- a/examples/training/rehearsal.py
+++ b/examples/training/rehearsal.py
@@ -58,7 +58,7 @@ def main(model_name, unlabelled_loc):
# yet, but I'm getting weird results from Adam. Try commenting out the
# nlp.update(), and using Adam -- you'll find the models drift apart.
# I guess Adam is losing precision, introducing gradient noise?
- optimizer.alpha = 0.1
+ optimizer.learn_rate = 0.1
optimizer.b1 = 0.0
optimizer.b2 = 0.0
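The optimizer changes shown above in a minimal sketch: create_default_optimizer now lives in spacy.util and takes no arguments, and what was optimizer.alpha under thinc 7 is set via learn_rate instead:

from spacy.util import create_default_optimizer  # moved out of spacy._ml by this patch

optimizer = create_default_optimizer()
optimizer.learn_rate = 0.1   # was optimizer.alpha with thinc 7
optimizer.b1 = 0.0
optimizer.b2 = 0.0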
diff --git a/examples/training/train_entity_linker.py b/examples/training/train_entity_linker.py
index f44c3b9cc..6e19848d3 100644
--- a/examples/training/train_entity_linker.py
+++ b/examples/training/train_entity_linker.py
@@ -17,7 +17,7 @@ import plac
import random
from pathlib import Path
-from spacy.symbols import PERSON
+import srsly
from spacy.vocab import Vocab
import spacy
@@ -68,7 +68,7 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
vocab = Vocab().from_disk(vocab_path)
# create blank Language class with correct vocab
nlp = spacy.blank("en", vocab=vocab)
- nlp.vocab.vectors.name = "spacy_pretrained_vectors"
+ nlp.vocab.vectors.name = "nel_vectors"
print("Created blank 'en' model with vocab from '%s'" % vocab_path)
# Add a sentencizer component. Alternatively, add a dependency parser for higher accuracy.
@@ -93,7 +93,7 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
nlp.add_pipe(entity_linker, last=True)
# Convert the texts to docs to make sure we have doc.ents set for the training examples.
- # Also ensure that the annotated examples correspond to known identifiers in the knowlege base.
+ # Also ensure that the annotated examples correspond to known identifiers in the knowledge base.
kb_ids = nlp.get_pipe("entity_linker").kb.get_entity_strings()
TRAIN_DOCS = []
for text, annotation in TRAIN_DATA:
@@ -117,6 +117,7 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
with nlp.disable_pipes(*other_pipes): # only train entity linker
# reset and initialize the weights randomly
optimizer = nlp.begin_training()
+
for itn in range(n_iter):
random.shuffle(TRAIN_DOCS)
losses = {}
diff --git a/examples/training/train_textcat.py b/examples/training/train_textcat.py
index 128773c0a..683ab1fc6 100644
--- a/examples/training/train_textcat.py
+++ b/examples/training/train_textcat.py
@@ -10,10 +10,11 @@ see the documentation:
Compatible with: spaCy v2.0.0+
"""
from __future__ import unicode_literals, print_function
+
+import ml_datasets
import plac
import random
from pathlib import Path
-import thinc.extra.datasets
import spacy
from spacy.util import minibatch, compounding
@@ -115,7 +116,7 @@ def main(model=None, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=None
def load_data(limit=0, split=0.8):
"""Load data from the IMDB dataset."""
# Partition off part of the train data for evaluation
- train_data, _ = thinc.extra.datasets.imdb()
+ train_data, _ = ml_datasets.imdb()
random.shuffle(train_data)
train_data = train_data[-limit:]
texts, labels = zip(*train_data)
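The example scripts above all swap thinc.extra.datasets for the new ml_datasets dependency; a short sketch of the loader they now share (the slice size is arbitrary):

import random
import ml_datasets

train_data, eval_data = ml_datasets.imdb()   # two lists of (text, label) pairs
random.shuffle(train_data)
texts, labels = zip(*train_data[:1000])      # small slice for a quick experiment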
diff --git a/requirements.txt b/requirements.txt
index 79a05b2bd..bb6bf9804 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,17 +1,20 @@
# Our libraries
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
-thinc==7.4.0.dev0
+thinc==8.0.0a0
blis>=0.4.0,<0.5.0
+ml_datasets>=0.1.1
murmurhash>=0.28.0,<1.1.0
wasabi>=0.4.0,<1.1.0
-srsly>=0.1.0,<1.1.0
+srsly>=2.0.0,<3.0.0
catalogue>=0.0.7,<1.1.0
# Third party dependencies
numpy>=1.15.0
requests>=2.13.0,<3.0.0
plac>=0.9.6,<1.2.0
tqdm>=4.38.0,<5.0.0
+# Optional dependencies
+jsonschema>=2.6.0,<3.1.0
pydantic>=1.0.0,<2.0.0
# Development dependencies
cython>=0.25
diff --git a/setup.cfg b/setup.cfg
index 9516a3dda..9ea85e896 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -35,16 +35,16 @@ setup_requires =
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
murmurhash>=0.28.0,<1.1.0
- thinc==7.4.0.dev0
+ thinc==8.0.0a0
install_requires =
# Our libraries
murmurhash>=0.28.0,<1.1.0
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
- thinc==7.4.0.dev0
+ thinc==8.0.0a0
blis>=0.4.0,<0.5.0
wasabi>=0.4.0,<1.1.0
- srsly>=0.1.0,<1.1.0
+ srsly>=2.0.0,<3.0.0
catalogue>=0.0.7,<1.1.0
# Third-party dependencies
setuptools
diff --git a/spacy/__init__.py b/spacy/__init__.py
index 49db0e3b5..4a311ec86 100644
--- a/spacy/__init__.py
+++ b/spacy/__init__.py
@@ -5,7 +5,7 @@ warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")
# These are imported as part of the API
-from thinc.neural.util import prefer_gpu, require_gpu
+from thinc.util import prefer_gpu, require_gpu
from . import pipeline
from .cli.info import info as cli_info
@@ -21,6 +21,9 @@ if sys.maxunicode == 65535:
raise SystemError(Errors.E130)
+config = registry
+
+
def load(name, **overrides):
depr_path = overrides.get("path")
if depr_path not in (True, False, None):
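Because prefer_gpu and require_gpu are simply re-imported from thinc.util here, the user-facing calls are unchanged; a quick sketch:

import spacy

activated = spacy.prefer_gpu()   # True if a GPU could be activated, False otherwise
nlp = spacy.blank("en")
doc = nlp("GPU selection happens before the pipeline is created.")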
diff --git a/spacy/__main__.py b/spacy/__main__.py
index 05e3d5e02..71ab1a91a 100644
--- a/spacy/__main__.py
+++ b/spacy/__main__.py
@@ -4,12 +4,14 @@ if __name__ == "__main__":
from wasabi import msg
from spacy.cli import download, link, info, package, train, pretrain, convert
from spacy.cli import init_model, profile, evaluate, validate, debug_data
+ from spacy.cli import train_from_config_cli
commands = {
"download": download,
"link": link,
"info": info,
"train": train,
+ "train-from-config": train_from_config_cli,
"pretrain": pretrain,
"debug-data": debug_data,
"evaluate": evaluate,
diff --git a/spacy/_ml.py b/spacy/_ml.py
index 37cfff0b7..e69de29bb 100644
--- a/spacy/_ml.py
+++ b/spacy/_ml.py
@@ -1,982 +0,0 @@
-import numpy
-from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu
-from thinc.t2t import ExtractWindow, ParametricAttention
-from thinc.t2v import Pooling, sum_pool, mean_pool
-from thinc.i2v import HashEmbed
-from thinc.misc import Residual, FeatureExtracter
-from thinc.misc import LayerNorm as LN
-from thinc.api import add, layerize, chain, clone, concatenate, with_flatten
-from thinc.api import with_getitem, flatten_add_lengths
-from thinc.api import uniqued, wrap, noop
-from thinc.linear.linear import LinearModel
-from thinc.neural.ops import NumpyOps, CupyOps
-from thinc.neural.util import get_array_module, copy_array
-from thinc.neural.optimizers import Adam
-
-from thinc import describe
-from thinc.describe import Dimension, Synapses, Biases, Gradient
-from thinc.neural._classes.affine import _set_dimensions_if_needed
-import thinc.extra.load_nlp
-
-from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE
-from .errors import Errors, user_warning, Warnings
-from . import util
-from . import ml as new_ml
-from .ml import _legacy_tok2vec
-
-
-VECTORS_KEY = "spacy_pretrained_vectors"
-# Backwards compatibility with <2.2.2
-USE_MODEL_REGISTRY_TOK2VEC = False
-
-
-def cosine(vec1, vec2):
- xp = get_array_module(vec1)
- norm1 = xp.linalg.norm(vec1)
- norm2 = xp.linalg.norm(vec2)
- if norm1 == 0.0 or norm2 == 0.0:
- return 0
- else:
- return vec1.dot(vec2) / (norm1 * norm2)
-
-
-def create_default_optimizer(ops, **cfg):
- learn_rate = util.env_opt("learn_rate", 0.001)
- beta1 = util.env_opt("optimizer_B1", 0.9)
- beta2 = util.env_opt("optimizer_B2", 0.999)
- eps = util.env_opt("optimizer_eps", 1e-8)
- L2 = util.env_opt("L2_penalty", 1e-6)
- max_grad_norm = util.env_opt("grad_norm_clip", 1.0)
- optimizer = Adam(ops, learn_rate, L2=L2, beta1=beta1, beta2=beta2, eps=eps)
- optimizer.max_grad_norm = max_grad_norm
- optimizer.device = ops.device
- return optimizer
-
-
-@layerize
-def _flatten_add_lengths(seqs, pad=0, drop=0.0):
- ops = Model.ops
- lengths = ops.asarray([len(seq) for seq in seqs], dtype="i")
-
- def finish_update(d_X, sgd=None):
- return ops.unflatten(d_X, lengths, pad=pad)
-
- X = ops.flatten(seqs, pad=pad)
- return (X, lengths), finish_update
-
-
-def _zero_init(model):
- def _zero_init_impl(self, *args, **kwargs):
- self.W.fill(0)
-
- model.on_init_hooks.append(_zero_init_impl)
- if model.W is not None:
- model.W.fill(0.0)
- return model
-
-
-def with_cpu(ops, model):
- """Wrap a model that should run on CPU, transferring inputs and outputs
- as necessary."""
- model.to_cpu()
-
- def with_cpu_forward(inputs, drop=0.0):
- cpu_outputs, backprop = model.begin_update(_to_cpu(inputs), drop=drop)
- gpu_outputs = _to_device(ops, cpu_outputs)
-
- def with_cpu_backprop(d_outputs, sgd=None):
- cpu_d_outputs = _to_cpu(d_outputs)
- return backprop(cpu_d_outputs, sgd=sgd)
-
- return gpu_outputs, with_cpu_backprop
-
- return wrap(with_cpu_forward, model)
-
-
-def _to_cpu(X):
- if isinstance(X, numpy.ndarray):
- return X
- elif isinstance(X, tuple):
- return tuple([_to_cpu(x) for x in X])
- elif isinstance(X, list):
- return [_to_cpu(x) for x in X]
- elif hasattr(X, "get"):
- return X.get()
- else:
- return X
-
-
-def _to_device(ops, X):
- if isinstance(X, tuple):
- return tuple([_to_device(ops, x) for x in X])
- elif isinstance(X, list):
- return [_to_device(ops, x) for x in X]
- else:
- return ops.asarray(X)
-
-
-class extract_ngrams(Model):
- def __init__(self, ngram_size, attr=LOWER):
- Model.__init__(self)
- self.ngram_size = ngram_size
- self.attr = attr
-
- def begin_update(self, docs, drop=0.0):
- batch_keys = []
- batch_vals = []
- for doc in docs:
- unigrams = doc.to_array([self.attr])
- ngrams = [unigrams]
- for n in range(2, self.ngram_size + 1):
- ngrams.append(self.ops.ngrams(n, unigrams))
- keys = self.ops.xp.concatenate(ngrams)
- keys, vals = self.ops.xp.unique(keys, return_counts=True)
- batch_keys.append(keys)
- batch_vals.append(vals)
- # The dtype here matches what thinc is expecting -- which differs per
- # platform (by int definition). This should be fixed once the problem
- # is fixed on Thinc's side.
- lengths = self.ops.asarray(
- [arr.shape[0] for arr in batch_keys], dtype=numpy.int_
- )
- batch_keys = self.ops.xp.concatenate(batch_keys)
- batch_vals = self.ops.asarray(self.ops.xp.concatenate(batch_vals), dtype="f")
- return (batch_keys, batch_vals, lengths), None
-
-
-@describe.on_data(
- _set_dimensions_if_needed, lambda model, X, y: model.init_weights(model)
-)
-@describe.attributes(
- nI=Dimension("Input size"),
- nF=Dimension("Number of features"),
- nO=Dimension("Output size"),
- nP=Dimension("Maxout pieces"),
- W=Synapses("Weights matrix", lambda obj: (obj.nF, obj.nO, obj.nP, obj.nI)),
- b=Biases("Bias vector", lambda obj: (obj.nO, obj.nP)),
- pad=Synapses(
- "Pad",
- lambda obj: (1, obj.nF, obj.nO, obj.nP),
- lambda M, ops: ops.normal_init(M, 1.0),
- ),
- d_W=Gradient("W"),
- d_pad=Gradient("pad"),
- d_b=Gradient("b"),
-)
-class PrecomputableAffine(Model):
- def __init__(self, nO=None, nI=None, nF=None, nP=None, **kwargs):
- Model.__init__(self, **kwargs)
- self.nO = nO
- self.nP = nP
- self.nI = nI
- self.nF = nF
-
- def begin_update(self, X, drop=0.0):
- Yf = self.ops.gemm(
- X, self.W.reshape((self.nF * self.nO * self.nP, self.nI)), trans2=True
- )
- Yf = Yf.reshape((Yf.shape[0], self.nF, self.nO, self.nP))
- Yf = self._add_padding(Yf)
-
- def backward(dY_ids, sgd=None):
- dY, ids = dY_ids
- dY, ids = self._backprop_padding(dY, ids)
- Xf = X[ids]
- Xf = Xf.reshape((Xf.shape[0], self.nF * self.nI))
-
- self.d_b += dY.sum(axis=0)
- dY = dY.reshape((dY.shape[0], self.nO * self.nP))
-
- Wopfi = self.W.transpose((1, 2, 0, 3))
- Wopfi = self.ops.xp.ascontiguousarray(Wopfi)
- Wopfi = Wopfi.reshape((self.nO * self.nP, self.nF * self.nI))
- dXf = self.ops.gemm(dY.reshape((dY.shape[0], self.nO * self.nP)), Wopfi)
-
- # Reuse the buffer
- dWopfi = Wopfi
- dWopfi.fill(0.0)
- self.ops.gemm(dY, Xf, out=dWopfi, trans1=True)
- dWopfi = dWopfi.reshape((self.nO, self.nP, self.nF, self.nI))
- # (o, p, f, i) --> (f, o, p, i)
- self.d_W += dWopfi.transpose((2, 0, 1, 3))
-
- if sgd is not None:
- sgd(self._mem.weights, self._mem.gradient, key=self.id)
- return dXf.reshape((dXf.shape[0], self.nF, self.nI))
-
- return Yf, backward
-
- def _add_padding(self, Yf):
- Yf_padded = self.ops.xp.vstack((self.pad, Yf))
- return Yf_padded
-
- def _backprop_padding(self, dY, ids):
- # (1, nF, nO, nP) += (nN, nF, nO, nP) where IDs (nN, nF) < 0
- mask = ids < 0.0
- mask = mask.sum(axis=1)
- d_pad = dY * mask.reshape((ids.shape[0], 1, 1))
- self.d_pad += d_pad.sum(axis=0)
- return dY, ids
-
- @staticmethod
- def init_weights(model):
- """This is like the 'layer sequential unit variance', but instead
- of taking the actual inputs, we randomly generate whitened data.
-
- Why's this all so complicated? We have a huge number of inputs,
- and the maxout unit makes guessing the dynamics tricky. Instead
- we set the maxout weights to values that empirically result in
- whitened outputs given whitened inputs.
- """
- if (model.W ** 2).sum() != 0.0:
- return
- ops = model.ops
- xp = ops.xp
- ops.normal_init(model.W, model.nF * model.nI, inplace=True)
-
- ids = ops.allocate((5000, model.nF), dtype="f")
- ids += xp.random.uniform(0, 1000, ids.shape)
- ids = ops.asarray(ids, dtype="i")
- tokvecs = ops.allocate((5000, model.nI), dtype="f")
- tokvecs += xp.random.normal(loc=0.0, scale=1.0, size=tokvecs.size).reshape(
- tokvecs.shape
- )
-
- def predict(ids, tokvecs):
- # nS ids. nW tokvecs. Exclude the padding array.
- hiddens = model(tokvecs[:-1]) # (nW, f, o, p)
- vectors = model.ops.allocate((ids.shape[0], model.nO * model.nP), dtype="f")
- # need nS vectors
- hiddens = hiddens.reshape(
- (hiddens.shape[0] * model.nF, model.nO * model.nP)
- )
- model.ops.scatter_add(vectors, ids.flatten(), hiddens)
- vectors = vectors.reshape((vectors.shape[0], model.nO, model.nP))
- vectors += model.b
- vectors = model.ops.asarray(vectors)
- if model.nP >= 2:
- return model.ops.maxout(vectors)[0]
- else:
- return vectors * (vectors >= 0)
-
- tol_var = 0.01
- tol_mean = 0.01
- t_max = 10
- t_i = 0
- for t_i in range(t_max):
- acts1 = predict(ids, tokvecs)
- var = model.ops.xp.var(acts1)
- mean = model.ops.xp.mean(acts1)
- if abs(var - 1.0) >= tol_var:
- model.W /= model.ops.xp.sqrt(var)
- elif abs(mean) >= tol_mean:
- model.b -= mean
- else:
- break
-
-
-def link_vectors_to_models(vocab):
- vectors = vocab.vectors
- if vectors.name is None:
- vectors.name = VECTORS_KEY
- if vectors.data.size != 0:
- user_warning(Warnings.W020.format(shape=vectors.data.shape))
- ops = Model.ops
- for word in vocab:
- if word.orth in vectors.key2row:
- word.rank = vectors.key2row[word.orth]
- else:
- word.rank = 0
- data = ops.asarray(vectors.data)
- # Set an entry here, so that vectors are accessed by StaticVectors
- # (unideal, I know)
- key = (ops.device, vectors.name)
- if key in thinc.extra.load_nlp.VECTORS:
- if thinc.extra.load_nlp.VECTORS[key].shape != data.shape:
- # This is a hack to avoid the problem in #3853. Maybe we should
- # print a warning as well?
- old_name = vectors.name
- new_name = f"{vectors.name}_{data.shape[0]}"
- user_warning(Warnings.W019.format(old=old_name, new=new_name))
- vectors.name = new_name
- key = (ops.device, vectors.name)
- thinc.extra.load_nlp.VECTORS[key] = data
-
-
-def PyTorchBiLSTM(nO, nI, depth, dropout=0.2):
- import torch.nn
- from thinc.api import with_square_sequences
- from thinc.extra.wrappers import PyTorchWrapperRNN
-
- if depth == 0:
- return layerize(noop())
- model = torch.nn.LSTM(nI, nO // 2, depth, bidirectional=True, dropout=dropout)
- return with_square_sequences(PyTorchWrapperRNN(model))
-
-
-def Tok2Vec(width, embed_size, **kwargs):
- if not USE_MODEL_REGISTRY_TOK2VEC:
- # Preserve prior tok2vec for backwards compat, in v2.2.2
- return _legacy_tok2vec.Tok2Vec(width, embed_size, **kwargs)
- pretrained_vectors = kwargs.get("pretrained_vectors", None)
- cnn_maxout_pieces = kwargs.get("cnn_maxout_pieces", 3)
- subword_features = kwargs.get("subword_features", True)
- char_embed = kwargs.get("char_embed", False)
- conv_depth = kwargs.get("conv_depth", 4)
- bilstm_depth = kwargs.get("bilstm_depth", 0)
- conv_window = kwargs.get("conv_window", 1)
-
- cols = ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"]
-
- doc2feats_cfg = {"arch": "spacy.Doc2Feats.v1", "config": {"columns": cols}}
- if char_embed:
- embed_cfg = {
- "arch": "spacy.CharacterEmbed.v1",
- "config": {
- "width": 64,
- "chars": 6,
- "@mix": {
- "arch": "spacy.LayerNormalizedMaxout.v1",
- "config": {"width": width, "pieces": 3},
- },
- "@embed_features": None,
- },
- }
- else:
- embed_cfg = {
- "arch": "spacy.MultiHashEmbed.v1",
- "config": {
- "width": width,
- "rows": embed_size,
- "columns": cols,
- "use_subwords": subword_features,
- "@pretrained_vectors": None,
- "@mix": {
- "arch": "spacy.LayerNormalizedMaxout.v1",
- "config": {"width": width, "pieces": 3},
- },
- },
- }
- if pretrained_vectors:
- embed_cfg["config"]["@pretrained_vectors"] = {
- "arch": "spacy.PretrainedVectors.v1",
- "config": {
- "vectors_name": pretrained_vectors,
- "width": width,
- "column": cols.index("ID"),
- },
- }
- if cnn_maxout_pieces >= 2:
- cnn_cfg = {
- "arch": "spacy.MaxoutWindowEncoder.v1",
- "config": {
- "width": width,
- "window_size": conv_window,
- "pieces": cnn_maxout_pieces,
- "depth": conv_depth,
- },
- }
- else:
- cnn_cfg = {
- "arch": "spacy.MishWindowEncoder.v1",
- "config": {"width": width, "window_size": conv_window, "depth": conv_depth},
- }
- bilstm_cfg = {
- "arch": "spacy.TorchBiLSTMEncoder.v1",
- "config": {"width": width, "depth": bilstm_depth},
- }
- if conv_depth == 0 and bilstm_depth == 0:
- encode_cfg = {}
- elif conv_depth >= 1 and bilstm_depth >= 1:
- encode_cfg = {
- "arch": "thinc.FeedForward.v1",
- "config": {"children": [cnn_cfg, bilstm_cfg]},
- }
- elif conv_depth >= 1:
- encode_cfg = cnn_cfg
- else:
- encode_cfg = bilstm_cfg
- config = {"@doc2feats": doc2feats_cfg, "@embed": embed_cfg, "@encode": encode_cfg}
- return new_ml.Tok2Vec(config)
-
-
-def reapply(layer, n_times):
- def reapply_fwd(X, drop=0.0):
- backprops = []
- for i in range(n_times):
- Y, backprop = layer.begin_update(X, drop=drop)
- X = Y
- backprops.append(backprop)
-
- def reapply_bwd(dY, sgd=None):
- dX = None
- for backprop in reversed(backprops):
- dY = backprop(dY, sgd=sgd)
- if dX is None:
- dX = dY
- else:
- dX += dY
- return dX
-
- return Y, reapply_bwd
-
- return wrap(reapply_fwd, layer)
-
-
-def asarray(ops, dtype):
- def forward(X, drop=0.0):
- return ops.asarray(X, dtype=dtype), None
-
- return layerize(forward)
-
-
-def _divide_array(X, size):
- parts = []
- index = 0
- while index < len(X):
- parts.append(X[index : index + size])
- index += size
- return parts
-
-
-def get_col(idx):
- if idx < 0:
- raise IndexError(Errors.E066.format(value=idx))
-
- def forward(X, drop=0.0):
- if isinstance(X, numpy.ndarray):
- ops = NumpyOps()
- else:
- ops = CupyOps()
- output = ops.xp.ascontiguousarray(X[:, idx], dtype=X.dtype)
-
- def backward(y, sgd=None):
- dX = ops.allocate(X.shape)
- dX[:, idx] += y
- return dX
-
- return output, backward
-
- return layerize(forward)
-
-
-def doc2feats(cols=None):
- if cols is None:
- cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
-
- def forward(docs, drop=0.0):
- feats = []
- for doc in docs:
- feats.append(doc.to_array(cols))
- return feats, None
-
- model = layerize(forward)
- model.cols = cols
- return model
-
-
-def print_shape(prefix):
- def forward(X, drop=0.0):
- return X, lambda dX, **kwargs: dX
-
- return layerize(forward)
-
-
-@layerize
-def get_token_vectors(tokens_attrs_vectors, drop=0.0):
- tokens, attrs, vectors = tokens_attrs_vectors
-
- def backward(d_output, sgd=None):
- return (tokens, d_output)
-
- return vectors, backward
-
-
-@layerize
-def logistic(X, drop=0.0):
- xp = get_array_module(X)
- if not isinstance(X, xp.ndarray):
- X = xp.asarray(X)
- # Clip to range (-10, 10)
- X = xp.minimum(X, 10.0, X)
- X = xp.maximum(X, -10.0, X)
- Y = 1.0 / (1.0 + xp.exp(-X))
-
- def logistic_bwd(dY, sgd=None):
- dX = dY * (Y * (1 - Y))
- return dX
-
- return Y, logistic_bwd
-
-
-def zero_init(model):
- def _zero_init_impl(self, X, y):
- self.W.fill(0)
-
- model.on_data_hooks.append(_zero_init_impl)
- return model
-
-
-def getitem(i):
- def getitem_fwd(X, drop=0.0):
- return X[i], None
-
- return layerize(getitem_fwd)
-
-
-@describe.attributes(
- W=Synapses("Weights matrix", lambda obj: (obj.nO, obj.nI), lambda W, ops: None)
-)
-class MultiSoftmax(Affine):
- """Neural network layer that predicts several multi-class attributes at once.
- For instance, we might predict one class with 6 variables, and another with 5.
- We predict the 11 neurons required for this, and then softmax them such
- that columns 0-6 make a probability distribution and columns 6-11 make another.
- """
-
- name = "multisoftmax"
-
- def __init__(self, out_sizes, nI=None, **kwargs):
- Model.__init__(self, **kwargs)
- self.out_sizes = out_sizes
- self.nO = sum(out_sizes)
- self.nI = nI
-
- def predict(self, input__BI):
- output__BO = self.ops.affine(self.W, self.b, input__BI)
- i = 0
- for out_size in self.out_sizes:
- self.ops.softmax(output__BO[:, i : i + out_size], inplace=True)
- i += out_size
- return output__BO
-
- def begin_update(self, input__BI, drop=0.0):
- output__BO = self.predict(input__BI)
-
- def finish_update(grad__BO, sgd=None):
- self.d_W += self.ops.gemm(grad__BO, input__BI, trans1=True)
- self.d_b += grad__BO.sum(axis=0)
- grad__BI = self.ops.gemm(grad__BO, self.W)
- if sgd is not None:
- sgd(self._mem.weights, self._mem.gradient, key=self.id)
- return grad__BI
-
- return output__BO, finish_update
-
-
-def build_tagger_model(nr_class, **cfg):
- embed_size = util.env_opt("embed_size", 2000)
- if "token_vector_width" in cfg:
- token_vector_width = cfg["token_vector_width"]
- else:
- token_vector_width = util.env_opt("token_vector_width", 96)
- pretrained_vectors = cfg.get("pretrained_vectors")
- subword_features = cfg.get("subword_features", True)
- with Model.define_operators({">>": chain, "+": add}):
- if "tok2vec" in cfg:
- tok2vec = cfg["tok2vec"]
- else:
- tok2vec = Tok2Vec(
- token_vector_width,
- embed_size,
- subword_features=subword_features,
- pretrained_vectors=pretrained_vectors,
- )
- softmax = with_flatten(Softmax(nr_class, token_vector_width))
- model = tok2vec >> softmax
- model.nI = None
- model.tok2vec = tok2vec
- model.softmax = softmax
- return model
-
-
-def build_morphologizer_model(class_nums, **cfg):
- embed_size = util.env_opt("embed_size", 7000)
- if "token_vector_width" in cfg:
- token_vector_width = cfg["token_vector_width"]
- else:
- token_vector_width = util.env_opt("token_vector_width", 128)
- pretrained_vectors = cfg.get("pretrained_vectors")
- char_embed = cfg.get("char_embed", True)
- with Model.define_operators({">>": chain, "+": add, "**": clone}):
- if "tok2vec" in cfg:
- tok2vec = cfg["tok2vec"]
- else:
- tok2vec = Tok2Vec(
- token_vector_width,
- embed_size,
- char_embed=char_embed,
- pretrained_vectors=pretrained_vectors,
- )
- softmax = with_flatten(MultiSoftmax(class_nums, token_vector_width))
- softmax.out_sizes = class_nums
- model = tok2vec >> softmax
- model.nI = None
- model.tok2vec = tok2vec
- model.softmax = softmax
- return model
-
-
-@layerize
-def SpacyVectors(docs, drop=0.0):
- batch = []
- for doc in docs:
- indices = numpy.zeros((len(doc),), dtype="i")
- for i, word in enumerate(doc):
- if word.orth in doc.vocab.vectors.key2row:
- indices[i] = doc.vocab.vectors.key2row[word.orth]
- else:
- indices[i] = 0
- vectors = doc.vocab.vectors.data[indices]
- batch.append(vectors)
- return batch, None
-
-
-def build_text_classifier(nr_class, width=64, **cfg):
- depth = cfg.get("depth", 2)
- nr_vector = cfg.get("nr_vector", 5000)
- pretrained_dims = cfg.get("pretrained_dims", 0)
- with Model.define_operators({">>": chain, "+": add, "|": concatenate, "**": clone}):
- if cfg.get("low_data") and pretrained_dims:
- model = (
- SpacyVectors
- >> flatten_add_lengths
- >> with_getitem(0, Affine(width, pretrained_dims))
- >> ParametricAttention(width)
- >> Pooling(sum_pool)
- >> Residual(ReLu(width, width)) ** 2
- >> zero_init(Affine(nr_class, width, drop_factor=0.0))
- >> logistic
- )
- return model
-
- lower = HashEmbed(width, nr_vector, column=1)
- prefix = HashEmbed(width // 2, nr_vector, column=2)
- suffix = HashEmbed(width // 2, nr_vector, column=3)
- shape = HashEmbed(width // 2, nr_vector, column=4)
-
- trained_vectors = FeatureExtracter(
- [ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID]
- ) >> with_flatten(
- uniqued(
- (lower | prefix | suffix | shape)
- >> LN(Maxout(width, width + (width // 2) * 3)),
- column=0,
- )
- )
-
- if pretrained_dims:
- static_vectors = SpacyVectors >> with_flatten(
- Affine(width, pretrained_dims)
- )
- # TODO Make concatenate support lists
- vectors = concatenate_lists(trained_vectors, static_vectors)
- vectors_width = width * 2
- else:
- vectors = trained_vectors
- vectors_width = width
- static_vectors = None
- tok2vec = vectors >> with_flatten(
- LN(Maxout(width, vectors_width))
- >> Residual((ExtractWindow(nW=1) >> LN(Maxout(width, width * 3)))) ** depth,
- pad=depth,
- )
- cnn_model = (
- tok2vec
- >> flatten_add_lengths
- >> ParametricAttention(width)
- >> Pooling(sum_pool)
- >> Residual(zero_init(Maxout(width, width)))
- >> zero_init(Affine(nr_class, width, drop_factor=0.0))
- )
-
- linear_model = build_bow_text_classifier(
- nr_class, ngram_size=cfg.get("ngram_size", 1), exclusive_classes=False
- )
- if cfg.get("exclusive_classes"):
- output_layer = Softmax(nr_class, nr_class * 2)
- else:
- output_layer = (
- zero_init(Affine(nr_class, nr_class * 2, drop_factor=0.0)) >> logistic
- )
- model = (linear_model | cnn_model) >> output_layer
- model.tok2vec = chain(tok2vec, flatten)
- model.nO = nr_class
- model.lsuv = False
- return model
-
-
-def build_bow_text_classifier(
- nr_class, ngram_size=1, exclusive_classes=False, no_output_layer=False, **cfg
-):
- with Model.define_operators({">>": chain}):
- model = with_cpu(
- Model.ops, extract_ngrams(ngram_size, attr=ORTH) >> LinearModel(nr_class)
- )
- if not no_output_layer:
- model = model >> (cpu_softmax if exclusive_classes else logistic)
- model.nO = nr_class
- return model
-
-
-@layerize
-def cpu_softmax(X, drop=0.0):
- ops = NumpyOps()
-
- def cpu_softmax_backward(dY, sgd=None):
- return dY
-
- return ops.softmax(X), cpu_softmax_backward
-
-
-def build_simple_cnn_text_classifier(tok2vec, nr_class, exclusive_classes=False, **cfg):
- """
- Build a simple CNN text classifier, given a token-to-vector model as inputs.
- If exclusive_classes=True, a softmax non-linearity is applied, so that the
- outputs sum to 1. If exclusive_classes=False, a logistic non-linearity
- is applied instead, so that outputs are in the range [0, 1].
- """
- with Model.define_operators({">>": chain}):
- if exclusive_classes:
- output_layer = Softmax(nr_class, tok2vec.nO)
- else:
- output_layer = (
- zero_init(Affine(nr_class, tok2vec.nO, drop_factor=0.0)) >> logistic
- )
- model = tok2vec >> flatten_add_lengths >> Pooling(mean_pool) >> output_layer
- model.tok2vec = chain(tok2vec, flatten)
- model.nO = nr_class
- return model
-
-
-def build_nel_encoder(embed_width, hidden_width, ner_types, **cfg):
- if "entity_width" not in cfg:
- raise ValueError(Errors.E144.format(param="entity_width"))
-
- conv_depth = cfg.get("conv_depth", 2)
- cnn_maxout_pieces = cfg.get("cnn_maxout_pieces", 3)
- pretrained_vectors = cfg.get("pretrained_vectors", None)
- context_width = cfg.get("entity_width")
-
- with Model.define_operators({">>": chain, "**": clone}):
- # context encoder
- tok2vec = Tok2Vec(
- width=hidden_width,
- embed_size=embed_width,
- pretrained_vectors=pretrained_vectors,
- cnn_maxout_pieces=cnn_maxout_pieces,
- subword_features=True,
- conv_depth=conv_depth,
- bilstm_depth=0,
- )
-
- model = (
- tok2vec
- >> flatten_add_lengths
- >> Pooling(mean_pool)
- >> Residual(zero_init(Maxout(hidden_width, hidden_width)))
- >> zero_init(Affine(context_width, hidden_width, drop_factor=0.0))
- )
-
- model.tok2vec = tok2vec
- model.nO = context_width
- return model
-
-
-@layerize
-def flatten(seqs, drop=0.0):
- ops = Model.ops
- lengths = ops.asarray([len(seq) for seq in seqs], dtype="i")
-
- def finish_update(d_X, sgd=None):
- return ops.unflatten(d_X, lengths, pad=0)
-
- X = ops.flatten(seqs, pad=0)
- return X, finish_update
-
-
-def concatenate_lists(*layers, **kwargs): # pragma: no cover
- """Compose two or more models `f`, `g`, etc, such that their outputs are
- concatenated, i.e. `concatenate(f, g)(x)` computes `hstack(f(x), g(x))`
- """
- if not layers:
- return noop()
- drop_factor = kwargs.get("drop_factor", 1.0)
- ops = layers[0].ops
- layers = [chain(layer, flatten) for layer in layers]
- concat = concatenate(*layers)
-
- def concatenate_lists_fwd(Xs, drop=0.0):
- if drop is not None:
- drop *= drop_factor
- lengths = ops.asarray([len(X) for X in Xs], dtype="i")
- flat_y, bp_flat_y = concat.begin_update(Xs, drop=drop)
- ys = ops.unflatten(flat_y, lengths)
-
- def concatenate_lists_bwd(d_ys, sgd=None):
- return bp_flat_y(ops.flatten(d_ys), sgd=sgd)
-
- return ys, concatenate_lists_bwd
-
- model = wrap(concatenate_lists_fwd, concat)
- return model
-
-
-def masked_language_model(vocab, model, mask_prob=0.15):
- """Convert a model into a BERT-style masked language model"""
-
- random_words = _RandomWords(vocab)
-
- def mlm_forward(docs, drop=0.0):
- mask, docs = _apply_mask(docs, random_words, mask_prob=mask_prob)
- mask = model.ops.asarray(mask).reshape((mask.shape[0], 1))
- output, backprop = model.begin_update(docs, drop=drop)
-
- def mlm_backward(d_output, sgd=None):
- d_output *= 1 - mask
- return backprop(d_output, sgd=sgd)
-
- return output, mlm_backward
-
- return wrap(mlm_forward, model)
-
-
-class _RandomWords(object):
- def __init__(self, vocab):
- self.words = [lex.text for lex in vocab if lex.prob != 0.0]
- self.probs = [lex.prob for lex in vocab if lex.prob != 0.0]
- self.words = self.words[:10000]
- self.probs = self.probs[:10000]
- self.probs = numpy.exp(numpy.array(self.probs, dtype="f"))
- self.probs /= self.probs.sum()
- self._cache = []
-
- def next(self):
- if not self._cache:
- self._cache.extend(
- numpy.random.choice(len(self.words), 10000, p=self.probs)
- )
- index = self._cache.pop()
- return self.words[index]
-
-
-def _apply_mask(docs, random_words, mask_prob=0.15):
- # This needs to be here to avoid circular imports
- from .tokens.doc import Doc
-
- N = sum(len(doc) for doc in docs)
- mask = numpy.random.uniform(0.0, 1.0, (N,))
- mask = mask >= mask_prob
- i = 0
- masked_docs = []
- for doc in docs:
- words = []
- for token in doc:
- if not mask[i]:
- word = _replace_word(token.text, random_words)
- else:
- word = token.text
- words.append(word)
- i += 1
- spaces = [bool(w.whitespace_) for w in doc]
- # NB: If you change this implementation to instead modify
- # the docs in place, take care that the IDs reflect the original
- # words. Currently we use the original docs to make the vectors
- # for the target, so we don't lose the original tokens. But if
- # you modified the docs in place here, you would.
- masked_docs.append(Doc(doc.vocab, words=words, spaces=spaces))
- return mask, masked_docs
-
-
-def _replace_word(word, random_words, mask="[MASK]"):
- roll = numpy.random.random()
- if roll < 0.8:
- return mask
- elif roll < 0.9:
- return random_words.next()
- else:
- return word
-
-
-def _uniform_init(lo, hi):
- def wrapped(W, ops):
- copy_array(W, ops.xp.random.uniform(lo, hi, W.shape))
-
- return wrapped
-
-
-@describe.attributes(
- nM=Dimension("Vector dimensions"),
- nC=Dimension("Number of characters per word"),
- vectors=Synapses(
- "Embed matrix", lambda obj: (obj.nC, obj.nV, obj.nM), _uniform_init(-0.1, 0.1)
- ),
- d_vectors=Gradient("vectors"),
-)
-class CharacterEmbed(Model):
- def __init__(self, nM=None, nC=None, **kwargs):
- Model.__init__(self, **kwargs)
- self.nM = nM
- self.nC = nC
-
- @property
- def nO(self):
- return self.nM * self.nC
-
- @property
- def nV(self):
- return 256
-
- def begin_update(self, docs, drop=0.0):
- if not docs:
- return []
- ids = []
- output = []
- weights = self.vectors
- # This assists in indexing; it's like looping over this dimension.
- # Still consider this weird witch craft...But thanks to Mark Neumann
- # for the tip.
- nCv = self.ops.xp.arange(self.nC)
- for doc in docs:
- doc_ids = doc.to_utf8_array(nr_char=self.nC)
- doc_vectors = self.ops.allocate((len(doc), self.nC, self.nM))
- # Let's say I have a 2d array of indices, and a 3d table of data. What numpy
- # incantation do I chant to get
- # output[i, j, k] == data[j, ids[i, j], k]?
- doc_vectors[:, nCv] = weights[nCv, doc_ids[:, nCv]]
- output.append(doc_vectors.reshape((len(doc), self.nO)))
- ids.append(doc_ids)
-
- def backprop_character_embed(d_vectors, sgd=None):
- gradient = self.d_vectors
- for doc_ids, d_doc_vectors in zip(ids, d_vectors):
- d_doc_vectors = d_doc_vectors.reshape((len(doc_ids), self.nC, self.nM))
- gradient[nCv, doc_ids[:, nCv]] += d_doc_vectors[:, nCv]
- if sgd is not None:
- sgd(self._mem.weights, self._mem.gradient, key=self.id)
- return None
-
- return output, backprop_character_embed
-
-
-def get_cossim_loss(yh, y, ignore_zeros=False):
- xp = get_array_module(yh)
- # Find the zero vectors
- if ignore_zeros:
- zero_indices = xp.abs(y).sum(axis=1) == 0
- # Add a small constant to avoid 0 vectors
- yh = yh + 1e-8
- y = y + 1e-8
- # https://math.stackexchange.com/questions/1923613/partial-derivative-of-cosine-similarity
- norm_yh = xp.linalg.norm(yh, axis=1, keepdims=True)
- norm_y = xp.linalg.norm(y, axis=1, keepdims=True)
- mul_norms = norm_yh * norm_y
- cosine = (yh * y).sum(axis=1, keepdims=True) / mul_norms
- d_yh = (y / mul_norms) - (cosine * (yh / norm_yh ** 2))
- losses = xp.abs(cosine - 1)
- if ignore_zeros:
- # If the target was a zero vector, don't count it in the loss.
- d_yh[zero_indices] = 0
- losses[zero_indices] = 0
- loss = losses.sum()
- return loss, -d_yh
diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py
index 778453711..0f7677fd2 100644
--- a/spacy/cli/__init__.py
+++ b/spacy/cli/__init__.py
@@ -4,6 +4,7 @@ from .link import link # noqa: F401
from .package import package # noqa: F401
from .profile import profile # noqa: F401
from .train import train # noqa: F401
+from .train_from_config import train_from_config_cli # noqa: F401
from .pretrain import pretrain # noqa: F401
from .debug_data import debug_data # noqa: F401
from .evaluate import evaluate # noqa: F401
diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py
index 9e2fc5b1c..109b135b5 100644
--- a/spacy/cli/pretrain.py
+++ b/spacy/cli/pretrain.py
@@ -4,19 +4,21 @@ import time
import re
from collections import Counter
from pathlib import Path
-from thinc.v2v import Affine, Maxout
-from thinc.misc import LayerNorm as LN
-from thinc.neural.util import prefer_gpu
+from thinc.layers import Linear, Maxout
+from thinc.util import prefer_gpu
from wasabi import msg
import srsly
+from thinc.layers import chain, list2array
+from thinc.loss import CosineDistance, L2Distance
from spacy.gold import Example
from ..errors import Errors
from ..tokens import Doc
from ..attrs import ID, HEAD
-from .._ml import Tok2Vec, flatten, chain, create_default_optimizer
-from .._ml import masked_language_model, get_cossim_loss
+from ..ml.component_models import Tok2Vec
+from ..ml.component_models import masked_language_model
from .. import util
+from ..util import create_default_optimizer
from .train import _load_pretrained_tok2vec
@@ -99,7 +101,7 @@ def pretrain(
with msg.loading(f"Loading model '{vectors_model}'..."):
nlp = util.load_model(vectors_model)
msg.good(f"Loaded model '{vectors_model}'")
- pretrained_vectors = None if not use_vectors else nlp.vocab.vectors.name
+ pretrained_vectors = None if not use_vectors else nlp.vocab.vectors
model = create_pretraining_model(
nlp,
Tok2Vec(
@@ -136,7 +138,7 @@ def pretrain(
# Without '--init-tok2vec' the '--epoch-start' argument is ignored
epoch_start = 0
- optimizer = create_default_optimizer(model.ops)
+ optimizer = create_default_optimizer()
tracker = ProgressTracker(frequency=10000)
msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_start}")
row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
@@ -251,13 +253,14 @@ def get_vectors_loss(ops, docs, prediction, objective="L2"):
# and look them up all at once. This prevents data copying.
ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
target = docs[0].vocab.vectors.data[ids]
+    # TODO: this code originally didn't normalize, but shouldn't it be normalize=True?
if objective == "L2":
- d_target = prediction - target
- loss = (d_target ** 2).sum()
+ distance = L2Distance(normalize=False)
elif objective == "cosine":
- loss, d_target = get_cossim_loss(prediction, target)
+ distance = CosineDistance(normalize=False)
else:
raise ValueError(Errors.E142.format(loss_func=objective))
+ d_target, loss = distance(prediction, target)
return loss, d_target
@@ -269,18 +272,18 @@ def create_pretraining_model(nlp, tok2vec):
"""
output_size = nlp.vocab.vectors.data.shape[1]
output_layer = chain(
- LN(Maxout(300, pieces=3)), Affine(output_size, drop_factor=0.0)
+ Maxout(300, pieces=3, normalize=True, dropout=0.0), Linear(output_size)
)
# This is annoying, but the parser etc have the flatten step after
# the tok2vec. To load the weights in cleanly, we need to match
     # the shape of the models' components exactly. So what we call
# "tok2vec" has to be the same set of processes as what the components do.
- tok2vec = chain(tok2vec, flatten)
+ tok2vec = chain(tok2vec, list2array())
model = chain(tok2vec, output_layer)
model = masked_language_model(nlp.vocab, model)
- model.tok2vec = tok2vec
- model.output_layer = output_layer
- model.begin_training([nlp.make_doc("Give it a doc to infer shapes")])
+ model.set_ref("tok2vec", tok2vec)
+ model.set_ref("output_layer", output_layer)
+ model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")])
return model
diff --git a/spacy/cli/profile.py b/spacy/cli/profile.py
index 44e59971a..5b7a02212 100644
--- a/spacy/cli/profile.py
+++ b/spacy/cli/profile.py
@@ -5,7 +5,7 @@ import cProfile
import pstats
import sys
import itertools
-import thinc.extra.datasets
+import ml_datasets
from wasabi import msg
from ..util import load_model
@@ -29,7 +29,7 @@ def profile(
if inputs is None:
n_inputs = 25000
with msg.loading("Loading IMDB dataset via Thinc..."):
- imdb_train, _ = thinc.extra.datasets.imdb()
+ imdb_train, _ = ml_datasets.imdb()
inputs, _ = zip(*imdb_train)
msg.info(f"Loaded IMDB dataset and using {n_inputs} examples")
inputs = inputs[:n_inputs]
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 6ebf5d37d..a83ca158d 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -1,7 +1,7 @@
import os
import tqdm
from pathlib import Path
-from thinc.neural._classes.model import Model
+from thinc.backends import use_ops
from timeit import default_timer as timer
import shutil
import srsly
@@ -9,7 +9,7 @@ from wasabi import msg
import contextlib
import random
-from .._ml import create_default_optimizer
+from ..util import create_default_optimizer
from ..attrs import PROB, IS_OOV, CLUSTER, LANG
from ..gold import GoldCorpus
from .. import util
@@ -200,7 +200,7 @@ def train(
if base_model:
# Start with an existing model, use default optimizer
- optimizer = create_default_optimizer(Model.ops)
+ optimizer = create_default_optimizer()
else:
# Start with a blank model, call begin_training
optimizer = nlp.begin_training(lambda: corpus.train_examples, device=use_gpu)
@@ -367,7 +367,7 @@ def train(
cpu_wps = nwords / (end_time - start_time)
else:
gpu_wps = nwords / (end_time - start_time)
- with Model.use_device("cpu"):
+ with use_ops("numpy"):
nlp_loaded = util.load_model_from_path(epoch_model_path)
for name, component in nlp_loaded.pipeline:
if hasattr(component, "cfg"):
diff --git a/spacy/cli/train_from_config.py b/spacy/cli/train_from_config.py
new file mode 100644
index 000000000..0488dd04c
--- /dev/null
+++ b/spacy/cli/train_from_config.py
@@ -0,0 +1,445 @@
+import plac
+from thinc.util import require_gpu
+from wasabi import msg
+from pathlib import Path
+import thinc
+import thinc.schedules
+from thinc.model import Model
+from spacy.gold import GoldCorpus
+import spacy
+from spacy.pipeline.tok2vec import Tok2VecListener
+from typing import Optional, Dict, List, Union, Sequence
+from pydantic import BaseModel, FilePath, StrictInt
+import tqdm
+
+from ..ml import component_models
+from .. import util
+
+registry = util.registry
+
+CONFIG_STR = """
+[training]
+patience = 10
+eval_frequency = 10
+dropout = 0.2
+init_tok2vec = null
+vectors = null
+max_epochs = 100
+orth_variant_level = 0.0
+gold_preproc = false
+max_length = 0
+use_gpu = 0
+scores = ["ents_p", "ents_r", "ents_f"]
+score_weights = {"ents_f": 1.0}
+limit = 0
+
+[training.batch_size]
+@schedules = "compounding.v1"
+start = 100
+stop = 1000
+compound = 1.001
+
+[optimizer]
+@optimizers = "Adam.v1"
+learn_rate = 0.001
+beta1 = 0.9
+beta2 = 0.999
+
+[nlp]
+lang = "en"
+vectors = ${training:vectors}
+
+[nlp.pipeline.tok2vec]
+factory = "tok2vec"
+
+[nlp.pipeline.ner]
+factory = "ner"
+
+[nlp.pipeline.ner.model]
+@architectures = "transition_based_ner.v1"
+nr_feature_tokens = 3
+hidden_width = 64
+maxout_pieces = 3
+
+[nlp.pipeline.ner.model.tok2vec]
+@architectures = "tok2vec_tensors.v1"
+width = ${nlp.pipeline.tok2vec.model:width}
+
+[nlp.pipeline.tok2vec.model]
+@architectures = "hash_embed_cnn.v1"
+pretrained_vectors = ${nlp:vectors}
+width = 128
+depth = 4
+window_size = 1
+embed_size = 10000
+maxout_pieces = 3
+"""
+
+
+class PipelineComponent(BaseModel):
+ factory: str
+ model: Model
+
+ class Config:
+ arbitrary_types_allowed = True
+
+
+class ConfigSchema(BaseModel):
+ optimizer: Optional["Optimizer"]
+
+ class training(BaseModel):
+ patience: int = 10
+ eval_frequency: int = 100
+ dropout: float = 0.2
+ init_tok2vec: Optional[FilePath] = None
+ vectors: Optional[str] = None
+ max_epochs: int = 100
+ orth_variant_level: float = 0.0
+ gold_preproc: bool = False
+ max_length: int = 0
+ use_gpu: int = 0
+ scores: List[str] = ["ents_p", "ents_r", "ents_f"]
+ score_weights: Dict[str, Union[int, float]] = {"ents_f": 1.0}
+ limit: int = 0
+ batch_size: Union[Sequence[int], int]
+
+ class nlp(BaseModel):
+ lang: str
+ vectors: Optional[str]
+ pipeline: Optional[Dict[str, PipelineComponent]]
+
+ class Config:
+ extra = "allow"
+
+
+# Of course, these would normally decorate the functions where they're defined.
+# But for now...
+@registry.architectures.register("hash_embed_cnn.v1")
+def hash_embed_cnn(
+ pretrained_vectors, width, depth, embed_size, maxout_pieces, window_size
+):
+ return component_models.Tok2Vec(
+ width=width,
+ embed_size=embed_size,
+ pretrained_vectors=pretrained_vectors,
+ conv_depth=depth,
+ cnn_maxout_pieces=maxout_pieces,
+ bilstm_depth=0,
+ window_size=window_size,
+ )
+
+
+@registry.architectures.register("hash_embed_bilstm.v1")
+def hash_embed_bilstm_v1(pretrained_vectors, width, depth, embed_size):
+ return component_models.Tok2Vec(
+ width=width,
+ embed_size=embed_size,
+ pretrained_vectors=pretrained_vectors,
+ bilstm_depth=depth,
+ conv_depth=0,
+ cnn_maxout_pieces=0,
+ )
+
+
+@registry.architectures.register("tagger_model.v1")
+def build_tagger_model_v1(tok2vec):
+ return component_models.build_tagger_model(nr_class=None, tok2vec=tok2vec)
+
+
+@registry.architectures.register("transition_based_parser.v1")
+def create_tb_parser_model(
+ tok2vec: Model,
+ nr_feature_tokens: StrictInt = 3,
+ hidden_width: StrictInt = 64,
+ maxout_pieces: StrictInt = 3,
+):
+ from thinc.layers import Linear, chain, list2array
+ from spacy.ml._layers import PrecomputableAffine
+ from spacy.syntax._parser_model import ParserModel
+ from thinc.api import use_ops, zero_init
+
+ token_vector_width = tok2vec.get_dim("nO")
+ tok2vec = chain(tok2vec, list2array())
+ tok2vec.set_dim("nO", token_vector_width)
+
+ lower = PrecomputableAffine(
+ hidden_width, nF=nr_feature_tokens, nI=tok2vec.get_dim("nO"), nP=maxout_pieces
+ )
+ lower.set_dim("nP", maxout_pieces)
+ with use_ops("numpy"):
+ # Initialize weights at zero, as it's a classification layer.
+ upper = Linear(init_W=zero_init)
+ return ParserModel(tok2vec, lower, upper)
+
+
+@plac.annotations(
+ # fmt: off
+ train_path=("Location of JSON-formatted training data", "positional", None, Path),
+ dev_path=("Location of JSON-formatted development data", "positional", None, Path),
+ config_path=("Path to config file", "positional", None, Path),
+ output_path=("Output directory to store model in", "option", "o", Path),
+ meta_path=("Optional path to meta.json to use as base.", "option", "m", Path),
+ raw_text=("Path to jsonl file with unlabelled text documents.", "option", "rt", Path),
+ # fmt: on
+)
+def train_from_config_cli(
+ train_path,
+ dev_path,
+ config_path,
+ output_path=None,
+ meta_path=None,
+ raw_text=None,
+ debug=False,
+ verbose=False,
+):
+ """
+ Train or update a spaCy model. Requires data to be formatted in spaCy's
+ JSON format. To convert data from other formats, use the `spacy convert`
+ command.
+ """
+ if not config_path or not config_path.exists():
+ msg.fail("Config file not found", config_path, exits=1)
+ if not train_path or not train_path.exists():
+ msg.fail("Training data not found", train_path, exits=1)
+ if not dev_path or not dev_path.exists():
+ msg.fail("Development data not found", dev_path, exits=1)
+ if meta_path is not None and not meta_path.exists():
+ msg.fail("Can't find model meta.json", meta_path, exits=1)
+ if output_path is not None and not output_path.exists():
+ output_path.mkdir()
+
+ try:
+ train_from_config(
+ config_path,
+ {"train": train_path, "dev": dev_path},
+ output_path=output_path,
+ meta_path=meta_path,
+ raw_text=raw_text,
+ )
+ except KeyboardInterrupt:
+ msg.warn("Cancelled.")
+
+
+def train_from_config(
+ config_path,
+ data_paths,
+ raw_text=None,
+ meta_path=None,
+ output_path=None,
+):
+ msg.info("Loading config from: {}".format(config_path))
+ config = util.load_from_config(config_path, create_objects=True)
+ use_gpu = config["training"]["use_gpu"]
+ if use_gpu >= 0:
+ msg.info("Using GPU")
+ else:
+ msg.info("Using CPU")
+ msg.info("Creating nlp from config")
+ nlp = create_nlp_from_config(**config["nlp"])
+ optimizer = config["optimizer"]
+ limit = config["training"]["limit"]
+ msg.info("Loading training corpus")
+ corpus = GoldCorpus(data_paths["train"], data_paths["dev"], limit=limit)
+ msg.info("Initializing the nlp pipeline")
+ nlp.begin_training(
+ lambda: corpus.train_examples, device=use_gpu
+ )
+
+ train_batches = create_train_batches(nlp, corpus, config["training"])
+ evaluate = create_evaluation_callback(nlp, optimizer, corpus, config["training"])
+
+ # Create iterator, which yields out info after each optimization step.
+ msg.info("Start training")
+ training_step_iterator = train_while_improving(
+ nlp,
+ optimizer,
+ train_batches,
+ evaluate,
+ config["training"]["dropout"],
+ config["training"]["patience"],
+ config["training"]["eval_frequency"],
+ )
+
+ msg.info("Training. Initial learn rate: {}".format(optimizer.learn_rate))
+ print_row = setup_printer(config)
+
+ try:
+ progress = tqdm.tqdm(total=config["training"]["eval_frequency"], leave=False)
+ for batch, info, is_best_checkpoint in training_step_iterator:
+ progress.update(1)
+ if is_best_checkpoint is not None:
+ progress.close()
+ print_row(info)
+ if is_best_checkpoint and output_path is not None:
+ nlp.to_disk(output_path)
+ progress = tqdm.tqdm(
+ total=config["training"]["eval_frequency"], leave=False
+ )
+ finally:
+ if output_path is not None:
+ with nlp.use_params(optimizer.averages):
+ final_model_path = output_path / "model-final"
+ nlp.to_disk(final_model_path)
+ msg.good("Saved model to output directory", final_model_path)
+ # with msg.loading("Creating best model..."):
+ # best_model_path = _collate_best_model(meta, output_path, nlp.pipe_names)
+ # msg.good("Created best model", best_model_path)
+
+
+def create_nlp_from_config(lang, vectors, pipeline):
+ lang_class = spacy.util.get_lang_class(lang)
+ nlp = lang_class()
+ if vectors is not None:
+ spacy.cli.train._load_vectors(nlp, vectors)
+ for name, component_cfg in pipeline.items():
+ factory = component_cfg.pop("factory")
+ component = nlp.create_pipe(factory, config=component_cfg)
+ nlp.add_pipe(component, name=name)
+ return nlp
+
+
+def create_train_batches(nlp, corpus, cfg):
+ while True:
+ train_examples = corpus.train_dataset(
+ nlp,
+ noise_level=0.0,
+ orth_variant_level=cfg["orth_variant_level"],
+ gold_preproc=cfg["gold_preproc"],
+ max_length=cfg["max_length"],
+ ignore_misaligned=True,
+ )
+ for batch in util.minibatch_by_words(train_examples, size=cfg["batch_size"]):
+ yield batch
+
+
+def create_evaluation_callback(nlp, optimizer, corpus, cfg):
+ def evaluate():
+ with nlp.use_params(optimizer.averages):
+ dev_examples = list(
+ corpus.dev_dataset(
+ nlp, gold_preproc=cfg["gold_preproc"], ignore_misaligned=True
+ )
+ )
+ scorer = nlp.evaluate(dev_examples)
+ scores = scorer.scores
+ # Calculate a weighted sum based on score_weights for the main score
+ weights = cfg["score_weights"]
+ weighted_score = sum(scores[s] * weights.get(s, 0.0) for s in weights)
+ return weighted_score, scorer.scores
+
+ return evaluate
+
+
+def train_while_improving(
+ nlp, optimizer, train_data, evaluate, dropout, patience, eval_frequency
+):
+ """Train until an evaluation stops improving. Works as a generator,
+ with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`,
+ where info is a dict, and is_best_checkpoint is in [True, False, None] --
+ None indicating that the iteration was not evaluated as a checkpoint.
+    The evaluation is conducted by calling the evaluate callback, which should
+    take no arguments and return a tuple (main_score, other_scores).
+
+ Positional arguments:
+ nlp: The spaCy pipeline to evaluate.
+ train_data (Iterable[Batch]): A generator of batches, with the training
+ data. Each batch should be a Sized[Tuple[Input, Annot]]. The training
+ data iterable needs to take care of iterating over the epochs and
+ shuffling.
+ evaluate (Callable[[], Tuple[float, Any]]): A callback to perform evaluation.
+ The callback should take no arguments and return a tuple
+ `(main_score, other_scores)`. The main_score should be a float where
+ higher is better. other_scores can be any object.
+
+ Every iteration, the function yields out a tuple with:
+
+ * batch: A zipped sequence of Tuple[Doc, GoldParse] pairs.
+ * info: A dict with various information about the last update (see below).
+ * is_best_checkpoint: A value in None, False, True, indicating whether this
+ was the best evaluation so far. You should use this to save the model
+ checkpoints during training. If None, evaluation was not conducted on
+ that iteration. False means evaluation was conducted, but a previous
+ evaluation was better.
+
+ The info dict provides the following information:
+
+ epoch (int): How many passes over the data have been completed.
+ step (int): How many steps have been completed.
+        score (float): The main score from the last evaluation.
+        other_scores: The other scores from the last evaluation.
+        losses: The accumulated losses throughout training.
+ checkpoints: A list of previous results, where each result is a
+ (score, step, epoch) tuple.
+ """
+ if isinstance(dropout, float):
+ dropouts = thinc.schedules.constant(dropout)
+ else:
+ dropouts = dropout
+ results = []
+ losses = {}
+ for step, batch in enumerate(train_data):
+ dropout = next(dropouts)
+ for subbatch in subdivide_batch(batch):
+ nlp.update(subbatch, drop=dropout, losses=losses, sgd=False)
+ for name, proc in nlp.pipeline:
+ if hasattr(proc, "model"):
+ proc.model.finish_update(optimizer)
+ optimizer.step_schedules()
+ if not (step % eval_frequency):
+ score, other_scores = evaluate()
+ results.append((score, step))
+ is_best_checkpoint = score == max(results)[0]
+ else:
+ score, other_scores = (None, None)
+ is_best_checkpoint = None
+ info = {
+ "step": step,
+ "score": score,
+ "other_scores": other_scores,
+ "losses": losses,
+ "checkpoints": results,
+ }
+ yield batch, info, is_best_checkpoint
+ if is_best_checkpoint is not None:
+ losses = {}
+ # Stop if no improvement in `patience` updates
+ best_score, best_step = max(results)
+ if (step - best_step) >= patience:
+ break
+
+
+def subdivide_batch(batch):
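+    # Currently a pass-through: train_while_improving calls nlp.update() once
+    # per sub-batch, so this is presumably the hook for splitting oversized
+    # batches later.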
+ return [batch]
+
+
+def setup_printer(config):
+ score_cols = config["training"]["scores"]
+ score_widths = [max(len(col), 6) for col in score_cols]
+ loss_cols = ["Loss {}".format(pipe) for pipe in config["nlp"]["pipeline"]]
+ loss_widths = [max(len(col), 8) for col in loss_cols]
+ table_header = ["#"] + loss_cols + score_cols + ["Score"]
+ table_header = [col.upper() for col in table_header]
+ table_widths = [6] + loss_widths + score_widths + [6]
+ table_aligns = ["r" for _ in table_widths]
+
+ msg.row(table_header, widths=table_widths)
+ msg.row(["-" * width for width in table_widths])
+
+ def print_row(info):
+ losses = [
+ "{0:.2f}".format(info["losses"].get(col, 0.0))
+ for col in config["nlp"]["pipeline"]
+ ]
+ scores = [
+ "{0:.2f}".format(info["other_scores"].get(col, 0.0))
+ for col in config["training"]["scores"]
+ ]
+ data = [info["step"]] + losses + scores + ["{0:.2f}".format(info["score"])]
+ msg.row(data, widths=table_widths, aligns=table_aligns)
+
+ return print_row
+
+
+@registry.architectures.register("tok2vec_tensors.v1")
+def tok2vec_tensors_v1(width):
+ tok2vec = Tok2VecListener("tok2vec", width=width)
+ return tok2vec
diff --git a/spacy/compat.py b/spacy/compat.py
index 8cb08ae09..6fa49353e 100644
--- a/spacy/compat.py
+++ b/spacy/compat.py
@@ -8,7 +8,7 @@ DOCS: https://spacy.io/api/top-level#compat
import os
import sys
-from thinc.neural.util import copy_array
+from thinc.util import copy_array
try:
import cPickle as pickle
@@ -30,10 +30,7 @@ try:
except ImportError:
cupy = None
-try:
- from thinc.neural.optimizers import Optimizer # noqa: F401
-except ImportError:
- from thinc.neural.optimizers import Adam as Optimizer # noqa: F401
+from thinc.optimizers import Optimizer # noqa: F401
pickle = pickle
copy_reg = copy_reg
diff --git a/spacy/language.py b/spacy/language.py
index b91903595..cde9c0164 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -4,7 +4,8 @@ import weakref
import functools
from contextlib import contextmanager
from copy import copy, deepcopy
-from thinc.neural import Model
+from thinc.model import Model
+from thinc.backends import get_current_ops
import srsly
import multiprocessing as mp
from itertools import chain, cycle
@@ -16,7 +17,7 @@ from .lookups import Lookups
from .analysis import analyze_pipes, analyze_all_pipes, validate_attrs
from .gold import Example
from .scorer import Scorer
-from ._ml import link_vectors_to_models, create_default_optimizer
+from .util import link_vectors_to_models, create_default_optimizer
from .attrs import IS_STOP, LANG
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .lang.punctuation import TOKENIZER_INFIXES
@@ -468,30 +469,27 @@ class Language(object):
if sgd is None:
if self._optimizer is None:
- self._optimizer = create_default_optimizer(Model.ops)
+ self._optimizer = create_default_optimizer()
sgd = self._optimizer
- grads = {}
-
- def get_grads(W, dW, key=None):
- grads[key] = (W, dW)
-
- get_grads.alpha = sgd.alpha
- get_grads.b1 = sgd.b1
- get_grads.b2 = sgd.b2
- pipes = list(self.pipeline)
- random.shuffle(pipes)
if component_cfg is None:
component_cfg = {}
- for name, proc in pipes:
+ # Determine whether component should set annotations. In theory I guess
+ # we should do this by inspecting the meta? Or we could just always
+ # say "yes"
+ for name, proc in self.pipeline:
+ component_cfg.setdefault(name, {})
+ component_cfg[name].setdefault("drop", drop)
+ component_cfg[name].setdefault("set_annotations", False)
+ grads = {}
+ for name, proc in self.pipeline:
if not hasattr(proc, "update"):
continue
- grads = {}
- kwargs = component_cfg.get(name, {})
- kwargs.setdefault("drop", drop)
- proc.update(examples, sgd=get_grads, losses=losses, **kwargs)
- for key, (W, dW) in grads.items():
- sgd(W, dW, key=key)
+ proc.update(examples, sgd=None, losses=losses, **component_cfg[name])
+ if sgd is not False:
+ for name, proc in self.pipeline:
+ if hasattr(proc, "model"):
+ proc.model.finish_update(sgd)
def rehearse(self, examples, sgd=None, losses=None, config=None):
"""Make a "rehearsal" update to the models in the pipeline, to prevent
@@ -518,7 +516,7 @@ class Language(object):
examples = Example.to_example_objects(examples, make_doc=self.make_doc)
if sgd is None:
if self._optimizer is None:
- self._optimizer = create_default_optimizer(Model.ops)
+ self._optimizer = create_default_optimizer()
sgd = self._optimizer
pipes = list(self.pipeline)
random.shuffle(pipes)
@@ -529,7 +527,7 @@ class Language(object):
def get_grads(W, dW, key=None):
grads[key] = (W, dW)
- get_grads.alpha = sgd.alpha
+ get_grads.learn_rate = sgd.learn_rate
get_grads.b1 = sgd.b1
get_grads.b2 = sgd.b2
for name, proc in pipes:
@@ -537,8 +535,8 @@ class Language(object):
continue
grads = {}
proc.rehearse(examples, sgd=get_grads, losses=losses, **config.get(name, {}))
- for key, (W, dW) in grads.items():
- sgd(W, dW, key=key)
+ for key, (W, dW) in grads.items():
+ sgd(W, dW, key=key)
return losses
def preprocess_gold(self, examples):
@@ -577,12 +575,13 @@ class Language(object):
if cfg.get("device", -1) >= 0:
util.use_gpu(cfg["device"])
if self.vocab.vectors.data.shape[1] >= 1:
- self.vocab.vectors.data = Model.ops.asarray(self.vocab.vectors.data)
+ ops = get_current_ops()
+ self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
link_vectors_to_models(self.vocab)
if self.vocab.vectors.data.shape[1]:
- cfg["pretrained_vectors"] = self.vocab.vectors.name
+ cfg["pretrained_vectors"] = self.vocab.vectors
if sgd is None:
- sgd = create_default_optimizer(Model.ops)
+ sgd = create_default_optimizer()
self._optimizer = sgd
if component_cfg is None:
component_cfg = {}
@@ -596,6 +595,7 @@ class Language(object):
sgd=self._optimizer,
**kwargs
)
+ self._link_components()
return self._optimizer
def resume_training(self, sgd=None, **cfg):
@@ -609,13 +609,14 @@ class Language(object):
"""
if cfg.get("device", -1) >= 0:
util.use_gpu(cfg["device"])
+ ops = get_current_ops()
if self.vocab.vectors.data.shape[1] >= 1:
- self.vocab.vectors.data = Model.ops.asarray(self.vocab.vectors.data)
+ self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
link_vectors_to_models(self.vocab)
if self.vocab.vectors.data.shape[1]:
- cfg["pretrained_vectors"] = self.vocab.vectors.name
+ cfg["pretrained_vectors"] = self.vocab.vectors
if sgd is None:
- sgd = create_default_optimizer(Model.ops)
+ sgd = create_default_optimizer()
self._optimizer = sgd
for name, proc in self.pipeline:
if hasattr(proc, "_rehearsal_model"):
@@ -736,7 +737,7 @@ class Language(object):
disable=disable,
n_process=n_process,
component_cfg=component_cfg,
- as_example=False
+            as_example=False  # TODO: shouldn't this be as_example=as_example?
)
for doc, context in zip(docs, contexts):
yield (doc, context)
@@ -838,6 +839,16 @@ class Language(object):
for proc in procs:
proc.terminate()
+ def _link_components(self):
+ """Register 'listeners' within pipeline components, to allow them to
+ effectively share weights.
+ """
+ for i, (name1, proc1) in enumerate(self.pipeline):
+ if hasattr(proc1, "find_listeners"):
+ for name2, proc2 in self.pipeline[i:]:
+ if hasattr(proc2, "model"):
+ proc1.find_listeners(proc2.model)
+
def to_disk(self, path, exclude=tuple(), disable=None):
"""Save the current state to a directory. If a model is loaded, this
will include the model.
@@ -906,6 +917,7 @@ class Language(object):
exclude = list(exclude) + ["vocab"]
util.from_disk(path, deserializers, exclude)
self._path = path
+ self._link_components()
return self
def to_bytes(self, exclude=tuple(), disable=None, **kwargs):
@@ -962,6 +974,7 @@ class Language(object):
)
exclude = util.get_serialization_exclude(deserializers, exclude, kwargs)
util.from_bytes(bytes_data, deserializers, exclude)
+ self._link_components()
return self
diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx
index 497e20516..1292a46bd 100644
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@@ -6,7 +6,7 @@ cimport numpy as np
np.import_array()
import numpy
-from thinc.neural.util import get_array_module
+from thinc.util import get_array_module
from .typedefs cimport attr_t, flags_t
from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
diff --git a/spacy/ml/__init__.py b/spacy/ml/__init__.py
index 8eebf0564..e69de29bb 100644
--- a/spacy/ml/__init__.py
+++ b/spacy/ml/__init__.py
@@ -1,2 +0,0 @@
-from .tok2vec import Tok2Vec # noqa: F401
-from .common import FeedForward, LayerNormalizedMaxout # noqa: F401
diff --git a/spacy/ml/_character_embed.py b/spacy/ml/_character_embed.py
new file mode 100644
index 000000000..2ff67746f
--- /dev/null
+++ b/spacy/ml/_character_embed.py
@@ -0,0 +1,52 @@
+from thinc.api import Model
+
+
+def CharacterEmbed(nM, nC):
+ # nM: Number of dimensions per character. nC: Number of characters.
+ nO = nM*nC if (nM is not None and nC is not None) else None
+ return Model(
+ "charembed",
+ forward,
+ init=init,
+ dims={"nM": nM, "nC": nC, "nO": nO, "nV": 256},
+ params={"E": None}
+ ).initialize()
+
+
+def init(model, X=None, Y=None):
+ vectors_table = model.ops.alloc3f(model.get_dim("nC"), model.get_dim("nV"), model.get_dim("nM"))
+ model.set_param("E", vectors_table)
+
+
+def forward(model, docs, is_train):
+ if not docs:
+ return []
+ ids = []
+ output = []
+ E = model.get_param("E")
+ nC = model.get_dim("nC")
+ nM = model.get_dim("nM")
+ nO = model.get_dim("nO")
+ # This assists in indexing; it's like looping over this dimension.
+    # Still consider this weird witchcraft... But thanks to Mark Neumann
+ # for the tip.
+ nCv = model.ops.xp.arange(nC)
+ for doc in docs:
+ doc_ids = doc.to_utf8_array(nr_char=nC)
+ doc_vectors = model.ops.alloc3f(len(doc), nC, nM)
+ # Let's say I have a 2d array of indices, and a 3d table of data. What numpy
+ # incantation do I chant to get
+ # output[i, j, k] == data[j, ids[i, j], k]?
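+        # Answer: pairing the row index arange(nC) with the column indices
+        # doc_ids[:, arange(nC)] broadcasts to a (len(doc), nC, nM) result
+        # where result[i, j] == E[j, doc_ids[i, j]], i.e. exactly the lookup
+        # asked for above.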
+ doc_vectors[:, nCv] = E[nCv, doc_ids[:, nCv]]
+ output.append(doc_vectors.reshape((len(doc), nO)))
+ ids.append(doc_ids)
+
+ def backprop(d_output):
+ dE = model.ops.alloc(E.shape, dtype=E.dtype)
+ for doc_ids, d_doc_vectors in zip(ids, d_output):
+ d_doc_vectors = d_doc_vectors.reshape((len(doc_ids), nC, nM))
+ dE[nCv, doc_ids[:, nCv]] += d_doc_vectors[:, nCv]
+ model.inc_grad("E", dE)
+ return []
+
+ return output, backprop
diff --git a/spacy/ml/_layers.py b/spacy/ml/_layers.py
new file mode 100644
index 000000000..e6aa798e7
--- /dev/null
+++ b/spacy/ml/_layers.py
@@ -0,0 +1,165 @@
+from thinc.model import Model
+from thinc.api import normal_init
+
+
+def PrecomputableAffine(nO, nI, nF, nP):
+ model = Model(
+ "precomputable_affine",
+ forward,
+ init=init,
+ dims={"nO": nO, "nI": nI, "nF": nF, "nP": nP},
+ params={"W": None, "b": None, "pad": None},
+ )
+ model.initialize()
+ return model
+
+
+def forward(model, X, is_train):
+ nF = model.get_dim("nF")
+ nO = model.get_dim("nO")
+ nP = model.get_dim("nP")
+ nI = model.get_dim("nI")
+ W = model.get_param("W")
+ Yf = model.ops.gemm(
+ X, W.reshape((nF * nO * nP, nI)), trans2=True
+ )
+ Yf = Yf.reshape((Yf.shape[0], nF, nO, nP))
+ Yf = model.ops.xp.vstack((model.get_param("pad"), Yf))
+
+ def backward(dY_ids):
+ # This backprop is particularly tricky, because we get back a different
+ # thing from what we put out. We put out an array of shape:
+ # (nB, nF, nO, nP), and get back:
+ # (nB, nO, nP) and ids (nB, nF)
+ # The ids tell us the values of nF, so we would have:
+ #
+ # dYf = zeros((nB, nF, nO, nP))
+ # for b in range(nB):
+ # for f in range(nF):
+ # dYf[b, ids[b, f]] += dY[b]
+ #
+ # However, we avoid building that array for efficiency -- and just pass
+ # in the indices.
+ dY, ids = dY_ids
+ assert dY.ndim == 3
+ assert dY.shape[1] == nO, dY.shape
+ assert dY.shape[2] == nP, dY.shape
+ nB = dY.shape[0]
+ model.inc_grad("pad", _backprop_precomputable_affine_padding(model, dY, ids))
+ Xf = X[ids]
+ Xf = Xf.reshape((Xf.shape[0], nF * nI))
+
+ model.inc_grad("b", dY.sum(axis=0))
+ dY = dY.reshape((dY.shape[0], nO * nP))
+
+ Wopfi = W.transpose((1, 2, 0, 3))
+ Wopfi = model.ops.xp.ascontiguousarray(Wopfi)
+ Wopfi = Wopfi.reshape((nO * nP, nF * nI))
+ dXf = model.ops.gemm(dY.reshape((dY.shape[0], nO * nP)), Wopfi)
+
+ # Reuse the buffer
+ dWopfi = Wopfi
+ dWopfi.fill(0.0)
+ model.ops.gemm(dY, Xf, out=dWopfi, trans1=True)
+ dWopfi = dWopfi.reshape((nO, nP, nF, nI))
+ # (o, p, f, i) --> (f, o, p, i)
+ model.inc_grad("W", dWopfi.transpose((2, 0, 1, 3)))
+ return dXf.reshape((dXf.shape[0], nF, nI))
+
+ return Yf, backward
+
+
+def _backprop_precomputable_affine_padding(model, dY, ids):
+ nB = dY.shape[0]
+ nF = model.get_dim("nF")
+ nP = model.get_dim("nP")
+ nO = model.get_dim("nO")
+ # Backprop the "padding", used as a filler for missing values.
+ # Values that are missing are set to -1, and each state vector could
+ # have multiple missing values. The padding has different values for
+ # different missing features. The gradient of the padding vector is:
+ #
+ # for b in range(nB):
+ # for f in range(nF):
+ # if ids[b, f] < 0:
+ # d_padding[0, f] += dY[b]
+ #
+ # Which can be rewritten as:
+ #
+ # for b in range(nB):
+ # d_pad[0, ids[b] < 0] += dY[b]
+ #
+ # I don't know how to avoid the loop without building a whole array :(.
+ # Cursed numpy.
+ d_pad = model.ops.alloc((1, nF, nO, nP))
+ for b in range(nB):
+ d_pad[0, ids[b] < 0] += dY[b]
+ return d_pad
+
+
+def init(model, X=None, Y=None):
+ """This is like the 'layer sequential unit variance', but instead
+ of taking the actual inputs, we randomly generate whitened data.
+
+ Why's this all so complicated? We have a huge number of inputs,
+ and the maxout unit makes guessing the dynamics tricky. Instead
+ we set the maxout weights to values that empirically result in
+ whitened outputs given whitened inputs.
+ """
+ if model.has_param("W") and model.get_param("W").any():
+ return
+
+ nF = model.get_dim("nF")
+ nO = model.get_dim("nO")
+ nP = model.get_dim("nP")
+ nI = model.get_dim("nI")
+ W = model.ops.alloc4f(nF, nO, nP, nI)
+ b = model.ops.alloc2f(nO, nP)
+ pad = model.ops.alloc4f(1, nF, nO, nP)
+
+ ops = model.ops
+ W = normal_init(ops, W.shape, fan_in=nF*nI)
+ model.set_param("W", W)
+ model.set_param("b", b)
+ model.set_param("pad", pad)
+
+ ids = ops.alloc((5000, nF), dtype="f")
+ ids += ops.xp.random.uniform(0, 1000, ids.shape)
+ ids = ops.asarray(ids, dtype="i")
+ tokvecs = ops.alloc((5000, nI), dtype="f")
+ tokvecs += ops.xp.random.normal(loc=0.0, scale=1.0, size=tokvecs.size).reshape(
+ tokvecs.shape
+ )
+
+ def predict(ids, tokvecs):
+ # nS ids. nW tokvecs. Exclude the padding array.
+ hiddens = model.predict(tokvecs[:-1]) # (nW, f, o, p)
+ vectors = model.ops.alloc((ids.shape[0], nO * nP), dtype="f")
+ # need nS vectors
+ hiddens = hiddens.reshape((hiddens.shape[0] * nF, nO * nP))
+ model.ops.scatter_add(vectors, ids.flatten(), hiddens)
+ vectors = vectors.reshape((vectors.shape[0], nO, nP))
+ vectors += b
+ vectors = model.ops.asarray(vectors)
+ if nP >= 2:
+ return model.ops.maxout(vectors)[0]
+ else:
+ return vectors * (vectors >= 0)
+
+ tol_var = 0.01
+ tol_mean = 0.01
+ t_max = 10
+ W = model.get_param("W").copy()
+ b = model.get_param("b").copy()
+ for t_i in range(t_max):
+ acts1 = predict(ids, tokvecs)
+ var = model.ops.xp.var(acts1)
+ mean = model.ops.xp.mean(acts1)
+ if abs(var - 1.0) >= tol_var:
+ W /= model.ops.xp.sqrt(var)
+ model.set_param("W", W)
+ elif abs(mean) >= tol_mean:
+ b -= mean
+ model.set_param("b", b)
+ else:
+ break
diff --git a/spacy/ml/_legacy_tok2vec.py b/spacy/ml/_legacy_tok2vec.py
deleted file mode 100644
index e7baae380..000000000
--- a/spacy/ml/_legacy_tok2vec.py
+++ /dev/null
@@ -1,129 +0,0 @@
-from thinc.v2v import Model, Maxout
-from thinc.i2v import HashEmbed, StaticVectors
-from thinc.t2t import ExtractWindow
-from thinc.misc import Residual
-from thinc.misc import LayerNorm as LN
-from thinc.misc import FeatureExtracter
-from thinc.api import layerize, chain, clone, concatenate, with_flatten
-from thinc.api import uniqued, wrap, noop
-
-from ..attrs import ID, ORTH, NORM, PREFIX, SUFFIX, SHAPE
-
-
-def Tok2Vec(width, embed_size, **kwargs):
- # Circular imports :(
- from .._ml import CharacterEmbed
- from .._ml import PyTorchBiLSTM
-
- pretrained_vectors = kwargs.get("pretrained_vectors", None)
- cnn_maxout_pieces = kwargs.get("cnn_maxout_pieces", 3)
- subword_features = kwargs.get("subword_features", True)
- char_embed = kwargs.get("char_embed", False)
- if char_embed:
- subword_features = False
- conv_depth = kwargs.get("conv_depth", 4)
- bilstm_depth = kwargs.get("bilstm_depth", 0)
- cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
- with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
- norm = HashEmbed(width, embed_size, column=cols.index(NORM), name="embed_norm")
- if subword_features:
- prefix = HashEmbed(
- width, embed_size // 2, column=cols.index(PREFIX), name="embed_prefix"
- )
- suffix = HashEmbed(
- width, embed_size // 2, column=cols.index(SUFFIX), name="embed_suffix"
- )
- shape = HashEmbed(
- width, embed_size // 2, column=cols.index(SHAPE), name="embed_shape"
- )
- else:
- prefix, suffix, shape = (None, None, None)
- if pretrained_vectors is not None:
- glove = StaticVectors(pretrained_vectors, width, column=cols.index(ID))
-
- if subword_features:
- embed = uniqued(
- (glove | norm | prefix | suffix | shape)
- >> LN(Maxout(width, width * 5, pieces=3)),
- column=cols.index(ORTH),
- )
- else:
- embed = uniqued(
- (glove | norm) >> LN(Maxout(width, width * 2, pieces=3)),
- column=cols.index(ORTH),
- )
- elif subword_features:
- embed = uniqued(
- (norm | prefix | suffix | shape)
- >> LN(Maxout(width, width * 4, pieces=3)),
- column=cols.index(ORTH),
- )
- elif char_embed:
- embed = concatenate_lists(
- CharacterEmbed(nM=64, nC=8),
- FeatureExtracter(cols) >> with_flatten(norm),
- )
- reduce_dimensions = LN(
- Maxout(width, 64 * 8 + width, pieces=cnn_maxout_pieces)
- )
- else:
- embed = norm
-
- convolution = Residual(
- ExtractWindow(nW=1)
- >> LN(Maxout(width, width * 3, pieces=cnn_maxout_pieces))
- )
- if char_embed:
- tok2vec = embed >> with_flatten(
- reduce_dimensions >> convolution ** conv_depth, pad=conv_depth
- )
- else:
- tok2vec = FeatureExtracter(cols) >> with_flatten(
- embed >> convolution ** conv_depth, pad=conv_depth
- )
-
- if bilstm_depth >= 1:
- tok2vec = tok2vec >> PyTorchBiLSTM(width, width, bilstm_depth)
- # Work around thinc API limitations :(. TODO: Revise in Thinc 7
- tok2vec.nO = width
- tok2vec.embed = embed
- return tok2vec
-
-
-@layerize
-def flatten(seqs, drop=0.0):
- ops = Model.ops
- lengths = ops.asarray([len(seq) for seq in seqs], dtype="i")
-
- def finish_update(d_X, sgd=None):
- return ops.unflatten(d_X, lengths, pad=0)
-
- X = ops.flatten(seqs, pad=0)
- return X, finish_update
-
-
-def concatenate_lists(*layers, **kwargs): # pragma: no cover
- """Compose two or more models `f`, `g`, etc, such that their outputs are
- concatenated, i.e. `concatenate(f, g)(x)` computes `hstack(f(x), g(x))`
- """
- if not layers:
- return noop()
- drop_factor = kwargs.get("drop_factor", 1.0)
- ops = layers[0].ops
- layers = [chain(layer, flatten) for layer in layers]
- concat = concatenate(*layers)
-
- def concatenate_lists_fwd(Xs, drop=0.0):
- if drop is not None:
- drop *= drop_factor
- lengths = ops.asarray([len(X) for X in Xs], dtype="i")
- flat_y, bp_flat_y = concat.begin_update(Xs, drop=drop)
- ys = ops.unflatten(flat_y, lengths)
-
- def concatenate_lists_bwd(d_ys, sgd=None):
- return bp_flat_y(ops.flatten(d_ys), sgd=sgd)
-
- return ys, concatenate_lists_bwd
-
- model = wrap(concatenate_lists_fwd, concat)
- return model
diff --git a/spacy/ml/_wire.py b/spacy/ml/_wire.py
deleted file mode 100644
index 2b1144fcb..000000000
--- a/spacy/ml/_wire.py
+++ /dev/null
@@ -1,41 +0,0 @@
-from thinc.api import layerize, wrap, noop, chain, concatenate
-from thinc.v2v import Model
-
-
-def concatenate_lists(*layers, **kwargs): # pragma: no cover
- """Compose two or more models `f`, `g`, etc, such that their outputs are
- concatenated, i.e. `concatenate(f, g)(x)` computes `hstack(f(x), g(x))`
- """
- if not layers:
- return layerize(noop())
- drop_factor = kwargs.get("drop_factor", 1.0)
- ops = layers[0].ops
- layers = [chain(layer, flatten) for layer in layers]
- concat = concatenate(*layers)
-
- def concatenate_lists_fwd(Xs, drop=0.0):
- if drop is not None:
- drop *= drop_factor
- lengths = ops.asarray([len(X) for X in Xs], dtype="i")
- flat_y, bp_flat_y = concat.begin_update(Xs, drop=drop)
- ys = ops.unflatten(flat_y, lengths)
-
- def concatenate_lists_bwd(d_ys, sgd=None):
- return bp_flat_y(ops.flatten(d_ys), sgd=sgd)
-
- return ys, concatenate_lists_bwd
-
- model = wrap(concatenate_lists_fwd, concat)
- return model
-
-
-@layerize
-def flatten(seqs, drop=0.0):
- ops = Model.ops
- lengths = ops.asarray([len(seq) for seq in seqs], dtype="i")
-
- def finish_update(d_X, sgd=None):
- return ops.unflatten(d_X, lengths, pad=0)
-
- X = ops.flatten(seqs, pad=0)
- return X, finish_update
diff --git a/spacy/ml/common.py b/spacy/ml/common.py
deleted file mode 100644
index 4ecb00e4e..000000000
--- a/spacy/ml/common.py
+++ /dev/null
@@ -1,21 +0,0 @@
-from thinc.api import chain
-from thinc.v2v import Maxout
-from thinc.misc import LayerNorm
-from ..util import registry, make_layer
-
-
-@registry.architectures.register("thinc.FeedForward.v1")
-def FeedForward(config):
- layers = [make_layer(layer_cfg) for layer_cfg in config["layers"]]
- model = chain(*layers)
- model.cfg = config
- return model
-
-
-@registry.architectures.register("spacy.LayerNormalizedMaxout.v1")
-def LayerNormalizedMaxout(config):
- width = config["width"]
- pieces = config["pieces"]
- layer = LayerNorm(Maxout(width, pieces=pieces))
- layer.nO = width
- return layer
diff --git a/spacy/ml/component_models.py b/spacy/ml/component_models.py
new file mode 100644
index 000000000..a24c2bfce
--- /dev/null
+++ b/spacy/ml/component_models.py
@@ -0,0 +1,222 @@
+from spacy import util
+from spacy.ml.extract_ngrams import extract_ngrams
+
+from ..attrs import ID, ORTH, NORM, PREFIX, SUFFIX, SHAPE
+from ..errors import Errors
+from ._character_embed import CharacterEmbed
+
+from thinc.api import Model, Maxout, Linear, residual, reduce_mean, list2ragged
+from thinc.api import PyTorchLSTM, add, MultiSoftmax, HashEmbed, StaticVectors
+from thinc.api import expand_window, FeatureExtractor, SparseLinear, chain
+from thinc.api import clone, concatenate, with_array, Softmax, Logistic, uniqued
+from thinc.api import zero_init, glorot_uniform_init
+
+
+def build_text_classifier(arch, config):
+ if arch == "cnn":
+ return build_simple_cnn_text_classifier(**config)
+ elif arch == "bow":
+ return build_bow_text_classifier(**config)
+ else:
+ raise ValueError("Unexpected textcat arch")
+
+
+def build_simple_cnn_text_classifier(tok2vec, nr_class, exclusive_classes, **cfg):
+ """
+ Build a simple CNN text classifier, given a token-to-vector model as inputs.
+ If exclusive_classes=True, a softmax non-linearity is applied, so that the
+ outputs sum to 1. If exclusive_classes=False, a logistic non-linearity
+ is applied instead, so that outputs are in the range [0, 1].
+ """
+ with Model.define_operators({">>": chain}):
+ if exclusive_classes:
+ output_layer = Softmax(nO=nr_class, nI=tok2vec.get_dim("nO"))
+ else:
+ # TODO: experiment with init_w=zero_init
+ output_layer = (
+ Linear(nO=nr_class, nI=tok2vec.get_dim("nO"))
+ >> Logistic()
+ )
+ model = tok2vec >> list2ragged() >> reduce_mean() >> output_layer
+ model.set_ref("tok2vec", tok2vec)
+ model.set_dim("nO", nr_class)
+ return model
+
+
+def build_bow_text_classifier(
+ nr_class, exclusive_classes, ngram_size=1, no_output_layer=False, **cfg
+):
+ with Model.define_operators({">>": chain}):
+ model = extract_ngrams(ngram_size, attr=ORTH) >> SparseLinear(nr_class)
+ model.to_cpu()
+ if not no_output_layer:
+ output_layer = (
+ Softmax(nO=nr_class) if exclusive_classes else Logistic(nO=nr_class)
+ )
+ output_layer.to_cpu()
+ model = model >> output_layer
+ model.set_dim("nO", nr_class)
+ return model
+
+
+def build_nel_encoder(embed_width, hidden_width, ner_types, **cfg):
+ if "entity_width" not in cfg:
+ raise ValueError(Errors.E144.format(param="entity_width"))
+
+ conv_depth = cfg.get("conv_depth", 2)
+ cnn_maxout_pieces = cfg.get("cnn_maxout_pieces", 3)
+ pretrained_vectors = cfg.get("pretrained_vectors", None)
+ context_width = cfg.get("entity_width")
+
+ with Model.define_operators({">>": chain, "**": clone}):
+ nel_tok2vec = Tok2Vec(
+ width=hidden_width,
+ embed_size=embed_width,
+ pretrained_vectors=pretrained_vectors,
+ cnn_maxout_pieces=cnn_maxout_pieces,
+ subword_features=True,
+ conv_depth=conv_depth,
+ bilstm_depth=0,
+ )
+
+ model = (
+ nel_tok2vec
+ >> list2ragged()
+ >> reduce_mean()
+ >> residual(Maxout(nO=hidden_width, nI=hidden_width, nP=2, dropout=0.0))
+ >> Linear(nO=context_width, nI=hidden_width)
+ )
+ model.initialize()
+
+ model.set_ref("tok2vec", nel_tok2vec)
+ model.set_dim("nO", context_width)
+ return model
+
+
+def masked_language_model(*args, **kwargs):
+ raise NotImplementedError
+
+
+def build_tagger_model(nr_class, tok2vec):
+ token_vector_width = tok2vec.get_dim("nO")
+ # TODO: glorot_uniform_init seems to work a bit better than zero_init here?!
+ softmax = with_array(Softmax(nO=nr_class, nI=token_vector_width, init_W=zero_init))
+ model = chain(tok2vec, softmax)
+ model.set_ref("tok2vec", tok2vec)
+ model.set_ref("softmax", softmax)
+ return model
+
+
+def build_morphologizer_model(class_nums, **cfg):
+ embed_size = util.env_opt("embed_size", 7000)
+ if "token_vector_width" in cfg:
+ token_vector_width = cfg["token_vector_width"]
+ else:
+ token_vector_width = util.env_opt("token_vector_width", 128)
+ pretrained_vectors = cfg.get("pretrained_vectors")
+ char_embed = cfg.get("char_embed", True)
+ with Model.define_operators({">>": chain, "+": add, "**": clone}):
+ if "tok2vec" in cfg:
+ tok2vec = cfg["tok2vec"]
+ else:
+ tok2vec = Tok2Vec(
+ token_vector_width,
+ embed_size,
+ char_embed=char_embed,
+ pretrained_vectors=pretrained_vectors,
+ )
+ softmax = with_array(MultiSoftmax(nOs=class_nums, nI=token_vector_width))
+ model = tok2vec >> softmax
+ model.set_ref("tok2vec", tok2vec)
+ model.set_ref("softmax", softmax)
+ return model
+
+
+def Tok2Vec(
+ width,
+ embed_size,
+ pretrained_vectors=None,
+ window_size=1,
+ cnn_maxout_pieces=3,
+ subword_features=True,
+ char_embed=False,
+ conv_depth=4,
+ bilstm_depth=0,
+):
+ if char_embed:
+ subword_features = False
+ cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
+ with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
+ norm = HashEmbed(nO=width, nV=embed_size, column=cols.index(NORM), dropout=0.0)
+ if subword_features:
+ prefix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=0.0)
+ suffix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=0.0)
+ shape = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=0.0)
+ else:
+ prefix, suffix, shape = (None, None, None)
+ if pretrained_vectors is not None:
+ glove = StaticVectors(vectors=pretrained_vectors, nO=width, column=cols.index(ID), dropout=0.0)
+
+ if subword_features:
+ embed = uniqued(
+ (glove | norm | prefix | suffix | shape)
+ >> Maxout(
+ nO=width, nI=width * 5, nP=3, dropout=0.0, normalize=True
+ ),
+ column=cols.index(ORTH),
+ )
+ else:
+ embed = uniqued(
+ (glove | norm)
+ >> Maxout(
+ nO=width, nI=width * 2, nP=3, dropout=0.0, normalize=True
+ ),
+ column=cols.index(ORTH),
+ )
+ elif subword_features:
+ embed = uniqued(
+ concatenate(norm, prefix, suffix, shape)
+ >> Maxout(nO=width, nI=width * 4, nP=3, dropout=0.0, normalize=True),
+ column=cols.index(ORTH),
+ )
+ elif char_embed:
+ embed = CharacterEmbed(nM=64, nC=8) | FeatureExtractor(cols) >> with_array(
+ norm
+ )
+ reduce_dimensions = Maxout(
+ nO=width,
+ nI=64 * 8 + width,
+ nP=cnn_maxout_pieces,
+ dropout=0.0,
+ normalize=True,
+ )
+ else:
+ embed = norm
+
+ convolution = residual(
+ expand_window(window_size=window_size)
+ >> Maxout(
+ nO=width,
+ nI=width * 3,
+ nP=cnn_maxout_pieces,
+ dropout=0.0,
+ normalize=True,
+ )
+ )
+ if char_embed:
+ tok2vec = embed >> with_array(
+ reduce_dimensions >> convolution ** conv_depth, pad=conv_depth
+ )
+ else:
+ tok2vec = FeatureExtractor(cols) >> with_array(
+ embed >> convolution ** conv_depth, pad=conv_depth
+ )
+
+ if bilstm_depth >= 1:
+ tok2vec = tok2vec >> PyTorchLSTM(
+ nO=width, nI=width, depth=bilstm_depth, bi=True
+ )
+ # Work around thinc API limitations :(. TODO: Revise in Thinc 7
+ tok2vec.set_dim("nO", width)
+ tok2vec.set_ref("embed", embed)
+ return tok2vec
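+
+
+# Rough usage sketch, mirroring the "hash_embed_cnn.v1" defaults from the
+# example config in cli/train_from_config.py:
+#     tok2vec = Tok2Vec(width=128, embed_size=10000, conv_depth=4,
+#                       window_size=1, cnn_maxout_pieces=3, bilstm_depth=0)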
diff --git a/spacy/ml/extract_ngrams.py b/spacy/ml/extract_ngrams.py
new file mode 100644
index 000000000..1ec5b5fc1
--- /dev/null
+++ b/spacy/ml/extract_ngrams.py
@@ -0,0 +1,39 @@
+import numpy
+from thinc.model import Model
+
+from ..attrs import LOWER
+
+
+def extract_ngrams(ngram_size, attr=LOWER) -> Model:
+ model = Model("extract_ngrams", forward)
+ model.attrs["ngram_size"] = ngram_size
+ model.attrs["attr"] = attr
+ return model
+
+
+def forward(self, docs, is_train: bool):
+ batch_keys = []
+ batch_vals = []
+ for doc in docs:
+ unigrams = doc.to_array([self.attrs["attr"]])
+ ngrams = [unigrams]
+ for n in range(2, self.attrs["ngram_size"] + 1):
+ ngrams.append(self.ops.ngrams(n, unigrams))
+ keys = self.ops.xp.concatenate(ngrams)
+ keys, vals = self.ops.xp.unique(keys, return_counts=True)
+ batch_keys.append(keys)
+ batch_vals.append(vals)
+ # The dtype here matches what thinc is expecting -- which differs per
+ # platform (by int definition). This should be fixed once the problem
+ # is fixed on Thinc's side.
+ lengths = self.ops.asarray(
+ [arr.shape[0] for arr in batch_keys], dtype=numpy.int_
+ )
+ batch_keys = self.ops.xp.concatenate(batch_keys)
+ batch_vals = self.ops.asarray(self.ops.xp.concatenate(batch_vals), dtype="f")
+
+ def backprop(dY):
+ return dY
+
+ return (batch_keys, batch_vals, lengths), backprop
+
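+# Note: the (keys, vals, lengths) triple returned by forward() is the sparse
+# input format consumed downstream by SparseLinear (see build_bow_text_classifier
+# in spacy/ml/component_models.py).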
diff --git a/spacy/ml/tok2vec.py b/spacy/ml/tok2vec.py
index 9a0ed6bf5..102b88604 100644
--- a/spacy/ml/tok2vec.py
+++ b/spacy/ml/tok2vec.py
@@ -1,11 +1,12 @@
-from thinc.api import chain, layerize, clone, concatenate, with_flatten, uniqued
-from thinc.api import noop, with_square_sequences
-from thinc.v2v import Maxout, Model
-from thinc.i2v import HashEmbed, StaticVectors
-from thinc.t2t import ExtractWindow
-from thinc.misc import Residual, LayerNorm, FeatureExtracter
+from thinc.layers import chain, clone, concatenate, with_array, uniqued
+from thinc.model import Model
+from thinc.layers import noop, with_padded
+from thinc.layers import Maxout, expand_window
+from thinc.layers import HashEmbed, StaticVectors
+from thinc.layers import residual, LayerNorm, FeatureExtractor
+
+from spacy.ml import _character_embed
from ..util import make_layer, registry
-from ._wire import concatenate_lists
@registry.architectures.register("spacy.Tok2Vec.v1")
@@ -13,19 +14,21 @@ def Tok2Vec(config):
doc2feats = make_layer(config["@doc2feats"])
embed = make_layer(config["@embed"])
encode = make_layer(config["@encode"])
- field_size = getattr(encode, "receptive_field", 0)
- tok2vec = chain(doc2feats, with_flatten(chain(embed, encode), pad=field_size))
- tok2vec.cfg = config
- tok2vec.nO = encode.nO
- tok2vec.embed = embed
- tok2vec.encode = encode
+ field_size = 0
+ if encode.has_attr("receptive_field"):
+ field_size = encode.attrs["receptive_field"]
+ tok2vec = chain(doc2feats, with_array(chain(embed, encode), pad=field_size))
+ tok2vec.attrs["cfg"] = config
+ tok2vec.set_dim("nO", encode.get_dim("nO"))
+ tok2vec.set_ref("embed", embed)
+ tok2vec.set_ref("encode", encode)
return tok2vec
@registry.architectures.register("spacy.Doc2Feats.v1")
def Doc2Feats(config):
columns = config["columns"]
- return FeatureExtracter(columns)
+ return FeatureExtractor(columns)
@registry.architectures.register("spacy.MultiHashEmbed.v1")
@@ -40,55 +43,47 @@ def MultiHashEmbed(config):
width = config["width"]
rows = config["rows"]
- norm = HashEmbed(width, rows, column=cols.index("NORM"), name="embed_norm")
+ norm = HashEmbed(width, rows, column=cols.index("NORM"), dropout=0.0)
if config["use_subwords"]:
- prefix = HashEmbed(
- width, rows // 2, column=cols.index("PREFIX"), name="embed_prefix"
- )
- suffix = HashEmbed(
- width, rows // 2, column=cols.index("SUFFIX"), name="embed_suffix"
- )
- shape = HashEmbed(
- width, rows // 2, column=cols.index("SHAPE"), name="embed_shape"
- )
+ prefix = HashEmbed(width, rows // 2, column=cols.index("PREFIX"), dropout=0.0)
+ suffix = HashEmbed(width, rows // 2, column=cols.index("SUFFIX"), dropout=0.0)
+ shape = HashEmbed(width, rows // 2, column=cols.index("SHAPE"), dropout=0.0)
if config.get("@pretrained_vectors"):
glove = make_layer(config["@pretrained_vectors"])
mix = make_layer(config["@mix"])
with Model.define_operators({">>": chain, "|": concatenate}):
if config["use_subwords"] and config["@pretrained_vectors"]:
- mix._layers[0].nI = width * 5
+ mix._layers[0].set_dim("nI", width * 5)
layer = uniqued(
(glove | norm | prefix | suffix | shape) >> mix,
column=cols.index("ORTH"),
)
elif config["use_subwords"]:
- mix._layers[0].nI = width * 4
+ mix._layers[0].set_dim("nI", width * 4)
layer = uniqued(
(norm | prefix | suffix | shape) >> mix, column=cols.index("ORTH")
)
elif config["@pretrained_vectors"]:
- mix._layers[0].nI = width * 2
+ mix._layers[0].set_dim("nI", width * 2)
layer = uniqued((glove | norm) >> mix, column=cols.index("ORTH"),)
else:
layer = norm
- layer.cfg = config
+ layer.attrs["cfg"] = config
return layer
@registry.architectures.register("spacy.CharacterEmbed.v1")
def CharacterEmbed(config):
- from .. import _ml
-
width = config["width"]
chars = config["chars"]
- chr_embed = _ml.CharacterEmbedModel(nM=width, nC=chars)
+ chr_embed = _character_embed.CharacterEmbed(nM=width, nC=chars)
other_tables = make_layer(config["@embed_features"])
mix = make_layer(config["@mix"])
- model = chain(concatenate_lists(chr_embed, other_tables), mix)
- model.cfg = config
+ model = chain(concatenate(chr_embed, other_tables), mix)
+ model.attrs["cfg"] = config
return model
@@ -99,48 +94,49 @@ def MaxoutWindowEncoder(config):
nP = config["pieces"]
depth = config["depth"]
- cnn = chain(
- ExtractWindow(nW=nW), LayerNorm(Maxout(nO, nO * ((nW * 2) + 1), pieces=nP))
- )
- model = clone(Residual(cnn), depth)
- model.nO = nO
- model.receptive_field = nW * depth
+ cnn = expand_window(window_size=nW), Maxout(nO=nO, nI=nO * ((nW * 2) + 1), nP=nP, dropout=0.0, normalize=True)
+ model = clone(residual(cnn), depth)
+ model.set_dim("nO", nO)
+ model.attrs["receptive_field"] = nW * depth
return model
@registry.architectures.register("spacy.MishWindowEncoder.v1")
def MishWindowEncoder(config):
- from thinc.v2v import Mish
+ from thinc.layers import Mish
nO = config["width"]
nW = config["window_size"]
depth = config["depth"]
- cnn = chain(ExtractWindow(nW=nW), LayerNorm(Mish(nO, nO * ((nW * 2) + 1))))
- model = clone(Residual(cnn), depth)
- model.nO = nO
+ cnn = chain(expand_window(window_size=nW), Mish(nO=nO, nI=nO * ((nW * 2) + 1)), LayerNorm(nO))
+ model = clone(residual(cnn), depth)
+ model.set_dim("nO", nO)
return model
@registry.architectures.register("spacy.PretrainedVectors.v1")
def PretrainedVectors(config):
- return StaticVectors(config["vectors_name"], config["width"], config["column"])
+ # TODO: actual vectors instead of name
+ return StaticVectors(vectors=config["vectors_name"], nO=config["width"], column=config["column"], dropout=0.0)
@registry.architectures.register("spacy.TorchBiLSTMEncoder.v1")
def TorchBiLSTMEncoder(config):
import torch.nn
- from thinc.extra.wrappers import PyTorchWrapperRNN
+ # TODO FIX
+ from thinc.layers import PyTorchRNNWrapper
width = config["width"]
depth = config["depth"]
if depth == 0:
- return layerize(noop())
- return with_square_sequences(
- PyTorchWrapperRNN(torch.nn.LSTM(width, width // 2, depth, bidirectional=True))
+ return noop()
+ return with_padded(
+ PyTorchRNNWrapper(torch.nn.LSTM(width, width // 2, depth, bidirectional=True))
)
+# TODO: update
_EXAMPLE_CONFIG = {
"@doc2feats": {
"arch": "Doc2Feats",
diff --git a/spacy/pipeline/__init__.py b/spacy/pipeline/__init__.py
index 2f9824eda..6a90de81c 100644
--- a/spacy/pipeline/__init__.py
+++ b/spacy/pipeline/__init__.py
@@ -3,6 +3,7 @@ from .pipes import TextCategorizer, Tensorizer, Pipe, Sentencizer
from .pipes import SentenceRecognizer
from .morphologizer import Morphologizer
from .entityruler import EntityRuler
+from .tok2vec import Tok2Vec
from .hooks import SentenceSegmenter, SimilarityHook
from .functions import merge_entities, merge_noun_chunks, merge_subtokens
@@ -13,6 +14,7 @@ __all__ = [
"EntityLinker",
"TextCategorizer",
"Tensorizer",
+ "Tok2Vec",
"Pipe",
"Morphologizer",
"EntityRuler",
diff --git a/spacy/pipeline/hooks.py b/spacy/pipeline/hooks.py
index 68385c5a9..00c328e81 100644
--- a/spacy/pipeline/hooks.py
+++ b/spacy/pipeline/hooks.py
@@ -1,9 +1,8 @@
-from thinc.t2v import Pooling, max_pool, mean_pool
-from thinc.neural._classes.difference import Siamese, CauchySimilarity
+from thinc.layers import concatenate, reduce_max, reduce_mean, siamese, CauchySimilarity
from .pipes import Pipe
from ..language import component
-from .._ml import link_vectors_to_models
+from ..util import link_vectors_to_models
@component("sentencizer_hook", assigns=["doc.user_hooks"])
@@ -63,7 +62,10 @@ class SimilarityHook(Pipe):
@classmethod
def Model(cls, length):
- return Siamese(Pooling(max_pool, mean_pool), CauchySimilarity(length))
+ return siamese(
+ concatenate(reduce_max(), reduce_mean()),
+ CauchySimilarity(length * 2)
+ )
def __call__(self, doc):
"""Install similarity hook"""
@@ -80,7 +82,7 @@ class SimilarityHook(Pipe):
def update(self, doc1_doc2, golds, sgd=None, drop=0.0):
self.require_model()
- sims, bp_sims = self.model.begin_update(doc1_doc2, drop=drop)
+ sims, bp_sims = self.model.begin_update(doc1_doc2)
def begin_training(self, _=tuple(), pipeline=None, sgd=None, **kwargs):
"""Allocate model, using width from tensorizer in pipeline.
@@ -89,7 +91,7 @@ class SimilarityHook(Pipe):
pipeline (list): The pipeline the model is part of.
"""
if self.model is True:
- self.model = self.Model(pipeline[0].model.nO)
+ self.model = self.Model(pipeline[0].model.get_dim("nO"))
link_vectors_to_models(self.vocab)
if sgd is None:
sgd = self.create_optimizer()
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index 10038d410..7b9e4b04e 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -3,19 +3,20 @@ from collections import defaultdict
import numpy
cimport numpy as np
-from thinc.api import chain
-from thinc.neural.util import to_categorical, copy_array, get_array_module
+from thinc.layers import chain, list2array
+from thinc.util import to_categorical, copy_array, get_array_module
+
from .. import util
from .pipes import Pipe
from ..language import component
-from .._ml import Tok2Vec, build_morphologizer_model
-from .._ml import link_vectors_to_models, zero_init, flatten
-from .._ml import create_default_optimizer
+from ..util import link_vectors_to_models, create_default_optimizer
from ..errors import Errors, TempErrors
from ..tokens.doc cimport Doc
from ..vocab cimport Vocab
from ..morphology cimport Morphology
+from ..ml.component_models import build_morphologizer_model
+
@component("morphologizer", assigns=["token.morph", "token.pos"])
class Morphologizer(Pipe):
@@ -43,7 +44,7 @@ class Morphologizer(Pipe):
if self.model in (None, True, False):
return None
else:
- return chain(self.model.tok2vec, flatten)
+ return chain(self.model.get_ref("tok2vec"), list2array())
def __call__(self, doc):
features, tokvecs = self.predict([doc])
@@ -60,9 +61,9 @@ class Morphologizer(Pipe):
def predict(self, docs):
if not any(len(doc) for doc in docs):
# Handle case where there are no tokens in any docs.
- n_labels = self.model.nO
- guesses = [self.model.ops.allocate((0, n_labels)) for doc in docs]
- tokvecs = self.model.ops.allocate((0, self.model.tok2vec.nO))
+ n_labels = self.model.get_dim("nO")
+ guesses = [self.model.ops.alloc((0, n_labels)) for doc in docs]
+ tokvecs = self.model.ops.alloc((0, self.model.get_ref("tok2vec").get_dim("nO")))
return guesses, tokvecs
tokvecs = self.model.tok2vec(docs)
scores = self.model.softmax(tokvecs)
@@ -77,7 +78,7 @@ class Morphologizer(Pipe):
for field in self._class_map.fields]
for i, doc in enumerate(docs):
doc_scores = batch_scores[i]
- doc_guesses = scores_to_guesses(doc_scores, self.model.softmax.out_sizes)
+ doc_guesses = scores_to_guesses(doc_scores, self.model.get_ref("softmax").attrs["nOs"])
# Convert the neuron indices into feature IDs.
doc_feat_ids = numpy.zeros((len(doc), len(self._class_map.fields)), dtype='i')
for j in range(len(doc)):
@@ -110,7 +111,7 @@ class Morphologizer(Pipe):
def get_loss(self, examples, scores):
guesses = []
for doc_scores in scores:
- guesses.append(scores_to_guesses(doc_scores, self.model.softmax.out_sizes))
+ guesses.append(scores_to_guesses(doc_scores, self.model.get_ref("softmax").attrs["nOs"]))
guesses = self.model.ops.xp.vstack(guesses)
scores = self.model.ops.xp.vstack(scores)
if not isinstance(scores, numpy.ndarray):
@@ -120,7 +121,7 @@ class Morphologizer(Pipe):
cdef int idx = 0
# Do this on CPU, as we can't vectorize easily.
target = numpy.zeros(scores.shape, dtype='f')
- field_sizes = self.model.softmax.out_sizes
+ field_sizes = self.model.get_ref("softmax").attrs["nOs"]
for example in examples:
doc = example.doc
gold = example.gold
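
Instead of ad-hoc attributes such as `model.tok2vec` and `model.softmax.out_sizes`, the morphologizer now reads sub-layers through named references (`get_ref`) and metadata through `attrs`. A hedged sketch of that convention, assuming the thinc v8 `Model` API used throughout this patch (`set_ref`, `get_ref`, `attrs`); the layer sizes and the two-field `nOs` list are arbitrary examples:

    from thinc.layers import chain, Maxout, Softmax   # module path as used in this patch

    tok2vec = Maxout(nO=96, nI=96)
    softmax = Softmax(nO=10, nI=96)
    model = chain(tok2vec, softmax)

    # Expose sub-layers under stable names instead of Python attributes...
    model.set_ref("tok2vec", tok2vec)
    model.set_ref("softmax", softmax)
    # ...and stash per-field output sizes as metadata on the layer.
    softmax.attrs["nOs"] = [4, 6]                     # e.g. two morphological fields

    assert model.get_ref("softmax").attrs["nOs"] == [4, 6]
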
diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index 4f0f2469e..bca53bc03 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -3,11 +3,11 @@
import numpy
import srsly
import random
-from thinc.api import chain
-from thinc.v2v import Affine, Maxout, Softmax
-from thinc.misc import LayerNorm
-from thinc.neural.util import to_categorical
-from thinc.neural.util import get_array_module
+from thinc.layers import chain, Linear, Maxout, Softmax, LayerNorm, list2array
+from thinc.initializers import zero_init
+from thinc.loss import CosineDistance
+from thinc.util import to_categorical, get_array_module
+from thinc.model import set_dropout_rate
from ..tokens.doc cimport Doc
from ..syntax.nn_parser cimport Parser
@@ -21,13 +21,14 @@ from ..language import Language, component
from ..syntax import nonproj
from ..gold import Example
from ..attrs import POS, ID
+from ..util import link_vectors_to_models, create_default_optimizer
from ..parts_of_speech import X
from ..kb import KnowledgeBase
-from .._ml import Tok2Vec, build_tagger_model, cosine, get_cossim_loss
-from .._ml import build_text_classifier, build_simple_cnn_text_classifier
-from .._ml import build_bow_text_classifier, build_nel_encoder
-from .._ml import link_vectors_to_models, zero_init, flatten
-from .._ml import masked_language_model, create_default_optimizer, get_cossim_loss
+from ..ml.component_models import Tok2Vec, build_tagger_model
+from ..ml.component_models import build_text_classifier
+from ..ml.component_models import build_simple_cnn_text_classifier
+from ..ml.component_models import build_bow_text_classifier, build_nel_encoder
+from ..ml.component_models import masked_language_model
from ..errors import Errors, TempErrors, user_warning, Warnings
from .. import util
@@ -126,13 +127,15 @@ class Pipe(object):
"""Modify a batch of documents, using pre-computed scores."""
raise NotImplementedError
- def update(self, examples, drop=0.0, sgd=None, losses=None):
+ def update(self, examples, set_annotations=False, drop=0.0, sgd=None, losses=None):
"""Learn from a batch of documents and gold-standard information,
updating the pipe's model.
Delegates to predict() and get_loss().
"""
- pass
+ if set_annotations:
+ docs = (self._get_doc(ex) for ex in examples)
+ docs = list(self.pipe(docs))
def rehearse(self, examples, sgd=None, losses=None, **config):
pass
@@ -152,7 +155,7 @@ class Pipe(object):
raise NotImplementedError
def create_optimizer(self):
- return create_default_optimizer(self.model.ops, **self.cfg.get("optimizer", {}))
+ return create_default_optimizer()
def begin_training(
self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs
@@ -163,10 +166,30 @@ class Pipe(object):
self.model = self.Model(**self.cfg)
if hasattr(self, "vocab"):
link_vectors_to_models(self.vocab)
+ self.model.initialize()
if sgd is None:
sgd = self.create_optimizer()
return sgd
+ def get_gradients(self):
+ """Get non-zero gradients of the model's parameters, as a dictionary
+ keyed by the parameter ID. The values are (weights, gradients) tuples.
+ """
+ gradients = {}
+ if self.model in (None, True, False):
+ return gradients
+ queue = [self.model]
+ seen = set()
+ for node in queue:
+ if node.id in seen:
+ continue
+ seen.add(node.id)
+ if hasattr(node, "_mem") and node._mem.gradient.any():
+ gradients[node.id] = [node._mem.weights, node._mem.gradient]
+ if hasattr(node, "_layers"):
+ queue.extend(node._layers)
+ return gradients
+
def use_params(self, params):
"""Modify the pipe's model, to use the given parameter values."""
with self.model.use_params(params):
@@ -193,7 +216,7 @@ class Pipe(object):
def load_model(b):
# TODO: Remove this once we don't have to handle previous models
if self.cfg.get("pretrained_dims") and "pretrained_vectors" not in self.cfg:
- self.cfg["pretrained_vectors"] = self.vocab.vectors.name
+ self.cfg["pretrained_vectors"] = self.vocab.vectors
if self.model is True:
self.model = self.Model(**self.cfg)
try:
@@ -226,7 +249,7 @@ class Pipe(object):
def load_model(p):
# TODO: Remove this once we don't have to handle previous models
if self.cfg.get("pretrained_dims") and "pretrained_vectors" not in self.cfg:
- self.cfg["pretrained_vectors"] = self.vocab.vectors.name
+ self.cfg["pretrained_vectors"] = self.vocab.vectors
if self.model is True:
self.model = self.Model(**self.cfg)
try:
@@ -254,10 +277,10 @@ class Tensorizer(Pipe):
width (int): Output size of the model.
embed_size (int): Number of vectors in the embedding table.
**cfg: Config parameters.
- RETURNS (Model): A `thinc.neural.Model` or similar instance.
+ RETURNS (Model): A `thinc.model.Model` or similar instance.
"""
input_size = util.env_opt("token_vector_width", cfg.get("input_size", 96))
- return zero_init(Affine(output_size, input_size, drop_factor=0.0))
+ return Linear(output_size, input_size, init_W=zero_init)
def __init__(self, vocab, model=True, **cfg):
"""Construct a new statistical model. Weights are not allocated on
@@ -277,7 +300,6 @@ class Tensorizer(Pipe):
self.model = model
self.input_models = []
self.cfg = dict(cfg)
- self.cfg.setdefault("cnn_maxout_pieces", 3)
def __call__(self, example):
"""Add context-sensitive vectors to a `Doc`, e.g. from a CNN or LSTM
@@ -337,7 +359,7 @@ class Tensorizer(Pipe):
raise ValueError(Errors.E076.format(rows=tensor.shape[0], words=len(doc)))
doc.tensor = tensor
- def update(self, examples, state=None, drop=0.0, sgd=None, losses=None):
+ def update(self, examples, state=None, drop=0.0, set_annotations=False, sgd=None, losses=None):
"""Update the model.
docs (iterable): A batch of `Doc` objects.
@@ -350,17 +372,23 @@ class Tensorizer(Pipe):
examples = Example.to_example_objects(examples)
inputs = []
bp_inputs = []
+ set_dropout_rate(self.model, drop)
for tok2vec in self.input_models:
- tensor, bp_tensor = tok2vec.begin_update([ex.doc for ex in examples], drop=drop)
+ set_dropout_rate(tok2vec, drop)
+ tensor, bp_tensor = tok2vec.begin_update([ex.doc for ex in examples])
inputs.append(tensor)
bp_inputs.append(bp_tensor)
inputs = self.model.ops.xp.hstack(inputs)
- scores, bp_scores = self.model.begin_update(inputs, drop=drop)
+ scores, bp_scores = self.model.begin_update(inputs)
loss, d_scores = self.get_loss(examples, scores)
d_inputs = bp_scores(d_scores, sgd=sgd)
d_inputs = self.model.ops.xp.split(d_inputs, len(self.input_models), axis=1)
for d_input, bp_input in zip(d_inputs, bp_inputs):
- bp_input(d_input, sgd=sgd)
+ bp_input(d_input)
+ if sgd is not None:
+ for tok2vec in self.input_models:
+ tok2vec.finish_update(sgd)
+ self.model.finish_update(sgd)
if losses is not None:
losses.setdefault(self.name, 0.0)
losses[self.name] += loss
@@ -387,6 +415,7 @@ class Tensorizer(Pipe):
self.input_models.append(model.tok2vec)
if self.model is True:
self.model = self.Model(**self.cfg)
+ self.model.initialize()
link_vectors_to_models(self.vocab)
if sgd is None:
sgd = self.create_optimizer()
@@ -405,7 +434,6 @@ class Tagger(Pipe):
self.model = model
self._rehearsal_model = None
self.cfg = dict(sorted(cfg.items()))
- self.cfg.setdefault("cnn_maxout_pieces", 2)
@property
def labels(self):
@@ -416,12 +444,12 @@ class Tagger(Pipe):
if self.model in (None, True, False):
return None
else:
- return chain(self.model.tok2vec, flatten)
+ return chain(self.model.get_ref("tok2vec"), list2array())
def __call__(self, example):
doc = self._get_doc(example)
- tags, tokvecs = self.predict([doc])
- self.set_annotations([doc], tags, tensors=tokvecs)
+ tags = self.predict([doc])
+ self.set_annotations([doc], tags)
if isinstance(example, Example):
example.doc = doc
return example
@@ -430,8 +458,10 @@ class Tagger(Pipe):
def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
for examples in util.minibatch(stream, size=batch_size):
docs = [self._get_doc(ex) for ex in examples]
- tag_ids, tokvecs = self.predict(docs)
- self.set_annotations(docs, tag_ids, tensors=tokvecs)
+ tag_ids = self.predict(docs)
+ assert len(docs) == len(examples)
+ assert len(tag_ids) == len(examples)
+ self.set_annotations(docs, tag_ids)
if as_example:
annotated_examples = []
@@ -447,20 +477,25 @@ class Tagger(Pipe):
if not any(len(doc) for doc in docs):
# Handle cases where there are no tokens in any docs.
n_labels = len(self.labels)
- guesses = [self.model.ops.allocate((0, n_labels)) for doc in docs]
- tokvecs = self.model.ops.allocate((0, self.model.tok2vec.nO))
- return guesses, tokvecs
- tokvecs = self.model.tok2vec(docs)
- scores = self.model.softmax(tokvecs)
+ guesses = [self.model.ops.alloc((0, n_labels)) for doc in docs]
+ assert len(guesses) == len(docs)
+ return guesses
+ scores = self.model.predict(docs)
+ assert len(scores) == len(docs), (len(scores), len(docs))
+ guesses = self._scores2guesses(scores)
+ assert len(guesses) == len(docs)
+ return guesses
+
+ def _scores2guesses(self, scores):
guesses = []
for doc_scores in scores:
doc_guesses = doc_scores.argmax(axis=1)
if not isinstance(doc_guesses, numpy.ndarray):
doc_guesses = doc_guesses.get()
guesses.append(doc_guesses)
- return guesses, tokvecs
+ return guesses
- def set_annotations(self, docs, batch_tag_ids, tensors=None):
+ def set_annotations(self, docs, batch_tag_ids):
if isinstance(docs, Doc):
docs = [docs]
cdef Doc doc
@@ -483,15 +518,9 @@ class Tagger(Pipe):
else:
doc.c[j].tag = self.vocab.strings[self.labels[tag_id]]
idx += 1
- if tensors is not None and len(tensors):
- if isinstance(doc.tensor, numpy.ndarray) \
- and not isinstance(tensors[i], numpy.ndarray):
- doc.extend_tensor(tensors[i].get())
- else:
- doc.extend_tensor(tensors[i])
doc.is_tagged = True
- def update(self, examples, drop=0., sgd=None, losses=None):
+ def update(self, examples, drop=0., sgd=None, losses=None, set_annotations=False):
self.require_model()
examples = Example.to_example_objects(examples)
if losses is not None and self.name not in losses:
@@ -500,13 +529,18 @@ class Tagger(Pipe):
if not any(len(ex.doc) if ex.doc else 0 for ex in examples):
# Handle cases where there are no tokens in any docs.
return
-
- tag_scores, bp_tag_scores = self.model.begin_update([ex.doc for ex in examples], drop=drop)
+ set_dropout_rate(self.model, drop)
+ tag_scores, bp_tag_scores = self.model.begin_update([ex.doc for ex in examples])
loss, d_tag_scores = self.get_loss(examples, tag_scores)
- bp_tag_scores(d_tag_scores, sgd=sgd)
+ bp_tag_scores(d_tag_scores)
+ if sgd not in (None, False):
+ self.model.finish_update(sgd)
if losses is not None:
losses[self.name] += loss
+ if set_annotations:
+ docs = [ex.doc for ex in examples]
+ self.set_annotations(docs, self._scores2guesses(tag_scores))
def rehearse(self, examples, drop=0., sgd=None, losses=None):
"""Perform a 'rehearsal' update, where we try to match the output of
@@ -519,10 +553,12 @@ class Tagger(Pipe):
if not any(len(doc) for doc in docs):
# Handle cases where there are no tokens in any docs.
return
- guesses, backprop = self.model.begin_update(docs, drop=drop)
+ set_dropout_rate(self.model, drop)
+ guesses, backprop = self.model.begin_update(docs)
target = self._rehearsal_model(examples)
gradient = guesses - target
- backprop(gradient, sgd=sgd)
+ backprop(gradient)
+ self.model.finish_update(sgd)
if losses is not None:
losses.setdefault(self.name, 0.0)
losses[self.name] += (gradient**2).sum()
@@ -546,7 +582,7 @@ class Tagger(Pipe):
known_labels[idx] = 0.
idx += 1
correct = self.model.ops.xp.array(correct, dtype="i")
- d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])
+ d_scores = scores - to_categorical(correct, n_classes=scores.shape[1])
d_scores *= self.model.ops.asarray(known_labels)
loss = (d_scores**2).sum()
docs = [ex.doc for ex in examples]
@@ -566,6 +602,7 @@ class Tagger(Pipe):
new_tag_map[tag] = orig_tag_map[tag]
else:
new_tag_map[tag] = {POS: X}
+
cdef Vocab vocab = self.vocab
if new_tag_map:
vocab.morphology = Morphology(vocab.strings, new_tag_map,
@@ -577,16 +614,39 @@ class Tagger(Pipe):
if hp in kwargs:
self.cfg[hp] = kwargs[hp]
self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
+ # Get batch of example docs, example outputs to call begin_training().
+ # This lets the model infer shapes.
+ n_tags = self.vocab.morphology.n_tags
+ for node in self.model.walk():
+ # TODO: softmax hack ?
+ if node.name == "softmax" and node.has_dim("nO") is None:
+ node.set_dim("nO", n_tags)
link_vectors_to_models(self.vocab)
+ self.model.initialize()
if sgd is None:
sgd = self.create_optimizer()
return sgd
@classmethod
- def Model(cls, n_tags, **cfg):
+ def Model(cls, n_tags=None, **cfg):
if cfg.get("pretrained_dims") and not cfg.get("pretrained_vectors"):
raise ValueError(TempErrors.T008)
- return build_tagger_model(n_tags, **cfg)
+ if "tok2vec" in cfg:
+ tok2vec = cfg["tok2vec"]
+ else:
+ config = {
+ "width": cfg.get("token_vector_width", 96),
+ "embed_size": cfg.get("embed_size", 2000),
+ "pretrained_vectors": cfg.get("pretrained_vectors", None),
+ "window_size": cfg.get("window_size", 1),
+ "cnn_maxout_pieces": cfg.get("cnn_maxout_pieces", 3),
+ "subword_features": cfg.get("subword_features", True),
+ "char_embed": cfg.get("char_embed", False),
+ "conv_depth": cfg.get("conv_depth", 4),
+ "bilstm_depth": cfg.get("bilstm_depth", 0),
+ }
+ tok2vec = Tok2Vec(**config)
+ return build_tagger_model(n_tags, tok2vec)
def add_label(self, label, values=None):
if not isinstance(label, str):
@@ -633,12 +693,12 @@ class Tagger(Pipe):
def load_model(b):
# TODO: Remove this once we don't have to handle previous models
if self.cfg.get("pretrained_dims") and "pretrained_vectors" not in self.cfg:
- self.cfg["pretrained_vectors"] = self.vocab.vectors.name
+ self.cfg["pretrained_vectors"] = self.vocab.vectors
if self.model is True:
token_vector_width = util.env_opt(
"token_vector_width",
self.cfg.get("token_vector_width", 96))
- self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
+ self.model = self.Model(**self.cfg)
try:
self.model.from_bytes(b)
except AttributeError:
@@ -676,9 +736,9 @@ class Tagger(Pipe):
def load_model(p):
# TODO: Remove this once we don't have to handle previous models
if self.cfg.get("pretrained_dims") and "pretrained_vectors" not in self.cfg:
- self.cfg["pretrained_vectors"] = self.vocab.vectors.name
+ self.cfg["pretrained_vectors"] = self.vocab.vectors
if self.model is True:
- self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
+ self.model = self.Model(**self.cfg)
with p.open("rb") as file_:
try:
self.model.from_bytes(file_.read())
@@ -753,10 +813,12 @@ class SentenceRecognizer(Tagger):
if not any(len(ex.doc) if ex.doc else 0 for ex in examples):
# Handle cases where there are no tokens in any docs.
return
-
- tag_scores, bp_tag_scores = self.model.begin_update([ex.doc for ex in examples], drop=drop)
+ set_dropout_rate(self.model, drop)
+ tag_scores, bp_tag_scores = self.model.begin_update([ex.doc for ex in examples])
loss, d_tag_scores = self.get_loss(examples, tag_scores)
- bp_tag_scores(d_tag_scores, sgd=sgd)
+ bp_tag_scores(d_tag_scores)
+ if sgd is not None:
+ self.model.finish_update(sgd)
if losses is not None:
losses[self.name] += loss
@@ -780,7 +842,7 @@ class SentenceRecognizer(Tagger):
known_labels[idx] = 0.
idx += 1
correct = self.model.ops.xp.array(correct, dtype="i")
- d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])
+ d_scores = scores - to_categorical(correct, n_classes=scores.shape[1])
d_scores *= self.model.ops.asarray(known_labels)
loss = (d_scores**2).sum()
docs = [ex.doc for ex in examples]
@@ -797,6 +859,7 @@ class SentenceRecognizer(Tagger):
self.model = self.Model(len(self.labels), **self.cfg)
if sgd is None:
sgd = self.create_optimizer()
+ self.model.initialize()
return sgd
@classmethod
@@ -918,6 +981,7 @@ class MultitaskObjective(Tagger):
token_vector_width = util.env_opt("token_vector_width")
self.model = self.Model(len(self.labels), tok2vec=tok2vec)
link_vectors_to_models(self.vocab)
+ self.model.initialize()
if sgd is None:
sgd = self.create_optimizer()
return sgd
@@ -925,14 +989,12 @@ class MultitaskObjective(Tagger):
@classmethod
def Model(cls, n_tags, tok2vec=None, **cfg):
token_vector_width = util.env_opt("token_vector_width", 96)
- softmax = Softmax(n_tags, token_vector_width*2)
model = chain(
tok2vec,
- LayerNorm(Maxout(token_vector_width*2, token_vector_width, pieces=3)),
- softmax
+ Maxout(nO=token_vector_width*2, nI=token_vector_width, nP=3, dropout=0.0),
+ LayerNorm(token_vector_width*2),
+ Softmax(nO=n_tags, nI=token_vector_width*2)
)
- model.tok2vec = tok2vec
- model.softmax = softmax
return model
def predict(self, docs):
@@ -958,7 +1020,7 @@ class MultitaskObjective(Tagger):
correct[idx] = self.labels[label]
idx += 1
correct = self.model.ops.xp.array(correct, dtype="i")
- d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])
+ d_scores = scores - to_categorical(correct, n_classes=scores.shape[1])
loss = (d_scores**2).sum()
return float(loss), d_scores
@@ -1047,19 +1109,18 @@ class ClozeMultitask(Pipe):
def Model(cls, vocab, tok2vec, **cfg):
output_size = vocab.vectors.data.shape[1]
output_layer = chain(
- LayerNorm(Maxout(output_size, tok2vec.nO, pieces=3)),
- zero_init(Affine(output_size, output_size, drop_factor=0.0))
+ Maxout(nO=output_size, nI=tok2vec.get_dim("nO"), nP=3, normalize=True, dropout=0.0),
+ Linear(nO=output_size, nI=output_size, init_W=zero_init)
)
model = chain(tok2vec, output_layer)
model = masked_language_model(vocab, model)
- model.tok2vec = tok2vec
- model.output_layer = output_layer
return model
def __init__(self, vocab, model=True, **cfg):
self.vocab = vocab
self.model = model
self.cfg = cfg
+ self.distance = CosineDistance(ignore_zeros=True, normalize=False)
def set_annotations(self, docs, dep_ids, tensors=None):
pass
@@ -1069,7 +1130,8 @@ class ClozeMultitask(Pipe):
link_vectors_to_models(self.vocab)
if self.model is True:
self.model = self.Model(self.vocab, tok2vec)
- X = self.model.ops.allocate((5, self.model.tok2vec.nO))
+ X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO")))
+ self.model.initialize()
self.model.output_layer.begin_training(X)
if sgd is None:
sgd = self.create_optimizer()
@@ -1088,10 +1150,11 @@ class ClozeMultitask(Pipe):
# and look them up all at once. This prevents data copying.
ids = self.model.ops.flatten([ex.doc.to_array(ID).ravel() for ex in examples])
target = vectors[ids]
- loss, gradient = get_cossim_loss(prediction, target, ignore_zeros=True)
- return float(loss), gradient
+ gradient = self.distance.get_grad(prediction, target)
+ loss = self.distance.get_loss(prediction, target)
+ return loss, gradient
- def update(self, examples, drop=0., sgd=None, losses=None):
+ def update(self, examples, drop=0., set_annotations=False, sgd=None, losses=None):
pass
def rehearse(self, examples, drop=0., sgd=None, losses=None):
@@ -1099,9 +1162,12 @@ class ClozeMultitask(Pipe):
examples = Example.to_example_objects(examples)
if losses is not None and self.name not in losses:
losses[self.name] = 0.
- predictions, bp_predictions = self.model.begin_update([ex.doc for ex in examples], drop=drop)
+ set_dropout_rate(self.model, drop)
+ predictions, bp_predictions = self.model.begin_update([ex.doc for ex in examples])
loss, d_predictions = self.get_loss(examples, self.vocab.vectors.data, predictions)
- bp_predictions(d_predictions, sgd=sgd)
+ bp_predictions(d_predictions)
+ if sgd is not None:
+ self.model.finish_update(sgd)
if losses is not None:
losses[self.name] += loss
@@ -1115,19 +1181,45 @@ class TextCategorizer(Pipe):
"""
@classmethod
- def Model(cls, nr_class=1, **cfg):
- embed_size = util.env_opt("embed_size", 2000)
- if "token_vector_width" in cfg:
- token_vector_width = cfg["token_vector_width"]
+ def Model(cls, nr_class=1, exclusive_classes=None, **cfg):
+ if nr_class == 1:
+ exclusive_classes = False
+ if exclusive_classes is None:
+ raise ValueError(
+ "TextCategorizer Model must specify 'exclusive_classes'. "
+ "This setting determines whether the model will output "
+ "scores that sum to 1 for each example. If only one class "
+ "is true for each example, you should set exclusive_classes=True. "
+ "For 'multi_label' classification, set exclusive_classes=False."
+ )
+ if "embed_size" not in cfg:
+ cfg["embed_size"] = util.env_opt("embed_size", 2000)
+ if "token_vector_width" not in cfg:
+ cfg["token_vector_width"] = util.env_opt("token_vector_width", 96)
+ if cfg.get("architecture") == "bow":
+ return build_bow_text_classifier(nr_class, exclusive_classes, **cfg)
else:
- token_vector_width = util.env_opt("token_vector_width", 96)
- if cfg.get("architecture") == "simple_cnn":
- tok2vec = Tok2Vec(token_vector_width, embed_size, **cfg)
- return build_simple_cnn_text_classifier(tok2vec, nr_class, **cfg)
- elif cfg.get("architecture") == "bow":
- return build_bow_text_classifier(nr_class, **cfg)
- else:
- return build_text_classifier(nr_class, **cfg)
+ if "tok2vec" in cfg:
+ tok2vec = cfg["tok2vec"]
+ else:
+ config = {
+ "width": cfg.get("token_vector_width", 96),
+ "embed_size": cfg.get("embed_size", 2000),
+ "pretrained_vectors": cfg.get("pretrained_vectors", None),
+ "window_size": cfg.get("window_size", 1),
+ "cnn_maxout_pieces": cfg.get("cnn_maxout_pieces", 3),
+ "subword_features": cfg.get("subword_features", True),
+ "char_embed": cfg.get("char_embed", False),
+ "conv_depth": cfg.get("conv_depth", 4),
+ "bilstm_depth": cfg.get("bilstm_depth", 0),
+ }
+ tok2vec = Tok2Vec(**config)
+ return build_simple_cnn_text_classifier(
+ tok2vec,
+ nr_class,
+ exclusive_classes,
+ **cfg
+ )
@property
def tok2vec(self):
@@ -1141,6 +1233,8 @@ class TextCategorizer(Pipe):
self.model = model
self._rehearsal_model = None
self.cfg = dict(cfg)
+ if "exclusive_classes" not in cfg:
+ self.cfg["exclusive_classes"] = True
@property
def labels(self):
@@ -1180,7 +1274,7 @@ class TextCategorizer(Pipe):
scores = xp.zeros((len(docs), len(self.labels)))
return scores, tensors
- scores = self.model(docs)
+ scores = self.model.predict(docs)
scores = self.model.ops.asarray(scores)
return scores, tensors
@@ -1189,18 +1283,24 @@ class TextCategorizer(Pipe):
for j, label in enumerate(self.labels):
doc.cats[label] = float(scores[i, j])
- def update(self, examples, state=None, drop=0., sgd=None, losses=None):
+ def update(self, examples, state=None, drop=0., set_annotations=False, sgd=None, losses=None):
self.require_model()
examples = Example.to_example_objects(examples)
if not any(len(ex.doc) if ex.doc else 0 for ex in examples):
# Handle cases where there are no tokens in any docs.
return
- scores, bp_scores = self.model.begin_update([ex.doc for ex in examples], drop=drop)
+ set_dropout_rate(self.model, drop)
+ scores, bp_scores = self.model.begin_update([ex.doc for ex in examples])
loss, d_scores = self.get_loss(examples, scores)
- bp_scores(d_scores, sgd=sgd)
+ bp_scores(d_scores)
+ if sgd is not None:
+ self.model.finish_update(sgd)
if losses is not None:
losses.setdefault(self.name, 0.0)
losses[self.name] += loss
+ if set_annotations:
+ docs = [ex.doc for ex in examples]
+ self.set_annotations(docs, scores=scores)
def rehearse(self, examples, drop=0., sgd=None, losses=None):
if self._rehearsal_model is None:
@@ -1210,10 +1310,13 @@ class TextCategorizer(Pipe):
if not any(len(doc) for doc in docs):
# Handle cases where there are no tokens in any docs.
return
- scores, bp_scores = self.model.begin_update(docs, drop=drop)
+ set_dropout_rate(self.model, drop)
+ scores, bp_scores = self.model.begin_update(docs)
target = self._rehearsal_model(examples)
gradient = scores - target
- bp_scores(gradient, sgd=sgd)
+ bp_scores(gradient)
+ if sgd is not None:
+ self.model.finish_update(sgd)
if losses is not None:
losses.setdefault(self.name, 0.0)
losses[self.name] += (gradient**2).sum()
@@ -1247,7 +1350,7 @@ class TextCategorizer(Pipe):
# - a huge problem.
raise ValueError(Errors.E116)
# smaller = self.model._layers[-1]
- # larger = Affine(len(self.labels)+1, smaller.nI)
+ # larger = Linear(len(self.labels)+1, smaller.nI)
# copy_array(larger.W[:smaller.nO], smaller.W)
# copy_array(larger.b[:smaller.nO], smaller.b)
# self.model._layers[-1] = larger
@@ -1259,12 +1362,15 @@ class TextCategorizer(Pipe):
for cat in example.doc_annotation.cats:
self.add_label(cat)
if self.model is True:
- self.cfg["pretrained_vectors"] = kwargs.get("pretrained_vectors")
+ self.cfg.update(kwargs)
self.require_labels()
self.model = self.Model(len(self.labels), **self.cfg)
link_vectors_to_models(self.vocab)
if sgd is None:
sgd = self.create_optimizer()
+ # TODO: use get_examples instead
+ docs = [Doc(Vocab(), words=["hello"])]
+ self.model.initialize(X=docs)
return sgd
@@ -1382,6 +1488,7 @@ class EntityLinker(Pipe):
self.model = True
self.kb = None
self.cfg = dict(cfg)
+ self.distance = CosineDistance(normalize=False)
def set_kb(self, kb):
self.kb = kb
@@ -1399,16 +1506,14 @@ class EntityLinker(Pipe):
def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs):
self.require_kb()
self.cfg["entity_width"] = self.kb.entity_vector_length
-
if self.model is True:
self.model = self.Model(**self.cfg)
-
+ self.model.initialize()
if sgd is None:
sgd = self.create_optimizer()
-
return sgd
- def update(self, examples, state=None, drop=0.0, sgd=None, losses=None):
+ def update(self, examples, state=None, set_annotations=False, drop=0.0, sgd=None, losses=None):
self.require_model()
self.require_kb()
if losses is not None:
@@ -1416,9 +1521,12 @@ class EntityLinker(Pipe):
if not examples:
return 0
examples = Example.to_example_objects(examples)
-
sentence_docs = []
docs = [ex.doc for ex in examples]
+ if set_annotations:
+ # This seems simpler than other ways to get that exact output -- but
+ # it does run the model twice :(
+ predictions = self.model.predict(docs)
golds = [ex.gold for ex in examples]
for doc, gold in zip(docs, golds):
@@ -1443,13 +1551,17 @@ class EntityLinker(Pipe):
except AttributeError:
# Catch the exception when ent.sent is None and provide a user-friendly warning
raise RuntimeError(Errors.E030)
-
- sentence_encodings, bp_context = self.model.begin_update(sentence_docs, drop=drop)
+ set_dropout_rate(self.model, drop)
+ sentence_encodings, bp_context = self.model.begin_update(sentence_docs)
loss, d_scores = self.get_similarity_loss(scores=sentence_encodings, golds=golds)
- bp_context(d_scores, sgd=sgd)
+ bp_context(d_scores)
+ if sgd is not None:
+ self.model.finish_update(sgd)
if losses is not None:
losses[self.name] += loss
+ if set_annotations:
+ self.set_annotations(docs, predictions)
return loss
def get_similarity_loss(self, golds, scores):
@@ -1467,7 +1579,8 @@ class EntityLinker(Pipe):
if scores.shape != entity_encodings.shape:
raise RuntimeError(Errors.E147.format(method="get_similarity_loss", msg="gold entities do not match up"))
- loss, gradients = get_cossim_loss(yh=scores, y=entity_encodings)
+ gradients = self.distance.get_grad(scores, entity_encodings)
+ loss = self.distance.get_loss(scores, entity_encodings)
loss = loss / len(entity_encodings)
return loss, gradients
@@ -1533,7 +1646,7 @@ class EntityLinker(Pipe):
for sent in doc.sents:
sent_doc = sent.as_doc()
# currently, the context is the same for each entity in a sentence (should be refined)
- sentence_encoding = self.model([sent_doc])[0]
+ sentence_encoding = self.model.predict([sent_doc])[0]
xp = get_array_module(sentence_encoding)
sentence_encoding_t = sentence_encoding.T
sentence_norm = xp.linalg.norm(sentence_encoding_t)
@@ -1720,7 +1833,6 @@ class Sentencizer(Pipe):
self.set_annotations(docs, scores, tensors=tensors)
else:
self.set_annotations(docs, predictions)
-
if as_example:
annotated_examples = []
for ex, doc in zip(examples, docs):
@@ -1729,7 +1841,7 @@ class Sentencizer(Pipe):
yield from annotated_examples
else:
yield from docs
-
+
def predict(self, docs):
"""Apply the pipeline's model to a batch of docs, without
modifying them.
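
Across `Tagger`, `TextCategorizer`, `SentenceRecognizer`, `EntityLinker` and the multitask objectives, the same four-step training loop replaces the old `begin_update(..., drop=drop)` / `backprop(..., sgd=sgd)` calls. A sketch of that recurring shape; the `component` object with `model`, `get_loss`, `set_annotations` and `name` stands in for any of the pipes above and is not a real class:

    from thinc.model import set_dropout_rate      # import path as used in this patch

    def update_sketch(component, examples, *, drop=0.0, sgd=None, losses=None, set_annotations=False):
        docs = [ex.doc for ex in examples]
        set_dropout_rate(component.model, drop)                 # 1. dropout is configured on the model
        scores, backprop = component.model.begin_update(docs)   # 2. forward pass, no drop/sgd kwargs
        loss, d_scores = component.get_loss(examples, scores)
        backprop(d_scores)                                      # 3. backprop only accumulates gradients
        if sgd is not None:
            component.model.finish_update(sgd)                  # 4. the optimizer is applied explicitly
        if losses is not None:
            losses.setdefault(component.name, 0.0)
            losses[component.name] += loss
        if set_annotations:
            component.set_annotations(docs, scores)
        return loss
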
diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py
new file mode 100644
index 000000000..9857c87eb
--- /dev/null
+++ b/spacy/pipeline/tok2vec.py
@@ -0,0 +1,188 @@
+from .pipes import Pipe
+from ..gold import Example
+from ..tokens import Doc
+from ..vocab import Vocab
+from ..language import component
+from ..util import link_vectors_to_models, minibatch, registry, eg2doc
+
+from thinc.model import Model, set_dropout_rate
+
+
+@component("tok2vec", assigns=["doc.tensor"])
+class Tok2Vec(Pipe):
+ @classmethod
+ def from_nlp(cls, nlp, **cfg):
+ return cls(nlp.vocab, **cfg)
+
+ @classmethod
+ def Model(cls, architecture, **cfg):
+ """Create a new statistical model for the class.
+
+ architecture (str): The registered model architecture to use.
+ **cfg: Config parameters.
+ RETURNS (Model): A `thinc.model.Model` or similar instance.
+ """
+ model = registry.architectures.get(architecture)
+ return model(**cfg)
+
+ def __init__(self, vocab, model=True, **cfg):
+ """Construct a new statistical model. Weights are not allocated on
+ initialisation.
+ vocab (Vocab): A `Vocab` instance. The model must share the same `Vocab`
+ instance with the `Doc` objects it will process.
+ model (Model): A `Model` instance or `True` to allocate one later.
+ **cfg: Config parameters.
+ """
+ self.vocab = vocab
+ self.model = model
+ self.cfg = dict(cfg)
+ self.listeners = []
+
+ def create_listener(self):
+ listener = Tok2VecListener(upstream_name="tok2vec", width=self.model.get_dim("nO"))
+ self.listeners.append(listener)
+
+ def add_listener(self, listener):
+ self.listeners.append(listener)
+
+ def find_listeners(self, model):
+ for node in model.walk():
+ if isinstance(node, Tok2VecListener) and node.upstream_name == self.name:
+ self.add_listener(node)
+
+ def __call__(self, doc):
+ """Add context-sensitive vectors to a `Doc`, e.g. from a CNN or LSTM
+ model. Vectors are set to the `Doc.tensor` attribute.
+ doc (Doc): The document to add vectors to.
+ RETURNS (Doc): The processed document.
+ """
+ tokvecses = self.predict([doc])
+ self.set_annotations([doc], tokvecses)
+ return doc
+
+ def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
+ """Process `Doc` objects as a stream.
+ stream (iterator): A sequence of `Doc` objects to process.
+ batch_size (int): Number of `Doc` objects to group.
+ n_threads (int): Number of threads.
+ YIELDS (iterator): A sequence of `Doc` objects, in order of input.
+ """
+ for batch in minibatch(stream, batch_size):
+ batch = list(batch)
+ if as_example:
+ docs = [eg2doc(doc) for doc in batch]
+ else:
+ docs = batch
+ tokvecses = self.predict(docs)
+ self.set_annotations(docs, tokvecses)
+ yield from batch
+
+ def predict(self, docs):
+ """Return a single tensor for a batch of documents.
+ docs (iterable): A sequence of `Doc` objects.
+ RETURNS (object): Vector representations for each token in the documents.
+ """
+ tokvecs = self.model.predict(docs)
+ batch_id = Tok2VecListener.get_batch_id(docs)
+ for listener in self.listeners:
+ listener.receive(batch_id, tokvecs, None)
+ return tokvecs
+
+ def set_annotations(self, docs, tokvecses):
+ """Set the tensor attribute for a batch of documents.
+ docs (iterable): A sequence of `Doc` objects.
+ tokvecses (object): Vector representations for each token in the documents, one array per `Doc`.
+ """
+ for doc, tokvecs in zip(docs, tokvecses):
+ assert tokvecs.shape[0] == len(doc)
+ doc.tensor = tokvecs
+
+ def update(self, examples, drop=0.0, sgd=None, losses=None, set_annotations=False):
+ """Update the model.
+ examples (iterable): A batch of examples.
+ drop (float): The dropout rate.
+ sgd (callable): An optimizer.
+ losses (dict): Optional record of the loss during training, updated in place.
+ """
+ if losses is None:
+ losses = {}
+ examples = Example.to_example_objects(examples)
+ docs = [eg.doc for eg in examples]
+ if isinstance(docs, Doc):
+ docs = [docs]
+ set_dropout_rate(self.model, drop)
+ tokvecs, bp_tokvecs = self.model.begin_update(docs)
+
+ def capture_losses(d_tokvecs):
+ """Accumulate tok2vec loss before doing backprop."""
+ l2_loss = sum((d_t2v**2).sum() for d_t2v in d_tokvecs)
+ if self.name in losses:
+ losses[self.name] += l2_loss / len(d_tokvecs)
+ else:
+ losses[self.name] = l2_loss / len(d_tokvecs)
+ return bp_tokvecs(d_tokvecs)
+
+ batch_id = Tok2VecListener.get_batch_id(docs)
+ for listener in self.listeners:
+ listener.receive(batch_id, tokvecs, capture_losses)
+ if sgd is not None:
+ self.model.finish_update(sgd)
+ if set_annotations:
+ self.set_annotations(docs, tokvecs)
+
+ def get_loss(self, docs, golds, scores):
+ pass
+
+ def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs):
+ """Allocate models and pre-process training data
+
+ get_examples (function): Function returning example training data.
+ pipeline (list): The pipeline the model is part of.
+ """
+ if self.model is True:
+ self.model = self.Model(**self.cfg)
+ # TODO: use examples instead ?
+ docs = [Doc(Vocab(), words=["hello"])]
+ self.model.initialize(X=docs)
+ link_vectors_to_models(self.vocab)
+
+
+class Tok2VecListener(Model):
+ """A layer that gets fed its answers from an upstream connection,
+ for instance from a component earlier in the pipeline.
+ """
+ name = "tok2vec-listener"
+
+ def __init__(self, upstream_name, width):
+ Model.__init__(self, name=self.name, forward=forward, dims={"nO": width})
+ self.upstream_name = upstream_name
+ self._batch_id = None
+ self._outputs = None
+ self._backprop = None
+
+ @classmethod
+ def get_batch_id(cls, inputs):
+ return sum(sum(token.orth for token in doc) for doc in inputs)
+
+ def receive(self, batch_id, outputs, backprop):
+ self._batch_id = batch_id
+ self._outputs = outputs
+ self._backprop = backprop
+
+ def verify_inputs(self, inputs):
+ if self._batch_id is None and self._outputs is None:
+ raise ValueError
+ else:
+ batch_id = self.get_batch_id(inputs)
+ if batch_id != self._batch_id:
+ raise ValueError(f"Mismatched IDs! {batch_id} vs {self._batch_id}")
+ else:
+ return True
+
+
+def forward(model: Tok2VecListener, inputs, is_train):
+ if is_train:
+ model.verify_inputs(inputs)
+ return model._outputs, model._backprop
+ else:
+ return [doc.tensor for doc in inputs], lambda dX: []
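
The `Tok2Vec` component and `Tok2VecListener` communicate out of band: the upstream component computes token vectors once per batch, tags the batch with an ID derived from the docs, and pushes `(batch_id, outputs, backprop)` to every listener; downstream models embed the listener as a layer whose forward simply replays whatever was last received, verifying the batch ID during training. A stripped-down, framework-free sketch of that handshake (class and variable names are illustrative, not spaCy's):

    class MiniListener:
        """Downstream stand-in: replays whatever the upstream component sent."""
        def __init__(self):
            self.batch_id = None
            self.outputs = None
            self.backprop = None

        def receive(self, batch_id, outputs, backprop):
            self.batch_id, self.outputs, self.backprop = batch_id, outputs, backprop

        def forward(self, batch_id):
            if batch_id != self.batch_id:
                raise ValueError(f"Mismatched IDs! {batch_id} vs {self.batch_id}")
            return self.outputs, self.backprop

    class MiniTok2Vec:
        """Upstream stand-in: computes 'vectors' once and broadcasts them."""
        def __init__(self, listeners):
            self.listeners = listeners

        def predict(self, docs):
            outputs = [f"vectors for {doc}" for doc in docs]        # placeholder computation
            batch_id = hash(tuple(docs))                            # spaCy sums token orth IDs instead
            for listener in self.listeners:
                listener.receive(batch_id, outputs, backprop=None)  # no backprop at predict time
            return outputs, batch_id

    listener = MiniListener()
    upstream = MiniTok2Vec([listener])
    outputs, batch_id = upstream.predict(["doc one", "doc two"])
    replayed, _ = listener.forward(batch_id)
    assert replayed == outputs
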
diff --git a/spacy/syntax/_beam_utils.pxd b/spacy/syntax/_beam_utils.pxd
index 36b0c05da..cf99ac3d1 100644
--- a/spacy/syntax/_beam_utils.pxd
+++ b/spacy/syntax/_beam_utils.pxd
@@ -1,4 +1,4 @@
-from thinc.typedefs cimport class_t, hash_t
+from ..typedefs cimport hash_t, class_t
# These are passed as callbacks to thinc.search.Beam
cdef int transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1
diff --git a/spacy/syntax/_beam_utils.pyx b/spacy/syntax/_beam_utils.pyx
index b1085c762..32cf9193a 100644
--- a/spacy/syntax/_beam_utils.pyx
+++ b/spacy/syntax/_beam_utils.pyx
@@ -5,9 +5,9 @@ import numpy
from cpython.ref cimport PyObject, Py_XDECREF
from thinc.extra.search cimport Beam
from thinc.extra.search import MaxViolation
-from thinc.typedefs cimport hash_t, class_t
from thinc.extra.search cimport MaxViolation
+from ..typedefs cimport hash_t, class_t
from .transition_system cimport TransitionSystem, Transition
from ..gold cimport GoldParse
from ..errors import Errors
diff --git a/spacy/syntax/_parser_model.pxd b/spacy/syntax/_parser_model.pxd
index 9c72f3415..15befb372 100644
--- a/spacy/syntax/_parser_model.pxd
+++ b/spacy/syntax/_parser_model.pxd
@@ -1,6 +1,6 @@
from libc.string cimport memset, memcpy
from libc.stdlib cimport calloc, free, realloc
-from thinc.typedefs cimport weight_t, class_t, hash_t
+from ..typedefs cimport weight_t, class_t, hash_t
from ._state cimport StateC
diff --git a/spacy/syntax/_parser_model.pyx b/spacy/syntax/_parser_model.pyx
index 19d05e77f..cb8e1d127 100644
--- a/spacy/syntax/_parser_model.pyx
+++ b/spacy/syntax/_parser_model.pyx
@@ -10,18 +10,14 @@ from libcpp.vector cimport vector
from libc.string cimport memset, memcpy
from libc.stdlib cimport calloc, free, realloc
from cymem.cymem cimport Pool
-from thinc.typedefs cimport weight_t, class_t, hash_t
from thinc.extra.search cimport Beam
-from thinc.api import chain, clone
-from thinc.v2v import Model, Maxout, Affine
-from thinc.misc import LayerNorm
-from thinc.neural.ops import CupyOps, NumpyOps
-from thinc.neural.util import get_array_module
-from thinc.linalg cimport Vec, VecVec
+from thinc.layers import Linear
+from thinc.model import Model
+from thinc.backends import CupyOps, NumpyOps, use_ops
+from thinc.backends.linalg cimport Vec, VecVec
cimport blis.cy
-from .._ml import zero_init, PrecomputableAffine, Tok2Vec, flatten
-from .._ml import link_vectors_to_models, create_default_optimizer
+from ..typedefs cimport weight_t, class_t, hash_t
from ..compat import copy_array
from ..tokens.doc cimport Doc
from ..gold cimport GoldParse
@@ -31,6 +27,7 @@ from .stateclass cimport StateClass
from .transition_system cimport Transition
from . import _beam_utils
from . import nonproj
+from ..util import link_vectors_to_models, create_default_optimizer
cdef WeightsC get_c_weights(model) except *:
@@ -44,8 +41,8 @@ cdef WeightsC get_c_weights(model) except *:
output.hidden_weights = NULL
output.hidden_bias = NULL
else:
- vec2scores_W = model.vec2scores.W
- vec2scores_b = model.vec2scores.b
+ vec2scores_W = model.vec2scores.get_param("W")
+ vec2scores_b = model.vec2scores.get_param("b")
output.hidden_weights = vec2scores_W.data
output.hidden_bias = vec2scores_b.data
cdef np.ndarray class_mask = model._class_mask
@@ -57,12 +54,12 @@ cdef SizesC get_c_sizes(model, int batch_size) except *:
cdef SizesC output
output.states = batch_size
if model.vec2scores is None:
- output.classes = model.state2vec.nO
+ output.classes = model.state2vec.get_dim("nO")
else:
- output.classes = model.vec2scores.nO
- output.hiddens = model.state2vec.nO
- output.pieces = model.state2vec.nP
- output.feats = model.state2vec.nF
+ output.classes = model.vec2scores.get_dim("nO")
+ output.hiddens = model.state2vec.get_dim("nO")
+ output.pieces = model.state2vec.get_dim("nP")
+ output.feats = model.state2vec.get_dim("nF")
output.embed_width = model.tokvecs.shape[1]
return output
@@ -226,7 +223,7 @@ cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) no
class ParserModel(Model):
def __init__(self, tok2vec, lower_model, upper_model, unseen_classes=None):
- Model.__init__(self)
+ Model.__init__(self, name="parser_model", forward=forward)
self._layers = [tok2vec, lower_model]
if upper_model is not None:
self._layers.append(upper_model)
@@ -235,41 +232,47 @@ class ParserModel(Model):
for class_ in unseen_classes:
self.unseen_classes.add(class_)
- def begin_update(self, docs, drop=0.):
- step_model = ParserStepModel(docs, self._layers, drop=drop,
- unseen_classes=self.unseen_classes)
- def finish_parser_update(golds, sgd=None):
- step_model.make_updates(sgd)
- return None
- return step_model, finish_parser_update
+ def predict(self, docs):
+ step_model = ParserStepModel(docs, self._layers,
+ unseen_classes=self.unseen_classes, train=False)
+ return step_model
- def resize_output(self, new_output):
+ def resize_output(self, new_nO):
if len(self._layers) == 2:
return
- if new_output == self.upper.nO:
+ if new_nO == self.upper.get_dim("nO"):
return
smaller = self.upper
-
- with Model.use_device('cpu'):
- larger = Affine(new_output, smaller.nI)
- larger.W.fill(0.0)
- larger.b.fill(0.0)
- # It seems very unhappy if I pass these as smaller.W?
- # Seems to segfault. Maybe it's a descriptor protocol thing?
- smaller_W = smaller.W
- larger_W = larger.W
- smaller_b = smaller.b
- larger_b = larger.b
+ nI = smaller.get_dim("nI")
+ with use_ops('numpy'):
+ larger = Linear(new_nO, nI)
+ larger_W = larger.ops.alloc2f(new_nO, nI)
+ larger_b = larger.ops.alloc1f(new_nO)
+ smaller_W = smaller.get_param("W")
+ smaller_b = smaller.get_param("b")
# Weights are stored in (nr_out, nr_in) format, so we're basically
# just adding rows here.
- larger_W[:smaller.nO] = smaller_W
- larger_b[:smaller.nO] = smaller_b
+ larger_W[:smaller.get_dim("nO")] = smaller_W
+ larger_b[:smaller.get_dim("nO")] = smaller_b
+ larger.set_param("W", larger_W)
+ larger.set_param("b", larger_b)
self._layers[-1] = larger
- for i in range(smaller.nO, new_output):
+ for i in range(smaller.get_dim("nO"), new_nO):
self.unseen_classes.add(i)
- def begin_training(self, X, y=None):
- self.lower.begin_training(X, y=y)
+ def initialize(self, X=None, Y=None):
+ self.tok2vec.initialize()
+ self.lower.initialize(X=X, Y=Y)
+ if self.upper is not None:
+ # In case we need to trigger the callbacks
+ statevecs = self.ops.alloc((2, self.lower.get_dim("nO")))
+ self.upper.initialize(X=statevecs)
+
+ def finish_update(self, optimizer):
+ self.tok2vec.finish_update(optimizer)
+ self.lower.finish_update(optimizer)
+ if self.upper is not None:
+ self.upper.finish_update(optimizer)
@property
def tok2vec(self):
@@ -284,17 +287,25 @@ class ParserModel(Model):
return self._layers[2]
+def forward(model:ParserModel, X, is_train):
+ step_model = ParserStepModel(X, model._layers, unseen_classes=model.unseen_classes,
+ train=is_train)
+
+ return step_model, step_model.finish_steps
+
+
class ParserStepModel(Model):
- def __init__(self, docs, layers, unseen_classes=None, drop=0.):
- self.tokvecs, self.bp_tokvecs = layers[0].begin_update(docs, drop=drop)
- if layers[1].nP >= 2:
+ def __init__(self, docs, layers, unseen_classes=None, train=True):
+ Model.__init__(self, name="parser_step_model", forward=step_forward)
+ self.tokvecs, self.bp_tokvecs = layers[0](docs, is_train=train)
+ if layers[1].get_dim("nP") >= 2:
activation = "maxout"
elif len(layers) == 2:
activation = None
else:
activation = "relu"
self.state2vec = precompute_hiddens(len(docs), self.tokvecs, layers[1],
- activation=activation, drop=drop)
+ activation=activation, train=train)
if len(layers) == 3:
self.vec2scores = layers[-1]
else:
@@ -304,7 +315,7 @@ class ParserStepModel(Model):
if self.vec2scores is None:
self._class_mask = numpy.zeros((self.state2vec.nO,), dtype='f')
else:
- self._class_mask = numpy.zeros((self.vec2scores.nO,), dtype='f')
+ self._class_mask = numpy.zeros((self.vec2scores.get_dim("nO"),), dtype='f')
self._class_mask.fill(1)
if unseen_classes is not None:
for class_ in unseen_classes:
@@ -323,40 +334,6 @@ class ParserStepModel(Model):
def mark_class_seen(self, class_):
self._class_mask[class_] = 1
- def begin_update(self, states, drop=0.):
- token_ids = self.get_token_ids(states)
- vector, get_d_tokvecs = self.state2vec.begin_update(token_ids, drop=0.0)
- if self.vec2scores is not None:
- mask = self.vec2scores.ops.get_dropout_mask(vector.shape, drop)
- if mask is not None:
- vector *= mask
- scores, get_d_vector = self.vec2scores.begin_update(vector, drop=drop)
- else:
- scores = NumpyOps().asarray(vector)
- get_d_vector = lambda d_scores, sgd=None: d_scores
- mask = None
- # If the class is unseen, make sure its score is minimum
- scores[:, self._class_mask == 0] = numpy.nanmin(scores)
-
- def backprop_parser_step(d_scores, sgd=None):
- # Zero vectors for unseen classes
- d_scores *= self._class_mask
- d_vector = get_d_vector(d_scores, sgd=sgd)
- if mask is not None:
- d_vector *= mask
- if isinstance(self.state2vec.ops, CupyOps) \
- and not isinstance(token_ids, self.state2vec.ops.xp.ndarray):
- # Move token_ids and d_vector to GPU, asynchronously
- self.backprops.append((
- util.get_async(self.cuda_stream, token_ids),
- util.get_async(self.cuda_stream, d_vector),
- get_d_tokvecs
- ))
- else:
- self.backprops.append((token_ids, d_vector, get_d_tokvecs))
- return None
- return scores, backprop_parser_step
-
def get_token_ids(self, batch):
states = _beam_utils.collect_states(batch)
cdef StateClass state
@@ -370,25 +347,56 @@ class ParserStepModel(Model):
c_ids += ids.shape[1]
return ids
- def make_updates(self, sgd):
+ def finish_steps(self, golds):
# Add a padding vector to the d_tokvecs gradient, so that missing
# values don't affect the real gradient.
- d_tokvecs = self.ops.allocate((self.tokvecs.shape[0]+1, self.tokvecs.shape[1]))
+ d_tokvecs = self.ops.alloc((self.tokvecs.shape[0]+1, self.tokvecs.shape[1]))
# Tells CUDA to block, so our async copies complete.
if self.cuda_stream is not None:
self.cuda_stream.synchronize()
for ids, d_vector, bp_vector in self.backprops:
- d_state_features = bp_vector((d_vector, ids), sgd=sgd)
+ d_state_features = bp_vector((d_vector, ids))
ids = ids.flatten()
d_state_features = d_state_features.reshape(
(ids.size, d_state_features.shape[2]))
self.ops.scatter_add(d_tokvecs, ids,
d_state_features)
# Padded -- see update()
- self.bp_tokvecs(d_tokvecs[:-1], sgd=sgd)
+ if isinstance(self.ops, CupyOps):
+ d_tokvecs = self.ops.to_numpy(d_tokvecs)
+ self.bp_tokvecs(d_tokvecs[:-1])
return d_tokvecs
+def step_forward(model: ParserStepModel, states, is_train):
+ token_ids = model.get_token_ids(states)
+ vector, get_d_tokvecs = model.state2vec(token_ids, is_train)
+ if model.vec2scores is not None:
+ scores, get_d_vector = model.vec2scores(vector, is_train)
+ else:
+ scores = NumpyOps().asarray(vector)
+ get_d_vector = lambda d_scores: d_scores
+ # If the class is unseen, make sure its score is minimum
+ scores[:, model._class_mask == 0] = numpy.nanmin(scores)
+
+ def backprop_parser_step(d_scores):
+ # Zero vectors for unseen classes
+ d_scores *= model._class_mask
+ d_vector = get_d_vector(d_scores)
+ if isinstance(model.state2vec.ops, CupyOps) \
+ and not isinstance(token_ids, model.state2vec.ops.xp.ndarray):
+ # Move token_ids and d_vector to GPU, asynchronously
+ model.backprops.append((
+ util.get_async(model.cuda_stream, token_ids),
+ util.get_async(model.cuda_stream, d_vector),
+ get_d_tokvecs
+ ))
+ else:
+ model.backprops.append((token_ids, d_vector, get_d_tokvecs))
+ return None
+ return scores, backprop_parser_step
+
+
cdef class precompute_hiddens:
"""Allow a model to be "primed" by pre-computing input features in bulk.
@@ -406,7 +414,7 @@ cdef class precompute_hiddens:
we can do all our hard maths up front, packed into large multiplications,
and do the hard-to-program parsing on the CPU.
"""
- cdef readonly int nF, nO, nP
+ cdef readonly int nF, nO, nP # TODO: make these more like the dimensions in thinc
cdef bint _is_synchronized
cdef public object ops
cdef np.ndarray _features
@@ -417,8 +425,8 @@ cdef class precompute_hiddens:
cdef object activation
def __init__(self, batch_size, tokvecs, lower_model, cuda_stream=None,
- activation="maxout", drop=0.):
- gpu_cached, bp_features = lower_model.begin_update(tokvecs, drop=drop)
+ activation="maxout", train=False):
+ gpu_cached, bp_features = lower_model(tokvecs, train)
cdef np.ndarray cached
if not isinstance(gpu_cached, numpy.ndarray):
# Note the passing of cuda_stream here: it lets
@@ -427,12 +435,16 @@ cdef class precompute_hiddens:
cached = gpu_cached.get(stream=cuda_stream)
else:
cached = gpu_cached
- if not isinstance(lower_model.b, numpy.ndarray):
- self.bias = lower_model.b.get()
+ if not isinstance(lower_model.get_param("b"), numpy.ndarray):
+ # self.bias = lower_model.get_param("b").get(stream=cuda_stream) ???
+ self.bias = lower_model.get_param("b")
else:
- self.bias = lower_model.b
+ self.bias = lower_model.get_param("b")
self.nF = cached.shape[1]
- self.nP = getattr(lower_model, 'nP', 1)
+ if lower_model.has_dim("nP"):
+ self.nP = lower_model.get_dim("nP")
+ else:
+ self.nP = 1
self.nO = cached.shape[2]
self.ops = lower_model.ops
assert activation in (None, "relu", "maxout")
@@ -448,10 +460,26 @@ cdef class precompute_hiddens:
self._is_synchronized = True
return self._cached.data
- def __call__(self, X):
- return self.begin_update(X, drop=None)[0]
+ def get_dim(self, name):
+ if name == "nF":
+ return self.nF
+ elif name == "nP":
+ return self.nP
+ elif name == "nO":
+ return self.nO
+ else:
+ raise ValueError(f"Dimension {name} invalid -- only nO, nF, nP")
- def begin_update(self, token_ids, drop=0.):
+ def __call__(self, X, bint is_train):
+ if is_train:
+ return self.begin_update(X)
+ else:
+ return self.predict(X), lambda X: X
+
+ def predict(self, X):
+ return self.begin_update(X)[0]
+
+ def begin_update(self, token_ids):
cdef np.ndarray state_vector = numpy.zeros(
(token_ids.shape[0], self.nO, self.nP), dtype='f')
# This is tricky, but (assuming GPU available);
@@ -466,13 +494,13 @@ cdef class precompute_hiddens:
sum_state_features(state_vector.data,
feat_weights, &ids[0,0],
token_ids.shape[0], self.nF, self.nO*self.nP)
- state_vector += self.bias
+ state_vector = state_vector + self.bias
state_vector, bp_nonlinearity = self._nonlinearity(state_vector)
- def backward(d_state_vector_ids, sgd=None):
+ def backward(d_state_vector_ids):
d_state_vector, token_ids = d_state_vector_ids
- d_state_vector = bp_nonlinearity(d_state_vector, sgd)
- d_tokens = bp_hiddens((d_state_vector, token_ids), sgd)
+ d_state_vector = bp_nonlinearity(d_state_vector)
+ d_tokens = bp_hiddens((d_state_vector, token_ids))
return d_tokens
return state_vector, backward
@@ -492,7 +520,7 @@ cdef class precompute_hiddens:
else:
mask = None
- def backprop_nonlinearity(d_best, sgd=None):
+ def backprop_nonlinearity(d_best):
if isinstance(d_best, numpy.ndarray):
ops = NumpyOps()
else:
diff --git a/spacy/syntax/arc_eager.pxd b/spacy/syntax/arc_eager.pxd
index 972ad682a..9e9593eee 100644
--- a/spacy/syntax/arc_eager.pxd
+++ b/spacy/syntax/arc_eager.pxd
@@ -1,6 +1,6 @@
from cymem.cymem cimport Pool
-from thinc.typedefs cimport weight_t
+from ..typedefs cimport weight_t
from .stateclass cimport StateClass
from ..typedefs cimport attr_t
diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx
index 5dfa20b7d..50b916fe2 100644
--- a/spacy/syntax/ner.pyx
+++ b/spacy/syntax/ner.pyx
@@ -1,7 +1,7 @@
-from thinc.typedefs cimport weight_t
from thinc.extra.search cimport Beam
from collections import Counter
+from ..typedefs cimport weight_t
from .stateclass cimport StateClass
from ._state cimport StateC
from .transition_system cimport Transition
diff --git a/spacy/syntax/nn_parser.pxd b/spacy/syntax/nn_parser.pxd
index 707c9654c..d77a04420 100644
--- a/spacy/syntax/nn_parser.pxd
+++ b/spacy/syntax/nn_parser.pxd
@@ -1,5 +1,3 @@
-from thinc.typedefs cimport atom_t
-
from .stateclass cimport StateClass
from .arc_eager cimport TransitionSystem
from ..vocab cimport Vocab
diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index 14d9e54d4..c73bc9a0a 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -13,24 +13,23 @@ from libcpp.vector cimport vector
from libc.string cimport memset, memcpy
from libc.stdlib cimport calloc, free
from cymem.cymem cimport Pool
-from thinc.typedefs cimport weight_t, class_t, hash_t
from thinc.extra.search cimport Beam
-from thinc.api import chain, clone
-from thinc.v2v import Model, Maxout, Affine
-from thinc.misc import LayerNorm
-from thinc.neural.ops import NumpyOps, CupyOps
-from thinc.neural.util import get_array_module
-from thinc.linalg cimport Vec, VecVec
+from thinc.layers import chain, clone, Linear, list2array
+from thinc.backends import NumpyOps, CupyOps, use_ops
+from thinc.util import get_array_module
+from thinc.backends.linalg cimport Vec, VecVec
+from thinc.initializers import zero_init
+from thinc.model import set_dropout_rate
import srsly
from spacy.gold import Example
+from ..typedefs cimport weight_t, class_t, hash_t
from ._parser_model cimport alloc_activations, free_activations
from ._parser_model cimport predict_states, arg_max_if_valid
from ._parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss
from ._parser_model cimport get_c_weights, get_c_sizes
from ._parser_model import ParserModel
-from .._ml import zero_init, PrecomputableAffine, Tok2Vec, flatten
-from .._ml import link_vectors_to_models, create_default_optimizer
+from ..util import link_vectors_to_models, create_default_optimizer
from ..compat import copy_array
from ..tokens.doc cimport Doc
from ..gold cimport GoldParse
@@ -44,6 +43,10 @@ from . import _beam_utils
from . import nonproj
+from ..ml._layers import PrecomputableAffine
+from ..ml.component_models import Tok2Vec
+
+
cdef class Parser:
"""
Base class of the DependencyParser and EntityRecognizer.
@@ -54,7 +57,7 @@ cdef class Parser:
subword_features = util.env_opt('subword_features',
cfg.get('subword_features', True))
conv_depth = util.env_opt('conv_depth', cfg.get('conv_depth', 4))
- conv_window = util.env_opt('conv_window', cfg.get('conv_depth', 1))
+ window_size = util.env_opt('window_size', cfg.get('window_size', 1))
t2v_pieces = util.env_opt('cnn_maxout_pieces', cfg.get('cnn_maxout_pieces', 3))
bilstm_depth = util.env_opt('bilstm_depth', cfg.get('bilstm_depth', 0))
self_attn_depth = util.env_opt('self_attn_depth', cfg.get('self_attn_depth', 0))
@@ -71,23 +74,23 @@ cdef class Parser:
parser_maxout_pieces = 1
embed_size = util.env_opt('embed_size', cfg.get('embed_size', 2000))
pretrained_vectors = cfg.get('pretrained_vectors', None)
- tok2vec = Tok2Vec(token_vector_width, embed_size,
+ tok2vec = Tok2Vec(width=token_vector_width,
+ embed_size=embed_size,
conv_depth=conv_depth,
- conv_window=conv_window,
+ window_size=window_size,
cnn_maxout_pieces=t2v_pieces,
subword_features=subword_features,
pretrained_vectors=pretrained_vectors,
bilstm_depth=bilstm_depth)
- tok2vec = chain(tok2vec, flatten)
- tok2vec.nO = token_vector_width
+ tok2vec = chain(tok2vec, list2array())
+ tok2vec.set_dim("nO", token_vector_width)
lower = PrecomputableAffine(hidden_width,
nF=nr_feature_tokens, nI=token_vector_width,
nP=parser_maxout_pieces)
- lower.nP = parser_maxout_pieces
+ lower.set_dim("nP", parser_maxout_pieces)
if depth == 1:
- with Model.use_device('cpu'):
- upper = Affine(nr_class, hidden_width, drop_factor=0.0)
- upper.W *= 0
+ with use_ops('numpy'):
+ upper = Linear(nr_class, hidden_width, init_W=zero_init)
else:
upper = None
@@ -102,11 +105,13 @@ cdef class Parser:
'bilstm_depth': bilstm_depth,
'self_attn_depth': self_attn_depth,
'conv_depth': conv_depth,
- 'conv_window': conv_window,
+ 'window_size': window_size,
'embed_size': embed_size,
'cnn_maxout_pieces': t2v_pieces
}
- return ParserModel(tok2vec, lower, upper), cfg
+ model = ParserModel(tok2vec, lower, upper)
+ model.initialize()
+ return model, cfg
name = 'base_parser'
@@ -283,12 +288,13 @@ cdef class Parser:
def greedy_parse(self, docs, drop=0.):
cdef vector[StateC*] states
cdef StateClass state
+ set_dropout_rate(self.model, drop)
batch = self.moves.init_batch(docs)
# This is pretty dirty, but the NER can resize itself in init_batch,
# if labels are missing. We therefore have to check whether we need to
# expand our model output.
self._resize()
- model = self.model(docs)
+ model = self.model.predict(docs)
weights = get_c_weights(model)
for state in batch:
if not state.is_final():
@@ -303,18 +309,19 @@ cdef class Parser:
cdef Beam beam
cdef Doc doc
cdef np.ndarray token_ids
+ set_dropout_rate(self.model, drop)
beams = self.moves.init_beams(docs, beam_width, beam_density=beam_density)
# This is pretty dirty, but the NER can resize itself in init_batch,
# if labels are missing. We therefore have to check whether we need to
# expand our model output.
self._resize()
- model = self.model(docs)
+ model = self.model.predict(docs)
token_ids = numpy.zeros((len(docs) * beam_width, self.nr_feature),
dtype='i', order='C')
cdef int* c_ids
cdef int nr_feature = self.cfg["nr_feature_tokens"]
cdef int n_states
- model = self.model(docs)
+ model = self.model.predict(docs)
todo = [beam for beam in beams if not beam.is_done]
while todo:
token_ids.fill(-1)
@@ -331,8 +338,8 @@ cdef class Parser:
n_states += 1
if n_states == 0:
break
- vectors = model.state2vec(token_ids[:n_states])
- scores = model.vec2scores(vectors)
+ vectors = model.state2vec.predict(token_ids[:n_states])
+ scores = model.vec2scores.predict(vectors)
todo = self.transition_beams(todo, scores)
return beams
@@ -424,7 +431,7 @@ cdef class Parser:
beam.check_done(_beam_utils.check_final_state, NULL)
return [b for b in beams if not b.is_done]
- def update(self, examples, drop=0., sgd=None, losses=None):
+ def update(self, examples, drop=0., set_annotations=False, sgd=None, losses=None):
self.require_model()
examples = Example.to_example_objects(examples)
@@ -438,8 +445,10 @@ cdef class Parser:
beam_update_prob = self.cfg.get('beam_update_prob', 0.5)
if self.cfg.get('beam_width', 1) >= 2 and numpy.random.random() < beam_update_prob:
return self.update_beam(examples, self.cfg.get('beam_width', 1),
- drop=drop, sgd=sgd, losses=losses,
+ drop=drop, sgd=sgd, losses=losses, set_annotations=set_annotations,
beam_density=self.cfg.get('beam_density', 0.001))
+
+ set_dropout_rate(self.model, drop)
# Chop sequences into lengths of this many transitions, to make the
# batch uniform length.
cut_gold = numpy.random.choice(range(20, 100))
@@ -448,19 +457,24 @@ cdef class Parser:
if not s.is_final() and g is not None]
# Prepare the stepwise model, and get the callback for finishing the batch
- model, finish_update = self.model.begin_update([ex.doc for ex in examples], drop=drop)
+ model, backprop_tok2vec = self.model.begin_update([ex.doc for ex in examples])
+ all_states = list(states)
for _ in range(max_steps):
if not states_golds:
break
states, golds = zip(*states_golds)
- scores, backprop = model.begin_update(states, drop=drop)
+ scores, backprop = model.begin_update(states)
d_scores = self.get_batch_loss(states, golds, scores, losses)
- backprop(d_scores, sgd=sgd)
+ backprop(d_scores)
# Follow the predicted action
self.transition_states(states, scores)
states_golds = [eg for eg in states_golds if not eg[0].is_final()]
- # Do the backprop
- finish_update(golds, sgd=sgd)
+ backprop_tok2vec(golds)
+ if sgd is not None:
+ self.model.finish_update(sgd)
+ if set_annotations:
+ docs = [ex.doc for ex in examples]
+ self.set_annotations(docs, all_states)
return losses
def rehearse(self, examples, sgd=None, losses=None, **cfg):
@@ -482,13 +496,15 @@ cdef class Parser:
# expand our model output.
self._resize()
# Prepare the stepwise model, and get the callback for finishing the batch
- tutor, _ = self._rehearsal_model.begin_update(docs, drop=0.0)
- model, finish_update = self.model.begin_update(docs, drop=0.0)
+ set_dropout_rate(self._rehearsal_model, 0.0)
+ set_dropout_rate(self.model, 0.0)
+ tutor, _ = self._rehearsal_model.begin_update(docs)
+ model, finish_update = self.model.begin_update(docs)
n_scores = 0.
loss = 0.
while states:
- targets, _ = tutor.begin_update(states, drop=0.)
- guesses, backprop = model.begin_update(states, drop=0.)
+ targets, _ = tutor.begin_update(states)
+ guesses, backprop = model.begin_update(states)
d_scores = (guesses - targets) / targets.shape[0]
# If all weights for an output are 0 in the original model, don't
# supervise that output. This allows us to add classes.
@@ -499,12 +515,14 @@ cdef class Parser:
states = [state for state in states if not state.is_final()]
n_scores += d_scores.size
# Do the backprop
- finish_update(docs, sgd=sgd)
+ finish_update(docs)
+ if sgd is not None:
+ self.model.finish_update(sgd)
losses[self.name] += loss / n_scores
return losses
def update_beam(self, examples, width, drop=0., sgd=None, losses=None,
- beam_density=0.0):
+ set_annotations=False, beam_density=0.0):
examples = Example.to_example_objects(examples)
docs = [ex.doc for ex in examples]
golds = [ex.gold for ex in examples]
@@ -514,15 +532,16 @@ cdef class Parser:
for gold in golds:
self.moves.preprocess_gold(gold)
new_golds.append(gold)
- model, finish_update = self.model.begin_update(docs, drop=drop)
+ set_dropout_rate(self.model, drop)
+ model, backprop_tok2vec = self.model.begin_update(docs)
states_d_scores, backprops, beams = _beam_utils.update_beam(
- self.moves, self.cfg["nr_feature_tokens"], 10000, states, golds, model.state2vec,
- model.vec2scores, width, drop=drop, losses=losses,
+ self.moves, self.cfg["nr_feature_tokens"], 10000, states, golds,
+ model.state2vec, model.vec2scores, width, losses=losses,
beam_density=beam_density)
for i, d_scores in enumerate(states_d_scores):
losses[self.name] += (d_scores**2).mean()
ids, bp_vectors, bp_scores = backprops[i]
- d_vector = bp_scores(d_scores, sgd=sgd)
+ d_vector = bp_scores(d_scores)
if isinstance(model.ops, CupyOps) \
and not isinstance(ids, model.state2vec.ops.xp.ndarray):
model.backprops.append((
@@ -531,11 +550,34 @@ cdef class Parser:
bp_vectors))
else:
model.backprops.append((ids, d_vector, bp_vectors))
- model.make_updates(sgd)
+ backprop_tok2vec(golds)
+ if sgd is not None:
+ self.model.finish_update(sgd)
+ if set_annotations:
+ self.set_annotations(docs, beams)
cdef Beam beam
for beam in beams:
_beam_utils.cleanup_beam(beam)
+ def get_gradients(self):
+ """Get non-zero gradients of the model's parameters, as a dictionary
+ keyed by the parameter ID. The values are (weights, gradients) tuples.
+ """
+ gradients = {}
+ if self.model in (None, True, False):
+ return gradients
+ queue = [self.model]
+ seen = set()
+ for node in queue:
+ if node.id in seen:
+ continue
+ seen.add(node.id)
+ if hasattr(node, "_mem") and node._mem.gradient.any():
+ gradients[node.id] = [node._mem.weights, node._mem.gradient]
+ if hasattr(node, "_layers"):
+ queue.extend(node._layers)
+ return gradients
+
def _init_gold_batch(self, whole_examples, min_length=5, max_length=500):
"""Make a square batch, of length equal to the shortest doc. A long
doc will get multiple states. Let's say we have a doc of length 2*N,
@@ -605,8 +647,7 @@ cdef class Parser:
return d_scores
def create_optimizer(self):
- return create_default_optimizer(self.model.ops,
- **self.cfg.get('optimizer', {}))
+ return create_default_optimizer()
def begin_training(self, get_examples, pipeline=None, sgd=None, **cfg):
if 'model' in cfg:
@@ -636,14 +677,16 @@ cdef class Parser:
for doc, gold in parses:
doc_sample.append(doc)
gold_sample.append(gold)
- self.model.begin_training(doc_sample, gold_sample)
+ self.model.initialize(doc_sample, gold_sample)
if pipeline is not None:
self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **cfg)
link_vectors_to_models(self.vocab)
else:
if sgd is None:
sgd = self.create_optimizer()
- self.model.begin_training([])
+ if self.model.upper.has_dim("nO") is None:
+ self.model.upper.set_dim("nO", self.moves.n_moves)
+ self.model.initialize()
self.cfg.update(cfg)
return sgd
@@ -709,7 +752,7 @@ cdef class Parser:
if 'model' not in exclude:
# TODO: Remove this once we don't have to handle previous models
if self.cfg.get('pretrained_dims') and 'pretrained_vectors' not in self.cfg:
- self.cfg['pretrained_vectors'] = self.vocab.vectors.name
+ self.cfg['pretrained_vectors'] = self.vocab.vectors
if self.model is True:
self.model, cfg = self.Model(**self.cfg)
else:
diff --git a/spacy/syntax/transition_system.pxd b/spacy/syntax/transition_system.pxd
index a5fe55918..bd706a997 100644
--- a/spacy/syntax/transition_system.pxd
+++ b/spacy/syntax/transition_system.pxd
@@ -1,7 +1,6 @@
from cymem.cymem cimport Pool
-from thinc.typedefs cimport weight_t
-from ..typedefs cimport attr_t
+from ..typedefs cimport attr_t, weight_t
from ..structs cimport TokenC
from ..gold cimport GoldParse
from ..gold cimport GoldParseC
diff --git a/spacy/syntax/transition_system.pyx b/spacy/syntax/transition_system.pyx
index 62e369091..6ab83436e 100644
--- a/spacy/syntax/transition_system.pyx
+++ b/spacy/syntax/transition_system.pyx
@@ -1,7 +1,7 @@
# cython: infer_types=True
from cpython.ref cimport Py_INCREF
from cymem.cymem cimport Pool
-from thinc.typedefs cimport weight_t
+from ..typedefs cimport weight_t
from thinc.extra.search cimport Beam
from collections import Counter
import srsly
diff --git a/spacy/tests/parser/test_add_label.py b/spacy/tests/parser/test_add_label.py
index a24fd143d..25892ac71 100644
--- a/spacy/tests/parser/test_add_label.py
+++ b/spacy/tests/parser/test_add_label.py
@@ -1,6 +1,6 @@
import pytest
-from thinc.neural.optimizers import Adam
-from thinc.neural.ops import NumpyOps
+from thinc.optimizers import Adam
+from thinc.backends import NumpyOps
from spacy.attrs import NORM
from spacy.gold import GoldParse
from spacy.vocab import Vocab
@@ -28,7 +28,7 @@ def _train_parser(parser):
fix_random_seed(1)
parser.add_label("left")
parser.begin_training([], **parser.cfg)
- sgd = Adam(NumpyOps(), 0.001)
+ sgd = Adam(0.001, ops=NumpyOps())
for i in range(5):
losses = {}
@@ -41,8 +41,8 @@ def _train_parser(parser):
def test_add_label(parser):
parser = _train_parser(parser)
parser.add_label("right")
- sgd = Adam(NumpyOps(), 0.001)
- for i in range(10):
+ sgd = Adam(0.001, ops=NumpyOps())
+ for i in range(100):
losses = {}
doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
gold = GoldParse(
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index 8d5043487..8dda1f406 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -7,6 +7,11 @@ from spacy.syntax.ner import BiluoPushDown
from spacy.gold import GoldParse
from spacy.tokens import Doc
+TRAIN_DATA = [
+ ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
+ ("I like London and Berlin.", {"entities": [(7, 13, "LOC"), (18, 24, "LOC")]}),
+ ]
+
@pytest.fixture
def vocab():
@@ -263,7 +268,7 @@ def test_change_number_features():
nlp.add_pipe(ner)
ner.add_label("PERSON")
nlp.begin_training()
- assert ner.model.lower.nF == ner.nr_feature
+ assert ner.model.lower.get_dim("nF") == ner.nr_feature
# Test we can change it
nlp = English()
ner = nlp.create_pipe("ner")
@@ -272,11 +277,36 @@ def test_change_number_features():
nlp.begin_training(
component_cfg={"ner": {"nr_feature_tokens": 3, "token_vector_width": 128}}
)
- assert ner.model.lower.nF == 3
+ assert ner.model.lower.get_dim("nF") == 3
# Test the model runs
nlp("hello world")
+def test_overfitting():
+ # Simple test to try and quickly overfit the NER component - ensuring the ML models work correctly
+ nlp = English()
+ ner = nlp.create_pipe("ner")
+ for _, annotations in TRAIN_DATA:
+ for ent in annotations.get("entities"):
+ ner.add_label(ent[2])
+ nlp.add_pipe(ner)
+ optimizer = nlp.begin_training()
+
+ for i in range(50):
+ losses = {}
+ nlp.update(TRAIN_DATA, sgd=optimizer, losses=losses)
+ assert losses["ner"] < 0.00001
+
+ # test the trained model
+ test_text = "I like London."
+ doc = nlp(test_text)
+ ents = doc.ents
+
+ assert len(ents) == 1
+ assert ents[0].text == "London"
+ assert ents[0].label_ == "LOC"
+
+
class BlockerComponent1(object):
name = "my_blocker"
diff --git a/spacy/tests/parser/test_neural_parser.py b/spacy/tests/parser/test_neural_parser.py
index 0906fbb94..2470982d3 100644
--- a/spacy/tests/parser/test_neural_parser.py
+++ b/spacy/tests/parser/test_neural_parser.py
@@ -1,5 +1,5 @@
import pytest
-from spacy._ml import Tok2Vec
+from spacy.ml.component_models import Tok2Vec
from spacy.vocab import Vocab
from spacy.syntax.arc_eager import ArcEager
from spacy.syntax.nn_parser import Parser
@@ -20,7 +20,9 @@ def arc_eager(vocab):
@pytest.fixture
def tok2vec():
- return Tok2Vec(8, 100)
+ tok2vec = Tok2Vec(8, 100)
+ tok2vec.initialize()
+ return tok2vec
@pytest.fixture
@@ -30,7 +32,7 @@ def parser(vocab, arc_eager):
@pytest.fixture
def model(arc_eager, tok2vec):
- return Parser.Model(arc_eager.n_moves, token_vector_width=tok2vec.nO)[0]
+ return Parser.Model(arc_eager.n_moves, token_vector_width=tok2vec.get_dim("nO"))[0]
@pytest.fixture
@@ -53,7 +55,7 @@ def test_build_model(parser):
def test_predict_doc(parser, tok2vec, model, doc):
- doc.tensor = tok2vec([doc])[0]
+ doc.tensor = tok2vec.predict([doc])[0]
parser.model = model
parser(doc)
@@ -61,8 +63,9 @@ def test_predict_doc(parser, tok2vec, model, doc):
def test_update_doc(parser, model, doc, gold):
parser.model = model
- def optimize(weights, gradient, key=None):
+ def optimize(key, weights, gradient):
weights -= 0.001 * gradient
+ return weights, gradient
parser.update((doc, gold), sgd=optimize)
diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py
index 75091ec07..1d3f522c9 100644
--- a/spacy/tests/parser/test_parse.py
+++ b/spacy/tests/parser/test_parse.py
@@ -1,7 +1,25 @@
import pytest
+from spacy.lang.en import English
from ..util import get_doc, apply_transition_sequence
+TRAIN_DATA = [
+ (
+ "They trade mortgage-backed securities.",
+ {
+ "heads": [1, 1, 4, 4, 5, 1, 1],
+ "deps": ["nsubj", "ROOT", "compound", "punct", "nmod", "dobj", "punct"],
+ },
+ ),
+ (
+ "I like London and Berlin.",
+ {
+ "heads": [1, 1, 1, 2, 2, 1],
+ "deps": ["nsubj", "ROOT", "dobj", "cc", "conj", "punct"],
+ },
+ ),
+]
+
def test_parser_root(en_tokenizer):
text = "i don't have other assistance"
@@ -162,3 +180,27 @@ def test_parser_set_sent_starts(en_vocab):
for sent in doc.sents:
for token in sent:
assert token.head in sent
+
+
+def test_overfitting():
+ # Simple test to try and quickly overfit the dependency parser - ensuring the ML models work correctly
+ nlp = English()
+ parser = nlp.create_pipe("parser")
+ for _, annotations in TRAIN_DATA:
+ for dep in annotations.get("deps", []):
+ parser.add_label(dep)
+ nlp.add_pipe(parser)
+ optimizer = nlp.begin_training()
+
+ for i in range(50):
+ losses = {}
+ nlp.update(TRAIN_DATA, sgd=optimizer, losses=losses)
+ assert losses["parser"] < 0.00001
+
+ # test the trained model
+ test_text = "I like securities."
+ doc = nlp(test_text)
+
+    assert doc[0].dep_ == "nsubj"
+    assert doc[2].dep_ == "dobj"
+    assert doc[3].dep_ == "punct"
diff --git a/spacy/tests/parser/test_preset_sbd.py b/spacy/tests/parser/test_preset_sbd.py
index ed6aef096..5e56442b5 100644
--- a/spacy/tests/parser/test_preset_sbd.py
+++ b/spacy/tests/parser/test_preset_sbd.py
@@ -1,6 +1,6 @@
import pytest
-from thinc.neural.optimizers import Adam
-from thinc.neural.ops import NumpyOps
+from thinc.optimizers import Adam
+from thinc.backends import NumpyOps
from spacy.attrs import NORM
from spacy.gold import GoldParse
from spacy.vocab import Vocab
@@ -21,7 +21,7 @@ def parser(vocab):
# parser.add_label('right')
parser.add_label("left")
parser.begin_training([], **parser.cfg)
- sgd = Adam(NumpyOps(), 0.001)
+ sgd = Adam(0.001)
for i in range(10):
losses = {}
diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py
index ca9dab009..6a6ec8665 100644
--- a/spacy/tests/pipeline/test_tagger.py
+++ b/spacy/tests/pipeline/test_tagger.py
@@ -1,4 +1,5 @@
import pytest
+import srsly
from spacy.language import Language
@@ -8,3 +9,35 @@ def test_label_types():
nlp.get_pipe("tagger").add_label("A")
with pytest.raises(ValueError):
nlp.get_pipe("tagger").add_label(9)
+
+
+TAG_MAP = {"N": {"pos": "NOUN"}, "V": {"pos": "VERB"}, "J": {"pos": "ADJ"}}
+
+TRAIN_DATA = [
+ ("I like green eggs", {"tags": ["N", "V", "J", "N"]}),
+ ("Eat blue ham", {"tags": ["V", "J", "N"]}),
+]
+
+
+def test_overfitting():
+ # Simple test to try and quickly overfit the tagger - ensuring the ML models work correctly
+ nlp = Language()
+ tagger = nlp.create_pipe("tagger")
+ for tag, values in TAG_MAP.items():
+ tagger.add_label(tag, values)
+ nlp.add_pipe(tagger)
+ optimizer = nlp.begin_training()
+
+ for i in range(50):
+ losses = {}
+ nlp.update(TRAIN_DATA, sgd=optimizer, losses=losses)
+ assert losses["tagger"] < 0.00001
+
+ # test the trained model
+ test_text = "I like blue eggs"
+ doc = nlp(test_text)
+
+    assert doc[0].tag_ == "N"
+    assert doc[1].tag_ == "V"
+    assert doc[2].tag_ == "J"
+    assert doc[3].tag_ == "N"
diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
index 9e37e92e1..558d09e40 100644
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@@ -6,6 +6,11 @@ from spacy.pipeline import TextCategorizer
from spacy.tokens import Doc
from spacy.gold import GoldParse
+TRAIN_DATA = [
+ ("I'm so happy.", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}),
+ ("I'm so angry", {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}}),
+]
+
@pytest.mark.skip(reason="Test is flakey when run with others")
def test_simple_train():
@@ -67,3 +72,26 @@ def test_label_types():
nlp.get_pipe("textcat").add_label("answer")
with pytest.raises(ValueError):
nlp.get_pipe("textcat").add_label(9)
+
+
+def test_overfitting():
+ # Simple test to try and quickly overfit the textcat component - ensuring the ML models work correctly
+ nlp = Language()
+ textcat = nlp.create_pipe("textcat")
+ for _, annotations in TRAIN_DATA:
+ for label, value in annotations.get("cats").items():
+ textcat.add_label(label)
+ nlp.add_pipe(textcat)
+ optimizer = nlp.begin_training()
+
+ for i in range(50):
+ losses = {}
+ nlp.update(TRAIN_DATA, sgd=optimizer, losses=losses)
+ assert losses["textcat"] < 0.00001
+
+ # test the trained model
+ test_text = "I am happy."
+ doc = nlp(test_text)
+ cats = doc.cats
+ assert cats["POSITIVE"] > 0.9
+ assert cats["POSITIVE"] + cats["NEGATIVE"] == pytest.approx(1.0, 0.001)
diff --git a/spacy/tests/regression/test_issue2501-3000.py b/spacy/tests/regression/test_issue2501-3000.py
index 49e7de179..c4f5e8599 100644
--- a/spacy/tests/regression/test_issue2501-3000.py
+++ b/spacy/tests/regression/test_issue2501-3000.py
@@ -8,7 +8,7 @@ from spacy.matcher import Matcher
from spacy.tokens import Doc, Span
from spacy.vocab import Vocab
from spacy.compat import pickle
-from spacy._ml import link_vectors_to_models
+from spacy.util import link_vectors_to_models
import numpy
import random
diff --git a/spacy/tests/regression/test_issue3611.py b/spacy/tests/regression/test_issue3611.py
index 367961ab1..fca884356 100644
--- a/spacy/tests/regression/test_issue3611.py
+++ b/spacy/tests/regression/test_issue3611.py
@@ -32,7 +32,7 @@ def test_issue3611():
# training the network
with nlp.disable_pipes([p for p in nlp.pipe_names if p != "textcat"]):
- optimizer = nlp.begin_training()
+ optimizer = nlp.begin_training(X=x_train, Y=y_train)
for i in range(3):
losses = {}
batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
diff --git a/spacy/tests/test_architectures.py b/spacy/tests/test_architectures.py
index ad56e4c54..786e2cedf 100644
--- a/spacy/tests/test_architectures.py
+++ b/spacy/tests/test_architectures.py
@@ -1,12 +1,12 @@
import pytest
from spacy import registry
-from thinc.v2v import Affine
+from thinc.layers import Linear
from catalogue import RegistryError
@registry.architectures.register("my_test_function")
def create_model(nr_in, nr_out):
- return Affine(nr_in, nr_out)
+ return Linear(nr_in, nr_out)
def test_get_architecture():
diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py
index 09e0fb561..a6bcdb50c 100644
--- a/spacy/tests/test_misc.py
+++ b/spacy/tests/test_misc.py
@@ -5,7 +5,8 @@ from pathlib import Path
from spacy import util
from spacy import prefer_gpu, require_gpu
from spacy.compat import symlink_to, symlink_remove, is_windows
-from spacy._ml import PrecomputableAffine
+from spacy.ml._layers import PrecomputableAffine
+from spacy.ml._layers import _backprop_precomputable_affine_padding
from subprocess import CalledProcessError
@@ -67,28 +68,30 @@ def test_util_get_package_path(package):
def test_PrecomputableAffine(nO=4, nI=5, nF=3, nP=2):
model = PrecomputableAffine(nO=nO, nI=nI, nF=nF, nP=nP)
- assert model.W.shape == (nF, nO, nP, nI)
- tensor = model.ops.allocate((10, nI))
+ assert model.get_param("W").shape == (nF, nO, nP, nI)
+ tensor = model.ops.alloc((10, nI))
Y, get_dX = model.begin_update(tensor)
assert Y.shape == (tensor.shape[0] + 1, nF, nO, nP)
- assert model.d_pad.shape == (1, nF, nO, nP)
- dY = model.ops.allocate((15, nO, nP))
- ids = model.ops.allocate((15, nF))
+ dY = model.ops.alloc((15, nO, nP))
+ ids = model.ops.alloc((15, nF))
ids[1, 2] = -1
dY[1] = 1
- assert model.d_pad[0, 2, 0, 0] == 0.0
- model._backprop_padding(dY, ids)
- assert model.d_pad[0, 2, 0, 0] == 1.0
- model.d_pad.fill(0.0)
+ assert not model.has_grad("pad")
+ d_pad = _backprop_precomputable_affine_padding(model, dY, ids)
+ assert d_pad[0, 2, 0, 0] == 1.0
ids.fill(0.0)
dY.fill(0.0)
- ids[1, 2] = -1
+ dY[0] = 0
+ ids[1, 2] = 0
ids[1, 1] = -1
ids[1, 0] = -1
dY[1] = 1
- assert model.d_pad[0, 2, 0, 0] == 0.0
- model._backprop_padding(dY, ids)
- assert model.d_pad[0, 2, 0, 0] == 3.0
+ ids[2, 0] = -1
+ dY[2] = 5
+ d_pad = _backprop_precomputable_affine_padding(model, dY, ids)
+ assert d_pad[0, 0, 0, 0] == 6
+ assert d_pad[0, 1, 0, 0] == 1
+ assert d_pad[0, 2, 0, 0] == 0
def test_prefer_gpu():
diff --git a/spacy/tests/test_tok2vec.py b/spacy/tests/test_tok2vec.py
index 473d5017d..2d10d79d4 100644
--- a/spacy/tests/test_tok2vec.py
+++ b/spacy/tests/test_tok2vec.py
@@ -1,6 +1,6 @@
import pytest
-from spacy._ml import Tok2Vec
+from spacy.ml.component_models import Tok2Vec
from spacy.vocab import Vocab
from spacy.tokens import Doc
@@ -10,7 +10,7 @@ def get_batch(batch_size):
docs = []
start = 0
for size in range(1, batch_size + 1):
- # Make the words numbers, so that they're distnct
+ # Make the words numbers, so that they're distinct
# across the batch, and easy to track.
numbers = [str(i) for i in range(start, start + size)]
docs.append(Doc(vocab, words=numbers))
@@ -37,6 +37,7 @@ def test_empty_doc():
def test_tok2vec_batch_sizes(batch_size, width, embed_size):
batch = get_batch(batch_size)
tok2vec = Tok2Vec(width, embed_size)
+ tok2vec.initialize()
vectors, backprop = tok2vec.begin_update(batch)
assert len(vectors) == len(batch)
for doc_vec, doc in zip(vectors, batch):
@@ -56,6 +57,7 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size):
def test_tok2vec_configs(tok2vec_config):
docs = get_batch(3)
tok2vec = Tok2Vec(**tok2vec_config)
+ tok2vec.initialize()
vectors, backprop = tok2vec.begin_update(docs)
assert len(vectors) == len(docs)
assert vectors[0].shape == (len(docs[0]), tok2vec_config["width"])
diff --git a/spacy/tests/vocab_vectors/test_vectors.py b/spacy/tests/vocab_vectors/test_vectors.py
index 8684ad018..011cd16b1 100644
--- a/spacy/tests/vocab_vectors/test_vectors.py
+++ b/spacy/tests/vocab_vectors/test_vectors.py
@@ -1,14 +1,13 @@
import pytest
import numpy
from numpy.testing import assert_allclose
-from spacy._ml import cosine
from spacy.vocab import Vocab
from spacy.vectors import Vectors
from spacy.tokenizer import Tokenizer
from spacy.strings import hash_string
from spacy.tokens import Doc
-from ..util import add_vecs_to_vocab
+from ..util import add_vecs_to_vocab, get_cosine
@pytest.fixture
@@ -311,4 +310,4 @@ def test_vocab_prune_vectors():
assert list(remap.keys()) == ["kitten"]
neighbour, similarity = list(remap.values())[0]
assert neighbour == "cat", remap
- assert_allclose(similarity, cosine(data[0], data[2]), atol=1e-4, rtol=1e-3)
+ assert_allclose(similarity, get_cosine(data[0], data[2]), atol=1e-4, rtol=1e-3)
diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx
index 12690ba50..cd7e5a426 100644
--- a/spacy/tokens/_retokenize.pyx
+++ b/spacy/tokens/_retokenize.pyx
@@ -4,7 +4,7 @@
from libc.string cimport memcpy, memset
from libc.stdlib cimport malloc, free
from cymem.cymem cimport Pool
-from thinc.neural.util import get_array_module
+from thinc.util import get_array_module
import numpy
diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py
index d7348659d..4a18acd77 100644
--- a/spacy/tokens/_serialize.py
+++ b/spacy/tokens/_serialize.py
@@ -1,7 +1,7 @@
import numpy
import zlib
import srsly
-from thinc.neural.ops import NumpyOps
+from thinc.backends import NumpyOps
from ..compat import copy_reg
from ..tokens import Doc
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 7e6473d56..15f77d621 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -11,7 +11,7 @@ import numpy
import numpy.linalg
import struct
import srsly
-from thinc.neural.util import get_array_module, copy_array
+from thinc.util import get_array_module, copy_array
from .span cimport Span
from .token cimport Token
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 9e9322d65..7ab1c1d18 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -3,7 +3,7 @@ from libc.math cimport sqrt
import numpy
import numpy.linalg
-from thinc.neural.util import get_array_module
+from thinc.util import get_array_module
from collections import defaultdict
from .doc cimport token_by_start, token_by_end, get_token_attr, _get_lca_matrix
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index b159fffc1..c241cd5ad 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -7,7 +7,7 @@ cimport numpy as np
np.import_array()
import numpy
-from thinc.neural.util import get_array_module
+from thinc.util import get_array_module
from ..typedefs cimport hash_t
from ..lexeme cimport Lexeme
diff --git a/spacy/typedefs.pxd b/spacy/typedefs.pxd
index bd5b38958..b43814268 100644
--- a/spacy/typedefs.pxd
+++ b/spacy/typedefs.pxd
@@ -2,7 +2,9 @@ from libc.stdint cimport uint16_t, uint32_t, uint64_t, uintptr_t, int32_t
from libc.stdint cimport uint8_t
+ctypedef float weight_t
ctypedef uint64_t hash_t
+ctypedef uint64_t class_t
ctypedef char* utf8_t
ctypedef uint64_t attr_t
ctypedef uint64_t flags_t
diff --git a/spacy/util.py b/spacy/util.py
index 55e197eb2..53fa81402 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -4,8 +4,14 @@ import importlib.util
import re
from pathlib import Path
import random
-from thinc.neural._classes.model import Model
-from thinc.neural.ops import NumpyOps
+from typing import List
+
+import thinc
+import thinc.config
+from thinc.backends import NumpyOps, get_current_ops
+from thinc.optimizers import Adam
+from thinc.util import require_gpu
+
import functools
import itertools
import numpy.random
@@ -13,6 +19,7 @@ import srsly
import catalogue
import sys
+
try:
import cupy.random
except ImportError:
@@ -20,14 +27,13 @@ except ImportError:
from .symbols import ORTH
from .compat import cupy, CudaStream
-from .errors import Errors, Warnings, deprecation_warning
-
+from .errors import Errors, Warnings, deprecation_warning, user_warning
_data_path = Path(__file__).parent / "data"
_PRINT_ENV = False
-class registry(object):
+class registry(thinc.registry):
languages = catalogue.create("spacy", "languages", entry_points=True)
architectures = catalogue.create("spacy", "architectures", entry_points=True)
lookups = catalogue.create("spacy", "lookups", entry_points=True)
@@ -219,6 +225,23 @@ def load_model_from_init_py(init_file, **overrides):
return load_model_from_path(data_path, meta, **overrides)
+def load_from_config(path, create_objects=False):
+ """Load a Thinc-formatted config file, optionally filling in objects where
+ the config references registry entries. See "Thinc config files" for details.
+
+ path (unicode or Path): Path to the config file
+ create_objects (bool): Whether to automatically create objects when the config
+ references registry entries. Defaults to False.
+
+ RETURNS (dict): The objects from the config file.
+ """
+ config = thinc.config.Config().from_disk(path)
+ if create_objects:
+ return registry.make_from_config(config, validate=True)
+ else:
+ return config
+
+
def get_model_meta(path):
"""Get model meta.json from a directory path and validate its contents.
@@ -293,9 +316,10 @@ def get_component_name(component):
def get_cuda_stream(require=False, non_blocking=True):
+ ops = get_current_ops()
if CudaStream is None:
return None
- elif isinstance(Model.ops, NumpyOps):
+ elif isinstance(ops, NumpyOps):
return None
else:
return CudaStream(non_blocking=non_blocking)
@@ -310,6 +334,14 @@ def get_async(stream, numpy_array):
return array
+def eg2doc(example):
+ """Get a Doc object from an Example (or if it's a Doc, use it directly)"""
+ # Put the import here to avoid circular import problems
+ from .tokens.doc import Doc
+
+ return example if isinstance(example, Doc) else example.doc
+
+
def env_opt(name, default=None):
if type(default) is float:
type_convert = float
@@ -532,6 +564,8 @@ def minibatch_by_words(examples, size, tuples=True, count_words=len):
"""Create minibatches of a given number of words."""
if isinstance(size, int):
size_ = itertools.repeat(size)
+    elif isinstance(size, List):
+ size_ = iter(size)
else:
size_ = size
examples = iter(examples)
@@ -680,17 +714,7 @@ def escape_html(text):
def use_gpu(gpu_id):
- try:
- import cupy.cuda.device
- except ImportError:
- return None
- from thinc.neural.ops import CupyOps
-
- device = cupy.cuda.device.Device(gpu_id)
- device.use()
- Model.ops = CupyOps()
- Model.Ops = CupyOps
- return device
+ return require_gpu(gpu_id)
def fix_random_seed(seed=0):
@@ -747,3 +771,33 @@ class DummyTokenizer(object):
def from_disk(self, _path, **kwargs):
return self
+
+
+def link_vectors_to_models(vocab):
+ vectors = vocab.vectors
+ if vectors.name is None:
+ vectors.name = VECTORS_KEY
+ if vectors.data.size != 0:
+ user_warning(Warnings.W020.format(shape=vectors.data.shape))
+ for word in vocab:
+ if word.orth in vectors.key2row:
+ word.rank = vectors.key2row[word.orth]
+ else:
+ word.rank = 0
+
+
+VECTORS_KEY = "spacy_pretrained_vectors"
+
+
+def create_default_optimizer():
+ ops = get_current_ops()
+ learn_rate = env_opt("learn_rate", 0.001)
+ beta1 = env_opt("optimizer_B1", 0.9)
+ beta2 = env_opt("optimizer_B2", 0.999)
+ eps = env_opt("optimizer_eps", 1e-8)
+ L2 = env_opt("L2_penalty", 1e-6)
+ max_grad_norm = env_opt("grad_norm_clip", 1.0)
+ optimizer = Adam(learn_rate, L2=L2, beta1=beta1, beta2=beta2, eps=eps, ops=ops)
+ optimizer.max_grad_norm = max_grad_norm
+ optimizer.device = ops.device_type
+ return optimizer
diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx
index b12c8d833..2b1067247 100644
--- a/spacy/vectors.pyx
+++ b/spacy/vectors.pyx
@@ -5,8 +5,8 @@ from libcpp.set cimport set as cppset
import functools
import numpy
import srsly
-from thinc.neural.util import get_array_module
-from thinc.neural._classes.model import Model
+from thinc.util import get_array_module
+from thinc.backends import get_current_ops
from .strings cimport StringStore
@@ -426,9 +426,9 @@ cdef class Vectors:
self.add(key, row=i)
def load_vectors(path):
- xp = Model.ops.xp
+ ops = get_current_ops()
if path.exists():
- self.data = xp.load(str(path))
+ self.data = ops.xp.load(str(path))
serializers = {
"key2row": load_key2row,
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index c7e74f36c..3da9978c4 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -2,7 +2,7 @@
from libc.string cimport memcpy
import srsly
-from thinc.neural.util import get_array_module
+from thinc.util import get_array_module
from .lexeme cimport EMPTY_LEXEME
from .lexeme cimport Lexeme
@@ -16,7 +16,7 @@ from .errors import Errors
from .lemmatizer import Lemmatizer
from .attrs import intify_attrs, NORM
from .vectors import Vectors
-from ._ml import link_vectors_to_models
+from .util import link_vectors_to_models
from .lookups import Lookups
from . import util
From a365359b36e77b6c02cd58c8bf62d91a25ea8052 Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Wed, 29 Jan 2020 17:44:25 +0100
Subject: [PATCH 050/496] Add convert CLI option to merge CoNLL-U subtokens
(#4722)
* Add convert CLI option to merge CoNLL-U subtokens
Add `-T` option to convert CLI that merges CoNLL-U subtokens into one
token in the converted data. Each CoNLL-U sentence is read into a `Doc`
and the `Retokenizer` is used to merge subtokens with features as
follows:
* `orth` is the merged token orth (should correspond to raw text and `#
text`)
* `tag` is all subtoken tags concatenated with `_`, e.g. `ADP_DET`
* `pos` is the POS of the syntactic root of the span (as determined by
the Retokenizer)
* `morph` is all morphological features merged
* `lemma` is all subtoken lemmas concatenated with ` `, e.g. `de o`
* with `-m` all morphological features are combined with the tag using
the separator `__`, e.g.
`ADP_DET__Definite=Def|Gender=Masc|Number=Sing|PronType=Art`
* `dep` is the dependency relation for the syntactic root of the span
(as determined by the Retokenizer)
Concatenated tags will be mapped to the UD POS of the syntactic root
(e.g., `ADP`) and the morphological features will be the combined
features.
In many cases, the original UD subtokens can be reconstructed from the
available features given a language-specific lookup table, e.g., Portuguese
`do / ADP_DET / Definite=Def|Gender=Masc|Number=Sing|PronType=Art` maps back to
`de / ADP` + `o / DET / Definite=Def|Gender=Masc|Number=Sing|PronType=Art`,
or given lookup rules for forms containing open-class words, like Spanish
`hablarlo / VERB_PRON /
Case=Acc|Gender=Masc|Number=Sing|Person=3|PrepCase=Npr|PronType=Prs|VerbForm=Inf`.
* Clean up imports
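
As a minimal sketch (an editor's illustration, not part of the patch): calling
the converter directly with `merge_subtokens=True` collapses a `2-3` subtoken
range into a single token. The import path, sample lines and expected values
mirror the test added below in `spacy/tests/test_cli.py`; anything beyond that
is an assumption of the sketch.

    # Sketch only: merge the "2-3 FE" subtoken range into one token.
    # Sample lines and expected values follow test_cli_converters_conllu2json_subtokens.
    from spacy.cli.converters.conllu2json import conllu2json

    lines = [
        "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tname=O",
        "2-3\tFE\t_\t_\t_\t_\t_\t_\t_\t_",
        "2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tname=B-PER",
        "3\tEilertsen\tEilertsen\tX\t_\tGender=Fem|Tense=past\t2\tname\t_\tname=I-PER",
        "4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tSpaceAfter=No|name=O",
        "5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=O",
    ]
    docs = conllu2json("\n".join(lines), n_sents=1, merge_subtokens=True, append_morphology=True)
    tokens = docs[0]["paragraphs"][0]["sentences"][0]["tokens"]
    print([t["orth"] for t in tokens])  # ["Dommer", "FE", "avstår", "."]
    print(tokens[1]["tag"])             # "PROPN_X__Gender=Fem,Masc|Tense=past"
    print(tokens[1]["lemma"])           # "Finn Eilertsen"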
---
spacy/cli/convert.py | 4 +-
spacy/cli/converters/conllu2json.py | 330 ++++++++++++++++++++--------
spacy/tests/test_cli.py | 37 ++++
3 files changed, 273 insertions(+), 98 deletions(-)
diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py
index 31931db68..2ffbeb458 100644
--- a/spacy/cli/convert.py
+++ b/spacy/cli/convert.py
@@ -34,6 +34,7 @@ def convert(
seg_sents: ("Segment sentences (for -c ner)", "flag", "s") = False,
model: ("Model for sentence segmentation (for -s)", "option", "b", str) = None,
morphology: ("Enable appending morphology to tags", "flag", "m", bool) = False,
+ merge_subtokens: ("Merge CoNLL-U subtokens", "flag", "T", bool) = False,
converter: (f"Converter: {tuple(CONVERTERS.keys())}", "option", "c", str) = "auto",
ner_map_path: ("NER tag mapping (as JSON-encoded dict of entity types)", "option", "N", Path) = None,
lang: ("Language (if tokenizer required)", "option", "l", str) = None,
@@ -85,7 +86,8 @@ def convert(
input_data,
n_sents=n_sents,
seg_sents=seg_sents,
- use_morphology=morphology,
+ append_morphology=morphology,
+ merge_subtokens=merge_subtokens,
lang=lang,
model=model,
no_print=no_print,
diff --git a/spacy/cli/converters/conllu2json.py b/spacy/cli/converters/conllu2json.py
index 12b1103d4..13f2042f9 100644
--- a/spacy/cli/converters/conllu2json.py
+++ b/spacy/cli/converters/conllu2json.py
@@ -1,36 +1,36 @@
import re
-from spacy.gold import Example
-from ...gold import iob_to_biluo
+from ...gold import Example
+from ...gold import iob_to_biluo, spans_from_biluo_tags, biluo_tags_from_offsets
+from ...language import Language
+from ...tokens import Doc, Token
+from .conll_ner2json import n_sents_info
+from wasabi import Printer
def conllu2json(
- input_data, n_sents=10, use_morphology=False, lang=None, ner_map=None, **_
+ input_data, n_sents=10, append_morphology=False, lang=None, ner_map=None,
+ merge_subtokens=False, no_print=False, **_
):
"""
Convert conllu files into JSON format for use with train cli.
- use_morphology parameter enables appending morphology to tags, which is
+ append_morphology parameter enables appending morphology to tags, which is
useful for languages such as Spanish, where UD tags are not so rich.
Extract NER tags if available and convert them so that they follow
BILUO and the Wikipedia scheme
"""
- # by @dvsrepo, via #11 explosion/spacy-dev-resources
- # by @katarkor
- # name=NER is to handle NorNE
MISC_NER_PATTERN = "\|?(?:name=)?(([A-Z_]+)-([A-Z_]+)|O)\|?"
+ msg = Printer(no_print=no_print)
+ n_sents_info(msg, n_sents)
docs = []
raw = ""
sentences = []
- conll_data = read_conllx(input_data, use_morphology=use_morphology)
- checked_for_ner = False
- has_ner_tags = False
+ conll_data = read_conllx(input_data, append_morphology=append_morphology,
+ ner_tag_pattern=MISC_NER_PATTERN, ner_map=ner_map,
+ merge_subtokens=merge_subtokens)
+ has_ner_tags = has_ner(input_data, ner_tag_pattern=MISC_NER_PATTERN)
for i, example in enumerate(conll_data):
- if not checked_for_ner:
- has_ner_tags = is_ner(
- example.token_annotation.entities[0], MISC_NER_PATTERN
- )
- checked_for_ner = True
raw += example.text
sentences.append(
generate_sentence(
@@ -43,137 +43,273 @@ def conllu2json(
# Real-sized documents could be extracted using the comments on the
# conllu document
if len(sentences) % n_sents == 0:
- doc = create_doc(raw, sentences, i)
+ doc = create_json_doc(raw, sentences, i)
docs.append(doc)
raw = ""
sentences = []
if sentences:
- doc = create_doc(raw, sentences, i)
+ doc = create_json_doc(raw, sentences, i)
docs.append(doc)
return docs
-def is_ner(tag, tag_pattern):
+def has_ner(input_data, ner_tag_pattern):
"""
Check the 10th column of the first token to determine if the file contains
NER tags
"""
- tag_match = re.search(tag_pattern, tag)
- if tag_match:
- return True
- elif tag == "O":
- return True
- else:
- return False
+ for sent in input_data.strip().split("\n\n"):
+ lines = sent.strip().split("\n")
+ if lines:
+ while lines[0].startswith("#"):
+ lines.pop(0)
+ if lines:
+ parts = lines[0].split("\t")
+ id_, word, lemma, pos, tag, morph, head, dep, _1, misc = parts
+ if re.search(ner_tag_pattern, misc):
+ return True
+ else:
+ return False
-def read_conllx(input_data, use_morphology=False, n=0):
- """ Yield example data points, one for each sentence """
+def read_conllx(input_data, append_morphology=False, merge_subtokens=False,
+ ner_tag_pattern="", ner_map=None):
+ """ Yield examples, one for each sentence """
+ vocab = Language.Defaults.create_vocab() # need vocab to make a minimal Doc
i = 0
for sent in input_data.strip().split("\n\n"):
lines = sent.strip().split("\n")
if lines:
while lines[0].startswith("#"):
lines.pop(0)
- ids, words, tags, heads, deps, ents = [], [], [], [], [], []
- spaces = []
- for line in lines:
- parts = line.split("\t")
- id_, word, lemma, pos, tag, morph, head, dep, _1, misc = parts
- if "-" in id_ or "." in id_:
- continue
- try:
- id_ = int(id_) - 1
- head = (int(head) - 1) if head != "0" else id_
- dep = "ROOT" if dep == "root" else dep
- tag = pos if tag == "_" else tag
- tag = tag + "__" + morph if use_morphology else tag
- ent = misc if misc else "O"
-
- ids.append(id_)
- words.append(word)
- tags.append(tag)
- heads.append(head)
- deps.append(dep)
- ents.append(ent)
- if "SpaceAfter=No" in misc:
- spaces.append(False)
- else:
- spaces.append(True)
- except: # noqa: E722
- print(line)
- raise
- raw = ""
- for word, space in zip(words, spaces):
- raw += word
- if space:
- raw += " "
- example = Example(doc=raw)
- example.set_token_annotation(
- ids=ids, words=words, tags=tags, heads=heads, deps=deps, entities=ents
- )
+ example = example_from_conllu_sentence(vocab, lines,
+ ner_tag_pattern, merge_subtokens=merge_subtokens,
+ append_morphology=append_morphology,
+ ner_map=ner_map)
yield example
- i += 1
- if 1 <= n <= i:
- break
-def extract_tags(iob, tag_pattern, ner_map=None):
+def get_entities(lines, tag_pattern, ner_map=None):
+ """Find entities in the MISC column according to the pattern and map to
+ final entity type with `ner_map` if mapping present. Entity tag is 'O' if
+ the pattern is not matched.
+
+    lines (unicode): The CoNLL-U lines for one sentence
+ tag_pattern (unicode): Regex pattern for entity tag
+ ner_map (dict): Map old NER tag names to new ones, '' maps to O.
+ RETURNS (list): List of BILUO entity tags
"""
- Extract tag from MISC column according to `tag_pattern` and map to final
- entity type with `ner_map` if mapping present.
+ miscs = []
+ for line in lines:
+ parts = line.split("\t")
+ id_, word, lemma, pos, tag, morph, head, dep, _1, misc = parts
+ if "-" in id_ or "." in id_:
+ continue
+ miscs.append(misc)
- For NorNE:
- Simplify tags obtained from the dataset in order to follow Wikipedia
- scheme (PER, LOC, ORG, MISC). 'PER', 'LOC' and 'ORG' keep their tags, while
- 'GPE_LOC' is simplified to 'LOC', 'GPE_ORG' to 'ORG' and all remaining tags to
- 'MISC'.
- """
- new_iob = []
- for tag in iob:
- tag_match = re.search(tag_pattern, tag)
- new_tag = "O"
+ iob = []
+ for misc in miscs:
+ tag_match = re.search(tag_pattern, misc)
+ iob_tag = "O"
if tag_match:
prefix = tag_match.group(2)
suffix = tag_match.group(3)
if prefix and suffix:
- new_tag = prefix + "-" + suffix
+ iob_tag = prefix + "-" + suffix
if ner_map:
suffix = ner_map.get(suffix, suffix)
if suffix == "":
- new_tag = "O"
+ iob_tag = "O"
else:
- new_tag = prefix + "-" + suffix
- new_iob.append(new_tag)
- return new_iob
+ iob_tag = prefix + "-" + suffix
+ iob.append(iob_tag)
+ return iob_to_biluo(iob)
def generate_sentence(token_annotation, has_ner_tags, tag_pattern, ner_map=None):
sentence = {}
tokens = []
- if has_ner_tags:
- iob = extract_tags(token_annotation.entities, tag_pattern, ner_map=ner_map)
- biluo = iob_to_biluo(iob)
- for i, id in enumerate(token_annotation.ids):
+ for i, id_ in enumerate(token_annotation.ids):
token = {}
- token["id"] = id
- token["orth"] = token_annotation.words[i]
- token["tag"] = token_annotation.tags[i]
- token["head"] = token_annotation.heads[i] - id
- token["dep"] = token_annotation.deps[i]
+ token["id"] = id_
+ token["orth"] = token_annotation.get_word(i)
+ token["tag"] = token_annotation.get_tag(i)
+ token["pos"] = token_annotation.get_pos(i)
+ token["lemma"] = token_annotation.get_lemma(i)
+ token["morph"] = token_annotation.get_morph(i)
+ token["head"] = token_annotation.get_head(i) - id_
+ token["dep"] = token_annotation.get_dep(i)
if has_ner_tags:
- token["ner"] = biluo[i]
+ token["ner"] = token_annotation.get_entity(i)
tokens.append(token)
sentence["tokens"] = tokens
return sentence
-def create_doc(raw, sentences, id):
+def create_json_doc(raw, sentences, id_):
doc = {}
paragraph = {}
- doc["id"] = id
+ doc["id"] = id_
doc["paragraphs"] = []
paragraph["raw"] = raw.strip()
paragraph["sentences"] = sentences
doc["paragraphs"].append(paragraph)
return doc
+
+
+def example_from_conllu_sentence(vocab, lines, ner_tag_pattern,
+ merge_subtokens=False, append_morphology=False, ner_map=None):
+ """Create an Example from the lines for one CoNLL-U sentence, merging
+ subtokens and appending morphology to tags if required.
+
+ lines (unicode): The non-comment lines for a CoNLL-U sentence
+ ner_tag_pattern (unicode): The regex pattern for matching NER in MISC col
+ RETURNS (Example): An example containing the annotation
+ """
+ # create a Doc with each subtoken as its own token
+ # if merging subtokens, each subtoken orth is the merged subtoken form
+ if not Token.has_extension("merged_orth"):
+ Token.set_extension("merged_orth", default="")
+ if not Token.has_extension("merged_lemma"):
+ Token.set_extension("merged_lemma", default="")
+ if not Token.has_extension("merged_morph"):
+ Token.set_extension("merged_morph", default="")
+ if not Token.has_extension("merged_spaceafter"):
+ Token.set_extension("merged_spaceafter", default="")
+ words, spaces, tags, poses, morphs, lemmas = [], [], [], [], [], []
+ heads, deps = [], []
+ subtok_word = ""
+ in_subtok = False
+ for i in range(len(lines)):
+ line = lines[i]
+ subtok_lines = []
+ parts = line.split("\t")
+ id_, word, lemma, pos, tag, morph, head, dep, _1, misc = parts
+ if "." in id_:
+ continue
+ if "-" in id_:
+ in_subtok = True
+ if "-" in id_:
+ in_subtok = True
+ subtok_word = word
+ subtok_start, subtok_end = id_.split("-")
+ subtok_spaceafter = "SpaceAfter=No" not in misc
+ continue
+ if merge_subtokens and in_subtok:
+ words.append(subtok_word)
+ else:
+ words.append(word)
+ if in_subtok:
+ if id_ == subtok_end:
+ spaces.append(subtok_spaceafter)
+ else:
+ spaces.append(False)
+ elif "SpaceAfter=No" in misc:
+ spaces.append(False)
+ else:
+ spaces.append(True)
+ if in_subtok and id_ == subtok_end:
+ subtok_word = ""
+ in_subtok = False
+ id_ = int(id_) - 1
+ head = (int(head) - 1) if head != "0" else id_
+ tag = pos if tag == "_" else tag
+ morph = morph if morph != "_" else ""
+ dep = "ROOT" if dep == "root" else dep
+ lemmas.append(lemma)
+ poses.append(pos)
+ tags.append(tag)
+ morphs.append(morph)
+ heads.append(head)
+ deps.append(dep)
+
+ doc = Doc(vocab, words=words, spaces=spaces)
+ for i in range(len(doc)):
+ doc[i].tag_ = tags[i]
+ doc[i].pos_ = poses[i]
+ doc[i].dep_ = deps[i]
+ doc[i].lemma_ = lemmas[i]
+ doc[i].head = doc[heads[i]]
+ doc[i]._.merged_orth = words[i]
+ doc[i]._.merged_morph = morphs[i]
+ doc[i]._.merged_lemma = lemmas[i]
+ doc[i]._.merged_spaceafter = spaces[i]
+ ents = get_entities(lines, ner_tag_pattern, ner_map)
+ doc.ents = spans_from_biluo_tags(doc, ents)
+ doc.is_parsed = True
+ doc.is_tagged = True
+
+ if merge_subtokens:
+ doc = merge_conllu_subtokens(lines, doc)
+
+ # create Example from custom Doc annotation
+ ids, words, tags, heads, deps = [], [], [], [], []
+ pos, lemmas, morphs, spaces = [], [], [], []
+ for i, t in enumerate(doc):
+ ids.append(i)
+ words.append(t._.merged_orth)
+ if append_morphology and t._.merged_morph:
+ tags.append(t.tag_ + "__" + t._.merged_morph)
+ else:
+ tags.append(t.tag_)
+ pos.append(t.pos_)
+ morphs.append(t._.merged_morph)
+ lemmas.append(t._.merged_lemma)
+ heads.append(t.head.i)
+ deps.append(t.dep_)
+ spaces.append(t._.merged_spaceafter)
+ ent_offsets = [(e.start_char, e.end_char, e.label_) for e in doc.ents]
+ ents = biluo_tags_from_offsets(doc, ent_offsets)
+ raw = ""
+ for word, space in zip(words, spaces):
+ raw += word
+ if space:
+ raw += " "
+ example = Example(doc=raw)
+ example.set_token_annotation(ids=ids, words=words, tags=tags, pos=pos,
+ morphs=morphs, lemmas=lemmas, heads=heads,
+ deps=deps, entities=ents)
+ return example
+
+
+def merge_conllu_subtokens(lines, doc):
+ # identify and process all subtoken spans to prepare attrs for merging
+ subtok_spans = []
+ for line in lines:
+ parts = line.split("\t")
+ id_, word, lemma, pos, tag, morph, head, dep, _1, misc = parts
+ if "-" in id_:
+ subtok_start, subtok_end = id_.split("-")
+ subtok_span = doc[int(subtok_start) - 1:int(subtok_end)]
+ subtok_spans.append(subtok_span)
+ # create merged tag, morph, and lemma values
+ tags = []
+ morphs = {}
+ lemmas = []
+ for token in subtok_span:
+ tags.append(token.tag_)
+ lemmas.append(token.lemma_)
+ if token._.merged_morph:
+ for feature in token._.merged_morph.split("|"):
+ field, values = feature.split("=", 1)
+                    if field not in morphs:
+ morphs[field] = set()
+ for value in values.split(","):
+ morphs[field].add(value)
+ # create merged features for each morph field
+ for field, values in morphs.items():
+ morphs[field] = field + "=" + ",".join(sorted(values))
+ # set the same attrs on all subtok tokens so that whatever head the
+ # retokenizer chooses, the final attrs are available on that token
+ for token in subtok_span:
+ token._.merged_orth = token.orth_
+ token._.merged_lemma = " ".join(lemmas)
+ token.tag_ = "_".join(tags)
+ token._.merged_morph = "|".join(sorted(morphs.values()))
+ token._.merged_spaceafter = True if subtok_span[-1].whitespace_ else False
+
+ with doc.retokenize() as retokenizer:
+ for span in subtok_spans:
+ retokenizer.merge(span)
+
+ return doc
diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py
index b4aebe521..049858960 100644
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@@ -54,6 +54,43 @@ def test_cli_converters_conllu2json_name_ner_map():
assert [t["ner"] for t in tokens] == ["O", "B-PERSON", "L-PERSON", "O", "O"]
+def test_cli_converters_conllu2json_subtokens():
+ # https://raw.githubusercontent.com/ohenrik/nb_news_ud_sm/master/original_data/no-ud-dev-ner.conllu
+ lines = [
+ "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tname=O",
+ "2-3\tFE\t_\t_\t_\t_\t_\t_\t_\t_",
+ "2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tname=B-PER",
+ "3\tEilertsen\tEilertsen\tX\t_\tGender=Fem|Tense=past\t2\tname\t_\tname=I-PER",
+ "4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tSpaceAfter=No|name=O",
+ "5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=O",
+ ]
+ input_data = "\n".join(lines)
+ converted = conllu2json(input_data, n_sents=1, merge_subtokens=True,
+ append_morphology=True)
+ assert len(converted) == 1
+ assert converted[0]["id"] == 0
+ assert len(converted[0]["paragraphs"]) == 1
+ assert converted[0]["paragraphs"][0]["raw"] == "Dommer FE avstår."
+ assert len(converted[0]["paragraphs"][0]["sentences"]) == 1
+ sent = converted[0]["paragraphs"][0]["sentences"][0]
+ assert len(sent["tokens"]) == 4
+ tokens = sent["tokens"]
+ assert [t["orth"] for t in tokens] == ["Dommer", "FE", "avstår", "."]
+ assert [t["tag"] for t in tokens] == [
+ "NOUN__Definite=Ind|Gender=Masc|Number=Sing",
+ "PROPN_X__Gender=Fem,Masc|Tense=past",
+ "VERB__Mood=Ind|Tense=Pres|VerbForm=Fin",
+ "PUNCT"
+ ]
+ assert [t["pos"] for t in tokens] == ['NOUN', 'PROPN', 'VERB', 'PUNCT']
+ assert [t["morph"] for t in tokens] == ['Definite=Ind|Gender=Masc|Number=Sing', 'Gender=Fem,Masc|Tense=past', 'Mood=Ind|Tense=Pres|VerbForm=Fin', '']
+ assert [t["lemma"] for t in tokens] == ['dommer', 'Finn Eilertsen', 'avstå', '$.']
+ assert [t["head"] for t in tokens] == [1, 1, 0, -1]
+ assert [t["dep"] for t in tokens] == ["appos", "nsubj", "ROOT", "punct"]
+ assert [t["ner"] for t in tokens] == ["O", "U-PER", "O", "O"]
+
+
def test_cli_converters_iob2json():
lines = [
"I|O like|O London|I-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O",
From 5ee9d8c9b80c9e80491f320e58c0d86d2ec917b7 Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Wed, 29 Jan 2020 17:45:46 +0100
Subject: [PATCH 051/496] Add MORPH attr, add support in retokenizer (#4947)
* Add MORPH attr / symbol for token attrs
* Update retokenizer for MORPH
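
A quick sketch of what the new attr enables (editor's illustration; it mirrors
the updated `test_doc_retokenize_merge` below, with a blank `English` pipeline
standing in for the test's tokenizer fixture):

    # Sketch only: pass "morph" through the retokenizer when merging a span,
    # as exercised by the updated merge test in this patch.
    from spacy.lang.en import English

    nlp = English()  # blank pipeline; only the tokenizer is used here
    doc = nlp("WKRO played songs by the beach boys all night")
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[4:7], attrs={"tag": "NAMED", "morph": "Number=Plur"})
    assert doc[4].text == "the beach boys"
    assert doc[4].morph_ == "Number=Plur"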
---
spacy/attrs.pxd | 1 +
spacy/attrs.pyx | 1 +
spacy/symbols.pxd | 1 +
spacy/symbols.pyx | 1 +
spacy/tests/doc/test_retokenize_merge.py | 4 +++-
spacy/tests/doc/test_retokenize_split.py | 3 +++
spacy/tokens/_retokenize.pyx | 7 ++++++-
spacy/tokens/token.pxd | 4 ++++
8 files changed, 20 insertions(+), 2 deletions(-)
diff --git a/spacy/attrs.pxd b/spacy/attrs.pxd
index d9aca078c..7fc0b9111 100644
--- a/spacy/attrs.pxd
+++ b/spacy/attrs.pxd
@@ -91,3 +91,4 @@ cdef enum attr_id_t:
LANG
ENT_KB_ID = symbols.ENT_KB_ID
+ MORPH
diff --git a/spacy/attrs.pyx b/spacy/attrs.pyx
index a601a7a66..97ca627fb 100644
--- a/spacy/attrs.pyx
+++ b/spacy/attrs.pyx
@@ -87,6 +87,7 @@ IDS = {
"SPACY": SPACY,
"PROB": PROB,
"LANG": LANG,
+ "MORPH": MORPH,
}
diff --git a/spacy/symbols.pxd b/spacy/symbols.pxd
index b95b4b805..5c1970628 100644
--- a/spacy/symbols.pxd
+++ b/spacy/symbols.pxd
@@ -462,3 +462,4 @@ cdef enum symbol_t:
acl
ENT_KB_ID
+ MORPH
diff --git a/spacy/symbols.pyx b/spacy/symbols.pyx
index 36b9ffa67..128946ec7 100644
--- a/spacy/symbols.pyx
+++ b/spacy/symbols.pyx
@@ -462,6 +462,7 @@ IDS = {
"acl": acl,
"LAW": LAW,
+ "MORPH": MORPH,
}
diff --git a/spacy/tests/doc/test_retokenize_merge.py b/spacy/tests/doc/test_retokenize_merge.py
index c82c04eeb..17bcd2c64 100644
--- a/spacy/tests/doc/test_retokenize_merge.py
+++ b/spacy/tests/doc/test_retokenize_merge.py
@@ -8,7 +8,7 @@ from ..util import get_doc
def test_doc_retokenize_merge(en_tokenizer):
text = "WKRO played songs by the beach boys all night"
- attrs = {"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"}
+ attrs = {"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE", "morph": "Number=Plur"}
doc = en_tokenizer(text)
assert len(doc) == 9
with doc.retokenize() as retokenizer:
@@ -18,9 +18,11 @@ def test_doc_retokenize_merge(en_tokenizer):
assert doc[4].text == "the beach boys"
assert doc[4].text_with_ws == "the beach boys "
assert doc[4].tag_ == "NAMED"
+ assert doc[4].morph_ == "Number=Plur"
assert doc[5].text == "all night"
assert doc[5].text_with_ws == "all night"
assert doc[5].tag_ == "NAMED"
+ assert doc[5].morph_ == "Number=Plur"
def test_doc_retokenize_merge_children(en_tokenizer):
diff --git a/spacy/tests/doc/test_retokenize_split.py b/spacy/tests/doc/test_retokenize_split.py
index 33b6fbe81..5f40da425 100644
--- a/spacy/tests/doc/test_retokenize_split.py
+++ b/spacy/tests/doc/test_retokenize_split.py
@@ -22,15 +22,18 @@ def test_doc_retokenize_split(en_vocab):
"tag": ["NNP"] * 2,
"lemma": ["Los", "Angeles"],
"ent_type": ["GPE"] * 2,
+ "morph": ["Number=Sing"] * 2,
},
)
assert len(doc) == 4
assert doc[0].text == "Los"
assert doc[0].head.text == "Angeles"
assert doc[0].idx == 0
+ assert doc[0].morph_ == "Number=Sing"
assert doc[1].idx == 3
assert doc[1].text == "Angeles"
assert doc[1].head.text == "start"
+ assert doc[1].morph_ == "Number=Sing"
assert doc[2].text == "start"
assert doc[2].head.text == "."
assert doc[3].text == "."
diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx
index cd7e5a426..ec7e8a9e8 100644
--- a/spacy/tokens/_retokenize.pyx
+++ b/spacy/tokens/_retokenize.pyx
@@ -13,7 +13,7 @@ from .span cimport Span
from .token cimport Token
from ..lexeme cimport Lexeme, EMPTY_LEXEME
from ..structs cimport LexemeC, TokenC
-from ..attrs cimport TAG
+from ..attrs cimport TAG, MORPH
from .underscore import is_writable_attr
from ..attrs import intify_attrs
@@ -65,6 +65,8 @@ cdef class Retokenizer:
attrs["_"] = extensions
else:
attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings)
+ if MORPH in attrs:
+ self.doc.vocab.morphology.add(self.doc.vocab.strings.as_string(attrs[MORPH]))
self.merges.append((span, attrs))
def split(self, Token token, orths, heads, attrs=SimpleFrozenDict()):
@@ -96,6 +98,9 @@ cdef class Retokenizer:
# NB: Since we support {"KEY": [value, value]} syntax here, this
# will only "intify" the keys, not the values
attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings)
+ if MORPH in attrs:
+ for morph in attrs[MORPH]:
+ self.doc.vocab.morphology.add(self.doc.vocab.strings.as_string(morph))
head_offsets = []
for head in heads:
if isinstance(head, Token):
diff --git a/spacy/tokens/token.pxd b/spacy/tokens/token.pxd
index ec5df3fac..82d9c7c2a 100644
--- a/spacy/tokens/token.pxd
+++ b/spacy/tokens/token.pxd
@@ -43,6 +43,8 @@ cdef class Token:
return token.pos
elif feat_name == TAG:
return token.tag
+ elif feat_name == MORPH:
+ return token.morph
elif feat_name == DEP:
return token.dep
elif feat_name == HEAD:
@@ -71,6 +73,8 @@ cdef class Token:
token.pos = value
elif feat_name == TAG:
token.tag = value
+ elif feat_name == MORPH:
+ token.morph = value
elif feat_name == DEP:
token.dep = value
elif feat_name == HEAD:
From ccef9f2f446a9c6b0a212db5a7e9f7bfb93b16b4 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Wed, 29 Jan 2020 17:52:22 +0100
Subject: [PATCH 052/496] Update version
---
spacy/about.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/spacy/about.py b/spacy/about.py
index a1880fb54..356e12269 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
# fmt: off
__title__ = "spacy"
-__version__ = "2.2.3"
+__version__ = "3.0.0.dev0"
__release__ = True
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
From 0c5c8c37eef77c2576a0243d36986f7b61069f4e Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Thu, 30 Jan 2020 10:26:03 +0100
Subject: [PATCH 053/496] Depend on tqdm
---
setup.cfg | 1 +
1 file changed, 1 insertion(+)
diff --git a/setup.cfg b/setup.cfg
index 9ea85e896..a0f88de47 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -52,6 +52,7 @@ install_requires =
plac>=0.9.6,<1.2.0
requests>=2.13.0,<3.0.0
pydantic>=1.3.0,<2.0.0
+ tqdm>=4.38.0,<5.0.0
[options.extras_require]
lookups =
From ba6d78132d9538346620c6b6cb384daa28fc0388 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Thu, 30 Jan 2020 10:35:09 +0100
Subject: [PATCH 054/496] Fix dev version
---
spacy/about.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/spacy/about.py b/spacy/about.py
index 356e12269..6e01a855a 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
# fmt: off
__title__ = "spacy"
-__version__ = "3.0.0.dev0"
+__version__ = "3.0.0.dev2"
__release__ = True
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
From 9df0b1360df3d58d26d98ddd54ef910911faaec4 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Thu, 30 Jan 2020 10:35:18 +0100
Subject: [PATCH 055/496] Fix ml_datasets
---
setup.cfg | 1 +
1 file changed, 1 insertion(+)
diff --git a/setup.cfg b/setup.cfg
index a0f88de47..a3aede089 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -46,6 +46,7 @@ install_requires =
wasabi>=0.4.0,<1.1.0
srsly>=2.0.0,<3.0.0
catalogue>=0.0.7,<1.1.0
+ ml_datasets
# Third-party dependencies
setuptools
numpy>=1.15.0
From 71b93f33bb450198457844fabdc4445a76e5ecfe Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Thu, 30 Jan 2020 15:41:45 +0100
Subject: [PATCH 056/496] Set dev version
---
spacy/about.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/spacy/about.py b/spacy/about.py
index 6e01a855a..6a3c680ab 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
# fmt: off
__title__ = "spacy"
-__version__ = "3.0.0.dev2"
+__version__ = "3.0.0.dev3"
__release__ = True
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
From cabd60fa1e87e95398b60a2a2246a45711a7ffee Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem
Date: Mon, 3 Feb 2020 13:02:12 +0100
Subject: [PATCH 057/496] Small fixes to as_example (#4957)
* label in span not writable anymore
* Revert "label in span not writable anymore"
This reverts commit ab442338c8c4ddd7dfbc15348f999b74f4928090.
* Fix yield: remove the redundant intermediate list
---
spacy/language.py | 13 ++++++-------
spacy/pipeline/pipes.pyx | 24 ++++++------------------
2 files changed, 12 insertions(+), 25 deletions(-)
diff --git a/spacy/language.py b/spacy/language.py
index cde9c0164..a2baa5922 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -534,7 +534,9 @@ class Language(object):
if not hasattr(proc, "rehearse"):
continue
grads = {}
- proc.rehearse(examples, sgd=get_grads, losses=losses, **config.get(name, {}))
+ proc.rehearse(
+ examples, sgd=get_grads, losses=losses, **config.get(name, {})
+ )
for key, (W, dW) in grads.items():
sgd(W, dW, key=key)
return losses
@@ -590,10 +592,7 @@ class Language(object):
kwargs = component_cfg.get(name, {})
kwargs.update(cfg)
proc.begin_training(
- get_examples,
- pipeline=self.pipeline,
- sgd=self._optimizer,
- **kwargs
+ get_examples, pipeline=self.pipeline, sgd=self._optimizer, **kwargs
)
self._link_components()
return self._optimizer
@@ -701,7 +700,7 @@ class Language(object):
cleanup=False,
component_cfg=None,
n_process=1,
- as_example=False
+ as_example=False,
):
"""Process texts as a stream, and yield `Doc` objects in order.
@@ -737,7 +736,7 @@ class Language(object):
disable=disable,
n_process=n_process,
component_cfg=component_cfg,
- as_example=False # TODO: shouldn't this be as_example=as_example ?
+ as_example=as_example,
)
for doc, context in zip(docs, contexts):
yield (doc, context)
diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index bca53bc03..ca39de959 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -108,11 +108,9 @@ class Pipe(object):
self.set_annotations(docs, predictions)
if as_example:
- annotated_examples = []
for ex, doc in zip(examples, docs):
ex.doc = doc
- annotated_examples.append(ex)
- yield from annotated_examples
+ yield ex
else:
yield from docs
@@ -329,11 +327,9 @@ class Tensorizer(Pipe):
self.set_annotations(docs, tensors)
if as_example:
- annotated_examples = []
for ex, doc in zip(examples, docs):
ex.doc = doc
- annotated_examples.append(ex)
- yield from annotated_examples
+ yield ex
else:
yield from docs
@@ -464,11 +460,9 @@ class Tagger(Pipe):
self.set_annotations(docs, tag_ids)
if as_example:
- annotated_examples = []
for ex, doc in zip(examples, docs):
ex.doc = doc
- annotated_examples.append(ex)
- yield from annotated_examples
+ yield ex
else:
yield from docs
@@ -1256,11 +1250,9 @@ class TextCategorizer(Pipe):
self.set_annotations(docs, scores, tensors=tensors)
if as_example:
- annotated_examples = []
for ex, doc in zip(examples, docs):
ex.doc = doc
- annotated_examples.append(ex)
- yield from annotated_examples
+ yield ex
else:
yield from docs
@@ -1616,11 +1608,9 @@ class EntityLinker(Pipe):
self.set_annotations(docs, kb_ids, tensors=tensors)
if as_example:
- annotated_examples = []
for ex, doc in zip(examples, docs):
ex.doc = doc
- annotated_examples.append(ex)
- yield from annotated_examples
+ yield ex
else:
yield from docs
@@ -1834,11 +1824,9 @@ class Sentencizer(Pipe):
else:
self.set_annotations(docs, predictions)
if as_example:
- annotated_examples = []
for ex, doc in zip(examples, docs):
ex.doc = doc
- annotated_examples.append(ex)
- yield from annotated_examples
+ yield ex
else:
yield from docs
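The change in each pipe is the same generator simplification; a standalone sketch of the pattern (function and variable names here are placeholders, not the actual pipeline methods):

    # Before: build a temporary list, then yield from it
    def annotate_before(examples, docs):
        annotated_examples = []
        for ex, doc in zip(examples, docs):
            ex.doc = doc
            annotated_examples.append(ex)
        yield from annotated_examples

    # After: yield each example as soon as its doc is attached
    def annotate_after(examples, docs):
        for ex, doc in zip(examples, docs):
            ex.doc = doc
            yield ex

Both produce the same sequence; the second avoids holding every annotated example in memory and keeps the generator lazy.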
From 781e95cf536cd5720d07ec80b2cb89eaa4b41290 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem
Date: Tue, 11 Feb 2020 02:31:49 +0100
Subject: [PATCH 058/496] Ensure doc.similarity returns a float (on develop)
(#4969)
---
spacy/tokens/doc.pyx | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 15f77d621..aec06d620 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -391,7 +391,9 @@ cdef class Doc:
return 0.0
vector = self.vector
xp = get_array_module(vector)
- return xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)
+ result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)
+ # ensure we get a scalar back (numpy does this automatically but cupy doesn't)
+ return result.item()
@property
def has_vector(self):
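A small illustration of why the .item() call is needed, with numpy as a stand-in (under cupy the dot product comes back as a 0-d device array rather than a Python scalar):

    import numpy as np

    a = np.array([1.0, 2.0, 3.0])
    b = np.array([1.0, 0.0, 1.0])
    score = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
    # .item() converts a 0-d array or numpy scalar to a plain Python float,
    # so callers of Doc.similarity get a float on both CPU and GPU.
    assert isinstance(score.item(), float)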
From 9b84f987bdca50891b293a65762a00145307a3af Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem
Date: Tue, 11 Feb 2020 02:33:16 +0100
Subject: [PATCH 059/496] fix grad_clip naming (#4967)
---
spacy/util.py | 6 ++----
1 file changed, 2 insertions(+), 4 deletions(-)
diff --git a/spacy/util.py b/spacy/util.py
index 53fa81402..0cc11cef7 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -796,8 +796,6 @@ def create_default_optimizer():
beta2 = env_opt("optimizer_B2", 0.999)
eps = env_opt("optimizer_eps", 1e-8)
L2 = env_opt("L2_penalty", 1e-6)
- max_grad_norm = env_opt("grad_norm_clip", 1.0)
- optimizer = Adam(learn_rate, L2=L2, beta1=beta1, beta2=beta2, eps=eps, ops=ops)
- optimizer.max_grad_norm = max_grad_norm
- optimizer.device = ops.device_type
+ grad_clip = env_opt("grad_norm_clip", 1.0)
+ optimizer = Adam(learn_rate, L2=L2, beta1=beta1, beta2=beta2, eps=eps, ops=ops, grad_clip=grad_clip)
return optimizer
From 2ed49404e30f206894e8c25fb28f8135d0a69077 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Tue, 11 Feb 2020 17:46:18 -0500
Subject: [PATCH 060/496] Improve setup.py and call into Cython directly
(#4952)
* Improve setup.py and call into Cython directly
* Add numpy to setup_requires
* Improve clean helper
* Update setup.cfg
* Try if it builds without pyproject.toml
* Update MANIFEST.in
---
MANIFEST.in | 2 +-
bin/cythonize.py | 169 ------------------------------------------
pyproject.toml | 3 -
setup.cfg | 1 +
setup.py | 165 ++++++++++++++++-------------------------
spacy/tokenizer.pyx | 2 +
spacy/tokens/span.pyx | 1 +
7 files changed, 67 insertions(+), 276 deletions(-)
delete mode 100755 bin/cythonize.py
delete mode 100644 pyproject.toml
diff --git a/MANIFEST.in b/MANIFEST.in
index 78655a5f4..266af1b0a 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,5 +1,5 @@
recursive-include include *.h
-recursive-include spacy *.txt
+recursive-include spacy *.pyx *.pxd *.txt
include LICENSE
include README.md
include bin/spacy
diff --git a/bin/cythonize.py b/bin/cythonize.py
deleted file mode 100755
index 554252294..000000000
--- a/bin/cythonize.py
+++ /dev/null
@@ -1,169 +0,0 @@
-#!/usr/bin/env python
-""" cythonize.py
-
-Cythonize pyx files into C++ files as needed.
-
-Usage: cythonize.py [root]
-
-Checks pyx files to see if they have been changed relative to their
-corresponding C++ files. If they have, then runs cython on these files to
-recreate the C++ files.
-
-Additionally, checks pxd files and setup.py if they have been changed. If
-they have, rebuilds everything.
-
-Change detection based on file hashes stored in JSON format.
-
-For now, this script should be run by developers when changing Cython files
-and the resulting C++ files checked in, so that end-users (and Python-only
-developers) do not get the Cython dependencies.
-
-Based upon:
-
-https://raw.github.com/dagss/private-scipy-refactor/cythonize/cythonize.py
-https://raw.githubusercontent.com/numpy/numpy/master/tools/cythonize.py
-
-Note: this script does not check any of the dependent C++ libraries.
-"""
-from __future__ import print_function
-
-import os
-import sys
-import json
-import hashlib
-import subprocess
-import argparse
-
-
-HASH_FILE = "cythonize.json"
-
-
-def process_pyx(fromfile, tofile, language_level="-3"):
- print("Processing %s" % fromfile)
- try:
- from Cython.Compiler.Version import version as cython_version
- from distutils.version import LooseVersion
-
- if LooseVersion(cython_version) < LooseVersion("0.25"):
- raise Exception("Require Cython >= 0.25")
-
- except ImportError:
- pass
-
- flags = ["--fast-fail", language_level]
- if tofile.endswith(".cpp"):
- flags += ["--cplus"]
-
- try:
- try:
- r = subprocess.call(
- ["cython"] + flags + ["-o", tofile, fromfile], env=os.environ
- ) # See Issue #791
- if r != 0:
- raise Exception("Cython failed")
- except OSError:
- # There are ways of installing Cython that don't result in a cython
- # executable on the path, see gh-2397.
- r = subprocess.call(
- [
- sys.executable,
- "-c",
- "import sys; from Cython.Compiler.Main import "
- "setuptools_main as main; sys.exit(main())",
- ]
- + flags
- + ["-o", tofile, fromfile]
- )
- if r != 0:
- raise Exception("Cython failed")
- except OSError:
- raise OSError("Cython needs to be installed")
-
-
-def preserve_cwd(path, func, *args):
- orig_cwd = os.getcwd()
- try:
- os.chdir(path)
- func(*args)
- finally:
- os.chdir(orig_cwd)
-
-
-def load_hashes(filename):
- try:
- return json.load(open(filename))
- except (ValueError, IOError):
- return {}
-
-
-def save_hashes(hash_db, filename):
- with open(filename, "w") as f:
- f.write(json.dumps(hash_db))
-
-
-def get_hash(path):
- return hashlib.md5(open(path, "rb").read()).hexdigest()
-
-
-def hash_changed(base, path, db):
- full_path = os.path.normpath(os.path.join(base, path))
- return not get_hash(full_path) == db.get(full_path)
-
-
-def hash_add(base, path, db):
- full_path = os.path.normpath(os.path.join(base, path))
- db[full_path] = get_hash(full_path)
-
-
-def process(base, filename, db):
- root, ext = os.path.splitext(filename)
- if ext in [".pyx", ".cpp"]:
- if hash_changed(base, filename, db) or not os.path.isfile(
- os.path.join(base, root + ".cpp")
- ):
- preserve_cwd(base, process_pyx, root + ".pyx", root + ".cpp")
- hash_add(base, root + ".cpp", db)
- hash_add(base, root + ".pyx", db)
-
-
-def check_changes(root, db):
- res = False
- new_db = {}
-
- setup_filename = "setup.py"
- hash_add(".", setup_filename, new_db)
- if hash_changed(".", setup_filename, db):
- res = True
-
- for base, _, files in os.walk(root):
- for filename in files:
- if filename.endswith(".pxd"):
- hash_add(base, filename, new_db)
- if hash_changed(base, filename, db):
- res = True
-
- if res:
- db.clear()
- db.update(new_db)
- return res
-
-
-def run(root):
- db = load_hashes(HASH_FILE)
-
- try:
- check_changes(root, db)
- for base, _, files in os.walk(root):
- for filename in files:
- process(base, filename, db)
- finally:
- save_hashes(db, HASH_FILE)
-
-
-if __name__ == "__main__":
- parser = argparse.ArgumentParser(
- description="Cythonize pyx files into C++ files as needed"
- )
- parser.add_argument("root", help="root directory")
- args = parser.parse_args()
- run(args.root)
diff --git a/pyproject.toml b/pyproject.toml
deleted file mode 100644
index fed528d4a..000000000
--- a/pyproject.toml
+++ /dev/null
@@ -1,3 +0,0 @@
-[build-system]
-requires = ["setuptools"]
-build-backend = "setuptools.build_meta"
diff --git a/setup.cfg b/setup.cfg
index a3aede089..f360cac37 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -31,6 +31,7 @@ python_requires = >=3.6
setup_requires =
wheel
cython>=0.25
+ numpy>=1.15.0
# We also need our Cython packages here to compile against
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
diff --git a/setup.py b/setup.py
index 1afdc7ae4..31f22ba3f 100755
--- a/setup.py
+++ b/setup.py
@@ -1,34 +1,22 @@
#!/usr/bin/env python
-import io
-import os
-import subprocess
import sys
-import contextlib
from distutils.command.build_ext import build_ext
from distutils.sysconfig import get_python_inc
import distutils.util
from distutils import ccompiler, msvccompiler
from setuptools import Extension, setup, find_packages
+import numpy
+from pathlib import Path
+from Cython.Build import cythonize
+from Cython.Compiler import Options
-def is_new_osx():
- """Check whether we're on OSX >= 10.10"""
- name = distutils.util.get_platform()
- if sys.platform != "darwin":
- return False
- elif name.startswith("macosx-10"):
- minor_version = int(name.split("-")[1].split(".")[1])
- if minor_version >= 7:
- return True
- else:
- return False
- else:
- return False
+# Preserve `__doc__` on functions and classes
+# http://docs.cython.org/en/latest/src/userguide/source_files_and_compilation.html#compiler-options
+Options.docstrings = True
PACKAGES = find_packages()
-
-
MOD_NAMES = [
"spacy.parts_of_speech",
"spacy.strings",
@@ -61,16 +49,32 @@ MOD_NAMES = [
"spacy.symbols",
"spacy.vectors",
]
-
-
COMPILE_OPTIONS = {
"msvc": ["/Ox", "/EHsc"],
"mingw32": ["-O2", "-Wno-strict-prototypes", "-Wno-unused-function"],
"other": ["-O2", "-Wno-strict-prototypes", "-Wno-unused-function"],
}
-
-
LINK_OPTIONS = {"msvc": [], "mingw32": [], "other": []}
+COMPILER_DIRECTIVES = {
+ "language_level": -3,
+ "embedsignature": True,
+ "annotation_typing": False,
+}
+
+
+def is_new_osx():
+ """Check whether we're on OSX >= 10.10"""
+ name = distutils.util.get_platform()
+ if sys.platform != "darwin":
+ return False
+ elif name.startswith("macosx-10"):
+ minor_version = int(name.split("-")[1].split(".")[1])
+ if minor_version >= 7:
+ return True
+ else:
+ return False
+ else:
+ return False
if is_new_osx():
@@ -103,95 +107,50 @@ class build_ext_subclass(build_ext, build_ext_options):
build_ext.build_extensions(self)
-def generate_cython(root, source):
- print("Cythonizing sources")
- p = subprocess.call(
- [sys.executable, os.path.join(root, "bin", "cythonize.py"), source],
- env=os.environ,
- )
- if p != 0:
- raise RuntimeError("Running cythonize failed")
-
-
-def is_source_release(path):
- return os.path.exists(os.path.join(path, "PKG-INFO"))
-
-
def clean(path):
- for name in MOD_NAMES:
- name = name.replace(".", "/")
- for ext in [".so", ".html", ".cpp", ".c"]:
- file_path = os.path.join(path, name + ext)
- if os.path.exists(file_path):
- os.unlink(file_path)
-
-
-@contextlib.contextmanager
-def chdir(new_dir):
- old_dir = os.getcwd()
- try:
- os.chdir(new_dir)
- sys.path.insert(0, new_dir)
- yield
- finally:
- del sys.path[0]
- os.chdir(old_dir)
+ for path in path.glob("**/*"):
+ if path.is_file() and path.suffix in (".so", ".cpp"):
+ print(f"Deleting {path.name}")
+ path.unlink()
def setup_package():
- root = os.path.abspath(os.path.dirname(__file__))
+ root = Path(__file__).parent
if len(sys.argv) > 1 and sys.argv[1] == "clean":
- return clean(root)
+ return clean(root / "spacy")
- with chdir(root):
- with io.open(os.path.join(root, "spacy", "about.py"), encoding="utf8") as f:
- about = {}
- exec(f.read(), about)
+ with (root / "spacy" / "about.py").open("r") as f:
+ about = {}
+ exec(f.read(), about)
- include_dirs = [
- get_python_inc(plat_specific=True),
- os.path.join(root, "include"),
- ]
+ include_dirs = [
+ get_python_inc(plat_specific=True),
+ numpy.get_include(),
+ str(root / "include"),
+ ]
+ if (
+ ccompiler.new_compiler().compiler_type == "msvc"
+ and msvccompiler.get_build_version() == 9
+ ):
+ include_dirs.append(str(root / "include" / "msvc9"))
+ ext_modules = []
+ for name in MOD_NAMES:
+ mod_path = name.replace(".", "/") + ".pyx"
+ ext = Extension(name, [mod_path], language="c++")
+ ext_modules.append(ext)
+ print("Cythonizing sources")
+ ext_modules = cythonize(ext_modules, compiler_directives=COMPILER_DIRECTIVES)
- if (
- ccompiler.new_compiler().compiler_type == "msvc"
- and msvccompiler.get_build_version() == 9
- ):
- include_dirs.append(os.path.join(root, "include", "msvc9"))
-
- ext_modules = []
- for mod_name in MOD_NAMES:
- mod_path = mod_name.replace(".", "/") + ".cpp"
- extra_link_args = []
- # ???
- # Imported from patch from @mikepb
- # See Issue #267. Running blind here...
- if sys.platform == "darwin":
- dylib_path = [".." for _ in range(mod_name.count("."))]
- dylib_path = "/".join(dylib_path)
- dylib_path = "@loader_path/%s/spacy/platform/darwin/lib" % dylib_path
- extra_link_args.append("-Wl,-rpath,%s" % dylib_path)
- ext_modules.append(
- Extension(
- mod_name,
- [mod_path],
- language="c++",
- include_dirs=include_dirs,
- extra_link_args=extra_link_args,
- )
- )
-
- if not is_source_release(root):
- generate_cython(root, "spacy")
-
- setup(
- name="spacy",
- packages=PACKAGES,
- version=about["__version__"],
- ext_modules=ext_modules,
- cmdclass={"build_ext": build_ext_subclass},
- )
+ setup(
+ name="spacy",
+ packages=PACKAGES,
+ version=about["__version__"],
+ ext_modules=ext_modules,
+ cmdclass={"build_ext": build_ext_subclass},
+ include_dirs=include_dirs,
+ package_data={"": ["*.pyx", "*.pxd", "*.pxi", "*.cpp"]},
+ )
if __name__ == "__main__":
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 7491a11fc..25d9f239d 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -1,5 +1,7 @@
# cython: embedsignature=True
# cython: profile=True
+from __future__ import unicode_literals
+
from cython.operator cimport dereference as deref
from cython.operator cimport preincrement as preinc
from libc.string cimport memcpy, memset
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 7ab1c1d18..d24a38029 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -1,3 +1,4 @@
+from __future__ import unicode_literals
cimport numpy as np
from libc.math cimport sqrt
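The core of the new build is the direct cythonize() call in setup.py; a minimal, self-contained sketch of that pattern with illustrative package and module names (not spaCy's real module list):

    from setuptools import Extension, setup
    from Cython.Build import cythonize

    ext_modules = [
        Extension("pkg.module", ["pkg/module.pyx"], language="c++"),
    ]
    setup(
        name="pkg",
        ext_modules=cythonize(
            ext_modules,
            compiler_directives={"language_level": -3, "embedsignature": True},
        ),
    )

Compared with the removed bin/cythonize.py approach, the .pyx sources are compiled as part of the normal build instead of pre-generated C++ files being checked in.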
From 207994871106bde872bd76224fbb4cf195f01e66 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Wed, 12 Feb 2020 22:49:21 +0100
Subject: [PATCH 061/496] add build dependencies back to pyproject.toml
---
pyproject.toml | 10 +++++++++-
1 file changed, 9 insertions(+), 1 deletion(-)
diff --git a/pyproject.toml b/pyproject.toml
index fed528d4a..8a6ababf3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,3 +1,11 @@
[build-system]
-requires = ["setuptools"]
+requires = [
+ "setuptools",
+ "wheel",
+ "cython>=0.25",
+ "cymem>=2.0.2,<2.1.0",
+ "preshed>=3.0.2,<3.1.0",
+ "murmurhash>=0.28.0,<1.1.0",
+ "thinc==7.4.0.dev0",
+]
build-backend = "setuptools.build_meta"
From 34986c7bfd1d4634861a5c4b54cf90ef18090ff4 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Wed, 12 Feb 2020 22:49:50 +0100
Subject: [PATCH 062/496] test versions of required libs across different
places
---
spacy/tests/test_requirements.py | 61 ++++++++++++++++++++++++++++++++
1 file changed, 61 insertions(+)
create mode 100644 spacy/tests/test_requirements.py
diff --git a/spacy/tests/test_requirements.py b/spacy/tests/test_requirements.py
new file mode 100644
index 000000000..8c2b49b85
--- /dev/null
+++ b/spacy/tests/test_requirements.py
@@ -0,0 +1,61 @@
+import re
+from pathlib import Path
+
+
+def test_build_dependencies(en_vocab):
+ libs_ignore_requirements = ["pytest", "pytest-timeout", "mock", "flake8", "jsonschema"]
+ libs_ignore_setup = ["fugashi", "natto-py", "pythainlp"]
+
+ # check requirements.txt
+ root_dir = Path(__file__).parent.parent.parent
+ req_file = root_dir / "requirements.txt"
+ req_dict = {}
+ with req_file.open() as f:
+ lines = f.readlines()
+ for line in lines:
+ line = line.strip()
+ if not line.startswith("#"):
+ lib, v = _parse_req(line)
+ if lib and lib not in libs_ignore_requirements:
+ req_dict[lib] = v
+
+ # check setup.cfg and compare to requirements.txt
+ # also fails when there are missing or additional libs
+ setup_file = root_dir / "setup.cfg"
+ with setup_file.open() as f:
+ lines = f.readlines()
+ setup_keys = set()
+ for line in lines:
+ line = line.strip()
+ if not line.startswith("#"):
+ lib, v = _parse_req(line)
+ if lib and not lib.startswith("cupy") and lib not in libs_ignore_setup:
+ req_v = req_dict.get(lib, None)
+ assert req_v is not None # if fail: setup.cfg contains a lib not in requirements.txt
+ assert (lib+v) == (lib+req_v) # if fail: setup.cfg & requirements.txt have conflicting versions
+ setup_keys.add(lib)
+ assert sorted(setup_keys) == sorted(req_dict.keys()) # if fail: requirements.txt contains a lib not in setup.cfg
+
+ # check pyproject.toml and compare the versions of the libs to requirements.txt
+ # does not fail when there are missing or additional libs
+ toml_file = root_dir / "pyproject.toml"
+ with toml_file.open() as f:
+ lines = f.readlines()
+ toml_keys = set()
+ for line in lines:
+ line = line.strip()
+ line = line.strip(",")
+ line = line.strip("\"")
+ if not line.startswith("#"):
+ lib, v = _parse_req(line)
+ if lib:
+ req_v = req_dict.get(lib, None)
+ assert (lib+v) == (lib+req_v) # if fail: pyproject.toml & requirements.txt have conflicting versions
+ toml_keys.add(lib)
+
+def _parse_req(line):
+ lib = re.match(r"^[a-z0-9\-]*", line).group(0)
+ v = line.replace(lib, "").strip()
+ if not re.match(r"^[<>=][<>=].*", v):
+ return None, None
+ return lib, v
\ No newline at end of file
From 6bbd81656967fd93dfeb9af40c9194536b31a135 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Wed, 12 Feb 2020 22:50:27 +0100
Subject: [PATCH 063/496] formatting
---
spacy/tests/test_requirements.py | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/spacy/tests/test_requirements.py b/spacy/tests/test_requirements.py
index 8c2b49b85..cb2f51725 100644
--- a/spacy/tests/test_requirements.py
+++ b/spacy/tests/test_requirements.py
@@ -34,7 +34,7 @@ def test_build_dependencies(en_vocab):
assert req_v is not None # if fail: setup.cfg contains a lib not in requirements.txt
assert (lib+v) == (lib+req_v) # if fail: setup.cfg & requirements.txt have conflicting versions
setup_keys.add(lib)
- assert sorted(setup_keys) == sorted(req_dict.keys()) # if fail: requirements.txt contains a lib not in setup.cfg
+ assert sorted(setup_keys) == sorted(req_dict.keys()) # if fail: requirements.txt contains a lib not in setup.cfg
# check pyproject.toml and compare the versions of the libs to requirements.txt
# does not fail when there are missing or additional libs
@@ -53,9 +53,10 @@ def test_build_dependencies(en_vocab):
assert (lib+v) == (lib+req_v) # if fail: pyproject.toml & requirements.txt have conflicting versions
toml_keys.add(lib)
+
def _parse_req(line):
lib = re.match(r"^[a-z0-9\-]*", line).group(0)
v = line.replace(lib, "").strip()
if not re.match(r"^[<>=][<>=].*", v):
return None, None
- return lib, v
\ No newline at end of file
+ return lib, v
From 2729d9164d02a6795ccf93f0b9414856644e6dbc Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Wed, 12 Feb 2020 22:59:37 +0100
Subject: [PATCH 064/496] cleanup
---
spacy/tests/test_requirements.py | 7 ++-----
1 file changed, 2 insertions(+), 5 deletions(-)
diff --git a/spacy/tests/test_requirements.py b/spacy/tests/test_requirements.py
index cb2f51725..320fc5763 100644
--- a/spacy/tests/test_requirements.py
+++ b/spacy/tests/test_requirements.py
@@ -3,6 +3,7 @@ from pathlib import Path
def test_build_dependencies(en_vocab):
+ # Check that library requirements are pinned exactly the same across different setup files.
libs_ignore_requirements = ["pytest", "pytest-timeout", "mock", "flake8", "jsonschema"]
libs_ignore_setup = ["fugashi", "natto-py", "pythainlp"]
@@ -41,17 +42,13 @@ def test_build_dependencies(en_vocab):
toml_file = root_dir / "pyproject.toml"
with toml_file.open() as f:
lines = f.readlines()
- toml_keys = set()
for line in lines:
- line = line.strip()
- line = line.strip(",")
- line = line.strip("\"")
+ line = line.strip().strip(",").strip("\"")
if not line.startswith("#"):
lib, v = _parse_req(line)
if lib:
req_v = req_dict.get(lib, None)
assert (lib+v) == (lib+req_v) # if fail: pyproject.toml & requirements.txt have conflicting versions
- toml_keys.add(lib)
def _parse_req(line):
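To make the helper's contract concrete, a small usage sketch of the same parsing logic (the example requirement strings are illustrative):

    import re

    def _parse_req(line):
        # Library name, then a version spec that must start with two of <, >, =
        lib = re.match(r"^[a-z0-9\-]*", line).group(0)
        v = line.replace(lib, "").strip()
        if not re.match(r"^[<>=][<>=].*", v):
            return None, None
        return lib, v

    print(_parse_req("srsly>=2.0.0,<3.0.0"))  # ('srsly', '>=2.0.0,<3.0.0')
    print(_parse_req("# just a comment"))     # (None, None)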
From 80e95d02b148fd49e008058413012b757e6c7abb Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Tue, 18 Feb 2020 14:32:53 +0100
Subject: [PATCH 065/496] Allow spacy attr in token pattern
---
spacy/schemas.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/spacy/schemas.py b/spacy/schemas.py
index 4a5054125..2268bf100 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -103,6 +103,7 @@ class TokenPattern(BaseModel):
ent_type: Optional[StringValue] = None
norm: Optional[StringValue] = None
length: Optional[NumberValue] = None
+ spacy: Optional[StrictBool] = None
is_alpha: Optional[StrictBool] = None
is_ascii: Optional[StrictBool] = None
is_digit: Optional[StrictBool] = None
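For context, a hedged sketch of a pattern that this schema addition lets pass validation; it assumes the v3-style Matcher.add signature and that the SPACY key (the trailing-whitespace flag) behaves as described in the Matcher docs:

    import spacy
    from spacy.matcher import Matcher

    nlp = spacy.blank("en")
    matcher = Matcher(nlp.vocab, validate=True)
    # "SPACY": True requires the first token to be followed by whitespace;
    # with the schema change this key validates instead of being rejected.
    pattern = [{"ORTH": "New", "SPACY": True}, {"ORTH": "York"}]
    matcher.add("CITY", [pattern])
    print(matcher(nlp("New York is big.")))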
From 1278161f4715fa3076e4d844d2ef1b6377a855b1 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Tue, 18 Feb 2020 15:17:03 +0100
Subject: [PATCH 066/496] Tidy up and fix issues
---
spacy/cli/converters/conllu2json.py | 2 +-
spacy/errors.py | 4 +-
spacy/gold.pyx | 5 -
spacy/lang/sk/examples.py | 4 -
spacy/lang/sk/lex_attrs.py | 3 -
spacy/lang/sk/tag_map.py | 2921 +++++++++++-----------
spacy/syntax/nn_parser.pyx | 4 +-
spacy/tests/regression/test_issue4849.py | 16 +-
spacy/tests/regression/test_issue4924.py | 17 +-
9 files changed, 1476 insertions(+), 1500 deletions(-)
diff --git a/spacy/cli/converters/conllu2json.py b/spacy/cli/converters/conllu2json.py
index f65e6a187..ecdc2ae66 100644
--- a/spacy/cli/converters/conllu2json.py
+++ b/spacy/cli/converters/conllu2json.py
@@ -235,7 +235,7 @@ def example_from_conllu_sentence(
subtok_word = ""
in_subtok = False
id_ = int(id_) - 1
- head = (int(head) - 1) if head != "0" else id_
+ head = (int(head) - 1) if head not in ("0", "_") else id_
tag = pos if tag == "_" else tag
morph = morph if morph != "_" else ""
dep = "ROOT" if dep == "root" else dep
diff --git a/spacy/errors.py b/spacy/errors.py
index 7ef3abc00..e6c0b069e 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -541,8 +541,8 @@ class Errors(object):
E997 = ("Tokenizer special cases are not allowed to modify the text. "
"This would map '{chunk}' to '{orth}' given token attributes "
"'{token_attrs}'.")
- E998 = ("Can only create GoldParse's from Example's without a Doc, "
- "if get_gold_parses() is called with a Vocab object.")
+ E998 = ("Can only create GoldParse objects from Example objects without a "
+ "Doc if get_gold_parses() is called with a Vocab object.")
E999 = ("Encountered an unexpected format for the dictionary holding "
"gold annotations: {gold_dict}")
diff --git a/spacy/gold.pyx b/spacy/gold.pyx
index 5e46f274e..eca801176 100644
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -991,11 +991,6 @@ cdef class GoldParse:
self.cats = {} if cats is None else dict(cats)
self.links = {} if links is None else dict(links)
- # orig_annot is used as an iterator in `nlp.evalate` even if self.length == 0,
- # so set a empty list to avoid error.
- # if self.lenght > 0, this is modified latter.
- self.orig_annot = []
-
# avoid allocating memory if the doc does not contain any tokens
if self.length > 0:
if not words:
diff --git a/spacy/lang/sk/examples.py b/spacy/lang/sk/examples.py
index 486ea375e..736109a7c 100644
--- a/spacy/lang/sk/examples.py
+++ b/spacy/lang/sk/examples.py
@@ -1,7 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/sk/lex_attrs.py b/spacy/lang/sk/lex_attrs.py
index 3dea4d8f0..0caf62e8e 100644
--- a/spacy/lang/sk/lex_attrs.py
+++ b/spacy/lang/sk/lex_attrs.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...attrs import LIKE_NUM
_num_words = [
diff --git a/spacy/lang/sk/tag_map.py b/spacy/lang/sk/tag_map.py
index 015c8cba3..d159a6a51 100644
--- a/spacy/lang/sk/tag_map.py
+++ b/spacy/lang/sk/tag_map.py
@@ -1,1467 +1,1464 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-from ...symbols import POS, AUX, PUNCT, SYM, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB
-from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON
+from ...symbols import POS, AUX, ADJ, CCONJ, NUM, ADV, ADP, X, VERB
+from ...symbols import NOUN, PART, INTJ, PRON
# Source https://universaldependencies.org/tagset-conversion/sk-snk-uposf.html
# fmt: off
TAG_MAP = {
- "AAfp1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Plur"},
- "AAfp1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Plur"},
- "AAfp1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Plur"},
- "AAfp2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Plur"},
- "AAfp2y": {POS: ADJ, "morph": "Case=Gen|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Plur"},
- "AAfp2z": {POS: ADJ, "morph": "Case=Gen|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Plur"},
- "AAfp3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Plur"},
- "AAfp3y": {POS: ADJ, "morph": "Case=Dat|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Plur"},
- "AAfp3z": {POS: ADJ, "morph": "Case=Dat|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Plur"},
- "AAfp4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Plur"},
- "AAfp4y": {POS: ADJ, "morph": "Case=Acc|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Plur"},
- "AAfp4z": {POS: ADJ, "morph": "Case=Acc|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Plur"},
- "AAfp5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Plur"},
- "AAfp5y": {POS: ADJ, "morph": "Case=Voc|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Plur"},
- "AAfp5z": {POS: ADJ, "morph": "Case=Voc|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Plur"},
- "AAfp6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Plur"},
- "AAfp6y": {POS: ADJ, "morph": "Case=Loc|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Plur"},
- "AAfp6z": {POS: ADJ, "morph": "Case=Loc|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Plur"},
- "AAfp7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Plur"},
- "AAfp7y": {POS: ADJ, "morph": "Case=Ins|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Plur"},
- "AAfp7z": {POS: ADJ, "morph": "Case=Ins|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Plur"},
- "AAfs1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Sing"},
- "AAfs1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Sing"},
- "AAfs1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Sing"},
- "AAfs2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Sing"},
- "AAfs2y": {POS: ADJ, "morph": "Case=Gen|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Sing"},
- "AAfs2z": {POS: ADJ, "morph": "Case=Gen|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Sing"},
- "AAfs3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Sing"},
- "AAfs3y": {POS: ADJ, "morph": "Case=Dat|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Sing"},
- "AAfs3z": {POS: ADJ, "morph": "Case=Dat|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Sing"},
- "AAfs4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Sing"},
- "AAfs4y": {POS: ADJ, "morph": "Case=Acc|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Sing"},
- "AAfs4z": {POS: ADJ, "morph": "Case=Acc|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Sing"},
- "AAfs5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Sing"},
- "AAfs5y": {POS: ADJ, "morph": "Case=Voc|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Sing"},
- "AAfs5z": {POS: ADJ, "morph": "Case=Voc|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Sing"},
- "AAfs6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Sing"},
- "AAfs6y": {POS: ADJ, "morph": "Case=Loc|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Sing"},
- "AAfs6z": {POS: ADJ, "morph": "Case=Loc|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Sing"},
- "AAfs7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Sing"},
- "AAfs7y": {POS: ADJ, "morph": "Case=Ins|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Sing"},
- "AAfs7z": {POS: ADJ, "morph": "Case=Ins|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Sing"},
- "AAip1x": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"},
- "AAip1y": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"},
- "AAip1z": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"},
- "AAip2x": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"},
- "AAip2y": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"},
- "AAip2z": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"},
- "AAip3x": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"},
- "AAip3y": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"},
- "AAip3z": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"},
- "AAip4x": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"},
- "AAip4y": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"},
- "AAip4z": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"},
- "AAip5x": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"},
- "AAip5y": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"},
- "AAip5z": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"},
- "AAip6x": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"},
- "AAip6y": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"},
- "AAip6z": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"},
- "AAip7x": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"},
- "AAip7y": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"},
- "AAip7z": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"},
- "AAis1x": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"},
- "AAis1y": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"},
- "AAis1z": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"},
- "AAis2x": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"},
- "AAis2y": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"},
- "AAis2z": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"},
- "AAis3x": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"},
- "AAis3y": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"},
- "AAis3z": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"},
- "AAis4x": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"},
- "AAis4y": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"},
- "AAis4z": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"},
- "AAis5x": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"},
- "AAis5y": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"},
- "AAis5z": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"},
- "AAis6x": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"},
- "AAis6y": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"},
- "AAis6z": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"},
- "AAis7x": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"},
- "AAis7y": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"},
- "AAis7z": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"},
- "AAmp1x": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"},
- "AAmp1y": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"},
- "AAmp1z": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"},
- "AAmp2x": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"},
- "AAmp2y": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"},
- "AAmp2z": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"},
- "AAmp3x": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"},
- "AAmp3y": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"},
- "AAmp3z": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"},
- "AAmp4x": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"},
- "AAmp4y": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"},
- "AAmp4z": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"},
- "AAmp5x": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"},
- "AAmp5y": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"},
- "AAmp5z": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"},
- "AAmp6x": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"},
- "AAmp6y": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"},
- "AAmp6z": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"},
- "AAmp7x": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"},
- "AAmp7y": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"},
- "AAmp7z": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"},
- "AAms1x": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"},
- "AAms1y": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"},
- "AAms1z": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"},
- "AAms2x": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"},
- "AAms2y": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"},
- "AAms2z": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"},
- "AAms3x": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"},
- "AAms3y": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"},
- "AAms3z": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"},
- "AAms4x": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"},
- "AAms4y": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"},
- "AAms4z": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"},
- "AAms5x": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"},
- "AAms5y": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"},
- "AAms5z": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"},
- "AAms6x": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"},
- "AAms6y": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"},
- "AAms6z": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"},
- "AAms7x": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"},
- "AAms7y": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"},
- "AAms7z": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"},
- "AAnp1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Plur"},
- "AAnp1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Plur"},
- "AAnp1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Plur"},
- "AAnp2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Plur"},
- "AAnp2y": {POS: ADJ, "morph": "Case=Gen|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Plur"},
- "AAnp2z": {POS: ADJ, "morph": "Case=Gen|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Plur"},
- "AAnp3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Plur"},
- "AAnp3y": {POS: ADJ, "morph": "Case=Dat|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Plur"},
- "AAnp3z": {POS: ADJ, "morph": "Case=Dat|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Plur"},
- "AAnp4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Plur"},
- "AAnp4y": {POS: ADJ, "morph": "Case=Acc|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Plur"},
- "AAnp4z": {POS: ADJ, "morph": "Case=Acc|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Plur"},
- "AAnp5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Plur"},
- "AAnp5y": {POS: ADJ, "morph": "Case=Voc|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Plur"},
- "AAnp5z": {POS: ADJ, "morph": "Case=Voc|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Plur"},
- "AAnp6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Plur"},
- "AAnp6y": {POS: ADJ, "morph": "Case=Loc|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Plur"},
- "AAnp6z": {POS: ADJ, "morph": "Case=Loc|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Plur"},
- "AAnp7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Plur"},
- "AAnp7y": {POS: ADJ, "morph": "Case=Ins|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Plur"},
- "AAnp7z": {POS: ADJ, "morph": "Case=Ins|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Plur"},
- "AAns1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Sing"},
- "AAns1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Sing"},
- "AAns1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Sing"},
- "AAns2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Sing"},
- "AAns2y": {POS: ADJ, "morph": "Case=Gen|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Sing"},
- "AAns2z": {POS: ADJ, "morph": "Case=Gen|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Sing"},
- "AAns3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Sing"},
- "AAns3y": {POS: ADJ, "morph": "Case=Dat|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Sing"},
- "AAns3z": {POS: ADJ, "morph": "Case=Dat|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Sing"},
- "AAns4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Sing"},
- "AAns4y": {POS: ADJ, "morph": "Case=Acc|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Sing"},
- "AAns4z": {POS: ADJ, "morph": "Case=Acc|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Sing"},
- "AAns5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Sing"},
- "AAns5y": {POS: ADJ, "morph": "Case=Voc|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Sing"},
- "AAns5z": {POS: ADJ, "morph": "Case=Voc|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Sing"},
- "AAns6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Sing"},
- "AAns6y": {POS: ADJ, "morph": "Case=Loc|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Sing"},
- "AAns6z": {POS: ADJ, "morph": "Case=Loc|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Sing"},
- "AAns7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Sing"},
- "AAns7y": {POS: ADJ, "morph": "Case=Ins|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Sing"},
- "AAns7z": {POS: ADJ, "morph": "Case=Ins|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Sing"},
- "AFfp1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Plur"},
- "AFfp2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Plur"},
- "AFfp3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Plur"},
- "AFfp4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Plur"},
- "AFfp5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Plur"},
- "AFfp6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Plur"},
- "AFfp7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Plur"},
- "AFfs1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Sing"},
- "AFfs2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Sing"},
- "AFfs3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Sing"},
- "AFfs4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Sing"},
- "AFfs5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Sing"},
- "AFfs6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Sing"},
- "AFfs7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Sing"},
- "AFip1x": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"},
- "AFip2x": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"},
- "AFip3x": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"},
- "AFip4x": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"},
- "AFip5x": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"},
- "AFip6x": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"},
- "AFip7x": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"},
- "AFis1x": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"},
- "AFis2x": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"},
- "AFis3x": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"},
- "AFis4x": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"},
- "AFis5x": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"},
- "AFis6x": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"},
- "AFis7x": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"},
- "AFmp1x": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"},
- "AFmp2x": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"},
- "AFmp3x": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"},
- "AFmp4x": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"},
- "AFmp5x": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"},
- "AFmp6x": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"},
- "AFmp7x": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"},
- "AFms1x": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"},
- "AFms2x": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"},
- "AFms3x": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"},
- "AFms4x": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"},
- "AFms5x": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"},
- "AFms6x": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"},
- "AFms7x": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"},
- "AFnp1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Plur"},
- "AFnp2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Plur"},
- "AFnp3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Plur"},
- "AFnp4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Plur"},
- "AFnp5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Plur"},
- "AFnp6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Plur"},
- "AFnp7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Plur"},
- "AFns1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Sing"},
- "AFns2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Sing"},
- "AFns3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Sing"},
- "AFns4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Sing"},
- "AFns5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Sing"},
- "AFns6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Sing"},
- "AFns7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Sing"},
- "AUfp1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Plur"},
- "AUfp1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|MorphPos=Def|Number=Plur"},
- "AUfp1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Fem|MorphPos=Def|Number=Plur"},
- "AUfp2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Plur"},
- "AUfp3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Plur"},
- "AUfp4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Plur"},
- "AUfp5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Plur"},
- "AUfp6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Plur"},
- "AUfp7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Plur"},
- "AUfs1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Sing"},
- "AUfs1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|MorphPos=Def|Number=Sing"},
- "AUfs1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Fem|MorphPos=Def|Number=Sing"},
- "AUfs2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Sing"},
- "AUfs3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Sing"},
- "AUfs4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Sing"},
- "AUfs5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Sing"},
- "AUfs6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Sing"},
- "AUfs7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Sing"},
- "AUip1x": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"},
- "AUip1y": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Def|Number=Plur"},
- "AUip1z": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Def|Number=Plur"},
- "AUip2x": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"},
- "AUip3x": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"},
- "AUip4x": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"},
- "AUip5x": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"},
- "AUip6x": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"},
- "AUip7x": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"},
- "AUis1x": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"},
- "AUis1y": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Def|Number=Sing"},
- "AUis1z": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Def|Number=Sing"},
- "AUis2x": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"},
- "AUis3x": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"},
- "AUis4x": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"},
- "AUis5x": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"},
- "AUis6x": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"},
- "AUis7x": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"},
- "AUmp1x": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"},
- "AUmp1y": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Def|Number=Plur"},
- "AUmp1z": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Def|Number=Plur"},
- "AUmp2x": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"},
- "AUmp3x": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"},
- "AUmp4x": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"},
- "AUmp5x": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"},
- "AUmp6x": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"},
- "AUmp7x": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"},
- "AUms1x": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"},
- "AUms1y": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Def|Number=Sing"},
- "AUms1z": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Def|Number=Sing"},
- "AUms2x": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"},
- "AUms3x": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"},
- "AUms4x": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"},
- "AUms5x": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"},
- "AUms6x": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"},
- "AUms7x": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"},
- "AUnp1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Plur"},
- "AUnp1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|MorphPos=Def|Number=Plur"},
- "AUnp1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Neut|MorphPos=Def|Number=Plur"},
- "AUnp2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Plur"},
- "AUnp3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Plur"},
- "AUnp4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Plur"},
- "AUnp5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Plur"},
- "AUnp6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Plur"},
- "AUnp7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Plur"},
- "AUns1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Sing"},
- "AUns1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|MorphPos=Def|Number=Sing"},
- "AUns1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Neut|MorphPos=Def|Number=Sing"},
- "AUns2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Sing"},
- "AUns3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Sing"},
- "AUns4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Sing"},
- "AUns5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Sing"},
- "AUns6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Sing"},
- "AUns7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Sing"},
- "Dx": {POS: ADV, "morph": "Degree=Pos"},
- "Dy": {POS: ADV, "morph": "Degree=Cmp"},
- "Dz": {POS: ADV, "morph": "Degree=Sup"},
- "Eu1": {POS: ADP, "morph": "AdpType=Prep|Case=Nom"},
- "Eu2": {POS: ADP, "morph": "AdpType=Prep|Case=Gen"},
- "Eu3": {POS: ADP, "morph": "AdpType=Prep|Case=Dat"},
- "Eu4": {POS: ADP, "morph": "AdpType=Prep|Case=Acc"},
- "Eu6": {POS: ADP, "morph": "AdpType=Prep|Case=Loc"},
- "Eu7": {POS: ADP, "morph": "AdpType=Prep|Case=Ins"},
- "Ev2": {POS: ADP, "morph": "AdpType=Voc|Case=Gen"},
- "Ev3": {POS: ADP, "morph": "AdpType=Voc|Case=Dat"},
- "Ev4": {POS: ADP, "morph": "AdpType=Voc|Case=Acc"},
- "Ev6": {POS: ADP, "morph": "AdpType=Voc|Case=Loc"},
- "Ev7": {POS: ADP, "morph": "AdpType=Voc|Case=Ins"},
- "Gkfp1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gkfp1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gkfp1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gkfp2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gkfp2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gkfp2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gkfp3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gkfp3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gkfp3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gkfp4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gkfp4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gkfp4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gkfp5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gkfp5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gkfp5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gkfp6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gkfp6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gkfp6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gkfp7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gkfp7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gkfp7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gkfs1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkfs1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkfs1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkfs2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkfs2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkfs2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkfs3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkfs3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkfs3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkfs4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkfs4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkfs4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkfs5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkfs5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkfs5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkfs6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkfs6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkfs6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkfs7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkfs7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkfs7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkip1x": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gkip1y": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gkip1z": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gkip2x": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gkip2y": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gkip2z": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gkip3x": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gkip3y": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gkip3z": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gkip4x": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gkip4y": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gkip4z": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gkip5x": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gkip5y": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gkip5z": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gkip6x": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gkip6y": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gkip6z": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gkip7x": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gkip7y": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gkip7z": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gkis1x": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkis1y": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkis1z": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkis2x": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkis2y": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkis2z": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkis3x": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkis3y": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkis3z": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkis4x": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkis4y": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkis4z": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkis5x": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkis5y": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkis5z": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkis6x": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkis6y": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkis6z": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkis7x": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkis7y": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkis7z": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkmp1x": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gkmp1y": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gkmp1z": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gkmp2x": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gkmp2y": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gkmp2z": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gkmp3x": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gkmp3y": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gkmp3z": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gkmp4x": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gkmp4y": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gkmp4z": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gkmp5x": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gkmp5y": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gkmp5z": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gkmp6x": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gkmp6y": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gkmp6z": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gkmp7x": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gkmp7y": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gkmp7z": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gkms1x": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkms1y": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkms1z": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkms2x": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkms2y": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkms2z": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkms3x": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkms3y": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkms3z": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkms4x": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkms4y": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkms4z": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkms5x": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkms5y": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkms5z": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkms6x": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkms6y": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkms6z": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkms7x": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkms7y": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkms7z": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gknp1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gknp1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gknp1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gknp2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gknp2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gknp2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gknp3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gknp3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gknp3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gknp4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gknp4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gknp4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gknp5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gknp5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gknp5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gknp6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gknp6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gknp6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gknp7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gknp7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gknp7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"},
- "Gkns1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkns1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkns1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkns2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkns2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkns2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkns3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkns3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkns3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkns4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkns4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkns4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkns5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkns5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkns5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkns6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkns6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkns6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkns7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkns7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gkns7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"},
- "Gtfp1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtfp1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtfp1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtfp2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtfp2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtfp2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtfp3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtfp3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtfp3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtfp4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtfp4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtfp4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtfp5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtfp5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtfp5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtfp6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtfp6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtfp6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtfp7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtfp7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtfp7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtfs1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtfs1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtfs1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtfs2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtfs2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtfs2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtfs3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtfs3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtfs3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtfs4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtfs4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtfs4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtfs5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtfs5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtfs5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtfs6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtfs6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtfs6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtfs7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtfs7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtfs7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtip1x": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtip1y": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtip1z": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtip2x": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtip2y": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtip2z": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtip3x": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtip3y": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtip3z": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtip4x": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtip4y": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtip4z": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtip5x": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtip5y": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtip5z": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtip6x": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtip6y": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtip6z": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtip7x": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtip7y": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtip7z": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtis1x": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtis1y": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtis1z": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtis2x": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtis2y": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtis2z": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtis3x": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtis3y": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtis3z": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtis4x": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtis4y": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtis4z": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtis5x": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtis5y": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtis5z": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtis6x": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtis6y": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtis6z": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtis7x": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtis7y": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtis7z": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtmp1x": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtmp1y": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtmp1z": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtmp2x": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtmp2y": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtmp2z": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtmp3x": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtmp3y": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtmp3z": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtmp4x": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtmp4y": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtmp4z": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtmp5x": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtmp5y": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtmp5z": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtmp6x": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtmp6y": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtmp6z": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtmp7x": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtmp7y": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtmp7z": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtms1x": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtms1y": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtms1z": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtms2x": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtms2y": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtms2z": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtms3x": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtms3y": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtms3z": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtms4x": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtms4y": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtms4z": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtms5x": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtms5y": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtms5z": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtms6x": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtms6y": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtms6z": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtms7x": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtms7y": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtms7z": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtnp1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtnp1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtnp1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtnp2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtnp2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtnp2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtnp3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtnp3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtnp3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtnp4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtnp4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtnp4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtnp5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtnp5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtnp5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtnp6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtnp6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtnp6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtnp7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtnp7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtnp7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"},
- "Gtns1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtns1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtns1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtns2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtns2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtns2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtns3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtns3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtns3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtns4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtns4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtns4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtns5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtns5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtns5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtns6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtns6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtns6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtns7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtns7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"},
- "Gtns7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"},
- "J": {POS: INTJ, "morph": "_"},
- "NAfp1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Adj|Number=Plur"},
- "NAfp2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Adj|Number=Plur"},
- "NAfp3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Adj|Number=Plur"},
- "NAfp4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Adj|Number=Plur"},
- "NAfp5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Adj|Number=Plur"},
- "NAfp6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Adj|Number=Plur"},
- "NAfp7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Adj|Number=Plur"},
- "NAfs1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Adj|Number=Sing"},
- "NAfs2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Adj|Number=Sing"},
- "NAfs3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Adj|Number=Sing"},
- "NAfs4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Adj|Number=Sing"},
- "NAfs5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Adj|Number=Sing"},
- "NAfs6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Adj|Number=Sing"},
- "NAfs7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Adj|Number=Sing"},
- "NAip1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Plur"},
- "NAip2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Plur"},
- "NAip3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Plur"},
- "NAip4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Plur"},
- "NAip5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Plur"},
- "NAip6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Plur"},
- "NAip7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Plur"},
- "NAis1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Sing"},
- "NAis2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Sing"},
- "NAis3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Sing"},
- "NAis4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Sing"},
- "NAis5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Sing"},
- "NAis6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Sing"},
- "NAis7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Sing"},
- "NAmp1": {POS: NUM, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Plur"},
- "NAmp2": {POS: NUM, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Plur"},
- "NAmp3": {POS: NUM, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Plur"},
- "NAmp4": {POS: NUM, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Plur"},
- "NAmp5": {POS: NUM, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Plur"},
- "NAmp6": {POS: NUM, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Plur"},
- "NAmp7": {POS: NUM, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Plur"},
- "NAms1": {POS: NUM, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Sing"},
- "NAms2": {POS: NUM, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Sing"},
- "NAms3": {POS: NUM, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Sing"},
- "NAms4": {POS: NUM, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Sing"},
- "NAms5": {POS: NUM, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Sing"},
- "NAms6": {POS: NUM, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Sing"},
- "NAms7": {POS: NUM, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Sing"},
- "NAnp1": {POS: NUM, "morph": "Case=Nom|Gender=Neut|MorphPos=Adj|Number=Plur"},
- "NAnp2": {POS: NUM, "morph": "Case=Gen|Gender=Neut|MorphPos=Adj|Number=Plur"},
- "NAnp3": {POS: NUM, "morph": "Case=Dat|Gender=Neut|MorphPos=Adj|Number=Plur"},
- "NAnp4": {POS: NUM, "morph": "Case=Acc|Gender=Neut|MorphPos=Adj|Number=Plur"},
- "NAnp5": {POS: NUM, "morph": "Case=Voc|Gender=Neut|MorphPos=Adj|Number=Plur"},
- "NAnp6": {POS: NUM, "morph": "Case=Loc|Gender=Neut|MorphPos=Adj|Number=Plur"},
- "NAnp7": {POS: NUM, "morph": "Case=Ins|Gender=Neut|MorphPos=Adj|Number=Plur"},
- "NAns1": {POS: NUM, "morph": "Case=Nom|Gender=Neut|MorphPos=Adj|Number=Sing"},
- "NAns2": {POS: NUM, "morph": "Case=Gen|Gender=Neut|MorphPos=Adj|Number=Sing"},
- "NAns3": {POS: NUM, "morph": "Case=Dat|Gender=Neut|MorphPos=Adj|Number=Sing"},
- "NAns4": {POS: NUM, "morph": "Case=Acc|Gender=Neut|MorphPos=Adj|Number=Sing"},
- "NAns5": {POS: NUM, "morph": "Case=Voc|Gender=Neut|MorphPos=Adj|Number=Sing"},
- "NAns6": {POS: NUM, "morph": "Case=Loc|Gender=Neut|MorphPos=Adj|Number=Sing"},
- "NAns7": {POS: NUM, "morph": "Case=Ins|Gender=Neut|MorphPos=Adj|Number=Sing"},
- "ND": {POS: NUM, "morph": "MorphPos=Adv"},
- "NFfp1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Mix|Number=Plur"},
- "NFfp2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Mix|Number=Plur"},
- "NFfp3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Mix|Number=Plur"},
- "NFfp4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Mix|Number=Plur"},
- "NFfp5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Mix|Number=Plur"},
- "NFfp6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Mix|Number=Plur"},
- "NFfp7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Mix|Number=Plur"},
- "NFfs1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Mix|Number=Sing"},
- "NFfs2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Mix|Number=Sing"},
- "NFfs3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Mix|Number=Sing"},
- "NFfs4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Mix|Number=Sing"},
- "NFfs5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Mix|Number=Sing"},
- "NFfs6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Mix|Number=Sing"},
- "NFfs7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Mix|Number=Sing"},
- "NFip1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Plur"},
- "NFip2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Plur"},
- "NFip3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Plur"},
- "NFip4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Plur"},
- "NFip5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Plur"},
- "NFip6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Plur"},
- "NFip7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Plur"},
- "NFis1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Sing"},
- "NFis2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Sing"},
- "NFis3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Sing"},
- "NFis4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Sing"},
- "NFis5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Sing"},
- "NFis6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Sing"},
- "NFis7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Sing"},
- "NFmp1": {POS: NUM, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Plur"},
- "NFmp2": {POS: NUM, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Plur"},
- "NFmp3": {POS: NUM, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Plur"},
- "NFmp4": {POS: NUM, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Plur"},
- "NFmp5": {POS: NUM, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Plur"},
- "NFmp6": {POS: NUM, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Plur"},
- "NFmp7": {POS: NUM, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Plur"},
- "NFms1": {POS: NUM, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Sing"},
- "NFms2": {POS: NUM, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Sing"},
- "NFms3": {POS: NUM, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Sing"},
- "NFms4": {POS: NUM, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Sing"},
- "NFms5": {POS: NUM, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Sing"},
- "NFms6": {POS: NUM, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Sing"},
- "NFms7": {POS: NUM, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Sing"},
- "NFnp1": {POS: NUM, "morph": "Case=Nom|Gender=Neut|MorphPos=Mix|Number=Plur"},
- "NFnp2": {POS: NUM, "morph": "Case=Gen|Gender=Neut|MorphPos=Mix|Number=Plur"},
- "NFnp3": {POS: NUM, "morph": "Case=Dat|Gender=Neut|MorphPos=Mix|Number=Plur"},
- "NFnp4": {POS: NUM, "morph": "Case=Acc|Gender=Neut|MorphPos=Mix|Number=Plur"},
- "NFnp5": {POS: NUM, "morph": "Case=Voc|Gender=Neut|MorphPos=Mix|Number=Plur"},
- "NFnp6": {POS: NUM, "morph": "Case=Loc|Gender=Neut|MorphPos=Mix|Number=Plur"},
- "NFnp7": {POS: NUM, "morph": "Case=Ins|Gender=Neut|MorphPos=Mix|Number=Plur"},
- "NFns1": {POS: NUM, "morph": "Case=Nom|Gender=Neut|MorphPos=Mix|Number=Sing"},
- "NFns2": {POS: NUM, "morph": "Case=Gen|Gender=Neut|MorphPos=Mix|Number=Sing"},
- "NFns3": {POS: NUM, "morph": "Case=Dat|Gender=Neut|MorphPos=Mix|Number=Sing"},
- "NFns4": {POS: NUM, "morph": "Case=Acc|Gender=Neut|MorphPos=Mix|Number=Sing"},
- "NFns5": {POS: NUM, "morph": "Case=Voc|Gender=Neut|MorphPos=Mix|Number=Sing"},
- "NFns6": {POS: NUM, "morph": "Case=Loc|Gender=Neut|MorphPos=Mix|Number=Sing"},
- "NFns7": {POS: NUM, "morph": "Case=Ins|Gender=Neut|MorphPos=Mix|Number=Sing"},
- "NNfp1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Num|Number=Plur"},
- "NNfp2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Num|Number=Plur"},
- "NNfp3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Num|Number=Plur"},
- "NNfp4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Num|Number=Plur"},
- "NNfp5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Num|Number=Plur"},
- "NNfp6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Num|Number=Plur"},
- "NNfp7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Num|Number=Plur"},
- "NNip1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Num|Number=Plur"},
- "NNip2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Num|Number=Plur"},
- "NNip3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Num|Number=Plur"},
- "NNip4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Num|Number=Plur"},
- "NNip5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Num|Number=Plur"},
- "NNip6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Num|Number=Plur"},
- "NNip7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Num|Number=Plur"},
- "NNmp1": {POS: NUM, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Num|Number=Plur"},
- "NNmp2": {POS: NUM, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Num|Number=Plur"},
- "NNmp3": {POS: NUM, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Num|Number=Plur"},
- "NNmp4": {POS: NUM, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Num|Number=Plur"},
- "NNmp5": {POS: NUM, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Num|Number=Plur"},
- "NNmp6": {POS: NUM, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Num|Number=Plur"},
- "NNmp7": {POS: NUM, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Num|Number=Plur"},
- "NNnp1": {POS: NUM, "morph": "Case=Nom|Gender=Neut|MorphPos=Num|Number=Plur"},
- "NNnp2": {POS: NUM, "morph": "Case=Gen|Gender=Neut|MorphPos=Num|Number=Plur"},
- "NNnp3": {POS: NUM, "morph": "Case=Dat|Gender=Neut|MorphPos=Num|Number=Plur"},
- "NNnp4": {POS: NUM, "morph": "Case=Acc|Gender=Neut|MorphPos=Num|Number=Plur"},
- "NNnp5": {POS: NUM, "morph": "Case=Voc|Gender=Neut|MorphPos=Num|Number=Plur"},
- "NNnp6": {POS: NUM, "morph": "Case=Loc|Gender=Neut|MorphPos=Num|Number=Plur"},
- "NNnp7": {POS: NUM, "morph": "Case=Ins|Gender=Neut|MorphPos=Num|Number=Plur"},
- "NSfp1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Noun|Number=Plur"},
- "NSfp2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Noun|Number=Plur"},
- "NSfp3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Noun|Number=Plur"},
- "NSfp4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Noun|Number=Plur"},
- "NSfp5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Noun|Number=Plur"},
- "NSfp6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Noun|Number=Plur"},
- "NSfp7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Noun|Number=Plur"},
- "NSfs1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Noun|Number=Sing"},
- "NSfs2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Noun|Number=Sing"},
- "NSfs3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Noun|Number=Sing"},
- "NSfs4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Noun|Number=Sing"},
- "NSfs5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Noun|Number=Sing"},
- "NSfs6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Noun|Number=Sing"},
- "NSfs7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Noun|Number=Sing"},
- "NSip1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Noun|Number=Plur"},
- "NSip2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Noun|Number=Plur"},
- "NSip3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Noun|Number=Plur"},
- "NSip4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Noun|Number=Plur"},
- "NSip5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Noun|Number=Plur"},
- "NSip6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Noun|Number=Plur"},
- "NSip7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Noun|Number=Plur"},
- "NSis1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Noun|Number=Sing"},
- "NSis2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Noun|Number=Sing"},
- "NSis3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Noun|Number=Sing"},
- "NSis4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Noun|Number=Sing"},
- "NSis5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Noun|Number=Sing"},
- "NSis6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Noun|Number=Sing"},
- "NSis7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Noun|Number=Sing"},
- "NUfp1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Def|Number=Plur"},
- "NUfp2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Def|Number=Plur"},
- "NUfp3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Def|Number=Plur"},
- "NUfp4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Def|Number=Plur"},
- "NUfp5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Def|Number=Plur"},
- "NUfp6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Def|Number=Plur"},
- "NUfp7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Def|Number=Plur"},
- "NUip1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Def|Number=Plur"},
- "NUip2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Def|Number=Plur"},
- "NUip3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Def|Number=Plur"},
- "NUip4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Def|Number=Plur"},
- "NUip5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Def|Number=Plur"},
- "NUip6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Def|Number=Plur"},
- "NUip7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Def|Number=Plur"},
- "NUis1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Def|Number=Sing"},
- "NUis2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Def|Number=Sing"},
- "NUis3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Def|Number=Sing"},
- "NUis4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Def|Number=Sing"},
- "NUis5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Def|Number=Sing"},
- "NUis6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Def|Number=Sing"},
- "NUis7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Def|Number=Sing"},
- "NUmp1": {POS: NUM, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Def|Number=Plur"},
- "NUmp2": {POS: NUM, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Def|Number=Plur"},
- "NUmp3": {POS: NUM, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Def|Number=Plur"},
- "NUmp4": {POS: NUM, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Def|Number=Plur"},
- "NUmp5": {POS: NUM, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Def|Number=Plur"},
- "NUmp6": {POS: NUM, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Def|Number=Plur"},
- "NUmp7": {POS: NUM, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Def|Number=Plur"},
- "NUnp1": {POS: NUM, "morph": "Case=Nom|Gender=Neut|MorphPos=Def|Number=Plur"},
- "NUnp2": {POS: NUM, "morph": "Case=Gen|Gender=Neut|MorphPos=Def|Number=Plur"},
- "NUnp3": {POS: NUM, "morph": "Case=Dat|Gender=Neut|MorphPos=Def|Number=Plur"},
- "NUnp4": {POS: NUM, "morph": "Case=Acc|Gender=Neut|MorphPos=Def|Number=Plur"},
- "NUnp5": {POS: NUM, "morph": "Case=Voc|Gender=Neut|MorphPos=Def|Number=Plur"},
- "NUnp6": {POS: NUM, "morph": "Case=Loc|Gender=Neut|MorphPos=Def|Number=Plur"},
- "NUnp7": {POS: NUM, "morph": "Case=Ins|Gender=Neut|MorphPos=Def|Number=Plur"},
- "NUns1": {POS: NUM, "morph": "Case=Nom|Gender=Neut|MorphPos=Def|Number=Sing"},
- "NUns2": {POS: NUM, "morph": "Case=Gen|Gender=Neut|MorphPos=Def|Number=Sing"},
- "NUns3": {POS: NUM, "morph": "Case=Dat|Gender=Neut|MorphPos=Def|Number=Sing"},
- "NUns4": {POS: NUM, "morph": "Case=Acc|Gender=Neut|MorphPos=Def|Number=Sing"},
- "NUns5": {POS: NUM, "morph": "Case=Voc|Gender=Neut|MorphPos=Def|Number=Sing"},
- "NUns6": {POS: NUM, "morph": "Case=Loc|Gender=Neut|MorphPos=Def|Number=Sing"},
- "NUns7": {POS: NUM, "morph": "Case=Ins|Gender=Neut|MorphPos=Def|Number=Sing"},
- "O": {POS: CCONJ, "morph": "_"},
- "OY": {POS: CCONJ, "morph": "Mood=Cnd"},
- "PAfp1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Adj|Number=Plur|PronType=Prs"},
- "PAfp2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Adj|Number=Plur|PronType=Prs"},
- "PAfp3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Adj|Number=Plur|PronType=Prs"},
- "PAfp4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Adj|Number=Plur|PronType=Prs"},
- "PAfp5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Adj|Number=Plur|PronType=Prs"},
- "PAfp6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Adj|Number=Plur|PronType=Prs"},
- "PAfp7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Adj|Number=Plur|PronType=Prs"},
- "PAfs1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Adj|Number=Sing|PronType=Prs"},
- "PAfs2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Adj|Number=Sing|PronType=Prs"},
- "PAfs3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Adj|Number=Sing|PronType=Prs"},
- "PAfs4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Adj|Number=Sing|PronType=Prs"},
- "PAfs5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Adj|Number=Sing|PronType=Prs"},
- "PAfs6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Adj|Number=Sing|PronType=Prs"},
- "PAfs7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Adj|Number=Sing|PronType=Prs"},
- "PAip1": {POS: PRON, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"},
- "PAip2": {POS: PRON, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"},
- "PAip3": {POS: PRON, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"},
- "PAip4": {POS: PRON, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"},
- "PAip5": {POS: PRON, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"},
- "PAip6": {POS: PRON, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"},
- "PAip7": {POS: PRON, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"},
- "PAis1": {POS: PRON, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"},
- "PAis2": {POS: PRON, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"},
- "PAis3": {POS: PRON, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"},
- "PAis4": {POS: PRON, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"},
- "PAis5": {POS: PRON, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"},
- "PAis6": {POS: PRON, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"},
- "PAis7": {POS: PRON, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"},
- "PAmp1": {POS: PRON, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"},
- "PAmp2": {POS: PRON, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"},
- "PAmp3": {POS: PRON, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"},
- "PAmp4": {POS: PRON, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"},
- "PAmp5": {POS: PRON, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"},
- "PAmp6": {POS: PRON, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"},
- "PAmp7": {POS: PRON, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"},
- "PAms1": {POS: PRON, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"},
- "PAms2": {POS: PRON, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"},
- "PAms3": {POS: PRON, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"},
- "PAms4": {POS: PRON, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"},
- "PAms5": {POS: PRON, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"},
- "PAms6": {POS: PRON, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"},
- "PAms7": {POS: PRON, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"},
- "PAnp1": {POS: PRON, "morph": "Case=Nom|Gender=Neut|MorphPos=Adj|Number=Plur|PronType=Prs"},
- "PAnp2": {POS: PRON, "morph": "Case=Gen|Gender=Neut|MorphPos=Adj|Number=Plur|PronType=Prs"},
- "PAnp3": {POS: PRON, "morph": "Case=Dat|Gender=Neut|MorphPos=Adj|Number=Plur|PronType=Prs"},
- "PAnp4": {POS: PRON, "morph": "Case=Acc|Gender=Neut|MorphPos=Adj|Number=Plur|PronType=Prs"},
- "PAnp5": {POS: PRON, "morph": "Case=Voc|Gender=Neut|MorphPos=Adj|Number=Plur|PronType=Prs"},
- "PAnp6": {POS: PRON, "morph": "Case=Loc|Gender=Neut|MorphPos=Adj|Number=Plur|PronType=Prs"},
- "PAnp7": {POS: PRON, "morph": "Case=Ins|Gender=Neut|MorphPos=Adj|Number=Plur|PronType=Prs"},
- "PAns1": {POS: PRON, "morph": "Case=Nom|Gender=Neut|MorphPos=Adj|Number=Sing|PronType=Prs"},
- "PAns2": {POS: PRON, "morph": "Case=Gen|Gender=Neut|MorphPos=Adj|Number=Sing|PronType=Prs"},
- "PAns3": {POS: PRON, "morph": "Case=Dat|Gender=Neut|MorphPos=Adj|Number=Sing|PronType=Prs"},
- "PAns4": {POS: PRON, "morph": "Case=Acc|Gender=Neut|MorphPos=Adj|Number=Sing|PronType=Prs"},
- "PAns5": {POS: PRON, "morph": "Case=Voc|Gender=Neut|MorphPos=Adj|Number=Sing|PronType=Prs"},
- "PAns6": {POS: PRON, "morph": "Case=Loc|Gender=Neut|MorphPos=Adj|Number=Sing|PronType=Prs"},
- "PAns7": {POS: PRON, "morph": "Case=Ins|Gender=Neut|MorphPos=Adj|Number=Sing|PronType=Prs"},
- "PD": {POS: PRON, "morph": "MorphPos=Adv|PronType=Prs"},
- "PFfp1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Mix|Number=Plur|PronType=Prs"},
- "PFfp2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Mix|Number=Plur|PronType=Prs"},
- "PFfp3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Mix|Number=Plur|PronType=Prs"},
- "PFfp4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Mix|Number=Plur|PronType=Prs"},
- "PFfp5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Mix|Number=Plur|PronType=Prs"},
- "PFfp6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Mix|Number=Plur|PronType=Prs"},
- "PFfp7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Mix|Number=Plur|PronType=Prs"},
- "PFfs1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Mix|Number=Sing|PronType=Prs"},
- "PFfs2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Mix|Number=Sing|PronType=Prs"},
- "PFfs3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Mix|Number=Sing|PronType=Prs"},
- "PFfs4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Mix|Number=Sing|PronType=Prs"},
- "PFfs5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Mix|Number=Sing|PronType=Prs"},
- "PFfs6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Mix|Number=Sing|PronType=Prs"},
- "PFfs7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Mix|Number=Sing|PronType=Prs"},
- "PFip1": {POS: PRON, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"},
- "PFip2": {POS: PRON, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"},
- "PFip3": {POS: PRON, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"},
- "PFip4": {POS: PRON, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"},
- "PFip5": {POS: PRON, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"},
- "PFip6": {POS: PRON, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"},
- "PFip7": {POS: PRON, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"},
- "PFis1": {POS: PRON, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"},
- "PFis2": {POS: PRON, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"},
- "PFis2g": {POS: PRON, "morph": "AdpType=Preppron|Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"},
- "PFis3": {POS: PRON, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"},
- "PFis4": {POS: PRON, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"},
- "PFis4g": {POS: PRON, "morph": "AdpType=Preppron|Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"},
- "PFis5": {POS: PRON, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"},
- "PFis6": {POS: PRON, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"},
- "PFis7": {POS: PRON, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"},
- "PFmp1": {POS: PRON, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"},
- "PFmp2": {POS: PRON, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"},
- "PFmp3": {POS: PRON, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"},
- "PFmp4": {POS: PRON, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"},
- "PFmp5": {POS: PRON, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"},
- "PFmp6": {POS: PRON, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"},
- "PFmp7": {POS: PRON, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"},
- "PFms1": {POS: PRON, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"},
- "PFms2": {POS: PRON, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"},
- "PFms2g": {POS: PRON, "morph": "AdpType=Preppron|Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"},
- "PFms3": {POS: PRON, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"},
- "PFms4": {POS: PRON, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"},
- "PFms4g": {POS: PRON, "morph": "AdpType=Preppron|Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"},
- "PFms5": {POS: PRON, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"},
- "PFms6": {POS: PRON, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"},
- "PFms7": {POS: PRON, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"},
- "PFnp1": {POS: PRON, "morph": "Case=Nom|Gender=Neut|MorphPos=Mix|Number=Plur|PronType=Prs"},
- "PFnp2": {POS: PRON, "morph": "Case=Gen|Gender=Neut|MorphPos=Mix|Number=Plur|PronType=Prs"},
- "PFnp3": {POS: PRON, "morph": "Case=Dat|Gender=Neut|MorphPos=Mix|Number=Plur|PronType=Prs"},
- "PFnp4": {POS: PRON, "morph": "Case=Acc|Gender=Neut|MorphPos=Mix|Number=Plur|PronType=Prs"},
- "PFnp5": {POS: PRON, "morph": "Case=Voc|Gender=Neut|MorphPos=Mix|Number=Plur|PronType=Prs"},
- "PFnp6": {POS: PRON, "morph": "Case=Loc|Gender=Neut|MorphPos=Mix|Number=Plur|PronType=Prs"},
- "PFnp7": {POS: PRON, "morph": "Case=Ins|Gender=Neut|MorphPos=Mix|Number=Plur|PronType=Prs"},
- "PFns1": {POS: PRON, "morph": "Case=Nom|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"},
- "PFns2": {POS: PRON, "morph": "Case=Gen|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"},
- "PFns2g": {POS: PRON, "morph": "AdpType=Preppron|Case=Gen|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"},
- "PFns3": {POS: PRON, "morph": "Case=Dat|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"},
- "PFns4": {POS: PRON, "morph": "Case=Acc|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"},
- "PFns4g": {POS: PRON, "morph": "AdpType=Preppron|Case=Acc|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"},
- "PFns5": {POS: PRON, "morph": "Case=Voc|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"},
- "PFns6": {POS: PRON, "morph": "Case=Loc|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"},
- "PFns7": {POS: PRON, "morph": "Case=Ins|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"},
- "PPhp1": {POS: PRON, "morph": "Case=Nom|MorphPos=Pron|Number=Plur|PronType=Prs"},
- "PPhp2": {POS: PRON, "morph": "Case=Gen|MorphPos=Pron|Number=Plur|PronType=Prs"},
- "PPhp3": {POS: PRON, "morph": "Case=Dat|MorphPos=Pron|Number=Plur|PronType=Prs"},
- "PPhp4": {POS: PRON, "morph": "Case=Acc|MorphPos=Pron|Number=Plur|PronType=Prs"},
- "PPhp5": {POS: PRON, "morph": "Case=Voc|MorphPos=Pron|Number=Plur|PronType=Prs"},
- "PPhp6": {POS: PRON, "morph": "Case=Loc|MorphPos=Pron|Number=Plur|PronType=Prs"},
- "PPhp7": {POS: PRON, "morph": "Case=Ins|MorphPos=Pron|Number=Plur|PronType=Prs"},
- "PPhs1": {POS: PRON, "morph": "Case=Nom|MorphPos=Pron|Number=Sing|PronType=Prs"},
- "PPhs2": {POS: PRON, "morph": "Case=Gen|MorphPos=Pron|Number=Sing|PronType=Prs"},
- "PPhs3": {POS: PRON, "morph": "Case=Dat|MorphPos=Pron|Number=Sing|PronType=Prs"},
- "PPhs4": {POS: PRON, "morph": "Case=Acc|MorphPos=Pron|Number=Sing|PronType=Prs"},
- "PPhs5": {POS: PRON, "morph": "Case=Voc|MorphPos=Pron|Number=Sing|PronType=Prs"},
- "PPhs6": {POS: PRON, "morph": "Case=Loc|MorphPos=Pron|Number=Sing|PronType=Prs"},
- "PPhs7": {POS: PRON, "morph": "Case=Ins|MorphPos=Pron|Number=Sing|PronType=Prs"},
- "PSfp1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Noun|Number=Plur|PronType=Prs"},
- "PSfp2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Noun|Number=Plur|PronType=Prs"},
- "PSfp3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Noun|Number=Plur|PronType=Prs"},
- "PSfp4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Noun|Number=Plur|PronType=Prs"},
- "PSfp5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Noun|Number=Plur|PronType=Prs"},
- "PSfp6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Noun|Number=Plur|PronType=Prs"},
- "PSfp7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Noun|Number=Plur|PronType=Prs"},
- "PSfs1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Noun|Number=Sing|PronType=Prs"},
- "PSfs2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Noun|Number=Sing|PronType=Prs"},
- "PSfs3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Noun|Number=Sing|PronType=Prs"},
- "PSfs4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Noun|Number=Sing|PronType=Prs"},
- "PSfs5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Noun|Number=Sing|PronType=Prs"},
- "PSfs6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Noun|Number=Sing|PronType=Prs"},
- "PSfs7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Noun|Number=Sing|PronType=Prs"},
- "PSns1": {POS: PRON, "morph": "Case=Nom|Gender=Neut|MorphPos=Noun|Number=Sing|PronType=Prs"},
- "PSns2": {POS: PRON, "morph": "Case=Gen|Gender=Neut|MorphPos=Noun|Number=Sing|PronType=Prs"},
- "PSns3": {POS: PRON, "morph": "Case=Dat|Gender=Neut|MorphPos=Noun|Number=Sing|PronType=Prs"},
- "PSns4": {POS: PRON, "morph": "Case=Acc|Gender=Neut|MorphPos=Noun|Number=Sing|PronType=Prs"},
- "PSns5": {POS: PRON, "morph": "Case=Voc|Gender=Neut|MorphPos=Noun|Number=Sing|PronType=Prs"},
- "PSns6": {POS: PRON, "morph": "Case=Loc|Gender=Neut|MorphPos=Noun|Number=Sing|PronType=Prs"},
- "PSns7": {POS: PRON, "morph": "Case=Ins|Gender=Neut|MorphPos=Noun|Number=Sing|PronType=Prs"},
- "PUfp1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Def|Number=Plur|PronType=Prs"},
- "PUfp2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Def|Number=Plur|PronType=Prs"},
- "PUfp3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Def|Number=Plur|PronType=Prs"},
- "PUfp4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Def|Number=Plur|PronType=Prs"},
- "PUfp5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Def|Number=Plur|PronType=Prs"},
- "PUfp6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Def|Number=Plur|PronType=Prs"},
- "PUfp7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Def|Number=Plur|PronType=Prs"},
- "PUfs1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Def|Number=Sing|PronType=Prs"},
- "PUfs2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Def|Number=Sing|PronType=Prs"},
- "PUfs3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Def|Number=Sing|PronType=Prs"},
- "PUfs4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Def|Number=Sing|PronType=Prs"},
- "PUfs5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Def|Number=Sing|PronType=Prs"},
- "PUfs6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Def|Number=Sing|PronType=Prs"},
- "PUfs7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Def|Number=Sing|PronType=Prs"},
- "PUip1": {POS: PRON, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"},
- "PUip2": {POS: PRON, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"},
- "PUip3": {POS: PRON, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"},
- "PUip4": {POS: PRON, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"},
- "PUip5": {POS: PRON, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"},
- "PUip6": {POS: PRON, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"},
- "PUip7": {POS: PRON, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"},
- "PUis1": {POS: PRON, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"},
- "PUis2": {POS: PRON, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"},
- "PUis3": {POS: PRON, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"},
- "PUis4": {POS: PRON, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"},
- "PUis5": {POS: PRON, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"},
- "PUis6": {POS: PRON, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"},
- "PUis7": {POS: PRON, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"},
- "PUmp1": {POS: PRON, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"},
- "PUmp2": {POS: PRON, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"},
- "PUmp3": {POS: PRON, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"},
- "PUmp4": {POS: PRON, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"},
- "PUmp5": {POS: PRON, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"},
- "PUmp6": {POS: PRON, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"},
- "PUmp7": {POS: PRON, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"},
- "PUms1": {POS: PRON, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"},
- "PUms2": {POS: PRON, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"},
- "PUms3": {POS: PRON, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"},
- "PUms4": {POS: PRON, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"},
- "PUms5": {POS: PRON, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"},
- "PUms6": {POS: PRON, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"},
- "PUms7": {POS: PRON, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"},
- "PUnp1": {POS: PRON, "morph": "Case=Nom|Gender=Neut|MorphPos=Def|Number=Plur|PronType=Prs"},
- "PUnp2": {POS: PRON, "morph": "Case=Gen|Gender=Neut|MorphPos=Def|Number=Plur|PronType=Prs"},
- "PUnp3": {POS: PRON, "morph": "Case=Dat|Gender=Neut|MorphPos=Def|Number=Plur|PronType=Prs"},
- "PUnp4": {POS: PRON, "morph": "Case=Acc|Gender=Neut|MorphPos=Def|Number=Plur|PronType=Prs"},
- "PUnp5": {POS: PRON, "morph": "Case=Voc|Gender=Neut|MorphPos=Def|Number=Plur|PronType=Prs"},
- "PUnp6": {POS: PRON, "morph": "Case=Loc|Gender=Neut|MorphPos=Def|Number=Plur|PronType=Prs"},
- "PUnp7": {POS: PRON, "morph": "Case=Ins|Gender=Neut|MorphPos=Def|Number=Plur|PronType=Prs"},
- "PUns1": {POS: PRON, "morph": "Case=Nom|Gender=Neut|MorphPos=Def|Number=Sing|PronType=Prs"},
- "PUns2": {POS: PRON, "morph": "Case=Gen|Gender=Neut|MorphPos=Def|Number=Sing|PronType=Prs"},
- "PUns3": {POS: PRON, "morph": "Case=Dat|Gender=Neut|MorphPos=Def|Number=Sing|PronType=Prs"},
- "PUns4": {POS: PRON, "morph": "Case=Acc|Gender=Neut|MorphPos=Def|Number=Sing|PronType=Prs"},
- "PUns5": {POS: PRON, "morph": "Case=Voc|Gender=Neut|MorphPos=Def|Number=Sing|PronType=Prs"},
- "PUns6": {POS: PRON, "morph": "Case=Loc|Gender=Neut|MorphPos=Def|Number=Sing|PronType=Prs"},
- "PUns7": {POS: PRON, "morph": "Case=Ins|Gender=Neut|MorphPos=Def|Number=Sing|PronType=Prs"},
- "Q": {POS: X, "morph": "Hyph=Yes"},
- "R": {POS: PRON, "morph": "PronType=Prs|Reflex=Yes"},
- "SAfp1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Adj|Number=Plur"},
- "SAfp2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Adj|Number=Plur"},
- "SAfp3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Adj|Number=Plur"},
- "SAfp4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Adj|Number=Plur"},
- "SAfp5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Adj|Number=Plur"},
- "SAfp6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Adj|Number=Plur"},
- "SAfp7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Adj|Number=Plur"},
- "SAfs1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Adj|Number=Sing"},
- "SAfs2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Adj|Number=Sing"},
- "SAfs3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Adj|Number=Sing"},
- "SAfs4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Adj|Number=Sing"},
- "SAfs5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Adj|Number=Sing"},
- "SAfs6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Adj|Number=Sing"},
- "SAfs7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Adj|Number=Sing"},
- "SAip1": {POS: NOUN, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Plur"},
- "SAip2": {POS: NOUN, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Plur"},
- "SAip3": {POS: NOUN, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Plur"},
- "SAip4": {POS: NOUN, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Plur"},
- "SAip5": {POS: NOUN, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Plur"},
- "SAip6": {POS: NOUN, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Plur"},
- "SAip7": {POS: NOUN, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Plur"},
- "SAis1": {POS: NOUN, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Sing"},
- "SAis2": {POS: NOUN, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Sing"},
- "SAis3": {POS: NOUN, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Sing"},
- "SAis4": {POS: NOUN, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Sing"},
- "SAis5": {POS: NOUN, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Sing"},
- "SAis6": {POS: NOUN, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Sing"},
- "SAis7": {POS: NOUN, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Sing"},
- "SAmp1": {POS: NOUN, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Plur"},
- "SAmp2": {POS: NOUN, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Plur"},
- "SAmp3": {POS: NOUN, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Plur"},
- "SAmp4": {POS: NOUN, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Plur"},
- "SAmp5": {POS: NOUN, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Plur"},
- "SAmp6": {POS: NOUN, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Plur"},
- "SAmp7": {POS: NOUN, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Plur"},
- "SAms1": {POS: NOUN, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Sing"},
- "SAms2": {POS: NOUN, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Sing"},
- "SAms3": {POS: NOUN, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Sing"},
- "SAms4": {POS: NOUN, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Sing"},
- "SAms5": {POS: NOUN, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Sing"},
- "SAms6": {POS: NOUN, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Sing"},
- "SAms7": {POS: NOUN, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Sing"},
- "SAnp1": {POS: NOUN, "morph": "Case=Nom|Gender=Neut|MorphPos=Adj|Number=Plur"},
- "SAnp2": {POS: NOUN, "morph": "Case=Gen|Gender=Neut|MorphPos=Adj|Number=Plur"},
- "SAnp3": {POS: NOUN, "morph": "Case=Dat|Gender=Neut|MorphPos=Adj|Number=Plur"},
- "SAnp4": {POS: NOUN, "morph": "Case=Acc|Gender=Neut|MorphPos=Adj|Number=Plur"},
- "SAnp5": {POS: NOUN, "morph": "Case=Voc|Gender=Neut|MorphPos=Adj|Number=Plur"},
- "SAnp6": {POS: NOUN, "morph": "Case=Loc|Gender=Neut|MorphPos=Adj|Number=Plur"},
- "SAnp7": {POS: NOUN, "morph": "Case=Ins|Gender=Neut|MorphPos=Adj|Number=Plur"},
- "SAns1": {POS: NOUN, "morph": "Case=Nom|Gender=Neut|MorphPos=Adj|Number=Sing"},
- "SAns2": {POS: NOUN, "morph": "Case=Gen|Gender=Neut|MorphPos=Adj|Number=Sing"},
- "SAns3": {POS: NOUN, "morph": "Case=Dat|Gender=Neut|MorphPos=Adj|Number=Sing"},
- "SAns4": {POS: NOUN, "morph": "Case=Acc|Gender=Neut|MorphPos=Adj|Number=Sing"},
- "SAns5": {POS: NOUN, "morph": "Case=Voc|Gender=Neut|MorphPos=Adj|Number=Sing"},
- "SAns6": {POS: NOUN, "morph": "Case=Loc|Gender=Neut|MorphPos=Adj|Number=Sing"},
- "SAns7": {POS: NOUN, "morph": "Case=Ins|Gender=Neut|MorphPos=Adj|Number=Sing"},
- "SFfp1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Mix|Number=Plur"},
- "SFfp2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Mix|Number=Plur"},
- "SFfp3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Mix|Number=Plur"},
- "SFfp4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Mix|Number=Plur"},
- "SFfp5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Mix|Number=Plur"},
- "SFfp6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Mix|Number=Plur"},
- "SFfp7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Mix|Number=Plur"},
- "SFfs1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Mix|Number=Sing"},
- "SFfs2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Mix|Number=Sing"},
- "SFfs3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Mix|Number=Sing"},
- "SFfs4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Mix|Number=Sing"},
- "SFfs5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Mix|Number=Sing"},
- "SFfs6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Mix|Number=Sing"},
- "SFfs7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Mix|Number=Sing"},
- "SSfp1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Noun|Number=Plur"},
- "SSfp2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Noun|Number=Plur"},
- "SSfp3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Noun|Number=Plur"},
- "SSfp4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Noun|Number=Plur"},
- "SSfp5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Noun|Number=Plur"},
- "SSfp6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Noun|Number=Plur"},
- "SSfp7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Noun|Number=Plur"},
- "SSfs1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Noun|Number=Sing"},
- "SSfs2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Noun|Number=Sing"},
- "SSfs3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Noun|Number=Sing"},
- "SSfs4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Noun|Number=Sing"},
- "SSfs5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Noun|Number=Sing"},
- "SSfs6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Noun|Number=Sing"},
- "SSfs7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Noun|Number=Sing"},
- "SSip1": {POS: NOUN, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Noun|Number=Plur"},
- "SSip2": {POS: NOUN, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Noun|Number=Plur"},
- "SSip3": {POS: NOUN, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Noun|Number=Plur"},
- "SSip4": {POS: NOUN, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Noun|Number=Plur"},
- "SSip5": {POS: NOUN, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Noun|Number=Plur"},
- "SSip6": {POS: NOUN, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Noun|Number=Plur"},
- "SSip7": {POS: NOUN, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Noun|Number=Plur"},
- "SSis1": {POS: NOUN, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Noun|Number=Sing"},
- "SSis2": {POS: NOUN, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Noun|Number=Sing"},
- "SSis3": {POS: NOUN, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Noun|Number=Sing"},
- "SSis4": {POS: NOUN, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Noun|Number=Sing"},
- "SSis5": {POS: NOUN, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Noun|Number=Sing"},
- "SSis6": {POS: NOUN, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Noun|Number=Sing"},
- "SSis7": {POS: NOUN, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Noun|Number=Sing"},
- "SSmp1": {POS: NOUN, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Noun|Number=Plur"},
- "SSmp2": {POS: NOUN, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Noun|Number=Plur"},
- "SSmp3": {POS: NOUN, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Noun|Number=Plur"},
- "SSmp4": {POS: NOUN, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Noun|Number=Plur"},
- "SSmp5": {POS: NOUN, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Noun|Number=Plur"},
- "SSmp6": {POS: NOUN, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Noun|Number=Plur"},
- "SSmp7": {POS: NOUN, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Noun|Number=Plur"},
- "SSms1": {POS: NOUN, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Noun|Number=Sing"},
- "SSms2": {POS: NOUN, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Noun|Number=Sing"},
- "SSms3": {POS: NOUN, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Noun|Number=Sing"},
- "SSms4": {POS: NOUN, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Noun|Number=Sing"},
- "SSms5": {POS: NOUN, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Noun|Number=Sing"},
- "SSms6": {POS: NOUN, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Noun|Number=Sing"},
- "SSms7": {POS: NOUN, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Noun|Number=Sing"},
- "SSnp1": {POS: NOUN, "morph": "Case=Nom|Gender=Neut|MorphPos=Noun|Number=Plur"},
- "SSnp2": {POS: NOUN, "morph": "Case=Gen|Gender=Neut|MorphPos=Noun|Number=Plur"},
- "SSnp3": {POS: NOUN, "morph": "Case=Dat|Gender=Neut|MorphPos=Noun|Number=Plur"},
- "SSnp4": {POS: NOUN, "morph": "Case=Acc|Gender=Neut|MorphPos=Noun|Number=Plur"},
- "SSnp5": {POS: NOUN, "morph": "Case=Voc|Gender=Neut|MorphPos=Noun|Number=Plur"},
- "SSnp6": {POS: NOUN, "morph": "Case=Loc|Gender=Neut|MorphPos=Noun|Number=Plur"},
- "SSnp7": {POS: NOUN, "morph": "Case=Ins|Gender=Neut|MorphPos=Noun|Number=Plur"},
- "SSns1": {POS: NOUN, "morph": "Case=Nom|Gender=Neut|MorphPos=Noun|Number=Sing"},
- "SSns2": {POS: NOUN, "morph": "Case=Gen|Gender=Neut|MorphPos=Noun|Number=Sing"},
- "SSns3": {POS: NOUN, "morph": "Case=Dat|Gender=Neut|MorphPos=Noun|Number=Sing"},
- "SSns4": {POS: NOUN, "morph": "Case=Acc|Gender=Neut|MorphPos=Noun|Number=Sing"},
- "SSns5": {POS: NOUN, "morph": "Case=Voc|Gender=Neut|MorphPos=Noun|Number=Sing"},
- "SSns6": {POS: NOUN, "morph": "Case=Loc|Gender=Neut|MorphPos=Noun|Number=Sing"},
- "SSns7": {POS: NOUN, "morph": "Case=Ins|Gender=Neut|MorphPos=Noun|Number=Sing"},
- "SUfp1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Def|Number=Plur"},
- "SUfp2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Def|Number=Plur"},
- "SUfp3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Def|Number=Plur"},
- "SUfp4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Def|Number=Plur"},
- "SUfp5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Def|Number=Plur"},
- "SUfp6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Def|Number=Plur"},
- "SUfp7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Def|Number=Plur"},
- "SUfs1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Def|Number=Sing"},
- "SUfs2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Def|Number=Sing"},
- "SUfs3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Def|Number=Sing"},
- "SUfs4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Def|Number=Sing"},
- "SUfs5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Def|Number=Sing"},
- "SUfs6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Def|Number=Sing"},
- "SUfs7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Def|Number=Sing"},
- "SUip1": {POS: NOUN, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Def|Number=Plur"},
- "SUip2": {POS: NOUN, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Def|Number=Plur"},
- "SUip3": {POS: NOUN, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Def|Number=Plur"},
- "SUip4": {POS: NOUN, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Def|Number=Plur"},
- "SUip5": {POS: NOUN, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Def|Number=Plur"},
- "SUip6": {POS: NOUN, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Def|Number=Plur"},
- "SUip7": {POS: NOUN, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Def|Number=Plur"},
- "SUis1": {POS: NOUN, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Def|Number=Sing"},
- "SUis2": {POS: NOUN, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Def|Number=Sing"},
- "SUis3": {POS: NOUN, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Def|Number=Sing"},
- "SUis4": {POS: NOUN, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Def|Number=Sing"},
- "SUis5": {POS: NOUN, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Def|Number=Sing"},
- "SUis6": {POS: NOUN, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Def|Number=Sing"},
- "SUis7": {POS: NOUN, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Def|Number=Sing"},
- "SUmp1": {POS: NOUN, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Def|Number=Plur"},
- "SUmp2": {POS: NOUN, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Def|Number=Plur"},
- "SUmp3": {POS: NOUN, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Def|Number=Plur"},
- "SUmp4": {POS: NOUN, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Def|Number=Plur"},
- "SUmp5": {POS: NOUN, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Def|Number=Plur"},
- "SUmp6": {POS: NOUN, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Def|Number=Plur"},
- "SUmp7": {POS: NOUN, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Def|Number=Plur"},
- "SUms1": {POS: NOUN, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Def|Number=Sing"},
- "SUms2": {POS: NOUN, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Def|Number=Sing"},
- "SUms3": {POS: NOUN, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Def|Number=Sing"},
- "SUms4": {POS: NOUN, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Def|Number=Sing"},
- "SUms5": {POS: NOUN, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Def|Number=Sing"},
- "SUms6": {POS: NOUN, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Def|Number=Sing"},
- "SUms7": {POS: NOUN, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Def|Number=Sing"},
- "SUnp1": {POS: NOUN, "morph": "Case=Nom|Gender=Neut|MorphPos=Def|Number=Plur"},
- "SUnp2": {POS: NOUN, "morph": "Case=Gen|Gender=Neut|MorphPos=Def|Number=Plur"},
- "SUnp3": {POS: NOUN, "morph": "Case=Dat|Gender=Neut|MorphPos=Def|Number=Plur"},
- "SUnp4": {POS: NOUN, "morph": "Case=Acc|Gender=Neut|MorphPos=Def|Number=Plur"},
- "SUnp5": {POS: NOUN, "morph": "Case=Voc|Gender=Neut|MorphPos=Def|Number=Plur"},
- "SUnp6": {POS: NOUN, "morph": "Case=Loc|Gender=Neut|MorphPos=Def|Number=Plur"},
- "SUnp7": {POS: NOUN, "morph": "Case=Ins|Gender=Neut|MorphPos=Def|Number=Plur"},
- "SUns1": {POS: NOUN, "morph": "Case=Nom|Gender=Neut|MorphPos=Def|Number=Sing"},
- "SUns2": {POS: NOUN, "morph": "Case=Gen|Gender=Neut|MorphPos=Def|Number=Sing"},
- "SUns3": {POS: NOUN, "morph": "Case=Dat|Gender=Neut|MorphPos=Def|Number=Sing"},
- "SUns4": {POS: NOUN, "morph": "Case=Acc|Gender=Neut|MorphPos=Def|Number=Sing"},
- "SUns5": {POS: NOUN, "morph": "Case=Voc|Gender=Neut|MorphPos=Def|Number=Sing"},
- "SUns6": {POS: NOUN, "morph": "Case=Loc|Gender=Neut|MorphPos=Def|Number=Sing"},
- "SUns7": {POS: NOUN, "morph": "Case=Ins|Gender=Neut|MorphPos=Def|Number=Sing"},
- "T": {POS: PART, "morph": "_"},
- "TY": {POS: PART, "morph": "Mood=Cnd"},
- "VBepa-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=1|Polarity=Neg|Tense=Fut|VerbForm=Fin"},
- "VBepa+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=1|Polarity=Pos|Tense=Fut|VerbForm=Fin"},
- "VBepb-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=2|Polarity=Neg|Tense=Fut|VerbForm=Fin"},
- "VBepb+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=2|Polarity=Pos|Tense=Fut|VerbForm=Fin"},
- "VBepc-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=3|Polarity=Neg|Tense=Fut|VerbForm=Fin"},
- "VBepc+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=3|Polarity=Pos|Tense=Fut|VerbForm=Fin"},
- "VBesa-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Fut|VerbForm=Fin"},
- "VBesa+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Polarity=Pos|Tense=Fut|VerbForm=Fin"},
- "VBesb-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Polarity=Neg|Tense=Fut|VerbForm=Fin"},
- "VBesb+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Polarity=Pos|Tense=Fut|VerbForm=Fin"},
- "VBesc-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Polarity=Neg|Tense=Fut|VerbForm=Fin"},
- "VBesc+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Tense=Fut|VerbForm=Fin"},
- "VBjpa-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=1|Polarity=Neg|Tense=Fut|VerbForm=Fin"},
- "VBjpa+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=1|Polarity=Pos|Tense=Fut|VerbForm=Fin"},
- "VBjpb-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=2|Polarity=Neg|Tense=Fut|VerbForm=Fin"},
- "VBjpb+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=2|Polarity=Pos|Tense=Fut|VerbForm=Fin"},
- "VBjpc-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=3|Polarity=Neg|Tense=Fut|VerbForm=Fin"},
- "VBjpc+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=3|Polarity=Pos|Tense=Fut|VerbForm=Fin"},
- "VBjsa-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Fut|VerbForm=Fin"},
- "VBjsa+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=1|Polarity=Pos|Tense=Fut|VerbForm=Fin"},
- "VBjsb-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=2|Polarity=Neg|Tense=Fut|VerbForm=Fin"},
- "VBjsb+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=2|Polarity=Pos|Tense=Fut|VerbForm=Fin"},
- "VBjsc-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=3|Polarity=Neg|Tense=Fut|VerbForm=Fin"},
- "VBjsc+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Tense=Fut|VerbForm=Fin"},
- "VHd-": {POS: VERB, "morph": "Aspect=Perf|Polarity=Neg|VerbForm=Conv"},
- "VHd+": {POS: VERB, "morph": "Aspect=Perf|Polarity=Pos|VerbForm=Conv"},
- "VHe-": {POS: VERB, "morph": "Aspect=Imp|Polarity=Neg|VerbForm=Conv"},
- "VHe+": {POS: VERB, "morph": "Aspect=Imp|Polarity=Pos|VerbForm=Conv"},
- "VHj-": {POS: VERB, "morph": "Aspect=Imp,Perf|Polarity=Neg|VerbForm=Conv"},
- "VHj+": {POS: VERB, "morph": "Aspect=Imp,Perf|Polarity=Pos|VerbForm=Conv"},
- "VId-": {POS: VERB, "morph": "Aspect=Perf|Polarity=Neg|VerbForm=Inf"},
- "VId+": {POS: VERB, "morph": "Aspect=Perf|Polarity=Pos|VerbForm=Inf"},
- "VIe-": {POS: VERB, "morph": "Aspect=Imp|Polarity=Neg|VerbForm=Inf"},
- "VIe+": {POS: VERB, "morph": "Aspect=Imp|Polarity=Pos|VerbForm=Inf"},
- "VIj-": {POS: VERB, "morph": "Aspect=Imp,Perf|Polarity=Neg|VerbForm=Inf"},
- "VIj+": {POS: VERB, "morph": "Aspect=Imp,Perf|Polarity=Pos|VerbForm=Inf"},
- "VKdpa-": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Plur|Person=1|Polarity=Neg|Tense=Pres|VerbForm=Fin"},
- "VKdpa+": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Plur|Person=1|Polarity=Pos|Tense=Pres|VerbForm=Fin"},
- "VKdpb-": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Plur|Person=2|Polarity=Neg|Tense=Pres|VerbForm=Fin"},
- "VKdpb+": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Plur|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin"},
- "VKdpc-": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Plur|Person=3|Polarity=Neg|Tense=Pres|VerbForm=Fin"},
- "VKdpc+": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Plur|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin"},
- "VKdsa-": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Pres|VerbForm=Fin"},
- "VKdsa+": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Sing|Person=1|Polarity=Pos|Tense=Pres|VerbForm=Fin"},
- "VKdsb-": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Sing|Person=2|Polarity=Neg|Tense=Pres|VerbForm=Fin"},
- "VKdsb+": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Sing|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin"},
- "VKdsc-": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Sing|Person=3|Polarity=Neg|Tense=Pres|VerbForm=Fin"},
- "VKdsc+": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin"},
- "VKe-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Polarity=Neg|Tense=Pres|VerbForm=Fin"},
- "VKepa-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=1|Polarity=Neg|Tense=Pres|VerbForm=Fin"},
- "VKepa+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=1|Polarity=Pos|Tense=Pres|VerbForm=Fin"},
- "VKepb-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=2|Polarity=Neg|Tense=Pres|VerbForm=Fin"},
- "VKepb+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin"},
- "VKepc-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=3|Polarity=Neg|Tense=Pres|VerbForm=Fin"},
- "VKepc+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin"},
- "VKesa-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Pres|VerbForm=Fin"},
- "VKesa+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Polarity=Pos|Tense=Pres|VerbForm=Fin"},
- "VKesb-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Polarity=Neg|Tense=Pres|VerbForm=Fin"},
- "VKesb+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin"},
- "VKesc-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Polarity=Neg|Tense=Pres|VerbForm=Fin"},
- "VKesc+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin"},
- "VKjpa-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=1|Polarity=Neg|Tense=Pres|VerbForm=Fin"},
- "VKjpa+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=1|Polarity=Pos|Tense=Pres|VerbForm=Fin"},
- "VKjpb-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=2|Polarity=Neg|Tense=Pres|VerbForm=Fin"},
- "VKjpb+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin"},
- "VKjpc-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=3|Polarity=Neg|Tense=Pres|VerbForm=Fin"},
- "VKjpc+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin"},
- "VKjsa-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Pres|VerbForm=Fin"},
- "VKjsa+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=1|Polarity=Pos|Tense=Pres|VerbForm=Fin"},
- "VKjsb-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=2|Polarity=Neg|Tense=Pres|VerbForm=Fin"},
- "VKjsb+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin"},
- "VKjsc-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=3|Polarity=Neg|Tense=Pres|VerbForm=Fin"},
- "VKjsc+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin"},
- "VLdpah-": {POS: VERB, "morph": "Aspect=Perf|Number=Plur|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"},
- "VLdpah+": {POS: VERB, "morph": "Aspect=Perf|Number=Plur|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"},
- "VLdpbh-": {POS: VERB, "morph": "Aspect=Perf|Number=Plur|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"},
- "VLdpbh+": {POS: VERB, "morph": "Aspect=Perf|Number=Plur|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"},
- "VLdpcf-": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"},
- "VLdpcf+": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"},
- "VLdpci-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"},
- "VLdpci+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"},
- "VLdpcm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"},
- "VLdpcm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"},
- "VLdpcn-": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"},
- "VLdpcn+": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"},
- "VLdsaf-": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"},
- "VLdsaf+": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"},
- "VLdsai-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"},
- "VLdsai+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"},
- "VLdsam-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"},
- "VLdsam+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"},
- "VLdsan-": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"},
- "VLdsan+": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"},
- "VLdsbf-": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"},
- "VLdsbf+": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"},
- "VLdsbi-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"},
- "VLdsbi+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"},
- "VLdsbm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"},
- "VLdsbm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"},
- "VLdsbn-": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"},
- "VLdsbn+": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"},
- "VLdscf-": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"},
- "VLdscf+": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"},
- "VLdsci-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"},
- "VLdsci+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"},
- "VLdscm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"},
- "VLdscm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"},
- "VLdscn-": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"},
- "VLdscn+": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"},
- "VLepah-": {POS: VERB, "morph": "Aspect=Imp|Number=Plur|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"},
- "VLepah+": {POS: VERB, "morph": "Aspect=Imp|Number=Plur|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"},
- "VLepbh-": {POS: VERB, "morph": "Aspect=Imp|Number=Plur|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"},
- "VLepbh+": {POS: VERB, "morph": "Aspect=Imp|Number=Plur|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"},
- "VLepcf-": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"},
- "VLepcf+": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"},
- "VLepci-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"},
- "VLepci+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"},
- "VLepcm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"},
- "VLepcm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"},
- "VLepcn-": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"},
- "VLepcn+": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"},
- "VLesaf-": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"},
- "VLesaf+": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"},
- "VLesai-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"},
- "VLesai+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"},
- "VLesam-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"},
- "VLesam+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"},
- "VLesan-": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"},
- "VLesan+": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"},
- "VLesbf-": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"},
- "VLesbf+": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"},
- "VLesbi-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"},
- "VLesbi+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"},
- "VLesbm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"},
- "VLesbm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"},
- "VLesbn-": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"},
- "VLesbn+": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"},
- "VLescf-": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"},
- "VLescf+": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"},
- "VLesci-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"},
- "VLesci+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"},
- "VLescm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"},
- "VLescm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"},
- "VLescn-": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"},
- "VLescn+": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"},
- "VLjpah-": {POS: VERB, "morph": "Aspect=Imp,Perf|Number=Plur|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"},
- "VLjpah+": {POS: VERB, "morph": "Aspect=Imp,Perf|Number=Plur|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"},
- "VLjpbh-": {POS: VERB, "morph": "Aspect=Imp,Perf|Number=Plur|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"},
- "VLjpbh+": {POS: VERB, "morph": "Aspect=Imp,Perf|Number=Plur|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"},
- "VLjpcf-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"},
- "VLjpcf+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"},
- "VLjpci-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"},
- "VLjpci+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"},
- "VLjpcm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"},
- "VLjpcm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"},
- "VLjpcn-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"},
- "VLjpcn+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"},
- "VLjsaf-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"},
- "VLjsaf+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"},
- "VLjsai-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"},
- "VLjsai+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"},
- "VLjsam-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"},
- "VLjsam+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"},
- "VLjsan-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"},
- "VLjsan+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"},
- "VLjsbf-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"},
- "VLjsbf+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"},
- "VLjsbi-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"},
- "VLjsbi+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"},
- "VLjsbm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"},
- "VLjsbm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"},
- "VLjsbn-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"},
- "VLjsbn+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"},
- "VLjscf-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"},
- "VLjscf+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"},
- "VLjsci-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"},
- "VLjsci+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"},
- "VLjscm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"},
- "VLjscm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"},
- "VLjscn-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"},
- "VLjscn+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"},
- "VMdpa-": {POS: VERB, "morph": "Aspect=Perf|Mood=Imp|Number=Plur|Person=1|Polarity=Neg|VerbForm=Fin"},
- "VMdpa+": {POS: VERB, "morph": "Aspect=Perf|Mood=Imp|Number=Plur|Person=1|Polarity=Pos|VerbForm=Fin"},
- "VMdpb-": {POS: VERB, "morph": "Aspect=Perf|Mood=Imp|Number=Plur|Person=2|Polarity=Neg|VerbForm=Fin"},
- "VMdpb+": {POS: VERB, "morph": "Aspect=Perf|Mood=Imp|Number=Plur|Person=2|Polarity=Pos|VerbForm=Fin"},
- "VMdsb-": {POS: VERB, "morph": "Aspect=Perf|Mood=Imp|Number=Sing|Person=2|Polarity=Neg|VerbForm=Fin"},
- "VMdsb+": {POS: VERB, "morph": "Aspect=Perf|Mood=Imp|Number=Sing|Person=2|Polarity=Pos|VerbForm=Fin"},
- "VMepa-": {POS: VERB, "morph": "Aspect=Imp|Mood=Imp|Number=Plur|Person=1|Polarity=Neg|VerbForm=Fin"},
- "VMepa+": {POS: VERB, "morph": "Aspect=Imp|Mood=Imp|Number=Plur|Person=1|Polarity=Pos|VerbForm=Fin"},
- "VMepb-": {POS: VERB, "morph": "Aspect=Imp|Mood=Imp|Number=Plur|Person=2|Polarity=Neg|VerbForm=Fin"},
- "VMepb+": {POS: VERB, "morph": "Aspect=Imp|Mood=Imp|Number=Plur|Person=2|Polarity=Pos|VerbForm=Fin"},
- "VMesb-": {POS: VERB, "morph": "Aspect=Imp|Mood=Imp|Number=Sing|Person=2|Polarity=Neg|VerbForm=Fin"},
- "VMesb+": {POS: VERB, "morph": "Aspect=Imp|Mood=Imp|Number=Sing|Person=2|Polarity=Pos|VerbForm=Fin"},
- "VMjpa-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Imp|Number=Plur|Person=1|Polarity=Neg|VerbForm=Fin"},
- "VMjpa+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Imp|Number=Plur|Person=1|Polarity=Pos|VerbForm=Fin"},
- "VMjpb-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Imp|Number=Plur|Person=2|Polarity=Neg|VerbForm=Fin"},
- "VMjpb+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Imp|Number=Plur|Person=2|Polarity=Pos|VerbForm=Fin"},
- "VMjsb-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Imp|Number=Sing|Person=2|Polarity=Neg|VerbForm=Fin"},
- "VMjsb+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Imp|Number=Sing|Person=2|Polarity=Pos|VerbForm=Fin"},
- "W": {POS: X, "morph": "Abbr=Yes"},
- "Y": {POS: AUX, "morph": "Mood=Cnd"},
+ "AAfp1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Plur"},
+ "AAfp1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Plur"},
+ "AAfp1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Plur"},
+ "AAfp2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Plur"},
+ "AAfp2y": {POS: ADJ, "morph": "Case=Gen|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Plur"},
+ "AAfp2z": {POS: ADJ, "morph": "Case=Gen|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Plur"},
+ "AAfp3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Plur"},
+ "AAfp3y": {POS: ADJ, "morph": "Case=Dat|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Plur"},
+ "AAfp3z": {POS: ADJ, "morph": "Case=Dat|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Plur"},
+ "AAfp4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Plur"},
+ "AAfp4y": {POS: ADJ, "morph": "Case=Acc|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Plur"},
+ "AAfp4z": {POS: ADJ, "morph": "Case=Acc|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Plur"},
+ "AAfp5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Plur"},
+ "AAfp5y": {POS: ADJ, "morph": "Case=Voc|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Plur"},
+ "AAfp5z": {POS: ADJ, "morph": "Case=Voc|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Plur"},
+ "AAfp6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Plur"},
+ "AAfp6y": {POS: ADJ, "morph": "Case=Loc|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Plur"},
+ "AAfp6z": {POS: ADJ, "morph": "Case=Loc|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Plur"},
+ "AAfp7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Plur"},
+ "AAfp7y": {POS: ADJ, "morph": "Case=Ins|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Plur"},
+ "AAfp7z": {POS: ADJ, "morph": "Case=Ins|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Plur"},
+ "AAfs1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Sing"},
+ "AAfs1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Sing"},
+ "AAfs1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Sing"},
+ "AAfs2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Sing"},
+ "AAfs2y": {POS: ADJ, "morph": "Case=Gen|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Sing"},
+ "AAfs2z": {POS: ADJ, "morph": "Case=Gen|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Sing"},
+ "AAfs3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Sing"},
+ "AAfs3y": {POS: ADJ, "morph": "Case=Dat|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Sing"},
+ "AAfs3z": {POS: ADJ, "morph": "Case=Dat|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Sing"},
+ "AAfs4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Sing"},
+ "AAfs4y": {POS: ADJ, "morph": "Case=Acc|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Sing"},
+ "AAfs4z": {POS: ADJ, "morph": "Case=Acc|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Sing"},
+ "AAfs5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Sing"},
+ "AAfs5y": {POS: ADJ, "morph": "Case=Voc|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Sing"},
+ "AAfs5z": {POS: ADJ, "morph": "Case=Voc|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Sing"},
+ "AAfs6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Sing"},
+ "AAfs6y": {POS: ADJ, "morph": "Case=Loc|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Sing"},
+ "AAfs6z": {POS: ADJ, "morph": "Case=Loc|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Sing"},
+ "AAfs7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Fem|MorphPos=Adj|Number=Sing"},
+ "AAfs7y": {POS: ADJ, "morph": "Case=Ins|Degree=Cmp|Gender=Fem|MorphPos=Adj|Number=Sing"},
+ "AAfs7z": {POS: ADJ, "morph": "Case=Ins|Degree=Sup|Gender=Fem|MorphPos=Adj|Number=Sing"},
+ "AAip1x": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"},
+ "AAip1y": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"},
+ "AAip1z": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"},
+ "AAip2x": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"},
+ "AAip2y": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"},
+ "AAip2z": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"},
+ "AAip3x": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"},
+ "AAip3y": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"},
+ "AAip3z": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"},
+ "AAip4x": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"},
+ "AAip4y": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"},
+ "AAip4z": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"},
+ "AAip5x": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"},
+ "AAip5y": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"},
+ "AAip5z": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"},
+ "AAip6x": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"},
+ "AAip6y": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"},
+ "AAip6z": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"},
+ "AAip7x": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"},
+ "AAip7y": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"},
+ "AAip7z": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"},
+ "AAis1x": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"},
+ "AAis1y": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"},
+ "AAis1z": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"},
+ "AAis2x": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"},
+ "AAis2y": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"},
+ "AAis2z": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"},
+ "AAis3x": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"},
+ "AAis3y": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"},
+ "AAis3z": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"},
+ "AAis4x": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"},
+ "AAis4y": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"},
+ "AAis4z": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"},
+ "AAis5x": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"},
+ "AAis5y": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"},
+ "AAis5z": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"},
+ "AAis6x": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"},
+ "AAis6y": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"},
+ "AAis6z": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"},
+ "AAis7x": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"},
+ "AAis7y": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"},
+ "AAis7z": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"},
+ "AAmp1x": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"},
+ "AAmp1y": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"},
+ "AAmp1z": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"},
+ "AAmp2x": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"},
+ "AAmp2y": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"},
+ "AAmp2z": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"},
+ "AAmp3x": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"},
+ "AAmp3y": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"},
+ "AAmp3z": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"},
+ "AAmp4x": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"},
+ "AAmp4y": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"},
+ "AAmp4z": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"},
+ "AAmp5x": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"},
+ "AAmp5y": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"},
+ "AAmp5z": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"},
+ "AAmp6x": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"},
+ "AAmp6y": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"},
+ "AAmp6z": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"},
+ "AAmp7x": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Plur"},
+ "AAmp7y": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Plur"},
+ "AAmp7z": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Plur"},
+ "AAms1x": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"},
+ "AAms1y": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"},
+ "AAms1z": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"},
+ "AAms2x": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"},
+ "AAms2y": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"},
+ "AAms2z": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"},
+ "AAms3x": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"},
+ "AAms3y": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"},
+ "AAms3z": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"},
+ "AAms4x": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"},
+ "AAms4y": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"},
+ "AAms4z": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"},
+ "AAms5x": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"},
+ "AAms5y": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"},
+ "AAms5z": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"},
+ "AAms6x": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"},
+ "AAms6y": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"},
+ "AAms6z": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"},
+ "AAms7x": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Adj|Number=Sing"},
+ "AAms7y": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Cmp|Gender=Masc|MorphPos=Adj|Number=Sing"},
+ "AAms7z": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Sup|Gender=Masc|MorphPos=Adj|Number=Sing"},
+ "AAnp1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Plur"},
+ "AAnp1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Plur"},
+ "AAnp1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Plur"},
+ "AAnp2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Plur"},
+ "AAnp2y": {POS: ADJ, "morph": "Case=Gen|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Plur"},
+ "AAnp2z": {POS: ADJ, "morph": "Case=Gen|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Plur"},
+ "AAnp3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Plur"},
+ "AAnp3y": {POS: ADJ, "morph": "Case=Dat|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Plur"},
+ "AAnp3z": {POS: ADJ, "morph": "Case=Dat|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Plur"},
+ "AAnp4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Plur"},
+ "AAnp4y": {POS: ADJ, "morph": "Case=Acc|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Plur"},
+ "AAnp4z": {POS: ADJ, "morph": "Case=Acc|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Plur"},
+ "AAnp5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Plur"},
+ "AAnp5y": {POS: ADJ, "morph": "Case=Voc|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Plur"},
+ "AAnp5z": {POS: ADJ, "morph": "Case=Voc|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Plur"},
+ "AAnp6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Plur"},
+ "AAnp6y": {POS: ADJ, "morph": "Case=Loc|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Plur"},
+ "AAnp6z": {POS: ADJ, "morph": "Case=Loc|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Plur"},
+ "AAnp7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Plur"},
+ "AAnp7y": {POS: ADJ, "morph": "Case=Ins|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Plur"},
+ "AAnp7z": {POS: ADJ, "morph": "Case=Ins|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Plur"},
+ "AAns1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Sing"},
+ "AAns1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Sing"},
+ "AAns1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Sing"},
+ "AAns2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Sing"},
+ "AAns2y": {POS: ADJ, "morph": "Case=Gen|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Sing"},
+ "AAns2z": {POS: ADJ, "morph": "Case=Gen|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Sing"},
+ "AAns3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Sing"},
+ "AAns3y": {POS: ADJ, "morph": "Case=Dat|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Sing"},
+ "AAns3z": {POS: ADJ, "morph": "Case=Dat|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Sing"},
+ "AAns4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Sing"},
+ "AAns4y": {POS: ADJ, "morph": "Case=Acc|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Sing"},
+ "AAns4z": {POS: ADJ, "morph": "Case=Acc|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Sing"},
+ "AAns5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Sing"},
+ "AAns5y": {POS: ADJ, "morph": "Case=Voc|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Sing"},
+ "AAns5z": {POS: ADJ, "morph": "Case=Voc|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Sing"},
+ "AAns6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Sing"},
+ "AAns6y": {POS: ADJ, "morph": "Case=Loc|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Sing"},
+ "AAns6z": {POS: ADJ, "morph": "Case=Loc|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Sing"},
+ "AAns7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Neut|MorphPos=Adj|Number=Sing"},
+ "AAns7y": {POS: ADJ, "morph": "Case=Ins|Degree=Cmp|Gender=Neut|MorphPos=Adj|Number=Sing"},
+ "AAns7z": {POS: ADJ, "morph": "Case=Ins|Degree=Sup|Gender=Neut|MorphPos=Adj|Number=Sing"},
+ "AFfp1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Plur"},
+ "AFfp2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Plur"},
+ "AFfp3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Plur"},
+ "AFfp4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Plur"},
+ "AFfp5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Plur"},
+ "AFfp6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Plur"},
+ "AFfp7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Plur"},
+ "AFfs1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Sing"},
+ "AFfs2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Sing"},
+ "AFfs3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Sing"},
+ "AFfs4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Sing"},
+ "AFfs5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Sing"},
+ "AFfs6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Sing"},
+ "AFfs7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Fem|MorphPos=Mix|Number=Sing"},
+ "AFip1x": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"},
+ "AFip2x": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"},
+ "AFip3x": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"},
+ "AFip4x": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"},
+ "AFip5x": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"},
+ "AFip6x": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"},
+ "AFip7x": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"},
+ "AFis1x": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"},
+ "AFis2x": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"},
+ "AFis3x": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"},
+ "AFis4x": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"},
+ "AFis5x": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"},
+ "AFis6x": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"},
+ "AFis7x": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"},
+ "AFmp1x": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"},
+ "AFmp2x": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"},
+ "AFmp3x": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"},
+ "AFmp4x": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"},
+ "AFmp5x": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"},
+ "AFmp6x": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"},
+ "AFmp7x": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Plur"},
+ "AFms1x": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"},
+ "AFms2x": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"},
+ "AFms3x": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"},
+ "AFms4x": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"},
+ "AFms5x": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"},
+ "AFms6x": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"},
+ "AFms7x": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Mix|Number=Sing"},
+ "AFnp1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Plur"},
+ "AFnp2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Plur"},
+ "AFnp3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Plur"},
+ "AFnp4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Plur"},
+ "AFnp5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Plur"},
+ "AFnp6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Plur"},
+ "AFnp7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Plur"},
+ "AFns1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Sing"},
+ "AFns2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Sing"},
+ "AFns3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Sing"},
+ "AFns4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Sing"},
+ "AFns5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Sing"},
+ "AFns6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Sing"},
+ "AFns7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Neut|MorphPos=Mix|Number=Sing"},
+ "AUfp1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Plur"},
+ "AUfp1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|MorphPos=Def|Number=Plur"},
+ "AUfp1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Fem|MorphPos=Def|Number=Plur"},
+ "AUfp2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Plur"},
+ "AUfp3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Plur"},
+ "AUfp4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Plur"},
+ "AUfp5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Plur"},
+ "AUfp6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Plur"},
+ "AUfp7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Plur"},
+ "AUfs1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Sing"},
+ "AUfs1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|MorphPos=Def|Number=Sing"},
+ "AUfs1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Fem|MorphPos=Def|Number=Sing"},
+ "AUfs2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Sing"},
+ "AUfs3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Sing"},
+ "AUfs4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Sing"},
+ "AUfs5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Sing"},
+ "AUfs6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Sing"},
+ "AUfs7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Fem|MorphPos=Def|Number=Sing"},
+ "AUip1x": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"},
+ "AUip1y": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Def|Number=Plur"},
+ "AUip1z": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Def|Number=Plur"},
+ "AUip2x": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"},
+ "AUip3x": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"},
+ "AUip4x": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"},
+ "AUip5x": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"},
+ "AUip6x": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"},
+ "AUip7x": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"},
+ "AUis1x": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"},
+ "AUis1y": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Def|Number=Sing"},
+ "AUis1z": {POS: ADJ, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Def|Number=Sing"},
+ "AUis2x": {POS: ADJ, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"},
+ "AUis3x": {POS: ADJ, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"},
+ "AUis4x": {POS: ADJ, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"},
+ "AUis5x": {POS: ADJ, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"},
+ "AUis6x": {POS: ADJ, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"},
+ "AUis7x": {POS: ADJ, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"},
+ "AUmp1x": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"},
+ "AUmp1y": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Def|Number=Plur"},
+ "AUmp1z": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Def|Number=Plur"},
+ "AUmp2x": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"},
+ "AUmp3x": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"},
+ "AUmp4x": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"},
+ "AUmp5x": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"},
+ "AUmp6x": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"},
+ "AUmp7x": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Plur"},
+ "AUms1x": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"},
+ "AUms1y": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|MorphPos=Def|Number=Sing"},
+ "AUms1z": {POS: ADJ, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|MorphPos=Def|Number=Sing"},
+ "AUms2x": {POS: ADJ, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"},
+ "AUms3x": {POS: ADJ, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"},
+ "AUms4x": {POS: ADJ, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"},
+ "AUms5x": {POS: ADJ, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"},
+ "AUms6x": {POS: ADJ, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"},
+ "AUms7x": {POS: ADJ, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|MorphPos=Def|Number=Sing"},
+ "AUnp1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Plur"},
+ "AUnp1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|MorphPos=Def|Number=Plur"},
+ "AUnp1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Neut|MorphPos=Def|Number=Plur"},
+ "AUnp2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Plur"},
+ "AUnp3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Plur"},
+ "AUnp4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Plur"},
+ "AUnp5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Plur"},
+ "AUnp6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Plur"},
+ "AUnp7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Plur"},
+ "AUns1x": {POS: ADJ, "morph": "Case=Nom|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Sing"},
+ "AUns1y": {POS: ADJ, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|MorphPos=Def|Number=Sing"},
+ "AUns1z": {POS: ADJ, "morph": "Case=Nom|Degree=Sup|Gender=Neut|MorphPos=Def|Number=Sing"},
+ "AUns2x": {POS: ADJ, "morph": "Case=Gen|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Sing"},
+ "AUns3x": {POS: ADJ, "morph": "Case=Dat|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Sing"},
+ "AUns4x": {POS: ADJ, "morph": "Case=Acc|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Sing"},
+ "AUns5x": {POS: ADJ, "morph": "Case=Voc|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Sing"},
+ "AUns6x": {POS: ADJ, "morph": "Case=Loc|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Sing"},
+ "AUns7x": {POS: ADJ, "morph": "Case=Ins|Degree=Pos|Gender=Neut|MorphPos=Def|Number=Sing"},
+ "Dx": {POS: ADV, "morph": "Degree=Pos"},
+ "Dy": {POS: ADV, "morph": "Degree=Cmp"},
+ "Dz": {POS: ADV, "morph": "Degree=Sup"},
+ "Eu1": {POS: ADP, "morph": "AdpType=Prep|Case=Nom"},
+ "Eu2": {POS: ADP, "morph": "AdpType=Prep|Case=Gen"},
+ "Eu3": {POS: ADP, "morph": "AdpType=Prep|Case=Dat"},
+ "Eu4": {POS: ADP, "morph": "AdpType=Prep|Case=Acc"},
+ "Eu6": {POS: ADP, "morph": "AdpType=Prep|Case=Loc"},
+ "Eu7": {POS: ADP, "morph": "AdpType=Prep|Case=Ins"},
+ "Ev2": {POS: ADP, "morph": "AdpType=Voc|Case=Gen"},
+ "Ev3": {POS: ADP, "morph": "AdpType=Voc|Case=Dat"},
+ "Ev4": {POS: ADP, "morph": "AdpType=Voc|Case=Acc"},
+ "Ev6": {POS: ADP, "morph": "AdpType=Voc|Case=Loc"},
+ "Ev7": {POS: ADP, "morph": "AdpType=Voc|Case=Ins"},
+ "Gkfp1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gkfp1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gkfp1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gkfp2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gkfp2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gkfp2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gkfp3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gkfp3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gkfp3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gkfp4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gkfp4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gkfp4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gkfp5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gkfp5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gkfp5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gkfp6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gkfp6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gkfp6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gkfp7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gkfp7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gkfp7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gkfs1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkfs1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkfs1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkfs2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkfs2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkfs2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkfs3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkfs3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkfs3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkfs4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkfs4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkfs4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkfs5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkfs5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkfs5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkfs6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkfs6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkfs6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkfs7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkfs7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkfs7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkip1x": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gkip1y": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gkip1z": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gkip2x": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gkip2y": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gkip2z": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gkip3x": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gkip3y": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gkip3z": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gkip4x": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gkip4y": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gkip4z": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gkip5x": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gkip5y": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gkip5z": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gkip6x": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gkip6y": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gkip6z": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gkip7x": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gkip7y": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gkip7z": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gkis1x": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkis1y": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkis1z": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkis2x": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkis2y": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkis2z": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkis3x": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkis3y": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkis3z": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkis4x": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkis4y": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkis4z": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkis5x": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkis5y": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkis5z": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkis6x": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkis6y": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkis6z": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkis7x": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkis7y": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkis7z": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkmp1x": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gkmp1y": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gkmp1z": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gkmp2x": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gkmp2y": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gkmp2z": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gkmp3x": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gkmp3y": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gkmp3z": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gkmp4x": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gkmp4y": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gkmp4z": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gkmp5x": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gkmp5y": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gkmp5z": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gkmp6x": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gkmp6y": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gkmp6z": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gkmp7x": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gkmp7y": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gkmp7z": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gkms1x": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkms1y": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkms1z": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkms2x": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkms2y": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkms2z": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkms3x": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkms3y": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkms3z": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkms4x": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkms4y": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkms4z": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkms5x": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkms5y": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkms5z": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkms6x": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkms6y": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkms6z": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkms7x": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkms7y": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkms7z": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gknp1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gknp1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gknp1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gknp2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gknp2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gknp2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gknp3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gknp3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gknp3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gknp4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gknp4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gknp4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gknp5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gknp5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gknp5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gknp6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gknp6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gknp6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gknp7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gknp7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gknp7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Act"},
+ "Gkns1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkns1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkns1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkns2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkns2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkns2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkns3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkns3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkns3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkns4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkns4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkns4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkns5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkns5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkns5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkns6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkns6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkns6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkns7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkns7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gkns7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Act"},
+ "Gtfp1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtfp1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtfp1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtfp2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtfp2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtfp2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtfp3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtfp3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtfp3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtfp4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtfp4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtfp4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtfp5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtfp5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtfp5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtfp6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtfp6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtfp6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtfp7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtfp7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtfp7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Fem|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtfs1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtfs1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtfs1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtfs2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtfs2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtfs2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtfs3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtfs3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtfs3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtfs4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtfs4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtfs4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtfs5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtfs5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtfs5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtfs6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtfs6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtfs6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtfs7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtfs7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtfs7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Fem|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtip1x": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtip1y": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtip1z": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtip2x": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtip2y": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtip2z": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtip3x": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtip3y": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtip3z": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtip4x": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtip4y": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtip4z": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtip5x": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtip5y": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtip5z": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtip6x": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtip6y": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtip6z": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtip7x": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtip7y": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtip7z": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtis1x": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtis1y": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtis1z": {POS: VERB, "morph": "Animacy=Inan|Case=Nom|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtis2x": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtis2y": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtis2z": {POS: VERB, "morph": "Animacy=Inan|Case=Gen|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtis3x": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtis3y": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtis3z": {POS: VERB, "morph": "Animacy=Inan|Case=Dat|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtis4x": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtis4y": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtis4z": {POS: VERB, "morph": "Animacy=Inan|Case=Acc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtis5x": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtis5y": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtis5z": {POS: VERB, "morph": "Animacy=Inan|Case=Voc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtis6x": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtis6y": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtis6z": {POS: VERB, "morph": "Animacy=Inan|Case=Loc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtis7x": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtis7y": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtis7z": {POS: VERB, "morph": "Animacy=Inan|Case=Ins|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtmp1x": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtmp1y": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtmp1z": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtmp2x": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtmp2y": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtmp2z": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtmp3x": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtmp3y": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtmp3z": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtmp4x": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtmp4y": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtmp4z": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtmp5x": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtmp5y": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtmp5z": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtmp6x": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtmp6y": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtmp6z": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtmp7x": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtmp7y": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Cmp|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtmp7z": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Sup|Gender=Masc|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtms1x": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtms1y": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtms1z": {POS: VERB, "morph": "Animacy=Anim|Case=Nom|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtms2x": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtms2y": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtms2z": {POS: VERB, "morph": "Animacy=Anim|Case=Gen|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtms3x": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtms3y": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtms3z": {POS: VERB, "morph": "Animacy=Anim|Case=Dat|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtms4x": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtms4y": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtms4z": {POS: VERB, "morph": "Animacy=Anim|Case=Acc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtms5x": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtms5y": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtms5z": {POS: VERB, "morph": "Animacy=Anim|Case=Voc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtms6x": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtms6y": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtms6z": {POS: VERB, "morph": "Animacy=Anim|Case=Loc|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtms7x": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtms7y": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Cmp|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtms7z": {POS: VERB, "morph": "Animacy=Anim|Case=Ins|Degree=Sup|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtnp1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtnp1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtnp1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtnp2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtnp2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtnp2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtnp3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtnp3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtnp3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtnp4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtnp4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtnp4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtnp5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtnp5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtnp5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtnp6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtnp6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtnp6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtnp7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtnp7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtnp7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Neut|Number=Plur|VerbForm=Part|Voice=Pass"},
+ "Gtns1x": {POS: VERB, "morph": "Case=Nom|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtns1y": {POS: VERB, "morph": "Case=Nom|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtns1z": {POS: VERB, "morph": "Case=Nom|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtns2x": {POS: VERB, "morph": "Case=Gen|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtns2y": {POS: VERB, "morph": "Case=Gen|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtns2z": {POS: VERB, "morph": "Case=Gen|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtns3x": {POS: VERB, "morph": "Case=Dat|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtns3y": {POS: VERB, "morph": "Case=Dat|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtns3z": {POS: VERB, "morph": "Case=Dat|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtns4x": {POS: VERB, "morph": "Case=Acc|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtns4y": {POS: VERB, "morph": "Case=Acc|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtns4z": {POS: VERB, "morph": "Case=Acc|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtns5x": {POS: VERB, "morph": "Case=Voc|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtns5y": {POS: VERB, "morph": "Case=Voc|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtns5z": {POS: VERB, "morph": "Case=Voc|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtns6x": {POS: VERB, "morph": "Case=Loc|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtns6y": {POS: VERB, "morph": "Case=Loc|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtns6z": {POS: VERB, "morph": "Case=Loc|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtns7x": {POS: VERB, "morph": "Case=Ins|Degree=Pos|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtns7y": {POS: VERB, "morph": "Case=Ins|Degree=Cmp|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "Gtns7z": {POS: VERB, "morph": "Case=Ins|Degree=Sup|Gender=Neut|Number=Sing|VerbForm=Part|Voice=Pass"},
+ "J": {POS: INTJ, "morph": "_"},
+ "NAfp1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Adj|Number=Plur"},
+ "NAfp2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Adj|Number=Plur"},
+ "NAfp3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Adj|Number=Plur"},
+ "NAfp4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Adj|Number=Plur"},
+ "NAfp5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Adj|Number=Plur"},
+ "NAfp6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Adj|Number=Plur"},
+ "NAfp7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Adj|Number=Plur"},
+ "NAfs1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Adj|Number=Sing"},
+ "NAfs2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Adj|Number=Sing"},
+ "NAfs3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Adj|Number=Sing"},
+ "NAfs4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Adj|Number=Sing"},
+ "NAfs5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Adj|Number=Sing"},
+ "NAfs6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Adj|Number=Sing"},
+ "NAfs7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Adj|Number=Sing"},
+ "NAip1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Plur"},
+ "NAip2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Plur"},
+ "NAip3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Plur"},
+ "NAip4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Plur"},
+ "NAip5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Plur"},
+ "NAip6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Plur"},
+ "NAip7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Plur"},
+ "NAis1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Sing"},
+ "NAis2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Sing"},
+ "NAis3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Sing"},
+ "NAis4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Sing"},
+ "NAis5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Sing"},
+ "NAis6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Sing"},
+ "NAis7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Sing"},
+ "NAmp1": {POS: NUM, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Plur"},
+ "NAmp2": {POS: NUM, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Plur"},
+ "NAmp3": {POS: NUM, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Plur"},
+ "NAmp4": {POS: NUM, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Plur"},
+ "NAmp5": {POS: NUM, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Plur"},
+ "NAmp6": {POS: NUM, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Plur"},
+ "NAmp7": {POS: NUM, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Plur"},
+ "NAms1": {POS: NUM, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Sing"},
+ "NAms2": {POS: NUM, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Sing"},
+ "NAms3": {POS: NUM, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Sing"},
+ "NAms4": {POS: NUM, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Sing"},
+ "NAms5": {POS: NUM, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Sing"},
+ "NAms6": {POS: NUM, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Sing"},
+ "NAms7": {POS: NUM, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Sing"},
+ "NAnp1": {POS: NUM, "morph": "Case=Nom|Gender=Neut|MorphPos=Adj|Number=Plur"},
+ "NAnp2": {POS: NUM, "morph": "Case=Gen|Gender=Neut|MorphPos=Adj|Number=Plur"},
+ "NAnp3": {POS: NUM, "morph": "Case=Dat|Gender=Neut|MorphPos=Adj|Number=Plur"},
+ "NAnp4": {POS: NUM, "morph": "Case=Acc|Gender=Neut|MorphPos=Adj|Number=Plur"},
+ "NAnp5": {POS: NUM, "morph": "Case=Voc|Gender=Neut|MorphPos=Adj|Number=Plur"},
+ "NAnp6": {POS: NUM, "morph": "Case=Loc|Gender=Neut|MorphPos=Adj|Number=Plur"},
+ "NAnp7": {POS: NUM, "morph": "Case=Ins|Gender=Neut|MorphPos=Adj|Number=Plur"},
+ "NAns1": {POS: NUM, "morph": "Case=Nom|Gender=Neut|MorphPos=Adj|Number=Sing"},
+ "NAns2": {POS: NUM, "morph": "Case=Gen|Gender=Neut|MorphPos=Adj|Number=Sing"},
+ "NAns3": {POS: NUM, "morph": "Case=Dat|Gender=Neut|MorphPos=Adj|Number=Sing"},
+ "NAns4": {POS: NUM, "morph": "Case=Acc|Gender=Neut|MorphPos=Adj|Number=Sing"},
+ "NAns5": {POS: NUM, "morph": "Case=Voc|Gender=Neut|MorphPos=Adj|Number=Sing"},
+ "NAns6": {POS: NUM, "morph": "Case=Loc|Gender=Neut|MorphPos=Adj|Number=Sing"},
+ "NAns7": {POS: NUM, "morph": "Case=Ins|Gender=Neut|MorphPos=Adj|Number=Sing"},
+ "ND": {POS: NUM, "morph": "MorphPos=Adv"},
+ "NFfp1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Mix|Number=Plur"},
+ "NFfp2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Mix|Number=Plur"},
+ "NFfp3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Mix|Number=Plur"},
+ "NFfp4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Mix|Number=Plur"},
+ "NFfp5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Mix|Number=Plur"},
+ "NFfp6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Mix|Number=Plur"},
+ "NFfp7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Mix|Number=Plur"},
+ "NFfs1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Mix|Number=Sing"},
+ "NFfs2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Mix|Number=Sing"},
+ "NFfs3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Mix|Number=Sing"},
+ "NFfs4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Mix|Number=Sing"},
+ "NFfs5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Mix|Number=Sing"},
+ "NFfs6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Mix|Number=Sing"},
+ "NFfs7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Mix|Number=Sing"},
+ "NFip1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Plur"},
+ "NFip2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Plur"},
+ "NFip3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Plur"},
+ "NFip4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Plur"},
+ "NFip5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Plur"},
+ "NFip6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Plur"},
+ "NFip7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Plur"},
+ "NFis1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Sing"},
+ "NFis2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Sing"},
+ "NFis3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Sing"},
+ "NFis4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Sing"},
+ "NFis5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Sing"},
+ "NFis6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Sing"},
+ "NFis7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Sing"},
+ "NFmp1": {POS: NUM, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Plur"},
+ "NFmp2": {POS: NUM, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Plur"},
+ "NFmp3": {POS: NUM, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Plur"},
+ "NFmp4": {POS: NUM, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Plur"},
+ "NFmp5": {POS: NUM, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Plur"},
+ "NFmp6": {POS: NUM, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Plur"},
+ "NFmp7": {POS: NUM, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Plur"},
+ "NFms1": {POS: NUM, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Sing"},
+ "NFms2": {POS: NUM, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Sing"},
+ "NFms3": {POS: NUM, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Sing"},
+ "NFms4": {POS: NUM, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Sing"},
+ "NFms5": {POS: NUM, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Sing"},
+ "NFms6": {POS: NUM, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Sing"},
+ "NFms7": {POS: NUM, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Sing"},
+ "NFnp1": {POS: NUM, "morph": "Case=Nom|Gender=Neut|MorphPos=Mix|Number=Plur"},
+ "NFnp2": {POS: NUM, "morph": "Case=Gen|Gender=Neut|MorphPos=Mix|Number=Plur"},
+ "NFnp3": {POS: NUM, "morph": "Case=Dat|Gender=Neut|MorphPos=Mix|Number=Plur"},
+ "NFnp4": {POS: NUM, "morph": "Case=Acc|Gender=Neut|MorphPos=Mix|Number=Plur"},
+ "NFnp5": {POS: NUM, "morph": "Case=Voc|Gender=Neut|MorphPos=Mix|Number=Plur"},
+ "NFnp6": {POS: NUM, "morph": "Case=Loc|Gender=Neut|MorphPos=Mix|Number=Plur"},
+ "NFnp7": {POS: NUM, "morph": "Case=Ins|Gender=Neut|MorphPos=Mix|Number=Plur"},
+ "NFns1": {POS: NUM, "morph": "Case=Nom|Gender=Neut|MorphPos=Mix|Number=Sing"},
+ "NFns2": {POS: NUM, "morph": "Case=Gen|Gender=Neut|MorphPos=Mix|Number=Sing"},
+ "NFns3": {POS: NUM, "morph": "Case=Dat|Gender=Neut|MorphPos=Mix|Number=Sing"},
+ "NFns4": {POS: NUM, "morph": "Case=Acc|Gender=Neut|MorphPos=Mix|Number=Sing"},
+ "NFns5": {POS: NUM, "morph": "Case=Voc|Gender=Neut|MorphPos=Mix|Number=Sing"},
+ "NFns6": {POS: NUM, "morph": "Case=Loc|Gender=Neut|MorphPos=Mix|Number=Sing"},
+ "NFns7": {POS: NUM, "morph": "Case=Ins|Gender=Neut|MorphPos=Mix|Number=Sing"},
+ "NNfp1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Num|Number=Plur"},
+ "NNfp2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Num|Number=Plur"},
+ "NNfp3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Num|Number=Plur"},
+ "NNfp4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Num|Number=Plur"},
+ "NNfp5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Num|Number=Plur"},
+ "NNfp6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Num|Number=Plur"},
+ "NNfp7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Num|Number=Plur"},
+ "NNip1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Num|Number=Plur"},
+ "NNip2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Num|Number=Plur"},
+ "NNip3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Num|Number=Plur"},
+ "NNip4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Num|Number=Plur"},
+ "NNip5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Num|Number=Plur"},
+ "NNip6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Num|Number=Plur"},
+ "NNip7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Num|Number=Plur"},
+ "NNmp1": {POS: NUM, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Num|Number=Plur"},
+ "NNmp2": {POS: NUM, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Num|Number=Plur"},
+ "NNmp3": {POS: NUM, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Num|Number=Plur"},
+ "NNmp4": {POS: NUM, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Num|Number=Plur"},
+ "NNmp5": {POS: NUM, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Num|Number=Plur"},
+ "NNmp6": {POS: NUM, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Num|Number=Plur"},
+ "NNmp7": {POS: NUM, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Num|Number=Plur"},
+ "NNnp1": {POS: NUM, "morph": "Case=Nom|Gender=Neut|MorphPos=Num|Number=Plur"},
+ "NNnp2": {POS: NUM, "morph": "Case=Gen|Gender=Neut|MorphPos=Num|Number=Plur"},
+ "NNnp3": {POS: NUM, "morph": "Case=Dat|Gender=Neut|MorphPos=Num|Number=Plur"},
+ "NNnp4": {POS: NUM, "morph": "Case=Acc|Gender=Neut|MorphPos=Num|Number=Plur"},
+ "NNnp5": {POS: NUM, "morph": "Case=Voc|Gender=Neut|MorphPos=Num|Number=Plur"},
+ "NNnp6": {POS: NUM, "morph": "Case=Loc|Gender=Neut|MorphPos=Num|Number=Plur"},
+ "NNnp7": {POS: NUM, "morph": "Case=Ins|Gender=Neut|MorphPos=Num|Number=Plur"},
+ "NSfp1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Noun|Number=Plur"},
+ "NSfp2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Noun|Number=Plur"},
+ "NSfp3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Noun|Number=Plur"},
+ "NSfp4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Noun|Number=Plur"},
+ "NSfp5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Noun|Number=Plur"},
+ "NSfp6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Noun|Number=Plur"},
+ "NSfp7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Noun|Number=Plur"},
+ "NSfs1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Noun|Number=Sing"},
+ "NSfs2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Noun|Number=Sing"},
+ "NSfs3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Noun|Number=Sing"},
+ "NSfs4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Noun|Number=Sing"},
+ "NSfs5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Noun|Number=Sing"},
+ "NSfs6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Noun|Number=Sing"},
+ "NSfs7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Noun|Number=Sing"},
+ "NSip1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Noun|Number=Plur"},
+ "NSip2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Noun|Number=Plur"},
+ "NSip3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Noun|Number=Plur"},
+ "NSip4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Noun|Number=Plur"},
+ "NSip5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Noun|Number=Plur"},
+ "NSip6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Noun|Number=Plur"},
+ "NSip7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Noun|Number=Plur"},
+ "NSis1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Noun|Number=Sing"},
+ "NSis2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Noun|Number=Sing"},
+ "NSis3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Noun|Number=Sing"},
+ "NSis4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Noun|Number=Sing"},
+ "NSis5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Noun|Number=Sing"},
+ "NSis6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Noun|Number=Sing"},
+ "NSis7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Noun|Number=Sing"},
+ "NUfp1": {POS: NUM, "morph": "Case=Nom|Gender=Fem|MorphPos=Def|Number=Plur"},
+ "NUfp2": {POS: NUM, "morph": "Case=Gen|Gender=Fem|MorphPos=Def|Number=Plur"},
+ "NUfp3": {POS: NUM, "morph": "Case=Dat|Gender=Fem|MorphPos=Def|Number=Plur"},
+ "NUfp4": {POS: NUM, "morph": "Case=Acc|Gender=Fem|MorphPos=Def|Number=Plur"},
+ "NUfp5": {POS: NUM, "morph": "Case=Voc|Gender=Fem|MorphPos=Def|Number=Plur"},
+ "NUfp6": {POS: NUM, "morph": "Case=Loc|Gender=Fem|MorphPos=Def|Number=Plur"},
+ "NUfp7": {POS: NUM, "morph": "Case=Ins|Gender=Fem|MorphPos=Def|Number=Plur"},
+ "NUip1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Def|Number=Plur"},
+ "NUip2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Def|Number=Plur"},
+ "NUip3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Def|Number=Plur"},
+ "NUip4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Def|Number=Plur"},
+ "NUip5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Def|Number=Plur"},
+ "NUip6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Def|Number=Plur"},
+ "NUip7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Def|Number=Plur"},
+ "NUis1": {POS: NUM, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Def|Number=Sing"},
+ "NUis2": {POS: NUM, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Def|Number=Sing"},
+ "NUis3": {POS: NUM, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Def|Number=Sing"},
+ "NUis4": {POS: NUM, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Def|Number=Sing"},
+ "NUis5": {POS: NUM, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Def|Number=Sing"},
+ "NUis6": {POS: NUM, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Def|Number=Sing"},
+ "NUis7": {POS: NUM, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Def|Number=Sing"},
+ "NUmp1": {POS: NUM, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Def|Number=Plur"},
+ "NUmp2": {POS: NUM, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Def|Number=Plur"},
+ "NUmp3": {POS: NUM, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Def|Number=Plur"},
+ "NUmp4": {POS: NUM, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Def|Number=Plur"},
+ "NUmp5": {POS: NUM, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Def|Number=Plur"},
+ "NUmp6": {POS: NUM, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Def|Number=Plur"},
+ "NUmp7": {POS: NUM, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Def|Number=Plur"},
+ "NUnp1": {POS: NUM, "morph": "Case=Nom|Gender=Neut|MorphPos=Def|Number=Plur"},
+ "NUnp2": {POS: NUM, "morph": "Case=Gen|Gender=Neut|MorphPos=Def|Number=Plur"},
+ "NUnp3": {POS: NUM, "morph": "Case=Dat|Gender=Neut|MorphPos=Def|Number=Plur"},
+ "NUnp4": {POS: NUM, "morph": "Case=Acc|Gender=Neut|MorphPos=Def|Number=Plur"},
+ "NUnp5": {POS: NUM, "morph": "Case=Voc|Gender=Neut|MorphPos=Def|Number=Plur"},
+ "NUnp6": {POS: NUM, "morph": "Case=Loc|Gender=Neut|MorphPos=Def|Number=Plur"},
+ "NUnp7": {POS: NUM, "morph": "Case=Ins|Gender=Neut|MorphPos=Def|Number=Plur"},
+ "NUns1": {POS: NUM, "morph": "Case=Nom|Gender=Neut|MorphPos=Def|Number=Sing"},
+ "NUns2": {POS: NUM, "morph": "Case=Gen|Gender=Neut|MorphPos=Def|Number=Sing"},
+ "NUns3": {POS: NUM, "morph": "Case=Dat|Gender=Neut|MorphPos=Def|Number=Sing"},
+ "NUns4": {POS: NUM, "morph": "Case=Acc|Gender=Neut|MorphPos=Def|Number=Sing"},
+ "NUns5": {POS: NUM, "morph": "Case=Voc|Gender=Neut|MorphPos=Def|Number=Sing"},
+ "NUns6": {POS: NUM, "morph": "Case=Loc|Gender=Neut|MorphPos=Def|Number=Sing"},
+ "NUns7": {POS: NUM, "morph": "Case=Ins|Gender=Neut|MorphPos=Def|Number=Sing"},
+ "O": {POS: CCONJ, "morph": "_"},
+ "OY": {POS: CCONJ, "morph": "Mood=Cnd"},
+ "PAfp1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Adj|Number=Plur|PronType=Prs"},
+ "PAfp2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Adj|Number=Plur|PronType=Prs"},
+ "PAfp3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Adj|Number=Plur|PronType=Prs"},
+ "PAfp4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Adj|Number=Plur|PronType=Prs"},
+ "PAfp5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Adj|Number=Plur|PronType=Prs"},
+ "PAfp6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Adj|Number=Plur|PronType=Prs"},
+ "PAfp7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Adj|Number=Plur|PronType=Prs"},
+ "PAfs1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Adj|Number=Sing|PronType=Prs"},
+ "PAfs2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Adj|Number=Sing|PronType=Prs"},
+ "PAfs3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Adj|Number=Sing|PronType=Prs"},
+ "PAfs4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Adj|Number=Sing|PronType=Prs"},
+ "PAfs5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Adj|Number=Sing|PronType=Prs"},
+ "PAfs6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Adj|Number=Sing|PronType=Prs"},
+ "PAfs7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Adj|Number=Sing|PronType=Prs"},
+ "PAip1": {POS: PRON, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"},
+ "PAip2": {POS: PRON, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"},
+ "PAip3": {POS: PRON, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"},
+ "PAip4": {POS: PRON, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"},
+ "PAip5": {POS: PRON, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"},
+ "PAip6": {POS: PRON, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"},
+ "PAip7": {POS: PRON, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"},
+ "PAis1": {POS: PRON, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"},
+ "PAis2": {POS: PRON, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"},
+ "PAis3": {POS: PRON, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"},
+ "PAis4": {POS: PRON, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"},
+ "PAis5": {POS: PRON, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"},
+ "PAis6": {POS: PRON, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"},
+ "PAis7": {POS: PRON, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"},
+ "PAmp1": {POS: PRON, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"},
+ "PAmp2": {POS: PRON, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"},
+ "PAmp3": {POS: PRON, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"},
+ "PAmp4": {POS: PRON, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"},
+ "PAmp5": {POS: PRON, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"},
+ "PAmp6": {POS: PRON, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"},
+ "PAmp7": {POS: PRON, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Plur|PronType=Prs"},
+ "PAms1": {POS: PRON, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"},
+ "PAms2": {POS: PRON, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"},
+ "PAms3": {POS: PRON, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"},
+ "PAms4": {POS: PRON, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"},
+ "PAms5": {POS: PRON, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"},
+ "PAms6": {POS: PRON, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"},
+ "PAms7": {POS: PRON, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Sing|PronType=Prs"},
+ "PAnp1": {POS: PRON, "morph": "Case=Nom|Gender=Neut|MorphPos=Adj|Number=Plur|PronType=Prs"},
+ "PAnp2": {POS: PRON, "morph": "Case=Gen|Gender=Neut|MorphPos=Adj|Number=Plur|PronType=Prs"},
+ "PAnp3": {POS: PRON, "morph": "Case=Dat|Gender=Neut|MorphPos=Adj|Number=Plur|PronType=Prs"},
+ "PAnp4": {POS: PRON, "morph": "Case=Acc|Gender=Neut|MorphPos=Adj|Number=Plur|PronType=Prs"},
+ "PAnp5": {POS: PRON, "morph": "Case=Voc|Gender=Neut|MorphPos=Adj|Number=Plur|PronType=Prs"},
+ "PAnp6": {POS: PRON, "morph": "Case=Loc|Gender=Neut|MorphPos=Adj|Number=Plur|PronType=Prs"},
+ "PAnp7": {POS: PRON, "morph": "Case=Ins|Gender=Neut|MorphPos=Adj|Number=Plur|PronType=Prs"},
+ "PAns1": {POS: PRON, "morph": "Case=Nom|Gender=Neut|MorphPos=Adj|Number=Sing|PronType=Prs"},
+ "PAns2": {POS: PRON, "morph": "Case=Gen|Gender=Neut|MorphPos=Adj|Number=Sing|PronType=Prs"},
+ "PAns3": {POS: PRON, "morph": "Case=Dat|Gender=Neut|MorphPos=Adj|Number=Sing|PronType=Prs"},
+ "PAns4": {POS: PRON, "morph": "Case=Acc|Gender=Neut|MorphPos=Adj|Number=Sing|PronType=Prs"},
+ "PAns5": {POS: PRON, "morph": "Case=Voc|Gender=Neut|MorphPos=Adj|Number=Sing|PronType=Prs"},
+ "PAns6": {POS: PRON, "morph": "Case=Loc|Gender=Neut|MorphPos=Adj|Number=Sing|PronType=Prs"},
+ "PAns7": {POS: PRON, "morph": "Case=Ins|Gender=Neut|MorphPos=Adj|Number=Sing|PronType=Prs"},
+ "PD": {POS: PRON, "morph": "MorphPos=Adv|PronType=Prs"},
+ "PFfp1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Mix|Number=Plur|PronType=Prs"},
+ "PFfp2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Mix|Number=Plur|PronType=Prs"},
+ "PFfp3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Mix|Number=Plur|PronType=Prs"},
+ "PFfp4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Mix|Number=Plur|PronType=Prs"},
+ "PFfp5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Mix|Number=Plur|PronType=Prs"},
+ "PFfp6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Mix|Number=Plur|PronType=Prs"},
+ "PFfp7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Mix|Number=Plur|PronType=Prs"},
+ "PFfs1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Mix|Number=Sing|PronType=Prs"},
+ "PFfs2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Mix|Number=Sing|PronType=Prs"},
+ "PFfs3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Mix|Number=Sing|PronType=Prs"},
+ "PFfs4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Mix|Number=Sing|PronType=Prs"},
+ "PFfs5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Mix|Number=Sing|PronType=Prs"},
+ "PFfs6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Mix|Number=Sing|PronType=Prs"},
+ "PFfs7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Mix|Number=Sing|PronType=Prs"},
+ "PFip1": {POS: PRON, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"},
+ "PFip2": {POS: PRON, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"},
+ "PFip3": {POS: PRON, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"},
+ "PFip4": {POS: PRON, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"},
+ "PFip5": {POS: PRON, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"},
+ "PFip6": {POS: PRON, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"},
+ "PFip7": {POS: PRON, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"},
+ "PFis1": {POS: PRON, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"},
+ "PFis2": {POS: PRON, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"},
+ "PFis2g": {POS: PRON, "morph": "AdpType=Preppron|Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"},
+ "PFis3": {POS: PRON, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"},
+ "PFis4": {POS: PRON, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"},
+ "PFis4g": {POS: PRON, "morph": "AdpType=Preppron|Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"},
+ "PFis5": {POS: PRON, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"},
+ "PFis6": {POS: PRON, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"},
+ "PFis7": {POS: PRON, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"},
+ "PFmp1": {POS: PRON, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"},
+ "PFmp2": {POS: PRON, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"},
+ "PFmp3": {POS: PRON, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"},
+ "PFmp4": {POS: PRON, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"},
+ "PFmp5": {POS: PRON, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"},
+ "PFmp6": {POS: PRON, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"},
+ "PFmp7": {POS: PRON, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Plur|PronType=Prs"},
+ "PFms1": {POS: PRON, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"},
+ "PFms2": {POS: PRON, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"},
+ "PFms2g": {POS: PRON, "morph": "AdpType=Preppron|Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"},
+ "PFms3": {POS: PRON, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"},
+ "PFms4": {POS: PRON, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"},
+ "PFms4g": {POS: PRON, "morph": "AdpType=Preppron|Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"},
+ "PFms5": {POS: PRON, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"},
+ "PFms6": {POS: PRON, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"},
+ "PFms7": {POS: PRON, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Mix|Number=Sing|PronType=Prs"},
+ "PFnp1": {POS: PRON, "morph": "Case=Nom|Gender=Neut|MorphPos=Mix|Number=Plur|PronType=Prs"},
+ "PFnp2": {POS: PRON, "morph": "Case=Gen|Gender=Neut|MorphPos=Mix|Number=Plur|PronType=Prs"},
+ "PFnp3": {POS: PRON, "morph": "Case=Dat|Gender=Neut|MorphPos=Mix|Number=Plur|PronType=Prs"},
+ "PFnp4": {POS: PRON, "morph": "Case=Acc|Gender=Neut|MorphPos=Mix|Number=Plur|PronType=Prs"},
+ "PFnp5": {POS: PRON, "morph": "Case=Voc|Gender=Neut|MorphPos=Mix|Number=Plur|PronType=Prs"},
+ "PFnp6": {POS: PRON, "morph": "Case=Loc|Gender=Neut|MorphPos=Mix|Number=Plur|PronType=Prs"},
+ "PFnp7": {POS: PRON, "morph": "Case=Ins|Gender=Neut|MorphPos=Mix|Number=Plur|PronType=Prs"},
+ "PFns1": {POS: PRON, "morph": "Case=Nom|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"},
+ "PFns2": {POS: PRON, "morph": "Case=Gen|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"},
+ "PFns2g": {POS: PRON, "morph": "AdpType=Preppron|Case=Gen|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"},
+ "PFns3": {POS: PRON, "morph": "Case=Dat|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"},
+ "PFns4": {POS: PRON, "morph": "Case=Acc|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"},
+ "PFns4g": {POS: PRON, "morph": "AdpType=Preppron|Case=Acc|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"},
+ "PFns5": {POS: PRON, "morph": "Case=Voc|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"},
+ "PFns6": {POS: PRON, "morph": "Case=Loc|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"},
+ "PFns7": {POS: PRON, "morph": "Case=Ins|Gender=Neut|MorphPos=Mix|Number=Sing|PronType=Prs"},
+ "PPhp1": {POS: PRON, "morph": "Case=Nom|MorphPos=Pron|Number=Plur|PronType=Prs"},
+ "PPhp2": {POS: PRON, "morph": "Case=Gen|MorphPos=Pron|Number=Plur|PronType=Prs"},
+ "PPhp3": {POS: PRON, "morph": "Case=Dat|MorphPos=Pron|Number=Plur|PronType=Prs"},
+ "PPhp4": {POS: PRON, "morph": "Case=Acc|MorphPos=Pron|Number=Plur|PronType=Prs"},
+ "PPhp5": {POS: PRON, "morph": "Case=Voc|MorphPos=Pron|Number=Plur|PronType=Prs"},
+ "PPhp6": {POS: PRON, "morph": "Case=Loc|MorphPos=Pron|Number=Plur|PronType=Prs"},
+ "PPhp7": {POS: PRON, "morph": "Case=Ins|MorphPos=Pron|Number=Plur|PronType=Prs"},
+ "PPhs1": {POS: PRON, "morph": "Case=Nom|MorphPos=Pron|Number=Sing|PronType=Prs"},
+ "PPhs2": {POS: PRON, "morph": "Case=Gen|MorphPos=Pron|Number=Sing|PronType=Prs"},
+ "PPhs3": {POS: PRON, "morph": "Case=Dat|MorphPos=Pron|Number=Sing|PronType=Prs"},
+ "PPhs4": {POS: PRON, "morph": "Case=Acc|MorphPos=Pron|Number=Sing|PronType=Prs"},
+ "PPhs5": {POS: PRON, "morph": "Case=Voc|MorphPos=Pron|Number=Sing|PronType=Prs"},
+ "PPhs6": {POS: PRON, "morph": "Case=Loc|MorphPos=Pron|Number=Sing|PronType=Prs"},
+ "PPhs7": {POS: PRON, "morph": "Case=Ins|MorphPos=Pron|Number=Sing|PronType=Prs"},
+ "PSfp1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Noun|Number=Plur|PronType=Prs"},
+ "PSfp2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Noun|Number=Plur|PronType=Prs"},
+ "PSfp3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Noun|Number=Plur|PronType=Prs"},
+ "PSfp4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Noun|Number=Plur|PronType=Prs"},
+ "PSfp5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Noun|Number=Plur|PronType=Prs"},
+ "PSfp6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Noun|Number=Plur|PronType=Prs"},
+ "PSfp7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Noun|Number=Plur|PronType=Prs"},
+ "PSfs1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Noun|Number=Sing|PronType=Prs"},
+ "PSfs2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Noun|Number=Sing|PronType=Prs"},
+ "PSfs3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Noun|Number=Sing|PronType=Prs"},
+ "PSfs4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Noun|Number=Sing|PronType=Prs"},
+ "PSfs5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Noun|Number=Sing|PronType=Prs"},
+ "PSfs6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Noun|Number=Sing|PronType=Prs"},
+ "PSfs7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Noun|Number=Sing|PronType=Prs"},
+ "PSns1": {POS: PRON, "morph": "Case=Nom|Gender=Neut|MorphPos=Noun|Number=Sing|PronType=Prs"},
+ "PSns2": {POS: PRON, "morph": "Case=Gen|Gender=Neut|MorphPos=Noun|Number=Sing|PronType=Prs"},
+ "PSns3": {POS: PRON, "morph": "Case=Dat|Gender=Neut|MorphPos=Noun|Number=Sing|PronType=Prs"},
+ "PSns4": {POS: PRON, "morph": "Case=Acc|Gender=Neut|MorphPos=Noun|Number=Sing|PronType=Prs"},
+ "PSns5": {POS: PRON, "morph": "Case=Voc|Gender=Neut|MorphPos=Noun|Number=Sing|PronType=Prs"},
+ "PSns6": {POS: PRON, "morph": "Case=Loc|Gender=Neut|MorphPos=Noun|Number=Sing|PronType=Prs"},
+ "PSns7": {POS: PRON, "morph": "Case=Ins|Gender=Neut|MorphPos=Noun|Number=Sing|PronType=Prs"},
+ "PUfp1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Def|Number=Plur|PronType=Prs"},
+ "PUfp2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Def|Number=Plur|PronType=Prs"},
+ "PUfp3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Def|Number=Plur|PronType=Prs"},
+ "PUfp4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Def|Number=Plur|PronType=Prs"},
+ "PUfp5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Def|Number=Plur|PronType=Prs"},
+ "PUfp6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Def|Number=Plur|PronType=Prs"},
+ "PUfp7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Def|Number=Plur|PronType=Prs"},
+ "PUfs1": {POS: PRON, "morph": "Case=Nom|Gender=Fem|MorphPos=Def|Number=Sing|PronType=Prs"},
+ "PUfs2": {POS: PRON, "morph": "Case=Gen|Gender=Fem|MorphPos=Def|Number=Sing|PronType=Prs"},
+ "PUfs3": {POS: PRON, "morph": "Case=Dat|Gender=Fem|MorphPos=Def|Number=Sing|PronType=Prs"},
+ "PUfs4": {POS: PRON, "morph": "Case=Acc|Gender=Fem|MorphPos=Def|Number=Sing|PronType=Prs"},
+ "PUfs5": {POS: PRON, "morph": "Case=Voc|Gender=Fem|MorphPos=Def|Number=Sing|PronType=Prs"},
+ "PUfs6": {POS: PRON, "morph": "Case=Loc|Gender=Fem|MorphPos=Def|Number=Sing|PronType=Prs"},
+ "PUfs7": {POS: PRON, "morph": "Case=Ins|Gender=Fem|MorphPos=Def|Number=Sing|PronType=Prs"},
+ "PUip1": {POS: PRON, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"},
+ "PUip2": {POS: PRON, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"},
+ "PUip3": {POS: PRON, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"},
+ "PUip4": {POS: PRON, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"},
+ "PUip5": {POS: PRON, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"},
+ "PUip6": {POS: PRON, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"},
+ "PUip7": {POS: PRON, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"},
+ "PUis1": {POS: PRON, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"},
+ "PUis2": {POS: PRON, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"},
+ "PUis3": {POS: PRON, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"},
+ "PUis4": {POS: PRON, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"},
+ "PUis5": {POS: PRON, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"},
+ "PUis6": {POS: PRON, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"},
+ "PUis7": {POS: PRON, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"},
+ "PUmp1": {POS: PRON, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"},
+ "PUmp2": {POS: PRON, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"},
+ "PUmp3": {POS: PRON, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"},
+ "PUmp4": {POS: PRON, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"},
+ "PUmp5": {POS: PRON, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"},
+ "PUmp6": {POS: PRON, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"},
+ "PUmp7": {POS: PRON, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Def|Number=Plur|PronType=Prs"},
+ "PUms1": {POS: PRON, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"},
+ "PUms2": {POS: PRON, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"},
+ "PUms3": {POS: PRON, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"},
+ "PUms4": {POS: PRON, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"},
+ "PUms5": {POS: PRON, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"},
+ "PUms6": {POS: PRON, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"},
+ "PUms7": {POS: PRON, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Def|Number=Sing|PronType=Prs"},
+ "PUnp1": {POS: PRON, "morph": "Case=Nom|Gender=Neut|MorphPos=Def|Number=Plur|PronType=Prs"},
+ "PUnp2": {POS: PRON, "morph": "Case=Gen|Gender=Neut|MorphPos=Def|Number=Plur|PronType=Prs"},
+ "PUnp3": {POS: PRON, "morph": "Case=Dat|Gender=Neut|MorphPos=Def|Number=Plur|PronType=Prs"},
+ "PUnp4": {POS: PRON, "morph": "Case=Acc|Gender=Neut|MorphPos=Def|Number=Plur|PronType=Prs"},
+ "PUnp5": {POS: PRON, "morph": "Case=Voc|Gender=Neut|MorphPos=Def|Number=Plur|PronType=Prs"},
+ "PUnp6": {POS: PRON, "morph": "Case=Loc|Gender=Neut|MorphPos=Def|Number=Plur|PronType=Prs"},
+ "PUnp7": {POS: PRON, "morph": "Case=Ins|Gender=Neut|MorphPos=Def|Number=Plur|PronType=Prs"},
+ "PUns1": {POS: PRON, "morph": "Case=Nom|Gender=Neut|MorphPos=Def|Number=Sing|PronType=Prs"},
+ "PUns2": {POS: PRON, "morph": "Case=Gen|Gender=Neut|MorphPos=Def|Number=Sing|PronType=Prs"},
+ "PUns3": {POS: PRON, "morph": "Case=Dat|Gender=Neut|MorphPos=Def|Number=Sing|PronType=Prs"},
+ "PUns4": {POS: PRON, "morph": "Case=Acc|Gender=Neut|MorphPos=Def|Number=Sing|PronType=Prs"},
+ "PUns5": {POS: PRON, "morph": "Case=Voc|Gender=Neut|MorphPos=Def|Number=Sing|PronType=Prs"},
+ "PUns6": {POS: PRON, "morph": "Case=Loc|Gender=Neut|MorphPos=Def|Number=Sing|PronType=Prs"},
+ "PUns7": {POS: PRON, "morph": "Case=Ins|Gender=Neut|MorphPos=Def|Number=Sing|PronType=Prs"},
+ "Q": {POS: X, "morph": "Hyph=Yes"},
+ "R": {POS: PRON, "morph": "PronType=Prs|Reflex=Yes"},
+ "SAfp1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Adj|Number=Plur"},
+ "SAfp2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Adj|Number=Plur"},
+ "SAfp3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Adj|Number=Plur"},
+ "SAfp4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Adj|Number=Plur"},
+ "SAfp5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Adj|Number=Plur"},
+ "SAfp6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Adj|Number=Plur"},
+ "SAfp7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Adj|Number=Plur"},
+ "SAfs1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Adj|Number=Sing"},
+ "SAfs2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Adj|Number=Sing"},
+ "SAfs3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Adj|Number=Sing"},
+ "SAfs4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Adj|Number=Sing"},
+ "SAfs5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Adj|Number=Sing"},
+ "SAfs6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Adj|Number=Sing"},
+ "SAfs7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Adj|Number=Sing"},
+ "SAip1": {POS: NOUN, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Plur"},
+ "SAip2": {POS: NOUN, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Plur"},
+ "SAip3": {POS: NOUN, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Plur"},
+ "SAip4": {POS: NOUN, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Plur"},
+ "SAip5": {POS: NOUN, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Plur"},
+ "SAip6": {POS: NOUN, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Plur"},
+ "SAip7": {POS: NOUN, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Plur"},
+ "SAis1": {POS: NOUN, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Sing"},
+ "SAis2": {POS: NOUN, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Sing"},
+ "SAis3": {POS: NOUN, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Sing"},
+ "SAis4": {POS: NOUN, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Sing"},
+ "SAis5": {POS: NOUN, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Sing"},
+ "SAis6": {POS: NOUN, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Sing"},
+ "SAis7": {POS: NOUN, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Sing"},
+ "SAmp1": {POS: NOUN, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Plur"},
+ "SAmp2": {POS: NOUN, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Plur"},
+ "SAmp3": {POS: NOUN, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Plur"},
+ "SAmp4": {POS: NOUN, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Plur"},
+ "SAmp5": {POS: NOUN, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Plur"},
+ "SAmp6": {POS: NOUN, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Plur"},
+ "SAmp7": {POS: NOUN, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Plur"},
+ "SAms1": {POS: NOUN, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Adj|Number=Sing"},
+ "SAms2": {POS: NOUN, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Adj|Number=Sing"},
+ "SAms3": {POS: NOUN, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Adj|Number=Sing"},
+ "SAms4": {POS: NOUN, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Adj|Number=Sing"},
+ "SAms5": {POS: NOUN, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Adj|Number=Sing"},
+ "SAms6": {POS: NOUN, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Adj|Number=Sing"},
+ "SAms7": {POS: NOUN, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Adj|Number=Sing"},
+ "SAnp1": {POS: NOUN, "morph": "Case=Nom|Gender=Neut|MorphPos=Adj|Number=Plur"},
+ "SAnp2": {POS: NOUN, "morph": "Case=Gen|Gender=Neut|MorphPos=Adj|Number=Plur"},
+ "SAnp3": {POS: NOUN, "morph": "Case=Dat|Gender=Neut|MorphPos=Adj|Number=Plur"},
+ "SAnp4": {POS: NOUN, "morph": "Case=Acc|Gender=Neut|MorphPos=Adj|Number=Plur"},
+ "SAnp5": {POS: NOUN, "morph": "Case=Voc|Gender=Neut|MorphPos=Adj|Number=Plur"},
+ "SAnp6": {POS: NOUN, "morph": "Case=Loc|Gender=Neut|MorphPos=Adj|Number=Plur"},
+ "SAnp7": {POS: NOUN, "morph": "Case=Ins|Gender=Neut|MorphPos=Adj|Number=Plur"},
+ "SAns1": {POS: NOUN, "morph": "Case=Nom|Gender=Neut|MorphPos=Adj|Number=Sing"},
+ "SAns2": {POS: NOUN, "morph": "Case=Gen|Gender=Neut|MorphPos=Adj|Number=Sing"},
+ "SAns3": {POS: NOUN, "morph": "Case=Dat|Gender=Neut|MorphPos=Adj|Number=Sing"},
+ "SAns4": {POS: NOUN, "morph": "Case=Acc|Gender=Neut|MorphPos=Adj|Number=Sing"},
+ "SAns5": {POS: NOUN, "morph": "Case=Voc|Gender=Neut|MorphPos=Adj|Number=Sing"},
+ "SAns6": {POS: NOUN, "morph": "Case=Loc|Gender=Neut|MorphPos=Adj|Number=Sing"},
+ "SAns7": {POS: NOUN, "morph": "Case=Ins|Gender=Neut|MorphPos=Adj|Number=Sing"},
+ "SFfp1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Mix|Number=Plur"},
+ "SFfp2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Mix|Number=Plur"},
+ "SFfp3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Mix|Number=Plur"},
+ "SFfp4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Mix|Number=Plur"},
+ "SFfp5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Mix|Number=Plur"},
+ "SFfp6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Mix|Number=Plur"},
+ "SFfp7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Mix|Number=Plur"},
+ "SFfs1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Mix|Number=Sing"},
+ "SFfs2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Mix|Number=Sing"},
+ "SFfs3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Mix|Number=Sing"},
+ "SFfs4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Mix|Number=Sing"},
+ "SFfs5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Mix|Number=Sing"},
+ "SFfs6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Mix|Number=Sing"},
+ "SFfs7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Mix|Number=Sing"},
+ "SSfp1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Noun|Number=Plur"},
+ "SSfp2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Noun|Number=Plur"},
+ "SSfp3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Noun|Number=Plur"},
+ "SSfp4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Noun|Number=Plur"},
+ "SSfp5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Noun|Number=Plur"},
+ "SSfp6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Noun|Number=Plur"},
+ "SSfp7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Noun|Number=Plur"},
+ "SSfs1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Noun|Number=Sing"},
+ "SSfs2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Noun|Number=Sing"},
+ "SSfs3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Noun|Number=Sing"},
+ "SSfs4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Noun|Number=Sing"},
+ "SSfs5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Noun|Number=Sing"},
+ "SSfs6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Noun|Number=Sing"},
+ "SSfs7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Noun|Number=Sing"},
+ "SSip1": {POS: NOUN, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Noun|Number=Plur"},
+ "SSip2": {POS: NOUN, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Noun|Number=Plur"},
+ "SSip3": {POS: NOUN, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Noun|Number=Plur"},
+ "SSip4": {POS: NOUN, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Noun|Number=Plur"},
+ "SSip5": {POS: NOUN, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Noun|Number=Plur"},
+ "SSip6": {POS: NOUN, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Noun|Number=Plur"},
+ "SSip7": {POS: NOUN, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Noun|Number=Plur"},
+ "SSis1": {POS: NOUN, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Noun|Number=Sing"},
+ "SSis2": {POS: NOUN, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Noun|Number=Sing"},
+ "SSis3": {POS: NOUN, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Noun|Number=Sing"},
+ "SSis4": {POS: NOUN, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Noun|Number=Sing"},
+ "SSis5": {POS: NOUN, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Noun|Number=Sing"},
+ "SSis6": {POS: NOUN, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Noun|Number=Sing"},
+ "SSis7": {POS: NOUN, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Noun|Number=Sing"},
+ "SSmp1": {POS: NOUN, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Noun|Number=Plur"},
+ "SSmp2": {POS: NOUN, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Noun|Number=Plur"},
+ "SSmp3": {POS: NOUN, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Noun|Number=Plur"},
+ "SSmp4": {POS: NOUN, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Noun|Number=Plur"},
+ "SSmp5": {POS: NOUN, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Noun|Number=Plur"},
+ "SSmp6": {POS: NOUN, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Noun|Number=Plur"},
+ "SSmp7": {POS: NOUN, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Noun|Number=Plur"},
+ "SSms1": {POS: NOUN, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Noun|Number=Sing"},
+ "SSms2": {POS: NOUN, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Noun|Number=Sing"},
+ "SSms3": {POS: NOUN, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Noun|Number=Sing"},
+ "SSms4": {POS: NOUN, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Noun|Number=Sing"},
+ "SSms5": {POS: NOUN, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Noun|Number=Sing"},
+ "SSms6": {POS: NOUN, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Noun|Number=Sing"},
+ "SSms7": {POS: NOUN, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Noun|Number=Sing"},
+ "SSnp1": {POS: NOUN, "morph": "Case=Nom|Gender=Neut|MorphPos=Noun|Number=Plur"},
+ "SSnp2": {POS: NOUN, "morph": "Case=Gen|Gender=Neut|MorphPos=Noun|Number=Plur"},
+ "SSnp3": {POS: NOUN, "morph": "Case=Dat|Gender=Neut|MorphPos=Noun|Number=Plur"},
+ "SSnp4": {POS: NOUN, "morph": "Case=Acc|Gender=Neut|MorphPos=Noun|Number=Plur"},
+ "SSnp5": {POS: NOUN, "morph": "Case=Voc|Gender=Neut|MorphPos=Noun|Number=Plur"},
+ "SSnp6": {POS: NOUN, "morph": "Case=Loc|Gender=Neut|MorphPos=Noun|Number=Plur"},
+ "SSnp7": {POS: NOUN, "morph": "Case=Ins|Gender=Neut|MorphPos=Noun|Number=Plur"},
+ "SSns1": {POS: NOUN, "morph": "Case=Nom|Gender=Neut|MorphPos=Noun|Number=Sing"},
+ "SSns2": {POS: NOUN, "morph": "Case=Gen|Gender=Neut|MorphPos=Noun|Number=Sing"},
+ "SSns3": {POS: NOUN, "morph": "Case=Dat|Gender=Neut|MorphPos=Noun|Number=Sing"},
+ "SSns4": {POS: NOUN, "morph": "Case=Acc|Gender=Neut|MorphPos=Noun|Number=Sing"},
+ "SSns5": {POS: NOUN, "morph": "Case=Voc|Gender=Neut|MorphPos=Noun|Number=Sing"},
+ "SSns6": {POS: NOUN, "morph": "Case=Loc|Gender=Neut|MorphPos=Noun|Number=Sing"},
+ "SSns7": {POS: NOUN, "morph": "Case=Ins|Gender=Neut|MorphPos=Noun|Number=Sing"},
+ "SUfp1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Def|Number=Plur"},
+ "SUfp2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Def|Number=Plur"},
+ "SUfp3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Def|Number=Plur"},
+ "SUfp4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Def|Number=Plur"},
+ "SUfp5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Def|Number=Plur"},
+ "SUfp6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Def|Number=Plur"},
+ "SUfp7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Def|Number=Plur"},
+ "SUfs1": {POS: NOUN, "morph": "Case=Nom|Gender=Fem|MorphPos=Def|Number=Sing"},
+ "SUfs2": {POS: NOUN, "morph": "Case=Gen|Gender=Fem|MorphPos=Def|Number=Sing"},
+ "SUfs3": {POS: NOUN, "morph": "Case=Dat|Gender=Fem|MorphPos=Def|Number=Sing"},
+ "SUfs4": {POS: NOUN, "morph": "Case=Acc|Gender=Fem|MorphPos=Def|Number=Sing"},
+ "SUfs5": {POS: NOUN, "morph": "Case=Voc|Gender=Fem|MorphPos=Def|Number=Sing"},
+ "SUfs6": {POS: NOUN, "morph": "Case=Loc|Gender=Fem|MorphPos=Def|Number=Sing"},
+ "SUfs7": {POS: NOUN, "morph": "Case=Ins|Gender=Fem|MorphPos=Def|Number=Sing"},
+ "SUip1": {POS: NOUN, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Def|Number=Plur"},
+ "SUip2": {POS: NOUN, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Def|Number=Plur"},
+ "SUip3": {POS: NOUN, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Def|Number=Plur"},
+ "SUip4": {POS: NOUN, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Def|Number=Plur"},
+ "SUip5": {POS: NOUN, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Def|Number=Plur"},
+ "SUip6": {POS: NOUN, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Def|Number=Plur"},
+ "SUip7": {POS: NOUN, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Def|Number=Plur"},
+ "SUis1": {POS: NOUN, "morph": "Animacy=Inan|Case=Nom|Gender=Masc|MorphPos=Def|Number=Sing"},
+ "SUis2": {POS: NOUN, "morph": "Animacy=Inan|Case=Gen|Gender=Masc|MorphPos=Def|Number=Sing"},
+ "SUis3": {POS: NOUN, "morph": "Animacy=Inan|Case=Dat|Gender=Masc|MorphPos=Def|Number=Sing"},
+ "SUis4": {POS: NOUN, "morph": "Animacy=Inan|Case=Acc|Gender=Masc|MorphPos=Def|Number=Sing"},
+ "SUis5": {POS: NOUN, "morph": "Animacy=Inan|Case=Voc|Gender=Masc|MorphPos=Def|Number=Sing"},
+ "SUis6": {POS: NOUN, "morph": "Animacy=Inan|Case=Loc|Gender=Masc|MorphPos=Def|Number=Sing"},
+ "SUis7": {POS: NOUN, "morph": "Animacy=Inan|Case=Ins|Gender=Masc|MorphPos=Def|Number=Sing"},
+ "SUmp1": {POS: NOUN, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Def|Number=Plur"},
+ "SUmp2": {POS: NOUN, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Def|Number=Plur"},
+ "SUmp3": {POS: NOUN, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Def|Number=Plur"},
+ "SUmp4": {POS: NOUN, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Def|Number=Plur"},
+ "SUmp5": {POS: NOUN, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Def|Number=Plur"},
+ "SUmp6": {POS: NOUN, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Def|Number=Plur"},
+ "SUmp7": {POS: NOUN, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Def|Number=Plur"},
+ "SUms1": {POS: NOUN, "morph": "Animacy=Anim|Case=Nom|Gender=Masc|MorphPos=Def|Number=Sing"},
+ "SUms2": {POS: NOUN, "morph": "Animacy=Anim|Case=Gen|Gender=Masc|MorphPos=Def|Number=Sing"},
+ "SUms3": {POS: NOUN, "morph": "Animacy=Anim|Case=Dat|Gender=Masc|MorphPos=Def|Number=Sing"},
+ "SUms4": {POS: NOUN, "morph": "Animacy=Anim|Case=Acc|Gender=Masc|MorphPos=Def|Number=Sing"},
+ "SUms5": {POS: NOUN, "morph": "Animacy=Anim|Case=Voc|Gender=Masc|MorphPos=Def|Number=Sing"},
+ "SUms6": {POS: NOUN, "morph": "Animacy=Anim|Case=Loc|Gender=Masc|MorphPos=Def|Number=Sing"},
+ "SUms7": {POS: NOUN, "morph": "Animacy=Anim|Case=Ins|Gender=Masc|MorphPos=Def|Number=Sing"},
+ "SUnp1": {POS: NOUN, "morph": "Case=Nom|Gender=Neut|MorphPos=Def|Number=Plur"},
+ "SUnp2": {POS: NOUN, "morph": "Case=Gen|Gender=Neut|MorphPos=Def|Number=Plur"},
+ "SUnp3": {POS: NOUN, "morph": "Case=Dat|Gender=Neut|MorphPos=Def|Number=Plur"},
+ "SUnp4": {POS: NOUN, "morph": "Case=Acc|Gender=Neut|MorphPos=Def|Number=Plur"},
+ "SUnp5": {POS: NOUN, "morph": "Case=Voc|Gender=Neut|MorphPos=Def|Number=Plur"},
+ "SUnp6": {POS: NOUN, "morph": "Case=Loc|Gender=Neut|MorphPos=Def|Number=Plur"},
+ "SUnp7": {POS: NOUN, "morph": "Case=Ins|Gender=Neut|MorphPos=Def|Number=Plur"},
+ "SUns1": {POS: NOUN, "morph": "Case=Nom|Gender=Neut|MorphPos=Def|Number=Sing"},
+ "SUns2": {POS: NOUN, "morph": "Case=Gen|Gender=Neut|MorphPos=Def|Number=Sing"},
+ "SUns3": {POS: NOUN, "morph": "Case=Dat|Gender=Neut|MorphPos=Def|Number=Sing"},
+ "SUns4": {POS: NOUN, "morph": "Case=Acc|Gender=Neut|MorphPos=Def|Number=Sing"},
+ "SUns5": {POS: NOUN, "morph": "Case=Voc|Gender=Neut|MorphPos=Def|Number=Sing"},
+ "SUns6": {POS: NOUN, "morph": "Case=Loc|Gender=Neut|MorphPos=Def|Number=Sing"},
+ "SUns7": {POS: NOUN, "morph": "Case=Ins|Gender=Neut|MorphPos=Def|Number=Sing"},
+ "T": {POS: PART, "morph": "_"},
+ "TY": {POS: PART, "morph": "Mood=Cnd"},
+ "VBepa-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=1|Polarity=Neg|Tense=Fut|VerbForm=Fin"},
+ "VBepa+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=1|Polarity=Pos|Tense=Fut|VerbForm=Fin"},
+ "VBepb-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=2|Polarity=Neg|Tense=Fut|VerbForm=Fin"},
+ "VBepb+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=2|Polarity=Pos|Tense=Fut|VerbForm=Fin"},
+ "VBepc-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=3|Polarity=Neg|Tense=Fut|VerbForm=Fin"},
+ "VBepc+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=3|Polarity=Pos|Tense=Fut|VerbForm=Fin"},
+ "VBesa-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Fut|VerbForm=Fin"},
+ "VBesa+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Polarity=Pos|Tense=Fut|VerbForm=Fin"},
+ "VBesb-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Polarity=Neg|Tense=Fut|VerbForm=Fin"},
+ "VBesb+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Polarity=Pos|Tense=Fut|VerbForm=Fin"},
+ "VBesc-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Polarity=Neg|Tense=Fut|VerbForm=Fin"},
+ "VBesc+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Tense=Fut|VerbForm=Fin"},
+ "VBjpa-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=1|Polarity=Neg|Tense=Fut|VerbForm=Fin"},
+ "VBjpa+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=1|Polarity=Pos|Tense=Fut|VerbForm=Fin"},
+ "VBjpb-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=2|Polarity=Neg|Tense=Fut|VerbForm=Fin"},
+ "VBjpb+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=2|Polarity=Pos|Tense=Fut|VerbForm=Fin"},
+ "VBjpc-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=3|Polarity=Neg|Tense=Fut|VerbForm=Fin"},
+ "VBjpc+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=3|Polarity=Pos|Tense=Fut|VerbForm=Fin"},
+ "VBjsa-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Fut|VerbForm=Fin"},
+ "VBjsa+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=1|Polarity=Pos|Tense=Fut|VerbForm=Fin"},
+ "VBjsb-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=2|Polarity=Neg|Tense=Fut|VerbForm=Fin"},
+ "VBjsb+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=2|Polarity=Pos|Tense=Fut|VerbForm=Fin"},
+ "VBjsc-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=3|Polarity=Neg|Tense=Fut|VerbForm=Fin"},
+ "VBjsc+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Tense=Fut|VerbForm=Fin"},
+ "VHd-": {POS: VERB, "morph": "Aspect=Perf|Polarity=Neg|VerbForm=Conv"},
+ "VHd+": {POS: VERB, "morph": "Aspect=Perf|Polarity=Pos|VerbForm=Conv"},
+ "VHe-": {POS: VERB, "morph": "Aspect=Imp|Polarity=Neg|VerbForm=Conv"},
+ "VHe+": {POS: VERB, "morph": "Aspect=Imp|Polarity=Pos|VerbForm=Conv"},
+ "VHj-": {POS: VERB, "morph": "Aspect=Imp,Perf|Polarity=Neg|VerbForm=Conv"},
+ "VHj+": {POS: VERB, "morph": "Aspect=Imp,Perf|Polarity=Pos|VerbForm=Conv"},
+ "VId-": {POS: VERB, "morph": "Aspect=Perf|Polarity=Neg|VerbForm=Inf"},
+ "VId+": {POS: VERB, "morph": "Aspect=Perf|Polarity=Pos|VerbForm=Inf"},
+ "VIe-": {POS: VERB, "morph": "Aspect=Imp|Polarity=Neg|VerbForm=Inf"},
+ "VIe+": {POS: VERB, "morph": "Aspect=Imp|Polarity=Pos|VerbForm=Inf"},
+ "VIj-": {POS: VERB, "morph": "Aspect=Imp,Perf|Polarity=Neg|VerbForm=Inf"},
+ "VIj+": {POS: VERB, "morph": "Aspect=Imp,Perf|Polarity=Pos|VerbForm=Inf"},
+ "VKdpa-": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Plur|Person=1|Polarity=Neg|Tense=Pres|VerbForm=Fin"},
+ "VKdpa+": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Plur|Person=1|Polarity=Pos|Tense=Pres|VerbForm=Fin"},
+ "VKdpb-": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Plur|Person=2|Polarity=Neg|Tense=Pres|VerbForm=Fin"},
+ "VKdpb+": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Plur|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin"},
+ "VKdpc-": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Plur|Person=3|Polarity=Neg|Tense=Pres|VerbForm=Fin"},
+ "VKdpc+": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Plur|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin"},
+ "VKdsa-": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Pres|VerbForm=Fin"},
+ "VKdsa+": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Sing|Person=1|Polarity=Pos|Tense=Pres|VerbForm=Fin"},
+ "VKdsb-": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Sing|Person=2|Polarity=Neg|Tense=Pres|VerbForm=Fin"},
+ "VKdsb+": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Sing|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin"},
+ "VKdsc-": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Sing|Person=3|Polarity=Neg|Tense=Pres|VerbForm=Fin"},
+ "VKdsc+": {POS: VERB, "morph": "Aspect=Perf|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin"},
+ "VKe-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Polarity=Neg|Tense=Pres|VerbForm=Fin"},
+ "VKepa-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=1|Polarity=Neg|Tense=Pres|VerbForm=Fin"},
+ "VKepa+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=1|Polarity=Pos|Tense=Pres|VerbForm=Fin"},
+ "VKepb-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=2|Polarity=Neg|Tense=Pres|VerbForm=Fin"},
+ "VKepb+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin"},
+ "VKepc-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=3|Polarity=Neg|Tense=Pres|VerbForm=Fin"},
+ "VKepc+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Plur|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin"},
+ "VKesa-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Pres|VerbForm=Fin"},
+ "VKesa+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Polarity=Pos|Tense=Pres|VerbForm=Fin"},
+ "VKesb-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Polarity=Neg|Tense=Pres|VerbForm=Fin"},
+ "VKesb+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin"},
+ "VKesc-": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Polarity=Neg|Tense=Pres|VerbForm=Fin"},
+ "VKesc+": {POS: VERB, "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin"},
+ "VKjpa-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=1|Polarity=Neg|Tense=Pres|VerbForm=Fin"},
+ "VKjpa+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=1|Polarity=Pos|Tense=Pres|VerbForm=Fin"},
+ "VKjpb-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=2|Polarity=Neg|Tense=Pres|VerbForm=Fin"},
+ "VKjpb+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin"},
+ "VKjpc-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=3|Polarity=Neg|Tense=Pres|VerbForm=Fin"},
+ "VKjpc+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Plur|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin"},
+ "VKjsa-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=1|Polarity=Neg|Tense=Pres|VerbForm=Fin"},
+ "VKjsa+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=1|Polarity=Pos|Tense=Pres|VerbForm=Fin"},
+ "VKjsb-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=2|Polarity=Neg|Tense=Pres|VerbForm=Fin"},
+ "VKjsb+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin"},
+ "VKjsc-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=3|Polarity=Neg|Tense=Pres|VerbForm=Fin"},
+ "VKjsc+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin"},
+ "VLdpah-": {POS: VERB, "morph": "Aspect=Perf|Number=Plur|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"},
+ "VLdpah+": {POS: VERB, "morph": "Aspect=Perf|Number=Plur|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"},
+ "VLdpbh-": {POS: VERB, "morph": "Aspect=Perf|Number=Plur|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"},
+ "VLdpbh+": {POS: VERB, "morph": "Aspect=Perf|Number=Plur|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"},
+ "VLdpcf-": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"},
+ "VLdpcf+": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"},
+ "VLdpci-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"},
+ "VLdpci+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"},
+ "VLdpcm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"},
+ "VLdpcm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"},
+ "VLdpcn-": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"},
+ "VLdpcn+": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"},
+ "VLdsaf-": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"},
+ "VLdsaf+": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"},
+ "VLdsai-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"},
+ "VLdsai+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"},
+ "VLdsam-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"},
+ "VLdsam+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"},
+ "VLdsan-": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"},
+ "VLdsan+": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"},
+ "VLdsbf-": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"},
+ "VLdsbf+": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"},
+ "VLdsbi-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"},
+ "VLdsbi+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"},
+ "VLdsbm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"},
+ "VLdsbm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"},
+ "VLdsbn-": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"},
+ "VLdsbn+": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"},
+ "VLdscf-": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"},
+ "VLdscf+": {POS: VERB, "morph": "Aspect=Perf|Gender=Fem|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"},
+ "VLdsci-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"},
+ "VLdsci+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"},
+ "VLdscm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"},
+ "VLdscm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"},
+ "VLdscn-": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"},
+ "VLdscn+": {POS: VERB, "morph": "Aspect=Perf|Gender=Neut|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"},
+ "VLepah-": {POS: VERB, "morph": "Aspect=Imp|Number=Plur|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"},
+ "VLepah+": {POS: VERB, "morph": "Aspect=Imp|Number=Plur|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"},
+ "VLepbh-": {POS: VERB, "morph": "Aspect=Imp|Number=Plur|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"},
+ "VLepbh+": {POS: VERB, "morph": "Aspect=Imp|Number=Plur|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"},
+ "VLepcf-": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"},
+ "VLepcf+": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"},
+ "VLepci-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"},
+ "VLepci+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"},
+ "VLepcm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"},
+ "VLepcm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"},
+ "VLepcn-": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"},
+ "VLepcn+": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"},
+ "VLesaf-": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"},
+ "VLesaf+": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"},
+ "VLesai-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"},
+ "VLesai+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"},
+ "VLesam-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"},
+ "VLesam+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"},
+ "VLesan-": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"},
+ "VLesan+": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"},
+ "VLesbf-": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"},
+ "VLesbf+": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"},
+ "VLesbi-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"},
+ "VLesbi+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"},
+ "VLesbm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"},
+ "VLesbm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"},
+ "VLesbn-": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"},
+ "VLesbn+": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"},
+ "VLescf-": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"},
+ "VLescf+": {POS: VERB, "morph": "Aspect=Imp|Gender=Fem|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"},
+ "VLesci-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"},
+ "VLesci+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp|Gender=Masc|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"},
+ "VLescm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"},
+ "VLescm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp|Gender=Masc|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"},
+ "VLescn-": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"},
+ "VLescn+": {POS: VERB, "morph": "Aspect=Imp|Gender=Neut|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"},
+ "VLjpah-": {POS: VERB, "morph": "Aspect=Imp,Perf|Number=Plur|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"},
+ "VLjpah+": {POS: VERB, "morph": "Aspect=Imp,Perf|Number=Plur|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"},
+ "VLjpbh-": {POS: VERB, "morph": "Aspect=Imp,Perf|Number=Plur|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"},
+ "VLjpbh+": {POS: VERB, "morph": "Aspect=Imp,Perf|Number=Plur|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"},
+ "VLjpcf-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"},
+ "VLjpcf+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"},
+ "VLjpci-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"},
+ "VLjpci+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"},
+ "VLjpcm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"},
+ "VLjpcm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"},
+ "VLjpcn-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Plur|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"},
+ "VLjpcn+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Plur|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"},
+ "VLjsaf-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"},
+ "VLjsaf+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"},
+ "VLjsai-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"},
+ "VLjsai+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"},
+ "VLjsam-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"},
+ "VLjsam+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"},
+ "VLjsan-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Sing|Person=1|Polarity=Neg|Tense=Past|VerbForm=Part"},
+ "VLjsan+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Sing|Person=1|Polarity=Pos|Tense=Past|VerbForm=Part"},
+ "VLjsbf-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"},
+ "VLjsbf+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"},
+ "VLjsbi-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"},
+ "VLjsbi+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"},
+ "VLjsbm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"},
+ "VLjsbm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"},
+ "VLjsbn-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Sing|Person=2|Polarity=Neg|Tense=Past|VerbForm=Part"},
+ "VLjsbn+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Sing|Person=2|Polarity=Pos|Tense=Past|VerbForm=Part"},
+ "VLjscf-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"},
+ "VLjscf+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Fem|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"},
+ "VLjsci-": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"},
+ "VLjsci+": {POS: VERB, "morph": "Animacy=Inan|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"},
+ "VLjscm-": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"},
+ "VLjscm+": {POS: VERB, "morph": "Animacy=Anim|Aspect=Imp,Perf|Gender=Masc|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"},
+ "VLjscn-": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Sing|Person=3|Polarity=Neg|Tense=Past|VerbForm=Part"},
+ "VLjscn+": {POS: VERB, "morph": "Aspect=Imp,Perf|Gender=Neut|Number=Sing|Person=3|Polarity=Pos|Tense=Past|VerbForm=Part"},
+ "VMdpa-": {POS: VERB, "morph": "Aspect=Perf|Mood=Imp|Number=Plur|Person=1|Polarity=Neg|VerbForm=Fin"},
+ "VMdpa+": {POS: VERB, "morph": "Aspect=Perf|Mood=Imp|Number=Plur|Person=1|Polarity=Pos|VerbForm=Fin"},
+ "VMdpb-": {POS: VERB, "morph": "Aspect=Perf|Mood=Imp|Number=Plur|Person=2|Polarity=Neg|VerbForm=Fin"},
+ "VMdpb+": {POS: VERB, "morph": "Aspect=Perf|Mood=Imp|Number=Plur|Person=2|Polarity=Pos|VerbForm=Fin"},
+ "VMdsb-": {POS: VERB, "morph": "Aspect=Perf|Mood=Imp|Number=Sing|Person=2|Polarity=Neg|VerbForm=Fin"},
+ "VMdsb+": {POS: VERB, "morph": "Aspect=Perf|Mood=Imp|Number=Sing|Person=2|Polarity=Pos|VerbForm=Fin"},
+ "VMepa-": {POS: VERB, "morph": "Aspect=Imp|Mood=Imp|Number=Plur|Person=1|Polarity=Neg|VerbForm=Fin"},
+ "VMepa+": {POS: VERB, "morph": "Aspect=Imp|Mood=Imp|Number=Plur|Person=1|Polarity=Pos|VerbForm=Fin"},
+ "VMepb-": {POS: VERB, "morph": "Aspect=Imp|Mood=Imp|Number=Plur|Person=2|Polarity=Neg|VerbForm=Fin"},
+ "VMepb+": {POS: VERB, "morph": "Aspect=Imp|Mood=Imp|Number=Plur|Person=2|Polarity=Pos|VerbForm=Fin"},
+ "VMesb-": {POS: VERB, "morph": "Aspect=Imp|Mood=Imp|Number=Sing|Person=2|Polarity=Neg|VerbForm=Fin"},
+ "VMesb+": {POS: VERB, "morph": "Aspect=Imp|Mood=Imp|Number=Sing|Person=2|Polarity=Pos|VerbForm=Fin"},
+ "VMjpa-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Imp|Number=Plur|Person=1|Polarity=Neg|VerbForm=Fin"},
+ "VMjpa+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Imp|Number=Plur|Person=1|Polarity=Pos|VerbForm=Fin"},
+ "VMjpb-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Imp|Number=Plur|Person=2|Polarity=Neg|VerbForm=Fin"},
+ "VMjpb+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Imp|Number=Plur|Person=2|Polarity=Pos|VerbForm=Fin"},
+ "VMjsb-": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Imp|Number=Sing|Person=2|Polarity=Neg|VerbForm=Fin"},
+ "VMjsb+": {POS: VERB, "morph": "Aspect=Imp,Perf|Mood=Imp|Number=Sing|Person=2|Polarity=Pos|VerbForm=Fin"},
+ "W": {POS: X, "morph": "Abbr=Yes"},
+ "Y": {POS: AUX, "morph": "Mood=Cnd"},
}
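Each key in the tag map above is a positional SNK-style tag whose value pairs a coarse part-of-speech with a pipe-separated morphology string. A minimal sketch of how one such entry decodes into a feature dict (one entry copied from the diff; POS shown as a plain string here rather than spaCy's symbol constant):

    # Decode a single tag-map entry into (coarse POS, morphological feature dict).
    TAG_MAP_SAMPLE = {
        "SSfs1": {"POS": "NOUN", "morph": "Case=Nom|Gender=Fem|MorphPos=Noun|Number=Sing"},
    }

    def decode(tag):
        entry = TAG_MAP_SAMPLE[tag]
        feats = dict(pair.split("=") for pair in entry["morph"].split("|"))
        return entry["POS"], feats

    pos, feats = decode("SSfs1")
    assert pos == "NOUN"
    assert feats == {"Case": "Nom", "Gender": "Fem", "MorphPos": "Noun", "Number": "Sing"}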
diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index 26504a3c0..8e55d3873 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -77,7 +77,7 @@ cdef class Parser:
tok2vec = Tok2Vec(width=token_vector_width,
embed_size=embed_size,
conv_depth=conv_depth,
- window_size=window_size,
+ window_size=conv_window,
cnn_maxout_pieces=t2v_pieces,
subword_features=subword_features,
pretrained_vectors=pretrained_vectors,
@@ -105,7 +105,7 @@ cdef class Parser:
'bilstm_depth': bilstm_depth,
'self_attn_depth': self_attn_depth,
'conv_depth': conv_depth,
- 'window_size': window_size,
+ 'window_size': conv_window,
'embed_size': embed_size,
'cnn_maxout_pieces': t2v_pieces
}
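The hunk above only renames the local keyword argument from window_size to conv_window; the stored config key stays "window_size". A small sketch of why that keeps serialized settings compatible (hypothetical make_cfg helper, parameter names taken from the diff):

    def make_cfg(conv_window=1, conv_depth=4, embed_size=2000):
        # The Python-level name changed, but the persisted key did not,
        # so previously saved models still round-trip the same setting.
        return {
            "window_size": conv_window,
            "conv_depth": conv_depth,
            "embed_size": embed_size,
        }

    assert make_cfg(conv_window=3)["window_size"] == 3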
diff --git a/spacy/tests/regression/test_issue4849.py b/spacy/tests/regression/test_issue4849.py
index 834219773..ddbf6f7a0 100644
--- a/spacy/tests/regression/test_issue4849.py
+++ b/spacy/tests/regression/test_issue4849.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from spacy.lang.en import English
from spacy.pipeline import EntityRuler
@@ -9,11 +6,12 @@ def test_issue4849():
nlp = English()
ruler = EntityRuler(
- nlp, patterns=[
- {"label": "PERSON", "pattern": 'joe biden', "id": 'joe-biden'},
- {"label": "PERSON", "pattern": 'bernie sanders', "id": 'bernie-sanders'},
+ nlp,
+ patterns=[
+ {"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"},
+ {"label": "PERSON", "pattern": "bernie sanders", "id": "bernie-sanders"},
],
- phrase_matcher_attr="LOWER"
+ phrase_matcher_attr="LOWER",
)
nlp.add_pipe(ruler)
@@ -27,10 +25,10 @@ def test_issue4849():
count_ents = 0
for doc in nlp.pipe([text], n_process=1):
count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
- assert(count_ents == 2)
+ assert count_ents == 2
# USING 2 PROCESSES
count_ents = 0
for doc in nlp.pipe([text], n_process=2):
count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
- assert (count_ents == 2)
+ assert count_ents == 2
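The test above drops the parentheses around the assert condition. Parenthesising a single expression is harmless, but it invites the classic bug of adding a message inside the parentheses, which turns the condition into a two-element tuple that is always truthy. A short standalone illustration, not part of the patch:

    count_ents = 2
    assert count_ents == 2                    # fails correctly when the count is wrong
    # assert (count_ents == 3, "expected 3")  # WRONG: a non-empty tuple is truthy, never raises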
diff --git a/spacy/tests/regression/test_issue4924.py b/spacy/tests/regression/test_issue4924.py
index 8aea2c3d5..5665d6d0f 100644
--- a/spacy/tests/regression/test_issue4924.py
+++ b/spacy/tests/regression/test_issue4924.py
@@ -1,16 +1,9 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import pytest
-
-import spacy
+from spacy.language import Language
-@pytest.fixture
-def nlp():
- return spacy.blank("en")
-
-
-def test_evaluate(nlp):
+def test_evaluate():
+ nlp = Language()
docs_golds = [("", {})]
- nlp.evaluate(docs_golds)
+ with pytest.raises(ValueError):
+ nlp.evaluate(docs_golds)
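The rewritten test expects the evaluate call on the empty example to raise ValueError, using pytest's context-manager idiom. A self-contained sketch of that idiom with a hypothetical helper standing in for the call under test:

    import pytest

    def parse_positive(value):
        # Hypothetical helper; raises just like the code under test is expected to.
        if value <= 0:
            raise ValueError("expected a positive value")
        return value

    def test_parse_positive_raises():
        with pytest.raises(ValueError):
            parse_positive(0)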
From e3f40a6a0f590088d16dbdbc252d9304cf482cfc Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Tue, 18 Feb 2020 15:38:18 +0100
Subject: [PATCH 067/496] Tidy up and auto-format
---
spacy/__init__.py | 2 +-
spacy/cli/__init__.py | 2 +-
spacy/cli/debug_data.py | 6 +--
spacy/cli/pretrain.py | 10 ++---
spacy/cli/train.py | 26 ++++++-------
spacy/cli/train_from_config.py | 36 ++++++++----------
spacy/compat.py | 2 +-
spacy/displacy/templates.py | 1 -
spacy/glossary.py | 1 -
spacy/gold.pxd | 4 +-
spacy/kb.pxd | 5 +--
spacy/lang/af/stop_words.py | 1 -
spacy/lang/bg/stop_words.py | 1 -
spacy/lang/bn/examples.py | 1 -
spacy/lang/bn/stop_words.py | 1 -
spacy/lang/ca/examples.py | 1 -
spacy/lang/cs/stop_words.py | 1 -
spacy/lang/da/examples.py | 1 -
spacy/lang/de/examples.py | 1 -
spacy/lang/de/stop_words.py | 3 +-
spacy/lang/el/get_pos_from_wiktionary.py | 1 -
spacy/lang/el/norm_exceptions.py | 1 -
spacy/lang/el/stop_words.py | 1 -
spacy/lang/en/examples.py | 1 -
spacy/lang/en/norm_exceptions.py | 1 -
spacy/lang/en/stop_words.py | 1 -
spacy/lang/es/examples.py | 1 -
spacy/lang/es/stop_words.py | 1 -
spacy/lang/et/stop_words.py | 1 -
spacy/lang/fa/examples.py | 1 -
spacy/lang/fa/generate_verbs_exc.py | 1 -
spacy/lang/fa/stop_words.py | 1 -
spacy/lang/fi/stop_words.py | 1 -
spacy/lang/fr/examples.py | 1 -
spacy/lang/fr/stop_words.py | 1 -
spacy/lang/ga/irish_morphology_helpers.py | 1 -
spacy/lang/he/examples.py | 1 -
spacy/lang/hi/examples.py | 1 -
spacy/lang/hi/stop_words.py | 1 -
spacy/lang/hu/examples.py | 1 -
spacy/lang/hu/stop_words.py | 1 -
spacy/lang/id/examples.py | 1 -
spacy/lang/is/stop_words.py | 1 -
spacy/lang/it/examples.py | 1 -
spacy/lang/it/stop_words.py | 1 -
spacy/lang/ja/examples.py | 1 -
spacy/lang/kn/stop_words.py | 1 -
spacy/lang/lt/examples.py | 1 -
spacy/lang/lv/stop_words.py | 1 -
spacy/lang/mr/stop_words.py | 1 -
spacy/lang/nb/examples.py | 1 -
spacy/lang/nl/examples.py | 1 -
spacy/lang/norm_exceptions.py | 1 -
spacy/lang/pl/examples.py | 1 -
spacy/lang/pt/examples.py | 1 -
spacy/lang/pt/stop_words.py | 1 -
spacy/lang/ro/examples.py | 1 -
spacy/lang/ru/examples.py | 1 -
spacy/lang/ru/norm_exceptions.py | 1 -
spacy/lang/si/examples.py | 1 -
spacy/lang/si/stop_words.py | 1 -
spacy/lang/sk/stop_words.py | 1 -
spacy/lang/sl/stop_words.py | 1 -
spacy/lang/sq/examples.py | 1 -
spacy/lang/sq/stop_words.py | 1 -
spacy/lang/sr/examples.py | 1 -
spacy/lang/sr/norm_exceptions.py | 1 -
spacy/lang/sr/stop_words.py | 1 -
spacy/lang/sv/examples.py | 1 -
spacy/lang/sv/stop_words.py | 1 -
spacy/lang/ta/examples.py | 1 -
spacy/lang/ta/stop_words.py | 1 -
spacy/lang/te/examples.py | 1 -
spacy/lang/th/norm_exceptions.py | 1 -
spacy/lang/tokenizer_exceptions.py | 2 +-
spacy/lang/tr/examples.py | 1 -
spacy/lang/uk/examples.py | 1 -
spacy/lang/ur/examples.py | 1 -
spacy/lang/xx/__init__.py | 1 -
spacy/lang/xx/examples.py | 1 -
spacy/lang/yo/examples.py | 1 -
spacy/lang/zh/examples.py | 1 -
spacy/language.py | 2 +-
spacy/lexeme.pyx | 2 +-
spacy/ml/_character_embed.py | 8 ++--
spacy/ml/_layers.py | 17 ++++-----
spacy/ml/component_models.py | 23 ++++++-----
spacy/ml/extract_ngrams.py | 7 +---
spacy/ml/tok2vec.py | 38 +++++++++++--------
spacy/pipeline/hooks.py | 5 +--
spacy/pipeline/morphologizer.pyx | 4 +-
spacy/pipeline/pipes.pyx | 8 ++--
spacy/pipeline/tok2vec.py | 17 ++++++---
spacy/syntax/_parser_model.pyx | 4 +-
spacy/syntax/nn_parser.pyx | 18 ++++-----
spacy/syntax/nonproj.pyx | 2 +-
spacy/tests/doc/test_doc_api.py | 15 +++++++-
spacy/tests/doc/test_morphanalysis.py | 4 +-
spacy/tests/doc/test_retokenize_merge.py | 7 +++-
spacy/tests/lang/ar/test_text.py | 1 -
spacy/tests/lang/en/test_indices.py | 1 -
spacy/tests/lang/fi/test_tokenizer.py | 12 ++----
spacy/tests/lang/hu/test_tokenizer.py | 16 ++++----
spacy/tests/lang/sv/test_text.py | 1 -
spacy/tests/lang/zh/test_text.py | 1 -
.../tests/morphology/test_morph_converters.py | 1 -
spacy/tests/morphology/test_morph_features.py | 11 +++++-
spacy/tests/parser/test_add_label.py | 3 +-
spacy/tests/parser/test_ner.py | 2 +-
spacy/tests/parser/test_preset_sbd.py | 3 +-
spacy/tests/pipeline/test_entity_ruler.py | 7 +---
spacy/tests/pipeline/test_tagger.py | 1 -
spacy/tests/regression/test_issue1501-2000.py | 4 +-
spacy/tests/regression/test_issue3611.py | 5 +--
spacy/tests/regression/test_issue4030.py | 5 +--
spacy/tests/test_architectures.py | 2 +-
spacy/tests/test_cli.py | 18 ++++++---
spacy/tests/tokenizer/test_exceptions.py | 4 +-
spacy/tests/tokenizer/test_tokenizer.py | 14 ++++++-
spacy/tokens/_retokenize.pyx | 2 +-
spacy/tokens/_serialize.py | 2 +-
spacy/tokens/doc.pyx | 3 +-
spacy/tokens/span.pyx | 2 +-
spacy/tokens/token.pyx | 2 +-
spacy/util.py | 16 +++++---
spacy/vectors.pyx | 3 +-
spacy/vocab.pyx | 2 +-
127 files changed, 219 insertions(+), 275 deletions(-)
diff --git a/spacy/__init__.py b/spacy/__init__.py
index 4a311ec86..2c063ce24 100644
--- a/spacy/__init__.py
+++ b/spacy/__init__.py
@@ -5,7 +5,7 @@ warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")
# These are imported as part of the API
-from thinc.util import prefer_gpu, require_gpu
+from thinc.api import prefer_gpu, require_gpu
from . import pipeline
from .cli.info import info as cli_info
diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py
index 0f7677fd2..585eaea51 100644
--- a/spacy/cli/__init__.py
+++ b/spacy/cli/__init__.py
@@ -4,7 +4,7 @@ from .link import link # noqa: F401
from .package import package # noqa: F401
from .profile import profile # noqa: F401
from .train import train # noqa: F401
-from .train_from_config import train_from_config_cli # noqa: F401
+from .train_from_config import train_from_config_cli # noqa: F401
from .pretrain import pretrain # noqa: F401
from .debug_data import debug_data # noqa: F401
from .evaluate import evaluate # noqa: F401
diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index 4bcafce24..1705bf446 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -192,11 +192,7 @@ def debug_data(
has_ws_ents_error = True
if gold_train_data["punct_ents"]:
- msg.warn(
- "{} entity span(s) with punctuation".format(
- gold_train_data["punct_ents"]
- )
- )
+ msg.warn(f"{gold_train_data['punct_ents']} entity span(s) with punctuation")
has_punct_ents_warning = True
for label in new_labels:
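Most hunks in this commit replace str.format calls with f-strings. The two forms produce identical output; the only wrinkle is that a subscripted key inside an f-string uses the opposite quote style, as in the debug_data change above. A quick check with illustrative data:

    data = {"punct_ents": 3}
    old = "{} entity span(s) with punctuation".format(data["punct_ents"])
    new = f"{data['punct_ents']} entity span(s) with punctuation"  # note the inner single quotes
    assert old == new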
diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py
index 2cef378c0..690e3107d 100644
--- a/spacy/cli/pretrain.py
+++ b/spacy/cli/pretrain.py
@@ -4,14 +4,12 @@ import time
import re
from collections import Counter
from pathlib import Path
-from thinc.layers import Linear, Maxout
-from thinc.util import prefer_gpu
+from thinc.api import Linear, Maxout, chain, list2array, prefer_gpu
+from thinc.api import CosineDistance, L2Distance
from wasabi import msg
import srsly
-from thinc.layers import chain, list2array
-from thinc.loss import CosineDistance, L2Distance
-from spacy.gold import Example
+from ..gold import Example
from ..errors import Errors
from ..tokens import Doc
from ..attrs import ID, HEAD
@@ -85,7 +83,7 @@ def pretrain(
)
if not output_dir.exists():
output_dir.mkdir()
- msg.good("Created output directory: {}".format(output_dir))
+ msg.good(f"Created output directory: {output_dir}")
srsly.write_json(output_dir / "config.json", config)
msg.good("Saved settings to config.json")
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index adae91ff9..d8514095b 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -1,7 +1,7 @@
import os
import tqdm
from pathlib import Path
-from thinc.backends import use_ops
+from thinc.api import use_ops
from timeit import default_timer as timer
import shutil
import srsly
@@ -89,7 +89,7 @@ def train(
)
if not output_path.exists():
output_path.mkdir()
- msg.good("Created output directory: {}".format(output_path))
+ msg.good(f"Created output directory: {output_path}")
tag_map = {}
if tag_map_path is not None:
@@ -125,17 +125,17 @@ def train(
msg.text(f"Training pipeline: {pipeline}")
disabled_pipes = None
pipes_added = False
- msg.text("Training pipeline: {}".format(pipeline))
+ msg.text(f"Training pipeline: {pipeline}")
if use_gpu >= 0:
activated_gpu = None
try:
activated_gpu = set_gpu(use_gpu)
except Exception as e:
- msg.warn("Exception: {}".format(e))
+ msg.warn(f"Exception: {e}")
if activated_gpu is not None:
- msg.text("Using GPU: {}".format(use_gpu))
+ msg.text(f"Using GPU: {use_gpu}")
else:
- msg.warn("Unable to activate GPU: {}".format(use_gpu))
+ msg.warn(f"Unable to activate GPU: {use_gpu}")
msg.text("Using CPU only")
use_gpu = -1
if base_model:
@@ -158,11 +158,11 @@ def train(
"positive_label": textcat_positive_label,
}
if pipe not in nlp.pipe_names:
- msg.text("Adding component to base model '{}'".format(pipe))
+ msg.text(f"Adding component to base model '{pipe}'")
nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))
pipes_added = True
elif replace_components:
- msg.text("Replacing component from base model '{}'".format(pipe))
+ msg.text(f"Replacing component from base model '{pipe}'")
nlp.replace_pipe(pipe, nlp.create_pipe(pipe, config=pipe_cfg))
pipes_added = True
else:
@@ -180,7 +180,7 @@ def train(
f"Existing cfg: {base_cfg}, provided cfg: {pipe_cfg}",
exits=1,
)
- msg.text("Extending component from base model '{}'".format(pipe))
+ msg.text(f"Extending component from base model '{pipe}'")
disabled_pipes = nlp.disable_pipes(
[p for p in nlp.pipe_names if p not in pipeline]
)
@@ -377,7 +377,7 @@ def train(
msg.warn(
"Did you provide the same parameters during 'train' as during 'pretrain'?"
)
- msg.fail("Original error message: {}".format(e), exits=1)
+ msg.fail(f"Original error message: {e}", exits=1)
if raw_text:
# If raw text is available, perform 'rehearsal' updates,
# which use unlabelled data to reduce overfitting.
@@ -504,11 +504,7 @@ def train(
)
break
except Exception as e:
- msg.warn(
- "Aborting and saving the final best model. Encountered exception: {}".format(
- e
- )
- )
+ msg.warn(f"Aborting and saving final best model. Encountered exception: {e}")
finally:
best_pipes = nlp.pipe_names
if disabled_pipes:
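The GPU block in train.py above follows a try/activate/fall-back-to-CPU pattern. A compact sketch of that control flow, with a hypothetical activate callable standing in for set_gpu:

    def pick_device(activate, device_id):
        activated = None
        try:
            activated = activate(device_id)
        except Exception as err:
            print(f"Exception: {err}")
        if activated is not None:
            print(f"Using GPU: {device_id}")
            return device_id
        print(f"Unable to activate GPU: {device_id}")
        print("Using CPU only")
        return -1

    assert pick_device(lambda device_id: None, 0) == -1   # activation failed -> CPU fallback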
diff --git a/spacy/cli/train_from_config.py b/spacy/cli/train_from_config.py
index 0488dd04c..9150da356 100644
--- a/spacy/cli/train_from_config.py
+++ b/spacy/cli/train_from_config.py
@@ -1,19 +1,20 @@
+from typing import Optional, Dict, List, Union, Sequence
import plac
-from thinc.util import require_gpu
from wasabi import msg
from pathlib import Path
import thinc
import thinc.schedules
-from thinc.model import Model
-from spacy.gold import GoldCorpus
-import spacy
-from spacy.pipeline.tok2vec import Tok2VecListener
-from typing import Optional, Dict, List, Union, Sequence
+from thinc.api import Model
from pydantic import BaseModel, FilePath, StrictInt
import tqdm
-from ..ml import component_models
-from .. import util
+# TODO: relative imports?
+import spacy
+from spacy.gold import GoldCorpus
+from spacy.pipeline.tok2vec import Tok2VecListener
+from spacy.ml import component_models
+from spacy import util
+
registry = util.registry
@@ -153,10 +154,9 @@ def create_tb_parser_model(
hidden_width: StrictInt = 64,
maxout_pieces: StrictInt = 3,
):
- from thinc.layers import Linear, chain, list2array
+ from thinc.api import Linear, chain, list2array, use_ops, zero_init
from spacy.ml._layers import PrecomputableAffine
from spacy.syntax._parser_model import ParserModel
- from thinc.api import use_ops, zero_init
token_vector_width = tok2vec.get_dim("nO")
tok2vec = chain(tok2vec, list2array())
@@ -221,13 +221,9 @@ def train_from_config_cli(
def train_from_config(
- config_path,
- data_paths,
- raw_text=None,
- meta_path=None,
- output_path=None,
+ config_path, data_paths, raw_text=None, meta_path=None, output_path=None,
):
- msg.info("Loading config from: {}".format(config_path))
+ msg.info(f"Loading config from: {config_path}")
config = util.load_from_config(config_path, create_objects=True)
use_gpu = config["training"]["use_gpu"]
if use_gpu >= 0:
@@ -241,9 +237,7 @@ def train_from_config(
msg.info("Loading training corpus")
corpus = GoldCorpus(data_paths["train"], data_paths["dev"], limit=limit)
msg.info("Initializing the nlp pipeline")
- nlp.begin_training(
- lambda: corpus.train_examples, device=use_gpu
- )
+ nlp.begin_training(lambda: corpus.train_examples, device=use_gpu)
train_batches = create_train_batches(nlp, corpus, config["training"])
evaluate = create_evaluation_callback(nlp, optimizer, corpus, config["training"])
@@ -260,7 +254,7 @@ def train_from_config(
config["training"]["eval_frequency"],
)
- msg.info("Training. Initial learn rate: {}".format(optimizer.learn_rate))
+ msg.info(f"Training. Initial learn rate: {optimizer.learn_rate}")
print_row = setup_printer(config)
try:
@@ -414,7 +408,7 @@ def subdivide_batch(batch):
def setup_printer(config):
score_cols = config["training"]["scores"]
score_widths = [max(len(col), 6) for col in score_cols]
- loss_cols = ["Loss {}".format(pipe) for pipe in config["nlp"]["pipeline"]]
+ loss_cols = [f"Loss {pipe}" for pipe in config["nlp"]["pipeline"]]
loss_widths = [max(len(col), 8) for col in loss_cols]
table_header = ["#"] + loss_cols + score_cols + ["Score"]
table_header = [col.upper() for col in table_header]
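setup_printer above derives one loss column per pipeline component and pads every column to a minimum width before upper-casing the header. A worked example with assumed pipeline and score names:

    pipeline = ["tagger", "ner"]            # assumed component names
    score_cols = ["ents_f", "tags_acc"]     # assumed score names
    loss_cols = [f"Loss {pipe}" for pipe in pipeline]
    loss_widths = [max(len(col), 8) for col in loss_cols]
    score_widths = [max(len(col), 6) for col in score_cols]
    table_header = [col.upper() for col in ["#"] + loss_cols + score_cols + ["Score"]]
    print(table_header)                 # ['#', 'LOSS TAGGER', 'LOSS NER', 'ENTS_F', 'TAGS_ACC', 'SCORE']
    print(loss_widths, score_widths)    # [11, 8] [6, 8]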
diff --git a/spacy/compat.py b/spacy/compat.py
index 6fa49353e..8c5c2930b 100644
--- a/spacy/compat.py
+++ b/spacy/compat.py
@@ -30,7 +30,7 @@ try:
except ImportError:
cupy = None
-from thinc.optimizers import Optimizer # noqa: F401
+from thinc.api import Optimizer # noqa: F401
pickle = pickle
copy_reg = copy_reg
diff --git a/spacy/displacy/templates.py b/spacy/displacy/templates.py
index d6970aa2f..a721ce480 100644
--- a/spacy/displacy/templates.py
+++ b/spacy/displacy/templates.py
@@ -1,4 +1,3 @@
-
# Setting explicit height and max-width: none on the SVG is required for
# Jupyter to render it properly in a cell
diff --git a/spacy/glossary.py b/spacy/glossary.py
index 5e7e531a9..938a575cd 100644
--- a/spacy/glossary.py
+++ b/spacy/glossary.py
@@ -1,4 +1,3 @@
-
def explain(term):
"""Get a description for a given POS tag, dependency label or entity type.
diff --git a/spacy/gold.pxd b/spacy/gold.pxd
index 49dba16df..aea691130 100644
--- a/spacy/gold.pxd
+++ b/spacy/gold.pxd
@@ -1,6 +1,6 @@
from cymem.cymem cimport Pool
-from spacy.tokens import Doc
+from .tokens import Doc
from .typedefs cimport attr_t
from .syntax.transition_system cimport Transition
@@ -65,5 +65,3 @@ cdef class Example:
cdef public TokenAnnotation token_annotation
cdef public DocAnnotation doc_annotation
cdef public object goldparse
-
-
diff --git a/spacy/kb.pxd b/spacy/kb.pxd
index d5aa382b1..518ce0f4e 100644
--- a/spacy/kb.pxd
+++ b/spacy/kb.pxd
@@ -6,7 +6,7 @@ from libcpp.vector cimport vector
from libc.stdint cimport int32_t, int64_t
from libc.stdio cimport FILE
-from spacy.vocab cimport Vocab
+from .vocab cimport Vocab
from .typedefs cimport hash_t
from .structs cimport KBEntryC, AliasC
@@ -113,7 +113,7 @@ cdef class KnowledgeBase:
return new_index
cdef inline void _create_empty_vectors(self, hash_t dummy_hash) nogil:
- """
+ """
Initializing the vectors and making sure the first element of each vector is a dummy,
because the PreshMap maps pointing to indices in these vectors can not contain 0 as value
cf. https://github.com/explosion/preshed/issues/17
@@ -169,4 +169,3 @@ cdef class Reader:
cdef int read_alias(self, int64_t* entry_index, float* prob) except -1
cdef int _read(self, void* value, size_t size) except -1
-
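The kb.pxd docstring above explains why each vector gets a dummy first element: the PreshMap lookups return 0 for a missing key, so index 0 must never hold a real entry. A plain-Python sketch of that reserve-index-zero convention:

    vectors = [None]        # dummy entry so real data starts at index 1
    index_of = {}           # key -> index; .get(key, 0) == 0 means "absent"

    def add(key, value):
        vectors.append(value)
        index_of[key] = len(vectors) - 1
        return index_of[key]

    add("Q42", [0.1, 0.2])
    assert index_of.get("Q42", 0) == 1        # found at a non-zero index
    assert index_of.get("unknown", 0) == 0    # 0 safely doubles as "not found"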
diff --git a/spacy/lang/af/stop_words.py b/spacy/lang/af/stop_words.py
index dfd144de9..4b5a04a5e 100644
--- a/spacy/lang/af/stop_words.py
+++ b/spacy/lang/af/stop_words.py
@@ -1,4 +1,3 @@
-
# Source: https://github.com/stopwords-iso/stopwords-af
STOP_WORDS = set(
diff --git a/spacy/lang/bg/stop_words.py b/spacy/lang/bg/stop_words.py
index 45a252bc9..aae7692a2 100644
--- a/spacy/lang/bg/stop_words.py
+++ b/spacy/lang/bg/stop_words.py
@@ -1,4 +1,3 @@
-
# Source: https://github.com/Alir3z4/stop-words
STOP_WORDS = set(
diff --git a/spacy/lang/bn/examples.py b/spacy/lang/bn/examples.py
index 051e59d84..c3be4c556 100644
--- a/spacy/lang/bn/examples.py
+++ b/spacy/lang/bn/examples.py
@@ -1,4 +1,3 @@
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/bn/stop_words.py b/spacy/lang/bn/stop_words.py
index 6bcd06b37..bf38e3254 100644
--- a/spacy/lang/bn/stop_words.py
+++ b/spacy/lang/bn/stop_words.py
@@ -1,4 +1,3 @@
-
STOP_WORDS = set(
"""
অতএব অথচ অথবা অনুযায়ী অনেক অনেকে অনেকেই অন্তত অবধি অবশ্য অর্থাৎ অন্য অনুযায়ী অর্ধভাগে
diff --git a/spacy/lang/ca/examples.py b/spacy/lang/ca/examples.py
index 3fbf1fb0a..ae6aa3e24 100644
--- a/spacy/lang/ca/examples.py
+++ b/spacy/lang/ca/examples.py
@@ -1,4 +1,3 @@
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/cs/stop_words.py b/spacy/lang/cs/stop_words.py
index e8171a7e5..70aab030b 100644
--- a/spacy/lang/cs/stop_words.py
+++ b/spacy/lang/cs/stop_words.py
@@ -1,4 +1,3 @@
-
# Source: https://github.com/Alir3z4/stop-words
STOP_WORDS = set(
diff --git a/spacy/lang/da/examples.py b/spacy/lang/da/examples.py
index e5c6448f0..80b2b925b 100644
--- a/spacy/lang/da/examples.py
+++ b/spacy/lang/da/examples.py
@@ -1,4 +1,3 @@
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/de/examples.py b/spacy/lang/de/examples.py
index 530ece629..735d1c316 100644
--- a/spacy/lang/de/examples.py
+++ b/spacy/lang/de/examples.py
@@ -1,4 +1,3 @@
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/de/stop_words.py b/spacy/lang/de/stop_words.py
index cc5aa0f3c..f52687eb9 100644
--- a/spacy/lang/de/stop_words.py
+++ b/spacy/lang/de/stop_words.py
@@ -1,4 +1,3 @@
-
STOP_WORDS = set(
"""
á a ab aber ach acht achte achten achter achtes ag alle allein allem allen
@@ -44,7 +43,7 @@ kleines kommen kommt können könnt konnte könnte konnten kurz
lang lange leicht leider lieber los
machen macht machte mag magst man manche manchem manchen mancher manches mehr
-mein meine meinem meinen meiner meines mich mir mit mittel mochte möchte mochten
+mein meine meinem meinen meiner meines mich mir mit mittel mochte möchte mochten
mögen möglich mögt morgen muss muß müssen musst müsst musste mussten
na nach nachdem nahm natürlich neben nein neue neuen neun neunte neunten neunter
diff --git a/spacy/lang/el/get_pos_from_wiktionary.py b/spacy/lang/el/get_pos_from_wiktionary.py
index 01deb23a2..369973cc0 100644
--- a/spacy/lang/el/get_pos_from_wiktionary.py
+++ b/spacy/lang/el/get_pos_from_wiktionary.py
@@ -1,4 +1,3 @@
-
def get_pos_from_wiktionary():
import re
from gensim.corpora.wikicorpus import extract_pages
diff --git a/spacy/lang/el/norm_exceptions.py b/spacy/lang/el/norm_exceptions.py
index d540aae2c..aa774c19b 100644
--- a/spacy/lang/el/norm_exceptions.py
+++ b/spacy/lang/el/norm_exceptions.py
@@ -1,4 +1,3 @@
-
# These exceptions are used to add NORM values based on a token's ORTH value.
# Norms are only set if no alternative is provided in the tokenizer exceptions.
diff --git a/spacy/lang/el/stop_words.py b/spacy/lang/el/stop_words.py
index 8484826d1..7c436219f 100644
--- a/spacy/lang/el/stop_words.py
+++ b/spacy/lang/el/stop_words.py
@@ -1,4 +1,3 @@
-
# Stop words
# Link to greek stop words: https://www.translatum.gr/forum/index.php?topic=3550.0?topic=3550.0
STOP_WORDS = set(
diff --git a/spacy/lang/en/examples.py b/spacy/lang/en/examples.py
index 0363a45e7..2cca9e05f 100644
--- a/spacy/lang/en/examples.py
+++ b/spacy/lang/en/examples.py
@@ -1,4 +1,3 @@
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/en/norm_exceptions.py b/spacy/lang/en/norm_exceptions.py
index 431d9c049..4125cd37b 100644
--- a/spacy/lang/en/norm_exceptions.py
+++ b/spacy/lang/en/norm_exceptions.py
@@ -1,4 +1,3 @@
-
_exc = {
# Slang and abbreviations
"cos": "because",
diff --git a/spacy/lang/en/stop_words.py b/spacy/lang/en/stop_words.py
index 4573c9411..1ca5cbc16 100644
--- a/spacy/lang/en/stop_words.py
+++ b/spacy/lang/en/stop_words.py
@@ -1,4 +1,3 @@
-
# Stop words
STOP_WORDS = set(
"""
diff --git a/spacy/lang/es/examples.py b/spacy/lang/es/examples.py
index 1c1ad631b..a1db41a16 100644
--- a/spacy/lang/es/examples.py
+++ b/spacy/lang/es/examples.py
@@ -1,4 +1,3 @@
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/es/stop_words.py b/spacy/lang/es/stop_words.py
index 3d46a88cb..004df4fca 100644
--- a/spacy/lang/es/stop_words.py
+++ b/spacy/lang/es/stop_words.py
@@ -1,4 +1,3 @@
-
STOP_WORDS = set(
"""
actualmente acuerdo adelante ademas además adrede afirmó agregó ahi ahora ahí
diff --git a/spacy/lang/et/stop_words.py b/spacy/lang/et/stop_words.py
index 3b600a158..e1da1f14d 100644
--- a/spacy/lang/et/stop_words.py
+++ b/spacy/lang/et/stop_words.py
@@ -1,4 +1,3 @@
-
# Source: https://github.com/stopwords-iso/stopwords-et
STOP_WORDS = set(
diff --git a/spacy/lang/fa/examples.py b/spacy/lang/fa/examples.py
index d89feb6c8..9c6fb0345 100644
--- a/spacy/lang/fa/examples.py
+++ b/spacy/lang/fa/examples.py
@@ -1,4 +1,3 @@
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/fa/generate_verbs_exc.py b/spacy/lang/fa/generate_verbs_exc.py
index 61586dc3f..62094c6de 100644
--- a/spacy/lang/fa/generate_verbs_exc.py
+++ b/spacy/lang/fa/generate_verbs_exc.py
@@ -1,4 +1,3 @@
-
verb_roots = """
#هست
آخت#آهنج
diff --git a/spacy/lang/fa/stop_words.py b/spacy/lang/fa/stop_words.py
index 372422b67..f462f2e7a 100644
--- a/spacy/lang/fa/stop_words.py
+++ b/spacy/lang/fa/stop_words.py
@@ -1,4 +1,3 @@
-
# Stop words from HAZM package
STOP_WORDS = set(
"""
diff --git a/spacy/lang/fi/stop_words.py b/spacy/lang/fi/stop_words.py
index 642cfc369..8e8dcfa56 100644
--- a/spacy/lang/fi/stop_words.py
+++ b/spacy/lang/fi/stop_words.py
@@ -1,4 +1,3 @@
-
# Source https://github.com/stopwords-iso/stopwords-fi/blob/master/stopwords-fi.txt
# Reformatted with some minor corrections
STOP_WORDS = set(
diff --git a/spacy/lang/fr/examples.py b/spacy/lang/fr/examples.py
index 57d57f4a6..a74a62204 100644
--- a/spacy/lang/fr/examples.py
+++ b/spacy/lang/fr/examples.py
@@ -1,4 +1,3 @@
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/fr/stop_words.py b/spacy/lang/fr/stop_words.py
index 9c12e49a3..a331f3c0f 100644
--- a/spacy/lang/fr/stop_words.py
+++ b/spacy/lang/fr/stop_words.py
@@ -1,4 +1,3 @@
-
STOP_WORDS = set(
"""
a à â abord absolument afin ah ai aie ailleurs ainsi ait allaient allo allons
diff --git a/spacy/lang/ga/irish_morphology_helpers.py b/spacy/lang/ga/irish_morphology_helpers.py
index c8cd36835..d606da975 100644
--- a/spacy/lang/ga/irish_morphology_helpers.py
+++ b/spacy/lang/ga/irish_morphology_helpers.py
@@ -1,4 +1,3 @@
-
# fmt: off
consonants = ["b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "z"]
broad_vowels = ["a", "á", "o", "ó", "u", "ú"]
diff --git a/spacy/lang/he/examples.py b/spacy/lang/he/examples.py
index 29075c7d4..d54d2a145 100644
--- a/spacy/lang/he/examples.py
+++ b/spacy/lang/he/examples.py
@@ -1,4 +1,3 @@
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/hi/examples.py b/spacy/lang/hi/examples.py
index 7639ff940..ecb0b328c 100644
--- a/spacy/lang/hi/examples.py
+++ b/spacy/lang/hi/examples.py
@@ -1,4 +1,3 @@
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/hi/stop_words.py b/spacy/lang/hi/stop_words.py
index 142fc6f47..475b07da1 100644
--- a/spacy/lang/hi/stop_words.py
+++ b/spacy/lang/hi/stop_words.py
@@ -1,4 +1,3 @@
-
# Source: https://github.com/taranjeet/hindi-tokenizer/blob/master/stopwords.txt, https://data.mendeley.com/datasets/bsr3frvvjc/1#file-a21d5092-99d7-45d8-b044-3ae9edd391c6
STOP_WORDS = set(
diff --git a/spacy/lang/hu/examples.py b/spacy/lang/hu/examples.py
index b60f752ec..711a438bd 100644
--- a/spacy/lang/hu/examples.py
+++ b/spacy/lang/hu/examples.py
@@ -1,4 +1,3 @@
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/hu/stop_words.py b/spacy/lang/hu/stop_words.py
index 024af68f4..e39a26d35 100644
--- a/spacy/lang/hu/stop_words.py
+++ b/spacy/lang/hu/stop_words.py
@@ -1,4 +1,3 @@
-
STOP_WORDS = set(
"""
a abban ahhoz ahogy ahol aki akik akkor akár alatt amely amelyek amelyekben
diff --git a/spacy/lang/id/examples.py b/spacy/lang/id/examples.py
index 2ce46ce5a..1069232ff 100644
--- a/spacy/lang/id/examples.py
+++ b/spacy/lang/id/examples.py
@@ -1,4 +1,3 @@
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/is/stop_words.py b/spacy/lang/is/stop_words.py
index 5b3ff2f5a..917fb6df4 100644
--- a/spacy/lang/is/stop_words.py
+++ b/spacy/lang/is/stop_words.py
@@ -1,4 +1,3 @@
-
# Source: https://github.com/Xangis/extra-stopwords
STOP_WORDS = set(
diff --git a/spacy/lang/it/examples.py b/spacy/lang/it/examples.py
index 30327bd14..506721276 100644
--- a/spacy/lang/it/examples.py
+++ b/spacy/lang/it/examples.py
@@ -1,4 +1,3 @@
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/it/stop_words.py b/spacy/lang/it/stop_words.py
index 5cd1af137..e97613912 100644
--- a/spacy/lang/it/stop_words.py
+++ b/spacy/lang/it/stop_words.py
@@ -1,4 +1,3 @@
-
STOP_WORDS = set(
"""
a abbastanza abbia abbiamo abbiano abbiate accidenti ad adesso affinche agl
diff --git a/spacy/lang/ja/examples.py b/spacy/lang/ja/examples.py
index 1d532ad77..c3a011862 100644
--- a/spacy/lang/ja/examples.py
+++ b/spacy/lang/ja/examples.py
@@ -1,4 +1,3 @@
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/kn/stop_words.py b/spacy/lang/kn/stop_words.py
index cfeb0e69d..dba9740af 100644
--- a/spacy/lang/kn/stop_words.py
+++ b/spacy/lang/kn/stop_words.py
@@ -1,4 +1,3 @@
-
STOP_WORDS = set(
"""
ಹಲವು
diff --git a/spacy/lang/lt/examples.py b/spacy/lang/lt/examples.py
index b2889114c..eaf941f1a 100644
--- a/spacy/lang/lt/examples.py
+++ b/spacy/lang/lt/examples.py
@@ -1,4 +1,3 @@
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/lv/stop_words.py b/spacy/lang/lv/stop_words.py
index a9612f949..2685c2430 100644
--- a/spacy/lang/lv/stop_words.py
+++ b/spacy/lang/lv/stop_words.py
@@ -1,4 +1,3 @@
-
# Source: https://github.com/stopwords-iso/stopwords-lv
STOP_WORDS = set(
diff --git a/spacy/lang/mr/stop_words.py b/spacy/lang/mr/stop_words.py
index 0d7501461..9b0cee951 100644
--- a/spacy/lang/mr/stop_words.py
+++ b/spacy/lang/mr/stop_words.py
@@ -1,4 +1,3 @@
-
# Source: https://github.com/stopwords-iso/stopwords-mr/blob/master/stopwords-mr.txt, https://github.com/6/stopwords-json/edit/master/dist/mr.json
STOP_WORDS = set(
"""
diff --git a/spacy/lang/nb/examples.py b/spacy/lang/nb/examples.py
index 89e265951..b1a63ad74 100644
--- a/spacy/lang/nb/examples.py
+++ b/spacy/lang/nb/examples.py
@@ -1,4 +1,3 @@
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/nl/examples.py b/spacy/lang/nl/examples.py
index fcefa9d62..8c8c50c60 100644
--- a/spacy/lang/nl/examples.py
+++ b/spacy/lang/nl/examples.py
@@ -1,4 +1,3 @@
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/norm_exceptions.py b/spacy/lang/norm_exceptions.py
index c194f05c7..f35f613b1 100644
--- a/spacy/lang/norm_exceptions.py
+++ b/spacy/lang/norm_exceptions.py
@@ -1,4 +1,3 @@
-
# These exceptions are used to add NORM values based on a token's ORTH value.
# Individual languages can also add their own exceptions and overwrite them -
# for example, British vs. American spelling in English.
diff --git a/spacy/lang/pl/examples.py b/spacy/lang/pl/examples.py
index 6eabe1843..b1ea5880f 100644
--- a/spacy/lang/pl/examples.py
+++ b/spacy/lang/pl/examples.py
@@ -1,4 +1,3 @@
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/pt/examples.py b/spacy/lang/pt/examples.py
index 7427f8b25..13f3512cf 100644
--- a/spacy/lang/pt/examples.py
+++ b/spacy/lang/pt/examples.py
@@ -1,4 +1,3 @@
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/pt/stop_words.py b/spacy/lang/pt/stop_words.py
index 8065fcda7..ff45ad3a7 100644
--- a/spacy/lang/pt/stop_words.py
+++ b/spacy/lang/pt/stop_words.py
@@ -1,4 +1,3 @@
-
STOP_WORDS = set(
"""
à às área acerca ademais adeus agora ainda algo algumas alguns ali além ambas ambos antes
diff --git a/spacy/lang/ro/examples.py b/spacy/lang/ro/examples.py
index d472f0d6d..bfa258ffc 100644
--- a/spacy/lang/ro/examples.py
+++ b/spacy/lang/ro/examples.py
@@ -1,4 +1,3 @@
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/ru/examples.py b/spacy/lang/ru/examples.py
index 34cf5a1eb..adb007625 100644
--- a/spacy/lang/ru/examples.py
+++ b/spacy/lang/ru/examples.py
@@ -1,4 +1,3 @@
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/ru/norm_exceptions.py b/spacy/lang/ru/norm_exceptions.py
index c5d725031..0975bf5b8 100644
--- a/spacy/lang/ru/norm_exceptions.py
+++ b/spacy/lang/ru/norm_exceptions.py
@@ -1,4 +1,3 @@
-
_exc = {
# Slang
"прив": "привет",
diff --git a/spacy/lang/si/examples.py b/spacy/lang/si/examples.py
index 0ff00e76e..b34051d00 100644
--- a/spacy/lang/si/examples.py
+++ b/spacy/lang/si/examples.py
@@ -1,4 +1,3 @@
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/si/stop_words.py b/spacy/lang/si/stop_words.py
index 49723c860..bde662bf7 100644
--- a/spacy/lang/si/stop_words.py
+++ b/spacy/lang/si/stop_words.py
@@ -1,4 +1,3 @@
-
STOP_WORDS = set(
"""
අතර
diff --git a/spacy/lang/sk/stop_words.py b/spacy/lang/sk/stop_words.py
index 269bdc58b..017e7beef 100644
--- a/spacy/lang/sk/stop_words.py
+++ b/spacy/lang/sk/stop_words.py
@@ -1,4 +1,3 @@
-
# Source: https://github.com/Ardevop-sk/stopwords-sk
STOP_WORDS = set(
diff --git a/spacy/lang/sl/stop_words.py b/spacy/lang/sl/stop_words.py
index c8596ad0b..6fb01a183 100644
--- a/spacy/lang/sl/stop_words.py
+++ b/spacy/lang/sl/stop_words.py
@@ -1,4 +1,3 @@
-
# Source: https://github.com/stopwords-iso/stopwords-sl
# TODO: probably needs to be tidied up – the list seems to have month names in
# it, which shouldn't be considered stop words.
diff --git a/spacy/lang/sq/examples.py b/spacy/lang/sq/examples.py
index e1075f70a..06ed20fa1 100644
--- a/spacy/lang/sq/examples.py
+++ b/spacy/lang/sq/examples.py
@@ -1,4 +1,3 @@
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/sq/stop_words.py b/spacy/lang/sq/stop_words.py
index 58ee87d05..f2b1a4f4a 100644
--- a/spacy/lang/sq/stop_words.py
+++ b/spacy/lang/sq/stop_words.py
@@ -1,4 +1,3 @@
-
# Source: https://github.com/andrixh/index-albanian
STOP_WORDS = set(
diff --git a/spacy/lang/sr/examples.py b/spacy/lang/sr/examples.py
index 1ac867f4c..ec7f57ced 100644
--- a/spacy/lang/sr/examples.py
+++ b/spacy/lang/sr/examples.py
@@ -1,4 +1,3 @@
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/sr/norm_exceptions.py b/spacy/lang/sr/norm_exceptions.py
index add8350a0..723ab84c0 100644
--- a/spacy/lang/sr/norm_exceptions.py
+++ b/spacy/lang/sr/norm_exceptions.py
@@ -1,4 +1,3 @@
-
_exc = {
# Slang
"ћале": "отац",
diff --git a/spacy/lang/sr/stop_words.py b/spacy/lang/sr/stop_words.py
index 488c82a75..5df5509d2 100644
--- a/spacy/lang/sr/stop_words.py
+++ b/spacy/lang/sr/stop_words.py
@@ -1,4 +1,3 @@
-
STOP_WORDS = set(
"""
а
diff --git a/spacy/lang/sv/examples.py b/spacy/lang/sv/examples.py
index 98eee700b..bc6cd7a54 100644
--- a/spacy/lang/sv/examples.py
+++ b/spacy/lang/sv/examples.py
@@ -1,4 +1,3 @@
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/sv/stop_words.py b/spacy/lang/sv/stop_words.py
index 4d933a76d..2422b2a9e 100644
--- a/spacy/lang/sv/stop_words.py
+++ b/spacy/lang/sv/stop_words.py
@@ -1,4 +1,3 @@
-
STOP_WORDS = set(
"""
aderton adertonde adjö aldrig alla allas allt alltid alltså än andra andras
diff --git a/spacy/lang/ta/examples.py b/spacy/lang/ta/examples.py
index 2590163cb..a53227220 100644
--- a/spacy/lang/ta/examples.py
+++ b/spacy/lang/ta/examples.py
@@ -1,4 +1,3 @@
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/ta/stop_words.py b/spacy/lang/ta/stop_words.py
index 83410d65e..abbff949d 100644
--- a/spacy/lang/ta/stop_words.py
+++ b/spacy/lang/ta/stop_words.py
@@ -1,4 +1,3 @@
-
# Stop words
STOP_WORDS = set(
diff --git a/spacy/lang/te/examples.py b/spacy/lang/te/examples.py
index 6162b231e..cff7d3cb0 100644
--- a/spacy/lang/te/examples.py
+++ b/spacy/lang/te/examples.py
@@ -1,4 +1,3 @@
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/th/norm_exceptions.py b/spacy/lang/th/norm_exceptions.py
index 98b878308..b8ddbab16 100644
--- a/spacy/lang/th/norm_exceptions.py
+++ b/spacy/lang/th/norm_exceptions.py
@@ -1,4 +1,3 @@
-
_exc = {
# Conjugation and Diversion invalid to Tonal form (ผันอักษรและเสียงไม่ตรงกับรูปวรรณยุกต์)
"สนุ๊กเกอร์": "สนุกเกอร์",
diff --git a/spacy/lang/tokenizer_exceptions.py b/spacy/lang/tokenizer_exceptions.py
index fa4e347fd..ee58a7b09 100644
--- a/spacy/lang/tokenizer_exceptions.py
+++ b/spacy/lang/tokenizer_exceptions.py
@@ -34,7 +34,7 @@ URL_PATTERN = (
r"|"
# host & domain names
# mods: match is case-sensitive, so include [A-Z]
- "(?:"
+ "(?:" # noqa: E131
"(?:"
"[A-Za-z0-9\u00a1-\uffff]"
"[A-Za-z0-9\u00a1-\uffff_-]{0,62}"
diff --git a/spacy/lang/tr/examples.py b/spacy/lang/tr/examples.py
index a14d87a46..dfb324a4e 100644
--- a/spacy/lang/tr/examples.py
+++ b/spacy/lang/tr/examples.py
@@ -1,4 +1,3 @@
-
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.tr.examples import sentences
diff --git a/spacy/lang/uk/examples.py b/spacy/lang/uk/examples.py
index d17768ea6..f75d44488 100644
--- a/spacy/lang/uk/examples.py
+++ b/spacy/lang/uk/examples.py
@@ -1,4 +1,3 @@
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/ur/examples.py b/spacy/lang/ur/examples.py
index 7024483b5..e55b337be 100644
--- a/spacy/lang/ur/examples.py
+++ b/spacy/lang/ur/examples.py
@@ -1,4 +1,3 @@
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/xx/__init__.py b/spacy/lang/xx/__init__.py
index 2af650703..347c624fd 100644
--- a/spacy/lang/xx/__init__.py
+++ b/spacy/lang/xx/__init__.py
@@ -1,4 +1,3 @@
-
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
diff --git a/spacy/lang/xx/examples.py b/spacy/lang/xx/examples.py
index 15f5c4ff8..8d63c3c20 100644
--- a/spacy/lang/xx/examples.py
+++ b/spacy/lang/xx/examples.py
@@ -1,4 +1,3 @@
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/yo/examples.py b/spacy/lang/yo/examples.py
index 9b875d09e..0a610f125 100644
--- a/spacy/lang/yo/examples.py
+++ b/spacy/lang/yo/examples.py
@@ -1,4 +1,3 @@
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/zh/examples.py b/spacy/lang/zh/examples.py
index d0715eb0d..8be1336d2 100644
--- a/spacy/lang/zh/examples.py
+++ b/spacy/lang/zh/examples.py
@@ -1,4 +1,3 @@
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/language.py b/spacy/language.py
index 3aaf0b327..1c6014cec 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -4,7 +4,7 @@ import weakref
import functools
from contextlib import contextmanager
from copy import copy, deepcopy
-from thinc.backends import get_current_ops
+from thinc.api import get_current_ops
import srsly
import multiprocessing as mp
from itertools import chain, cycle
diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx
index 1292a46bd..5910ebfe1 100644
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@@ -6,7 +6,7 @@ cimport numpy as np
np.import_array()
import numpy
-from thinc.util import get_array_module
+from thinc.api import get_array_module
from .typedefs cimport attr_t, flags_t
from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
diff --git a/spacy/ml/_character_embed.py b/spacy/ml/_character_embed.py
index 2ff67746f..b366f67c6 100644
--- a/spacy/ml/_character_embed.py
+++ b/spacy/ml/_character_embed.py
@@ -3,18 +3,20 @@ from thinc.api import Model
def CharacterEmbed(nM, nC):
# nM: Number of dimensions per character. nC: Number of characters.
- nO = nM*nC if (nM is not None and nC is not None) else None
+ nO = nM * nC if (nM is not None and nC is not None) else None
return Model(
"charembed",
forward,
init=init,
dims={"nM": nM, "nC": nC, "nO": nO, "nV": 256},
- params={"E": None}
+ params={"E": None},
).initialize()
def init(model, X=None, Y=None):
- vectors_table = model.ops.alloc3f(model.get_dim("nC"), model.get_dim("nV"), model.get_dim("nM"))
+ vectors_table = model.ops.alloc3f(
+ model.get_dim("nC"), model.get_dim("nV"), model.get_dim("nM")
+ )
model.set_param("E", vectors_table)
diff --git a/spacy/ml/_layers.py b/spacy/ml/_layers.py
index e6aa798e7..7e9150d8b 100644
--- a/spacy/ml/_layers.py
+++ b/spacy/ml/_layers.py
@@ -1,5 +1,4 @@
-from thinc.model import Model
-from thinc.api import normal_init
+from thinc.api import Model, normal_init
def PrecomputableAffine(nO, nI, nF, nP):
@@ -20,9 +19,7 @@ def forward(model, X, is_train):
nP = model.get_dim("nP")
nI = model.get_dim("nI")
W = model.get_param("W")
- Yf = model.ops.gemm(
- X, W.reshape((nF * nO * nP, nI)), trans2=True
- )
+ Yf = model.ops.gemm(X, W.reshape((nF * nO * nP, nI)), trans2=True)
Yf = Yf.reshape((Yf.shape[0], nF, nO, nP))
Yf = model.ops.xp.vstack((model.get_param("pad"), Yf))
@@ -37,14 +34,14 @@ def forward(model, X, is_train):
# for b in range(nB):
# for f in range(nF):
# dYf[b, ids[b, f]] += dY[b]
- #
+ #
# However, we avoid building that array for efficiency -- and just pass
# in the indices.
dY, ids = dY_ids
assert dY.ndim == 3
assert dY.shape[1] == nO, dY.shape
assert dY.shape[2] == nP, dY.shape
- nB = dY.shape[0]
+ # nB = dY.shape[0]
model.inc_grad("pad", _backprop_precomputable_affine_padding(model, dY, ids))
Xf = X[ids]
Xf = Xf.reshape((Xf.shape[0], nF * nI))
@@ -83,12 +80,12 @@ def _backprop_precomputable_affine_padding(model, dY, ids):
# for f in range(nF):
# if ids[b, f] < 0:
# d_padding[0, f] += dY[b]
- #
+ #
# Which can be rewritten as:
#
# for b in range(nB):
# d_pad[0, ids[b] < 0] += dY[b]
- #
+ #
# I don't know how to avoid the loop without building a whole array :(.
# Cursed numpy.
d_pad = model.ops.alloc((1, nF, nO, nP))
@@ -118,7 +115,7 @@ def init(model, X=None, Y=None):
pad = model.ops.alloc4f(1, nF, nO, nP)
ops = model.ops
- W = normal_init(ops, W.shape, fan_in=nF*nI)
+ W = normal_init(ops, W.shape, fan_in=nF * nI)
model.set_param("W", W)
model.set_param("b", b)
model.set_param("pad", pad)
diff --git a/spacy/ml/component_models.py b/spacy/ml/component_models.py
index a24c2bfce..8c694f950 100644
--- a/spacy/ml/component_models.py
+++ b/spacy/ml/component_models.py
@@ -9,7 +9,7 @@ from thinc.api import Model, Maxout, Linear, residual, reduce_mean, list2ragged
from thinc.api import PyTorchLSTM, add, MultiSoftmax, HashEmbed, StaticVectors
from thinc.api import expand_window, FeatureExtractor, SparseLinear, chain
from thinc.api import clone, concatenate, with_array, Softmax, Logistic, uniqued
-from thinc.api import zero_init, glorot_uniform_init
+from thinc.api import zero_init
def build_text_classifier(arch, config):
@@ -33,10 +33,7 @@ def build_simple_cnn_text_classifier(tok2vec, nr_class, exclusive_classes, **cfg
output_layer = Softmax(nO=nr_class, nI=tok2vec.get_dim("nO"))
else:
# TODO: experiment with init_w=zero_init
- output_layer = (
- Linear(nO=nr_class, nI=tok2vec.get_dim("nO"))
- >> Logistic()
- )
+ output_layer = Linear(nO=nr_class, nI=tok2vec.get_dim("nO")) >> Logistic()
model = tok2vec >> list2ragged() >> reduce_mean() >> output_layer
model.set_ref("tok2vec", tok2vec)
model.set_dim("nO", nr_class)
@@ -149,13 +146,21 @@ def Tok2Vec(
with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
norm = HashEmbed(nO=width, nV=embed_size, column=cols.index(NORM), dropout=0.0)
if subword_features:
- prefix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=0.0)
- suffix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=0.0)
- shape = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=0.0)
+ prefix = HashEmbed(
+ nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=0.0
+ )
+ suffix = HashEmbed(
+ nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=0.0
+ )
+ shape = HashEmbed(
+ nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=0.0
+ )
else:
prefix, suffix, shape = (None, None, None)
if pretrained_vectors is not None:
- glove = StaticVectors(vectors=pretrained_vectors, nO=width, column=cols.index(ID), dropout=0.0)
+ glove = StaticVectors(
+ vectors=pretrained_vectors, nO=width, column=cols.index(ID), dropout=0.0
+ )
if subword_features:
embed = uniqued(
diff --git a/spacy/ml/extract_ngrams.py b/spacy/ml/extract_ngrams.py
index 1ec5b5fc1..d4195b9a4 100644
--- a/spacy/ml/extract_ngrams.py
+++ b/spacy/ml/extract_ngrams.py
@@ -1,5 +1,5 @@
import numpy
-from thinc.model import Model
+from thinc.api import Model
from ..attrs import LOWER
@@ -26,9 +26,7 @@ def forward(self, docs, is_train: bool):
# The dtype here matches what thinc is expecting -- which differs per
# platform (by int definition). This should be fixed once the problem
# is fixed on Thinc's side.
- lengths = self.ops.asarray(
- [arr.shape[0] for arr in batch_keys], dtype=numpy.int_
- )
+ lengths = self.ops.asarray([arr.shape[0] for arr in batch_keys], dtype=numpy.int_)
batch_keys = self.ops.xp.concatenate(batch_keys)
batch_vals = self.ops.asarray(self.ops.xp.concatenate(batch_vals), dtype="f")
@@ -36,4 +34,3 @@ def forward(self, docs, is_train: bool):
return dY
return (batch_keys, batch_vals, lengths), backprop
-
diff --git a/spacy/ml/tok2vec.py b/spacy/ml/tok2vec.py
index 102b88604..5e51bc47a 100644
--- a/spacy/ml/tok2vec.py
+++ b/spacy/ml/tok2vec.py
@@ -1,11 +1,8 @@
-from thinc.layers import chain, clone, concatenate, with_array, uniqued
-from thinc.model import Model
-from thinc.layers import noop, with_padded
-from thinc.layers import Maxout, expand_window
-from thinc.layers import HashEmbed, StaticVectors
-from thinc.layers import residual, LayerNorm, FeatureExtractor
+from thinc.api import Model, chain, clone, concatenate, with_array, uniqued, noop
+from thinc.api import with_padded, Maxout, expand_window, HashEmbed, StaticVectors
+from thinc.api import residual, LayerNorm, FeatureExtractor
-from spacy.ml import _character_embed
+from ..ml import _character_embed
from ..util import make_layer, registry
@@ -93,8 +90,10 @@ def MaxoutWindowEncoder(config):
nW = config["window_size"]
nP = config["pieces"]
depth = config["depth"]
-
- cnn = expand_window(window_size=nW), Maxout(nO=nO, nI=nO * ((nW * 2) + 1), nP=nP, dropout=0.0, normalize=True)
+ cnn = (
+ expand_window(window_size=nW),
+ Maxout(nO=nO, nI=nO * ((nW * 2) + 1), nP=nP, dropout=0.0, normalize=True),
+ )
model = clone(residual(cnn), depth)
model.set_dim("nO", nO)
model.attrs["receptive_field"] = nW * depth
@@ -103,13 +102,16 @@ def MaxoutWindowEncoder(config):
@registry.architectures.register("spacy.MishWindowEncoder.v1")
def MishWindowEncoder(config):
- from thinc.layers import Mish
+ from thinc.api import Mish
nO = config["width"]
nW = config["window_size"]
depth = config["depth"]
-
- cnn = chain(expand_window(window_size=nW), Mish(nO=nO, nI=nO * ((nW * 2) + 1)), LayerNorm(nO))
+ cnn = chain(
+ expand_window(window_size=nW),
+ Mish(nO=nO, nI=nO * ((nW * 2) + 1)),
+ LayerNorm(nO),
+ )
model = clone(residual(cnn), depth)
model.set_dim("nO", nO)
return model
@@ -118,14 +120,20 @@ def MishWindowEncoder(config):
@registry.architectures.register("spacy.PretrainedVectors.v1")
def PretrainedVectors(config):
# TODO: actual vectors instead of name
- return StaticVectors(vectors=config["vectors_name"], nO=config["width"], column=config["column"], dropout=0.0)
+ return StaticVectors(
+ vectors=config["vectors_name"],
+ nO=config["width"],
+ column=config["column"],
+ dropout=0.0,
+ )
@registry.architectures.register("spacy.TorchBiLSTMEncoder.v1")
def TorchBiLSTMEncoder(config):
import torch.nn
- # TODO FIX
- from thinc.layers import PyTorchRNNWrapper
+
+ # TODO: FIX
+ from thinc.api import PyTorchRNNWrapper
width = config["width"]
depth = config["depth"]
diff --git a/spacy/pipeline/hooks.py b/spacy/pipeline/hooks.py
index 00c328e81..d48b04bd1 100644
--- a/spacy/pipeline/hooks.py
+++ b/spacy/pipeline/hooks.py
@@ -1,4 +1,4 @@
-from thinc.layers import concatenate, reduce_max, reduce_mean, siamese, CauchySimilarity
+from thinc.api import concatenate, reduce_max, reduce_mean, siamese, CauchySimilarity
from .pipes import Pipe
from ..language import component
@@ -63,8 +63,7 @@ class SimilarityHook(Pipe):
@classmethod
def Model(cls, length):
return siamese(
- concatenate(reduce_max(), reduce_mean()),
- CauchySimilarity(length * 2)
+ concatenate(reduce_max(), reduce_mean()), CauchySimilarity(length * 2)
)
def __call__(self, doc):
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index 7b9e4b04e..999132b35 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -3,8 +3,8 @@ from collections import defaultdict
import numpy
cimport numpy as np
-from thinc.layers import chain, list2array
-from thinc.util import to_categorical, copy_array, get_array_module
+from thinc.api import chain, list2array, to_categorical, get_array_module
+from thinc.util import copy_array
from .. import util
from .pipes import Pipe
diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index c77281b2c..ad75d2e78 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -3,11 +3,9 @@
import numpy
import srsly
import random
-from thinc.layers import chain, Linear, Maxout, Softmax, LayerNorm, list2array
-from thinc.initializers import zero_init
-from thinc.loss import CosineDistance
-from thinc.util import to_categorical, get_array_module
-from thinc.model import set_dropout_rate
+from thinc.api import chain, Linear, Maxout, Softmax, LayerNorm, list2array
+from thinc.api import zero_init, CosineDistance, to_categorical, get_array_module
+from thinc.api import set_dropout_rate
from ..tokens.doc cimport Doc
from ..syntax.nn_parser cimport Parser
diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py
index 9857c87eb..8290468cf 100644
--- a/spacy/pipeline/tok2vec.py
+++ b/spacy/pipeline/tok2vec.py
@@ -1,3 +1,5 @@
+from thinc.api import Model, set_dropout_rate
+
from .pipes import Pipe
from ..gold import Example
from ..tokens import Doc
@@ -5,8 +7,6 @@ from ..vocab import Vocab
from ..language import component
from ..util import link_vectors_to_models, minibatch, registry, eg2doc
-from thinc.model import Model, set_dropout_rate
-
@component("tok2vec", assigns=["doc.tensor"])
class Tok2Vec(Pipe):
@@ -39,7 +39,9 @@ class Tok2Vec(Pipe):
self.listeners = []
def create_listener(self):
- listener = Tok2VecListener(upstream_name="tok2vec", width=self.model.get_dim("nO"))
+ listener = Tok2VecListener(
+ upstream_name="tok2vec", width=self.model.get_dim("nO")
+ )
self.listeners.append(listener)
def add_listener(self, listener):
@@ -112,10 +114,10 @@ class Tok2Vec(Pipe):
docs = [docs]
set_dropout_rate(self.model, drop)
tokvecs, bp_tokvecs = self.model.begin_update(docs)
-
+
def capture_losses(d_tokvecs):
"""Accumulate tok2vec loss before doing backprop."""
- l2_loss = sum((d_t2v**2).sum() for d_t2v in d_tokvecs)
+ l2_loss = sum((d_t2v ** 2).sum() for d_t2v in d_tokvecs)
if self.name in losses:
losses[self.name] += l2_loss / len(d_tokvecs)
else:
@@ -133,7 +135,9 @@ class Tok2Vec(Pipe):
def get_loss(self, docs, golds, scores):
pass
- def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs):
+ def begin_training(
+ self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs
+ ):
"""Allocate models and pre-process training data
get_examples (function): Function returning example training data.
@@ -151,6 +155,7 @@ class Tok2VecListener(Model):
"""A layer that gets fed its answers from an upstream connection,
for instance from a component earlier in the pipeline.
"""
+
name = "tok2vec-listener"
def __init__(self, upstream_name, width):
diff --git a/spacy/syntax/_parser_model.pyx b/spacy/syntax/_parser_model.pyx
index cb8e1d127..442233f19 100644
--- a/spacy/syntax/_parser_model.pyx
+++ b/spacy/syntax/_parser_model.pyx
@@ -11,9 +11,7 @@ from libc.string cimport memset, memcpy
from libc.stdlib cimport calloc, free, realloc
from cymem.cymem cimport Pool
from thinc.extra.search cimport Beam
-from thinc.layers import Linear
-from thinc.model import Model
-from thinc.backends import CupyOps, NumpyOps, use_ops
+from thinc.api import Linear, Model, CupyOps, NumpyOps, use_ops
from thinc.backends.linalg cimport Vec, VecVec
cimport blis.cy
diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index 8e55d3873..cf57e1cf6 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -1,11 +1,8 @@
# cython: infer_types=True
# cython: cdivision=True
# cython: boundscheck=False
-import numpy
cimport cython.parallel
-import numpy.random
cimport numpy as np
-from itertools import islice
from cpython.ref cimport PyObject, Py_XDECREF
from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
from libc.math cimport exp
@@ -14,15 +11,16 @@ from libc.string cimport memset, memcpy
from libc.stdlib cimport calloc, free
from cymem.cymem cimport Pool
from thinc.extra.search cimport Beam
-from thinc.layers import chain, clone, Linear, list2array
-from thinc.backends import NumpyOps, CupyOps, use_ops
-from thinc.util import get_array_module
from thinc.backends.linalg cimport Vec, VecVec
-from thinc.initializers import zero_init
-from thinc.model import set_dropout_rate
-import srsly
-from spacy.gold import Example
+from thinc.api import chain, clone, Linear, list2array, NumpyOps, CupyOps, use_ops
+from thinc.api import get_array_module, zero_init, set_dropout_rate
+from itertools import islice
+import srsly
+import numpy.random
+import numpy
+
+from ..gold import Example
from ..typedefs cimport weight_t, class_t, hash_t
from ._parser_model cimport alloc_activations, free_activations
from ._parser_model cimport predict_states, arg_max_if_valid
diff --git a/spacy/syntax/nonproj.pyx b/spacy/syntax/nonproj.pyx
index f024c1f05..27516ffd9 100644
--- a/spacy/syntax/nonproj.pyx
+++ b/spacy/syntax/nonproj.pyx
@@ -6,7 +6,7 @@ scheme.
"""
from copy import copy
-from spacy.gold import Example
+from ..gold import Example
from ..tokens.doc cimport Doc, set_children_from_heads
from ..errors import Errors
diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py
index b7627b175..4323bb736 100644
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@@ -1,4 +1,3 @@
-
import pytest
import numpy
from spacy.tokens import Doc, Span
@@ -274,7 +273,19 @@ def test_doc_is_nered(en_vocab):
def test_doc_from_array_sent_starts(en_vocab):
words = ["I", "live", "in", "New", "York", ".", "I", "like", "cats", "."]
heads = [0, 0, 0, 0, 0, 0, 6, 6, 6, 6]
- deps = ["ROOT", "dep", "dep", "dep", "dep", "dep", "ROOT", "dep", "dep", "dep", "dep"]
+ deps = [
+ "ROOT",
+ "dep",
+ "dep",
+ "dep",
+ "dep",
+ "dep",
+ "ROOT",
+ "dep",
+ "dep",
+ "dep",
+ "dep",
+ ]
doc = Doc(en_vocab, words=words)
for i, (dep, head) in enumerate(zip(deps, heads)):
doc[i].dep_ = dep
diff --git a/spacy/tests/doc/test_morphanalysis.py b/spacy/tests/doc/test_morphanalysis.py
index 82fb549ba..221b6f683 100644
--- a/spacy/tests/doc/test_morphanalysis.py
+++ b/spacy/tests/doc/test_morphanalysis.py
@@ -29,7 +29,9 @@ def test_morph_props(i_has):
def test_morph_iter(i_has):
assert set(i_has[0].morph) == set(["PronType=prs"])
- assert set(i_has[1].morph) == set(["Number=sing", "Person=three", "Tense=pres", "VerbForm=fin"])
+ assert set(i_has[1].morph) == set(
+ ["Number=sing", "Person=three", "Tense=pres", "VerbForm=fin"]
+ )
def test_morph_get(i_has):
diff --git a/spacy/tests/doc/test_retokenize_merge.py b/spacy/tests/doc/test_retokenize_merge.py
index 17bcd2c64..5e564d1f2 100644
--- a/spacy/tests/doc/test_retokenize_merge.py
+++ b/spacy/tests/doc/test_retokenize_merge.py
@@ -8,7 +8,12 @@ from ..util import get_doc
def test_doc_retokenize_merge(en_tokenizer):
text = "WKRO played songs by the beach boys all night"
- attrs = {"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE", "morph": "Number=Plur"}
+ attrs = {
+ "tag": "NAMED",
+ "lemma": "LEMMA",
+ "ent_type": "TYPE",
+ "morph": "Number=Plur",
+ }
doc = en_tokenizer(text)
assert len(doc) == 9
with doc.retokenize() as retokenizer:
diff --git a/spacy/tests/lang/ar/test_text.py b/spacy/tests/lang/ar/test_text.py
index f4a8cc1e3..c5ab376f1 100644
--- a/spacy/tests/lang/ar/test_text.py
+++ b/spacy/tests/lang/ar/test_text.py
@@ -1,4 +1,3 @@
-
def test_ar_tokenizer_handles_long_text(ar_tokenizer):
text = """نجيب محفوظ مؤلف و كاتب روائي عربي، يعد من أهم الأدباء العرب خلال القرن العشرين.
ولد نجيب محفوظ في مدينة القاهرة، حيث ترعرع و تلقى تعليمه الجامعي في جامعتها،
diff --git a/spacy/tests/lang/en/test_indices.py b/spacy/tests/lang/en/test_indices.py
index d50c75fc5..93daeec30 100644
--- a/spacy/tests/lang/en/test_indices.py
+++ b/spacy/tests/lang/en/test_indices.py
@@ -1,4 +1,3 @@
-
def test_en_simple_punct(en_tokenizer):
text = "to walk, do foo"
tokens = en_tokenizer(text)
diff --git a/spacy/tests/lang/fi/test_tokenizer.py b/spacy/tests/lang/fi/test_tokenizer.py
index 02aa63207..bcd62f239 100644
--- a/spacy/tests/lang/fi/test_tokenizer.py
+++ b/spacy/tests/lang/fi/test_tokenizer.py
@@ -19,16 +19,10 @@ HYPHENATED_TESTS = [
ABBREVIATION_INFLECTION_TESTS = [
(
"VTT:ssa ennen v:ta 2010 suoritetut mittaukset",
- ["VTT:ssa", "ennen", "v:ta", "2010", "suoritetut", "mittaukset"]
+ ["VTT:ssa", "ennen", "v:ta", "2010", "suoritetut", "mittaukset"],
),
- (
- "ALV:n osuus on 24 %.",
- ["ALV:n", "osuus", "on", "24", "%", "."]
- ),
- (
- "Hiihtäjä oli kilpailun 14:s.",
- ["Hiihtäjä", "oli", "kilpailun", "14:s", "."]
- )
+ ("ALV:n osuus on 24 %.", ["ALV:n", "osuus", "on", "24", "%", "."]),
+ ("Hiihtäjä oli kilpailun 14:s.", ["Hiihtäjä", "oli", "kilpailun", "14:s", "."]),
]
diff --git a/spacy/tests/lang/hu/test_tokenizer.py b/spacy/tests/lang/hu/test_tokenizer.py
index d0d8c2268..fd3acd0a0 100644
--- a/spacy/tests/lang/hu/test_tokenizer.py
+++ b/spacy/tests/lang/hu/test_tokenizer.py
@@ -294,12 +294,7 @@ WIKI_TESTS = [
]
EXTRA_TESTS = (
- DOT_TESTS
- + QUOTE_TESTS
- + NUMBER_TESTS
- + HYPHEN_TESTS
- + WIKI_TESTS
- + TYPO_TESTS
+ DOT_TESTS + QUOTE_TESTS + NUMBER_TESTS + HYPHEN_TESTS + WIKI_TESTS + TYPO_TESTS
)
# normal: default tests + 10% of extra tests
@@ -308,7 +303,14 @@ TESTS.extend([x for i, x in enumerate(EXTRA_TESTS) if i % 10 == 0])
# slow: remaining 90% of extra tests
SLOW_TESTS = [x for i, x in enumerate(EXTRA_TESTS) if i % 10 != 0]
-TESTS.extend([pytest.param(x[0], x[1], marks=pytest.mark.slow()) if not isinstance(x[0], tuple) else x for x in SLOW_TESTS])
+TESTS.extend(
+ [
+ pytest.param(x[0], x[1], marks=pytest.mark.slow())
+ if not isinstance(x[0], tuple)
+ else x
+ for x in SLOW_TESTS
+ ]
+)
@pytest.mark.parametrize("text,expected_tokens", TESTS)
diff --git a/spacy/tests/lang/sv/test_text.py b/spacy/tests/lang/sv/test_text.py
index dc4911ab6..1e26c45bc 100644
--- a/spacy/tests/lang/sv/test_text.py
+++ b/spacy/tests/lang/sv/test_text.py
@@ -1,4 +1,3 @@
-
def test_sv_tokenizer_handles_long_text(sv_tokenizer):
text = """Det var så härligt ute på landet. Det var sommar, majsen var gul, havren grön,
höet var uppställt i stackar nere vid den gröna ängen, och där gick storken på sina långa,
diff --git a/spacy/tests/lang/zh/test_text.py b/spacy/tests/lang/zh/test_text.py
index d48feaee5..d9a65732e 100644
--- a/spacy/tests/lang/zh/test_text.py
+++ b/spacy/tests/lang/zh/test_text.py
@@ -1,4 +1,3 @@
-
import pytest
diff --git a/spacy/tests/morphology/test_morph_converters.py b/spacy/tests/morphology/test_morph_converters.py
index 3bff4f924..9486cad45 100644
--- a/spacy/tests/morphology/test_morph_converters.py
+++ b/spacy/tests/morphology/test_morph_converters.py
@@ -1,4 +1,3 @@
-import pytest
from spacy.morphology import Morphology
diff --git a/spacy/tests/morphology/test_morph_features.py b/spacy/tests/morphology/test_morph_features.py
index 0d8d7dea9..f644a5867 100644
--- a/spacy/tests/morphology/test_morph_features.py
+++ b/spacy/tests/morphology/test_morph_features.py
@@ -24,13 +24,20 @@ def test_add_morphology_with_int_ids(morphology):
morphology.strings.add("gen")
morphology.strings.add("Number")
morphology.strings.add("sing")
- morphology.add({get_string_id("Case"): get_string_id("gen"), get_string_id("Number"): get_string_id("sing")})
+ morphology.add(
+ {
+ get_string_id("Case"): get_string_id("gen"),
+ get_string_id("Number"): get_string_id("sing"),
+ }
+ )
def test_add_morphology_with_mix_strings_and_ints(morphology):
morphology.strings.add("PunctSide")
morphology.strings.add("ini")
- morphology.add({get_string_id("PunctSide"): get_string_id("ini"), "VerbType": "aux"})
+ morphology.add(
+ {get_string_id("PunctSide"): get_string_id("ini"), "VerbType": "aux"}
+ )
def test_morphology_tags_hash_distinctly(morphology):
diff --git a/spacy/tests/parser/test_add_label.py b/spacy/tests/parser/test_add_label.py
index 25892ac71..fe847a6ae 100644
--- a/spacy/tests/parser/test_add_label.py
+++ b/spacy/tests/parser/test_add_label.py
@@ -1,6 +1,5 @@
import pytest
-from thinc.optimizers import Adam
-from thinc.backends import NumpyOps
+from thinc.api import Adam, NumpyOps
from spacy.attrs import NORM
from spacy.gold import GoldParse
from spacy.vocab import Vocab
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index 8dda1f406..9a4d21a8d 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -10,7 +10,7 @@ from spacy.tokens import Doc
TRAIN_DATA = [
("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
("I like London and Berlin.", {"entities": [(7, 13, "LOC"), (18, 24, "LOC")]}),
- ]
+]
@pytest.fixture
diff --git a/spacy/tests/parser/test_preset_sbd.py b/spacy/tests/parser/test_preset_sbd.py
index 5e56442b5..c6c1240a8 100644
--- a/spacy/tests/parser/test_preset_sbd.py
+++ b/spacy/tests/parser/test_preset_sbd.py
@@ -1,6 +1,5 @@
import pytest
-from thinc.optimizers import Adam
-from thinc.backends import NumpyOps
+from thinc.api import Adam
from spacy.attrs import NORM
from spacy.gold import GoldParse
from spacy.vocab import Vocab
diff --git a/spacy/tests/pipeline/test_entity_ruler.py b/spacy/tests/pipeline/test_entity_ruler.py
index 234603e94..b04569e22 100644
--- a/spacy/tests/pipeline/test_entity_ruler.py
+++ b/spacy/tests/pipeline/test_entity_ruler.py
@@ -149,10 +149,5 @@ def test_entity_ruler_validate(nlp):
def test_entity_ruler_properties(nlp, patterns):
ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
- assert sorted(ruler.labels) == sorted([
- "HELLO",
- "BYE",
- "COMPLEX",
- "TECH_ORG"
- ])
+ assert sorted(ruler.labels) == sorted(["HELLO", "BYE", "COMPLEX", "TECH_ORG"])
assert sorted(ruler.ent_ids) == ["a1", "a2"]
diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py
index 6a6ec8665..366cd4f1a 100644
--- a/spacy/tests/pipeline/test_tagger.py
+++ b/spacy/tests/pipeline/test_tagger.py
@@ -1,5 +1,4 @@
import pytest
-import srsly
from spacy.language import Language
diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py
index d9e1d663a..2bfdbd7c3 100644
--- a/spacy/tests/regression/test_issue1501-2000.py
+++ b/spacy/tests/regression/test_issue1501-2000.py
@@ -270,7 +270,9 @@ def test_issue1963(en_tokenizer):
def test_issue1967(label):
ner = EntityRecognizer(Vocab())
example = Example(doc=None)
- example.set_token_annotation(ids=[0], words=["word"], tags=["tag"], heads=[0], deps=["dep"], entities=[label])
+ example.set_token_annotation(
+ ids=[0], words=["word"], tags=["tag"], heads=[0], deps=["dep"], entities=[label]
+ )
ner.moves.get_actions(gold_parses=[example])
diff --git a/spacy/tests/regression/test_issue3611.py b/spacy/tests/regression/test_issue3611.py
index fca884356..120cea1d2 100644
--- a/spacy/tests/regression/test_issue3611.py
+++ b/spacy/tests/regression/test_issue3611.py
@@ -39,8 +39,5 @@ def test_issue3611():
for batch in batches:
nlp.update(
- examples=batch,
- sgd=optimizer,
- drop=0.1,
- losses=losses,
+ examples=batch, sgd=optimizer, drop=0.1, losses=losses,
)
diff --git a/spacy/tests/regression/test_issue4030.py b/spacy/tests/regression/test_issue4030.py
index 7153594db..7158d9b21 100644
--- a/spacy/tests/regression/test_issue4030.py
+++ b/spacy/tests/regression/test_issue4030.py
@@ -39,10 +39,7 @@ def test_issue4030():
for batch in batches:
nlp.update(
- examples=batch,
- sgd=optimizer,
- drop=0.1,
- losses=losses,
+ examples=batch, sgd=optimizer, drop=0.1, losses=losses,
)
# processing of an empty doc should result in 0.0 for all categories
diff --git a/spacy/tests/test_architectures.py b/spacy/tests/test_architectures.py
index 786e2cedf..31b2a2d2f 100644
--- a/spacy/tests/test_architectures.py
+++ b/spacy/tests/test_architectures.py
@@ -1,6 +1,6 @@
import pytest
from spacy import registry
-from thinc.layers import Linear
+from thinc.api import Linear
from catalogue import RegistryError
diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py
index 049858960..306adc881 100644
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@@ -65,8 +65,9 @@ def test_cli_converters_conllu2json_subtokens():
"5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=O",
]
input_data = "\n".join(lines)
- converted = conllu2json(input_data, n_sents=1, merge_subtokens=True,
- append_morphology=True)
+ converted = conllu2json(
+ input_data, n_sents=1, merge_subtokens=True, append_morphology=True
+ )
assert len(converted) == 1
assert converted[0]["id"] == 0
assert len(converted[0]["paragraphs"]) == 1
@@ -81,11 +82,16 @@ def test_cli_converters_conllu2json_subtokens():
"NOUN__Definite=Ind|Gender=Masc|Number=Sing",
"PROPN_X__Gender=Fem,Masc|Tense=past",
"VERB__Mood=Ind|Tense=Pres|VerbForm=Fin",
- "PUNCT"
+ "PUNCT",
]
- assert [t["pos"] for t in tokens] == ['NOUN', 'PROPN', 'VERB', 'PUNCT']
- assert [t["morph"] for t in tokens] == ['Definite=Ind|Gender=Masc|Number=Sing', 'Gender=Fem,Masc|Tense=past', 'Mood=Ind|Tense=Pres|VerbForm=Fin', '']
- assert [t["lemma"] for t in tokens] == ['dommer', 'Finn Eilertsen', 'avstå', '$.']
+ assert [t["pos"] for t in tokens] == ["NOUN", "PROPN", "VERB", "PUNCT"]
+ assert [t["morph"] for t in tokens] == [
+ "Definite=Ind|Gender=Masc|Number=Sing",
+ "Gender=Fem,Masc|Tense=past",
+ "Mood=Ind|Tense=Pres|VerbForm=Fin",
+ "",
+ ]
+ assert [t["lemma"] for t in tokens] == ["dommer", "Finn Eilertsen", "avstå", "$."]
assert [t["head"] for t in tokens] == [1, 1, 0, -1]
assert [t["dep"] for t in tokens] == ["appos", "nsubj", "ROOT", "punct"]
assert [t["ner"] for t in tokens] == ["O", "U-PER", "O", "O"]
diff --git a/spacy/tests/tokenizer/test_exceptions.py b/spacy/tests/tokenizer/test_exceptions.py
index 8276d7aea..9a98e049e 100644
--- a/spacy/tests/tokenizer/test_exceptions.py
+++ b/spacy/tests/tokenizer/test_exceptions.py
@@ -4,7 +4,9 @@ import pytest
def test_tokenizer_handles_emoticons(tokenizer):
# Tweebo challenge (CMU)
- text = """:o :/ :'( >:o (: :) >.< XD -__- o.O ;D :-) @_@ :P 8D :1 >:( :D =| :> ...."""
+ text = (
+ """:o :/ :'( >:o (: :) >.< XD -__- o.O ;D :-) @_@ :P 8D :1 >:( :D =| :> ...."""
+ )
tokens = tokenizer(text)
assert tokens[0].text == ":o"
assert tokens[1].text == ":/"
diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/tokenizer/test_tokenizer.py
index 3dce1ae31..c035559b4 100644
--- a/spacy/tests/tokenizer/test_tokenizer.py
+++ b/spacy/tests/tokenizer/test_tokenizer.py
@@ -130,7 +130,19 @@ def test_tokenizer_special_cases_with_affixes(tokenizer):
tokenizer.add_special_case("_SPECIAL_", [{"orth": "_SPECIAL_"}])
tokenizer.add_special_case("A/B", [{"orth": "A/B"}])
doc = tokenizer(text)
- assert [token.text for token in doc] == ["(", "(", "(", "_SPECIAL_", "A/B", ",", "A/B", "-", "A/B", '"', ")"]
+ assert [token.text for token in doc] == [
+ "(",
+ "(",
+ "(",
+ "_SPECIAL_",
+ "A/B",
+ ",",
+ "A/B",
+ "-",
+ "A/B",
+ '"',
+ ")",
+ ]
def test_tokenizer_special_cases_with_period(tokenizer):
diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx
index ec7e8a9e8..337c154a2 100644
--- a/spacy/tokens/_retokenize.pyx
+++ b/spacy/tokens/_retokenize.pyx
@@ -4,8 +4,8 @@
from libc.string cimport memcpy, memset
from libc.stdlib cimport malloc, free
from cymem.cymem cimport Pool
-from thinc.util import get_array_module
+from thinc.api import get_array_module
import numpy
from .doc cimport Doc, set_children_from_heads, token_by_start, token_by_end
diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py
index 4a18acd77..65b70d1b3 100644
--- a/spacy/tokens/_serialize.py
+++ b/spacy/tokens/_serialize.py
@@ -1,7 +1,7 @@
import numpy
import zlib
import srsly
-from thinc.backends import NumpyOps
+from thinc.api import NumpyOps
from ..compat import copy_reg
from ..tokens import Doc
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 16ef5f966..54d92f8b1 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -11,7 +11,8 @@ import numpy
import numpy.linalg
import struct
import srsly
-from thinc.util import get_array_module, copy_array
+from thinc.api import get_array_module
+from thinc.util import copy_array
from .span cimport Span
from .token cimport Token
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 796a5e674..d6b50b5f4 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -4,7 +4,7 @@ from libc.math cimport sqrt
import numpy
import numpy.linalg
-from thinc.util import get_array_module
+from thinc.api import get_array_module
from collections import defaultdict
from .doc cimport token_by_start, token_by_end, get_token_attr, _get_lca_matrix
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index c241cd5ad..379da6c77 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -7,7 +7,7 @@ cimport numpy as np
np.import_array()
import numpy
-from thinc.util import get_array_module
+from thinc.api import get_array_module
from ..typedefs cimport hash_t
from ..lexeme cimport Lexeme
diff --git a/spacy/util.py b/spacy/util.py
index 0cc11cef7..995ff722f 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -5,13 +5,9 @@ import re
from pathlib import Path
import random
from typing import List
-
import thinc
import thinc.config
-from thinc.backends import NumpyOps, get_current_ops
-from thinc.optimizers import Adam
-from thinc.util import require_gpu
-
+from thinc.api import NumpyOps, get_current_ops, Adam, require_gpu
import functools
import itertools
import numpy.random
@@ -797,5 +793,13 @@ def create_default_optimizer():
eps = env_opt("optimizer_eps", 1e-8)
L2 = env_opt("L2_penalty", 1e-6)
grad_clip = env_opt("grad_norm_clip", 1.0)
- optimizer = Adam(learn_rate, L2=L2, beta1=beta1, beta2=beta2, eps=eps, ops=ops, grad_clip=grad_clip)
+ optimizer = Adam(
+ learn_rate,
+ L2=L2,
+ beta1=beta1,
+ beta2=beta2,
+ eps=eps,
+ ops=ops,
+ grad_clip=grad_clip,
+ )
return optimizer
diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx
index f812acac4..0ade8b280 100644
--- a/spacy/vectors.pyx
+++ b/spacy/vectors.pyx
@@ -5,8 +5,7 @@ from libcpp.set cimport set as cppset
import functools
import numpy
import srsly
-from thinc.util import get_array_module
-from thinc.backends import get_current_ops
+from thinc.api import get_array_module, get_current_ops
from .strings cimport StringStore
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 3da9978c4..a1929559f 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -2,7 +2,7 @@
from libc.string cimport memcpy
import srsly
-from thinc.util import get_array_module
+from thinc.api import get_array_module
from .lexeme cimport EMPTY_LEXEME
from .lexeme cimport Lexeme
From 09cbeaef27c910a6f235c94641efee25c904b4e0 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Tue, 18 Feb 2020 17:20:17 +0100
Subject: [PATCH 068/496] Remove symlinks, data dir and related stuff
---
spacy/cli/__init__.py | 12 +++-
spacy/cli/download.py | 33 ++-------
spacy/cli/info.py | 27 +++----
spacy/cli/link.py | 73 -------------------
spacy/cli/validate.py | 147 ++++++++++++---------------------------
spacy/compat.py | 28 --------
spacy/data/__init__.py | 0
spacy/errors.py | 9 +--
spacy/tests/test_misc.py | 47 -------------
spacy/util.py | 53 ++++----------
10 files changed, 82 insertions(+), 347 deletions(-)
delete mode 100644 spacy/cli/link.py
delete mode 100644 spacy/data/__init__.py
diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py
index 0f7677fd2..5f83b26c1 100644
--- a/spacy/cli/__init__.py
+++ b/spacy/cli/__init__.py
@@ -1,13 +1,21 @@
+from wasabi import msg
+
from .download import download # noqa: F401
from .info import info # noqa: F401
-from .link import link # noqa: F401
from .package import package # noqa: F401
from .profile import profile # noqa: F401
from .train import train # noqa: F401
-from .train_from_config import train_from_config_cli # noqa: F401
+from .train_from_config import train_from_config_cli # noqa: F401
from .pretrain import pretrain # noqa: F401
from .debug_data import debug_data # noqa: F401
from .evaluate import evaluate # noqa: F401
from .convert import convert # noqa: F401
from .init_model import init_model # noqa: F401
from .validate import validate # noqa: F401
+
+
+def link(*args, **kwargs):
+ msg.warn(
+ "As of spaCy v3.0, model symlinks are deprecated. You can load models "
+ "using their full names or from a directory path."
+ )
diff --git a/spacy/cli/download.py b/spacy/cli/download.py
index 7388bf615..0230e272d 100644
--- a/spacy/cli/download.py
+++ b/spacy/cli/download.py
@@ -4,8 +4,6 @@ import subprocess
import sys
from wasabi import msg
-from .link import link
-from ..util import get_package_path
from .. import about
@@ -15,9 +13,9 @@ def download(
*pip_args: ("Additional arguments to be passed to `pip install` on model install"),
):
"""
- Download compatible model from default download path using pip. Model
- can be shortcut, model name or, if --direct flag is set, full model name
- with version. For direct downloads, the compatibility check will be skipped.
+ Download compatible model from default download path using pip. If --direct
+ flag is set, the command expects the full model name with version.
+ For direct downloads, the compatibility check will be skipped.
"""
if not require_package("spacy") and "--no-deps" not in pip_args:
msg.warn(
@@ -47,28 +45,6 @@ def download(
"Download and installation successful",
f"You can now load the model via spacy.load('{model_name}')",
)
- # Only create symlink if the model is installed via a shortcut like 'en'.
- # There's no real advantage over an additional symlink for en_core_web_sm
- # and if anything, it's more error prone and causes more confusion.
- if model in shortcuts:
- try:
- # Get package path here because link uses
- # pip.get_installed_distributions() to check if model is a
- # package, which fails if model was just installed via
- # subprocess
- package_path = get_package_path(model_name)
- link(model_name, model, force=True, model_path=package_path)
- except: # noqa: E722
- # Dirty, but since spacy.download and the auto-linking is
- # mostly a convenience wrapper, it's best to show a success
- # message and loading instructions, even if linking fails.
- msg.warn(
- "Download successful but linking failed",
- f"Creating a shortcut link for '{model}' didn't work (maybe you "
- f"don't have admin permissions?), but you can still load "
- f"the model via its full package name: "
- f"nlp = spacy.load('{model_name}')",
- )
# If a model is downloaded and then loaded within the same process, our
# is_package check currently fails, because pkg_resources.working_set
# is not refreshed automatically (see #3923). We're trying to work
@@ -114,8 +90,7 @@ def get_version(model, comp):
model = model.rsplit(".dev", 1)[0]
if model not in comp:
msg.fail(
- f"No compatible model found for '{model}' "
- f"(spaCy v{about.__version__}).",
+ f"No compatible model found for '{model}' (spaCy v{about.__version__})",
exits=1,
)
return comp[model][0]
diff --git a/spacy/cli/info.py b/spacy/cli/info.py
index fc8764ca8..23f766368 100644
--- a/spacy/cli/info.py
+++ b/spacy/cli/info.py
@@ -3,25 +3,26 @@ from pathlib import Path
from wasabi import msg
import srsly
+from .validate import get_model_pkgs
from .. import util
from .. import about
def info(
- model: ("Optional shortcut link of model", "positional", None, str) = None,
+ model: ("Optional model name", "positional", None, str) = None,
markdown: ("Generate Markdown for GitHub issues", "flag", "md", str) = False,
silent: ("Don't print anything (just return)", "flag", "s") = False,
):
"""
- Print info about spaCy installation. If a model shortcut link is
- specified as an argument, print model information. Flag --markdown
- prints details in Markdown for easy copy-pasting to GitHub issues.
+ Print info about spaCy installation. If a model is specified as an argument,
+ print model information. Flag --markdown prints details in Markdown for easy
+ copy-pasting to GitHub issues.
"""
if model:
if util.is_package(model):
model_path = util.get_package_path(model)
else:
- model_path = util.get_data_path() / model
+ model_path = model
meta_path = model_path / "meta.json"
if not meta_path.is_file():
msg.fail("Can't find model meta.json", meta_path, exits=1)
@@ -41,12 +42,13 @@ def info(
else:
msg.table(model_meta, title=title)
return meta
+ all_models, _ = get_model_pkgs()
data = {
"spaCy version": about.__version__,
"Location": str(Path(__file__).parent.parent),
"Platform": platform.platform(),
"Python version": platform.python_version(),
- "Models": list_models(),
+ "Models": ", ".join(model["name"] for model in all_models.values()),
}
if not silent:
title = "Info about spaCy"
@@ -57,19 +59,6 @@ def info(
return data
-def list_models():
- def exclude_dir(dir_name):
- # exclude common cache directories and hidden directories
- exclude = ("cache", "pycache", "__pycache__")
- return dir_name in exclude or dir_name.startswith(".")
-
- data_path = util.get_data_path()
- if data_path:
- models = [f.parts[-1] for f in data_path.iterdir() if f.is_dir()]
- return ", ".join([m for m in models if not exclude_dir(m)])
- return "-"
-
-
def print_markdown(data, title=None):
"""Print data in GitHub-flavoured Markdown format for issues etc.
diff --git a/spacy/cli/link.py b/spacy/cli/link.py
deleted file mode 100644
index d8af469dc..000000000
--- a/spacy/cli/link.py
+++ /dev/null
@@ -1,73 +0,0 @@
-from pathlib import Path
-from wasabi import msg
-
-from ..compat import symlink_to
-from .. import util
-
-
-def link(
- origin: ("package name or local path to model", "positional", None, str),
- link_name: ("name of shortcut link to create", "positional", None, str),
- force: ("force overwriting of existing link", "flag", "f", bool) = False,
- model_path=None,
-):
- """
- Create a symlink for models within the spacy/data directory. Accepts
- either the name of a pip package, or the local path to the model data
- directory. Linking models allows loading them via spacy.load(link_name).
- """
- if util.is_package(origin):
- model_path = util.get_package_path(origin)
- else:
- model_path = Path(origin) if model_path is None else Path(model_path)
- if not model_path.exists():
- msg.fail(
- "Can't locate model data",
- f"The data should be located in {model_path}",
- exits=1,
- )
- data_path = util.get_data_path()
- if not data_path or not data_path.exists():
- spacy_loc = Path(__file__).parent.parent
- msg.fail(
- f"Can't find the spaCy data path to create model symlink",
- f"Make sure a directory `/data` exists within your spaCy "
- f"installation and try again. The data directory should be located "
- f"here: {spacy_loc}",
- exits=1,
- )
- link_path = util.get_data_path() / link_name
- if link_path.is_symlink() and not force:
- msg.fail(
- f"Link '{link_name}' already exists",
- "To overwrite an existing link, use the --force flag",
- exits=1,
- )
- elif link_path.is_symlink(): # does a symlink exist?
- # NB: It's important to check for is_symlink here and not for exists,
- # because invalid/outdated symlinks would return False otherwise.
- link_path.unlink()
- elif link_path.exists(): # does it exist otherwise?
- # NB: Check this last because valid symlinks also "exist".
- msg.fail(
- f"Can't overwrite symlink '{link_name}'",
- "This can happen if your data directory contains a directory or "
- "file of the same name.",
- exits=1,
- )
- details = f"{model_path} --> {link_path}"
- try:
- symlink_to(link_path, model_path)
- except: # noqa: E722
- # This is quite dirty, but just making sure other errors are caught.
- msg.fail(
- f"Couldn't link model to '{link_name}'",
- "Creating a symlink in spacy/data failed. Make sure you have the "
- "required permissions and try re-running the command as admin, or "
- "use a virtualenv. You can still import the model as a module and "
- "call its load() method, or create the symlink manually.",
- )
- msg.text(details)
- raise
- msg.good("Linking successful", details)
- msg.text(f"You can now load the model via spacy.load('{link_name}')")
diff --git a/spacy/cli/validate.py b/spacy/cli/validate.py
index b4d217f2f..a23ce3453 100644
--- a/spacy/cli/validate.py
+++ b/spacy/cli/validate.py
@@ -1,10 +1,8 @@
from pathlib import Path
import sys
import requests
-import srsly
from wasabi import msg
-from ..util import get_data_path
from .. import about
@@ -13,6 +11,50 @@ def validate():
Validate that the currently installed version of spaCy is compatible
with the installed models. Should be run after `pip install -U spacy`.
"""
+ model_pkgs, compat = get_model_pkgs()
+ spacy_version = about.__version__.rsplit(".dev", 1)[0]
+ current_compat = compat.get(spacy_version, {})
+ if not current_compat:
+ msg.warn(f"No compatible models found for v{spacy_version} of spaCy")
+ incompat_models = {d["name"] for _, d in model_pkgs.items() if not d["compat"]}
+ na_models = [m for m in incompat_models if m not in current_compat]
+ update_models = [m for m in incompat_models if m in current_compat]
+ spacy_dir = Path(__file__).parent.parent
+
+ msg.divider(f"Installed models (spaCy v{about.__version__})")
+ msg.info(f"spaCy installation: {spacy_dir}")
+
+ if model_pkgs:
+ header = ("NAME", "VERSION", "")
+ rows = []
+ for name, data in model_pkgs.items():
+ if data["compat"]:
+ comp = msg.text("", color="green", icon="good", no_print=True)
+ version = msg.text(data["version"], color="green", no_print=True)
+ else:
+ version = msg.text(data["version"], color="red", no_print=True)
+ comp = f"--> {compat.get(data['name'], ['n/a'])[0]}"
+ rows.append((data["name"], version, comp))
+ msg.table(rows, header=header)
+ else:
+ msg.text("No models found in your current environment.", exits=0)
+ if update_models:
+ msg.divider("Install updates")
+ msg.text("Use the following commands to update the model packages:")
+ cmd = "python -m spacy download {}"
+ print("\n".join([cmd.format(pkg) for pkg in update_models]) + "\n")
+ if na_models:
+ msg.warn(
+ f"The following models are not available for spaCy v{about.__version__}:",
+ ", ".join(na_models),
+ )
+ if incompat_models:
+ sys.exit(1)
+
+
+def get_model_pkgs():
+ import pkg_resources
+
with msg.loading("Loading compatibility table..."):
r = requests.get(about.__compatibility__)
if r.status_code != 200:
@@ -23,88 +65,11 @@ def validate():
)
msg.good("Loaded compatibility table")
compat = r.json()["spacy"]
- version = about.__version__
- version = version.rsplit(".dev", 1)[0]
- current_compat = compat.get(version)
- if not current_compat:
- msg.fail(
- f"Can't find spaCy v{version} in compatibility table",
- about.__compatibility__,
- exits=1,
- )
all_models = set()
for spacy_v, models in dict(compat).items():
all_models.update(models.keys())
for model, model_vs in models.items():
compat[spacy_v][model] = [reformat_version(v) for v in model_vs]
- model_links = get_model_links(current_compat)
- model_pkgs = get_model_pkgs(current_compat, all_models)
- incompat_links = {l for l, d in model_links.items() if not d["compat"]}
- incompat_models = {d["name"] for _, d in model_pkgs.items() if not d["compat"]}
- incompat_models.update(
- [d["name"] for _, d in model_links.items() if not d["compat"]]
- )
- na_models = [m for m in incompat_models if m not in current_compat]
- update_models = [m for m in incompat_models if m in current_compat]
- spacy_dir = Path(__file__).parent.parent
-
- msg.divider(f"Installed models (spaCy v{about.__version__})")
- msg.info(f"spaCy installation: {spacy_dir}")
-
- if model_links or model_pkgs:
- header = ("TYPE", "NAME", "MODEL", "VERSION", "")
- rows = []
- for name, data in model_pkgs.items():
- rows.append(get_model_row(current_compat, name, data, msg))
- for name, data in model_links.items():
- rows.append(get_model_row(current_compat, name, data, msg, "link"))
- msg.table(rows, header=header)
- else:
- msg.text("No models found in your current environment.", exits=0)
- if update_models:
- msg.divider("Install updates")
- msg.text("Use the following commands to update the model packages:")
- cmd = "python -m spacy download {}"
- print("\n".join([cmd.format(pkg) for pkg in update_models]) + "\n")
- if na_models:
- msg.text(
- f"The following models are not available for spaCy "
- f"v{about.__version__}: {', '.join(na_models)}"
- )
- if incompat_links:
- msg.text(
- f"You may also want to overwrite the incompatible links using the "
- f"`python -m spacy link` command with `--force`, or remove them "
- f"from the data directory. "
- f"Data path: {get_data_path()}"
- )
- if incompat_models or incompat_links:
- sys.exit(1)
-
-
-def get_model_links(compat):
- links = {}
- data_path = get_data_path()
- if data_path:
- models = [p for p in data_path.iterdir() if is_model_path(p)]
- for model in models:
- meta_path = Path(model) / "meta.json"
- if not meta_path.exists():
- continue
- meta = srsly.read_json(meta_path)
- link = model.parts[-1]
- name = meta["lang"] + "_" + meta["name"]
- links[link] = {
- "name": name,
- "version": meta["version"],
- "compat": is_compat(compat, name, meta["version"]),
- }
- return links
-
-
-def get_model_pkgs(compat, all_models):
- import pkg_resources
-
pkgs = {}
for pkg_name, pkg_data in pkg_resources.working_set.by_key.items():
package = pkg_name.replace("-", "_")
@@ -113,29 +78,9 @@ def get_model_pkgs(compat, all_models):
pkgs[pkg_name] = {
"name": package,
"version": version,
- "compat": is_compat(compat, package, version),
+ "compat": package in compat and version in compat[package],
}
- return pkgs
-
-
-def get_model_row(compat, name, data, msg, model_type="package"):
- if data["compat"]:
- comp = msg.text("", color="green", icon="good", no_print=True)
- version = msg.text(data["version"], color="green", no_print=True)
- else:
- version = msg.text(data["version"], color="red", no_print=True)
- comp = f"--> {compat.get(data['name'], ['n/a'])[0]}"
- return (model_type, name, data["name"], version, comp)
-
-
-def is_model_path(model_path):
- exclude = ["cache", "pycache", "__pycache__"]
- name = model_path.parts[-1]
- return model_path.is_dir() and name not in exclude and not name.startswith(".")
-
-
-def is_compat(compat, name, version):
- return name in compat and version in compat[name]
+ return pkgs, compat
def reformat_version(version):
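For illustration, the inlined compatibility check in `get_model_pkgs()` above, pulled out into a runnable sketch with toy data:

    # Toy data mimicking compat: package name -> list of compatible versions
    compat = {"en_core_web_sm": ["2.2.5", "2.2.0"]}

    def is_compatible(package, version, compat):
        # Same expression as the "compat" field set in get_model_pkgs()
        return package in compat and version in compat[package]

    print(is_compatible("en_core_web_sm", "2.2.5", compat))  # True
    print(is_compatible("en_core_web_sm", "2.1.0", compat))  # False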
diff --git a/spacy/compat.py b/spacy/compat.py
index 6fa49353e..be6cdb8a1 100644
--- a/spacy/compat.py
+++ b/spacy/compat.py
@@ -5,7 +5,6 @@ e.g. `unicode_`.
DOCS: https://spacy.io/api/top-level#compat
"""
-import os
import sys
from thinc.util import copy_array
@@ -43,33 +42,6 @@ is_linux = sys.platform.startswith("linux")
is_osx = sys.platform == "darwin"
-def symlink_to(orig, dest):
- """Create a symlink. Used for model shortcut links.
-
- orig (unicode / Path): The origin path.
- dest (unicode / Path): The destination path of the symlink.
- """
- if is_windows:
- import subprocess
-
- subprocess.check_call(["mklink", "/d", str(orig), str(dest)], shell=True)
- else:
- orig.symlink_to(dest)
-
-
-def symlink_remove(link):
- """Remove a symlink. Used for model shortcut links.
-
- link (unicode / Path): The path to the symlink.
- """
- # https://stackoverflow.com/q/26554135/6400719
- if os.path.isdir(str(link)) and is_windows:
- # this should only be on Py2.7 and windows
- os.rmdir(str(link))
- else:
- os.unlink(str(link))
-
-
def is_config(windows=None, linux=None, osx=None, **kwargs):
"""Check if a specific configuration of Python version and operating system
matches the user's setup. Mostly used to display targeted error messages.
diff --git a/spacy/data/__init__.py b/spacy/data/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/spacy/errors.py b/spacy/errors.py
index e00df2c51..6947dbbd5 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -224,13 +224,8 @@ class Errors(object):
E047 = ("Can't assign a value to unregistered extension attribute "
"'{name}'. Did you forget to call the `set_extension` method?")
E048 = ("Can't import language {lang} from spacy.lang: {err}")
- E049 = ("Can't find spaCy data directory: '{path}'. Check your "
- "installation and permissions, or use spacy.util.set_data_path "
- "to customise the location if necessary.")
- E050 = ("Can't find model '{name}'. It doesn't seem to be a shortcut "
- "link, a Python package or a valid path to a data directory.")
- E051 = ("Can't load '{name}'. If you're using a shortcut link, make sure "
- "it points to a valid package (not just a data directory).")
+ E050 = ("Can't find model '{name}'. It doesn't seem to be a Python "
+ "package or a valid path to a data directory.")
E052 = ("Can't find model directory: {path}")
E053 = ("Could not read meta.json from {path}")
E054 = ("No valid '{setting}' setting found in model meta.json.")
diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py
index a6bcdb50c..6d4e75a31 100644
--- a/spacy/tests/test_misc.py
+++ b/spacy/tests/test_misc.py
@@ -4,36 +4,8 @@ import ctypes
from pathlib import Path
from spacy import util
from spacy import prefer_gpu, require_gpu
-from spacy.compat import symlink_to, symlink_remove, is_windows
from spacy.ml._layers import PrecomputableAffine
from spacy.ml._layers import _backprop_precomputable_affine_padding
-from subprocess import CalledProcessError
-
-
-@pytest.fixture
-def symlink_target():
- return Path("./foo-target")
-
-
-@pytest.fixture
-def symlink():
- return Path("./foo-symlink")
-
-
-@pytest.fixture(scope="function")
-def symlink_setup_target(request, symlink_target, symlink):
- if not symlink_target.exists():
- os.mkdir(str(symlink_target))
- # yield -- need to cleanup even if assertion fails
- # https://github.com/pytest-dev/pytest/issues/2508#issuecomment-309934240
-
- def cleanup():
- # Remove symlink only if it was created
- if symlink.exists():
- symlink_remove(symlink)
- os.rmdir(str(symlink_target))
-
- request.addfinalizer(cleanup)
@pytest.fixture
@@ -109,25 +81,6 @@ def test_require_gpu():
require_gpu()
-def test_create_symlink_windows(
- symlink_setup_target, symlink_target, symlink, is_admin
-):
- """Test the creation of symlinks on windows. If run as admin or not on windows it should succeed, otherwise a CalledProcessError should be raised."""
- assert symlink_target.exists()
-
- if is_admin or not is_windows:
- try:
- symlink_to(symlink, symlink_target)
- assert symlink.exists()
- except CalledProcessError as e:
- pytest.fail(e)
- else:
- with pytest.raises(CalledProcessError):
- symlink_to(symlink, symlink_target)
-
- assert not symlink.exists()
-
-
def test_ascii_filenames():
"""Test that all filenames in the project are ASCII.
See: https://twitter.com/_inesmontani/status/1177941471632211968
diff --git a/spacy/util.py b/spacy/util.py
index 0cc11cef7..6067333f7 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -29,7 +29,6 @@ from .symbols import ORTH
from .compat import cupy, CudaStream
from .errors import Errors, Warnings, deprecation_warning, user_warning
-_data_path = Path(__file__).parent / "data"
_PRINT_ENV = False
@@ -84,27 +83,6 @@ def set_lang_class(name, cls):
registry.languages.register(name, func=cls)
-def get_data_path(require_exists=True):
- """Get path to spaCy data directory.
-
- require_exists (bool): Only return path if it exists, otherwise None.
- RETURNS (Path or None): Data path or None.
- """
- if not require_exists:
- return _data_path
- else:
- return _data_path if _data_path.exists() else None
-
-
-def set_data_path(path):
- """Set path to spaCy data directory.
-
- path (unicode or Path): Path to new data directory.
- """
- global _data_path
- _data_path = ensure_path(path)
-
-
def make_layer(arch_config):
arch_func = registry.architectures.get(arch_config["arch"])
return arch_func(arch_config["config"])
@@ -145,18 +123,13 @@ def get_module_path(module):
def load_model(name, **overrides):
- """Load a model from a shortcut link, package or data path.
+ """Load a model from a package or data path.
- name (unicode): Package name, shortcut link or model path.
+ name (unicode): Package name or model path.
**overrides: Specific overrides, like pipeline components to disable.
RETURNS (Language): `Language` class with the loaded model.
"""
- data_path = get_data_path()
- if not data_path or not data_path.exists():
- raise IOError(Errors.E049.format(path=data_path))
- if isinstance(name, str): # in data dir / shortcut
- if name in set([d.name for d in data_path.iterdir()]):
- return load_model_from_link(name, **overrides)
+ if isinstance(name, str): # name or string path
if is_package(name): # installed as package
return load_model_from_package(name, **overrides)
if Path(name).exists(): # path to model data directory
@@ -166,16 +139,6 @@ def load_model(name, **overrides):
raise IOError(Errors.E050.format(name=name))
-def load_model_from_link(name, **overrides):
- """Load a model from a shortcut link, or directory in spaCy data path."""
- path = get_data_path() / name / "__init__.py"
- try:
- cls = import_file(name, path)
- except AttributeError:
- raise IOError(Errors.E051.format(name=name))
- return cls.load(**overrides)
-
-
def load_model_from_package(name, **overrides):
"""Load a model from an installed package."""
cls = importlib.import_module(name)
@@ -797,5 +760,13 @@ def create_default_optimizer():
eps = env_opt("optimizer_eps", 1e-8)
L2 = env_opt("L2_penalty", 1e-6)
grad_clip = env_opt("grad_norm_clip", 1.0)
- optimizer = Adam(learn_rate, L2=L2, beta1=beta1, beta2=beta2, eps=eps, ops=ops, grad_clip=grad_clip)
+ optimizer = Adam(
+ learn_rate,
+ L2=L2,
+ beta1=beta1,
+ beta2=beta2,
+ eps=eps,
+ ops=ops,
+ grad_clip=grad_clip,
+ )
return optimizer
From b20351792acba1bcd28998bed80171f5b6caa59f Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Wed, 19 Feb 2020 15:51:53 +0200
Subject: [PATCH 069/496] assert prints for more clarity
---
spacy/tests/test_requirements.py | 8 +++++---
1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/spacy/tests/test_requirements.py b/spacy/tests/test_requirements.py
index 320fc5763..aaa562722 100644
--- a/spacy/tests/test_requirements.py
+++ b/spacy/tests/test_requirements.py
@@ -32,8 +32,9 @@ def test_build_dependencies(en_vocab):
lib, v = _parse_req(line)
if lib and not lib.startswith("cupy") and lib not in libs_ignore_setup:
req_v = req_dict.get(lib, None)
- assert req_v is not None # if fail: setup.cfg contains a lib not in requirements.txt
- assert (lib+v) == (lib+req_v) # if fail: setup.cfg & requirements.txt have conflicting versions
+ assert req_v is not None, "{} in setup.cfg but not in requirements.txt".format(lib)
+ assert (lib+v) == (lib+req_v), "{} has different version in setup.cfg and in requirements.txt: " \
+ "{} and {} respectively".format(lib, v, req_v)
setup_keys.add(lib)
assert sorted(setup_keys) == sorted(req_dict.keys()) # if fail: requirements.txt contains a lib not in setup.cfg
@@ -48,7 +49,8 @@ def test_build_dependencies(en_vocab):
lib, v = _parse_req(line)
if lib:
req_v = req_dict.get(lib, None)
- assert (lib+v) == (lib+req_v) # if fail: pyproject.toml & requirements.txt have conflicting versions
+ assert (lib+v) == (lib+req_v), "{} has different version in pyproject.toml and in requirements.txt: " \
+ "{} and {} respectively".format(lib, v, req_v)
def _parse_req(line):
From 303c4bcd4ca50569f7987c980ff2e4eb7e9c8a63 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Wed, 19 Feb 2020 15:52:55 +0200
Subject: [PATCH 070/496] include requirements in manifest
---
MANIFEST.in | 1 +
1 file changed, 1 insertion(+)
diff --git a/MANIFEST.in b/MANIFEST.in
index 1947b9140..64886cd19 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -4,5 +4,6 @@ include LICENSE
include README.md
include bin/spacy
include pyproject.toml
+include requirements.txt
recursive-exclude spacy/lang *.json
recursive-include spacy/lang *.json.gz
From 5c2f6454706b4522cc58efc8ffca132caeba27f9 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Wed, 19 Feb 2020 16:15:56 +0200
Subject: [PATCH 071/496] root dir one level up
---
spacy/tests/test_requirements.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/spacy/tests/test_requirements.py b/spacy/tests/test_requirements.py
index aaa562722..5bbccf362 100644
--- a/spacy/tests/test_requirements.py
+++ b/spacy/tests/test_requirements.py
@@ -8,7 +8,7 @@ def test_build_dependencies(en_vocab):
libs_ignore_setup = ["fugashi", "natto-py", "pythainlp"]
# check requirements.txt
- root_dir = Path(__file__).parent.parent.parent
+ root_dir = Path(__file__).parent.parent
req_file = root_dir / "requirements.txt"
req_dict = {}
with req_file.open() as f:
From 9834527f2c373708252c37998b7573291fc9da63 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Wed, 19 Feb 2020 16:22:48 +0200
Subject: [PATCH 072/496] hack to switch between CLI folder setup and local
setup
---
spacy/tests/test_requirements.py | 20 +++++++++++++++-----
1 file changed, 15 insertions(+), 5 deletions(-)
diff --git a/spacy/tests/test_requirements.py b/spacy/tests/test_requirements.py
index 5bbccf362..a6fa20d6b 100644
--- a/spacy/tests/test_requirements.py
+++ b/spacy/tests/test_requirements.py
@@ -8,11 +8,21 @@ def test_build_dependencies(en_vocab):
libs_ignore_setup = ["fugashi", "natto-py", "pythainlp"]
# check requirements.txt
- root_dir = Path(__file__).parent.parent
- req_file = root_dir / "requirements.txt"
- req_dict = {}
- with req_file.open() as f:
- lines = f.readlines()
+ try:
+ # for CLI usage
+ root_dir = Path(__file__).parent.parent
+ req_file = root_dir / "requirements.txt"
+ req_dict = {}
+ with req_file.open() as f:
+ lines = f.readlines()
+ except FileNotFoundError as e:
+ # for local usage
+ root_dir = Path(__file__).parent.parent.parent
+ req_file = root_dir / "requirements.txt"
+ req_dict = {}
+ with req_file.open() as f:
+ lines = f.readlines()
+
for line in lines:
line = line.strip()
if not line.startswith("#"):
From 9f1447bf7160dfdc354d8eb386ee169a330dbbca Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Wed, 19 Feb 2020 17:09:29 +0200
Subject: [PATCH 073/496] where areth thou, file ?
---
spacy/tests/test_requirements.py | 20 +++++++++++++-------
1 file changed, 13 insertions(+), 7 deletions(-)
diff --git a/spacy/tests/test_requirements.py b/spacy/tests/test_requirements.py
index a6fa20d6b..23ba792df 100644
--- a/spacy/tests/test_requirements.py
+++ b/spacy/tests/test_requirements.py
@@ -8,20 +8,26 @@ def test_build_dependencies(en_vocab):
libs_ignore_setup = ["fugashi", "natto-py", "pythainlp"]
# check requirements.txt
+ req_dict = {}
try:
# for CLI usage
root_dir = Path(__file__).parent.parent
req_file = root_dir / "requirements.txt"
- req_dict = {}
with req_file.open() as f:
lines = f.readlines()
except FileNotFoundError as e:
- # for local usage
- root_dir = Path(__file__).parent.parent.parent
- req_file = root_dir / "requirements.txt"
- req_dict = {}
- with req_file.open() as f:
- lines = f.readlines()
+ try:
+ # for local usage
+ root_dir = Path(__file__).parent.parent.parent
+ req_file = root_dir / "requirements.txt"
+ with req_file.open() as f:
+ lines = f.readlines()
+ except FileNotFoundError as e:
+ # where areth thou ?
+ root_dir = Path(__file__).parent.parent.parent.parent
+ req_file = root_dir / "requirements.txt"
+ with req_file.open() as f:
+ lines = f.readlines()
for line in lines:
line = line.strip()
From 783da088eac9429852b48af38e32c4e219a95d57 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Sun, 23 Feb 2020 16:21:21 +0100
Subject: [PATCH 074/496] avoid try except
---
spacy/tests/test_requirements.py | 40 ++++++++++++--------------------
1 file changed, 15 insertions(+), 25 deletions(-)
diff --git a/spacy/tests/test_requirements.py b/spacy/tests/test_requirements.py
index 23ba792df..644e6f8f9 100644
--- a/spacy/tests/test_requirements.py
+++ b/spacy/tests/test_requirements.py
@@ -9,32 +9,22 @@ def test_build_dependencies(en_vocab):
# check requirements.txt
req_dict = {}
- try:
- # for CLI usage
- root_dir = Path(__file__).parent.parent
- req_file = root_dir / "requirements.txt"
- with req_file.open() as f:
- lines = f.readlines()
- except FileNotFoundError as e:
- try:
- # for local usage
- root_dir = Path(__file__).parent.parent.parent
- req_file = root_dir / "requirements.txt"
- with req_file.open() as f:
- lines = f.readlines()
- except FileNotFoundError as e:
- # where areth thou ?
- root_dir = Path(__file__).parent.parent.parent.parent
- req_file = root_dir / "requirements.txt"
- with req_file.open() as f:
- lines = f.readlines()
- for line in lines:
- line = line.strip()
- if not line.startswith("#"):
- lib, v = _parse_req(line)
- if lib and lib not in libs_ignore_requirements:
- req_dict[lib] = v
+ root_dir = None
+ # when running tests locally, the file is 3 levels up. On the CI, it's 2 levels up.
+ roots = [Path(__file__).parent.parent, Path(__file__).parent.parent.parent] # or whatever
+ for r in roots:
+ req_file = root_dir / "requirements.txt"
+ if req_file.exists():
+ root_dir = r
+ with req_file.open() as f:
+ lines = f.readlines()
+ for line in lines:
+ line = line.strip()
+ if not line.startswith("#"):
+ lib, v = _parse_req(line)
+ if lib and lib not in libs_ignore_requirements:
+ req_dict[lib] = v
# check setup.cfg and compare to requirements.txt
# also fails when there are missing or additional libs
From 0f55e5170414d90b048609eb44fbf8f27d085074 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Sun, 23 Feb 2020 16:33:58 +0100
Subject: [PATCH 075/496] assert we found the root_dir
---
spacy/tests/test_requirements.py | 2 ++
1 file changed, 2 insertions(+)
diff --git a/spacy/tests/test_requirements.py b/spacy/tests/test_requirements.py
index 644e6f8f9..21636766d 100644
--- a/spacy/tests/test_requirements.py
+++ b/spacy/tests/test_requirements.py
@@ -26,6 +26,8 @@ def test_build_dependencies(en_vocab):
if lib and lib not in libs_ignore_requirements:
req_dict[lib] = v
+ assert root_dir is not None, "Could not find the root directory of requirements.txt"
+
# check setup.cfg and compare to requirements.txt
# also fails when there are missing or additional libs
setup_file = root_dir / "setup.cfg"
From 58568bd0cd96b2f72f8b4ea81cfcc269ad93d1f5 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Sun, 23 Feb 2020 16:45:37 +0100
Subject: [PATCH 076/496] fix
---
spacy/tests/test_requirements.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/spacy/tests/test_requirements.py b/spacy/tests/test_requirements.py
index 21636766d..7922f1f18 100644
--- a/spacy/tests/test_requirements.py
+++ b/spacy/tests/test_requirements.py
@@ -14,7 +14,7 @@ def test_build_dependencies(en_vocab):
# when running tests locally, the file is 3 levels up. On the CI, it's 2 levels up.
roots = [Path(__file__).parent.parent, Path(__file__).parent.parent.parent] # or whatever
for r in roots:
- req_file = root_dir / "requirements.txt"
+ req_file = r / "requirements.txt"
if req_file.exists():
root_dir = r
with req_file.open() as f:
From d821c95eb05f3ad0b82601487093559d1d686a2c Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Sun, 23 Feb 2020 17:38:33 +0100
Subject: [PATCH 077/496] debugging prints
---
spacy/tests/test_requirements.py | 14 ++++++++++++++
1 file changed, 14 insertions(+)
diff --git a/spacy/tests/test_requirements.py b/spacy/tests/test_requirements.py
index 7922f1f18..fc5aeeddd 100644
--- a/spacy/tests/test_requirements.py
+++ b/spacy/tests/test_requirements.py
@@ -13,7 +13,9 @@ def test_build_dependencies(en_vocab):
root_dir = None
# when running tests locally, the file is 3 levels up. On the CI, it's 2 levels up.
roots = [Path(__file__).parent.parent, Path(__file__).parent.parent.parent] # or whatever
+ print()
for r in roots:
+ print("inspecting dir", r, "-->", [f.name for f in r.glob(pattern="*.*")])
req_file = r / "requirements.txt"
if req_file.exists():
root_dir = r
@@ -33,6 +35,18 @@ def test_build_dependencies(en_vocab):
setup_file = root_dir / "setup.cfg"
with setup_file.open() as f:
lines = f.readlines()
+
+ # import configparser
+ # config = configparser.ConfigParser()
+ # config.read(setup_file)
+ # print("SECTIONS", config.sections())
+ # print("options", config['options'])
+ # for key in config['options']:
+ # print("key", key)
+ # print("setup_requires *", config['options']['setup_requires'], "*")
+ # lines = config['options']['setup_requires']
+ # lines += config['options']['install_requires']
+
setup_keys = set()
for line in lines:
line = line.strip()
From 6f846c2cbf1a0a2b4ceaffb83dd8e3d43a22e03e Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Mon, 24 Feb 2020 09:19:08 +0100
Subject: [PATCH 078/496] removing --pyargs for testing purposes
---
azure-pipelines.yml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 054365336..316ac0c68 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -99,5 +99,5 @@ jobs:
pip install dist/$SDIST
displayName: 'Install from sdist'
- - script: python -m pytest --pyargs spacy
+ - script: python -m pytest spacy
displayName: 'Run tests'
From 217c16c7a9f6c08c078d56fb34bc6497e8c38131 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Mon, 24 Feb 2020 09:38:43 +0100
Subject: [PATCH 079/496] running tests BEFORE deleting them ?
---
azure-pipelines.yml | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 316ac0c68..2ebc381cd 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -84,6 +84,9 @@ jobs:
pip install -r requirements.txt
displayName: 'Install dependencies'
+ - script: python -m pytest spacy
+ displayName: 'Run tests'
+
- script: |
python setup.py build_ext --inplace
python setup.py sdist --formats=gztar
@@ -99,5 +102,3 @@ jobs:
pip install dist/$SDIST
displayName: 'Install from sdist'
- - script: python -m pytest spacy
- displayName: 'Run tests'
From d5bfebe1c5d4772004965a450b57a3ca3119bcd2 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Mon, 24 Feb 2020 10:04:24 +0100
Subject: [PATCH 080/496] it's moving day
---
azure-pipelines.yml | 15 +++++++--------
1 file changed, 7 insertions(+), 8 deletions(-)
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 2ebc381cd..779037c96 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -84,21 +84,20 @@ jobs:
pip install -r requirements.txt
displayName: 'Install dependencies'
- - script: python -m pytest spacy
- displayName: 'Run tests'
-
- script: |
python setup.py build_ext --inplace
python setup.py sdist --formats=gztar
displayName: 'Compile and build sdist'
- - task: DeleteFiles@1
- inputs:
- contents: 'spacy'
- displayName: 'Delete source directory'
-
- bash: |
SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
pip install dist/$SDIST
displayName: 'Install from sdist'
+ - script: python -m pytest spacy
+ displayName: 'Run tests'
+
+ - task: DeleteFiles@1
+ inputs:
+ contents: 'spacy'
+ displayName: 'Delete source directory'
From c1a5ece65f18b4955c8e7e72bdf815c78290d6f4 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Tue, 25 Feb 2020 15:46:39 +0100
Subject: [PATCH 081/496] Tidy up setup and update requirements tests
---
.gitignore | 5 ++
pyproject.toml | 2 +-
requirements.txt | 2 +-
setup.py | 26 ++++++--
spacy/tests/package/test_requirements.py | 76 ++++++++++++++++++++++
spacy/tests/test_requirements.py | 83 ------------------------
6 files changed, 102 insertions(+), 92 deletions(-)
create mode 100644 spacy/tests/package/test_requirements.py
delete mode 100644 spacy/tests/test_requirements.py
diff --git a/.gitignore b/.gitignore
index a0af6d4d2..f39607b76 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,6 +5,11 @@ corpora/
keys/
*.json.gz
+# Tests
+spacy/tests/package/setup.cfg
+spacy/tests/package/pyproject.toml
+spacy/tests/package/requirements.txt
+
# Website
website/.cache/
website/public/
diff --git a/pyproject.toml b/pyproject.toml
index 8a6ababf3..8d3652a2f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,6 +6,6 @@ requires = [
"cymem>=2.0.2,<2.1.0",
"preshed>=3.0.2,<3.1.0",
"murmurhash>=0.28.0,<1.1.0",
- "thinc==7.4.0.dev0",
+ "thinc==8.0.0a0",
]
build-backend = "setuptools.build_meta"
diff --git a/requirements.txt b/requirements.txt
index bb6bf9804..f3a7cc162 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -15,7 +15,7 @@ plac>=0.9.6,<1.2.0
tqdm>=4.38.0,<5.0.0
# Optional dependencies
jsonschema>=2.6.0,<3.1.0
-pydantic>=1.0.0,<2.0.0
+pydantic>=1.3.0,<2.0.0
# Development dependencies
cython>=0.25
pytest>=4.6.5
diff --git a/setup.py b/setup.py
index 31f22ba3f..d850a74ac 100755
--- a/setup.py
+++ b/setup.py
@@ -7,15 +7,19 @@ from distutils import ccompiler, msvccompiler
from setuptools import Extension, setup, find_packages
import numpy
from pathlib import Path
+import shutil
from Cython.Build import cythonize
from Cython.Compiler import Options
+ROOT = Path(__file__).parent
+PACKAGE_ROOT = ROOT / "spacy"
+
+
# Preserve `__doc__` on functions and classes
# http://docs.cython.org/en/latest/src/userguide/source_files_and_compilation.html#compiler-options
Options.docstrings = True
-
PACKAGES = find_packages()
MOD_NAMES = [
"spacy.parts_of_speech",
@@ -60,6 +64,12 @@ COMPILER_DIRECTIVES = {
"embedsignature": True,
"annotation_typing": False,
}
+# Files to copy into the package that are otherwise not included
+COPY_FILES = {
+ ROOT / "setup.cfg": PACKAGE_ROOT / "tests" / "package",
+ ROOT / "pyproject.toml": PACKAGE_ROOT / "tests" / "package",
+ ROOT / "requirements.txt": PACKAGE_ROOT / "tests" / "package",
+}
def is_new_osx():
@@ -115,25 +125,27 @@ def clean(path):
def setup_package():
- root = Path(__file__).parent
-
if len(sys.argv) > 1 and sys.argv[1] == "clean":
- return clean(root / "spacy")
+ return clean(PACKAGE_ROOT)
- with (root / "spacy" / "about.py").open("r") as f:
+ with (PACKAGE_ROOT / "about.py").open("r") as f:
about = {}
exec(f.read(), about)
+ for copy_file, target_dir in COPY_FILES.items():
+ shutil.copy(str(copy_file), str(target_dir))
+ print(f"Copied {copy_file} -> {target_dir}")
+
include_dirs = [
get_python_inc(plat_specific=True),
numpy.get_include(),
- str(root / "include"),
+ str(ROOT / "include"),
]
if (
ccompiler.new_compiler().compiler_type == "msvc"
and msvccompiler.get_build_version() == 9
):
- include_dirs.append(str(root / "include" / "msvc9"))
+ include_dirs.append(str(ROOT / "include" / "msvc9"))
ext_modules = []
for name in MOD_NAMES:
mod_path = name.replace(".", "/") + ".pyx"
diff --git a/spacy/tests/package/test_requirements.py b/spacy/tests/package/test_requirements.py
new file mode 100644
index 000000000..59a8569ee
--- /dev/null
+++ b/spacy/tests/package/test_requirements.py
@@ -0,0 +1,76 @@
+import re
+from pathlib import Path
+
+
+def test_build_dependencies():
+ # Check that library requirements are pinned exactly the same across different setup files.
+ libs_ignore_requirements = [
+ "pytest",
+ "pytest-timeout",
+ "mock",
+ "flake8",
+ "jsonschema",
+ ]
+ libs_ignore_setup = ["fugashi", "natto-py", "pythainlp"]
+
+ # check requirements.txt
+ req_dict = {}
+
+ root_dir = Path(__file__).parent
+ req_file = root_dir / "requirements.txt"
+ with req_file.open() as f:
+ lines = f.readlines()
+ for line in lines:
+ line = line.strip()
+ if not line.startswith("#"):
+ lib, v = _parse_req(line)
+ if lib and lib not in libs_ignore_requirements:
+ req_dict[lib] = v
+ # check setup.cfg and compare to requirements.txt
+ # also fails when there are missing or additional libs
+ setup_file = root_dir / "setup.cfg"
+ with setup_file.open() as f:
+ lines = f.readlines()
+
+ setup_keys = set()
+ for line in lines:
+ line = line.strip()
+ if not line.startswith("#"):
+ lib, v = _parse_req(line)
+ if lib and not lib.startswith("cupy") and lib not in libs_ignore_setup:
+ req_v = req_dict.get(lib, None)
+ assert (
+ req_v is not None
+ ), "{} in setup.cfg but not in requirements.txt".format(lib)
+ assert (lib + v) == (lib + req_v), (
+ "{} has different version in setup.cfg and in requirements.txt: "
+ "{} and {} respectively".format(lib, v, req_v)
+ )
+ setup_keys.add(lib)
+ assert sorted(setup_keys) == sorted(
+ req_dict.keys()
+ ) # if fail: requirements.txt contains a lib not in setup.cfg
+
+ # check pyproject.toml and compare the versions of the libs to requirements.txt
+ # does not fail when there are missing or additional libs
+ toml_file = root_dir / "pyproject.toml"
+ with toml_file.open() as f:
+ lines = f.readlines()
+ for line in lines:
+ line = line.strip().strip(",").strip('"')
+ if not line.startswith("#"):
+ lib, v = _parse_req(line)
+ if lib:
+ req_v = req_dict.get(lib, None)
+ assert (lib + v) == (lib + req_v), (
+ "{} has different version in pyproject.toml and in requirements.txt: "
+ "{} and {} respectively".format(lib, v, req_v)
+ )
+
+
+def _parse_req(line):
+ lib = re.match(r"^[a-z0-9\-]*", line).group(0)
+ v = line.replace(lib, "").strip()
+ if not re.match(r"^[<>=][<>=].*", v):
+ return None, None
+ return lib, v
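For illustration, `_parse_req` applied to a couple of example lines (the requirement strings below are examples and need not match the real requirements.txt):

    import re

    def _parse_req(line):
        # Same parsing logic as the test above: library name, then a pinned version spec
        lib = re.match(r"^[a-z0-9\-]*", line).group(0)
        v = line.replace(lib, "").strip()
        if not re.match(r"^[<>=][<>=].*", v):
            return None, None
        return lib, v

    print(_parse_req("wasabi>=0.4.0,<1.1.0"))  # ('wasabi', '>=0.4.0,<1.1.0')
    print(_parse_req("# just a comment"))      # (None, None)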
diff --git a/spacy/tests/test_requirements.py b/spacy/tests/test_requirements.py
deleted file mode 100644
index fc5aeeddd..000000000
--- a/spacy/tests/test_requirements.py
+++ /dev/null
@@ -1,83 +0,0 @@
-import re
-from pathlib import Path
-
-
-def test_build_dependencies(en_vocab):
- # Check that library requirements are pinned exactly the same across different setup files.
- libs_ignore_requirements = ["pytest", "pytest-timeout", "mock", "flake8", "jsonschema"]
- libs_ignore_setup = ["fugashi", "natto-py", "pythainlp"]
-
- # check requirements.txt
- req_dict = {}
-
- root_dir = None
- # when running tests locally, the file is 3 levels up. On the CI, it's 2 levels up.
- roots = [Path(__file__).parent.parent, Path(__file__).parent.parent.parent] # or whatever
- print()
- for r in roots:
- print("inspecting dir", r, "-->", [f.name for f in r.glob(pattern="*.*")])
- req_file = r / "requirements.txt"
- if req_file.exists():
- root_dir = r
- with req_file.open() as f:
- lines = f.readlines()
- for line in lines:
- line = line.strip()
- if not line.startswith("#"):
- lib, v = _parse_req(line)
- if lib and lib not in libs_ignore_requirements:
- req_dict[lib] = v
-
- assert root_dir is not None, "Could not find the root directory of requirements.txt"
-
- # check setup.cfg and compare to requirements.txt
- # also fails when there are missing or additional libs
- setup_file = root_dir / "setup.cfg"
- with setup_file.open() as f:
- lines = f.readlines()
-
- # import configparser
- # config = configparser.ConfigParser()
- # config.read(setup_file)
- # print("SECTIONS", config.sections())
- # print("options", config['options'])
- # for key in config['options']:
- # print("key", key)
- # print("setup_requires *", config['options']['setup_requires'], "*")
- # lines = config['options']['setup_requires']
- # lines += config['options']['install_requires']
-
- setup_keys = set()
- for line in lines:
- line = line.strip()
- if not line.startswith("#"):
- lib, v = _parse_req(line)
- if lib and not lib.startswith("cupy") and lib not in libs_ignore_setup:
- req_v = req_dict.get(lib, None)
- assert req_v is not None, "{} in setup.cfg but not in requirements.txt".format(lib)
- assert (lib+v) == (lib+req_v), "{} has different version in setup.cfg and in requirements.txt: " \
- "{} and {} respectively".format(lib, v, req_v)
- setup_keys.add(lib)
- assert sorted(setup_keys) == sorted(req_dict.keys()) # if fail: requirements.txt contains a lib not in setup.cfg
-
- # check pyproject.toml and compare the versions of the libs to requirements.txt
- # does not fail when there are missing or additional libs
- toml_file = root_dir / "pyproject.toml"
- with toml_file.open() as f:
- lines = f.readlines()
- for line in lines:
- line = line.strip().strip(",").strip("\"")
- if not line.startswith("#"):
- lib, v = _parse_req(line)
- if lib:
- req_v = req_dict.get(lib, None)
- assert (lib+v) == (lib+req_v), "{} has different version in pyproject.toml and in requirements.txt: " \
- "{} and {} respectively".format(lib, v, req_v)
-
-
-def _parse_req(line):
- lib = re.match(r"^[a-z0-9\-]*", line).group(0)
- v = line.replace(lib, "").strip()
- if not re.match(r"^[<>=][<>=].*", v):
- return None, None
- return lib, v
From 436b26fe0fb28118d41351aa4f94bb4bb9932cd0 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Tue, 25 Feb 2020 15:48:29 +0100
Subject: [PATCH 082/496] Revert other changes
---
MANIFEST.in | 1 -
azure-pipelines.yml | 12 ++++++------
2 files changed, 6 insertions(+), 7 deletions(-)
diff --git a/MANIFEST.in b/MANIFEST.in
index 64886cd19..1947b9140 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -4,6 +4,5 @@ include LICENSE
include README.md
include bin/spacy
include pyproject.toml
-include requirements.txt
recursive-exclude spacy/lang *.json
recursive-include spacy/lang *.json.gz
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 5a5e8f03a..d34da39f7 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -83,15 +83,15 @@ jobs:
python setup.py sdist --formats=gztar
displayName: 'Compile and build sdist'
+ - task: DeleteFiles@1
+ inputs:
+ contents: 'spacy'
+ displayName: 'Delete source directory'
+
- bash: |
SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
pip install dist/$SDIST
displayName: 'Install from sdist'
- - script: python -m pytest spacy
+ - script: python -m pytest --pyargs spacy
displayName: 'Run tests'
-
- - task: DeleteFiles@1
- inputs:
- contents: 'spacy'
- displayName: 'Delete source directory'
From 912572e04a6fe15c515c005847f054c989d5e6f1 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Tue, 25 Feb 2020 16:01:58 +0100
Subject: [PATCH 083/496] Only copy if file exists (not if installed from sdist
etc.)
---
setup.py | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/setup.py b/setup.py
index d850a74ac..d9021836f 100755
--- a/setup.py
+++ b/setup.py
@@ -133,8 +133,9 @@ def setup_package():
exec(f.read(), about)
for copy_file, target_dir in COPY_FILES.items():
- shutil.copy(str(copy_file), str(target_dir))
- print(f"Copied {copy_file} -> {target_dir}")
+ if copy_file.exists():
+ shutil.copy(str(copy_file), str(target_dir))
+ print(f"Copied {copy_file} -> {target_dir}")
include_dirs = [
get_python_inc(plat_specific=True),
From b6a6cff70857b9edd0a1d2fa6a2fe62deb7a4290 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Tue, 25 Feb 2020 16:17:23 +0100
Subject: [PATCH 084/496] Add blis to pyproject.toml
---
pyproject.toml | 1 +
1 file changed, 1 insertion(+)
diff --git a/pyproject.toml b/pyproject.toml
index 8d3652a2f..71e523c7c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -7,5 +7,6 @@ requires = [
"preshed>=3.0.2,<3.1.0",
"murmurhash>=0.28.0,<1.1.0",
"thinc==8.0.0a0",
+ "blis>=0.4.0,<0.5.0"
]
build-backend = "setuptools.build_meta"
From fc6e34c3a13b93caaed7b2c0cf60dcc0df59c0f4 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Wed, 26 Feb 2020 08:44:22 +0100
Subject: [PATCH 085/496] fix bugs from porting master to develop
---
.../wikidata_train_entity_linker.py | 4 +---
spacy/cli/train.py | 22 +++----------------
2 files changed, 4 insertions(+), 22 deletions(-)
diff --git a/bin/wiki_entity_linking/wikidata_train_entity_linker.py b/bin/wiki_entity_linking/wikidata_train_entity_linker.py
index 386af7d4d..af0e68768 100644
--- a/bin/wiki_entity_linking/wikidata_train_entity_linker.py
+++ b/bin/wiki_entity_linking/wikidata_train_entity_linker.py
@@ -175,12 +175,10 @@ def main(
kb=kb,
labels_discard=labels_discard,
)
- docs, golds = zip(*train_batch)
try:
with nlp.disable_pipes(*other_pipes):
nlp.update(
- docs=docs,
- golds=golds,
+ examples=train_batch,
sgd=optimizer,
drop=dropout,
losses=losses,
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index d8514095b..92f94b53d 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -28,13 +28,6 @@ def train(
pipeline: ("Comma-separated names of pipeline components", "option", "p", str) = "tagger,parser,ner",
vectors: ("Model to load vectors from", "option", "v", str) = None,
replace_components: ("Replace components from base model", "flag", "R", bool) = False,
- width: ("Width of CNN layers of Tok2Vec component", "option", "cw", int) = 96,
- conv_depth: ("Depth of CNN layers of Tok2Vec component", "option", "cd", int) = 4,
- cnn_window: ("Window size for CNN layers of Tok2Vec component", "option", "cW", int) = 1,
- cnn_pieces: ("Maxout size for CNN layers of Tok2Vec component. 1 for Mish", "option", "cP", int) = 3,
- use_chars: ("Whether to use character-based embedding of Tok2Vec component", "flag", "chr", bool) = False,
- bilstm_depth: ("Depth of BiLSTM layers of Tok2Vec component (requires PyTorch)", "option", "lstm", int) = 0,
- embed_rows: ("Number of embedding rows of Tok2Vec component", "option", "er", int) = 2000,
n_iter: ("Number of iterations", "option", "n", int) = 30,
n_early_stopping: ("Maximum number of training epochs without dev accuracy improvement", "option", "ne", int) = None,
n_examples: ("Number of examples", "option", "ns", int) = 0,
@@ -232,14 +225,7 @@ def train(
else:
# Start with a blank model, call begin_training
cfg = {"device": use_gpu}
- cfg["conv_depth"] = conv_depth
- cfg["token_vector_width"] = width
- cfg["bilstm_depth"] = bilstm_depth
- cfg["cnn_maxout_pieces"] = cnn_pieces
- cfg["embed_size"] = embed_rows
- cfg["conv_window"] = cnn_window
- cfg["subword_features"] = not use_chars
- optimizer = nlp.begin_training(lambda: corpus.train_tuples, **cfg)
+ optimizer = nlp.begin_training(lambda: corpus.train_examples, **cfg)
nlp._optimizer = None
# Load in pretrained weights
@@ -362,11 +348,9 @@ def train(
for batch in util.minibatch_by_words(train_data, size=batch_sizes):
if not batch:
continue
- docs, golds = zip(*batch)
try:
nlp.update(
- docs,
- golds,
+ batch,
sgd=optimizer,
drop=next(dropout_rates),
losses=losses,
@@ -609,7 +593,7 @@ def _get_metrics(component):
elif component == "tagger":
return ("tags_acc",)
elif component == "ner":
- return ("ents_f", "ents_p", "ents_r", "enty_per_type")
+ return ("ents_f", "ents_p", "ents_r", "ents_per_type")
elif component == "sentrec":
return ("sent_f", "sent_p", "sent_r")
elif component == "textcat":
From 06f0a8daa0b919edbafa966db42fc74dce5cab02 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem
Date: Thu, 27 Feb 2020 18:42:27 +0100
Subject: [PATCH 086/496] Default settings to configurations (#4995)
* fix grad_clip naming
* cleaning up pretrained_vectors out of cfg
* further refactoring Model init's
* move Model building out of pipes
* further refactor to require a model config when creating a pipe
* small fixes
* making cfg in nn_parser more consistent
* fixing nr_class for parser
* fixing nn_parser's nO
* fix printing of loss
* architectures in own file per type, consistent naming
* convenience methods default_tagger_config and default_tok2vec_config
* let create_pipe access default config if available for that component
* default_parser_config
* move defaults to separate folder
* allow reading nlp from package or dir with argument 'name'
* architecture spacy.VocabVectors.v1 to read static vectors from file
* cleanup
* default configs for nel, textcat, morphologizer, tensorizer
* fix imports
* fixing unit tests
* fixes and clean up
* fixing defaults, nO, fix unit tests
* restore parser IO
* fix IO
* 'fix' serialization test
* add *.cfg to manifest
* fix example configs with additional arguments
* replace Morpohologizer with Tagger
* add IO bit when testing overfitting of tagger (currently failing)
* fix IO - don't initialize when reading from disk
* expand overfitting tests to also check IO goes OK
* remove dropout from HashEmbed to fix Tagger performance
* add defaults for sentrec
* update thinc
* always pass a Model instance to a Pipe
* fix piped_added statement
* remove obsolete W029
* remove obsolete errors
* restore byte checking tests (work again)
* clean up test
* further test cleanup
* convert from config to Model in create_pipe
* bring back error when component is not initialized
* cleanup
* remove calls for nlp2.begin_training
* use thinc.api in imports
* allow setting charembed's nM and nC
* fix for hardcoded nM/nC + unit test
* formatting fixes
* trigger build
---
MANIFEST.in | 2 +-
bin/ud/ud_train.py | 4 +-
bin/wiki_entity_linking/train_descriptions.py | 6 +-
.../ptb-joint-pos-dep/bilstm_tok2vec.cfg | 12 +-
.../ptb-joint-pos-dep/defaults.cfg | 11 +-
examples/training/pretrain_textcat.py | 17 +-
examples/training/train_textcat.py | 2 +-
pyproject.toml | 2 +-
requirements.txt | 2 +-
setup.cfg | 4 +-
spacy/cli/pretrain.py | 17 +-
spacy/cli/train.py | 88 ++--
spacy/cli/train_from_config.py | 139 ++-----
spacy/errors.py | 15 +-
spacy/language.py | 75 +++-
spacy/ml/component_models.py | 227 ----------
spacy/ml/models/__init__.py | 6 +
spacy/ml/models/defaults/__init__.py | 93 +++++
.../defaults/entity_linker_defaults.cfg | 12 +
.../defaults/morphologizer_defaults.cfg | 14 +
spacy/ml/models/defaults/ner_defaults.cfg | 15 +
spacy/ml/models/defaults/parser_defaults.cfg | 15 +
spacy/ml/models/defaults/sentrec_defaults.cfg | 14 +
spacy/ml/models/defaults/tagger_defaults.cfg | 12 +
.../models/defaults/tensorizer_defaults.cfg | 4 +
spacy/ml/models/defaults/textcat_defaults.cfg | 13 +
spacy/ml/models/defaults/tok2vec_defaults.cfg | 9 +
spacy/ml/models/entity_linker.py | 23 ++
spacy/ml/models/multi_task.py | 29 ++
spacy/ml/models/parser.py | 33 ++
spacy/ml/models/tagger.py | 16 +
spacy/ml/models/tensorizer.py | 10 +
spacy/ml/models/textcat.py | 42 ++
spacy/ml/models/tok2vec.py | 390 ++++++++++++++++++
spacy/ml/tok2vec.py | 178 --------
spacy/pipeline/entityruler.py | 2 +-
spacy/pipeline/hooks.py | 2 -
spacy/pipeline/morphologizer.pyx | 26 +-
spacy/pipeline/pipes.pyx | 356 +++-------------
spacy/pipeline/tok2vec.py | 21 +-
spacy/syntax/_parser_model.pyx | 65 ++-
spacy/syntax/nn_parser.pyx | 221 +++-------
spacy/tests/doc/test_add_entities.py | 5 +-
spacy/tests/parser/test_add_label.py | 18 +-
spacy/tests/parser/test_arc_eager_oracle.py | 4 +-
spacy/tests/parser/test_ner.py | 57 +--
spacy/tests/parser/test_neural_parser.py | 20 +-
spacy/tests/parser/test_nn_beam.py | 3 +-
spacy/tests/parser/test_parse.py | 15 +-
spacy/tests/parser/test_preset_sbd.py | 4 +-
spacy/tests/pipeline/test_analysis.py | 3 +-
spacy/tests/pipeline/test_tagger.py | 19 +-
spacy/tests/pipeline/test_textcat.py | 22 +-
spacy/tests/regression/test_issue1501-2000.py | 8 +-
spacy/tests/regression/test_issue2001-2500.py | 3 +
spacy/tests/regression/test_issue3001-3500.py | 4 +-
spacy/tests/regression/test_issue3830.py | 6 +-
spacy/tests/regression/test_issue4042.py | 3 +-
spacy/tests/regression/test_issue4313.py | 3 +-
.../tests/serialize/test_serialize_config.py | 126 ++++++
.../serialize/test_serialize_language.py | 3 +-
.../serialize/test_serialize_pipeline.py | 71 ++--
spacy/tests/test_tok2vec.py | 36 +-
spacy/util.py | 47 ++-
64 files changed, 1511 insertions(+), 1213 deletions(-)
delete mode 100644 spacy/ml/component_models.py
create mode 100644 spacy/ml/models/__init__.py
create mode 100644 spacy/ml/models/defaults/__init__.py
create mode 100644 spacy/ml/models/defaults/entity_linker_defaults.cfg
create mode 100644 spacy/ml/models/defaults/morphologizer_defaults.cfg
create mode 100644 spacy/ml/models/defaults/ner_defaults.cfg
create mode 100644 spacy/ml/models/defaults/parser_defaults.cfg
create mode 100644 spacy/ml/models/defaults/sentrec_defaults.cfg
create mode 100644 spacy/ml/models/defaults/tagger_defaults.cfg
create mode 100644 spacy/ml/models/defaults/tensorizer_defaults.cfg
create mode 100644 spacy/ml/models/defaults/textcat_defaults.cfg
create mode 100644 spacy/ml/models/defaults/tok2vec_defaults.cfg
create mode 100644 spacy/ml/models/entity_linker.py
create mode 100644 spacy/ml/models/multi_task.py
create mode 100644 spacy/ml/models/parser.py
create mode 100644 spacy/ml/models/tagger.py
create mode 100644 spacy/ml/models/tensorizer.py
create mode 100644 spacy/ml/models/textcat.py
create mode 100644 spacy/ml/models/tok2vec.py
create mode 100644 spacy/tests/serialize/test_serialize_config.py
diff --git a/MANIFEST.in b/MANIFEST.in
index 1947b9140..e6d25284f 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,5 +1,5 @@
recursive-include include *.h
-recursive-include spacy *.txt *.pyx *.pxd
+recursive-include spacy *.pyx *.pxd *.txt *.cfg
include LICENSE
include README.md
include bin/spacy
diff --git a/bin/ud/ud_train.py b/bin/ud/ud_train.py
index bda22088d..aa5050f3a 100644
--- a/bin/ud/ud_train.py
+++ b/bin/ud/ud_train.py
@@ -386,8 +386,8 @@ def _load_pretrained_tok2vec(nlp, loc):
weights_data = file_.read()
loaded = []
for name, component in nlp.pipeline:
- if hasattr(component, "model") and hasattr(component.model, "tok2vec"):
- component.tok2vec.from_bytes(weights_data)
+ if hasattr(component, "model") and component.model.has_ref("tok2vec"):
+ component.get_ref("tok2vec").from_bytes(weights_data)
loaded.append(name)
return loaded
diff --git a/bin/wiki_entity_linking/train_descriptions.py b/bin/wiki_entity_linking/train_descriptions.py
index d98bba565..b0cfbb4c6 100644
--- a/bin/wiki_entity_linking/train_descriptions.py
+++ b/bin/wiki_entity_linking/train_descriptions.py
@@ -1,13 +1,9 @@
-# coding: utf-8
from random import shuffle
import logging
import numpy as np
-from thinc.model import Model
-from thinc.api import chain
-from thinc.loss import CosineDistance
-from thinc.layers import Linear
+from thinc.api import Model, chain, CosineDistance, Linear
from spacy.util import create_default_optimizer
diff --git a/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg b/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg
index 8cd150868..4f1a915c5 100644
--- a/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg
+++ b/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg
@@ -39,25 +39,27 @@ factory = "tagger"
factory = "parser"
[nlp.pipeline.tagger.model]
-@architectures = "tagger_model.v1"
+@architectures = "spacy.Tagger.v1"
[nlp.pipeline.tagger.model.tok2vec]
-@architectures = "tok2vec_tensors.v1"
+@architectures = "spacy.Tok2VecTensors.v1"
width = ${nlp.pipeline.tok2vec.model:width}
[nlp.pipeline.parser.model]
-@architectures = "transition_based_parser.v1"
+@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 8
hidden_width = 64
maxout_pieces = 3
[nlp.pipeline.parser.model.tok2vec]
-@architectures = "tok2vec_tensors.v1"
+@architectures = "spacy.Tok2VecTensors.v1"
width = ${nlp.pipeline.tok2vec.model:width}
[nlp.pipeline.tok2vec.model]
-@architectures = "hash_embed_bilstm.v1"
+@architectures = "spacy.HashEmbedBiLSTM.v1"
pretrained_vectors = ${nlp:vectors}
width = 96
depth = 4
embed_size = 2000
+subword_features = true
+char_embed = false
diff --git a/examples/experiments/ptb-joint-pos-dep/defaults.cfg b/examples/experiments/ptb-joint-pos-dep/defaults.cfg
index 6735284a7..2ceaab0be 100644
--- a/examples/experiments/ptb-joint-pos-dep/defaults.cfg
+++ b/examples/experiments/ptb-joint-pos-dep/defaults.cfg
@@ -39,27 +39,28 @@ factory = "tagger"
factory = "parser"
[nlp.pipeline.tagger.model]
-@architectures = "tagger_model.v1"
+@architectures = "spacy.Tagger.v1"
[nlp.pipeline.tagger.model.tok2vec]
-@architectures = "tok2vec_tensors.v1"
+@architectures = "spacy.Tok2VecTensors.v1"
width = ${nlp.pipeline.tok2vec.model:width}
[nlp.pipeline.parser.model]
-@architectures = "transition_based_parser.v1"
+@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 8
hidden_width = 64
maxout_pieces = 3
[nlp.pipeline.parser.model.tok2vec]
-@architectures = "tok2vec_tensors.v1"
+@architectures = "spacy.Tok2VecTensors.v1"
width = ${nlp.pipeline.tok2vec.model:width}
[nlp.pipeline.tok2vec.model]
-@architectures = "hash_embed_cnn.v1"
+@architectures = "spacy.HashEmbedCNN.v1"
pretrained_vectors = ${nlp:vectors}
width = 96
depth = 4
window_size = 1
embed_size = 2000
maxout_pieces = 3
+subword_features = true
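The `@architectures` strings in these configs are resolved through spaCy's registry and called with the rest of the block as configuration, as in `util.make_layer()` earlier in this series. Below is a purely illustrative, dependency-free sketch of that lookup pattern; the architecture name and builder are hypothetical:

    # Hypothetical architecture function; real ones like "spacy.HashEmbedCNN.v1"
    # are registered inside spaCy itself.
    def build_toy_tok2vec(config):
        return ("toy-tok2vec", config)

    # Minimal stand-in for registry.architectures
    architectures = {"toy.Tok2Vec.v1": build_toy_tok2vec}

    arch_config = {"arch": "toy.Tok2Vec.v1", "config": {"width": 96, "depth": 4}}
    arch_func = architectures[arch_config["arch"]]
    print(arch_func(arch_config["config"]))  # ('toy-tok2vec', {'width': 96, 'depth': 4})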
diff --git a/examples/training/pretrain_textcat.py b/examples/training/pretrain_textcat.py
index 85d36fd66..0aefec9ef 100644
--- a/examples/training/pretrain_textcat.py
+++ b/examples/training/pretrain_textcat.py
@@ -20,9 +20,9 @@ import random
import ml_datasets
import spacy
-from spacy.util import minibatch, use_gpu, compounding
+from spacy.util import minibatch
from spacy.pipeline import TextCategorizer
-from spacy.ml.tok2vec import Tok2Vec
+from spacy.ml.models.tok2vec import build_Tok2Vec_model
import numpy
@@ -65,9 +65,7 @@ def prefer_gpu():
def build_textcat_model(tok2vec, nr_class, width):
- from thinc.model import Model
- from thinc.layers import Softmax, chain, reduce_mean
- from thinc.layers import list2ragged
+ from thinc.api import Model, Softmax, chain, reduce_mean, list2ragged
with Model.define_operators({">>": chain}):
model = (
@@ -76,7 +74,7 @@ def build_textcat_model(tok2vec, nr_class, width):
>> reduce_mean()
>> Softmax(nr_class, width)
)
- model.tok2vec = tok2vec
+ model.set_ref("tok2vec", tok2vec)
return model
@@ -97,8 +95,9 @@ def create_pipeline(width, embed_size, vectors_model):
textcat = TextCategorizer(
nlp.vocab,
labels=["POSITIVE", "NEGATIVE"],
+ # TODO: replace with config version
model=build_textcat_model(
- Tok2Vec(width=width, embed_size=embed_size), 2, width
+ build_Tok2Vec_model(width=width, embed_size=embed_size), 2, width
),
)
@@ -121,7 +120,7 @@ def train_tensorizer(nlp, texts, dropout, n_iter):
def train_textcat(nlp, n_texts, n_iter=10):
textcat = nlp.get_pipe("textcat")
- tok2vec_weights = textcat.model.tok2vec.to_bytes()
+ tok2vec_weights = textcat.model.get_ref("tok2vec").to_bytes()
(train_texts, train_cats), (dev_texts, dev_cats) = load_textcat_data(limit=n_texts)
print(
"Using {} examples ({} training, {} evaluation)".format(
@@ -135,7 +134,7 @@ def train_textcat(nlp, n_texts, n_iter=10):
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
with nlp.disable_pipes(*other_pipes): # only train textcat
optimizer = nlp.begin_training()
- textcat.model.tok2vec.from_bytes(tok2vec_weights)
+ textcat.model.get_ref("tok2vec").from_bytes(tok2vec_weights)
print("Training the model...")
print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
for i in range(n_iter):
diff --git a/examples/training/train_textcat.py b/examples/training/train_textcat.py
index 4d402e04d..50c852ac1 100644
--- a/examples/training/train_textcat.py
+++ b/examples/training/train_textcat.py
@@ -74,7 +74,7 @@ def main(model=None, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=None
optimizer = nlp.begin_training()
if init_tok2vec is not None:
with init_tok2vec.open("rb") as file_:
- textcat.model.tok2vec.from_bytes(file_.read())
+ textcat.model.get_ref("tok2vec").from_bytes(file_.read())
print("Training the model...")
print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
batch_sizes = compounding(4.0, 32.0, 1.001)
diff --git a/pyproject.toml b/pyproject.toml
index 71e523c7c..ee28d5d42 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,7 +6,7 @@ requires = [
"cymem>=2.0.2,<2.1.0",
"preshed>=3.0.2,<3.1.0",
"murmurhash>=0.28.0,<1.1.0",
- "thinc==8.0.0a0",
+ "thinc==8.0.0a1",
"blis>=0.4.0,<0.5.0"
]
build-backend = "setuptools.build_meta"
diff --git a/requirements.txt b/requirements.txt
index f3a7cc162..09998cdc9 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
# Our libraries
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
-thinc==8.0.0a0
+thinc==8.0.0a1
blis>=0.4.0,<0.5.0
ml_datasets>=0.1.1
murmurhash>=0.28.0,<1.1.0
diff --git a/setup.cfg b/setup.cfg
index 980269c35..7b3a468b6 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -36,13 +36,13 @@ setup_requires =
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
murmurhash>=0.28.0,<1.1.0
- thinc==8.0.0a0
+ thinc==8.0.0a1
install_requires =
# Our libraries
murmurhash>=0.28.0,<1.1.0
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
- thinc==8.0.0a0
+ thinc==8.0.0a1
blis>=0.4.0,<0.5.0
wasabi>=0.4.0,<1.1.0
srsly>=2.0.0,<3.0.0
diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py
index 690e3107d..95d549254 100644
--- a/spacy/cli/pretrain.py
+++ b/spacy/cli/pretrain.py
@@ -11,10 +11,10 @@ import srsly
from ..gold import Example
from ..errors import Errors
+from ..ml.models.multi_task import build_masked_language_model
from ..tokens import Doc
from ..attrs import ID, HEAD
-from ..ml.component_models import Tok2Vec
-from ..ml.component_models import masked_language_model
+from ..ml.models.tok2vec import build_Tok2Vec_model
from .. import util
from ..util import create_default_optimizer
from .train import _load_pretrained_tok2vec
@@ -108,14 +108,19 @@ def pretrain(
pretrained_vectors = None if not use_vectors else nlp.vocab.vectors
model = create_pretraining_model(
nlp,
- Tok2Vec(
+ # TODO: replace with config
+ build_Tok2Vec_model(
width,
embed_rows,
conv_depth=conv_depth,
pretrained_vectors=pretrained_vectors,
bilstm_depth=bilstm_depth, # Requires PyTorch. Experimental.
subword_features=not use_chars, # Set to False for Chinese etc
- cnn_maxout_pieces=cnn_pieces, # If set to 1, use Mish activation.
+ maxout_pieces=cnn_pieces, # If set to 1, use Mish activation.
+ window_size=1,
+ char_embed=False,
+ nM=64,
+ nC=8
),
)
# Load in pretrained weights
@@ -152,7 +157,7 @@ def pretrain(
is_temp_str = ".temp" if is_temp else ""
with model.use_params(optimizer.averages):
with (output_dir / f"model{epoch}{is_temp_str}.bin").open("wb") as file_:
- file_.write(model.tok2vec.to_bytes())
+ file_.write(model.get_ref("tok2vec").to_bytes())
log = {
"nr_word": tracker.nr_word,
"loss": tracker.loss,
@@ -284,7 +289,7 @@ def create_pretraining_model(nlp, tok2vec):
# "tok2vec" has to be the same set of processes as what the components do.
tok2vec = chain(tok2vec, list2array())
model = chain(tok2vec, output_layer)
- model = masked_language_model(nlp.vocab, model)
+ model = build_masked_language_model(nlp.vocab, model)
model.set_ref("tok2vec", tok2vec)
model.set_ref("output_layer", output_layer)
model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")])
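
Pretraining now builds the tok2vec layer through `build_Tok2Vec_model`, which takes every embedding and encoder setting as an explicit keyword argument instead of relying on the old `Tok2Vec` helper's defaults. A standalone call equivalent to the one above would look roughly like this, assuming this development branch (the values mirror the CLI defaults):

```python
from spacy.ml.models.tok2vec import build_Tok2Vec_model

tok2vec = build_Tok2Vec_model(
    width=96,
    embed_size=2000,
    pretrained_vectors=None,
    conv_depth=4,
    bilstm_depth=0,          # values > 0 require PyTorch
    maxout_pieces=3,
    window_size=1,
    subword_features=True,   # set to False for e.g. Chinese
    char_embed=False,
    nM=64,
    nC=8,
)
```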
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 92f94b53d..5667bb905 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -9,7 +9,7 @@ from wasabi import msg
import contextlib
import random
-from ..util import create_default_optimizer
+from ..util import create_default_optimizer, registry
from ..util import use_gpu as set_gpu
from ..attrs import PROB, IS_OOV, CLUSTER, LANG
from ..gold import GoldCorpus
@@ -111,6 +111,8 @@ def train(
eval_beam_widths.sort()
has_beam_widths = eval_beam_widths != [1]
+ default_dir = Path(__file__).parent.parent / "ml" / "models" / "defaults"
+
# Set up the base model and pipeline. If a base model is specified, load
# the model and make sure the pipeline matches the pipeline setting. If
# training starts from a blank model, initialize the language class.
@@ -118,7 +120,6 @@ def train(
msg.text(f"Training pipeline: {pipeline}")
disabled_pipes = None
pipes_added = False
- msg.text(f"Training pipeline: {pipeline}")
if use_gpu >= 0:
activated_gpu = None
try:
@@ -140,16 +141,36 @@ def train(
f"specified as `lang` argument ('{lang}') ",
exits=1,
)
+ if vectors:
+ msg.text(f"Loading vectors from model '{vectors}'")
+
+ nlp.disable_pipes([p for p in nlp.pipe_names if p not in pipeline])
for pipe in pipeline:
- pipe_cfg = {}
+ # First, create the model.
+ # This is a bit of a hack after the refactor to get the vectors into a
+ # default config; prefer train-from-config instead.
if pipe == "parser":
- pipe_cfg = {"learn_tokens": learn_tokens}
+ config_loc = default_dir / "parser_defaults.cfg"
+ elif pipe == "tagger":
+ config_loc = default_dir / "tagger_defaults.cfg"
+ elif pipe == "ner":
+ config_loc = default_dir / "ner_defaults.cfg"
elif pipe == "textcat":
- pipe_cfg = {
- "exclusive_classes": not textcat_multilabel,
- "architecture": textcat_arch,
- "positive_label": textcat_positive_label,
- }
+ config_loc = default_dir / "textcat_defaults.cfg"
+ else:
+ raise ValueError(f"Component {pipe} currently not supported.")
+ pipe_cfg = util.load_config(config_loc, create_objects=False)
+ if vectors:
+ pretrained_config = {'@architectures': 'spacy.VocabVectors.v1', 'name': vectors}
+ pipe_cfg["model"]["tok2vec"]["pretrained_vectors"] = pretrained_config
+
+ if pipe == "parser":
+ pipe_cfg["learn_tokens"] = learn_tokens
+ elif pipe == "textcat":
+ pipe_cfg["exclusive_classes"] = not textcat_multilabel
+ pipe_cfg["architecture"] = textcat_arch
+ pipe_cfg["positive_label"] = textcat_positive_label
+
if pipe not in nlp.pipe_names:
msg.text(f"Adding component to base model '{pipe}'")
nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))
@@ -181,26 +202,42 @@ def train(
msg.text(f"Starting with blank model '{lang}'")
lang_cls = util.get_lang_class(lang)
nlp = lang_cls()
+
+ if vectors:
+ msg.text(f"Loading vectors from model '{vectors}'")
+
for pipe in pipeline:
+ # First, create the model.
+ # This is a bit of a hack after the refactor to get the vectors into a
+ # default config; prefer train-from-config instead.
if pipe == "parser":
- pipe_cfg = {"learn_tokens": learn_tokens}
+ config_loc = default_dir / "parser_defaults.cfg"
+ elif pipe == "tagger":
+ config_loc = default_dir / "tagger_defaults.cfg"
+ elif pipe == "ner":
+ config_loc = default_dir / "ner_defaults.cfg"
elif pipe == "textcat":
- pipe_cfg = {
- "exclusive_classes": not textcat_multilabel,
- "architecture": textcat_arch,
- "positive_label": textcat_positive_label,
- }
+ config_loc = default_dir / "textcat_defaults.cfg"
else:
- pipe_cfg = {}
- nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))
+ raise ValueError(f"Component {pipe} currently not supported.")
+ pipe_cfg = util.load_config(config_loc, create_objects=False)
+ if vectors:
+ pretrained_config = {'@architectures': 'spacy.VocabVectors.v1', 'name': vectors}
+ pipe_cfg["model"]["tok2vec"]["pretrained_vectors"] = pretrained_config
+
+ if pipe == "parser":
+ pipe_cfg["learn_tokens"] = learn_tokens
+ elif pipe == "textcat":
+ pipe_cfg["exclusive_classes"] = not textcat_multilabel
+ pipe_cfg["architecture"] = textcat_arch
+ pipe_cfg["positive_label"] = textcat_positive_label
+
+ pipe = nlp.create_pipe(pipe, config=pipe_cfg)
+ nlp.add_pipe(pipe)
# Update tag map with provided mapping
nlp.vocab.morphology.tag_map.update(tag_map)
- if vectors:
- msg.text(f"Loading vector from model '{vectors}'")
- _load_vectors(nlp, vectors)
-
# Multitask objectives
multitask_options = [("parser", parser_multitasks), ("ner", entity_multitasks)]
for pipe_name, multitasks in multitask_options:
@@ -228,7 +265,7 @@ def train(
optimizer = nlp.begin_training(lambda: corpus.train_examples, **cfg)
nlp._optimizer = None
- # Load in pretrained weights
+ # Load in pretrained weights (TODO: this may be broken in the config rewrite)
if init_tok2vec is not None:
components = _load_pretrained_tok2vec(nlp, init_tok2vec)
msg.text(f"Loaded pretrained tok2vec for: {components}")
@@ -531,7 +568,7 @@ def _create_progress_bar(total):
def _load_vectors(nlp, vectors):
- util.load_model(vectors, vocab=nlp.vocab)
+ loaded_model = util.load_model(vectors, vocab=nlp.vocab)
for lex in nlp.vocab:
values = {}
for attr, func in nlp.vocab.lex_attr_getters.items():
@@ -541,6 +578,7 @@ def _load_vectors(nlp, vectors):
values[lex.vocab.strings[attr]] = func(lex.orth_)
lex.set_attrs(**values)
lex.is_oov = False
+ return loaded_model
def _load_pretrained_tok2vec(nlp, loc):
@@ -551,8 +589,8 @@ def _load_pretrained_tok2vec(nlp, loc):
weights_data = file_.read()
loaded = []
for name, component in nlp.pipeline:
- if hasattr(component, "model") and hasattr(component.model, "tok2vec"):
- component.tok2vec.from_bytes(weights_data)
+ if hasattr(component, "model") and component.model.has_ref("tok2vec"):
+ component.model.get_ref("tok2vec").from_bytes(weights_data)
loaded.append(name)
return loaded
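
Until `train-from-config` takes over, the classic `train` command now drives each component from a default config shipped under `spacy/ml/models/defaults`, and injects pretrained vectors by rewriting the tok2vec's `pretrained_vectors` block to a `spacy.VocabVectors.v1` reference. A sketch of the same steps outside the CLI, assuming this development branch ("en_core_web_md" is only an illustrative vectors package):

```python
from pathlib import Path
import spacy
from spacy import util

# Load the shipped default config for the NER component as a plain dict.
default_dir = Path(util.__file__).parent / "ml" / "models" / "defaults"
pipe_cfg = util.load_config(default_dir / "ner_defaults.cfg", create_objects=False)

# Point the tok2vec's pretrained vectors at an installed vectors model.
vectors = "en_core_web_md"  # hypothetical; any installed model with vectors
pipe_cfg["model"]["tok2vec"]["pretrained_vectors"] = {
    "@architectures": "spacy.VocabVectors.v1",
    "name": vectors,
}

nlp = spacy.blank("en")
nlp.add_pipe(nlp.create_pipe("ner", config=pipe_cfg))
```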
diff --git a/spacy/cli/train_from_config.py b/spacy/cli/train_from_config.py
index 9150da356..0dba8a962 100644
--- a/spacy/cli/train_from_config.py
+++ b/spacy/cli/train_from_config.py
@@ -1,19 +1,17 @@
from typing import Optional, Dict, List, Union, Sequence
+from pydantic import BaseModel, FilePath, StrictInt
+
import plac
-from wasabi import msg
+import tqdm
from pathlib import Path
+
+from wasabi import msg
import thinc
import thinc.schedules
from thinc.api import Model
-from pydantic import BaseModel, FilePath, StrictInt
-import tqdm
-# TODO: relative imports?
-import spacy
-from spacy.gold import GoldCorpus
-from spacy.pipeline.tok2vec import Tok2VecListener
-from spacy.ml import component_models
-from spacy import util
+from ..gold import GoldCorpus
+from .. import util
registry = util.registry
@@ -57,23 +55,24 @@ factory = "tok2vec"
factory = "ner"
[nlp.pipeline.ner.model]
-@architectures = "transition_based_ner.v1"
+@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 3
hidden_width = 64
maxout_pieces = 3
[nlp.pipeline.ner.model.tok2vec]
-@architectures = "tok2vec_tensors.v1"
+@architectures = "spacy.Tok2VecTensors.v1"
width = ${nlp.pipeline.tok2vec.model:width}
[nlp.pipeline.tok2vec.model]
-@architectures = "hash_embed_cnn.v1"
+@architectures = "spacy.HashEmbedCNN.v1"
pretrained_vectors = ${nlp:vectors}
width = 128
depth = 4
window_size = 1
embed_size = 10000
maxout_pieces = 3
+subword_features = true
"""
@@ -113,65 +112,6 @@ class ConfigSchema(BaseModel):
extra = "allow"
-# Of course, these would normally decorate the functions where they're defined.
-# But for now...
-@registry.architectures.register("hash_embed_cnn.v1")
-def hash_embed_cnn(
- pretrained_vectors, width, depth, embed_size, maxout_pieces, window_size
-):
- return component_models.Tok2Vec(
- width=width,
- embed_size=embed_size,
- pretrained_vectors=pretrained_vectors,
- conv_depth=depth,
- cnn_maxout_pieces=maxout_pieces,
- bilstm_depth=0,
- window_size=window_size,
- )
-
-
-@registry.architectures.register("hash_embed_bilstm.v1")
-def hash_embed_bilstm_v1(pretrained_vectors, width, depth, embed_size):
- return component_models.Tok2Vec(
- width=width,
- embed_size=embed_size,
- pretrained_vectors=pretrained_vectors,
- bilstm_depth=depth,
- conv_depth=0,
- cnn_maxout_pieces=0,
- )
-
-
-@registry.architectures.register("tagger_model.v1")
-def build_tagger_model_v1(tok2vec):
- return component_models.build_tagger_model(nr_class=None, tok2vec=tok2vec)
-
-
-@registry.architectures.register("transition_based_parser.v1")
-def create_tb_parser_model(
- tok2vec: Model,
- nr_feature_tokens: StrictInt = 3,
- hidden_width: StrictInt = 64,
- maxout_pieces: StrictInt = 3,
-):
- from thinc.api import Linear, chain, list2array, use_ops, zero_init
- from spacy.ml._layers import PrecomputableAffine
- from spacy.syntax._parser_model import ParserModel
-
- token_vector_width = tok2vec.get_dim("nO")
- tok2vec = chain(tok2vec, list2array())
- tok2vec.set_dim("nO", token_vector_width)
-
- lower = PrecomputableAffine(
- hidden_width, nF=nr_feature_tokens, nI=tok2vec.get_dim("nO"), nP=maxout_pieces
- )
- lower.set_dim("nP", maxout_pieces)
- with use_ops("numpy"):
- # Initialize weights at zero, as it's a classification layer.
- upper = Linear(init_W=zero_init)
- return ParserModel(tok2vec, lower, upper)
-
-
@plac.annotations(
# fmt: off
train_path=("Location of JSON-formatted training data", "positional", None, Path),
@@ -224,23 +164,25 @@ def train_from_config(
config_path, data_paths, raw_text=None, meta_path=None, output_path=None,
):
msg.info(f"Loading config from: {config_path}")
- config = util.load_from_config(config_path, create_objects=True)
+ config = util.load_config(config_path, create_objects=True)
use_gpu = config["training"]["use_gpu"]
if use_gpu >= 0:
msg.info("Using GPU")
else:
msg.info("Using CPU")
msg.info("Creating nlp from config")
- nlp = create_nlp_from_config(**config["nlp"])
+ nlp_config = util.load_config(config_path, create_objects=False)["nlp"]
+ nlp = util.load_model_from_config(nlp_config)
optimizer = config["optimizer"]
- limit = config["training"]["limit"]
+ training = config["training"]
+ limit = training["limit"]
msg.info("Loading training corpus")
corpus = GoldCorpus(data_paths["train"], data_paths["dev"], limit=limit)
msg.info("Initializing the nlp pipeline")
nlp.begin_training(lambda: corpus.train_examples, device=use_gpu)
- train_batches = create_train_batches(nlp, corpus, config["training"])
- evaluate = create_evaluation_callback(nlp, optimizer, corpus, config["training"])
+ train_batches = create_train_batches(nlp, corpus, training)
+ evaluate = create_evaluation_callback(nlp, optimizer, corpus, training)
# Create iterator, which yields out info after each optimization step.
msg.info("Start training")
@@ -249,16 +191,16 @@ def train_from_config(
optimizer,
train_batches,
evaluate,
- config["training"]["dropout"],
- config["training"]["patience"],
- config["training"]["eval_frequency"],
+ training["dropout"],
+ training["patience"],
+ training["eval_frequency"],
)
msg.info(f"Training. Initial learn rate: {optimizer.learn_rate}")
- print_row = setup_printer(config)
+ print_row = setup_printer(training, nlp)
try:
- progress = tqdm.tqdm(total=config["training"]["eval_frequency"], leave=False)
+ progress = tqdm.tqdm(total=training["eval_frequency"], leave=False)
for batch, info, is_best_checkpoint in training_step_iterator:
progress.update(1)
if is_best_checkpoint is not None:
@@ -266,9 +208,7 @@ def train_from_config(
print_row(info)
if is_best_checkpoint and output_path is not None:
nlp.to_disk(output_path)
- progress = tqdm.tqdm(
- total=config["training"]["eval_frequency"], leave=False
- )
+ progress = tqdm.tqdm(total=training["eval_frequency"], leave=False)
finally:
if output_path is not None:
with nlp.use_params(optimizer.averages):
@@ -280,18 +220,6 @@ def train_from_config(
# msg.good("Created best model", best_model_path)
-def create_nlp_from_config(lang, vectors, pipeline):
- lang_class = spacy.util.get_lang_class(lang)
- nlp = lang_class()
- if vectors is not None:
- spacy.cli.train._load_vectors(nlp, vectors)
- for name, component_cfg in pipeline.items():
- factory = component_cfg.pop("factory")
- component = nlp.create_pipe(factory, config=component_cfg)
- nlp.add_pipe(component, name=name)
- return nlp
-
-
def create_train_batches(nlp, corpus, cfg):
while True:
train_examples = corpus.train_dataset(
@@ -405,10 +333,10 @@ def subdivide_batch(batch):
return [batch]
-def setup_printer(config):
- score_cols = config["training"]["scores"]
+def setup_printer(training, nlp):
+ score_cols = training["scores"]
score_widths = [max(len(col), 6) for col in score_cols]
- loss_cols = [f"Loss {pipe}" for pipe in config["nlp"]["pipeline"]]
+ loss_cols = [f"Loss {pipe}" for pipe in nlp.pipe_names]
loss_widths = [max(len(col), 8) for col in loss_cols]
table_header = ["#"] + loss_cols + score_cols + ["Score"]
table_header = [col.upper() for col in table_header]
@@ -420,20 +348,13 @@ def setup_printer(config):
def print_row(info):
losses = [
- "{0:.2f}".format(info["losses"].get(col, 0.0))
- for col in config["nlp"]["pipeline"]
+ "{0:.2f}".format(info["losses"].get(pipe_name, 0.0))
+ for pipe_name in nlp.pipe_names
]
scores = [
- "{0:.2f}".format(info["other_scores"].get(col, 0.0))
- for col in config["training"]["scores"]
+ "{0:.2f}".format(info["other_scores"].get(col, 0.0)) for col in score_cols
]
data = [info["step"]] + losses + scores + ["{0:.2f}".format(info["score"])]
msg.row(data, widths=table_widths, aligns=table_aligns)
return print_row
-
-
-@registry.architectures.register("tok2vec_tensors.v1")
-def tok2vec_tensors_v1(width):
- tok2vec = Tok2VecListener("tok2vec", width=width)
- return tok2vec
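
After this cleanup, `train_from_config` no longer defines ad-hoc architectures or builds the pipeline by hand: the config is loaded twice, once resolved into objects for the optimizer and training settings, and once as a plain dict so the `nlp` section can be handed to `util.load_model_from_config`. A condensed sketch, assuming this development branch (`config.cfg` is an illustrative path):

```python
from spacy import util

config_path = "config.cfg"
config = util.load_config(config_path, create_objects=True)             # resolved objects
nlp_config = util.load_config(config_path, create_objects=False)["nlp"]  # raw dict

nlp = util.load_model_from_config(nlp_config)  # builds the pipeline from the config
optimizer = config["optimizer"]
training = config["training"]
print(training["limit"], training["dropout"], training["eval_frequency"])
```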
diff --git a/spacy/errors.py b/spacy/errors.py
index 7a4953cce..6afbfc3c6 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -106,6 +106,12 @@ class Warnings(object):
"Provide features as a dict {{\"Field1\": \"Value1,Value2\"}} or "
"string \"Field1=Value1,Value2|Field2=Value3\".")
+ # TODO: fix numbering after merging develop into master
+ W098 = ("No Model config was provided to create the '{name}' component, "
+ "so a default configuration was used.")
+ W099 = ("Expected 'dict' type for the 'model' argument of pipe '{pipe}', "
+ "but got '{type}' instead, so ignoring it.")
+
@add_codes
class Errors(object):
@@ -227,7 +233,7 @@ class Errors(object):
E050 = ("Can't find model '{name}'. It doesn't seem to be a Python "
"package or a valid path to a data directory.")
E052 = ("Can't find model directory: {path}")
- E053 = ("Could not read meta.json from {path}")
+ E053 = ("Could not read {name} from {path}")
E054 = ("No valid '{setting}' setting found in model meta.json.")
E055 = ("Invalid ORTH value in exception:\nKey: {key}\nOrths: {orths}")
E056 = ("Invalid tokenizer exception: ORTH values combined don't match "
@@ -345,8 +351,8 @@ class Errors(object):
E108 = ("As of spaCy v2.1, the pipe name `sbd` has been deprecated "
"in favor of the pipe name `sentencizer`, which does the same "
"thing. For example, use `nlp.create_pipeline('sentencizer')`")
- E109 = ("Model for component '{name}' not initialized. Did you forget to "
- "load a model, or forget to call begin_training()?")
+ E109 = ("Component '{name}' could not be run. Did you forget to "
+ "call begin_training()?")
E110 = ("Invalid displaCy render wrapper. Expected callable, got: {obj}")
E111 = ("Pickling a token is not supported, because tokens are only views "
"of the parent Doc and can't exist on their own. A pickled token "
@@ -532,6 +538,9 @@ class Errors(object):
"make sure the gold EL data refers to valid results of the "
"named entity recognizer in the `nlp` pipeline.")
# TODO: fix numbering after merging develop into master
+ E993 = ("The config for 'nlp' should include either a key 'name' to "
+ "refer to an existing model by name or path, or a key 'lang' "
+ "to create a new blank model.")
E996 = ("Could not parse {file}: {msg}")
E997 = ("Tokenizer special cases are not allowed to modify the text. "
"This would map '{chunk}' to '{orth}' given token attributes "
diff --git a/spacy/language.py b/spacy/language.py
index 1c6014cec..83f8c9d21 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -4,7 +4,9 @@ import weakref
import functools
from contextlib import contextmanager
from copy import copy, deepcopy
-from thinc.api import get_current_ops
+from pathlib import Path
+
+from thinc.api import get_current_ops, Config
import srsly
import multiprocessing as mp
from itertools import chain, cycle
@@ -16,7 +18,7 @@ from .lookups import Lookups
from .analysis import analyze_pipes, analyze_all_pipes, validate_attrs
from .gold import Example
from .scorer import Scorer
-from .util import link_vectors_to_models, create_default_optimizer
+from .util import link_vectors_to_models, create_default_optimizer, registry
from .attrs import IS_STOP, LANG
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .lang.punctuation import TOKENIZER_INFIXES
@@ -24,7 +26,7 @@ from .lang.tokenizer_exceptions import TOKEN_MATCH
from .lang.tag_map import TAG_MAP
from .tokens import Doc
from .lang.lex_attrs import LEX_ATTRS, is_stop
-from .errors import Errors, Warnings, deprecation_warning
+from .errors import Errors, Warnings, deprecation_warning, user_warning
from . import util
from . import about
@@ -128,7 +130,7 @@ class Language(object):
factories = {"tokenizer": lambda nlp: nlp.Defaults.create_tokenizer(nlp)}
def __init__(
- self, vocab=True, make_doc=True, max_length=10 ** 6, meta={}, **kwargs
+ self, vocab=True, make_doc=True, max_length=10 ** 6, meta={}, config=None, **kwargs
):
"""Initialise a Language object.
@@ -138,6 +140,7 @@ class Language(object):
object. Usually a `Tokenizer`.
meta (dict): Custom meta data for the Language class. Is written to by
models to add model meta data.
+ config (Config): Configuration data for creating the pipeline components.
max_length (int) :
Maximum number of characters in a single text. The current v2 models
may run out of memory on extremely long texts, due to large internal
@@ -152,6 +155,9 @@ class Language(object):
user_factories = util.registry.factories.get_all()
self.factories.update(user_factories)
self._meta = dict(meta)
+ self._config = config
+ if not self._config:
+ self._config = Config()
self._path = None
if vocab is True:
factory = self.Defaults.create_vocab
@@ -170,6 +176,21 @@ class Language(object):
self.max_length = max_length
self._optimizer = None
+ from .ml.models.defaults import (
+ default_tagger_config, default_parser_config, default_ner_config,
+ default_textcat_config, default_nel_config, default_morphologizer_config,
+ default_sentrec_config, default_tensorizer_config, default_tok2vec_config,
+ )
+
+ self.defaults = {"tagger": default_tagger_config(),
+ "parser": default_parser_config(),
+ "ner": default_ner_config(),
+ "textcat": default_textcat_config(),
+ "entity_linker": default_nel_config(),
+ "morphologizer": default_morphologizer_config(),
+ "sentrec": default_sentrec_config(),
+ "tensorizer": default_tensorizer_config(),
+ "tok2vec": default_tok2vec_config(),
+ }
+
@property
def path(self):
return self._path
@@ -203,6 +224,10 @@ class Language(object):
def meta(self, value):
self._meta = value
+ @property
+ def config(self):
+ return self._config
+
# Conveniences to access pipeline components
# Shouldn't be used anymore!
@property
@@ -293,7 +318,24 @@ class Language(object):
else:
raise KeyError(Errors.E002.format(name=name))
factory = self.factories[name]
- return factory(self, **config)
+ default_config = self.defaults.get(name, None)
+
+ # transform the model's config to an actual Model
+ model_cfg = None
+ if "model" in config:
+ model_cfg = config["model"]
+ if not isinstance(model_cfg, dict):
+ user_warning(Warnings.W099.format(type=type(model_cfg), pipe=name))
+ model_cfg = None
+ del config["model"]
+ if model_cfg is None and default_config is not None:
+ user_warning(Warnings.W098.format(name=name))
+ model_cfg = default_config["model"]
+ model = None
+ if model_cfg is not None:
+ self.config[name] = {"model": model_cfg}
+ model = registry.make_from_config({"model": model_cfg}, validate=True)["model"]
+ return factory(self, model, **config)
def add_pipe(
self, component, name=None, before=None, after=None, first=None, last=None
@@ -430,7 +472,10 @@ class Language(object):
continue
if not hasattr(proc, "__call__"):
raise ValueError(Errors.E003.format(component=type(proc), name=name))
- doc = proc(doc, **component_cfg.get(name, {}))
+ try:
+ doc = proc(doc, **component_cfg.get(name, {}))
+ except KeyError:
+ raise ValueError(Errors.E109.format(name=name))
if doc is None:
raise ValueError(Errors.E005.format(name=name))
return doc
@@ -578,9 +623,6 @@ class Language(object):
ops = get_current_ops()
self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
link_vectors_to_models(self.vocab)
- if self.vocab.vectors.data.shape[1]:
- cfg["pretrained_vectors"] = self.vocab.vectors.name
- cfg["pretrained_dims"] = self.vocab.vectors.data.shape[1]
if sgd is None:
sgd = create_default_optimizer()
self._optimizer = sgd
@@ -611,8 +653,6 @@ class Language(object):
if self.vocab.vectors.data.shape[1] >= 1:
self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
link_vectors_to_models(self.vocab)
- if self.vocab.vectors.data.shape[1]:
- cfg["pretrained_vectors"] = self.vocab.vectors
if sgd is None:
sgd = create_default_optimizer()
self._optimizer = sgd
@@ -868,6 +908,7 @@ class Language(object):
serializers["meta.json"] = lambda p: p.open("w").write(
srsly.json_dumps(self.meta)
)
+ serializers["config.cfg"] = lambda p: self.config.to_disk(p)
for name, proc in self.pipeline:
if not hasattr(proc, "name"):
continue
@@ -895,6 +936,8 @@ class Language(object):
exclude = disable
path = util.ensure_path(path)
deserializers = {}
+ if Path(path / "config.cfg").exists():
+ deserializers["config.cfg"] = lambda p: self.config.from_disk(p)
deserializers["meta.json"] = lambda p: self.meta.update(srsly.read_json(p))
deserializers["vocab"] = lambda p: self.vocab.from_disk(
p
@@ -933,6 +976,7 @@ class Language(object):
serializers["vocab"] = lambda: self.vocab.to_bytes()
serializers["tokenizer"] = lambda: self.tokenizer.to_bytes(exclude=["vocab"])
serializers["meta.json"] = lambda: srsly.json_dumps(self.meta)
+ serializers["config.cfg"] = lambda: self.config.to_bytes()
for name, proc in self.pipeline:
if name in exclude:
continue
@@ -955,6 +999,7 @@ class Language(object):
deprecation_warning(Warnings.W014)
exclude = disable
deserializers = {}
+ deserializers["config.cfg"] = lambda b: self.config.from_bytes(b)
deserializers["meta.json"] = lambda b: self.meta.update(srsly.json_loads(b))
deserializers["vocab"] = lambda b: self.vocab.from_bytes(
b
@@ -981,8 +1026,8 @@ class component(object):
and class components and will automatically register components in the
Language.factories. If the component is a class and needs access to the
nlp object or config parameters, it can expose a from_nlp classmethod
- that takes the nlp object and **cfg arguments and returns the initialized
- component.
+ that takes the nlp & model objects and **cfg arguments, and returns the
+ initialized component.
"""
# NB: This decorator needs to live here, because it needs to write to
@@ -1011,9 +1056,9 @@ class component(object):
obj.requires = self.requires
obj.retokenizes = self.retokenizes
- def factory(nlp, **cfg):
+ def factory(nlp, model, **cfg):
if hasattr(obj, "from_nlp"):
- return obj.from_nlp(nlp, **cfg)
+ return obj.from_nlp(nlp, model, **cfg)
elif isinstance(obj, type):
return obj()
return obj
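
Taken together, the `Language` changes mean a component always receives a concrete `Model` at creation time: an explicit `model` block in the pipe config is resolved through the architectures registry, and if none is given the built-in default config is used (raising W098) and recorded on `nlp.config`, which is now serialized as `config.cfg`. A minimal sketch, assuming this development branch:

```python
import spacy

nlp = spacy.blank("en")
# No "model" key is passed, so the default NER model config is used
# (a W098 warning is emitted) and recorded under nlp.config["ner"].
ner = nlp.create_pipe("ner")
nlp.add_pipe(ner)

print(nlp.config["ner"]["model"]["@architectures"])  # spacy.TransitionBasedParser.v1
data = nlp.to_bytes()  # the serialized pipeline now includes a "config.cfg" entry
```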
diff --git a/spacy/ml/component_models.py b/spacy/ml/component_models.py
deleted file mode 100644
index 8c694f950..000000000
--- a/spacy/ml/component_models.py
+++ /dev/null
@@ -1,227 +0,0 @@
-from spacy import util
-from spacy.ml.extract_ngrams import extract_ngrams
-
-from ..attrs import ID, ORTH, NORM, PREFIX, SUFFIX, SHAPE
-from ..errors import Errors
-from ._character_embed import CharacterEmbed
-
-from thinc.api import Model, Maxout, Linear, residual, reduce_mean, list2ragged
-from thinc.api import PyTorchLSTM, add, MultiSoftmax, HashEmbed, StaticVectors
-from thinc.api import expand_window, FeatureExtractor, SparseLinear, chain
-from thinc.api import clone, concatenate, with_array, Softmax, Logistic, uniqued
-from thinc.api import zero_init
-
-
-def build_text_classifier(arch, config):
- if arch == "cnn":
- return build_simple_cnn_text_classifier(**config)
- elif arch == "bow":
- return build_bow_text_classifier(**config)
- else:
- raise ValueError("Unexpected textcat arch")
-
-
-def build_simple_cnn_text_classifier(tok2vec, nr_class, exclusive_classes, **cfg):
- """
- Build a simple CNN text classifier, given a token-to-vector model as inputs.
- If exclusive_classes=True, a softmax non-linearity is applied, so that the
- outputs sum to 1. If exclusive_classes=False, a logistic non-linearity
- is applied instead, so that outputs are in the range [0, 1].
- """
- with Model.define_operators({">>": chain}):
- if exclusive_classes:
- output_layer = Softmax(nO=nr_class, nI=tok2vec.get_dim("nO"))
- else:
- # TODO: experiment with init_w=zero_init
- output_layer = Linear(nO=nr_class, nI=tok2vec.get_dim("nO")) >> Logistic()
- model = tok2vec >> list2ragged() >> reduce_mean() >> output_layer
- model.set_ref("tok2vec", tok2vec)
- model.set_dim("nO", nr_class)
- return model
-
-
-def build_bow_text_classifier(
- nr_class, exclusive_classes, ngram_size=1, no_output_layer=False, **cfg
-):
- with Model.define_operators({">>": chain}):
- model = extract_ngrams(ngram_size, attr=ORTH) >> SparseLinear(nr_class)
- model.to_cpu()
- if not no_output_layer:
- output_layer = (
- Softmax(nO=nr_class) if exclusive_classes else Logistic(nO=nr_class)
- )
- output_layer.to_cpu()
- model = model >> output_layer
- model.set_dim("nO", nr_class)
- return model
-
-
-def build_nel_encoder(embed_width, hidden_width, ner_types, **cfg):
- if "entity_width" not in cfg:
- raise ValueError(Errors.E144.format(param="entity_width"))
-
- conv_depth = cfg.get("conv_depth", 2)
- cnn_maxout_pieces = cfg.get("cnn_maxout_pieces", 3)
- pretrained_vectors = cfg.get("pretrained_vectors", None)
- context_width = cfg.get("entity_width")
-
- with Model.define_operators({">>": chain, "**": clone}):
- nel_tok2vec = Tok2Vec(
- width=hidden_width,
- embed_size=embed_width,
- pretrained_vectors=pretrained_vectors,
- cnn_maxout_pieces=cnn_maxout_pieces,
- subword_features=True,
- conv_depth=conv_depth,
- bilstm_depth=0,
- )
-
- model = (
- nel_tok2vec
- >> list2ragged()
- >> reduce_mean()
- >> residual(Maxout(nO=hidden_width, nI=hidden_width, nP=2, dropout=0.0))
- >> Linear(nO=context_width, nI=hidden_width)
- )
- model.initialize()
-
- model.set_ref("tok2vec", nel_tok2vec)
- model.set_dim("nO", context_width)
- return model
-
-
-def masked_language_model(*args, **kwargs):
- raise NotImplementedError
-
-
-def build_tagger_model(nr_class, tok2vec):
- token_vector_width = tok2vec.get_dim("nO")
- # TODO: glorot_uniform_init seems to work a bit better than zero_init here?!
- softmax = with_array(Softmax(nO=nr_class, nI=token_vector_width, init_W=zero_init))
- model = chain(tok2vec, softmax)
- model.set_ref("tok2vec", tok2vec)
- model.set_ref("softmax", softmax)
- return model
-
-
-def build_morphologizer_model(class_nums, **cfg):
- embed_size = util.env_opt("embed_size", 7000)
- if "token_vector_width" in cfg:
- token_vector_width = cfg["token_vector_width"]
- else:
- token_vector_width = util.env_opt("token_vector_width", 128)
- pretrained_vectors = cfg.get("pretrained_vectors")
- char_embed = cfg.get("char_embed", True)
- with Model.define_operators({">>": chain, "+": add, "**": clone}):
- if "tok2vec" in cfg:
- tok2vec = cfg["tok2vec"]
- else:
- tok2vec = Tok2Vec(
- token_vector_width,
- embed_size,
- char_embed=char_embed,
- pretrained_vectors=pretrained_vectors,
- )
- softmax = with_array(MultiSoftmax(nOs=class_nums, nI=token_vector_width))
- model = tok2vec >> softmax
- model.set_ref("tok2vec", tok2vec)
- model.set_ref("softmax", softmax)
- return model
-
-
-def Tok2Vec(
- width,
- embed_size,
- pretrained_vectors=None,
- window_size=1,
- cnn_maxout_pieces=3,
- subword_features=True,
- char_embed=False,
- conv_depth=4,
- bilstm_depth=0,
-):
- if char_embed:
- subword_features = False
- cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
- with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
- norm = HashEmbed(nO=width, nV=embed_size, column=cols.index(NORM), dropout=0.0)
- if subword_features:
- prefix = HashEmbed(
- nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=0.0
- )
- suffix = HashEmbed(
- nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=0.0
- )
- shape = HashEmbed(
- nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=0.0
- )
- else:
- prefix, suffix, shape = (None, None, None)
- if pretrained_vectors is not None:
- glove = StaticVectors(
- vectors=pretrained_vectors, nO=width, column=cols.index(ID), dropout=0.0
- )
-
- if subword_features:
- embed = uniqued(
- (glove | norm | prefix | suffix | shape)
- >> Maxout(
- nO=width, nI=width * 5, nP=3, dropout=0.0, normalize=True
- ),
- column=cols.index(ORTH),
- )
- else:
- embed = uniqued(
- (glove | norm)
- >> Maxout(
- nO=width, nI=width * 2, nP=3, dropout=0.0, normalize=True
- ),
- column=cols.index(ORTH),
- )
- elif subword_features:
- embed = uniqued(
- concatenate(norm, prefix, suffix, shape)
- >> Maxout(nO=width, nI=width * 4, nP=3, dropout=0.0, normalize=True),
- column=cols.index(ORTH),
- )
- elif char_embed:
- embed = CharacterEmbed(nM=64, nC=8) | FeatureExtractor(cols) >> with_array(
- norm
- )
- reduce_dimensions = Maxout(
- nO=width,
- nI=64 * 8 + width,
- nP=cnn_maxout_pieces,
- dropout=0.0,
- normalize=True,
- )
- else:
- embed = norm
-
- convolution = residual(
- expand_window(window_size=window_size)
- >> Maxout(
- nO=width,
- nI=width * 3,
- nP=cnn_maxout_pieces,
- dropout=0.0,
- normalize=True,
- )
- )
- if char_embed:
- tok2vec = embed >> with_array(
- reduce_dimensions >> convolution ** conv_depth, pad=conv_depth
- )
- else:
- tok2vec = FeatureExtractor(cols) >> with_array(
- embed >> convolution ** conv_depth, pad=conv_depth
- )
-
- if bilstm_depth >= 1:
- tok2vec = tok2vec >> PyTorchLSTM(
- nO=width, nI=width, depth=bilstm_depth, bi=True
- )
- # Work around thinc API limitations :(. TODO: Revise in Thinc 7
- tok2vec.set_dim("nO", width)
- tok2vec.set_ref("embed", embed)
- return tok2vec
diff --git a/spacy/ml/models/__init__.py b/spacy/ml/models/__init__.py
new file mode 100644
index 000000000..56696d581
--- /dev/null
+++ b/spacy/ml/models/__init__.py
@@ -0,0 +1,6 @@
+from .entity_linker import *
+from .parser import *
+from .tagger import *
+from .tensorizer import *
+from .textcat import *
+from .tok2vec import *
diff --git a/spacy/ml/models/defaults/__init__.py b/spacy/ml/models/defaults/__init__.py
new file mode 100644
index 000000000..9af4da87d
--- /dev/null
+++ b/spacy/ml/models/defaults/__init__.py
@@ -0,0 +1,93 @@
+from pathlib import Path
+
+from .... import util
+
+
+def default_nel_config():
+ loc = Path(__file__).parent / "entity_linker_defaults.cfg"
+ return util.load_config(loc, create_objects=False)
+
+
+def default_nel():
+ loc = Path(__file__).parent / "entity_linker_defaults.cfg"
+ return util.load_config(loc, create_objects=True)["model"]
+
+
+def default_morphologizer_config():
+ loc = Path(__file__).parent / "morphologizer_defaults.cfg"
+ return util.load_config(loc, create_objects=False)
+
+
+def default_morphologizer():
+ loc = Path(__file__).parent / "morphologizer_defaults.cfg"
+ return util.load_config(loc, create_objects=True)["model"]
+
+
+def default_parser_config():
+ loc = Path(__file__).parent / "parser_defaults.cfg"
+ return util.load_config(loc, create_objects=False)
+
+
+def default_parser():
+ loc = Path(__file__).parent / "parser_defaults.cfg"
+ return util.load_config(loc, create_objects=True)["model"]
+
+
+def default_ner_config():
+ loc = Path(__file__).parent / "ner_defaults.cfg"
+ return util.load_config(loc, create_objects=False)
+
+
+def default_ner():
+ loc = Path(__file__).parent / "ner_defaults.cfg"
+ return util.load_config(loc, create_objects=True)["model"]
+
+
+def default_sentrec_config():
+ loc = Path(__file__).parent / "sentrec_defaults.cfg"
+ return util.load_config(loc, create_objects=False)
+
+
+def default_sentrec():
+ loc = Path(__file__).parent / "sentrec_defaults.cfg"
+ return util.load_config(loc, create_objects=True)["model"]
+
+
+def default_tagger_config():
+ loc = Path(__file__).parent / "tagger_defaults.cfg"
+ return util.load_config(loc, create_objects=False)
+
+
+def default_tagger():
+ loc = Path(__file__).parent / "tagger_defaults.cfg"
+ return util.load_config(loc, create_objects=True)["model"]
+
+
+def default_tensorizer_config():
+ loc = Path(__file__).parent / "tensorizer_defaults.cfg"
+ return util.load_config(loc, create_objects=False)
+
+
+def default_tensorizer():
+ loc = Path(__file__).parent / "tensorizer_defaults.cfg"
+ return util.load_config(loc, create_objects=True)["model"]
+
+
+def default_textcat_config():
+ loc = Path(__file__).parent / "textcat_defaults.cfg"
+ return util.load_config(loc, create_objects=False)
+
+
+def default_textcat():
+ loc = Path(__file__).parent / "textcat_defaults.cfg"
+ return util.load_config(loc, create_objects=True)["model"]
+
+
+def default_tok2vec_config():
+ loc = Path(__file__).parent / "tok2vec_defaults.cfg"
+ return util.load_config(loc, create_objects=False)
+
+
+def default_tok2vec():
+ loc = Path(__file__).parent / "tok2vec_defaults.cfg"
+ return util.load_config(loc, create_objects=True)["model"]
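
Each component therefore gets two helpers: `default_<name>_config()` returns the raw config as a nested dict, and `default_<name>()` resolves it into a ready-to-use Thinc model. For example, assuming this development branch:

```python
from spacy.ml.models.defaults import default_ner_config, default_ner

cfg = default_ner_config()             # plain nested dict, nothing resolved
print(cfg["model"]["@architectures"])  # "spacy.TransitionBasedParser.v1"
print(cfg["model"]["hidden_width"])    # 64

model = default_ner()                  # the fully resolved thinc Model
```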
diff --git a/spacy/ml/models/defaults/entity_linker_defaults.cfg b/spacy/ml/models/defaults/entity_linker_defaults.cfg
new file mode 100644
index 000000000..6a591ec3e
--- /dev/null
+++ b/spacy/ml/models/defaults/entity_linker_defaults.cfg
@@ -0,0 +1,12 @@
+[model]
+@architectures = "spacy.EntityLinker.v1"
+
+[model.tok2vec]
+@architectures = "spacy.HashEmbedCNN.v1"
+pretrained_vectors = null
+width = 96
+depth = 2
+embed_size = 300
+window_size = 1
+maxout_pieces = 3
+subword_features = true
diff --git a/spacy/ml/models/defaults/morphologizer_defaults.cfg b/spacy/ml/models/defaults/morphologizer_defaults.cfg
new file mode 100644
index 000000000..80e776c4f
--- /dev/null
+++ b/spacy/ml/models/defaults/morphologizer_defaults.cfg
@@ -0,0 +1,14 @@
+[model]
+@architectures = "spacy.Tagger.v1"
+
+[model.tok2vec]
+@architectures = "spacy.HashCharEmbedCNN.v1"
+pretrained_vectors = null
+width = 128
+depth = 4
+embed_size = 7000
+window_size = 1
+maxout_pieces = 3
+subword_features = true
+nM = 64
+nC = 8
diff --git a/spacy/ml/models/defaults/ner_defaults.cfg b/spacy/ml/models/defaults/ner_defaults.cfg
new file mode 100644
index 000000000..db2c131f5
--- /dev/null
+++ b/spacy/ml/models/defaults/ner_defaults.cfg
@@ -0,0 +1,15 @@
+[model]
+@architectures = "spacy.TransitionBasedParser.v1"
+nr_feature_tokens = 6
+hidden_width = 64
+maxout_pieces = 2
+
+[model.tok2vec]
+@architectures = "spacy.HashEmbedCNN.v1"
+pretrained_vectors = null
+width = 96
+depth = 4
+embed_size = 2000
+window_size = 1
+maxout_pieces = 3
+subword_features = true
diff --git a/spacy/ml/models/defaults/parser_defaults.cfg b/spacy/ml/models/defaults/parser_defaults.cfg
new file mode 100644
index 000000000..9cbb6eadb
--- /dev/null
+++ b/spacy/ml/models/defaults/parser_defaults.cfg
@@ -0,0 +1,15 @@
+[model]
+@architectures = "spacy.TransitionBasedParser.v1"
+nr_feature_tokens = 8
+hidden_width = 64
+maxout_pieces = 2
+
+[model.tok2vec]
+@architectures = "spacy.HashEmbedCNN.v1"
+pretrained_vectors = null
+width = 96
+depth = 4
+embed_size = 2000
+window_size = 1
+maxout_pieces = 3
+subword_features = true
diff --git a/spacy/ml/models/defaults/sentrec_defaults.cfg b/spacy/ml/models/defaults/sentrec_defaults.cfg
new file mode 100644
index 000000000..a039a4533
--- /dev/null
+++ b/spacy/ml/models/defaults/sentrec_defaults.cfg
@@ -0,0 +1,14 @@
+[model]
+@architectures = "spacy.Tagger.v1"
+
+[model.tok2vec]
+@architectures = "spacy.HashCharEmbedCNN.v1"
+pretrained_vectors = null
+width = 12
+depth = 1
+embed_size = 2000
+window_size = 1
+maxout_pieces = 2
+subword_features = true
+nM = 64
+nC = 8
diff --git a/spacy/ml/models/defaults/tagger_defaults.cfg b/spacy/ml/models/defaults/tagger_defaults.cfg
new file mode 100644
index 000000000..5aea80a32
--- /dev/null
+++ b/spacy/ml/models/defaults/tagger_defaults.cfg
@@ -0,0 +1,12 @@
+[model]
+@architectures = "spacy.Tagger.v1"
+
+[model.tok2vec]
+@architectures = "spacy.HashEmbedCNN.v1"
+pretrained_vectors = null
+width = 96
+depth = 4
+embed_size = 2000
+window_size = 1
+maxout_pieces = 3
+subword_features = true
diff --git a/spacy/ml/models/defaults/tensorizer_defaults.cfg b/spacy/ml/models/defaults/tensorizer_defaults.cfg
new file mode 100644
index 000000000..81880a109
--- /dev/null
+++ b/spacy/ml/models/defaults/tensorizer_defaults.cfg
@@ -0,0 +1,4 @@
+[model]
+@architectures = "spacy.Tensorizer.v1"
+input_size=96
+output_size=300
diff --git a/spacy/ml/models/defaults/textcat_defaults.cfg b/spacy/ml/models/defaults/textcat_defaults.cfg
new file mode 100644
index 000000000..cea1bfe54
--- /dev/null
+++ b/spacy/ml/models/defaults/textcat_defaults.cfg
@@ -0,0 +1,13 @@
+[model]
+@architectures = "spacy.TextCatCNN.v1"
+exclusive_classes = false
+
+[model.tok2vec]
+@architectures = "spacy.HashEmbedCNN.v1"
+pretrained_vectors = null
+width = 96
+depth = 4
+embed_size = 2000
+window_size = 1
+maxout_pieces = 3
+subword_features = true
diff --git a/spacy/ml/models/defaults/tok2vec_defaults.cfg b/spacy/ml/models/defaults/tok2vec_defaults.cfg
new file mode 100644
index 000000000..9475d4aab
--- /dev/null
+++ b/spacy/ml/models/defaults/tok2vec_defaults.cfg
@@ -0,0 +1,9 @@
+[model]
+@architectures = "spacy.HashEmbedCNN.v1"
+pretrained_vectors = null
+width = 96
+depth = 4
+embed_size = 2000
+window_size = 1
+maxout_pieces = 3
+subword_features = true
diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py
new file mode 100644
index 000000000..0c1762026
--- /dev/null
+++ b/spacy/ml/models/entity_linker.py
@@ -0,0 +1,23 @@
+from pathlib import Path
+
+from thinc.api import chain, clone, list2ragged, reduce_mean, residual
+from thinc.api import Model, Maxout, Linear
+
+from spacy.util import registry
+
+
+@registry.architectures.register("spacy.EntityLinker.v1")
+def build_nel_encoder(tok2vec, nO=None):
+ with Model.define_operators({">>": chain, "**": clone}):
+ token_width = tok2vec.get_dim("nO")
+ output_layer = Linear(nO=nO, nI=token_width)
+ model = (
+ tok2vec
+ >> list2ragged()
+ >> reduce_mean()
+ >> residual(Maxout(nO=token_width, nI=token_width, nP=2, dropout=0.0))
+ >> output_layer
+ )
+ model.set_ref("output_layer", output_layer)
+ model.set_ref("tok2vec", tok2vec)
+ return model
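
The entity linker encoder now only needs a tok2vec layer and the output width; everything else (the old `embed_width`, `conv_depth` and friends) lives in the tok2vec config. A usage sketch, assuming this development branch (`nO=64` is an illustrative entity vector width):

```python
from spacy.ml.models.defaults import default_tok2vec
from spacy.ml.models.entity_linker import build_nel_encoder

tok2vec = default_tok2vec()                # spacy.HashEmbedCNN.v1 with width 96
model = build_nel_encoder(tok2vec, nO=64)  # pools token vectors into entity vectors
```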
diff --git a/spacy/ml/models/multi_task.py b/spacy/ml/models/multi_task.py
new file mode 100644
index 000000000..1c193df82
--- /dev/null
+++ b/spacy/ml/models/multi_task.py
@@ -0,0 +1,29 @@
+from thinc.api import chain, Maxout, LayerNorm, Softmax, Linear, zero_init
+
+
+def build_multi_task_model(n_tags, tok2vec=None, token_vector_width=96):
+ model = chain(
+ tok2vec,
+ Maxout(nO=token_vector_width * 2, nI=token_vector_width, nP=3, dropout=0.0),
+ LayerNorm(token_vector_width * 2),
+ Softmax(nO=n_tags, nI=token_vector_width * 2),
+ )
+ return model
+
+
+def build_cloze_multi_task_model(vocab, tok2vec):
+ output_size = vocab.vectors.data.shape[1]
+ output_layer = chain(
+ Maxout(
+ nO=output_size, nI=tok2vec.get_dim("nO"), nP=3, normalize=True, dropout=0.0
+ ),
+ Linear(nO=output_size, nI=output_size, init_W=zero_init),
+ )
+ model = chain(tok2vec, output_layer)
+ model = build_masked_language_model(vocab, model)
+ return model
+
+
+def build_masked_language_model(*args, **kwargs):
+ # TODO cf https://github.com/explosion/spaCy/blob/2c107f02a4d60bda2440db0aad1a88cbbf4fb52d/spacy/_ml.py#L828
+ raise NotImplementedError
diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py
new file mode 100644
index 000000000..89f303e2a
--- /dev/null
+++ b/spacy/ml/models/parser.py
@@ -0,0 +1,33 @@
+from pydantic import StrictInt
+
+from spacy.util import registry
+from spacy.ml._layers import PrecomputableAffine
+from spacy.syntax._parser_model import ParserModel
+
+from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops
+
+
+@registry.architectures.register("spacy.TransitionBasedParser.v1")
+def build_tb_parser_model(
+ tok2vec: Model,
+ nr_feature_tokens: StrictInt,
+ hidden_width: StrictInt,
+ maxout_pieces: StrictInt,
+ nO=None,
+):
+ token_vector_width = tok2vec.get_dim("nO")
+ tok2vec = chain(tok2vec, list2array())
+ tok2vec.set_dim("nO", token_vector_width)
+
+ lower = PrecomputableAffine(
+ nO=hidden_width,
+ nF=nr_feature_tokens,
+ nI=tok2vec.get_dim("nO"),
+ nP=maxout_pieces,
+ )
+ lower.set_dim("nP", maxout_pieces)
+ with use_ops("numpy"):
+ # Initialize weights at zero, as it's a classification layer.
+ upper = Linear(nO=nO, init_W=zero_init)
+ model = ParserModel(tok2vec, lower, upper)
+ return model
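
The registered parser/NER architecture mirrors the old inline `create_tb_parser_model`, but the feature count, hidden width and maxout pieces now come from the config rather than hard-coded defaults. A sketch using the values from `parser_defaults.cfg`, assuming this development branch:

```python
from spacy.ml.models.defaults import default_tok2vec
from spacy.ml.models.parser import build_tb_parser_model

model = build_tb_parser_model(
    tok2vec=default_tok2vec(),  # spacy.HashEmbedCNN.v1 with width 96
    nr_feature_tokens=8,        # 6 for NER, 8 for the dependency parser
    hidden_width=64,
    maxout_pieces=2,
)
```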
diff --git a/spacy/ml/models/tagger.py b/spacy/ml/models/tagger.py
new file mode 100644
index 000000000..92e8be1b2
--- /dev/null
+++ b/spacy/ml/models/tagger.py
@@ -0,0 +1,16 @@
+from thinc.api import zero_init, with_array, Softmax, chain, Model
+
+from spacy.util import registry
+
+
+@registry.architectures.register("spacy.Tagger.v1")
+def build_tagger_model(tok2vec, nO=None) -> Model:
+ token_vector_width = tok2vec.get_dim("nO")
+ # TODO: glorot_uniform_init seems to work a bit better than zero_init here?!
+ output_layer = Softmax(nO, nI=token_vector_width, init_W=zero_init)
+ softmax = with_array(output_layer)
+ model = chain(tok2vec, softmax)
+ model.set_ref("tok2vec", tok2vec)
+ model.set_ref("softmax", softmax)
+ model.set_ref("output_layer", output_layer)
+ return model
diff --git a/spacy/ml/models/tensorizer.py b/spacy/ml/models/tensorizer.py
new file mode 100644
index 000000000..f66610b64
--- /dev/null
+++ b/spacy/ml/models/tensorizer.py
@@ -0,0 +1,10 @@
+from thinc.api import Linear, zero_init
+
+from ... import util
+from ...util import registry
+
+
+@registry.architectures.register("spacy.Tensorizer.v1")
+def build_tensorizer(input_size, output_size):
+ input_size = util.env_opt("token_vector_width", input_size)
+ return Linear(output_size, input_size, init_W=zero_init)
diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py
new file mode 100644
index 000000000..d9ac34b99
--- /dev/null
+++ b/spacy/ml/models/textcat.py
@@ -0,0 +1,42 @@
+from spacy.attrs import ORTH
+from spacy.util import registry
+from spacy.ml.extract_ngrams import extract_ngrams
+
+from thinc.api import Model, chain, reduce_mean, Linear, list2ragged, Logistic, SparseLinear, Softmax
+
+
+@registry.architectures.register("spacy.TextCatCNN.v1")
+def build_simple_cnn_text_classifier(tok2vec, exclusive_classes, nO=None):
+ """
+ Build a simple CNN text classifier, given a token-to-vector model as input.
+ If exclusive_classes=True, a softmax non-linearity is applied, so that the
+ outputs sum to 1. If exclusive_classes=False, a logistic non-linearity
+ is applied instead, so that outputs are in the range [0, 1].
+ """
+ with Model.define_operators({">>": chain}):
+ if exclusive_classes:
+ output_layer = Softmax(nO=nO, nI=tok2vec.get_dim("nO"))
+ model = tok2vec >> list2ragged() >> reduce_mean() >> output_layer
+ model.set_ref("output_layer", output_layer)
+ else:
+ # TODO: experiment with init_W=zero_init
+ linear_layer = Linear(nO=nO, nI=tok2vec.get_dim("nO"))
+ model = tok2vec >> list2ragged() >> reduce_mean() >> linear_layer >> Logistic()
+ model.set_ref("output_layer", linear_layer)
+ model.set_ref("tok2vec", tok2vec)
+ model.set_dim("nO", nO)
+ return model
+
+
+@registry.architectures.register("spacy.TextCatBOW.v1")
+def build_bow_text_classifier(exclusive_classes, ngram_size, no_output_layer, nO=None):
+ # Note: original defaults were ngram_size=1 and no_output_layer=False
+ with Model.define_operators({">>": chain}):
+ model = extract_ngrams(ngram_size, attr=ORTH) >> SparseLinear(nO)
+ model.to_cpu()
+ if not no_output_layer:
+ output_layer = Softmax(nO) if exclusive_classes else Logistic(nO)
+ output_layer.to_cpu()
+ model = model >> output_layer
+ model.set_ref("output_layer", output_layer)
+ return model
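
Both text classifier architectures are now registered and take their settings explicitly; the bag-of-words variant no longer hides `ngram_size`/`no_output_layer` behind implicit defaults. A usage sketch, assuming this development branch (`nO=2` stands for two labels, e.g. POSITIVE/NEGATIVE):

```python
from spacy.ml.models.textcat import build_bow_text_classifier

model = build_bow_text_classifier(
    exclusive_classes=True,   # softmax output, label scores sum to 1
    ngram_size=1,             # unigram features
    no_output_layer=False,
    nO=2,                     # number of labels
)
```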
diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py
new file mode 100644
index 000000000..2e0e4c2d4
--- /dev/null
+++ b/spacy/ml/models/tok2vec.py
@@ -0,0 +1,390 @@
+from thinc.api import chain, clone, concatenate, with_array, uniqued
+from thinc.api import Model, noop, with_padded, Maxout, expand_window
+from thinc.api import HashEmbed, StaticVectors, PyTorchLSTM
+from thinc.api import residual, LayerNorm, FeatureExtractor, Mish
+
+from ... import util
+from ...util import registry, make_layer
+from ...ml import _character_embed
+from ...pipeline.tok2vec import Tok2VecListener
+from ...attrs import ID, ORTH, NORM, PREFIX, SUFFIX, SHAPE
+
+
+@registry.architectures.register("spacy.Tok2VecTensors.v1")
+def tok2vec_tensors_v1(width):
+ tok2vec = Tok2VecListener("tok2vec", width=width)
+ return tok2vec
+
+
+@registry.architectures.register("spacy.VocabVectors.v1")
+def get_vocab_vectors(name):
+ nlp = util.load_model(name)
+ return nlp.vocab.vectors
+
+
+@registry.architectures.register("spacy.Tok2Vec.v1")
+def Tok2Vec(config):
+ doc2feats = make_layer(config["@doc2feats"])
+ embed = make_layer(config["@embed"])
+ encode = make_layer(config["@encode"])
+ field_size = 0
+ if encode.has_attr("receptive_field"):
+ field_size = encode.attrs["receptive_field"]
+ tok2vec = chain(doc2feats, with_array(chain(embed, encode), pad=field_size))
+ tok2vec.attrs["cfg"] = config
+ tok2vec.set_dim("nO", encode.get_dim("nO"))
+ tok2vec.set_ref("embed", embed)
+ tok2vec.set_ref("encode", encode)
+ return tok2vec
+
+
+@registry.architectures.register("spacy.Doc2Feats.v1")
+def Doc2Feats(config):
+ columns = config["columns"]
+ return FeatureExtractor(columns)
+
+
+@registry.architectures.register("spacy.HashEmbedCNN.v1")
+def hash_embed_cnn(
+ pretrained_vectors,
+ width,
+ depth,
+ embed_size,
+ maxout_pieces,
+ window_size,
+ subword_features,
+):
+ # Does not use character embeddings (char_embed is set to False)
+ return build_Tok2Vec_model(
+ width=width,
+ embed_size=embed_size,
+ pretrained_vectors=pretrained_vectors,
+ conv_depth=depth,
+ bilstm_depth=0,
+ maxout_pieces=maxout_pieces,
+ window_size=window_size,
+ subword_features=subword_features,
+ char_embed=False,
+ nM=0,
+ nC=0,
+ )
+
+
+@registry.architectures.register("spacy.HashCharEmbedCNN.v1")
+def hash_charembed_cnn(
+ pretrained_vectors,
+ width,
+ depth,
+ embed_size,
+ maxout_pieces,
+ window_size,
+ subword_features,
+ nM=0,
+ nC=0,
+):
+ # Allows using character embeddings by setting nC, nM and char_embed=True
+ return build_Tok2Vec_model(
+ width=width,
+ embed_size=embed_size,
+ pretrained_vectors=pretrained_vectors,
+ conv_depth=depth,
+ bilstm_depth=0,
+ maxout_pieces=maxout_pieces,
+ window_size=window_size,
+ subword_features=subword_features,
+ char_embed=True,
+ nM=nM,
+ nC=nC,
+ )
+
+
+@registry.architectures.register("spacy.HashEmbedBiLSTM.v1")
+def hash_embed_bilstm_v1(
+ pretrained_vectors, width, depth, embed_size, subword_features
+):
+ # Does not use character embeddings (char_embed is set to False)
+ return build_Tok2Vec_model(
+ width=width,
+ embed_size=embed_size,
+ pretrained_vectors=pretrained_vectors,
+ bilstm_depth=depth,
+ conv_depth=0,
+ maxout_pieces=0,
+ window_size=1,
+ subword_features=subword_features,
+ char_embed=False,
+ nM=0,
+ nC=0,
+ )
+
+
+@registry.architectures.register("spacy.HashCharEmbedBiLSTM.v1")
+def hash_char_embed_bilstm_v1(
+ pretrained_vectors, width, depth, embed_size, subword_features, nM=0, nC=0
+):
+ # Allows using character embeddings by setting nC, nM and char_embed=True
+ return build_Tok2Vec_model(
+ width=width,
+ embed_size=embed_size,
+ pretrained_vectors=pretrained_vectors,
+ bilstm_depth=depth,
+ conv_depth=0,
+ maxout_pieces=0,
+ window_size=1,
+ subword_features=subword_features,
+ char_embed=True,
+ nM=nM,
+ nC=nC,
+ )
+
+
+@registry.architectures.register("spacy.MultiHashEmbed.v1")
+def MultiHashEmbed(config):
+ # For backwards compatibility with models before the architecture registry,
+ # we have to be careful to get exactly the same model structure. One subtle
+ # trick is that the "|" operator for concatenation is binary and associates to
+ # the left, so writing (a | b | c) actually produces
+ # concatenate(concatenate(a, b), c). That's why the implementation is a bit
+ # ugly here.
+ cols = config["columns"]
+ width = config["width"]
+ rows = config["rows"]
+
+ norm = HashEmbed(width, rows, column=cols.index("NORM"))
+ if config["use_subwords"]:
+ prefix = HashEmbed(width, rows // 2, column=cols.index("PREFIX"))
+ suffix = HashEmbed(width, rows // 2, column=cols.index("SUFFIX"))
+ shape = HashEmbed(width, rows // 2, column=cols.index("SHAPE"))
+ if config.get("@pretrained_vectors"):
+ glove = make_layer(config["@pretrained_vectors"])
+ mix = make_layer(config["@mix"])
+
+ with Model.define_operators({">>": chain, "|": concatenate}):
+ if config["use_subwords"] and config["@pretrained_vectors"]:
+ mix._layers[0].set_dim("nI", width * 5)
+ layer = uniqued(
+ (glove | norm | prefix | suffix | shape) >> mix,
+ column=cols.index("ORTH"),
+ )
+ elif config["use_subwords"]:
+ mix._layers[0].set_dim("nI", width * 4)
+ layer = uniqued(
+ (norm | prefix | suffix | shape) >> mix, column=cols.index("ORTH")
+ )
+ elif config["@pretrained_vectors"]:
+ mix._layers[0].set_dim("nI", width * 2)
+ layer = uniqued((glove | norm) >> mix, column=cols.index("ORTH"))
+ else:
+ layer = norm
+ layer.attrs["cfg"] = config
+ return layer
+
+
+@registry.architectures.register("spacy.CharacterEmbed.v1")
+def CharacterEmbed(config):
+ width = config["width"]
+ chars = config["chars"]
+
+ chr_embed = _character_embed.CharacterEmbed(nM=width, nC=chars)
+ other_tables = make_layer(config["@embed_features"])
+ mix = make_layer(config["@mix"])
+
+ model = chain(concatenate(chr_embed, other_tables), mix)
+ model.attrs["cfg"] = config
+ return model
+
+
+@registry.architectures.register("spacy.MaxoutWindowEncoder.v1")
+def MaxoutWindowEncoder(config):
+ nO = config["width"]
+ nW = config["window_size"]
+ nP = config["pieces"]
+ depth = config["depth"]
+
+ cnn = chain(
+ expand_window(window_size=nW),
+ Maxout(nO=nO, nI=nO * ((nW * 2) + 1), nP=nP, dropout=0.0, normalize=True),
+ )
+ model = clone(residual(cnn), depth)
+ model.set_dim("nO", nO)
+ model.attrs["receptive_field"] = nW * depth
+ return model
+
+
+@registry.architectures.register("spacy.MishWindowEncoder.v1")
+def MishWindowEncoder(config):
+ nO = config["width"]
+ nW = config["window_size"]
+ depth = config["depth"]
+
+ cnn = chain(
+ expand_window(window_size=nW),
+ Mish(nO=nO, nI=nO * ((nW * 2) + 1)),
+ LayerNorm(nO),
+ )
+ model = clone(residual(cnn), depth)
+ model.set_dim("nO", nO)
+ return model
+
+
+@registry.architectures.register("spacy.TorchBiLSTMEncoder.v1")
+def TorchBiLSTMEncoder(config):
+ import torch.nn
+
+ # TODO FIX
+ from thinc.api import PyTorchRNNWrapper
+
+ width = config["width"]
+ depth = config["depth"]
+ if depth == 0:
+ return noop()
+ return with_padded(
+ PyTorchRNNWrapper(torch.nn.LSTM(width, width // 2, depth, bidirectional=True))
+ )
+
+
+# TODO: update
+_EXAMPLE_CONFIG = {
+ "@doc2feats": {
+ "arch": "Doc2Feats",
+ "config": {"columns": ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"]},
+ },
+ "@embed": {
+ "arch": "spacy.MultiHashEmbed.v1",
+ "config": {
+ "width": 96,
+ "rows": 2000,
+ "columns": ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"],
+ "use_subwords": True,
+ "@pretrained_vectors": {
+ "arch": "TransformedStaticVectors",
+ "config": {
+ "vectors_name": "en_vectors_web_lg.vectors",
+ "width": 96,
+ "column": 0,
+ },
+ },
+ "@mix": {
+ "arch": "LayerNormalizedMaxout",
+ "config": {"width": 96, "pieces": 3},
+ },
+ },
+ },
+ "@encode": {
+ "arch": "MaxoutWindowEncode",
+ "config": {"width": 96, "window_size": 1, "depth": 4, "pieces": 3},
+ },
+}
+
+
+def build_Tok2Vec_model(
+ width,
+ embed_size,
+ pretrained_vectors,
+ window_size,
+ maxout_pieces,
+ subword_features,
+ char_embed,
+ nM,
+ nC,
+ conv_depth,
+ bilstm_depth,
+) -> Model:
+ if char_embed:
+ subword_features = False
+ cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
+ with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
+ norm = HashEmbed(nO=width, nV=embed_size, column=cols.index(NORM))
+ if subword_features:
+ prefix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(PREFIX))
+ suffix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SUFFIX))
+ shape = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SHAPE))
+ else:
+ prefix, suffix, shape = (None, None, None)
+ if pretrained_vectors is not None:
+ glove = StaticVectors(
+ vectors=pretrained_vectors.data,
+ nO=width,
+ column=cols.index(ID),
+ dropout=0.0,
+ )
+
+ if subword_features:
+ columns = 5
+ embed = uniqued(
+ (glove | norm | prefix | suffix | shape)
+ >> Maxout(
+ nO=width,
+ nI=width * columns,
+ nP=maxout_pieces,
+ dropout=0.0,
+ normalize=True,
+ ),
+ column=cols.index(ORTH),
+ )
+ else:
+ columns = 2
+ embed = uniqued(
+ (glove | norm)
+ >> Maxout(
+ nO=width,
+ nI=width * columns,
+ nP=maxout_pieces,
+ dropout=0.0,
+ normalize=True,
+ ),
+ column=cols.index(ORTH),
+ )
+ elif subword_features:
+ columns = 4
+ embed = uniqued(
+ concatenate(norm, prefix, suffix, shape)
+ >> Maxout(
+ nO=width,
+ nI=width * columns,
+ nP=maxout_pieces,
+ dropout=0.0,
+ normalize=True,
+ ),
+ column=cols.index(ORTH),
+ )
+ elif char_embed:
+ embed = _character_embed.CharacterEmbed(nM=nM, nC=nC) | FeatureExtractor(
+ cols
+ ) >> with_array(norm)
+ reduce_dimensions = Maxout(
+ nO=width,
+ nI=nM * nC + width,
+ nP=maxout_pieces,
+ dropout=0.0,
+ normalize=True,
+ )
+ else:
+ embed = norm
+
+ convolution = residual(
+ expand_window(window_size=window_size)
+ >> Maxout(
+ nO=width,
+ nI=width * ((window_size * 2) + 1),
+ nP=maxout_pieces,
+ dropout=0.0,
+ normalize=True,
+ )
+ )
+ if char_embed:
+ tok2vec = embed >> with_array(
+ reduce_dimensions >> convolution ** conv_depth, pad=conv_depth
+ )
+ else:
+ tok2vec = FeatureExtractor(cols) >> with_array(
+ embed >> convolution ** conv_depth, pad=conv_depth
+ )
+
+ if bilstm_depth >= 1:
+ tok2vec = tok2vec >> PyTorchLSTM(
+ nO=width, nI=width, depth=bilstm_depth, bi=True
+ )
+ tok2vec.set_dim("nO", width)
+ tok2vec.set_ref("embed", embed)
+ return tok2vec
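A minimal construction sketch for build_Tok2Vec_model as defined above. The argument values are illustrative, and the initialization sample mirrors the placeholder Doc used by the tok2vec pipeline component elsewhere in this patch:

    from spacy.tokens import Doc
    from spacy.vocab import Vocab

    tok2vec = build_Tok2Vec_model(
        width=96,
        embed_size=2000,
        pretrained_vectors=None,
        window_size=1,
        maxout_pieces=3,
        subword_features=True,
        char_embed=False,
        nM=64,
        nC=8,
        conv_depth=4,
        bilstm_depth=0,
    )
    tok2vec.initialize(X=[Doc(Vocab(), words=["hello"])])
    assert tok2vec.get_dim("nO") == 96
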
diff --git a/spacy/ml/tok2vec.py b/spacy/ml/tok2vec.py
index 5e51bc47a..e69de29bb 100644
--- a/spacy/ml/tok2vec.py
+++ b/spacy/ml/tok2vec.py
@@ -1,178 +0,0 @@
-from thinc.api import Model, chain, clone, concatenate, with_array, uniqued, noop
-from thinc.api import with_padded, Maxout, expand_window, HashEmbed, StaticVectors
-from thinc.api import residual, LayerNorm, FeatureExtractor
-
-from ..ml import _character_embed
-from ..util import make_layer, registry
-
-
-@registry.architectures.register("spacy.Tok2Vec.v1")
-def Tok2Vec(config):
- doc2feats = make_layer(config["@doc2feats"])
- embed = make_layer(config["@embed"])
- encode = make_layer(config["@encode"])
- field_size = 0
- if encode.has_attr("receptive_field"):
- field_size = encode.attrs["receptive_field"]
- tok2vec = chain(doc2feats, with_array(chain(embed, encode), pad=field_size))
- tok2vec.attrs["cfg"] = config
- tok2vec.set_dim("nO", encode.get_dim("nO"))
- tok2vec.set_ref("embed", embed)
- tok2vec.set_ref("encode", encode)
- return tok2vec
-
-
-@registry.architectures.register("spacy.Doc2Feats.v1")
-def Doc2Feats(config):
- columns = config["columns"]
- return FeatureExtractor(columns)
-
-
-@registry.architectures.register("spacy.MultiHashEmbed.v1")
-def MultiHashEmbed(config):
- # For backwards compatibility with models before the architecture registry,
- # we have to be careful to get exactly the same model structure. One subtle
- trick is that when we define concatenation with the operator, the operator
- is binary and associates left to right. So when we write (a | b | c), we're actually
- # getting concatenate(concatenate(a, b), c). That's why the implementation
- # is a bit ugly here.
- cols = config["columns"]
- width = config["width"]
- rows = config["rows"]
-
- norm = HashEmbed(width, rows, column=cols.index("NORM"), dropout=0.0)
- if config["use_subwords"]:
- prefix = HashEmbed(width, rows // 2, column=cols.index("PREFIX"), dropout=0.0)
- suffix = HashEmbed(width, rows // 2, column=cols.index("SUFFIX"), dropout=0.0)
- shape = HashEmbed(width, rows // 2, column=cols.index("SHAPE"), dropout=0.0)
- if config.get("@pretrained_vectors"):
- glove = make_layer(config["@pretrained_vectors"])
- mix = make_layer(config["@mix"])
-
- with Model.define_operators({">>": chain, "|": concatenate}):
- if config["use_subwords"] and config["@pretrained_vectors"]:
- mix._layers[0].set_dim("nI", width * 5)
- layer = uniqued(
- (glove | norm | prefix | suffix | shape) >> mix,
- column=cols.index("ORTH"),
- )
- elif config["use_subwords"]:
- mix._layers[0].set_dim("nI", width * 4)
- layer = uniqued(
- (norm | prefix | suffix | shape) >> mix, column=cols.index("ORTH")
- )
- elif config["@pretrained_vectors"]:
- mix._layers[0].set_dim("nI", width * 2)
- layer = uniqued((glove | norm) >> mix, column=cols.index("ORTH"),)
- else:
- layer = norm
- layer.attrs["cfg"] = config
- return layer
-
-
-@registry.architectures.register("spacy.CharacterEmbed.v1")
-def CharacterEmbed(config):
- width = config["width"]
- chars = config["chars"]
-
- chr_embed = _character_embed.CharacterEmbed(nM=width, nC=chars)
- other_tables = make_layer(config["@embed_features"])
- mix = make_layer(config["@mix"])
-
- model = chain(concatenate(chr_embed, other_tables), mix)
- model.attrs["cfg"] = config
- return model
-
-
-@registry.architectures.register("spacy.MaxoutWindowEncoder.v1")
-def MaxoutWindowEncoder(config):
- nO = config["width"]
- nW = config["window_size"]
- nP = config["pieces"]
- depth = config["depth"]
- cnn = chain(
- expand_window(window_size=nW),
- Maxout(nO=nO, nI=nO * ((nW * 2) + 1), nP=nP, dropout=0.0, normalize=True),
- )
- model = clone(residual(cnn), depth)
- model.set_dim("nO", nO)
- model.attrs["receptive_field"] = nW * depth
- return model
-
-
-@registry.architectures.register("spacy.MishWindowEncoder.v1")
-def MishWindowEncoder(config):
- from thinc.api import Mish
-
- nO = config["width"]
- nW = config["window_size"]
- depth = config["depth"]
- cnn = chain(
- expand_window(window_size=nW),
- Mish(nO=nO, nI=nO * ((nW * 2) + 1)),
- LayerNorm(nO),
- )
- model = clone(residual(cnn), depth)
- model.set_dim("nO", nO)
- return model
-
-
-@registry.architectures.register("spacy.PretrainedVectors.v1")
-def PretrainedVectors(config):
- # TODO: actual vectors instead of name
- return StaticVectors(
- vectors=config["vectors_name"],
- nO=config["width"],
- column=config["column"],
- dropout=0.0,
- )
-
-
-@registry.architectures.register("spacy.TorchBiLSTMEncoder.v1")
-def TorchBiLSTMEncoder(config):
- import torch.nn
-
- # TODO: FIX
- from thinc.api import PyTorchRNNWrapper
-
- width = config["width"]
- depth = config["depth"]
- if depth == 0:
- return noop()
- return with_padded(
- PyTorchRNNWrapper(torch.nn.LSTM(width, width // 2, depth, bidirectional=True))
- )
-
-
-# TODO: update
-_EXAMPLE_CONFIG = {
- "@doc2feats": {
- "arch": "Doc2Feats",
- "config": {"columns": ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"]},
- },
- "@embed": {
- "arch": "spacy.MultiHashEmbed.v1",
- "config": {
- "width": 96,
- "rows": 2000,
- "columns": ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"],
- "use_subwords": True,
- "@pretrained_vectors": {
- "arch": "TransformedStaticVectors",
- "config": {
- "vectors_name": "en_vectors_web_lg.vectors",
- "width": 96,
- "column": 0,
- },
- },
- "@mix": {
- "arch": "LayerNormalizedMaxout",
- "config": {"width": 96, "pieces": 3},
- },
- },
- },
- "@encode": {
- "arch": "MaxoutWindowEncode",
- "config": {"width": 96, "window_size": 1, "depth": 4, "pieces": 3},
- },
-}
diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py
index e211acb44..06c568ac9 100644
--- a/spacy/pipeline/entityruler.py
+++ b/spacy/pipeline/entityruler.py
@@ -66,7 +66,7 @@ class EntityRuler(object):
self.add_patterns(patterns)
@classmethod
- def from_nlp(cls, nlp, **cfg):
+ def from_nlp(cls, nlp, model=None, **cfg):
return cls(nlp, **cfg)
def __len__(self):
diff --git a/spacy/pipeline/hooks.py b/spacy/pipeline/hooks.py
index d48b04bd1..351323ae9 100644
--- a/spacy/pipeline/hooks.py
+++ b/spacy/pipeline/hooks.py
@@ -76,11 +76,9 @@ class SimilarityHook(Pipe):
yield self(doc)
def predict(self, doc1, doc2):
- self.require_model()
return self.model.predict([(doc1, doc2)])
def update(self, doc1_doc2, golds, sgd=None, drop=0.0):
- self.require_model()
sims, bp_sims = self.model.begin_update(doc1_doc2)
def begin_training(self, _=tuple(), pipeline=None, sgd=None, **kwargs):
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index 999132b35..b6a6045d1 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -15,25 +15,15 @@ from ..tokens.doc cimport Doc
from ..vocab cimport Vocab
from ..morphology cimport Morphology
-from ..ml.component_models import build_morphologizer_model
-
@component("morphologizer", assigns=["token.morph", "token.pos"])
class Morphologizer(Pipe):
- @classmethod
- def Model(cls, **cfg):
- if cfg.get('pretrained_dims') and not cfg.get('pretrained_vectors'):
- raise ValueError(TempErrors.T008)
- class_map = Morphology.create_class_map()
- return build_morphologizer_model(class_map.field_sizes, **cfg)
-
- def __init__(self, vocab, model=True, **cfg):
+ def __init__(self, vocab, model, **cfg):
self.vocab = vocab
self.model = model
self.cfg = dict(sorted(cfg.items()))
- self.cfg.setdefault('cnn_maxout_pieces', 2)
- self._class_map = self.vocab.morphology.create_class_map()
+ self._class_map = self.vocab.morphology.create_class_map() # Morphology.create_class_map() ?
@property
def labels(self):
@@ -58,6 +48,14 @@ class Morphologizer(Pipe):
self.set_annotations(docs, features, tensors=tokvecs)
yield from docs
+ def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None,
+ **kwargs):
+ self.set_output(len(self.labels))
+ self.model.initialize()
+ if sgd is None:
+ sgd = self.create_optimizer()
+ return sgd
+
def predict(self, docs):
if not any(len(doc) for doc in docs):
# Handle case where there are no tokens in any docs.
@@ -65,8 +63,8 @@ class Morphologizer(Pipe):
guesses = [self.model.ops.alloc((0, n_labels)) for doc in docs]
tokvecs = self.model.ops.alloc((0, self.model.get_ref("tok2vec").get_dim("nO")))
return guesses, tokvecs
- tokvecs = self.model.tok2vec(docs)
- scores = self.model.softmax(tokvecs)
+ tokvecs = self.model.get_ref("tok2vec")(docs)
+ scores = self.model.get_ref("softmax")(tokvecs)
return scores, tokvecs
def set_annotations(self, docs, batch_scores, tensors=None):
diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index ad75d2e78..b9bf1ccd6 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -3,8 +3,7 @@
import numpy
import srsly
import random
-from thinc.api import chain, Linear, Maxout, Softmax, LayerNorm, list2array
-from thinc.api import zero_init, CosineDistance, to_categorical, get_array_module
+from thinc.api import CosineDistance, to_categorical, get_array_module
from thinc.api import set_dropout_rate
from ..tokens.doc cimport Doc
@@ -22,11 +21,6 @@ from ..attrs import POS, ID
from ..util import link_vectors_to_models, create_default_optimizer
from ..parts_of_speech import X
from ..kb import KnowledgeBase
-from ..ml.component_models import Tok2Vec, build_tagger_model
-from ..ml.component_models import build_text_classifier
-from ..ml.component_models import build_simple_cnn_text_classifier
-from ..ml.component_models import build_bow_text_classifier, build_nel_encoder
-from ..ml.component_models import masked_language_model
from ..errors import Errors, TempErrors, user_warning, Warnings
from .. import util
@@ -47,13 +41,8 @@ class Pipe(object):
name = None
@classmethod
- def Model(cls, *shape, **kwargs):
- """Initialize a model for the pipe."""
- raise NotImplementedError
-
- @classmethod
- def from_nlp(cls, nlp, **cfg):
- return cls(nlp.vocab, **cfg)
+ def from_nlp(cls, nlp, model, **cfg):
+ return cls(nlp.vocab, model, **cfg)
def _get_doc(self, example):
""" Use this method if the `example` can be both a Doc or an Example """
@@ -61,7 +50,7 @@ class Pipe(object):
return example
return example.doc
- def __init__(self, vocab, model=True, **cfg):
+ def __init__(self, vocab, model, **cfg):
"""Create a new pipe instance."""
raise NotImplementedError
@@ -72,7 +61,6 @@ class Pipe(object):
Both __call__ and pipe should delegate to the `predict()`
and `set_annotations()` methods.
"""
- self.require_model()
doc = self._get_doc(example)
predictions = self.predict([doc])
if isinstance(predictions, tuple) and len(predictions) == 2:
@@ -85,11 +73,6 @@ class Pipe(object):
return example
return doc
- def require_model(self):
- """Raise an error if the component's model is not initialized."""
- if getattr(self, "model", None) in (None, True, False):
- raise ValueError(Errors.E109.format(name=self.name))
-
def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
"""Apply the pipe to a stream of documents.
@@ -116,7 +99,6 @@ class Pipe(object):
"""Apply the pipeline's model to a batch of docs, without
modifying them.
"""
- self.require_model()
raise NotImplementedError
def set_annotations(self, docs, scores, tensors=None):
@@ -158,22 +140,23 @@ class Pipe(object):
):
"""Initialize the pipe for training, using data exampes if available.
If no model has been initialized yet, the model is added."""
- if self.model is True:
- self.model = self.Model(**self.cfg)
+ self.model.initialize()
if hasattr(self, "vocab"):
link_vectors_to_models(self.vocab)
- self.model.initialize()
if sgd is None:
sgd = self.create_optimizer()
return sgd
+ def set_output(self, nO):
+ self.model.set_dim("nO", nO)
+ if self.model.has_ref("output_layer"):
+ self.model.get_ref("output_layer").set_dim("nO", nO)
+
def get_gradients(self):
"""Get non-zero gradients of the model's parameters, as a dictionary
keyed by the parameter ID. The values are (weights, gradients) tuples.
"""
gradients = {}
- if self.model in (None, True, False):
- return gradients
queue = [self.model]
seen = set()
for node in queue:
@@ -199,8 +182,7 @@ class Pipe(object):
"""
serialize = {}
serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
- if self.model not in (True, False, None):
- serialize["model"] = self.model.to_bytes
+ serialize["model"] = self.model.to_bytes
if hasattr(self, "vocab"):
serialize["vocab"] = self.vocab.to_bytes
exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
@@ -210,20 +192,15 @@ class Pipe(object):
"""Load the pipe from a bytestring."""
def load_model(b):
- # TODO: Remove this once we don't have to handle previous models
- if self.cfg.get("pretrained_dims") and "pretrained_vectors" not in self.cfg:
- self.cfg["pretrained_vectors"] = self.vocab.vectors
- if self.model is True:
- self.model = self.Model(**self.cfg)
try:
self.model.from_bytes(b)
except AttributeError:
raise ValueError(Errors.E149)
deserialize = {}
- deserialize["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b))
if hasattr(self, "vocab"):
deserialize["vocab"] = lambda b: self.vocab.from_bytes(b)
+ deserialize["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b))
deserialize["model"] = load_model
exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
util.from_bytes(bytes_data, deserialize, exclude)
@@ -234,8 +211,7 @@ class Pipe(object):
serialize = {}
serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
serialize["vocab"] = lambda p: self.vocab.to_disk(p)
- if self.model not in (None, True, False):
- serialize["model"] = lambda p: p.open("wb").write(self.model.to_bytes())
+ serialize["model"] = lambda p: p.open("wb").write(self.model.to_bytes())
exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
util.to_disk(path, serialize, exclude)
@@ -243,19 +219,14 @@ class Pipe(object):
"""Load the pipe from disk."""
def load_model(p):
- # TODO: Remove this once we don't have to handle previous models
- if self.cfg.get("pretrained_dims") and "pretrained_vectors" not in self.cfg:
- self.cfg["pretrained_vectors"] = self.vocab.vectors
- if self.model is True:
- self.model = self.Model(**self.cfg)
try:
self.model.from_bytes(p.open("rb").read())
except AttributeError:
raise ValueError(Errors.E149)
deserialize = {}
- deserialize["cfg"] = lambda p: self.cfg.update(_load_cfg(p))
deserialize["vocab"] = lambda p: self.vocab.from_disk(p)
+ deserialize["cfg"] = lambda p: self.cfg.update(_load_cfg(p))
deserialize["model"] = load_model
exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
util.from_disk(path, deserialize, exclude)
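Under the new contract a pipe always owns a concrete thinc Model, so serialization no longer special-cases a missing model. A round-trip sketch; default_tagger is an assumed defaults factory, analogous to the default_ner helper imported by the tests further down:

    from spacy.pipeline import Tagger
    from spacy.vocab import Vocab
    from spacy.ml.models.defaults import default_tagger  # assumed factory

    vocab = Vocab()
    tagger = Tagger(vocab, default_tagger())
    data = tagger.to_bytes()
    Tagger(vocab, default_tagger()).from_bytes(data)  # model weights and cfg restored together
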
@@ -266,31 +237,13 @@ class Pipe(object):
class Tensorizer(Pipe):
"""Pre-train position-sensitive vectors for tokens."""
- @classmethod
- def Model(cls, output_size=300, **cfg):
- """Create a new statistical model for the class.
-
- width (int): Output size of the model.
- embed_size (int): Number of vectors in the embedding table.
- **cfg: Config parameters.
- RETURNS (Model): A `thinc.model.Model` or similar instance.
- """
- input_size = util.env_opt("token_vector_width", cfg.get("input_size", 96))
- return Linear(output_size, input_size, init_W=zero_init)
-
- def __init__(self, vocab, model=True, **cfg):
+ def __init__(self, vocab, model, **cfg):
"""Construct a new statistical model. Weights are not allocated on
initialisation.
vocab (Vocab): A `Vocab` instance. The model must share the same
`Vocab` instance with the `Doc` objects it will process.
- model (Model): A `Model` instance or `True` to allocate one later.
**cfg: Config parameters.
-
- EXAMPLE:
- >>> from spacy.pipeline import TokenVectorEncoder
- >>> tok2vec = TokenVectorEncoder(nlp.vocab)
- >>> tok2vec.model = tok2vec.Model(128, 5000)
"""
self.vocab = vocab
self.model = model
@@ -337,7 +290,6 @@ class Tensorizer(Pipe):
docs (iterable): A sequence of `Doc` objects.
RETURNS (object): Vector representations for each token in the docs.
"""
- self.require_model()
inputs = self.model.ops.flatten([doc.tensor for doc in docs])
outputs = self.model(inputs)
return self.model.ops.unflatten(outputs, [len(d) for d in docs])
@@ -362,7 +314,6 @@ class Tensorizer(Pipe):
sgd (callable): An optimizer.
RETURNS (dict): Results from the update.
"""
- self.require_model()
examples = Example.to_example_objects(examples)
inputs = []
bp_inputs = []
@@ -405,10 +356,8 @@ class Tensorizer(Pipe):
"""
if pipeline is not None:
for name, model in pipeline:
- if getattr(model, "tok2vec", None):
- self.input_models.append(model.tok2vec)
- if self.model is True:
- self.model = self.Model(**self.cfg)
+ if model.has_ref("tok2vec"):
+ self.input_models.append(model.get_ref("tok2vec"))
self.model.initialize()
link_vectors_to_models(self.vocab)
if sgd is None:
@@ -423,7 +372,7 @@ class Tagger(Pipe):
DOCS: https://spacy.io/api/tagger
"""
- def __init__(self, vocab, model=True, **cfg):
+ def __init__(self, vocab, model, **cfg):
self.vocab = vocab
self.model = model
self._rehearsal_model = None
@@ -433,13 +382,6 @@ class Tagger(Pipe):
def labels(self):
return tuple(self.vocab.morphology.tag_names)
- @property
- def tok2vec(self):
- if self.model in (None, True, False):
- return None
- else:
- return chain(self.model.get_ref("tok2vec"), list2array())
-
def __call__(self, example):
doc = self._get_doc(example)
tags = self.predict([doc])
@@ -465,7 +407,6 @@ class Tagger(Pipe):
yield from docs
def predict(self, docs):
- self.require_model()
if not any(len(doc) for doc in docs):
# Handle cases where there are no tokens in any docs.
n_labels = len(self.labels)
@@ -513,7 +454,6 @@ class Tagger(Pipe):
doc.is_tagged = True
def update(self, examples, drop=0., sgd=None, losses=None, set_annotations=False):
- self.require_model()
examples = Example.to_example_objects(examples)
if losses is not None and self.name not in losses:
losses[self.name] = 0.
@@ -600,52 +540,21 @@ class Tagger(Pipe):
vocab.morphology = Morphology(vocab.strings, new_tag_map,
vocab.morphology.lemmatizer,
exc=vocab.morphology.exc)
- self.cfg["pretrained_vectors"] = kwargs.get("pretrained_vectors")
- if self.model is True:
- for hp in ["token_vector_width", "conv_depth"]:
- if hp in kwargs:
- self.cfg[hp] = kwargs[hp]
- self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
+ self.set_output(len(self.labels))
+ self.model.initialize()
# Get batch of example docs, example outputs to call begin_training().
# This lets the model infer shapes.
- n_tags = self.vocab.morphology.n_tags
- for node in self.model.walk():
- # TODO: softmax hack ?
- if node.name == "softmax" and node.has_dim("nO") is None:
- node.set_dim("nO", n_tags)
link_vectors_to_models(self.vocab)
- self.model.initialize()
if sgd is None:
sgd = self.create_optimizer()
return sgd
- @classmethod
- def Model(cls, n_tags=None, **cfg):
- if cfg.get("pretrained_dims") and not cfg.get("pretrained_vectors"):
- raise ValueError(TempErrors.T008)
- if "tok2vec" in cfg:
- tok2vec = cfg["tok2vec"]
- else:
- config = {
- "width": cfg.get("token_vector_width", 96),
- "embed_size": cfg.get("embed_size", 2000),
- "pretrained_vectors": cfg.get("pretrained_vectors", None),
- "window_size": cfg.get("window_size", 1),
- "cnn_maxout_pieces": cfg.get("cnn_maxout_pieces", 3),
- "subword_features": cfg.get("subword_features", True),
- "char_embed": cfg.get("char_embed", False),
- "conv_depth": cfg.get("conv_depth", 4),
- "bilstm_depth": cfg.get("bilstm_depth", 0),
- }
- tok2vec = Tok2Vec(**config)
- return build_tagger_model(n_tags, tok2vec)
-
def add_label(self, label, values=None):
if not isinstance(label, str):
raise ValueError(Errors.E187)
if label in self.labels:
return 0
- if self.model not in (True, False, None):
+ if self.model.has_dim("nO"):
# Here's how the model resizing will work, once the
# neuron-to-tag mapping is no longer controlled by
# the Morphology class, which sorts the tag names.
@@ -672,8 +581,7 @@ class Tagger(Pipe):
def to_bytes(self, exclude=tuple(), **kwargs):
serialize = {}
- if self.model not in (None, True, False):
- serialize["model"] = self.model.to_bytes
+ serialize["model"] = self.model.to_bytes
serialize["vocab"] = self.vocab.to_bytes
serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
tag_map = dict(sorted(self.vocab.morphology.tag_map.items()))
@@ -683,14 +591,6 @@ class Tagger(Pipe):
def from_bytes(self, bytes_data, exclude=tuple(), **kwargs):
def load_model(b):
- # TODO: Remove this once we don't have to handle previous models
- if self.cfg.get("pretrained_dims") and "pretrained_vectors" not in self.cfg:
- self.cfg["pretrained_vectors"] = self.vocab.vectors
- if self.model is True:
- token_vector_width = util.env_opt(
- "token_vector_width",
- self.cfg.get("token_vector_width", 96))
- self.model = self.Model(**self.cfg)
try:
self.model.from_bytes(b)
except AttributeError:
@@ -719,18 +619,13 @@ class Tagger(Pipe):
"vocab": lambda p: self.vocab.to_disk(p),
"tag_map": lambda p: srsly.write_msgpack(p, tag_map),
"model": lambda p: p.open("wb").write(self.model.to_bytes()),
- "cfg": lambda p: srsly.write_json(p, self.cfg)
+ "cfg": lambda p: srsly.write_json(p, self.cfg),
}
exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
util.to_disk(path, serialize, exclude)
def from_disk(self, path, exclude=tuple(), **kwargs):
def load_model(p):
- # TODO: Remove this once we don't have to handle previous models
- if self.cfg.get("pretrained_dims") and "pretrained_vectors" not in self.cfg:
- self.cfg["pretrained_vectors"] = self.vocab.vectors
- if self.model is True:
- self.model = self.Model(**self.cfg)
with p.open("rb") as file_:
try:
self.model.from_bytes(file_.read())
@@ -745,8 +640,8 @@ class Tagger(Pipe):
exc=self.vocab.morphology.exc)
deserialize = {
- "cfg": lambda p: self.cfg.update(_load_cfg(p)),
"vocab": lambda p: self.vocab.from_disk(p),
+ "cfg": lambda p: self.cfg.update(_load_cfg(p)),
"tag_map": load_tag_map,
"model": load_model,
}
@@ -762,16 +657,11 @@ class SentenceRecognizer(Tagger):
DOCS: https://spacy.io/api/sentencerecognizer
"""
- def __init__(self, vocab, model=True, **cfg):
+ def __init__(self, vocab, model, **cfg):
self.vocab = vocab
self.model = model
self._rehearsal_model = None
self.cfg = dict(sorted(cfg.items()))
- self.cfg.setdefault("cnn_maxout_pieces", 2)
- self.cfg.setdefault("subword_features", True)
- self.cfg.setdefault("token_vector_width", 12)
- self.cfg.setdefault("conv_depth", 1)
- self.cfg.setdefault("pretrained_vectors", None)
@property
def labels(self):
@@ -797,7 +687,6 @@ class SentenceRecognizer(Tagger):
doc.c[j].sent_start = -1
def update(self, examples, drop=0., sgd=None, losses=None):
- self.require_model()
examples = Example.to_example_objects(examples)
if losses is not None and self.name not in losses:
losses[self.name] = 0.
@@ -844,20 +733,12 @@ class SentenceRecognizer(Tagger):
def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None,
**kwargs):
cdef Vocab vocab = self.vocab
- if self.model is True:
- for hp in ["token_vector_width", "conv_depth"]:
- if hp in kwargs:
- self.cfg[hp] = kwargs[hp]
- self.model = self.Model(len(self.labels), **self.cfg)
+ self.set_output(len(self.labels))
+ self.model.initialize()
if sgd is None:
sgd = self.create_optimizer()
- self.model.initialize()
return sgd
- @classmethod
- def Model(cls, n_tags, **cfg):
- return build_tagger_model(n_tags, **cfg)
-
def add_label(self, label, values=None):
raise NotImplementedError
@@ -867,8 +748,7 @@ class SentenceRecognizer(Tagger):
def to_bytes(self, exclude=tuple(), **kwargs):
serialize = {}
- if self.model not in (None, True, False):
- serialize["model"] = self.model.to_bytes
+ serialize["model"] = self.model.to_bytes
serialize["vocab"] = self.vocab.to_bytes
serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
@@ -876,8 +756,6 @@ class SentenceRecognizer(Tagger):
def from_bytes(self, bytes_data, exclude=tuple(), **kwargs):
def load_model(b):
- if self.model is True:
- self.model = self.Model(len(self.labels), **self.cfg)
try:
self.model.from_bytes(b)
except AttributeError:
@@ -896,15 +774,13 @@ class SentenceRecognizer(Tagger):
serialize = {
"vocab": lambda p: self.vocab.to_disk(p),
"model": lambda p: p.open("wb").write(self.model.to_bytes()),
- "cfg": lambda p: srsly.write_json(p, self.cfg)
+ "cfg": lambda p: srsly.write_json(p, self.cfg),
}
exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
util.to_disk(path, serialize, exclude)
def from_disk(self, path, exclude=tuple(), **kwargs):
def load_model(p):
- if self.model is True:
- self.model = self.Model(len(self.labels), **self.cfg)
with p.open("rb") as file_:
try:
self.model.from_bytes(file_.read())
@@ -912,8 +788,8 @@ class SentenceRecognizer(Tagger):
raise ValueError(Errors.E149)
deserialize = {
- "cfg": lambda p: self.cfg.update(_load_cfg(p)),
"vocab": lambda p: self.vocab.from_disk(p),
+ "cfg": lambda p: self.cfg.update(_load_cfg(p)),
"model": load_model,
}
exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
@@ -927,7 +803,7 @@ class MultitaskObjective(Tagger):
side-objective.
"""
- def __init__(self, vocab, model=True, target='dep_tag_offset', **cfg):
+ def __init__(self, vocab, model, target='dep_tag_offset', **cfg):
self.vocab = vocab
self.model = model
if target == "dep":
@@ -947,7 +823,8 @@ class MultitaskObjective(Tagger):
else:
raise ValueError(Errors.E016)
self.cfg = dict(cfg)
- self.cfg.setdefault("cnn_maxout_pieces", 2)
+ # TODO: remove - put in config
+ self.cfg.setdefault("maxout_pieces", 2)
@property
def labels(self):
@@ -969,30 +846,15 @@ class MultitaskObjective(Tagger):
label = self.make_label(i, example.token_annotation)
if label is not None and label not in self.labels:
self.labels[label] = len(self.labels)
- if self.model is True:
- token_vector_width = util.env_opt("token_vector_width")
- self.model = self.Model(len(self.labels), tok2vec=tok2vec)
- link_vectors_to_models(self.vocab)
self.model.initialize()
+ link_vectors_to_models(self.vocab)
if sgd is None:
sgd = self.create_optimizer()
return sgd
- @classmethod
- def Model(cls, n_tags, tok2vec=None, **cfg):
- token_vector_width = util.env_opt("token_vector_width", 96)
- model = chain(
- tok2vec,
- Maxout(nO=token_vector_width*2, nI=token_vector_width, nP=3, dropout=0.0),
- LayerNorm(token_vector_width*2),
- Softmax(nO=n_tags, nI=token_vector_width*2)
- )
- return model
-
def predict(self, docs):
- self.require_model()
- tokvecs = self.model.tok2vec(docs)
- scores = self.model.softmax(tokvecs)
+ tokvecs = self.model.get_ref("tok2vec")(docs)
+ scores = self.model.get_ref("softmax")(tokvecs)
return tokvecs, scores
def get_loss(self, examples, scores):
@@ -1097,18 +959,7 @@ class MultitaskObjective(Tagger):
class ClozeMultitask(Pipe):
- @classmethod
- def Model(cls, vocab, tok2vec, **cfg):
- output_size = vocab.vectors.data.shape[1]
- output_layer = chain(
- Maxout(nO=output_size, nI=tok2vec.get_dim("nO"), nP=3, normalize=True, dropout=0.0),
- Linear(nO=output_size, nI=output_size, init_W=zero_init)
- )
- model = chain(tok2vec, output_layer)
- model = masked_language_model(vocab, model)
- return model
-
- def __init__(self, vocab, model=True, **cfg):
+ def __init__(self, vocab, model, **cfg):
self.vocab = vocab
self.model = model
self.cfg = cfg
@@ -1120,19 +971,16 @@ class ClozeMultitask(Pipe):
def begin_training(self, get_examples=lambda: [], pipeline=None,
tok2vec=None, sgd=None, **kwargs):
link_vectors_to_models(self.vocab)
- if self.model is True:
- self.model = self.Model(self.vocab, tok2vec)
- X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO")))
self.model.initialize()
+ X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO")))
self.model.output_layer.begin_training(X)
if sgd is None:
sgd = self.create_optimizer()
return sgd
def predict(self, docs):
- self.require_model()
- tokvecs = self.model.tok2vec(docs)
- vectors = self.model.output_layer(tokvecs)
+ tokvecs = self.model.get_ref("tok2vec")(docs)
+ vectors = self.model.get_ref("output_layer")(tokvecs)
return tokvecs, vectors
def get_loss(self, examples, vectors, prediction):
@@ -1150,7 +998,6 @@ class ClozeMultitask(Pipe):
pass
def rehearse(self, examples, drop=0., sgd=None, losses=None):
- self.require_model()
examples = Example.to_example_objects(examples)
if losses is not None and self.name not in losses:
losses[self.name] = 0.
@@ -1171,62 +1018,11 @@ class TextCategorizer(Pipe):
DOCS: https://spacy.io/api/textcategorizer
"""
-
- @classmethod
- def Model(cls, nr_class=1, exclusive_classes=None, **cfg):
- if nr_class == 1:
- exclusive_classes = False
- if exclusive_classes is None:
- raise ValueError(
- "TextCategorizer Model must specify 'exclusive_classes'. "
- "This setting determines whether the model will output "
- "scores that sum to 1 for each example. If only one class "
- "is true for each example, you should set exclusive_classes=True. "
- "For 'multi_label' classification, set exclusive_classes=False."
- )
- if "embed_size" not in cfg:
- cfg["embed_size"] = util.env_opt("embed_size", 2000)
- if "token_vector_width" not in cfg:
- cfg["token_vector_width"] = util.env_opt("token_vector_width", 96)
- if cfg.get("architecture") == "bow":
- return build_bow_text_classifier(nr_class, exclusive_classes, **cfg)
- else:
- if "tok2vec" in cfg:
- tok2vec = cfg["tok2vec"]
- else:
- config = {
- "width": cfg.get("token_vector_width", 96),
- "embed_size": cfg.get("embed_size", 2000),
- "pretrained_vectors": cfg.get("pretrained_vectors", None),
- "window_size": cfg.get("window_size", 1),
- "cnn_maxout_pieces": cfg.get("cnn_maxout_pieces", 3),
- "subword_features": cfg.get("subword_features", True),
- "char_embed": cfg.get("char_embed", False),
- "conv_depth": cfg.get("conv_depth", 4),
- "bilstm_depth": cfg.get("bilstm_depth", 0),
- }
- tok2vec = Tok2Vec(**config)
- return build_simple_cnn_text_classifier(
- tok2vec,
- nr_class,
- exclusive_classes,
- **cfg
- )
-
- @property
- def tok2vec(self):
- if self.model in (None, True, False):
- return None
- else:
- return self.model.tok2vec
-
- def __init__(self, vocab, model=True, **cfg):
+ def __init__(self, vocab, model, **cfg):
self.vocab = vocab
self.model = model
self._rehearsal_model = None
self.cfg = dict(cfg)
- if "exclusive_classes" not in cfg:
- self.cfg["exclusive_classes"] = True
@property
def labels(self):
@@ -1255,7 +1051,6 @@ class TextCategorizer(Pipe):
yield from docs
def predict(self, docs):
- self.require_model()
tensors = [doc.tensor for doc in docs]
if not any(len(doc) for doc in docs):
@@ -1274,7 +1069,6 @@ class TextCategorizer(Pipe):
doc.cats[label] = float(scores[i, j])
def update(self, examples, state=None, drop=0., set_annotations=False, sgd=None, losses=None):
- self.require_model()
examples = Example.to_example_objects(examples)
if not any(len(ex.doc) if ex.doc else 0 for ex in examples):
# Handle cases where there are no tokens in any docs.
@@ -1311,7 +1105,7 @@ class TextCategorizer(Pipe):
losses.setdefault(self.name, 0.0)
losses[self.name] += (gradient**2).sum()
- def get_loss(self, examples, scores):
+ def _examples_to_truth(self, examples):
golds = [ex.gold for ex in examples]
truths = numpy.zeros((len(golds), len(self.labels)), dtype="f")
not_missing = numpy.ones((len(golds), len(self.labels)), dtype="f")
@@ -1322,6 +1116,10 @@ class TextCategorizer(Pipe):
else:
not_missing[i, j] = 0.
truths = self.model.ops.asarray(truths)
+ return truths, not_missing
+
+ def get_loss(self, examples, scores):
+ truths, not_missing = self._examples_to_truth(examples)
not_missing = self.model.ops.asarray(not_missing)
d_scores = (scores-truths) / scores.shape[0]
d_scores *= not_missing
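The masking above zeroes the gradient for labels that are missing from the gold annotation. A tiny numpy sketch with invented values:

    import numpy

    scores = numpy.asarray([[0.9, 0.2]], dtype="f")
    truths = numpy.asarray([[1.0, 0.0]], dtype="f")
    not_missing = numpy.asarray([[1.0, 0.0]], dtype="f")  # second label absent from the gold cats
    d_scores = (scores - truths) / scores.shape[0]
    d_scores *= not_missing  # -> [[-0.1, 0.0]]: no gradient for the missing label
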
@@ -1333,7 +1131,7 @@ class TextCategorizer(Pipe):
raise ValueError(Errors.E187)
if label in self.labels:
return 0
- if self.model not in (None, True, False):
+ if self.model.has_dim("nO"):
# This functionality was available previously, but was broken.
# The problem is that we resize the last layer, but the last layer
# is actually just an ensemble. We're not resizing the child layers
@@ -1348,19 +1146,18 @@ class TextCategorizer(Pipe):
return 1
def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs):
- for example in get_examples():
+ # TODO: begin_training is not guaranteed to see all data / labels ?
+ examples = list(get_examples())
+ for example in examples:
for cat in example.doc_annotation.cats:
self.add_label(cat)
- if self.model is True:
- self.cfg.update(kwargs)
- self.require_labels()
- self.model = self.Model(len(self.labels), **self.cfg)
- link_vectors_to_models(self.vocab)
+ self.require_labels()
+ docs = [Doc(Vocab(), words=["hello"])]
+ truths, _ = self._examples_to_truth(examples)
+ self.set_output(len(self.labels))
+ self.model.initialize(X=docs, Y=truths)
if sgd is None:
sgd = self.create_optimizer()
- # TODO: use get_examples instead
- docs = [Doc(Vocab(), words=["hello"])]
- self.model.initialize(X=docs)
return sgd
@@ -1393,7 +1190,7 @@ cdef class DependencyParser(Parser):
def init_multitask_objectives(self, get_examples, pipeline, sgd=None, **cfg):
for labeller in self._multitasks:
- tok2vec = self.model.tok2vec
+ tok2vec = self.model.get_ref("tok2vec")
labeller.begin_training(get_examples, pipeline=pipeline,
tok2vec=tok2vec, sgd=sgd)
@@ -1423,7 +1220,6 @@ cdef class EntityRecognizer(Parser):
assigns = ["doc.ents", "token.ent_iob", "token.ent_type"]
requires = []
TransitionSystem = BiluoPushDown
- nr_feature = 6
def add_multitask_objective(self, target):
if target == "cloze":
@@ -1435,7 +1231,7 @@ cdef class EntityRecognizer(Parser):
def init_multitask_objectives(self, get_examples, pipeline, sgd=None, **cfg):
for labeller in self._multitasks:
- tok2vec = self.model.tok2vec
+ tok2vec = self.model.get_ref("tok2vec")
labeller.begin_training(get_examples, pipeline=pipeline,
tok2vec=tok2vec)
@@ -1464,18 +1260,9 @@ class EntityLinker(Pipe):
"""
NIL = "NIL" # string used to refer to a non-existing link
- @classmethod
- def Model(cls, **cfg):
- embed_width = cfg.get("embed_width", 300)
- hidden_width = cfg.get("hidden_width", 128)
- type_to_int = cfg.get("type_to_int", dict())
-
- model = build_nel_encoder(embed_width=embed_width, hidden_width=hidden_width, ner_types=len(type_to_int), **cfg)
- return model
-
- def __init__(self, vocab, **cfg):
+ def __init__(self, vocab, model, **cfg):
self.vocab = vocab
- self.model = True
+ self.model = model
self.kb = None
self.cfg = dict(cfg)
self.distance = CosineDistance(normalize=False)
@@ -1483,11 +1270,6 @@ class EntityLinker(Pipe):
def set_kb(self, kb):
self.kb = kb
- def require_model(self):
- # Raise an error if the component's model is not initialized.
- if getattr(self, "model", None) in (None, True, False):
- raise ValueError(Errors.E109.format(name=self.name))
-
def require_kb(self):
# Raise an error if the knowledge base is not initialized.
if getattr(self, "kb", None) in (None, True, False):
@@ -1495,16 +1277,14 @@ class EntityLinker(Pipe):
def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs):
self.require_kb()
- self.cfg["entity_width"] = self.kb.entity_vector_length
- if self.model is True:
- self.model = self.Model(**self.cfg)
+ nO = self.kb.entity_vector_length
+ self.set_output(nO)
self.model.initialize()
if sgd is None:
sgd = self.create_optimizer()
return sgd
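The EntityLinker now takes its output width from the knowledge base at training time instead of from an "entity_width" config entry. A construction sketch; default_nel is an assumed defaults factory, nlp an existing Language object, and the vector length is illustrative:

    from spacy.kb import KnowledgeBase

    kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=64)
    entity_linker = EntityLinker(nlp.vocab, default_nel())  # default_nel: assumed, analogous to default_ner
    entity_linker.set_kb(kb)
    entity_linker.begin_training(pipeline=nlp.pipeline)     # nO is set to kb.entity_vector_length
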
def update(self, examples, state=None, set_annotations=False, drop=0.0, sgd=None, losses=None):
- self.require_model()
self.require_kb()
if losses is not None:
losses.setdefault(self.name, 0.0)
@@ -1614,7 +1394,6 @@ class EntityLinker(Pipe):
def predict(self, docs):
""" Return the KB IDs for each entity in each doc, including NIL if there is no prediction """
- self.require_model()
self.require_kb()
entity_count = 0
@@ -1714,15 +1493,12 @@ class EntityLinker(Pipe):
serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
serialize["vocab"] = lambda p: self.vocab.to_disk(p)
serialize["kb"] = lambda p: self.kb.dump(p)
- if self.model not in (None, True, False):
- serialize["model"] = lambda p: p.open("wb").write(self.model.to_bytes())
+ serialize["model"] = lambda p: p.open("wb").write(self.model.to_bytes())
exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
util.to_disk(path, serialize, exclude)
def from_disk(self, path, exclude=tuple(), **kwargs):
def load_model(p):
- if self.model is True:
- self.model = self.Model(**self.cfg)
try:
self.model.from_bytes(p.open("rb").read())
except AttributeError:
@@ -1734,8 +1510,8 @@ class EntityLinker(Pipe):
self.set_kb(kb)
deserialize = {}
- deserialize["cfg"] = lambda p: self.cfg.update(_load_cfg(p))
deserialize["vocab"] = lambda p: self.vocab.from_disk(p)
+ deserialize["cfg"] = lambda p: self.cfg.update(_load_cfg(p))
deserialize["kb"] = load_kb
deserialize["model"] = load_model
exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
@@ -1782,7 +1558,7 @@ class Sentencizer(Pipe):
self.punct_chars = set(self.default_punct_chars)
@classmethod
- def from_nlp(cls, nlp, **cfg):
+ def from_nlp(cls, nlp, model=None, **cfg):
return cls(**cfg)
def __call__(self, example):
@@ -1915,8 +1691,8 @@ class Sentencizer(Pipe):
# Cython classes can't be decorated, so we need to add the factories here
-Language.factories["parser"] = lambda nlp, **cfg: DependencyParser.from_nlp(nlp, **cfg)
-Language.factories["ner"] = lambda nlp, **cfg: EntityRecognizer.from_nlp(nlp, **cfg)
+Language.factories["parser"] = lambda nlp, model, **cfg: DependencyParser.from_nlp(nlp, model, **cfg)
+Language.factories["ner"] = lambda nlp, model, **cfg: EntityRecognizer.from_nlp(nlp, model, **cfg)
__all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "Tensorizer", "TextCategorizer", "EntityLinker", "Sentencizer", "SentenceRecognizer"]
diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py
index 8290468cf..a49f94ca3 100644
--- a/spacy/pipeline/tok2vec.py
+++ b/spacy/pipeline/tok2vec.py
@@ -5,32 +5,21 @@ from ..gold import Example
from ..tokens import Doc
from ..vocab import Vocab
from ..language import component
-from ..util import link_vectors_to_models, minibatch, registry, eg2doc
+from ..util import link_vectors_to_models, minibatch, eg2doc
@component("tok2vec", assigns=["doc.tensor"])
class Tok2Vec(Pipe):
- @classmethod
- def from_nlp(cls, nlp, **cfg):
- return cls(nlp.vocab, **cfg)
@classmethod
- def Model(cls, architecture, **cfg):
- """Create a new statistical model for the class.
+ def from_nlp(cls, nlp, model, **cfg):
+ return cls(nlp.vocab, model, **cfg)
- architecture (str): The registered model architecture to use.
- **cfg: Config parameters.
- RETURNS (Model): A `thinc.model.Model` or similar instance.
- """
- model = registry.architectures.get(architecture)
- return model(**cfg)
-
- def __init__(self, vocab, model=True, **cfg):
+ def __init__(self, vocab, model, **cfg):
"""Construct a new statistical model. Weights are not allocated on
initialisation.
vocab (Vocab): A `Vocab` instance. The model must share the same `Vocab`
instance with the `Doc` objects it will process.
- model (Model): A `Model` instance or `True` to allocate one later.
**cfg: Config parameters.
"""
self.vocab = vocab
@@ -143,8 +132,6 @@ class Tok2Vec(Pipe):
get_examples (function): Function returning example training data.
pipeline (list): The pipeline the model is part of.
"""
- if self.model is True:
- self.model = self.Model(**self.cfg)
# TODO: use examples instead ?
docs = [Doc(Vocab(), words=["hello"])]
self.model.initialize(X=docs)
diff --git a/spacy/syntax/_parser_model.pyx b/spacy/syntax/_parser_model.pyx
index 442233f19..7ff9517a5 100644
--- a/spacy/syntax/_parser_model.pyx
+++ b/spacy/syntax/_parser_model.pyx
@@ -221,7 +221,10 @@ cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) no
class ParserModel(Model):
def __init__(self, tok2vec, lower_model, upper_model, unseen_classes=None):
- Model.__init__(self, name="parser_model", forward=forward)
+ # don't define nO for this object, because we can't dynamically change it
+ Model.__init__(self, name="parser_model", forward=forward, dims={"nI": None})
+ if tok2vec.has_dim("nI"):
+ self.set_dim("nI", tok2vec.get_dim("nI"))
self._layers = [tok2vec, lower_model]
if upper_model is not None:
self._layers.append(upper_model)
@@ -229,6 +232,7 @@ class ParserModel(Model):
if unseen_classes:
for class_ in unseen_classes:
self.unseen_classes.add(class_)
+ self.set_ref("tok2vec", tok2vec)
def predict(self, docs):
step_model = ParserStepModel(docs, self._layers,
@@ -238,25 +242,32 @@ class ParserModel(Model):
def resize_output(self, new_nO):
if len(self._layers) == 2:
return
- if new_nO == self.upper.get_dim("nO"):
+ if self.upper.has_dim("nO") and (new_nO == self.upper.get_dim("nO")):
return
smaller = self.upper
- nI = smaller.get_dim("nI")
+ nI = None
+ if smaller.has_dim("nI"):
+ nI = smaller.get_dim("nI")
with use_ops('numpy'):
- larger = Linear(new_nO, nI)
- larger_W = larger.ops.alloc2f(new_nO, nI)
- larger_b = larger.ops.alloc1f(new_nO)
- smaller_W = smaller.get_param("W")
- smaller_b = smaller.get_param("b")
- # Weights are stored in (nr_out, nr_in) format, so we're basically
- # just adding rows here.
- larger_W[:smaller.get_dim("nO")] = smaller_W
- larger_b[:smaller.get_dim("nO")] = smaller_b
- larger.set_param("W", larger_W)
- larger.set_param("b", larger_b)
+ larger = Linear(nO=new_nO, nI=nI)
+ larger._init = smaller._init
+ # the model may not be initialized yet; if so, skip this bit
+ if nI:
+ larger_W = larger.ops.alloc2f(new_nO, nI)
+ larger_b = larger.ops.alloc1f(new_nO)
+ smaller_W = smaller.get_param("W")
+ smaller_b = smaller.get_param("b")
+ # Weights are stored in (nr_out, nr_in) format, so we're basically
+ # just adding rows here.
+ if smaller.has_dim("nO"):
+ larger_W[:smaller.get_dim("nO")] = smaller_W
+ larger_b[:smaller.get_dim("nO")] = smaller_b
+ for i in range(smaller.get_dim("nO"), new_nO):
+ self.unseen_classes.add(i)
+
+ larger.set_param("W", larger_W)
+ larger.set_param("b", larger_b)
self._layers[-1] = larger
- for i in range(smaller.get_dim("nO"), new_nO):
- self.unseen_classes.add(i)
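In schematic form, the rewritten resize_output keeps the trained rows and flags only the added classes as unseen:

    # upper before: Linear with W of shape (old_nO, nI)
    # resize_output(new_nO), when the layer is already initialized:
    #     larger_W[:old_nO] = smaller_W      # existing class weights are preserved
    #     larger_b[:old_nO] = smaller_b
    #     unseen_classes |= {old_nO, ..., new_nO - 1}
    # upper after: Linear with W of shape (new_nO, nI)
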
def initialize(self, X=None, Y=None):
self.tok2vec.initialize()
@@ -412,7 +423,7 @@ cdef class precompute_hiddens:
we can do all our hard maths up front, packed into large multiplications,
and do the hard-to-program parsing on the CPU.
"""
- cdef readonly int nF, nO, nP # TODO: make these more like the dimensions in thinc
+ cdef readonly int nF, nO, nP
cdef bint _is_synchronized
cdef public object ops
cdef np.ndarray _features
@@ -458,6 +469,16 @@ cdef class precompute_hiddens:
self._is_synchronized = True
return self._cached.data
+ def has_dim(self, name):
+ if name == "nF":
+ return self.nF if self.nF is not None else True
+ elif name == "nP":
+ return self.nP if self.nP is not None else True
+ elif name == "nO":
+ return self.nO if self.nO is not None else True
+ else:
+ return False
+
def get_dim(self, name):
if name == "nF":
return self.nF
@@ -468,6 +489,16 @@ cdef class precompute_hiddens:
else:
raise ValueError(f"Dimension {name} invalid -- only nO, nF, nP")
+ def set_dim(self, name, value):
+ if name == "nF":
+ self.nF = value
+ elif name == "nP":
+ self.nP = value
+ elif name == "nO":
+ self.nO = value
+ else:
+ raise ValueError(f"Dimension {name} invalid -- only nO, nF, nP")
+
def __call__(self, X, bint is_train):
if is_train:
return self.begin_update(X)
diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index cf57e1cf6..9381fab6b 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -27,11 +27,11 @@ from ._parser_model cimport predict_states, arg_max_if_valid
from ._parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss
from ._parser_model cimport get_c_weights, get_c_sizes
from ._parser_model import ParserModel
-from ..util import link_vectors_to_models, create_default_optimizer
+from ..util import link_vectors_to_models, create_default_optimizer, registry
from ..compat import copy_array
from ..tokens.doc cimport Doc
from ..gold cimport GoldParse
-from ..errors import Errors, TempErrors
+from ..errors import Errors, user_warning, Warnings
from .. import util
from .stateclass cimport StateClass
from ._state cimport StateC
@@ -41,114 +41,42 @@ from . import _beam_utils
from . import nonproj
-from ..ml._layers import PrecomputableAffine
-from ..ml.component_models import Tok2Vec
-
-
cdef class Parser:
"""
Base class of the DependencyParser and EntityRecognizer.
"""
- @classmethod
- def Model(cls, nr_class, **cfg):
- depth = util.env_opt('parser_hidden_depth', cfg.get('hidden_depth', 1))
- subword_features = util.env_opt('subword_features',
- cfg.get('subword_features', True))
- conv_depth = util.env_opt('conv_depth', cfg.get('conv_depth', 4))
- conv_window = util.env_opt('conv_window', cfg.get('conv_window', 1))
- t2v_pieces = util.env_opt('cnn_maxout_pieces', cfg.get('cnn_maxout_pieces', 3))
- bilstm_depth = util.env_opt('bilstm_depth', cfg.get('bilstm_depth', 0))
- self_attn_depth = util.env_opt('self_attn_depth', cfg.get('self_attn_depth', 0))
- nr_feature_tokens = cfg.get("nr_feature_tokens", cls.nr_feature)
- if depth not in (0, 1):
- raise ValueError(TempErrors.T004.format(value=depth))
- parser_maxout_pieces = util.env_opt('parser_maxout_pieces',
- cfg.get('maxout_pieces', 2))
- token_vector_width = util.env_opt('token_vector_width',
- cfg.get('token_vector_width', 96))
- hidden_width = util.env_opt('hidden_width', cfg.get('hidden_width', 64))
- if depth == 0:
- hidden_width = nr_class
- parser_maxout_pieces = 1
- embed_size = util.env_opt('embed_size', cfg.get('embed_size', 2000))
- pretrained_vectors = cfg.get('pretrained_vectors', None)
- tok2vec = Tok2Vec(width=token_vector_width,
- embed_size=embed_size,
- conv_depth=conv_depth,
- window_size=conv_window,
- cnn_maxout_pieces=t2v_pieces,
- subword_features=subword_features,
- pretrained_vectors=pretrained_vectors,
- bilstm_depth=bilstm_depth)
- tok2vec = chain(tok2vec, list2array())
- tok2vec.set_dim("nO", token_vector_width)
- lower = PrecomputableAffine(hidden_width,
- nF=nr_feature_tokens, nI=token_vector_width,
- nP=parser_maxout_pieces)
- lower.set_dim("nP", parser_maxout_pieces)
- if depth == 1:
- with use_ops('numpy'):
- upper = Linear(nr_class, hidden_width, init_W=zero_init)
- else:
- upper = None
-
- cfg = {
- 'nr_class': nr_class,
- 'nr_feature_tokens': nr_feature_tokens,
- 'hidden_depth': depth,
- 'token_vector_width': token_vector_width,
- 'hidden_width': hidden_width,
- 'maxout_pieces': parser_maxout_pieces,
- 'pretrained_vectors': pretrained_vectors,
- 'bilstm_depth': bilstm_depth,
- 'self_attn_depth': self_attn_depth,
- 'conv_depth': conv_depth,
- 'window_size': conv_window,
- 'embed_size': embed_size,
- 'cnn_maxout_pieces': t2v_pieces
- }
- model = ParserModel(tok2vec, lower, upper)
- model.initialize()
- return model, cfg
-
name = 'base_parser'
- def __init__(self, Vocab vocab, moves=True, model=True, **cfg):
+
+ def __init__(self, Vocab vocab, model, **cfg):
"""Create a Parser.
vocab (Vocab): The vocabulary object. Must be shared with documents
to be processed. The value is set to the `.vocab` attribute.
- moves (TransitionSystem): Defines how the parse-state is created,
- updated and evaluated. The value is set to the .moves attribute
- unless True (default), in which case a new instance is created with
- `Parser.Moves()`.
- model (object): Defines how the parse-state is created, updated and
- evaluated. The value is set to the .model attribute. If set to True
- (default), a new instance will be created with `Parser.Model()`
- in parser.begin_training(), parser.from_disk() or parser.from_bytes().
- **cfg: Arbitrary configuration parameters. Set to the `.cfg` attribute
+ **cfg: Configuration parameters. Set to the `.cfg` attribute.
+ If it doesn't include a value for 'moves', a new instance is
+ created with `self.TransitionSystem()`. This defines how the
+ parse-state is created, updated and evaluated.
"""
self.vocab = vocab
- if moves is True:
- self.moves = self.TransitionSystem(self.vocab.strings)
- else:
- self.moves = moves
- if 'beam_width' not in cfg:
- cfg['beam_width'] = util.env_opt('beam_width', 1)
- if 'beam_density' not in cfg:
- cfg['beam_density'] = util.env_opt('beam_density', 0.0)
- if 'beam_update_prob' not in cfg:
- cfg['beam_update_prob'] = util.env_opt('beam_update_prob', 1.0)
- cfg.setdefault('cnn_maxout_pieces', 3)
- cfg.setdefault("nr_feature_tokens", self.nr_feature)
- self.cfg = cfg
+ moves = cfg.get("moves", None)
+ if moves is None:
+ # defined by EntityRecognizer as a BiluoPushDown
+ moves = self.TransitionSystem(self.vocab.strings)
+ self.moves = moves
+ cfg.setdefault('min_action_freq', 30)
+ cfg.setdefault('learn_tokens', False)
+ cfg.setdefault('beam_width', 1)
+ cfg.setdefault('beam_update_prob', 1.0) # or 0.5 (both defaults were previously used)
self.model = model
+ self.set_output(self.moves.n_moves)
+ self.cfg = cfg
self._multitasks = []
self._rehearsal_model = None
@classmethod
- def from_nlp(cls, nlp, **cfg):
- return cls(nlp.vocab, **cfg)
+ def from_nlp(cls, nlp, model, **cfg):
+ return cls(nlp.vocab, model, **cfg)
def __reduce__(self):
return (Parser, (self.vocab, self.moves, self.model), None, None)
@@ -163,8 +91,6 @@ cdef class Parser:
names.append(name)
return names
- nr_feature = 8
-
@property
def labels(self):
class_names = [self.moves.get_class_name(i) for i in range(self.moves.n_moves)]
@@ -173,7 +99,7 @@ cdef class Parser:
@property
def tok2vec(self):
'''Return the embedding and convolutional layer of the model.'''
- return None if self.model in (None, True, False) else self.model.tok2vec
+ return self.model.tok2vec
@property
def postprocesses(self):
@@ -190,10 +116,7 @@ cdef class Parser:
self._resize()
def _resize(self):
- if "nr_class" in self.cfg:
- self.cfg["nr_class"] = self.moves.n_moves
- if self.model not in (True, False, None):
- self.model.resize_output(self.moves.n_moves)
+ self.model.resize_output(self.moves.n_moves)
if self._rehearsal_model not in (True, False, None):
self._rehearsal_model.resize_output(self.moves.n_moves)
@@ -227,7 +150,7 @@ cdef class Parser:
doc (Doc): The document to be processed.
"""
if beam_width is None:
- beam_width = self.cfg.get('beam_width', 1)
+ beam_width = self.cfg['beam_width']
beam_density = self.cfg.get('beam_density', 0.)
states = self.predict([doc], beam_width=beam_width,
beam_density=beam_density)
@@ -243,7 +166,7 @@ cdef class Parser:
YIELDS (Doc): Documents, in order.
"""
if beam_width is None:
- beam_width = self.cfg.get('beam_width', 1)
+ beam_width = self.cfg['beam_width']
beam_density = self.cfg.get('beam_density', 0.)
cdef Doc doc
for batch in util.minibatch(docs, size=batch_size):
@@ -264,13 +187,7 @@ cdef class Parser:
else:
yield from batch_in_order
- def require_model(self):
- """Raise an error if the component's model is not initialized."""
- if getattr(self, 'model', None) in (None, True, False):
- raise ValueError(Errors.E109.format(name=self.name))
-
def predict(self, docs, beam_width=1, beam_density=0.0, drop=0.):
- self.require_model()
if isinstance(docs, Doc):
docs = [docs]
if not any(len(doc) for doc in docs):
@@ -313,11 +230,11 @@ cdef class Parser:
# if labels are missing. We therefore have to check whether we need to
# expand our model output.
self._resize()
+ cdef int nr_feature = self.model.lower.get_dim("nF")
model = self.model.predict(docs)
- token_ids = numpy.zeros((len(docs) * beam_width, self.nr_feature),
+ token_ids = numpy.zeros((len(docs) * beam_width, nr_feature),
dtype='i', order='C')
cdef int* c_ids
- cdef int nr_feature = self.cfg["nr_feature_tokens"]
cdef int n_states
model = self.model.predict(docs)
todo = [beam for beam in beams if not beam.is_done]
@@ -430,7 +347,6 @@ cdef class Parser:
return [b for b in beams if not b.is_done]
def update(self, examples, drop=0., set_annotations=False, sgd=None, losses=None):
- self.require_model()
examples = Example.to_example_objects(examples)
if losses is None:
@@ -440,9 +356,9 @@ cdef class Parser:
multitask.update(examples, drop=drop, sgd=sgd)
# The probability we use beam update, instead of falling back to
# a greedy update
- beam_update_prob = self.cfg.get('beam_update_prob', 0.5)
- if self.cfg.get('beam_width', 1) >= 2 and numpy.random.random() < beam_update_prob:
- return self.update_beam(examples, self.cfg.get('beam_width', 1),
+ beam_update_prob = self.cfg['beam_update_prob']
+ if self.cfg['beam_width'] >= 2 and numpy.random.random() < beam_update_prob:
+ return self.update_beam(examples, self.cfg['beam_width'],
drop=drop, sgd=sgd, losses=losses, set_annotations=set_annotations,
beam_density=self.cfg.get('beam_density', 0.001))
@@ -533,7 +449,7 @@ cdef class Parser:
set_dropout_rate(self.model, drop)
model, backprop_tok2vec = self.model.begin_update(docs)
states_d_scores, backprops, beams = _beam_utils.update_beam(
- self.moves, self.cfg["nr_feature_tokens"], 10000, states, golds,
+ self.moves, self.model.lower.get_dim("nF"), 10000, states, golds,
model.state2vec, model.vec2scores, width, losses=losses,
beam_density=beam_density)
for i, d_scores in enumerate(states_d_scores):
@@ -562,8 +478,6 @@ cdef class Parser:
keyed by the parameter ID. The values are (weights, gradients) tuples.
"""
gradients = {}
- if self.model in (None, True, False):
- return gradients
queue = [self.model]
seen = set()
for node in queue:
@@ -647,45 +561,40 @@ cdef class Parser:
def create_optimizer(self):
return create_default_optimizer()
- def begin_training(self, get_examples, pipeline=None, sgd=None, **cfg):
- if 'model' in cfg:
- self.model = cfg['model']
+ def set_output(self, nO):
+ if self.model.upper.has_dim("nO") is None:
+ self.model.upper.set_dim("nO", nO)
+
+ def begin_training(self, get_examples, pipeline=None, sgd=None, **kwargs):
+ self.cfg.update(kwargs)
if not hasattr(get_examples, '__call__'):
gold_tuples = get_examples
get_examples = lambda: gold_tuples
- cfg.setdefault('min_action_freq', 30)
actions = self.moves.get_actions(gold_parses=get_examples(),
- min_freq=cfg.get('min_action_freq', 30),
- learn_tokens=self.cfg.get("learn_tokens", False))
+ min_freq=self.cfg['min_action_freq'],
+ learn_tokens=self.cfg["learn_tokens"])
for action, labels in self.moves.labels.items():
actions.setdefault(action, {})
for label, freq in labels.items():
if label not in actions[action]:
actions[action][label] = freq
self.moves.initialize_actions(actions)
- cfg.setdefault('token_vector_width', 96)
- if self.model is True:
- self.model, cfg = self.Model(self.moves.n_moves, **cfg)
- if sgd is None:
- sgd = self.create_optimizer()
- doc_sample = []
- gold_sample = []
- for example in islice(get_examples(), 1000):
- parses = example.get_gold_parses(merge=False, vocab=self.vocab)
- for doc, gold in parses:
- doc_sample.append(doc)
- gold_sample.append(gold)
- self.model.initialize(doc_sample, gold_sample)
- if pipeline is not None:
- self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **cfg)
- link_vectors_to_models(self.vocab)
- else:
- if sgd is None:
- sgd = self.create_optimizer()
- if self.model.upper.has_dim("nO") is None:
- self.model.upper.set_dim("nO", self.moves.n_moves)
- self.model.initialize()
- self.cfg.update(cfg)
+ # make sure we resize so we have an appropriate upper layer
+ self._resize()
+ if sgd is None:
+ sgd = self.create_optimizer()
+ doc_sample = []
+ gold_sample = []
+ for example in islice(get_examples(), 1000):
+ parses = example.get_gold_parses(merge=False, vocab=self.vocab)
+ for doc, gold in parses:
+ doc_sample.append(doc)
+ gold_sample.append(gold)
+
+ self.model.initialize(doc_sample, gold_sample)
+ if pipeline is not None:
+ self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **self.cfg)
+ link_vectors_to_models(self.vocab)
return sgd
def _get_doc(self, example):
@@ -709,28 +618,24 @@ cdef class Parser:
'vocab': lambda p: self.vocab.from_disk(p),
'moves': lambda p: self.moves.from_disk(p, exclude=["strings"]),
'cfg': lambda p: self.cfg.update(srsly.read_json(p)),
- 'model': lambda p: None
+ 'model': lambda p: None,
}
exclude = util.get_serialization_exclude(deserializers, exclude, kwargs)
util.from_disk(path, deserializers, exclude)
if 'model' not in exclude:
path = util.ensure_path(path)
- if self.model is True:
- self.model, cfg = self.Model(**self.cfg)
- else:
- cfg = {}
with (path / 'model').open('rb') as file_:
bytes_data = file_.read()
try:
+ self._resize()
self.model.from_bytes(bytes_data)
except AttributeError:
raise ValueError(Errors.E149)
- self.cfg.update(cfg)
return self
def to_bytes(self, exclude=tuple(), **kwargs):
serializers = {
- "model": lambda: (self.model.to_bytes() if self.model is not True else True),
+ "model": lambda: (self.model.to_bytes()),
"vocab": lambda: self.vocab.to_bytes(),
"moves": lambda: self.moves.to_bytes(exclude=["strings"]),
"cfg": lambda: srsly.json_dumps(self.cfg, indent=2, sort_keys=True)
@@ -743,22 +648,14 @@ cdef class Parser:
"vocab": lambda b: self.vocab.from_bytes(b),
"moves": lambda b: self.moves.from_bytes(b, exclude=["strings"]),
"cfg": lambda b: self.cfg.update(srsly.json_loads(b)),
- "model": lambda b: None
+ "model": lambda b: None,
}
exclude = util.get_serialization_exclude(deserializers, exclude, kwargs)
msg = util.from_bytes(bytes_data, deserializers, exclude)
if 'model' not in exclude:
- # TODO: Remove this once we don't have to handle previous models
- if self.cfg.get('pretrained_dims') and 'pretrained_vectors' not in self.cfg:
- self.cfg['pretrained_vectors'] = self.vocab.vectors
- if self.model is True:
- self.model, cfg = self.Model(**self.cfg)
- else:
- cfg = {}
if 'model' in msg:
try:
self.model.from_bytes(msg['model'])
except AttributeError:
raise ValueError(Errors.E149)
- self.cfg.update(cfg)
return self
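Editor's note: the nn_parser.pyx hunks above drop the old `self.model is True` placeholder. The parser now always owns a concrete Thinc model, which is resized via `_resize()` and initialized in `begin_training` from a sample of up to 1000 examples. A minimal usage sketch of the new constructor signature, assuming the `default_ner` helper that the tests below import:

    from spacy.vocab import Vocab
    from spacy.pipeline import EntityRecognizer
    from spacy.ml.models.defaults import default_ner

    # The model is passed in explicitly and is never a `True` placeholder anymore.
    ner = EntityRecognizer(Vocab(), default_ner())
    ner.add_label("PERSON")
    ner.begin_training([])  # resizes the upper layer and initializes the weights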
diff --git a/spacy/tests/doc/test_add_entities.py b/spacy/tests/doc/test_add_entities.py
index 766dcb739..3a466b24c 100644
--- a/spacy/tests/doc/test_add_entities.py
+++ b/spacy/tests/doc/test_add_entities.py
@@ -3,12 +3,13 @@ from spacy.tokens import Span
import pytest
from ..util import get_doc
+from ...ml.models.defaults import default_ner
def test_doc_add_entities_set_ents_iob(en_vocab):
text = ["This", "is", "a", "lion"]
doc = get_doc(en_vocab, text)
- ner = EntityRecognizer(en_vocab)
+ ner = EntityRecognizer(en_vocab, default_ner())
ner.begin_training([])
ner(doc)
assert len(list(doc.ents)) == 0
@@ -24,7 +25,7 @@ def test_doc_add_entities_set_ents_iob(en_vocab):
def test_ents_reset(en_vocab):
text = ["This", "is", "a", "lion"]
doc = get_doc(en_vocab, text)
- ner = EntityRecognizer(en_vocab)
+ ner = EntityRecognizer(en_vocab, default_ner())
ner.begin_training([])
ner(doc)
assert [t.ent_iob_ for t in doc] == (["O"] * len(doc))
diff --git a/spacy/tests/parser/test_add_label.py b/spacy/tests/parser/test_add_label.py
index fe847a6ae..5af772ddc 100644
--- a/spacy/tests/parser/test_add_label.py
+++ b/spacy/tests/parser/test_add_label.py
@@ -3,6 +3,8 @@ from thinc.api import Adam, NumpyOps
from spacy.attrs import NORM
from spacy.gold import GoldParse
from spacy.vocab import Vocab
+
+from spacy.ml.models.defaults import default_parser, default_ner
from spacy.tokens import Doc
from spacy.pipeline import DependencyParser, EntityRecognizer
from spacy.util import fix_random_seed
@@ -15,7 +17,7 @@ def vocab():
@pytest.fixture
def parser(vocab):
- parser = DependencyParser(vocab)
+ parser = DependencyParser(vocab, default_parser())
return parser
@@ -55,27 +57,31 @@ def test_add_label(parser):
def test_add_label_deserializes_correctly():
- ner1 = EntityRecognizer(Vocab())
+ ner1 = EntityRecognizer(Vocab(), default_ner())
ner1.add_label("C")
ner1.add_label("B")
ner1.add_label("A")
ner1.begin_training([])
- ner2 = EntityRecognizer(Vocab()).from_bytes(ner1.to_bytes())
+ ner2 = EntityRecognizer(Vocab(), default_ner())
+
+ # the second model needs to be resized before we can call from_bytes
+ ner2.model.resize_output(ner1.moves.n_moves)
+ ner2.from_bytes(ner1.to_bytes())
assert ner1.moves.n_moves == ner2.moves.n_moves
for i in range(ner1.moves.n_moves):
assert ner1.moves.get_class_name(i) == ner2.moves.get_class_name(i)
@pytest.mark.parametrize(
- "pipe_cls,n_moves", [(DependencyParser, 5), (EntityRecognizer, 4)]
+ "pipe_cls,n_moves,model", [(DependencyParser, 5, default_parser()), (EntityRecognizer, 4, default_ner())]
)
-def test_add_label_get_label(pipe_cls, n_moves):
+def test_add_label_get_label(pipe_cls, n_moves, model):
"""Test that added labels are returned correctly. This test was added to
test for a bug in DependencyParser.labels that'd cause it to fail when
splitting the move names.
"""
labels = ["A", "B", "C"]
- pipe = pipe_cls(Vocab())
+ pipe = pipe_cls(Vocab(), model)
for label in labels:
pipe.add_label(label)
assert len(pipe.move_names) == len(labels) * n_moves
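Editor's note: as the updated deserialization test spells out, a freshly constructed pipe no longer grows its output layer implicitly when loading weights. A hedged sketch of the required pattern, assuming `source` is a trained EntityRecognizer like `ner1` above:

    from spacy.vocab import Vocab
    from spacy.pipeline import EntityRecognizer
    from spacy.ml.models.defaults import default_ner

    target = EntityRecognizer(Vocab(), default_ner())
    # the output width must match the source's number of moves before from_bytes
    target.model.resize_output(source.moves.n_moves)
    target.from_bytes(source.to_bytes())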
diff --git a/spacy/tests/parser/test_arc_eager_oracle.py b/spacy/tests/parser/test_arc_eager_oracle.py
index dd593f7d3..2426805d2 100644
--- a/spacy/tests/parser/test_arc_eager_oracle.py
+++ b/spacy/tests/parser/test_arc_eager_oracle.py
@@ -1,5 +1,7 @@
import pytest
from spacy.vocab import Vocab
+
+from spacy.ml.models.defaults import default_parser
from spacy.pipeline import DependencyParser
from spacy.tokens import Doc
from spacy.gold import GoldParse
@@ -136,7 +138,7 @@ def test_get_oracle_actions():
deps.append(dep)
ents.append(ent)
doc = Doc(Vocab(), words=[t[1] for t in annot_tuples])
- parser = DependencyParser(doc.vocab)
+ parser = DependencyParser(doc.vocab, default_parser())
parser.moves.add_action(0, "")
parser.moves.add_action(1, "")
parser.moves.add_action(1, "")
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index 9a4d21a8d..3fde75eb5 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -1,10 +1,15 @@
import pytest
+
+from spacy import util
from spacy.lang.en import English
+from spacy.ml.models.defaults import default_ner
from spacy.pipeline import EntityRecognizer, EntityRuler
from spacy.vocab import Vocab
from spacy.syntax.ner import BiluoPushDown
from spacy.gold import GoldParse
+
+from spacy.tests.util import make_tempdir
from spacy.tokens import Doc
TRAIN_DATA = [
@@ -134,7 +139,7 @@ def test_accept_blocked_token():
# 1. test normal behaviour
nlp1 = English()
doc1 = nlp1("I live in New York")
- ner1 = EntityRecognizer(doc1.vocab)
+ ner1 = EntityRecognizer(doc1.vocab, default_ner())
assert [token.ent_iob_ for token in doc1] == ["", "", "", "", ""]
assert [token.ent_type_ for token in doc1] == ["", "", "", "", ""]
@@ -152,7 +157,7 @@ def test_accept_blocked_token():
# 2. test blocking behaviour
nlp2 = English()
doc2 = nlp2("I live in New York")
- ner2 = EntityRecognizer(doc2.vocab)
+ ner2 = EntityRecognizer(doc2.vocab, default_ner())
# set "New York" to a blocked entity
doc2.ents = [(0, 3, 5)]
@@ -188,7 +193,7 @@ def test_overwrite_token():
assert [token.ent_type_ for token in doc] == ["", "", "", "", ""]
# Check that a new ner can overwrite O
- ner2 = EntityRecognizer(doc.vocab)
+ ner2 = EntityRecognizer(doc.vocab, default_ner())
ner2.moves.add_action(5, "")
ner2.add_label("GPE")
state = ner2.moves.init_batch([doc])[0]
@@ -199,6 +204,17 @@ def test_overwrite_token():
assert ner2.moves.is_valid(state, "L-GPE")
+def test_empty_ner():
+ nlp = English()
+ ner = nlp.create_pipe("ner")
+ ner.add_label("MY_LABEL")
+ nlp.add_pipe(ner)
+ nlp.begin_training()
+ doc = nlp("John is watching the news about Croatia's elections")
+ # if this goes wrong, the initialization of the parser's upper layer is probably broken
+ assert [token.ent_iob_ for token in doc] == ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+
+
def test_ruler_before_ner():
""" Test that an NER works after an entity_ruler: the second can add annotations """
nlp = English()
@@ -214,7 +230,6 @@ def test_ruler_before_ner():
untrained_ner.add_label("MY_LABEL")
nlp.add_pipe(untrained_ner)
nlp.begin_training()
-
doc = nlp("This is Antti Korhonen speaking in Finland")
expected_iobs = ["B", "O", "O", "O", "O", "O", "O"]
expected_types = ["THING", "", "", "", "", "", ""]
@@ -261,28 +276,7 @@ def test_block_ner():
assert [token.ent_type_ for token in doc] == expected_types
-def test_change_number_features():
- # Test the default number features
- nlp = English()
- ner = nlp.create_pipe("ner")
- nlp.add_pipe(ner)
- ner.add_label("PERSON")
- nlp.begin_training()
- assert ner.model.lower.get_dim("nF") == ner.nr_feature
- # Test we can change it
- nlp = English()
- ner = nlp.create_pipe("ner")
- nlp.add_pipe(ner)
- ner.add_label("PERSON")
- nlp.begin_training(
- component_cfg={"ner": {"nr_feature_tokens": 3, "token_vector_width": 128}}
- )
- assert ner.model.lower.get_dim("nF") == 3
- # Test the model runs
- nlp("hello world")
-
-
-def test_overfitting():
+def test_overfitting_IO():
# Simple test to try and quickly overfit the NER component - ensuring the ML models work correctly
nlp = English()
ner = nlp.create_pipe("ner")
@@ -301,11 +295,20 @@ def test_overfitting():
test_text = "I like London."
doc = nlp(test_text)
ents = doc.ents
-
assert len(ents) == 1
assert ents[0].text == "London"
assert ents[0].label_ == "LOC"
+ # Also test the results are still the same after IO
+ with make_tempdir() as tmp_dir:
+ nlp.to_disk(tmp_dir)
+ nlp2 = util.load_model_from_path(tmp_dir)
+ doc2 = nlp2(test_text)
+ ents2 = doc2.ents
+ assert len(ents2) == 1
+ assert ents2[0].text == "London"
+ assert ents2[0].label_ == "LOC"
+
class BlockerComponent1(object):
name = "my_blocker"
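Editor's note: the renamed `test_overfitting_IO` tests all share the same disk round-trip check. A minimal sketch of that pattern, assuming `nlp` is an already trained pipeline and `test_text` is any input string:

    from spacy import util
    from spacy.tests.util import make_tempdir

    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        # goes through the new config-aware loader in spacy/util.py
        nlp2 = util.load_model_from_path(tmp_dir)
        # predictions from the reloaded pipeline should match the original
        assert [t.ent_iob_ for t in nlp2(test_text)] == [t.ent_iob_ for t in nlp(test_text)]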
diff --git a/spacy/tests/parser/test_neural_parser.py b/spacy/tests/parser/test_neural_parser.py
index 2470982d3..984af4d6b 100644
--- a/spacy/tests/parser/test_neural_parser.py
+++ b/spacy/tests/parser/test_neural_parser.py
@@ -1,8 +1,9 @@
import pytest
-from spacy.ml.component_models import Tok2Vec
+from spacy.ml.models.defaults import default_parser, default_tok2vec
from spacy.vocab import Vocab
from spacy.syntax.arc_eager import ArcEager
from spacy.syntax.nn_parser import Parser
+from spacy.syntax._parser_model import ParserModel
from spacy.tokens.doc import Doc
from spacy.gold import GoldParse
@@ -20,19 +21,22 @@ def arc_eager(vocab):
@pytest.fixture
def tok2vec():
- tok2vec = Tok2Vec(8, 100)
+ tok2vec = default_tok2vec()
tok2vec.initialize()
return tok2vec
@pytest.fixture
def parser(vocab, arc_eager):
- return Parser(vocab, moves=arc_eager, model=None)
+ return Parser(vocab, model=default_parser(), moves=arc_eager)
@pytest.fixture
-def model(arc_eager, tok2vec):
- return Parser.Model(arc_eager.n_moves, token_vector_width=tok2vec.get_dim("nO"))[0]
+def model(arc_eager, tok2vec, vocab):
+ model = default_parser()
+ model.resize_output(arc_eager.n_moves)
+ model.initialize()
+ return model
@pytest.fixture
@@ -46,11 +50,11 @@ def gold(doc):
def test_can_init_nn_parser(parser):
- assert parser.model is None
+ assert isinstance(parser.model, ParserModel)
-def test_build_model(parser):
- parser.model = Parser.Model(parser.moves.n_moves, hist_size=0)[0]
+def test_build_model(parser, vocab):
+ parser.model = Parser(vocab, model=default_parser(), moves=parser.moves).model
assert parser.model is not None
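Editor's note: the fixtures now build a concrete model instead of passing `model=None`. Standalone, that looks roughly like the sketch below; the output size of 10 is a hypothetical stand-in for the transition system's number of moves:

    from spacy.ml.models.defaults import default_parser

    model = default_parser()
    model.resize_output(10)  # e.g. arc_eager.n_moves
    model.initialize()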
diff --git a/spacy/tests/parser/test_nn_beam.py b/spacy/tests/parser/test_nn_beam.py
index 24997e47c..619e0cc0b 100644
--- a/spacy/tests/parser/test_nn_beam.py
+++ b/spacy/tests/parser/test_nn_beam.py
@@ -2,6 +2,7 @@ import pytest
import numpy
from spacy.vocab import Vocab
from spacy.language import Language
+from spacy.ml.models.defaults import default_parser
from spacy.pipeline import DependencyParser
from spacy.syntax.arc_eager import ArcEager
from spacy.tokens import Doc
@@ -93,7 +94,7 @@ def test_beam_advance_too_few_scores(beam, scores):
def test_beam_parse():
nlp = Language()
- nlp.add_pipe(DependencyParser(nlp.vocab), name="parser")
+ nlp.add_pipe(DependencyParser(nlp.vocab, default_parser()), name="parser")
nlp.parser.add_label("nsubj")
nlp.parser.begin_training([], token_vector_width=8, hidden_width=8)
doc = nlp.make_doc("Australia is a country")
diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py
index 1d3f522c9..6e13d3044 100644
--- a/spacy/tests/parser/test_parse.py
+++ b/spacy/tests/parser/test_parse.py
@@ -1,7 +1,8 @@
import pytest
from spacy.lang.en import English
-from ..util import get_doc, apply_transition_sequence
+from ..util import get_doc, apply_transition_sequence, make_tempdir
+from ... import util
TRAIN_DATA = [
(
@@ -182,7 +183,7 @@ def test_parser_set_sent_starts(en_vocab):
assert token.head in sent
-def test_overfitting():
+def test_overfitting_IO():
# Simple test to try and quickly overfit the dependency parser - ensuring the ML models work correctly
nlp = English()
parser = nlp.create_pipe("parser")
@@ -200,7 +201,15 @@ def test_overfitting():
# test the trained model
test_text = "I like securities."
doc = nlp(test_text)
-
assert doc[0].dep_ is "nsubj"
assert doc[2].dep_ is "dobj"
assert doc[3].dep_ is "punct"
+
+ # Also test the results are still the same after IO
+ with make_tempdir() as tmp_dir:
+ nlp.to_disk(tmp_dir)
+ nlp2 = util.load_model_from_path(tmp_dir)
+ doc2 = nlp2(test_text)
+ assert doc2[0].dep_ is "nsubj"
+ assert doc2[2].dep_ is "dobj"
+ assert doc2[3].dep_ is "punct"
diff --git a/spacy/tests/parser/test_preset_sbd.py b/spacy/tests/parser/test_preset_sbd.py
index c6c1240a8..af777aa6b 100644
--- a/spacy/tests/parser/test_preset_sbd.py
+++ b/spacy/tests/parser/test_preset_sbd.py
@@ -3,6 +3,8 @@ from thinc.api import Adam
from spacy.attrs import NORM
from spacy.gold import GoldParse
from spacy.vocab import Vocab
+
+from spacy.ml.models.defaults import default_parser
from spacy.tokens import Doc
from spacy.pipeline import DependencyParser
@@ -14,7 +16,7 @@ def vocab():
@pytest.fixture
def parser(vocab):
- parser = DependencyParser(vocab)
+ parser = DependencyParser(vocab, default_parser())
parser.cfg["token_vector_width"] = 4
parser.cfg["hidden_width"] = 32
# parser.add_label('right')
diff --git a/spacy/tests/pipeline/test_analysis.py b/spacy/tests/pipeline/test_analysis.py
index 5c246538c..cda39f6ee 100644
--- a/spacy/tests/pipeline/test_analysis.py
+++ b/spacy/tests/pipeline/test_analysis.py
@@ -111,7 +111,8 @@ def test_component_factories_from_nlp():
nlp.add_pipe(pipe)
assert nlp("hello world")
# The first argument here is the class itself, so we're accepting any here
- mock.assert_called_once_with(ANY, nlp, foo="bar")
+ # The model will be initialized to None by the factory
+ mock.assert_called_once_with(ANY, nlp, None, foo="bar")
def test_analysis_validate_attrs_valid():
diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py
index 366cd4f1a..a90207a78 100644
--- a/spacy/tests/pipeline/test_tagger.py
+++ b/spacy/tests/pipeline/test_tagger.py
@@ -1,5 +1,9 @@
import pytest
+
+from spacy import util
+from spacy.lang.en import English
from spacy.language import Language
+from spacy.tests.util import make_tempdir
def test_label_types():
@@ -18,9 +22,9 @@ TRAIN_DATA = [
]
-def test_overfitting():
+def test_overfitting_IO():
# Simple test to try and quickly overfit the tagger - ensuring the ML models work correctly
- nlp = Language()
+ nlp = English()
tagger = nlp.create_pipe("tagger")
for tag, values in TAG_MAP.items():
tagger.add_label(tag, values)
@@ -35,8 +39,17 @@ def test_overfitting():
# test the trained model
test_text = "I like blue eggs"
doc = nlp(test_text)
-
assert doc[0].tag_ is "N"
assert doc[1].tag_ is "V"
assert doc[2].tag_ is "J"
assert doc[3].tag_ is "N"
+
+ # Also test the results are still the same after IO
+ with make_tempdir() as tmp_dir:
+ nlp.to_disk(tmp_dir)
+ nlp2 = util.load_model_from_path(tmp_dir)
+ doc2 = nlp2(test_text)
+ assert doc2[0].tag_ is "N"
+ assert doc2[1].tag_ is "V"
+ assert doc2[2].tag_ is "J"
+ assert doc2[3].tag_ is "N"
diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
index 558d09e40..1b5ca9a4c 100644
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@@ -1,8 +1,12 @@
import pytest
import random
import numpy.random
+
+from spacy import util
+from spacy.lang.en import English
from spacy.language import Language
from spacy.pipeline import TextCategorizer
+from spacy.tests.util import make_tempdir
from spacy.tokens import Doc
from spacy.gold import GoldParse
@@ -74,9 +78,9 @@ def test_label_types():
nlp.get_pipe("textcat").add_label(9)
-def test_overfitting():
+def test_overfitting_IO():
# Simple test to try and quickly overfit the textcat component - ensuring the ML models work correctly
- nlp = Language()
+ nlp = English()
textcat = nlp.create_pipe("textcat")
for _, annotations in TRAIN_DATA:
for label, value in annotations.get("cats").items():
@@ -87,11 +91,21 @@ def test_overfitting():
for i in range(50):
losses = {}
nlp.update(TRAIN_DATA, sgd=optimizer, losses=losses)
- assert losses["textcat"] < 0.00001
+ assert losses["textcat"] < 0.01
# test the trained model
test_text = "I am happy."
doc = nlp(test_text)
cats = doc.cats
+ # note that by default, exclusive_classes = false so we need a bigger error margin
assert cats["POSITIVE"] > 0.9
- assert cats["POSITIVE"] + cats["NEGATIVE"] == pytest.approx(1.0, 0.001)
+ assert cats["POSITIVE"] + cats["NEGATIVE"] == pytest.approx(1.0, 0.1)
+
+ # Also test the results are still the same after IO
+ with make_tempdir() as tmp_dir:
+ nlp.to_disk(tmp_dir)
+ nlp2 = util.load_model_from_path(tmp_dir)
+ doc2 = nlp2(test_text)
+ cats2 = doc2.cats
+ assert cats2["POSITIVE"] > 0.9
+ assert cats2["POSITIVE"] + cats2["NEGATIVE"] == pytest.approx(1.0, 0.1)
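Editor's note: the loosened tolerance reflects that the default textcat model is not mutually exclusive, so the two class scores need not sum exactly to 1.0. The second positional argument of `pytest.approx` is the relative tolerance, so the assertion now accepts a 10% deviation:

    import pytest

    # passes: 1.03 is within 10% of 1.0
    assert 0.95 + 0.08 == pytest.approx(1.0, 0.1)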
diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py
index 2bfdbd7c3..ff8c7c2fe 100644
--- a/spacy/tests/regression/test_issue1501-2000.py
+++ b/spacy/tests/regression/test_issue1501-2000.py
@@ -10,6 +10,7 @@ from spacy.lang.lex_attrs import is_stop
from spacy.vectors import Vectors
from spacy.vocab import Vocab
from spacy.language import Language
+from spacy.ml.models.defaults import default_ner, default_tagger
from spacy.tokens import Doc, Span, Token
from spacy.pipeline import Tagger, EntityRecognizer
from spacy.attrs import HEAD, DEP
@@ -123,7 +124,7 @@ def test_issue1727():
correctly after vectors are added."""
data = numpy.ones((3, 300), dtype="f")
vectors = Vectors(data=data, keys=["I", "am", "Matt"])
- tagger = Tagger(Vocab())
+ tagger = Tagger(Vocab(), default_tagger())
tagger.add_label("PRP")
with pytest.warns(UserWarning):
tagger.begin_training()
@@ -131,7 +132,7 @@ def test_issue1727():
tagger.vocab.vectors = vectors
with make_tempdir() as path:
tagger.to_disk(path)
- tagger = Tagger(Vocab()).from_disk(path)
+ tagger = Tagger(Vocab(), default_tagger()).from_disk(path)
assert tagger.cfg.get("pretrained_dims", 0) == 0
@@ -236,6 +237,7 @@ def test_issue1889(word):
assert is_stop(word, STOP_WORDS) == is_stop(word.upper(), STOP_WORDS)
+@pytest.mark.skip(reason="This test has become obsolete with the config refactor of v.3")
def test_issue1915():
cfg = {"hidden_depth": 2} # should error out
nlp = Language()
@@ -268,7 +270,7 @@ def test_issue1963(en_tokenizer):
@pytest.mark.parametrize("label", ["U-JOB-NAME"])
def test_issue1967(label):
- ner = EntityRecognizer(Vocab())
+ ner = EntityRecognizer(Vocab(), default_ner())
example = Example(doc=None)
example.set_token_annotation(
ids=[0], words=["word"], tags=["tag"], heads=[0], deps=["dep"], entities=[label]
diff --git a/spacy/tests/regression/test_issue2001-2500.py b/spacy/tests/regression/test_issue2001-2500.py
index 2c25b6d73..1786677e0 100644
--- a/spacy/tests/regression/test_issue2001-2500.py
+++ b/spacy/tests/regression/test_issue2001-2500.py
@@ -32,6 +32,9 @@ def test_issue2179():
nlp.begin_training()
nlp2 = Italian()
nlp2.add_pipe(nlp2.create_pipe("ner"))
+
+ assert len(nlp2.get_pipe("ner").labels) == 0
+ nlp2.get_pipe("ner").model.resize_output(nlp.get_pipe("ner").moves.n_moves)
nlp2.from_bytes(nlp.to_bytes())
assert "extra_labels" not in nlp2.get_pipe("ner").cfg
assert nlp2.get_pipe("ner").labels == ("CITIZENSHIP",)
diff --git a/spacy/tests/regression/test_issue3001-3500.py b/spacy/tests/regression/test_issue3001-3500.py
index cc893e472..df23efa4f 100644
--- a/spacy/tests/regression/test_issue3001-3500.py
+++ b/spacy/tests/regression/test_issue3001-3500.py
@@ -1,6 +1,7 @@
import pytest
from spacy.lang.en import English
from spacy.lang.de import German
+from spacy.ml.models.defaults import default_ner
from spacy.pipeline import EntityRuler, EntityRecognizer
from spacy.matcher import Matcher, PhraseMatcher
from spacy.tokens import Doc
@@ -103,6 +104,7 @@ def test_issue3209():
assert ner.move_names == move_names
nlp2 = English()
nlp2.add_pipe(nlp2.create_pipe("ner"))
+ nlp2.get_pipe("ner").model.resize_output(ner.moves.n_moves)
nlp2.from_bytes(nlp.to_bytes())
assert nlp2.get_pipe("ner").move_names == move_names
@@ -193,7 +195,7 @@ def test_issue3345():
doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"])
doc[4].is_sent_start = True
ruler = EntityRuler(nlp, patterns=[{"label": "GPE", "pattern": "New York"}])
- ner = EntityRecognizer(doc.vocab)
+ ner = EntityRecognizer(doc.vocab, default_ner())
# Add the OUT action. I wouldn't have thought this would be necessary...
ner.moves.add_action(5, "")
ner.add_label("GPE")
diff --git a/spacy/tests/regression/test_issue3830.py b/spacy/tests/regression/test_issue3830.py
index 54ce10924..9752f70df 100644
--- a/spacy/tests/regression/test_issue3830.py
+++ b/spacy/tests/regression/test_issue3830.py
@@ -1,10 +1,12 @@
from spacy.pipeline.pipes import DependencyParser
from spacy.vocab import Vocab
+from spacy.ml.models.defaults import default_parser
+
def test_issue3830_no_subtok():
"""Test that the parser doesn't have subtok label if not learn_tokens"""
- parser = DependencyParser(Vocab())
+ parser = DependencyParser(Vocab(), default_parser())
parser.add_label("nsubj")
assert "subtok" not in parser.labels
parser.begin_training(lambda: [])
@@ -13,7 +15,7 @@ def test_issue3830_no_subtok():
def test_issue3830_with_subtok():
"""Test that the parser does have subtok label if learn_tokens=True."""
- parser = DependencyParser(Vocab(), learn_tokens=True)
+ parser = DependencyParser(Vocab(), default_parser(), learn_tokens=True)
parser.add_label("nsubj")
assert "subtok" not in parser.labels
parser.begin_training(lambda: [])
diff --git a/spacy/tests/regression/test_issue4042.py b/spacy/tests/regression/test_issue4042.py
index 6644a8eda..75a1c23b7 100644
--- a/spacy/tests/regression/test_issue4042.py
+++ b/spacy/tests/regression/test_issue4042.py
@@ -3,6 +3,7 @@ from spacy.pipeline import EntityRecognizer, EntityRuler
from spacy.lang.en import English
from spacy.tokens import Span
from spacy.util import ensure_path
+from spacy.ml.models.defaults import default_ner
from ..util import make_tempdir
@@ -73,6 +74,6 @@ def test_issue4042_bug2():
output_dir.mkdir()
ner1.to_disk(output_dir)
- ner2 = EntityRecognizer(vocab)
+ ner2 = EntityRecognizer(vocab, default_ner())
ner2.from_disk(output_dir)
assert len(ner2.labels) == 2
diff --git a/spacy/tests/regression/test_issue4313.py b/spacy/tests/regression/test_issue4313.py
index a3f6f69df..30688601f 100644
--- a/spacy/tests/regression/test_issue4313.py
+++ b/spacy/tests/regression/test_issue4313.py
@@ -1,5 +1,6 @@
from collections import defaultdict
+from spacy.ml.models.defaults import default_ner
from spacy.pipeline import EntityRecognizer
from spacy.lang.en import English
@@ -11,7 +12,7 @@ def test_issue4313():
beam_width = 16
beam_density = 0.0001
nlp = English()
- ner = EntityRecognizer(nlp.vocab)
+ ner = EntityRecognizer(nlp.vocab, default_ner())
ner.add_label("SOME_LABEL")
ner.begin_training([])
nlp.add_pipe(ner)
diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py
new file mode 100644
index 000000000..c34d01547
--- /dev/null
+++ b/spacy/tests/serialize/test_serialize_config.py
@@ -0,0 +1,126 @@
+from thinc.api import Config
+
+import spacy
+from spacy import util
+from spacy.lang.en import English
+from spacy.util import registry
+
+from ..util import make_tempdir
+from ...ml.models import build_Tok2Vec_model, build_tb_parser_model
+
+nlp_config_string = """
+[nlp]
+lang = "en"
+
+[nlp.pipeline.tok2vec]
+factory = "tok2vec"
+
+[nlp.pipeline.tok2vec.model]
+@architectures = "spacy.HashEmbedCNN.v1"
+pretrained_vectors = null
+width = 342
+depth = 4
+window_size = 1
+embed_size = 2000
+maxout_pieces = 3
+subword_features = true
+
+[nlp.pipeline.tagger]
+factory = "tagger"
+
+[nlp.pipeline.tagger.model]
+@architectures = "spacy.Tagger.v1"
+
+[nlp.pipeline.tagger.model.tok2vec]
+@architectures = "spacy.Tok2VecTensors.v1"
+width = ${nlp.pipeline.tok2vec.model:width}
+"""
+
+
+parser_config_string = """
+[model]
+@architectures = "spacy.TransitionBasedParser.v1"
+nr_feature_tokens = 99
+hidden_width = 66
+maxout_pieces = 2
+
+[model.tok2vec]
+@architectures = "spacy.HashEmbedCNN.v1"
+pretrained_vectors = null
+width = 333
+depth = 4
+embed_size = 5555
+window_size = 1
+maxout_pieces = 7
+subword_features = false
+"""
+
+
+@registry.architectures.register("my_test_parser")
+def my_parser():
+ tok2vec = build_Tok2Vec_model(width=321, embed_size=5432, pretrained_vectors=None, window_size=3,
+ maxout_pieces=4, subword_features=True, char_embed=True, nM=64, nC=8,
+ conv_depth=2, bilstm_depth=0)
+ parser = build_tb_parser_model(tok2vec=tok2vec, nr_feature_tokens=7, hidden_width=65, maxout_pieces=5)
+ return parser
+
+
+def test_serialize_nlp():
+ """ Create a custom nlp pipeline from config and ensure it serializes it correctly """
+ nlp_config = Config().from_str(nlp_config_string)
+ nlp = util.load_model_from_config(nlp_config["nlp"])
+ nlp.begin_training()
+ assert "tok2vec" in nlp.pipe_names
+ assert "tagger" in nlp.pipe_names
+ assert "parser" not in nlp.pipe_names
+ assert nlp.get_pipe("tagger").model.get_ref("tok2vec").get_dim("nO") == 342
+
+ with make_tempdir() as d:
+ nlp.to_disk(d)
+ nlp2 = spacy.load(d)
+ assert "tok2vec" in nlp2.pipe_names
+ assert "tagger" in nlp2.pipe_names
+ assert "parser" not in nlp2.pipe_names
+ assert nlp2.get_pipe("tagger").model.get_ref("tok2vec").get_dim("nO") == 342
+
+
+def test_serialize_custom_nlp():
+ """ Create a custom nlp pipeline and ensure it serializes it correctly"""
+ nlp = English()
+ parser_cfg = dict()
+ parser_cfg["model"] = {'@architectures': "my_test_parser"}
+ parser = nlp.create_pipe("parser", parser_cfg)
+ nlp.add_pipe(parser)
+ nlp.begin_training()
+
+ with make_tempdir() as d:
+ nlp.to_disk(d)
+ nlp2 = spacy.load(d)
+ model = nlp2.get_pipe("parser").model
+ tok2vec = model.get_ref("tok2vec")
+ upper = model.upper
+
+ # check that we have the correct settings, not the default ones
+ assert tok2vec.get_dim("nO") == 321
+ assert upper.get_dim("nI") == 65
+
+
+def test_serialize_parser():
+ """ Create a non-default parser config to check nlp serializes it correctly """
+ nlp = English()
+ model_config = Config().from_str(parser_config_string)
+ parser = nlp.create_pipe("parser", config=model_config)
+ parser.add_label("nsubj")
+ nlp.add_pipe(parser)
+ nlp.begin_training()
+
+ with make_tempdir() as d:
+ nlp.to_disk(d)
+ nlp2 = spacy.load(d)
+ model = nlp2.get_pipe("parser").model
+ tok2vec = model.get_ref("tok2vec")
+ upper = model.upper
+
+ # check that we have the correct settings, not the default ones
+ assert upper.get_dim("nI") == 66
+ assert tok2vec.get_dim("nO") == 333
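Editor's note: the new test file exercises building a pipeline straight from a config string. A minimal sketch of that flow, reusing the `nlp_config_string` defined above:

    from thinc.api import Config
    from spacy import util

    config = Config().from_str(nlp_config_string)
    # builds the tok2vec and tagger components from the [nlp.pipeline.*] blocks
    nlp = util.load_model_from_config(config["nlp"])
    nlp.begin_training()
    assert nlp.pipe_names == ["tok2vec", "tagger"]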
diff --git a/spacy/tests/serialize/test_serialize_language.py b/spacy/tests/serialize/test_serialize_language.py
index 4089a0d07..0e3b7c59f 100644
--- a/spacy/tests/serialize/test_serialize_language.py
+++ b/spacy/tests/serialize/test_serialize_language.py
@@ -1,5 +1,6 @@
import pytest
import re
+
from spacy.language import Language
from spacy.tokenizer import Tokenizer
@@ -56,7 +57,7 @@ def test_serialize_language_exclude(meta_data):
nlp = Language(meta=meta_data)
assert nlp.meta["name"] == name
new_nlp = Language().from_bytes(nlp.to_bytes())
- assert nlp.meta["name"] == name
+ assert new_nlp.meta["name"] == name
new_nlp = Language().from_bytes(nlp.to_bytes(), exclude=["meta"])
assert not new_nlp.meta["name"] == name
new_nlp = Language().from_bytes(nlp.to_bytes(exclude=["meta"]))
diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py
index 0ad9bc4d4..fe14fba10 100644
--- a/spacy/tests/serialize/test_serialize_pipeline.py
+++ b/spacy/tests/serialize/test_serialize_pipeline.py
@@ -1,6 +1,7 @@
import pytest
from spacy.pipeline import Tagger, DependencyParser, EntityRecognizer
from spacy.pipeline import Tensorizer, TextCategorizer, SentenceRecognizer
+from spacy.ml.models.defaults import default_parser, default_tensorizer, default_tagger, default_textcat, default_sentrec
from ..util import make_tempdir
@@ -10,58 +11,58 @@ test_parsers = [DependencyParser, EntityRecognizer]
@pytest.fixture
def parser(en_vocab):
- parser = DependencyParser(en_vocab)
+ parser = DependencyParser(en_vocab, default_parser())
parser.add_label("nsubj")
- parser.model, cfg = parser.Model(parser.moves.n_moves)
- parser.cfg.update(cfg)
return parser
@pytest.fixture
def blank_parser(en_vocab):
- parser = DependencyParser(en_vocab)
+ parser = DependencyParser(en_vocab, default_parser())
return parser
@pytest.fixture
def taggers(en_vocab):
- tagger1 = Tagger(en_vocab)
- tagger2 = Tagger(en_vocab)
- tagger1.model = tagger1.Model(8)
- tagger2.model = tagger1.model
- return (tagger1, tagger2)
+ model = default_tagger()
+ tagger1 = Tagger(en_vocab, model)
+ tagger2 = Tagger(en_vocab, model)
+ return tagger1, tagger2
@pytest.mark.parametrize("Parser", test_parsers)
def test_serialize_parser_roundtrip_bytes(en_vocab, Parser):
- parser = Parser(en_vocab)
- parser.model, _ = parser.Model(10)
- new_parser = Parser(en_vocab)
- new_parser.model, _ = new_parser.Model(10)
+ parser = Parser(en_vocab, default_parser())
+ new_parser = Parser(en_vocab, default_parser())
new_parser = new_parser.from_bytes(parser.to_bytes(exclude=["vocab"]))
- assert new_parser.to_bytes(exclude=["vocab"]) == parser.to_bytes(exclude=["vocab"])
+ bytes_2 = new_parser.to_bytes(exclude=["vocab"])
+ bytes_3 = parser.to_bytes(exclude=["vocab"])
+ assert len(bytes_2) == len(bytes_3)
+ assert bytes_2 == bytes_3
@pytest.mark.parametrize("Parser", test_parsers)
def test_serialize_parser_roundtrip_disk(en_vocab, Parser):
- parser = Parser(en_vocab)
- parser.model, _ = parser.Model(0)
+ parser = Parser(en_vocab, default_parser())
with make_tempdir() as d:
file_path = d / "parser"
parser.to_disk(file_path)
- parser_d = Parser(en_vocab)
- parser_d.model, _ = parser_d.Model(0)
+ parser_d = Parser(en_vocab, default_parser())
parser_d = parser_d.from_disk(file_path)
parser_bytes = parser.to_bytes(exclude=["model", "vocab"])
parser_d_bytes = parser_d.to_bytes(exclude=["model", "vocab"])
+ assert len(parser_bytes) == len(parser_d_bytes)
assert parser_bytes == parser_d_bytes
def test_to_from_bytes(parser, blank_parser):
assert parser.model is not True
- assert blank_parser.model is True
+ assert blank_parser.model is not True
assert blank_parser.moves.n_moves != parser.moves.n_moves
bytes_data = parser.to_bytes(exclude=["vocab"])
+
+ # the blank parser needs to be resized before we can call from_bytes
+ blank_parser.model.resize_output(parser.moves.n_moves)
blank_parser.from_bytes(bytes_data)
assert blank_parser.model is not True
assert blank_parser.moves.n_moves == parser.moves.n_moves
@@ -75,8 +76,10 @@ def test_serialize_tagger_roundtrip_bytes(en_vocab, taggers):
tagger1_b = tagger1.to_bytes()
tagger1 = tagger1.from_bytes(tagger1_b)
assert tagger1.to_bytes() == tagger1_b
- new_tagger1 = Tagger(en_vocab).from_bytes(tagger1_b)
- assert new_tagger1.to_bytes() == tagger1_b
+ new_tagger1 = Tagger(en_vocab, default_tagger()).from_bytes(tagger1_b)
+ new_tagger1_b = new_tagger1.to_bytes()
+ assert len(new_tagger1_b) == len(tagger1_b)
+ assert new_tagger1_b == tagger1_b
def test_serialize_tagger_roundtrip_disk(en_vocab, taggers):
@@ -86,26 +89,24 @@ def test_serialize_tagger_roundtrip_disk(en_vocab, taggers):
file_path2 = d / "tagger2"
tagger1.to_disk(file_path1)
tagger2.to_disk(file_path2)
- tagger1_d = Tagger(en_vocab).from_disk(file_path1)
- tagger2_d = Tagger(en_vocab).from_disk(file_path2)
+ tagger1_d = Tagger(en_vocab, default_tagger()).from_disk(file_path1)
+ tagger2_d = Tagger(en_vocab, default_tagger()).from_disk(file_path2)
assert tagger1_d.to_bytes() == tagger2_d.to_bytes()
def test_serialize_tensorizer_roundtrip_bytes(en_vocab):
- tensorizer = Tensorizer(en_vocab)
- tensorizer.model = tensorizer.Model()
+ tensorizer = Tensorizer(en_vocab, default_tensorizer())
tensorizer_b = tensorizer.to_bytes(exclude=["vocab"])
- new_tensorizer = Tensorizer(en_vocab).from_bytes(tensorizer_b)
+ new_tensorizer = Tensorizer(en_vocab, default_tensorizer()).from_bytes(tensorizer_b)
assert new_tensorizer.to_bytes(exclude=["vocab"]) == tensorizer_b
def test_serialize_tensorizer_roundtrip_disk(en_vocab):
- tensorizer = Tensorizer(en_vocab)
- tensorizer.model = tensorizer.Model()
+ tensorizer = Tensorizer(en_vocab, default_tensorizer())
with make_tempdir() as d:
file_path = d / "tensorizer"
tensorizer.to_disk(file_path)
- tensorizer_d = Tensorizer(en_vocab).from_disk(file_path)
+ tensorizer_d = Tensorizer(en_vocab, default_tensorizer()).from_disk(file_path)
assert tensorizer.to_bytes(exclude=["vocab"]) == tensorizer_d.to_bytes(
exclude=["vocab"]
)
@@ -113,19 +114,17 @@ def test_serialize_tensorizer_roundtrip_disk(en_vocab):
def test_serialize_textcat_empty(en_vocab):
# See issue #1105
- textcat = TextCategorizer(en_vocab, labels=["ENTITY", "ACTION", "MODIFIER"])
+ textcat = TextCategorizer(en_vocab, default_textcat(), labels=["ENTITY", "ACTION", "MODIFIER"])
textcat.to_bytes(exclude=["vocab"])
@pytest.mark.parametrize("Parser", test_parsers)
def test_serialize_pipe_exclude(en_vocab, Parser):
def get_new_parser():
- new_parser = Parser(en_vocab)
- new_parser.model, _ = new_parser.Model(0)
+ new_parser = Parser(en_vocab, default_parser())
return new_parser
- parser = Parser(en_vocab)
- parser.model, _ = parser.Model(0)
+ parser = Parser(en_vocab, default_parser())
parser.cfg["foo"] = "bar"
new_parser = get_new_parser().from_bytes(parser.to_bytes(exclude=["vocab"]))
assert "foo" in new_parser.cfg
@@ -144,7 +143,7 @@ def test_serialize_pipe_exclude(en_vocab, Parser):
def test_serialize_sentencerecognizer(en_vocab):
- sr = SentenceRecognizer(en_vocab)
+ sr = SentenceRecognizer(en_vocab, default_sentrec())
sr_b = sr.to_bytes()
- sr_d = SentenceRecognizer(en_vocab).from_bytes(sr_b)
+ sr_d = SentenceRecognizer(en_vocab, default_sentrec()).from_bytes(sr_b)
assert sr.to_bytes() == sr_d.to_bytes()
diff --git a/spacy/tests/test_tok2vec.py b/spacy/tests/test_tok2vec.py
index 2d10d79d4..310103d10 100644
--- a/spacy/tests/test_tok2vec.py
+++ b/spacy/tests/test_tok2vec.py
@@ -1,6 +1,6 @@
import pytest
-from spacy.ml.component_models import Tok2Vec
+from spacy.ml.models.tok2vec import build_Tok2Vec_model
from spacy.vocab import Vocab
from spacy.tokens import Doc
@@ -25,7 +25,8 @@ def test_empty_doc():
embed_size = 2000
vocab = Vocab()
doc = Doc(vocab, words=[])
- tok2vec = Tok2Vec(width, embed_size)
+ # TODO: fix tok2vec arguments
+ tok2vec = build_Tok2Vec_model(width, embed_size)
vectors, backprop = tok2vec.begin_update([doc])
assert len(vectors) == 1
assert vectors[0].shape == (0, width)
@@ -36,7 +37,19 @@ def test_empty_doc():
)
def test_tok2vec_batch_sizes(batch_size, width, embed_size):
batch = get_batch(batch_size)
- tok2vec = Tok2Vec(width, embed_size)
+ tok2vec = build_Tok2Vec_model(
+ width,
+ embed_size,
+ pretrained_vectors=None,
+ conv_depth=4,
+ bilstm_depth=0,
+ window_size=1,
+ maxout_pieces=3,
+ subword_features=True,
+ char_embed=False,
+ nM=64,
+ nC=8,
+ )
tok2vec.initialize()
vectors, backprop = tok2vec.begin_update(batch)
assert len(vectors) == len(batch)
@@ -44,19 +57,24 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size):
assert doc_vec.shape == (len(doc), width)
+# fmt: off
@pytest.mark.parametrize(
"tok2vec_config",
[
- {"width": 8, "embed_size": 100, "char_embed": False},
- {"width": 8, "embed_size": 100, "char_embed": True},
- {"width": 8, "embed_size": 100, "conv_depth": 6},
- {"width": 8, "embed_size": 100, "conv_depth": 6},
- {"width": 8, "embed_size": 100, "subword_features": False},
+ {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True},
+ {"width": 8, "embed_size": 100, "char_embed": True, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True},
+ {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 6, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True},
+ {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 6, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True},
+ {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False},
+ {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False},
+ {"width": 8, "embed_size": 100, "char_embed": True, "nM": 81, "nC": 8, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False},
+ {"width": 8, "embed_size": 100, "char_embed": True, "nM": 81, "nC": 9, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False},
],
)
+# fmt: on
def test_tok2vec_configs(tok2vec_config):
docs = get_batch(3)
- tok2vec = Tok2Vec(**tok2vec_config)
+ tok2vec = build_Tok2Vec_model(**tok2vec_config)
tok2vec.initialize()
vectors, backprop = tok2vec.begin_update(docs)
assert len(vectors) == len(docs)
diff --git a/spacy/util.py b/spacy/util.py
index 465b9645e..286a6574c 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -6,8 +6,7 @@ from pathlib import Path
import random
from typing import List
import thinc
-import thinc.config
-from thinc.api import NumpyOps, get_current_ops, Adam, require_gpu
+from thinc.api import NumpyOps, get_current_ops, Adam, require_gpu, Config
import functools
import itertools
import numpy.random
@@ -146,6 +145,10 @@ def load_model_from_path(model_path, meta=False, **overrides):
pipeline from meta.json and then calls from_disk() with path."""
if not meta:
meta = get_model_meta(model_path)
+ nlp_config = get_model_config(model_path)
+ if nlp_config.get("nlp", None):
+ return load_model_from_config(nlp_config["nlp"])
+
# Support language factories registered via entry points (e.g. custom
# language subclass) while keeping top-level language identifier "lang"
lang = meta.get("lang_factory", meta["lang"])
@@ -162,11 +165,30 @@ def load_model_from_path(model_path, meta=False, **overrides):
if name not in disable:
config = meta.get("pipeline_args", {}).get(name, {})
factory = factories.get(name, name)
+ if nlp_config.get(name, None):
+ model_config = nlp_config[name]["model"]
+ config["model"] = model_config
component = nlp.create_pipe(factory, config=config)
nlp.add_pipe(component, name=name)
return nlp.from_disk(model_path, exclude=disable)
+def load_model_from_config(nlp_config):
+ if "name" in nlp_config:
+ nlp = load_model(**nlp_config)
+ elif "lang" in nlp_config:
+ lang_class = get_lang_class(nlp_config["lang"])
+ nlp = lang_class()
+ else:
+ raise ValueError(Errors.E993)
+ if "pipeline" in nlp_config:
+ for name, component_cfg in nlp_config["pipeline"].items():
+ factory = component_cfg.pop("factory")
+ component = nlp.create_pipe(factory, config=component_cfg)
+ nlp.add_pipe(component, name=name)
+ return nlp
+
+
def load_model_from_init_py(init_file, **overrides):
"""Helper function to use in the `load()` method of a model package's
__init__.py.
@@ -184,7 +206,7 @@ def load_model_from_init_py(init_file, **overrides):
return load_model_from_path(data_path, meta, **overrides)
-def load_from_config(path, create_objects=False):
+def load_config(path, create_objects=False):
"""Load a Thinc-formatted config file, optionally filling in objects where
the config references registry entries. See "Thinc config files" for details.
@@ -212,7 +234,7 @@ def get_model_meta(path):
raise IOError(Errors.E052.format(path=model_path))
meta_path = model_path / "meta.json"
if not meta_path.is_file():
- raise IOError(Errors.E053.format(path=meta_path))
+ raise IOError(Errors.E053.format(path=meta_path, name="meta.json"))
meta = srsly.read_json(meta_path)
for setting in ["lang", "name", "version"]:
if setting not in meta or not meta[setting]:
@@ -220,6 +242,23 @@ def get_model_meta(path):
return meta
+def get_model_config(path):
+ """Get the model's config from a directory path.
+
+ path (unicode or Path): Path to model directory.
+ RETURNS (Config): The model's config data.
+ """
+ model_path = ensure_path(path)
+ if not model_path.exists():
+ raise IOError(Errors.E052.format(path=model_path))
+ config_path = model_path / "config.cfg"
+ # Model directories are currently allowed to omit a config.cfg, so return an empty config instead of raising
+ if not config_path.is_file():
+ return Config({})
+ # raise IOError(Errors.E053.format(path=config_path, name="config.cfg"))
+ return Config().from_disk(config_path)
+
+
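Editor's note: with these helpers in place, a model directory may ship a `config.cfg` next to `meta.json`, and the config takes precedence whenever it defines an `[nlp]` block. A hedged sketch of the loading behaviour, mirroring the code above (the path is a placeholder):

    from spacy import util

    # returns Config({}) if the directory has no config.cfg
    nlp_config = util.get_model_config("/path/to/model")
    if nlp_config.get("nlp"):
        nlp = util.load_model_from_config(nlp_config["nlp"])
    else:
        # falls back to the meta.json-driven factory pipeline
        nlp = util.load_model_from_path("/path/to/model")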
def is_package(name):
"""Check if string maps to a package installed via pip.
From 5da3ad682a0109dad6d2e6e1a45f4a833fefc929 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Fri, 28 Feb 2020 11:57:41 +0100
Subject: [PATCH 087/496] Tidy up and auto-format
---
spacy/cli/pretrain.py | 2 +-
spacy/cli/train.py | 14 +++--
spacy/cli/train_from_config.py | 4 +-
spacy/language.py | 51 ++++++++++++-------
spacy/ml/models/__init__.py | 12 ++---
spacy/ml/models/entity_linker.py | 4 +-
spacy/ml/models/parser.py | 9 ++--
spacy/ml/models/tagger.py | 2 +-
spacy/ml/models/textcat.py | 13 +++--
spacy/ml/models/tok2vec.py | 2 +-
spacy/pipeline/tok2vec.py | 1 -
spacy/tests/parser/test_add_label.py | 3 +-
spacy/tests/parser/test_ner.py | 3 +-
spacy/tests/regression/test_issue1501-2000.py | 2 +-
.../tests/serialize/test_serialize_config.py | 22 ++++++--
.../serialize/test_serialize_pipeline.py | 7 ++-
spacy/tests/test_tok2vec.py | 4 +-
17 files changed, 96 insertions(+), 59 deletions(-)
diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py
index 95d549254..b2e3229ee 100644
--- a/spacy/cli/pretrain.py
+++ b/spacy/cli/pretrain.py
@@ -120,7 +120,7 @@ def pretrain(
window_size=1,
char_embed=False,
nM=64,
- nC=8
+ nC=8,
),
)
# Load in pretrained weights
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 5667bb905..1ca678b85 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -9,7 +9,7 @@ from wasabi import msg
import contextlib
import random
-from ..util import create_default_optimizer, registry
+from ..util import create_default_optimizer
from ..util import use_gpu as set_gpu
from ..attrs import PROB, IS_OOV, CLUSTER, LANG
from ..gold import GoldCorpus
@@ -161,7 +161,10 @@ def train(
raise ValueError(f"Component {pipe} currently not supported.")
pipe_cfg = util.load_config(config_loc, create_objects=False)
if vectors:
- pretrained_config = {'@architectures': 'spacy.VocabVectors.v1', 'name': vectors}
+ pretrained_config = {
+ "@architectures": "spacy.VocabVectors.v1",
+ "name": vectors,
+ }
pipe_cfg["model"]["tok2vec"]["pretrained_vectors"] = pretrained_config
if pipe == "parser":
@@ -202,7 +205,7 @@ def train(
msg.text(f"Starting with blank model '{lang}'")
lang_cls = util.get_lang_class(lang)
nlp = lang_cls()
-
+
if vectors:
msg.text(f"Loading vectors from model '{vectors}'")
@@ -222,7 +225,10 @@ def train(
raise ValueError(f"Component {pipe} currently not supported.")
pipe_cfg = util.load_config(config_loc, create_objects=False)
if vectors:
- pretrained_config = {'@architectures': 'spacy.VocabVectors.v1', 'name': vectors}
+ pretrained_config = {
+ "@architectures": "spacy.VocabVectors.v1",
+ "name": vectors,
+ }
pipe_cfg["model"]["tok2vec"]["pretrained_vectors"] = pretrained_config
if pipe == "parser":
diff --git a/spacy/cli/train_from_config.py b/spacy/cli/train_from_config.py
index 0dba8a962..5b09909c7 100644
--- a/spacy/cli/train_from_config.py
+++ b/spacy/cli/train_from_config.py
@@ -1,10 +1,8 @@
from typing import Optional, Dict, List, Union, Sequence
-from pydantic import BaseModel, FilePath, StrictInt
-
+from pydantic import BaseModel, FilePath
import plac
import tqdm
from pathlib import Path
-
from wasabi import msg
import thinc
import thinc.schedules
diff --git a/spacy/language.py b/spacy/language.py
index 83f8c9d21..af9f2c157 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -130,7 +130,13 @@ class Language(object):
factories = {"tokenizer": lambda nlp: nlp.Defaults.create_tokenizer(nlp)}
def __init__(
- self, vocab=True, make_doc=True, max_length=10 ** 6, meta={}, config=None, **kwargs
+ self,
+ vocab=True,
+ make_doc=True,
+ max_length=10 ** 6,
+ meta={},
+ config=None,
+ **kwargs,
):
"""Initialise a Language object.
@@ -176,20 +182,29 @@ class Language(object):
self.max_length = max_length
self._optimizer = None
- from .ml.models.defaults import default_tagger_config, default_parser_config, default_ner_config, \
- default_textcat_config, default_nel_config, default_morphologizer_config, default_sentrec_config, \
- default_tensorizer_config, default_tok2vec_config
+ from .ml.models.defaults import (
+ default_tagger_config,
+ default_parser_config,
+ default_ner_config,
+ default_textcat_config,
+ default_nel_config,
+ default_morphologizer_config,
+ default_sentrec_config,
+ default_tensorizer_config,
+ default_tok2vec_config,
+ )
- self.defaults = {"tagger": default_tagger_config(),
- "parser": default_parser_config(),
- "ner": default_ner_config(),
- "textcat": default_textcat_config(),
- "entity_linker": default_nel_config(),
- "morphologizer": default_morphologizer_config(),
- "sentrec": default_sentrec_config(),
- "tensorizer": default_tensorizer_config(),
- "tok2vec": default_tok2vec_config(),
- }
+ self.defaults = {
+ "tagger": default_tagger_config(),
+ "parser": default_parser_config(),
+ "ner": default_ner_config(),
+ "textcat": default_textcat_config(),
+ "entity_linker": default_nel_config(),
+ "morphologizer": default_morphologizer_config(),
+ "sentrec": default_sentrec_config(),
+ "tensorizer": default_tensorizer_config(),
+ "tok2vec": default_tok2vec_config(),
+ }
@property
def path(self):
@@ -329,12 +344,14 @@ class Language(object):
model_cfg = None
del config["model"]
if model_cfg is None and default_config is not None:
- user_warning(Warnings.W098)
+ user_warning(Warnings.W098.format(name=name))
model_cfg = default_config["model"]
model = None
if model_cfg is not None:
- self.config[name] = {"model": model_cfg}
- model = registry.make_from_config({"model": model_cfg}, validate=True)["model"]
+ self.config[name] = {"model": model_cfg}
+ model = registry.make_from_config({"model": model_cfg}, validate=True)[
+ "model"
+ ]
return factory(self, model, **config)
def add_pipe(
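Editor's note: `create_pipe` now resolves the component's model from its config via the registry before calling the factory. A minimal sketch of that resolution step, using the `spacy.HashEmbedCNN.v1` architecture with the same parameter names as the configs above (the width of 96 is an arbitrary example value):

    from spacy.util import registry

    model_cfg = {
        "@architectures": "spacy.HashEmbedCNN.v1",
        "pretrained_vectors": None,
        "width": 96,
        "depth": 4,
        "window_size": 1,
        "embed_size": 2000,
        "maxout_pieces": 3,
        "subword_features": True,
    }
    model = registry.make_from_config({"model": model_cfg}, validate=True)["model"]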
diff --git a/spacy/ml/models/__init__.py b/spacy/ml/models/__init__.py
index 56696d581..d44c7cb2e 100644
--- a/spacy/ml/models/__init__.py
+++ b/spacy/ml/models/__init__.py
@@ -1,6 +1,6 @@
-from .entity_linker import *
-from .parser import *
-from .tagger import *
-from .tensorizer import *
-from .textcat import *
-from .tok2vec import *
+from .entity_linker import * # noqa
+from .parser import * # noqa
+from .tagger import * # noqa
+from .tensorizer import * # noqa
+from .textcat import * # noqa
+from .tok2vec import * # noqa
diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py
index 0c1762026..9cbaba984 100644
--- a/spacy/ml/models/entity_linker.py
+++ b/spacy/ml/models/entity_linker.py
@@ -1,9 +1,7 @@
-from pathlib import Path
-
from thinc.api import chain, clone, list2ragged, reduce_mean, residual
from thinc.api import Model, Maxout, Linear
-from spacy.util import registry
+from ...util import registry
@registry.architectures.register("spacy.EntityLinker.v1")
diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py
index 89f303e2a..d2de10a0e 100644
--- a/spacy/ml/models/parser.py
+++ b/spacy/ml/models/parser.py
@@ -1,11 +1,10 @@
from pydantic import StrictInt
-
-from spacy.util import registry
-from spacy.ml._layers import PrecomputableAffine
-from spacy.syntax._parser_model import ParserModel
-
from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops
+from ...util import registry
+from .._layers import PrecomputableAffine
+from ...syntax._parser_model import ParserModel
+
@registry.architectures.register("spacy.TransitionBasedParser.v1")
def build_tb_parser_model(
diff --git a/spacy/ml/models/tagger.py b/spacy/ml/models/tagger.py
index 92e8be1b2..baca325bd 100644
--- a/spacy/ml/models/tagger.py
+++ b/spacy/ml/models/tagger.py
@@ -1,6 +1,6 @@
from thinc.api import zero_init, with_array, Softmax, chain, Model
-from spacy.util import registry
+from ...util import registry
@registry.architectures.register("spacy.Tagger.v1")
diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py
index d9ac34b99..49679c8cd 100644
--- a/spacy/ml/models/textcat.py
+++ b/spacy/ml/models/textcat.py
@@ -1,8 +1,9 @@
-from spacy.attrs import ORTH
-from spacy.util import registry
-from spacy.ml.extract_ngrams import extract_ngrams
+from thinc.api import Model, chain, reduce_mean, Linear, list2ragged, Logistic
+from thinc.api import SparseLinear, Softmax
-from thinc.api import Model, chain, reduce_mean, Linear, list2ragged, Logistic, SparseLinear, Softmax
+from ...attrs import ORTH
+from ...util import registry
+from ..extract_ngrams import extract_ngrams
@registry.architectures.register("spacy.TextCatCNN.v1")
@@ -21,7 +22,9 @@ def build_simple_cnn_text_classifier(tok2vec, exclusive_classes, nO=None):
else:
# TODO: experiment with init_w=zero_init
linear_layer = Linear(nO=nO, nI=tok2vec.get_dim("nO"))
- model = tok2vec >> list2ragged() >> reduce_mean() >> linear_layer >> Logistic()
+ model = (
+ tok2vec >> list2ragged() >> reduce_mean() >> linear_layer >> Logistic()
+ )
model.set_ref("output_layer", linear_layer)
model.set_ref("tok2vec", tok2vec)
model.set_dim("nO", nO)
diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py
index 2e0e4c2d4..0d33d010d 100644
--- a/spacy/ml/models/tok2vec.py
+++ b/spacy/ml/models/tok2vec.py
@@ -119,7 +119,7 @@ def hash_embed_bilstm_v1(
@registry.architectures.register("spacy.HashCharEmbedBiLSTM.v1")
-def hash_embed_bilstm_v1(
+def hash_char_embed_bilstm_v1(
pretrained_vectors, width, depth, embed_size, subword_features, nM=0, nC=0
):
# Allows using character embeddings by setting nC, nM and char_embed=True
diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py
index a49f94ca3..2fee6881a 100644
--- a/spacy/pipeline/tok2vec.py
+++ b/spacy/pipeline/tok2vec.py
@@ -10,7 +10,6 @@ from ..util import link_vectors_to_models, minibatch, eg2doc
@component("tok2vec", assigns=["doc.tensor"])
class Tok2Vec(Pipe):
-
@classmethod
def from_nlp(cls, nlp, model, **cfg):
return cls(nlp.vocab, model, **cfg)
diff --git a/spacy/tests/parser/test_add_label.py b/spacy/tests/parser/test_add_label.py
index 5af772ddc..fb43458ae 100644
--- a/spacy/tests/parser/test_add_label.py
+++ b/spacy/tests/parser/test_add_label.py
@@ -73,7 +73,8 @@ def test_add_label_deserializes_correctly():
@pytest.mark.parametrize(
- "pipe_cls,n_moves,model", [(DependencyParser, 5, default_parser()), (EntityRecognizer, 4, default_ner())]
+ "pipe_cls,n_moves,model",
+ [(DependencyParser, 5, default_parser()), (EntityRecognizer, 4, default_ner())],
)
def test_add_label_get_label(pipe_cls, n_moves, model):
"""Test that added labels are returned correctly. This test was added to
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index 3fde75eb5..2fd87ead3 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -212,7 +212,8 @@ def test_empty_ner():
nlp.begin_training()
doc = nlp("John is watching the news about Croatia's elections")
# if this goes wrong, the initialization of the parser's upper layer is probably broken
- assert [token.ent_iob_ for token in doc] == ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+ result = ["O", "O", "O", "O", "O", "O", "O", "O", "O"]
+ assert [token.ent_iob_ for token in doc] == result
def test_ruler_before_ner():
diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py
index ff8c7c2fe..5f5f0c9eb 100644
--- a/spacy/tests/regression/test_issue1501-2000.py
+++ b/spacy/tests/regression/test_issue1501-2000.py
@@ -237,7 +237,7 @@ def test_issue1889(word):
assert is_stop(word, STOP_WORDS) == is_stop(word.upper(), STOP_WORDS)
-@pytest.mark.skip(reason="This test has become obsolete with the config refactor of v.3")
+@pytest.mark.skip(reason="obsolete with the config refactor of v.3")
def test_issue1915():
cfg = {"hidden_depth": 2} # should error out
nlp = Language()
diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py
index c34d01547..298cddc74 100644
--- a/spacy/tests/serialize/test_serialize_config.py
+++ b/spacy/tests/serialize/test_serialize_config.py
@@ -58,10 +58,22 @@ subword_features = false
@registry.architectures.register("my_test_parser")
def my_parser():
- tok2vec = build_Tok2Vec_model(width=321, embed_size=5432, pretrained_vectors=None, window_size=3,
- maxout_pieces=4, subword_features=True, char_embed=True, nM=64, nC=8,
- conv_depth=2, bilstm_depth=0)
- parser = build_tb_parser_model(tok2vec=tok2vec, nr_feature_tokens=7, hidden_width=65, maxout_pieces=5)
+ tok2vec = build_Tok2Vec_model(
+ width=321,
+ embed_size=5432,
+ pretrained_vectors=None,
+ window_size=3,
+ maxout_pieces=4,
+ subword_features=True,
+ char_embed=True,
+ nM=64,
+ nC=8,
+ conv_depth=2,
+ bilstm_depth=0,
+ )
+ parser = build_tb_parser_model(
+ tok2vec=tok2vec, nr_feature_tokens=7, hidden_width=65, maxout_pieces=5
+ )
return parser
@@ -88,7 +100,7 @@ def test_serialize_custom_nlp():
""" Create a custom nlp pipeline and ensure it serializes it correctly"""
nlp = English()
parser_cfg = dict()
- parser_cfg["model"] = {'@architectures': "my_test_parser"}
+ parser_cfg["model"] = {"@architectures": "my_test_parser"}
parser = nlp.create_pipe("parser", parser_cfg)
nlp.add_pipe(parser)
nlp.begin_training()
diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py
index fe14fba10..b1070a9e7 100644
--- a/spacy/tests/serialize/test_serialize_pipeline.py
+++ b/spacy/tests/serialize/test_serialize_pipeline.py
@@ -1,7 +1,8 @@
import pytest
from spacy.pipeline import Tagger, DependencyParser, EntityRecognizer
from spacy.pipeline import Tensorizer, TextCategorizer, SentenceRecognizer
-from spacy.ml.models.defaults import default_parser, default_tensorizer, default_tagger, default_textcat, default_sentrec
+from spacy.ml.models.defaults import default_parser, default_tensorizer, default_tagger
+from spacy.ml.models.defaults import default_textcat, default_sentrec
from ..util import make_tempdir
@@ -114,7 +115,9 @@ def test_serialize_tensorizer_roundtrip_disk(en_vocab):
def test_serialize_textcat_empty(en_vocab):
# See issue #1105
- textcat = TextCategorizer(en_vocab, default_textcat(), labels=["ENTITY", "ACTION", "MODIFIER"])
+ textcat = TextCategorizer(
+ en_vocab, default_textcat(), labels=["ENTITY", "ACTION", "MODIFIER"]
+ )
textcat.to_bytes(exclude=["vocab"])
diff --git a/spacy/tests/test_tok2vec.py b/spacy/tests/test_tok2vec.py
index 310103d10..e1ad1f0fc 100644
--- a/spacy/tests/test_tok2vec.py
+++ b/spacy/tests/test_tok2vec.py
@@ -67,8 +67,8 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size):
{"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 6, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True},
{"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False},
{"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False},
- {"width": 8, "embed_size": 100, "char_embed": True, "nM": 81, "nC": 8, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False},
- {"width": 8, "embed_size": 100, "char_embed": True, "nM": 81, "nC": 9, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False},
+ {"width": 8, "embed_size": 100, "char_embed": True, "nM": 81, "nC": 8, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False},
+ {"width": 8, "embed_size": 100, "char_embed": True, "nM": 81, "nC": 9, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False},
],
)
# fmt: on
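Each parameter dict in the `pytest.mark.parametrize` block above is passed straight through to `build_Tok2Vec_model`. A minimal sketch of that mapping (editor's note, not part of the patch; the import path is an assumption based on the call in `spacy/tests/serialize/test_serialize_config.py`):

```python
# Sketch: build a Tok2Vec model from one of the parameter dicts above.
# Assumption: build_Tok2Vec_model is importable from spacy.ml.models.
from spacy.ml.models import build_Tok2Vec_model

cfg = {
    "width": 8, "embed_size": 100, "char_embed": True, "nM": 81, "nC": 8,
    "pretrained_vectors": None, "window_size": 3, "conv_depth": 2,
    "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False,
}
tok2vec = build_Tok2Vec_model(**cfg)  # a Thinc model producing per-token vectors of size `width`
```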
From 37691e6d5deb38fd1788fe0a4761f1bcd66200c5 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Fri, 28 Feb 2020 12:20:23 +0100
Subject: [PATCH 088/496] Simplify warnings
---
spacy/__init__.py | 4 +-
spacy/analysis.py | 5 ++-
spacy/cli/init_model.py | 5 ++-
spacy/displacy/__init__.py | 10 +++--
spacy/errors.py | 66 ---------------------------------
spacy/gold.pyx | 5 ++-
spacy/kb.pyx | 16 ++++----
spacy/language.py | 17 +++++----
spacy/lexeme.pyx | 5 ++-
spacy/matcher/matcher.pyx | 5 ++-
spacy/matcher/phrasematcher.pyx | 13 ++++---
spacy/morphology.pyx | 7 ++--
spacy/pipeline/pipes.pyx | 5 ++-
spacy/syntax/nn_parser.pyx | 3 +-
spacy/tests/doc/test_doc_api.py | 3 +-
spacy/tests/doc/test_span.py | 3 +-
spacy/tokenizer.pyx | 7 ++--
spacy/tokens/doc.pyx | 12 +++---
spacy/tokens/span.pyx | 10 ++---
spacy/tokens/token.pyx | 7 ++--
spacy/util.py | 7 ++--
21 files changed, 82 insertions(+), 133 deletions(-)
diff --git a/spacy/__init__.py b/spacy/__init__.py
index 2c063ce24..e4e1f6c8e 100644
--- a/spacy/__init__.py
+++ b/spacy/__init__.py
@@ -11,7 +11,7 @@ from . import pipeline
from .cli.info import info as cli_info
from .glossary import explain
from .about import __version__
-from .errors import Errors, Warnings, deprecation_warning
+from .errors import Errors, Warnings
from . import util
from .util import registry
from .language import component
@@ -27,7 +27,7 @@ config = registry
def load(name, **overrides):
depr_path = overrides.get("path")
if depr_path not in (True, False, None):
- deprecation_warning(Warnings.W001.format(path=depr_path))
+ warnings.warn(Warnings.W001.format(path=depr_path), DeprecationWarning)
return util.load_model(name, **overrides)
diff --git a/spacy/analysis.py b/spacy/analysis.py
index ed6d6b18e..c2600048f 100644
--- a/spacy/analysis.py
+++ b/spacy/analysis.py
@@ -1,7 +1,8 @@
from wasabi import Printer
+import warnings
from .tokens import Doc, Token, Span
-from .errors import Errors, Warnings, user_warning
+from .errors import Errors, Warnings
def analyze_pipes(pipeline, name, pipe, index, warn=True):
@@ -30,7 +31,7 @@ def analyze_pipes(pipeline, name, pipe, index, warn=True):
if not fulfilled:
problems.append(annot)
if warn:
- user_warning(Warnings.W025.format(name=name, attr=annot))
+ warnings.warn(Warnings.W025.format(name=name, attr=annot))
return problems
diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py
index babef106c..4b4949179 100644
--- a/spacy/cli/init_model.py
+++ b/spacy/cli/init_model.py
@@ -9,9 +9,10 @@ import gzip
import zipfile
import srsly
from wasabi import msg
+import warnings
from ..vectors import Vectors
-from ..errors import Errors, Warnings, user_warning
+from ..errors import Errors, Warnings
from ..util import ensure_path, get_lang_class
try:
@@ -227,7 +228,7 @@ def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
def read_clusters(clusters_loc):
clusters = {}
if ftfy is None:
- user_warning(Warnings.W004)
+ warnings.warn(Warnings.W004)
with clusters_loc.open() as f:
for line in tqdm(f):
try:
diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py
index e4a8ad666..36b34e5a0 100644
--- a/spacy/displacy/__init__.py
+++ b/spacy/displacy/__init__.py
@@ -4,9 +4,11 @@ spaCy's built in visualization suite for dependencies and named entities.
DOCS: https://spacy.io/api/top-level#displacy
USAGE: https://spacy.io/usage/visualizers
"""
+import warnings
+
from .render import DependencyRenderer, EntityRenderer
from ..tokens import Doc, Span
-from ..errors import Errors, Warnings, user_warning
+from ..errors import Errors, Warnings
from ..util import is_in_jupyter
@@ -85,7 +87,7 @@ def serve(
from wsgiref import simple_server
if is_in_jupyter():
- user_warning(Warnings.W011)
+ warnings.warn(Warnings.W011)
render(docs, style=style, page=page, minify=minify, options=options, manual=manual)
httpd = simple_server.make_server(host, port, app)
@@ -115,7 +117,7 @@ def parse_deps(orig_doc, options={}):
"""
doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes(exclude=["user_data"]))
if not doc.is_parsed:
- user_warning(Warnings.W005)
+ warnings.warn(Warnings.W005)
if options.get("collapse_phrases", False):
with doc.retokenize() as retokenizer:
for np in list(doc.noun_chunks):
@@ -173,7 +175,7 @@ def parse_ents(doc, options={}):
for ent in doc.ents
]
if not ents:
- user_warning(Warnings.W006)
+ warnings.warn(Warnings.W006)
title = doc.user_data.get("title", None) if hasattr(doc, "user_data") else None
settings = get_doc_settings(doc)
return {"text": doc.text, "ents": ents, "title": title, "settings": settings}
diff --git a/spacy/errors.py b/spacy/errors.py
index 6afbfc3c6..33603eb1f 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -1,8 +1,3 @@
-import os
-import warnings
-import inspect
-
-
def add_codes(err_cls):
"""Add error codes to string messages via class attribute names."""
@@ -583,64 +578,3 @@ class MatchPatternError(ValueError):
class AlignmentError(ValueError):
pass
-
-
-class ModelsWarning(UserWarning):
- pass
-
-
-WARNINGS = {
- "user": UserWarning,
- "deprecation": DeprecationWarning,
- "models": ModelsWarning,
-}
-
-
-def _get_warn_types(arg):
- if arg == "": # don't show any warnings
- return []
- if not arg or arg == "all": # show all available warnings
- return WARNINGS.keys()
- return [w_type.strip() for w_type in arg.split(",") if w_type.strip() in WARNINGS]
-
-
-def _get_warn_excl(arg):
- if not arg:
- return []
- return [w_id.strip() for w_id in arg.split(",")]
-
-
-SPACY_WARNING_FILTER = os.environ.get("SPACY_WARNING_FILTER")
-SPACY_WARNING_TYPES = _get_warn_types(os.environ.get("SPACY_WARNING_TYPES"))
-SPACY_WARNING_IGNORE = _get_warn_excl(os.environ.get("SPACY_WARNING_IGNORE"))
-
-
-def user_warning(message):
- _warn(message, "user")
-
-
-def deprecation_warning(message):
- _warn(message, "deprecation")
-
-
-def models_warning(message):
- _warn(message, "models")
-
-
-def _warn(message, warn_type="user"):
- """
- message (unicode): The message to display.
- category (Warning): The Warning to show.
- """
- if message.startswith("["):
- w_id = message.split("[", 1)[1].split("]", 1)[0] # get ID from string
- else:
- w_id = None
- ignore_warning = w_id and w_id in SPACY_WARNING_IGNORE
- if warn_type in SPACY_WARNING_TYPES and not ignore_warning:
- category = WARNINGS[warn_type]
- stack = inspect.stack()[-1]
- with warnings.catch_warnings():
- if SPACY_WARNING_FILTER:
- warnings.simplefilter(SPACY_WARNING_FILTER, category)
- warnings.warn_explicit(message, category, stack[1], stack[2])
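With the custom `SPACY_WARNING_FILTER` / `SPACY_WARNING_TYPES` / `SPACY_WARNING_IGNORE` machinery removed above, warning control falls back to the standard library. A minimal sketch of the equivalent stdlib filters (editor's note, not part of the patch):

```python
import warnings

# Silence spaCy deprecation warnings (e.g. W016) raised anywhere in the package.
warnings.filterwarnings("ignore", category=DeprecationWarning, module="spacy")

# Silence a single warning code by matching the "[Wxxx]" prefix of its message.
warnings.filterwarnings("ignore", message=r"\[W008\]")
```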
diff --git a/spacy/gold.pyx b/spacy/gold.pyx
index eca801176..37d092395 100644
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -7,10 +7,11 @@ import shutil
import itertools
from pathlib import Path
import srsly
+import warnings
from .syntax import nonproj
from .tokens import Doc, Span
-from .errors import Errors, AlignmentError, user_warning, Warnings
+from .errors import Errors, AlignmentError, Warnings
from . import util
@@ -550,7 +551,7 @@ def _json_iterate(loc):
py_raw = file_.read()
cdef long file_length = len(py_raw)
if file_length > 2 ** 30:
- user_warning(Warnings.W027.format(size=file_length))
+ warnings.warn(Warnings.W027.format(size=file_length))
raw = py_raw
cdef int square_depth = 0
diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index 64fbb1e29..797702d23 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -1,16 +1,18 @@
# cython: infer_types=True
# cython: profile=True
-from pathlib import Path
from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
from cpython.exc cimport PyErr_SetFromErrno
from libc.stdio cimport fopen, fclose, fread, fwrite, feof, fseek
from libc.stdint cimport int32_t, int64_t
-from os import path
from libcpp.vector cimport vector
+from pathlib import Path
+import warnings
+from os import path
+
from .typedefs cimport hash_t
-from .errors import Errors, Warnings, user_warning
+from .errors import Errors, Warnings
cdef class Candidate:
@@ -110,7 +112,7 @@ cdef class KnowledgeBase:
# Return if this entity was added before
if entity_hash in self._entry_index:
- user_warning(Warnings.W018.format(entity=entity))
+ warnings.warn(Warnings.W018.format(entity=entity))
return
# Raise an error if the provided entity vector is not of the correct length
@@ -142,7 +144,7 @@ cdef class KnowledgeBase:
# only process this entity if its unique ID hadn't been added before
entity_hash = self.vocab.strings.add(entity_list[i])
if entity_hash in self._entry_index:
- user_warning(Warnings.W018.format(entity=entity_list[i]))
+ warnings.warn(Warnings.W018.format(entity=entity_list[i]))
else:
entity_vector = vector_list[i]
@@ -190,7 +192,7 @@ cdef class KnowledgeBase:
# Check whether this alias was added before
if alias_hash in self._alias_index:
- user_warning(Warnings.W017.format(alias=alias))
+ warnings.warn(Warnings.W017.format(alias=alias))
return
cdef vector[int64_t] entry_indices
@@ -247,7 +249,7 @@ cdef class KnowledgeBase:
if is_present:
if not ignore_warnings:
- user_warning(Warnings.W024.format(entity=entity, alias=alias))
+ warnings.warn(Warnings.W024.format(entity=entity, alias=alias))
else:
entry_indices.push_back(int(entry_index))
alias_entry.entry_indices = entry_indices
diff --git a/spacy/language.py b/spacy/language.py
index af9f2c157..9f5f9d86a 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -5,6 +5,7 @@ import functools
from contextlib import contextmanager
from copy import copy, deepcopy
from pathlib import Path
+import warnings
from thinc.api import get_current_ops, Config
import srsly
@@ -26,7 +27,7 @@ from .lang.tokenizer_exceptions import TOKEN_MATCH
from .lang.tag_map import TAG_MAP
from .tokens import Doc
from .lang.lex_attrs import LEX_ATTRS, is_stop
-from .errors import Errors, Warnings, deprecation_warning, user_warning
+from .errors import Errors, Warnings
from . import util
from . import about
@@ -340,11 +341,11 @@ class Language(object):
if "model" in config:
model_cfg = config["model"]
if not isinstance(model_cfg, dict):
- user_warning(Warnings.W099.format(type=type(model_cfg), pipe=name))
+ warnings.warn(Warnings.W099.format(type=type(model_cfg), pipe=name))
model_cfg = None
del config["model"]
if model_cfg is None and default_config is not None:
- user_warning(Warnings.W098.format(name=name))
+ warnings.warn(Warnings.W098.format(name=name))
model_cfg = default_config["model"]
model = None
if model_cfg is not None:
@@ -779,7 +780,7 @@ class Language(object):
# raw_texts will be used later to stop iterator.
texts, raw_texts = itertools.tee(texts)
if n_threads != -1:
- deprecation_warning(Warnings.W016)
+ warnings.warn(Warnings.W016, DeprecationWarning)
if n_process == -1:
n_process = mp.cpu_count()
if as_tuples:
@@ -915,7 +916,7 @@ class Language(object):
DOCS: https://spacy.io/api/language#to_disk
"""
if disable is not None:
- deprecation_warning(Warnings.W014)
+ warnings.warn(Warnings.W014, DeprecationWarning)
exclude = disable
path = util.ensure_path(path)
serializers = {}
@@ -949,7 +950,7 @@ class Language(object):
DOCS: https://spacy.io/api/language#from_disk
"""
if disable is not None:
- deprecation_warning(Warnings.W014)
+ warnings.warn(Warnings.W014, DeprecationWarning)
exclude = disable
path = util.ensure_path(path)
deserializers = {}
@@ -987,7 +988,7 @@ class Language(object):
DOCS: https://spacy.io/api/language#to_bytes
"""
if disable is not None:
- deprecation_warning(Warnings.W014)
+ warnings.warn(Warnings.W014, DeprecationWarning)
exclude = disable
serializers = {}
serializers["vocab"] = lambda: self.vocab.to_bytes()
@@ -1013,7 +1014,7 @@ class Language(object):
DOCS: https://spacy.io/api/language#from_bytes
"""
if disable is not None:
- deprecation_warning(Warnings.W014)
+ warnings.warn(Warnings.W014, DeprecationWarning)
exclude = disable
deserializers = {}
deserializers["config.cfg"] = lambda b: self.config.from_bytes(b)
diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx
index 5910ebfe1..20e175f03 100644
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@@ -7,6 +7,7 @@ np.import_array()
import numpy
from thinc.api import get_array_module
+import warnings
from .typedefs cimport attr_t, flags_t
from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
@@ -15,7 +16,7 @@ from .attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT
from .attrs cimport IS_CURRENCY, IS_OOV, PROB
from .attrs import intify_attrs
-from .errors import Errors, Warnings, user_warning
+from .errors import Errors, Warnings
memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
@@ -124,7 +125,7 @@ cdef class Lexeme:
if self.c.orth == other[0].orth:
return 1.0
if self.vector_norm == 0 or other.vector_norm == 0:
- user_warning(Warnings.W008.format(obj="Lexeme"))
+ warnings.warn(Warnings.W008.format(obj="Lexeme"))
return 0.0
vector = self.vector
xp = get_array_module(vector)
diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx
index 4258fdb6a..735bc5a44 100644
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@@ -7,6 +7,7 @@ from murmurhash.mrmr cimport hash64
import re
import srsly
+import warnings
from ..typedefs cimport attr_t
from ..structs cimport TokenC
@@ -16,7 +17,7 @@ from ..tokens.token cimport Token
from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA
from ..schemas import validate_token_pattern
-from ..errors import Errors, MatchPatternError, Warnings, deprecation_warning
+from ..errors import Errors, MatchPatternError, Warnings
from ..strings import get_string_id
from ..attrs import IDS
@@ -188,7 +189,7 @@ cdef class Matcher:
YIELDS (Doc): Documents, in order.
"""
if n_threads != -1:
- deprecation_warning(Warnings.W016)
+ warnings.warn(Warnings.W016, DeprecationWarning)
if as_tuples:
for doc, context in docs:
diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx
index 961a318f6..b17a53e3a 100644
--- a/spacy/matcher/phrasematcher.pyx
+++ b/spacy/matcher/phrasematcher.pyx
@@ -1,16 +1,17 @@
# cython: infer_types=True
# cython: profile=True
from libc.stdint cimport uintptr_t
-
from preshed.maps cimport map_init, map_set, map_get, map_clear, map_iter
+import warnings
+
from ..attrs cimport ORTH, POS, TAG, DEP, LEMMA
from ..structs cimport TokenC
from ..tokens.token cimport Token
from ..typedefs cimport attr_t
from ..schemas import TokenPattern
-from ..errors import Errors, Warnings, deprecation_warning, user_warning
+from ..errors import Errors, Warnings
cdef class PhraseMatcher:
@@ -37,7 +38,7 @@ cdef class PhraseMatcher:
DOCS: https://spacy.io/api/phrasematcher#init
"""
if max_length != 0:
- deprecation_warning(Warnings.W010)
+ warnings.warn(Warnings.W010, DeprecationWarning)
self.vocab = vocab
self._callbacks = {}
self._docs = {}
@@ -193,7 +194,7 @@ cdef class PhraseMatcher:
if self._validate and (doc.is_tagged or doc.is_parsed) \
and self.attr not in (DEP, POS, TAG, LEMMA):
string_attr = self.vocab.strings[self.attr]
- user_warning(Warnings.W012.format(key=key, attr=string_attr))
+ warnings.warn(Warnings.W012.format(key=key, attr=string_attr))
keyword = self._convert_to_array(doc)
else:
keyword = doc
@@ -202,7 +203,7 @@ cdef class PhraseMatcher:
current_node = self.c_map
for token in keyword:
if token == self._terminal_hash:
- user_warning(Warnings.W021)
+ warnings.warn(Warnings.W021)
break
result = map_get(current_node, token)
if not result:
@@ -304,7 +305,7 @@ cdef class PhraseMatcher:
DOCS: https://spacy.io/api/phrasematcher#pipe
"""
if n_threads != -1:
- deprecation_warning(Warnings.W016)
+ warnings.warn(Warnings.W016, DeprecationWarning)
if as_tuples:
for doc, context in stream:
matches = self(doc)
diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
index 3003d118f..89870b121 100644
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -3,6 +3,7 @@ from libc.string cimport memset
import srsly
from collections import Counter
import numpy
+import warnings
from .strings import get_string_id
from . import symbols
@@ -11,7 +12,7 @@ from .attrs import LEMMA, intify_attrs
from .parts_of_speech cimport SPACE
from .parts_of_speech import IDS as POS_IDS
from .lexeme cimport Lexeme
-from .errors import Errors, Warnings, user_warning
+from .errors import Errors, Warnings
from .util import ensure_path
@@ -39,7 +40,7 @@ def _normalize_props(props):
elif isinstance(key, (int, str)) and isinstance(value, (int, str)):
out[key] = value
else:
- user_warning(Warnings.W028.format(feature={key: value}))
+ warnings.warn(Warnings.W028.format(feature={key: value}))
return out
@@ -109,7 +110,7 @@ cdef class Morphology:
return tag_ptr.key
features = self.feats_to_dict(features)
if not isinstance(features, dict):
- user_warning(Warnings.W028.format(feature=features))
+ warnings.warn(Warnings.W028.format(feature=features))
features = {}
features = _normalize_props(features)
string_features = {self.strings.as_string(field): self.strings.as_string(values) for field, values in features.items()}
diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index b9bf1ccd6..3b74d2960 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -5,6 +5,7 @@ import srsly
import random
from thinc.api import CosineDistance, to_categorical, get_array_module
from thinc.api import set_dropout_rate
+import warnings
from ..tokens.doc cimport Doc
from ..syntax.nn_parser cimport Parser
@@ -21,7 +22,7 @@ from ..attrs import POS, ID
from ..util import link_vectors_to_models, create_default_optimizer
from ..parts_of_speech import X
from ..kb import KnowledgeBase
-from ..errors import Errors, TempErrors, user_warning, Warnings
+from ..errors import Errors, TempErrors, Warnings
from .. import util
@@ -525,7 +526,7 @@ class Tagger(Pipe):
**kwargs):
lemma_tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"]
if not any(table in self.vocab.lookups for table in lemma_tables):
- user_warning(Warnings.W022)
+ warnings.warn(Warnings.W022)
orig_tag_map = dict(self.vocab.morphology.tag_map)
new_tag_map = {}
for example in get_examples():
diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index 9381fab6b..312ae9d61 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -19,6 +19,7 @@ from itertools import islice
import srsly
import numpy.random
import numpy
+import warnings
from ..gold import Example
from ..typedefs cimport weight_t, class_t, hash_t
@@ -31,7 +32,7 @@ from ..util import link_vectors_to_models, create_default_optimizer, registry
from ..compat import copy_array
from ..tokens.doc cimport Doc
from ..gold cimport GoldParse
-from ..errors import Errors, user_warning, Warnings
+from ..errors import Errors, Warnings
from .. import util
from .stateclass cimport StateClass
from ._state cimport StateC
diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py
index 4323bb736..87a8f4585 100644
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@@ -2,7 +2,6 @@ import pytest
import numpy
from spacy.tokens import Doc, Span
from spacy.vocab import Vocab
-from spacy.errors import ModelsWarning
from spacy.attrs import ENT_TYPE, ENT_IOB, SENT_START, HEAD, DEP
from ..util import get_doc
@@ -213,7 +212,7 @@ def test_doc_api_similarity_match():
assert doc.similarity(doc[0]) == 1.0
assert doc.similarity(doc.vocab["a"]) == 1.0
doc2 = Doc(doc.vocab, words=["a", "b", "c"])
- with pytest.warns(ModelsWarning):
+ with pytest.warns(UserWarning):
assert doc.similarity(doc2[:1]) == 1.0
assert doc.similarity(doc2) == 0.0
diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py
index d7b91d476..43c699d21 100644
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@@ -2,7 +2,6 @@ import pytest
from spacy.attrs import ORTH, LENGTH
from spacy.tokens import Doc, Span
from spacy.vocab import Vocab
-from spacy.errors import ModelsWarning
from spacy.util import filter_spans
from ..util import get_doc
@@ -121,7 +120,7 @@ def test_span_similarity_match():
doc = Doc(Vocab(), words=["a", "b", "a", "b"])
span1 = doc[:2]
span2 = doc[2:]
- with pytest.warns(ModelsWarning):
+ with pytest.warns(UserWarning):
assert span1.similarity(span2) == 1.0
assert span1.similarity(doc) == 0.0
assert span1[:1].similarity(doc.vocab["a"]) == 1.0
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 25d9f239d..f31c8a0e5 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -11,13 +11,14 @@ from preshed.maps cimport PreshMap
cimport cython
import re
+import warnings
from .tokens.doc cimport Doc
from .strings cimport hash_string
from .attrs import intify_attrs
from .symbols import ORTH
-from .errors import Errors, Warnings, deprecation_warning
+from .errors import Errors, Warnings
from . import util
from .attrs import intify_attrs
from .lexeme cimport EMPTY_LEXEME
@@ -128,7 +129,7 @@ cdef class Tokenizer:
return (self.__class__, args, None, None)
cpdef Doc tokens_from_list(self, list strings):
- deprecation_warning(Warnings.W002)
+ warnings.warn(Warnings.W002, DeprecationWarning)
return Doc(self.vocab, words=strings)
def __call__(self, unicode string):
@@ -216,7 +217,7 @@ cdef class Tokenizer:
DOCS: https://spacy.io/api/tokenizer#pipe
"""
if n_threads != -1:
- deprecation_warning(Warnings.W016)
+ warnings.warn(Warnings.W016, DeprecationWarning)
for text in texts:
yield self(text)
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 54d92f8b1..14c6d0bbb 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -13,6 +13,7 @@ import struct
import srsly
from thinc.api import get_array_module
from thinc.util import copy_array
+import warnings
from .span cimport Span
from .token cimport Token
@@ -26,7 +27,6 @@ from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t
from ..attrs import intify_attrs, IDS
from ..util import normalize_slice
from ..compat import copy_reg, pickle
-from ..errors import deprecation_warning, models_warning, user_warning
from ..errors import Errors, Warnings
from .. import util
from .underscore import Underscore, get_ext_args
@@ -388,9 +388,9 @@ cdef class Doc:
else:
return 1.0
if self.vocab.vectors.n_keys == 0:
- models_warning(Warnings.W007.format(obj="Doc"))
+ warnings.warn(Warnings.W007.format(obj="Doc"))
if self.vector_norm == 0 or other.vector_norm == 0:
- user_warning(Warnings.W008.format(obj="Doc"))
+ warnings.warn(Warnings.W008.format(obj="Doc"))
return 0.0
vector = self.vector
xp = get_array_module(vector)
@@ -1024,10 +1024,10 @@ cdef class Doc:
indices did not fall at token boundaries.
"""
cdef unicode tag, lemma, ent_type
- deprecation_warning(Warnings.W013.format(obj="Doc"))
+ warnings.warn(Warnings.W013.format(obj="Doc"), DeprecationWarning)
# TODO: ENT_KB_ID ?
if len(args) == 3:
- deprecation_warning(Warnings.W003)
+ warnings.warn(Warnings.W003, DeprecationWarning)
tag, lemma, ent_type = args
attributes[TAG] = tag
attributes[LEMMA] = lemma
@@ -1167,7 +1167,7 @@ cdef int set_children_from_heads(TokenC* tokens, int length) except -1:
while not heads_within_sents:
heads_within_sents = _set_lr_kids_and_edges(tokens, length, loop_count)
if loop_count > 10:
- user_warning(Warnings.W026)
+ warnings.warn(Warnings.W026)
loop_count += 1
# Set sentence starts
for i in range(length):
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index d6b50b5f4..b6ff763b0 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -6,6 +6,7 @@ import numpy
import numpy.linalg
from thinc.api import get_array_module
from collections import defaultdict
+import warnings
from .doc cimport token_by_start, token_by_end, get_token_attr, _get_lca_matrix
from .token cimport TokenC
@@ -18,8 +19,7 @@ from ..lexeme cimport Lexeme
from ..symbols cimport dep
from ..util import normalize_slice
-from ..errors import Errors, TempErrors, Warnings, user_warning, models_warning
-from ..errors import deprecation_warning
+from ..errors import Errors, TempErrors, Warnings
from .underscore import Underscore, get_ext_args
@@ -287,7 +287,7 @@ cdef class Span:
attributes are inherited from the syntactic root token of the span.
RETURNS (Token): The newly merged token.
"""
- deprecation_warning(Warnings.W013.format(obj="Span"))
+ warnings.warn(Warnings.W013.format(obj="Span"), DeprecationWarning)
return self.doc.merge(self.start_char, self.end_char, *args,
**attributes)
@@ -326,9 +326,9 @@ cdef class Span:
else:
return 1.0
if self.vocab.vectors.n_keys == 0:
- models_warning(Warnings.W007.format(obj="Span"))
+ warnings.warn(Warnings.W007.format(obj="Span"))
if self.vector_norm == 0.0 or other.vector_norm == 0.0:
- user_warning(Warnings.W008.format(obj="Span"))
+ warnings.warn(Warnings.W008.format(obj="Span"))
return 0.0
vector = self.vector
xp = get_array_module(vector)
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index 379da6c77..023581d1f 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -8,6 +8,7 @@ np.import_array()
import numpy
from thinc.api import get_array_module
+import warnings
from ..typedefs cimport hash_t
from ..lexeme cimport Lexeme
@@ -20,7 +21,7 @@ from ..symbols cimport conj
from .. import parts_of_speech
from .. import util
-from ..errors import Errors, Warnings, user_warning, models_warning
+from ..errors import Errors, Warnings
from .underscore import Underscore, get_ext_args
from .morphanalysis cimport MorphAnalysis
@@ -205,9 +206,9 @@ cdef class Token:
if self.c.lex.orth == other.orth:
return 1.0
if self.vocab.vectors.n_keys == 0:
- models_warning(Warnings.W007.format(obj="Token"))
+ warnings.warn(Warnings.W007.format(obj="Token"))
if self.vector_norm == 0 or other.vector_norm == 0:
- user_warning(Warnings.W008.format(obj="Token"))
+ warnings.warn(Warnings.W008.format(obj="Token"))
return 0.0
vector = self.vector
xp = get_array_module(vector)
diff --git a/spacy/util.py b/spacy/util.py
index 286a6574c..216158e52 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -13,6 +13,7 @@ import numpy.random
import srsly
import catalogue
import sys
+import warnings
try:
@@ -22,7 +23,7 @@ except ImportError:
from .symbols import ORTH
from .compat import cupy, CudaStream
-from .errors import Errors, Warnings, deprecation_warning, user_warning
+from .errors import Errors, Warnings
_PRINT_ENV = False
@@ -731,7 +732,7 @@ def get_serialization_exclude(serializers, exclude, kwargs):
options = [name.split(".")[0] for name in serializers]
for key, value in kwargs.items():
if key in ("vocab",) and value is False:
- deprecation_warning(Warnings.W015.format(arg=key))
+ warnings.warn(Warnings.W015.format(arg=key), DeprecationWarning)
exclude.append(key)
elif key.split(".")[0] in options:
raise ValueError(Errors.E128.format(arg=key))
@@ -776,7 +777,7 @@ def link_vectors_to_models(vocab):
if vectors.name is None:
vectors.name = VECTORS_KEY
if vectors.data.size != 0:
- user_warning(Warnings.W020.format(shape=vectors.data.shape))
+ warnings.warn(Warnings.W020.format(shape=vectors.data.shape))
for word in vocab:
if word.orth in vectors.key2row:
word.rank = vectors.key2row[word.orth]
From 7efaa76168103b4c6e13d1852d805e34418666cf Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Fri, 28 Feb 2020 12:23:31 +0100
Subject: [PATCH 089/496] Update errors.py
---
spacy/errors.py | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/spacy/errors.py b/spacy/errors.py
index 33603eb1f..947898b31 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -86,9 +86,8 @@ class Warnings(object):
"lemmatization rules or data. This means that the trained model "
"may not be able to lemmatize correctly. If this is intentional "
"or the language you're using doesn't have lemmatization data, "
- "you can ignore this warning by setting SPACY_WARNING_IGNORE=W022. "
- "If this is surprising, make sure you have the spacy-lookups-data "
- "package installed.")
+ "you can ignore this warning. If this is surprising, make sure you "
+ "have the spacy-lookups-data package installed.")
W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in "
"the Knowledge Base.")
W025 = ("'{name}' requires '{attr}' to be assigned, but none of the "
From 648f61d07710f53b2972d1925f59d23a0f9247e4 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Mon, 2 Mar 2020 11:48:10 +0100
Subject: [PATCH 090/496] Tidy up compiler flags and imports (#5071)
---
spacy/gold.pxd | 3 ++-
spacy/kb.pxd | 4 ++--
spacy/kb.pyx | 9 +++++----
spacy/lexeme.pxd | 5 ++---
spacy/matcher/dependencymatcher.pyx | 8 ++++----
spacy/matcher/matcher.pyx | 3 +--
spacy/matcher/phrasematcher.pxd | 1 -
spacy/matcher/phrasematcher.pyx | 4 +---
spacy/morphology.pxd | 6 +++---
spacy/morphology.pyx | 10 ++++++----
spacy/pipeline/morphologizer.pyx | 16 ++++++++--------
spacy/pipeline/pipes.pyx | 3 +--
spacy/strings.pxd | 1 -
spacy/strings.pyx | 4 +++-
spacy/structs.pxd | 6 ++----
spacy/syntax/_beam_utils.pyx | 13 ++++++-------
spacy/syntax/_parser_model.pyx | 20 ++++++++++----------
spacy/syntax/_state.pxd | 2 --
spacy/syntax/arc_eager.pxd | 6 +-----
spacy/syntax/arc_eager.pyx | 21 +++++++++++----------
spacy/syntax/ner.pyx | 2 ++
spacy/syntax/nn_parser.pyx | 21 ++++++++++-----------
spacy/syntax/nonproj.pyx | 6 +++---
spacy/syntax/transition_system.pxd | 1 -
spacy/syntax/transition_system.pyx | 4 +++-
spacy/tokenizer.pxd | 1 -
spacy/tokenizer.pyx | 7 +++----
spacy/tokens/_retokenize.pyx | 4 +---
spacy/tokens/doc.pyx | 6 ++----
spacy/tokens/span.pyx | 1 +
spacy/tokens/token.pxd | 1 +
spacy/tokens/token.pyx | 2 +-
spacy/vocab.pxd | 1 -
33 files changed, 95 insertions(+), 107 deletions(-)
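The recurring change in this patch is mechanical: per-file Cython compiler directives are collapsed into a single header comment, and cimports are grouped ahead of Python imports. The directive consolidation follows this pattern (sketch; comma-separated directives after one `# cython:` prefix are equivalent to separate lines):

```python
# Before: one directive comment per line
# cython: infer_types=True
# cython: cdivision=True
# cython: boundscheck=False

# After: a single consolidated header, as applied throughout this patch
# cython: infer_types=True, cdivision=True, boundscheck=False
```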
diff --git a/spacy/gold.pxd b/spacy/gold.pxd
index aea691130..c5ab6ebbe 100644
--- a/spacy/gold.pxd
+++ b/spacy/gold.pxd
@@ -1,9 +1,10 @@
from cymem.cymem cimport Pool
-from .tokens import Doc
from .typedefs cimport attr_t
from .syntax.transition_system cimport Transition
+from .tokens import Doc
+
cdef struct GoldParseC:
int* tags
diff --git a/spacy/kb.pxd b/spacy/kb.pxd
index 518ce0f4e..53038b5db 100644
--- a/spacy/kb.pxd
+++ b/spacy/kb.pxd
@@ -1,15 +1,15 @@
"""Knowledge-base for entity or concept linking."""
from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
-
from libcpp.vector cimport vector
from libc.stdint cimport int32_t, int64_t
from libc.stdio cimport FILE
from .vocab cimport Vocab
from .typedefs cimport hash_t
-
from .structs cimport KBEntryC, AliasC
+
+
ctypedef vector[KBEntryC] entry_vec
ctypedef vector[AliasC] alias_vec
ctypedef vector[float] float_vec
diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index 64fbb1e29..4d6b47c55 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -1,15 +1,16 @@
-# cython: infer_types=True
-# cython: profile=True
-from pathlib import Path
+# cython: infer_types=True, profile=True
from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
from cpython.exc cimport PyErr_SetFromErrno
from libc.stdio cimport fopen, fclose, fread, fwrite, feof, fseek
from libc.stdint cimport int32_t, int64_t
-from os import path
from libcpp.vector cimport vector
+from pathlib import Path
+from os import path
+
from .typedefs cimport hash_t
+
from .errors import Errors, Warnings, user_warning
diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd
index 048f8016e..e73f1e700 100644
--- a/spacy/lexeme.pxd
+++ b/spacy/lexeme.pxd
@@ -1,13 +1,12 @@
+from numpy cimport ndarray
+
from .typedefs cimport attr_t, hash_t, flags_t, len_t, tag_t
from .attrs cimport attr_id_t
from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER, LANG
-
from .structs cimport LexemeC, SerializedLexemeC
from .strings cimport StringStore
from .vocab cimport Vocab
-from numpy cimport ndarray
-
cdef LexemeC EMPTY_LEXEME
diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx
index f94c66cb0..ff707a71c 100644
--- a/spacy/matcher/dependencymatcher.pyx
+++ b/spacy/matcher/dependencymatcher.pyx
@@ -1,7 +1,9 @@
-# cython: infer_types=True
-# cython: profile=True
+# cython: infer_types=True, profile=True
from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
+from libcpp cimport bool
+
+import numpy
from .matcher cimport Matcher
from ..vocab cimport Vocab
@@ -10,8 +12,6 @@ from ..tokens.doc cimport Doc
from .matcher import unpickle_matcher
from ..errors import Errors
-from libcpp cimport bool
-import numpy
DELIMITER = "||"
INDEX_HEAD = 1
diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx
index 4258fdb6a..9dcf0ded9 100644
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@@ -1,5 +1,4 @@
-# cython: infer_types=True
-# cython: profile=True
+# cython: infer_types=True, profile=True
from libcpp.vector cimport vector
from libc.stdint cimport int32_t
from cymem.cymem cimport Pool
diff --git a/spacy/matcher/phrasematcher.pxd b/spacy/matcher/phrasematcher.pxd
index a8e5e5085..3b42f3fab 100644
--- a/spacy/matcher/phrasematcher.pxd
+++ b/spacy/matcher/phrasematcher.pxd
@@ -1,5 +1,4 @@
from libcpp.vector cimport vector
-
from cymem.cymem cimport Pool
from preshed.maps cimport key_t, MapStruct
diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx
index 961a318f6..297b05fbc 100644
--- a/spacy/matcher/phrasematcher.pyx
+++ b/spacy/matcher/phrasematcher.pyx
@@ -1,7 +1,5 @@
-# cython: infer_types=True
-# cython: profile=True
+# cython: infer_types=True, profile=True
from libc.stdint cimport uintptr_t
-
from preshed.maps cimport map_init, map_set, map_get, map_clear, map_iter
from ..attrs cimport ORTH, POS, TAG, DEP, LEMMA
diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd
index 1e8c255b8..c57e3a1db 100644
--- a/spacy/morphology.pxd
+++ b/spacy/morphology.pxd
@@ -8,14 +8,14 @@ from .structs cimport TokenC, MorphAnalysisC
from .strings cimport StringStore
from .typedefs cimport hash_t, attr_t, flags_t
from .parts_of_speech cimport univ_pos_t
-
from . cimport symbols
+
cdef class Morphology:
cdef readonly Pool mem
cdef readonly StringStore strings
cdef PreshMap tags # Keyed by hash, value is pointer to tag
-
+
cdef public object lemmatizer
cdef readonly object tag_map
cdef readonly object tag_names
@@ -26,7 +26,7 @@ cdef class Morphology:
cdef MorphAnalysisC create_morph_tag(self, field_feature_pairs) except *
cdef int insert(self, MorphAnalysisC tag) except -1
-
+
cdef int assign_untagged(self, TokenC* token) except -1
cdef int assign_tag(self, TokenC* token, tag) except -1
cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1
diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
index 3003d118f..47df5800e 100644
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -1,18 +1,20 @@
# cython: infer_types
from libc.string cimport memset
+
import srsly
from collections import Counter
import numpy
-from .strings import get_string_id
-from . import symbols
from .attrs cimport POS, IS_SPACE
-from .attrs import LEMMA, intify_attrs
from .parts_of_speech cimport SPACE
-from .parts_of_speech import IDS as POS_IDS
from .lexeme cimport Lexeme
+
+from .strings import get_string_id
+from .attrs import LEMMA, intify_attrs
+from .parts_of_speech import IDS as POS_IDS
from .errors import Errors, Warnings, user_warning
from .util import ensure_path
+from . import symbols
def _normalize_props(props):
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index b6a6045d1..be9b166bf 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -1,20 +1,20 @@
-from collections import defaultdict
-
-import numpy
cimport numpy as np
+import numpy
+from collections import defaultdict
from thinc.api import chain, list2array, to_categorical, get_array_module
from thinc.util import copy_array
-from .. import util
-from .pipes import Pipe
-from ..language import component
-from ..util import link_vectors_to_models, create_default_optimizer
-from ..errors import Errors, TempErrors
from ..tokens.doc cimport Doc
from ..vocab cimport Vocab
from ..morphology cimport Morphology
+from .. import util
+from ..language import component
+from ..util import link_vectors_to_models, create_default_optimizer
+from ..errors import Errors, TempErrors
+from .pipes import Pipe
+
@component("morphologizer", assigns=["token.morph", "token.pos"])
class Morphologizer(Pipe):
diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index b9bf1ccd6..b0cb8585f 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -1,5 +1,4 @@
-# cython: infer_types=True
-# cython: profile=True
+# cython: infer_types=True, profile=True
import numpy
import srsly
import random
diff --git a/spacy/strings.pxd b/spacy/strings.pxd
index e436fb33b..ba2476ec7 100644
--- a/spacy/strings.pxd
+++ b/spacy/strings.pxd
@@ -1,7 +1,6 @@
from libc.stdint cimport int64_t
from libcpp.vector cimport vector
from libcpp.set cimport set
-
from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
from murmurhash.mrmr cimport hash64
diff --git a/spacy/strings.pyx b/spacy/strings.pyx
index 0605de96c..a30f11729 100644
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@@ -4,11 +4,13 @@ from libc.string cimport memcpy
from libcpp.set cimport set
from libc.stdint cimport uint32_t
from murmurhash.mrmr cimport hash64, hash32
+
import srsly
+from .typedefs cimport hash_t
+
from .symbols import IDS as SYMBOLS_BY_STR
from .symbols import NAMES as SYMBOLS_BY_INT
-from .typedefs cimport hash_t
from .errors import Errors
from . import util
diff --git a/spacy/structs.pxd b/spacy/structs.pxd
index 259fd657d..f140a4220 100644
--- a/spacy/structs.pxd
+++ b/spacy/structs.pxd
@@ -1,11 +1,9 @@
from libc.stdint cimport uint8_t, uint32_t, int32_t, uint64_t
-
-from .typedefs cimport flags_t, attr_t, hash_t
-from .parts_of_speech cimport univ_pos_t
-
from libcpp.vector cimport vector
from libc.stdint cimport int32_t, int64_t
+from .typedefs cimport flags_t, attr_t, hash_t
+from .parts_of_speech cimport univ_pos_t
cdef struct LexemeC:
diff --git a/spacy/syntax/_beam_utils.pyx b/spacy/syntax/_beam_utils.pyx
index 32cf9193a..03702e54e 100644
--- a/spacy/syntax/_beam_utils.pyx
+++ b/spacy/syntax/_beam_utils.pyx
@@ -1,18 +1,19 @@
-# cython: infer_types=True
-# cython: profile=True
+# cython: infer_types=True, profile=True
cimport numpy as np
-import numpy
from cpython.ref cimport PyObject, Py_XDECREF
from thinc.extra.search cimport Beam
-from thinc.extra.search import MaxViolation
from thinc.extra.search cimport MaxViolation
+from thinc.extra.search import MaxViolation
+import numpy
+
from ..typedefs cimport hash_t, class_t
from .transition_system cimport TransitionSystem, Transition
from ..gold cimport GoldParse
-from ..errors import Errors
from .stateclass cimport StateC, StateClass
+from ..errors import Errors
+
# These are passed as callbacks to thinc.search.Beam
cdef int transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1:
@@ -326,5 +327,3 @@ def cleanup_beam(Beam beam):
seen.add(addr)
else:
raise ValueError(Errors.E023.format(addr=addr, i=i))
-
-
diff --git a/spacy/syntax/_parser_model.pyx b/spacy/syntax/_parser_model.pyx
index 7ff9517a5..e36a2a28b 100644
--- a/spacy/syntax/_parser_model.pyx
+++ b/spacy/syntax/_parser_model.pyx
@@ -1,9 +1,5 @@
-# cython: infer_types=True
-# cython: cdivision=True
-# cython: boundscheck=False
-import numpy
+# cython: infer_types=True, cdivision=True, boundscheck=False
cimport cython.parallel
-import numpy.random
cimport numpy as np
from libc.math cimport exp
from libcpp.vector cimport vector
@@ -11,21 +7,25 @@ from libc.string cimport memset, memcpy
from libc.stdlib cimport calloc, free, realloc
from cymem.cymem cimport Pool
from thinc.extra.search cimport Beam
-from thinc.api import Linear, Model, CupyOps, NumpyOps, use_ops
from thinc.backends.linalg cimport Vec, VecVec
cimport blis.cy
+import numpy
+import numpy.random
+from thinc.api import Linear, Model, CupyOps, NumpyOps, use_ops
+
from ..typedefs cimport weight_t, class_t, hash_t
-from ..compat import copy_array
from ..tokens.doc cimport Doc
from ..gold cimport GoldParse
-from ..errors import Errors, TempErrors
-from .. import util
from .stateclass cimport StateClass
from .transition_system cimport Transition
+
+from ..compat import copy_array
+from ..errors import Errors, TempErrors
+from ..util import link_vectors_to_models, create_default_optimizer
+from .. import util
from . import _beam_utils
from . import nonproj
-from ..util import link_vectors_to_models, create_default_optimizer
cdef WeightsC get_c_weights(model) except *:
diff --git a/spacy/syntax/_state.pxd b/spacy/syntax/_state.pxd
index 141d796a4..fef4f0c92 100644
--- a/spacy/syntax/_state.pxd
+++ b/spacy/syntax/_state.pxd
@@ -1,9 +1,7 @@
from libc.string cimport memcpy, memset, memmove
from libc.stdlib cimport malloc, calloc, free
from libc.stdint cimport uint32_t, uint64_t
-
from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
-
from murmurhash.mrmr cimport hash64
from ..vocab cimport EMPTY_LEXEME
diff --git a/spacy/syntax/arc_eager.pxd b/spacy/syntax/arc_eager.pxd
index 9e9593eee..14d706548 100644
--- a/spacy/syntax/arc_eager.pxd
+++ b/spacy/syntax/arc_eager.pxd
@@ -1,10 +1,7 @@
from cymem.cymem cimport Pool
-from ..typedefs cimport weight_t
-
from .stateclass cimport StateClass
-from ..typedefs cimport attr_t
-
+from ..typedefs cimport weight_t, attr_t
from .transition_system cimport TransitionSystem, Transition
from ..gold cimport GoldParseC
@@ -15,4 +12,3 @@ cdef class ArcEager(TransitionSystem):
cdef weight_t push_cost(StateClass stcls, const GoldParseC* gold, int target) nogil
cdef weight_t arc_cost(StateClass stcls, const GoldParseC* gold, int head, int child) nogil
-
diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx
index 5ec169428..19be95f3f 100644
--- a/spacy/syntax/arc_eager.pyx
+++ b/spacy/syntax/arc_eager.pyx
@@ -1,23 +1,24 @@
-# cython: profile=True
-# cython: cdivision=True
-# cython: infer_types=True
+# cython: profile=True, cdivision=True, infer_types=True
from cpython.ref cimport Py_INCREF
from cymem.cymem cimport Pool
-from collections import defaultdict, Counter
from thinc.extra.search cimport Beam
+
+from collections import defaultdict, Counter
import json
-from .nonproj import is_nonproj_tree
from ..typedefs cimport hash_t, attr_t
from ..strings cimport hash_string
-from .stateclass cimport StateClass
-from ._state cimport StateC
-from . import nonproj
-from .transition_system cimport move_cost_func_t, label_cost_func_t
from ..gold cimport GoldParse, GoldParseC
from ..structs cimport TokenC
-from ..errors import Errors
from ..tokens.doc cimport Doc, set_children_from_heads
+from .stateclass cimport StateClass
+from ._state cimport StateC
+from .transition_system cimport move_cost_func_t, label_cost_func_t
+
+from ..errors import Errors
+from .nonproj import is_nonproj_tree
+from . import nonproj
+
# Calculate cost as gold/not gold. We don't use scalar value anyway.
cdef int BINARY_COSTS = 1
diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx
index 50b916fe2..ff74be601 100644
--- a/spacy/syntax/ner.pyx
+++ b/spacy/syntax/ner.pyx
@@ -1,4 +1,5 @@
from thinc.extra.search cimport Beam
+
from collections import Counter
from ..typedefs cimport weight_t
@@ -9,6 +10,7 @@ from .transition_system cimport do_func_t
from ..gold cimport GoldParseC, GoldParse
from ..lexeme cimport Lexeme
from ..attrs cimport IS_SPACE
+
from ..errors import Errors
diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index 9381fab6b..cf5414628 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -1,6 +1,4 @@
-# cython: infer_types=True
-# cython: cdivision=True
-# cython: boundscheck=False
+# cython: infer_types=True, cdivision=True, boundscheck=False
cimport cython.parallel
cimport numpy as np
from cpython.ref cimport PyObject, Py_XDECREF
@@ -20,23 +18,24 @@ import srsly
import numpy.random
import numpy
-from ..gold import Example
+from ..tokens.doc cimport Doc
+from ..gold cimport GoldParse
from ..typedefs cimport weight_t, class_t, hash_t
from ._parser_model cimport alloc_activations, free_activations
from ._parser_model cimport predict_states, arg_max_if_valid
from ._parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss
from ._parser_model cimport get_c_weights, get_c_sizes
-from ._parser_model import ParserModel
-from ..util import link_vectors_to_models, create_default_optimizer, registry
-from ..compat import copy_array
-from ..tokens.doc cimport Doc
-from ..gold cimport GoldParse
-from ..errors import Errors, user_warning, Warnings
-from .. import util
from .stateclass cimport StateClass
from ._state cimport StateC
from .transition_system cimport Transition
from . cimport _beam_utils
+
+from ..gold import Example
+from ..util import link_vectors_to_models, create_default_optimizer, registry
+from ..compat import copy_array
+from ..errors import Errors, user_warning, Warnings
+from .. import util
+from ._parser_model import ParserModel
from . import _beam_utils
from . import nonproj
diff --git a/spacy/syntax/nonproj.pyx b/spacy/syntax/nonproj.pyx
index 27516ffd9..1edb2e65c 100644
--- a/spacy/syntax/nonproj.pyx
+++ b/spacy/syntax/nonproj.pyx
@@ -1,13 +1,13 @@
-# cython: profile=True
-# cython: infer_types=True
+# cython: profile=True, infer_types=True
"""Implements the projectivize/deprojectivize mechanism in Nivre & Nilsson 2005
for doing pseudo-projective parsing implementation uses the HEAD decoration
scheme.
"""
from copy import copy
-from ..gold import Example
from ..tokens.doc cimport Doc, set_children_from_heads
+
+from ..gold import Example
from ..errors import Errors
diff --git a/spacy/syntax/transition_system.pxd b/spacy/syntax/transition_system.pxd
index bd706a997..5fd3b5c5f 100644
--- a/spacy/syntax/transition_system.pxd
+++ b/spacy/syntax/transition_system.pxd
@@ -5,7 +5,6 @@ from ..structs cimport TokenC
from ..gold cimport GoldParse
from ..gold cimport GoldParseC
from ..strings cimport StringStore
-
from .stateclass cimport StateClass
from ._state cimport StateC
diff --git a/spacy/syntax/transition_system.pyx b/spacy/syntax/transition_system.pyx
index 6ab83436e..78017c84a 100644
--- a/spacy/syntax/transition_system.pyx
+++ b/spacy/syntax/transition_system.pyx
@@ -1,16 +1,18 @@
# cython: infer_types=True
from cpython.ref cimport Py_INCREF
from cymem.cymem cimport Pool
-from ..typedefs cimport weight_t
from thinc.extra.search cimport Beam
+
from collections import Counter
import srsly
+from ..typedefs cimport weight_t
from . cimport _beam_utils
from ..tokens.doc cimport Doc
from ..structs cimport TokenC
from .stateclass cimport StateClass
from ..typedefs cimport attr_t
+
from ..errors import Errors
from .. import util
diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd
index ba22f7782..e82833701 100644
--- a/spacy/tokenizer.pxd
+++ b/spacy/tokenizer.pxd
@@ -1,5 +1,4 @@
from libcpp.vector cimport vector
-
from preshed.maps cimport PreshMap
from cymem.cymem cimport Pool
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 25d9f239d..20557366e 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -1,5 +1,4 @@
-# cython: embedsignature=True
-# cython: profile=True
+# cython: embedsignature=True, profile=True
from __future__ import unicode_literals
from cython.operator cimport dereference as deref
@@ -14,13 +13,13 @@ import re
from .tokens.doc cimport Doc
from .strings cimport hash_string
+from .lexeme cimport EMPTY_LEXEME
+
from .attrs import intify_attrs
from .symbols import ORTH
-
from .errors import Errors, Warnings, deprecation_warning
from . import util
from .attrs import intify_attrs
-from .lexeme cimport EMPTY_LEXEME
from .symbols import ORTH
diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx
index 337c154a2..8df38965d 100644
--- a/spacy/tokens/_retokenize.pyx
+++ b/spacy/tokens/_retokenize.pyx
@@ -1,6 +1,4 @@
-# cython: infer_types=True
-# cython: bounds_check=False
-# cython: profile=True
+# cython: infer_types=True, bounds_check=False, profile=True
from libc.string cimport memcpy, memset
from libc.stdlib cimport malloc, free
from cymem.cymem cimport Pool
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 54d92f8b1..6206a4810 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -1,12 +1,10 @@
-# cython: infer_types=True
-# cython: bounds_check=False
-# cython: profile=True
+# cython: infer_types=True, bounds_check=False, profile=True
cimport cython
cimport numpy as np
from libc.string cimport memcpy, memset
from libc.math cimport sqrt
-from collections import Counter
+from collections import Counter
import numpy
import numpy.linalg
import struct
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index d6b50b5f4..bca69461f 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -1,4 +1,5 @@
from __future__ import unicode_literals
+
cimport numpy as np
from libc.math cimport sqrt
diff --git a/spacy/tokens/token.pxd b/spacy/tokens/token.pxd
index 0d25974f3..45c906a82 100644
--- a/spacy/tokens/token.pxd
+++ b/spacy/tokens/token.pxd
@@ -6,6 +6,7 @@ from ..typedefs cimport attr_t, flags_t
from ..parts_of_speech cimport univ_pos_t
from .doc cimport Doc
from ..lexeme cimport Lexeme
+
from ..errors import Errors
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index 379da6c77..a450a9154 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -17,12 +17,12 @@ from ..attrs cimport IS_OOV, IS_TITLE, IS_UPPER, IS_CURRENCY, LIKE_URL, LIKE_NUM
from ..attrs cimport IS_STOP, ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX
from ..attrs cimport LENGTH, CLUSTER, LEMMA, POS, TAG, DEP
from ..symbols cimport conj
+from .morphanalysis cimport MorphAnalysis
from .. import parts_of_speech
from .. import util
from ..errors import Errors, Warnings, user_warning, models_warning
from .underscore import Underscore, get_ext_args
-from .morphanalysis cimport MorphAnalysis
cdef class Token:
diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd
index d989d6c40..a95ffb11a 100644
--- a/spacy/vocab.pxd
+++ b/spacy/vocab.pxd
@@ -1,5 +1,4 @@
from libcpp.vector cimport vector
-
from preshed.maps cimport PreshMap
from cymem.cymem cimport Pool
from murmurhash.mrmr cimport hash64
From 6ac9fc06192c0cdb3ef06f3dcd8f5bee4e39e6b1 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem
Date: Fri, 6 Mar 2020 14:42:23 +0100
Subject: [PATCH 091/496] Unit test for NEL functionality (#5114)
* empty begin_training for sentencizer
* overfitting unit test for entity linker
* fixed NEL IO by storing the entity_vector_length in the cfg
---
spacy/pipeline/pipes.pyx | 6 ++
spacy/tests/pipeline/test_entity_linker.py | 72 ++++++++++++++++++++++
2 files changed, 78 insertions(+)
diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index 0b1bd8ccf..4ee470606 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -1490,6 +1490,7 @@ class EntityLinker(Pipe):
def to_disk(self, path, exclude=tuple(), **kwargs):
serialize = {}
+ self.cfg["entity_width"] = self.kb.entity_vector_length
serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
serialize["vocab"] = lambda p: self.vocab.to_disk(p)
serialize["kb"] = lambda p: self.kb.dump(p)
@@ -1561,6 +1562,11 @@ class Sentencizer(Pipe):
def from_nlp(cls, nlp, model=None, **cfg):
return cls(**cfg)
+ def begin_training(
+ self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs
+ ):
+ pass
+
def __call__(self, example):
"""Apply the sentencizer to a Doc and set Token.is_sent_start.
diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index 9ff5f8194..cdd8451fd 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -1,8 +1,11 @@
import pytest
from spacy.kb import KnowledgeBase
+
+from spacy import util
from spacy.lang.en import English
from spacy.pipeline import EntityRuler
+from spacy.tests.util import make_tempdir
from spacy.tokens import Span
@@ -245,3 +248,72 @@ def test_preserving_links_ents_2(nlp):
assert len(list(doc.ents)) == 1
assert list(doc.ents)[0].label_ == "LOC"
assert list(doc.ents)[0].kb_id_ == "Q1"
+
+
+# fmt: off
+TRAIN_DATA = [
+ ("Russ Cochran captured his first major title with his son as caddie.", {"links": {(0, 12): {"Q7381115": 0.0, "Q2146908": 1.0}}}),
+ ("Russ Cochran his reprints include EC Comics.", {"links": {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}}}),
+ ("Russ Cochran has been publishing comic art.", {"links": {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}}}),
+ ("Russ Cochran was a member of University of Kentucky's golf team.", {"links": {(0, 12): {"Q7381115": 0.0, "Q2146908": 1.0}}}),
+]
+GOLD_entities = ["Q2146908", "Q7381115", "Q7381115", "Q2146908"]
+# fmt: on
+
+
+def test_overfitting_IO():
+ # Simple test that tries to quickly overfit the NEL component, ensuring the ML models work correctly
+ nlp = English()
+ nlp.add_pipe(nlp.create_pipe("sentencizer"))
+
+ # Add a custom component to recognize "Russ Cochran" as an entity for the example training data
+ ruler = EntityRuler(nlp)
+ patterns = [{"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}]
+ ruler.add_patterns(patterns)
+ nlp.add_pipe(ruler)
+
+ # Convert the texts to docs to make sure we have doc.ents set for the training examples
+ TRAIN_DOCS = []
+ for text, annotation in TRAIN_DATA:
+ doc = nlp(text)
+ annotation_clean = annotation
+ TRAIN_DOCS.append((doc, annotation_clean))
+
+ # create artificial KB - assign the same prior weight to the two Russ Cochrans
+ # Q2146908 (Russ Cochran): American golfer
+ # Q7381115 (Russ Cochran): publisher
+ mykb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
+ mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
+ mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7])
+ mykb.add_alias(alias="Russ Cochran", entities=["Q2146908", "Q7381115"], probabilities=[0.5, 0.5])
+
+ # Create the Entity Linker component and add it to the pipeline
+ entity_linker = nlp.create_pipe("entity_linker")
+ entity_linker.set_kb(mykb)
+ nlp.add_pipe(entity_linker, last=True)
+
+ # train the NEL pipe
+ optimizer = nlp.begin_training()
+ for i in range(50):
+ losses = {}
+ nlp.update(TRAIN_DOCS, sgd=optimizer, losses=losses)
+ assert losses["entity_linker"] < 0.001
+
+ # test the trained model
+ predictions = []
+ for text, annotation in TRAIN_DATA:
+ doc = nlp(text)
+ for ent in doc.ents:
+ predictions.append(ent.kb_id_)
+ assert predictions == GOLD_entities
+
+ # Also test the results are still the same after IO
+ with make_tempdir() as tmp_dir:
+ nlp.to_disk(tmp_dir)
+ nlp2 = util.load_model_from_path(tmp_dir)
+ predictions = []
+ for text, annotation in TRAIN_DATA:
+ doc2 = nlp2(text)
+ for ent in doc2.ents:
+ predictions.append(ent.kb_id_)
+ assert predictions == GOLD_entities
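The "artificial KB" built in the test above is the core of the setup: two entities share one alias with equal prior probabilities, so the linker has to learn to disambiguate from context. A minimal sketch of that KB in isolation (editor's note; assuming `KnowledgeBase.get_candidates` and the `Candidate` attributes behave as in spaCy's KB API):

```python
from spacy.kb import KnowledgeBase
from spacy.lang.en import English

nlp = English()
kb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
kb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])   # the golfer
kb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7])   # the publisher
kb.add_alias(alias="Russ Cochran", entities=["Q2146908", "Q7381115"], probabilities=[0.5, 0.5])

# Both entities come back as candidates for the alias, carrying the priors set above.
for candidate in kb.get_candidates("Russ Cochran"):
    print(candidate.entity_, candidate.prior_prob)
```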
From c95ce96c448bd3d3e2a167bd7e7eaee1611c11b0 Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Fri, 6 Mar 2020 14:45:02 +0100
Subject: [PATCH 092/496] Update sentence recognizer (#5109)
* Update sentence recognizer
* rename `sentrec` to `senter`
* use `spacy.HashEmbedCNN.v1` by default
* update to follow `Tagger` modifications
* remove component methods that can be inherited from `Tagger`
* add simple initialization and overfitting pipeline tests
* Update serialization test for senter
---
spacy/cli/train.py | 16 +++---
spacy/language.py | 8 +--
spacy/ml/models/defaults/__init__.py | 8 +--
...ntrec_defaults.cfg => senter_defaults.cfg} | 4 +-
spacy/pipeline/pipes.pyx | 28 ++--------
spacy/tests/pipeline/test_senter.py | 52 +++++++++++++++++++
.../serialize/test_serialize_pipeline.py | 6 +--
7 files changed, 77 insertions(+), 45 deletions(-)
rename spacy/ml/models/defaults/{sentrec_defaults.cfg => senter_defaults.cfg} (75%)
create mode 100644 spacy/tests/pipeline/test_senter.py
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 1ca678b85..7eb9bbd3c 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -157,6 +157,8 @@ def train(
config_loc = default_dir / "ner_defaults.cfg"
elif pipe == "textcat":
config_loc = default_dir / "textcat_defaults.cfg"
+ elif pipe == "senter":
+ config_loc = default_dir / "senter_defaults.cfg"
else:
raise ValueError(f"Component {pipe} currently not supported.")
pipe_cfg = util.load_config(config_loc, create_objects=False)
@@ -221,6 +223,8 @@ def train(
config_loc = default_dir / "ner_defaults.cfg"
elif pipe == "textcat":
config_loc = default_dir / "textcat_defaults.cfg"
+ elif pipe == "senter":
+ config_loc = default_dir / "senter_defaults.cfg"
else:
raise ValueError(f"Component {pipe} currently not supported.")
pipe_cfg = util.load_config(config_loc, create_objects=False)
@@ -559,7 +563,7 @@ def _score_for_model(meta):
mean_acc.append((acc["ents_p"] + acc["ents_r"] + acc["ents_f"]) / 3)
if "textcat" in pipes:
mean_acc.append(acc["textcat_score"])
- if "sentrec" in pipes:
+ if "senter" in pipes:
mean_acc.append((acc["sent_p"] + acc["sent_r"] + acc["sent_f"]) / 3)
return sum(mean_acc) / len(mean_acc)
@@ -638,7 +642,7 @@ def _get_metrics(component):
return ("tags_acc",)
elif component == "ner":
return ("ents_f", "ents_p", "ents_r", "ents_per_type")
- elif component == "sentrec":
+ elif component == "senter":
return ("sent_f", "sent_p", "sent_r")
elif component == "textcat":
return ("textcat_score",)
@@ -665,9 +669,9 @@ def _configure_training_output(pipeline, use_gpu, has_beam_widths):
elif pipe == "textcat":
row_head.extend(["Textcat Loss", "Textcat"])
output_stats.extend(["textcat_loss", "textcat_score"])
- elif pipe == "sentrec":
- row_head.extend(["Sentrec Loss", "Sent P", "Sent R", "Sent F"])
- output_stats.extend(["sentrec_loss", "sent_p", "sent_r", "sent_f"])
+ elif pipe == "senter":
+ row_head.extend(["Senter Loss", "Sent P", "Sent R", "Sent F"])
+ output_stats.extend(["senter_loss", "sent_p", "sent_r", "sent_f"])
row_head.extend(["Token %", "CPU WPS"])
output_stats.extend(["token_acc", "cpu_wps"])
@@ -693,7 +697,7 @@ def _get_progress(
scores["ner_loss"] = losses.get("ner", 0.0)
scores["tag_loss"] = losses.get("tagger", 0.0)
scores["textcat_loss"] = losses.get("textcat", 0.0)
- scores["sentrec_loss"] = losses.get("sentrec", 0.0)
+ scores["senter_loss"] = losses.get("senter", 0.0)
scores["cpu_wps"] = cpu_wps
scores["gpu_wps"] = gpu_wps or 0.0
scores.update(dev_scores)
diff --git a/spacy/language.py b/spacy/language.py
index 9f5f9d86a..d0077b9d2 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -190,7 +190,7 @@ class Language(object):
default_textcat_config,
default_nel_config,
default_morphologizer_config,
- default_sentrec_config,
+ default_senter_config,
default_tensorizer_config,
default_tok2vec_config,
)
@@ -202,7 +202,7 @@ class Language(object):
"textcat": default_textcat_config(),
"entity_linker": default_nel_config(),
"morphologizer": default_morphologizer_config(),
- "sentrec": default_sentrec_config(),
+ "senter": default_senter_config(),
"tensorizer": default_tensorizer_config(),
"tok2vec": default_tok2vec_config(),
}
@@ -267,8 +267,8 @@ class Language(object):
return self.get_pipe("entity_linker")
@property
- def sentrec(self):
- return self.get_pipe("sentrec")
+ def senter(self):
+ return self.get_pipe("senter")
@property
def matcher(self):
diff --git a/spacy/ml/models/defaults/__init__.py b/spacy/ml/models/defaults/__init__.py
index 9af4da87d..d5490fd16 100644
--- a/spacy/ml/models/defaults/__init__.py
+++ b/spacy/ml/models/defaults/__init__.py
@@ -43,13 +43,13 @@ def default_ner():
return util.load_config(loc, create_objects=True)["model"]
-def default_sentrec_config():
- loc = Path(__file__).parent / "sentrec_defaults.cfg"
+def default_senter_config():
+ loc = Path(__file__).parent / "senter_defaults.cfg"
return util.load_config(loc, create_objects=False)
-def default_sentrec():
- loc = Path(__file__).parent / "sentrec_defaults.cfg"
+def default_senter():
+ loc = Path(__file__).parent / "senter_defaults.cfg"
return util.load_config(loc, create_objects=True)["model"]
diff --git a/spacy/ml/models/defaults/sentrec_defaults.cfg b/spacy/ml/models/defaults/senter_defaults.cfg
similarity index 75%
rename from spacy/ml/models/defaults/sentrec_defaults.cfg
rename to spacy/ml/models/defaults/senter_defaults.cfg
index a039a4533..ffa2c6ce2 100644
--- a/spacy/ml/models/defaults/sentrec_defaults.cfg
+++ b/spacy/ml/models/defaults/senter_defaults.cfg
@@ -2,7 +2,7 @@
@architectures = "spacy.Tagger.v1"
[model.tok2vec]
-@architectures = "spacy.HashCharEmbedCNN.v1"
+@architectures = "spacy.HashEmbedCNN.v1"
pretrained_vectors = null
width = 12
depth = 1
@@ -10,5 +10,3 @@ embed_size = 2000
window_size = 1
maxout_pieces = 2
subword_features = true
-nM = 64
-nC = 8
diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index 4ee470606..51340ee00 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -650,7 +650,7 @@ class Tagger(Pipe):
return self
-@component("sentrec", assigns=["token.is_sent_start"])
+@component("senter", assigns=["token.is_sent_start"])
class SentenceRecognizer(Tagger):
"""Pipeline component for sentence segmentation.
@@ -670,7 +670,7 @@ class SentenceRecognizer(Tagger):
# are 0
return tuple(["I", "S"])
- def set_annotations(self, docs, batch_tag_ids, **_):
+ def set_annotations(self, docs, batch_tag_ids):
if isinstance(docs, Doc):
docs = [docs]
cdef Doc doc
@@ -686,24 +686,6 @@ class SentenceRecognizer(Tagger):
else:
doc.c[j].sent_start = -1
- def update(self, examples, drop=0., sgd=None, losses=None):
- examples = Example.to_example_objects(examples)
- if losses is not None and self.name not in losses:
- losses[self.name] = 0.
-
- if not any(len(ex.doc) if ex.doc else 0 for ex in examples):
- # Handle cases where there are no tokens in any docs.
- return
- set_dropout_rate(self.model, drop)
- tag_scores, bp_tag_scores = self.model.begin_update([ex.doc for ex in examples])
- loss, d_tag_scores = self.get_loss(examples, tag_scores)
- bp_tag_scores(d_tag_scores)
- if sgd is not None:
- self.model.finish_update(sgd)
-
- if losses is not None:
- losses[self.name] += loss
-
def get_loss(self, examples, scores):
scores = self.model.ops.flatten(scores)
tag_index = range(len(self.labels))
@@ -732,9 +714,9 @@ class SentenceRecognizer(Tagger):
def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None,
**kwargs):
- cdef Vocab vocab = self.vocab
self.set_output(len(self.labels))
self.model.initialize()
+ link_vectors_to_models(self.vocab)
if sgd is None:
sgd = self.create_optimizer()
return sgd
@@ -742,10 +724,6 @@ class SentenceRecognizer(Tagger):
def add_label(self, label, values=None):
raise NotImplementedError
- def use_params(self, params):
- with self.model.use_params(params):
- yield
-
def to_bytes(self, exclude=tuple(), **kwargs):
serialize = {}
serialize["model"] = self.model.to_bytes
diff --git a/spacy/tests/pipeline/test_senter.py b/spacy/tests/pipeline/test_senter.py
new file mode 100644
index 000000000..7a929a6a2
--- /dev/null
+++ b/spacy/tests/pipeline/test_senter.py
@@ -0,0 +1,52 @@
+import pytest
+
+from spacy import util
+from spacy.lang.en import English
+from spacy.language import Language
+from spacy.tests.util import make_tempdir
+
+
+def test_label_types():
+ nlp = Language()
+ nlp.add_pipe(nlp.create_pipe("senter"))
+ with pytest.raises(NotImplementedError):
+ nlp.get_pipe("senter").add_label("A")
+
+SENT_STARTS = [0] * 14
+SENT_STARTS[0] = 1
+SENT_STARTS[5] = 1
+SENT_STARTS[9] = 1
+
+TRAIN_DATA = [
+ ("I like green eggs. Eat blue ham. I like purple eggs.", {"sent_starts": SENT_STARTS}),
+ ("She likes purple eggs. They hate ham. You like yellow eggs.", {"sent_starts": SENT_STARTS}),
+]
+
+
+def test_overfitting_IO():
+ # Simple test to try and quickly overfit the senter - ensuring the ML models work correctly
+ nlp = English()
+ senter = nlp.create_pipe("senter")
+ nlp.add_pipe(senter)
+ optimizer = nlp.begin_training()
+
+ for i in range(200):
+ losses = {}
+ nlp.update(TRAIN_DATA, sgd=optimizer, losses=losses)
+ assert losses["senter"] < 0.0001
+
+ # test the trained model
+ test_text = "I like eggs. There is ham. She likes ham."
+ doc = nlp(test_text)
+ gold_sent_starts = [0] * 12
+ gold_sent_starts[0] = 1
+ gold_sent_starts[4] = 1
+ gold_sent_starts[8] = 1
+ assert gold_sent_starts == [int(t.is_sent_start) for t in doc]
+
+ # Also test the results are still the same after IO
+ with make_tempdir() as tmp_dir:
+ nlp.to_disk(tmp_dir)
+ nlp2 = util.load_model_from_path(tmp_dir)
+ doc2 = nlp2(test_text)
+ assert gold_sent_starts == [int(t.is_sent_start) for t in doc2]
diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py
index b1070a9e7..a3381cb2f 100644
--- a/spacy/tests/serialize/test_serialize_pipeline.py
+++ b/spacy/tests/serialize/test_serialize_pipeline.py
@@ -2,7 +2,7 @@ import pytest
from spacy.pipeline import Tagger, DependencyParser, EntityRecognizer
from spacy.pipeline import Tensorizer, TextCategorizer, SentenceRecognizer
from spacy.ml.models.defaults import default_parser, default_tensorizer, default_tagger
-from spacy.ml.models.defaults import default_textcat, default_sentrec
+from spacy.ml.models.defaults import default_textcat, default_senter
from ..util import make_tempdir
@@ -146,7 +146,7 @@ def test_serialize_pipe_exclude(en_vocab, Parser):
def test_serialize_sentencerecognizer(en_vocab):
- sr = SentenceRecognizer(en_vocab, default_sentrec())
+ sr = SentenceRecognizer(en_vocab, default_senter())
sr_b = sr.to_bytes()
- sr_d = SentenceRecognizer(en_vocab, default_sentrec()).from_bytes(sr_b)
+ sr_d = SentenceRecognizer(en_vocab, default_senter()).from_bytes(sr_b)
assert sr.to_bytes() == sr_d.to_bytes()
From 5847be6022e615cdea55ca5a7856d203254e7ddf Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem
Date: Sun, 8 Mar 2020 13:23:18 +0100
Subject: [PATCH 093/496] Tok2Vec: extract-embed-encode (#5102)
* avoid changing original config
* fix elif structure, batch with just int crashes otherwise
* tok2vec example with doc2feats, encode and embed architectures
* further clean up MultiHashEmbed
* further generalize Tok2Vec to work with extract-embed-encode parts
* avoid initializing the charembed layer with Docs (for now ?)
* small fixes for bilstm config (still does not run)
* rename to core layer
* move new configs
* walk model to set nI instead of using core ref
* fix senter overfitting test to be more similar to the training data (avoid flakey behaviour)
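As a sketch of how the refactored pieces compose, the snippet below builds the three parts directly from the architecture registry and hands them to `spacy.Tok2Vec.v1`, using only the signatures that appear in this diff. In practice they would be resolved from a config such as the `multihashembed_tok2vec.cfg` added here; the direct calls assume the architectures are registered on import, as they are when spaCy loads them.

```python
# Sketch only: extract -> embed -> encode wired by hand, mirroring the new
# function signatures in spacy/ml/models/tok2vec.py.
from spacy.util import registry

COLS = ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"]

extract = registry.architectures.get("spacy.Doc2Feats.v1")(columns=COLS)
mix = registry.architectures.get("spacy.LayerNormalizedMaxout.v1")(
    width=96, maxout_pieces=3
)
embed = registry.architectures.get("spacy.MultiHashEmbed.v1")(
    columns=COLS,
    width=96,
    rows=2000,
    use_subwords=True,
    pretrained_vectors=None,
    mix=mix,
)
encode = registry.architectures.get("spacy.MaxoutWindowEncoder.v1")(
    width=96, window_size=1, maxout_pieces=3, depth=2
)
tok2vec = registry.architectures.get("spacy.Tok2Vec.v1")(extract, embed, encode)
```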
---
.../ptb-joint-pos-dep/bilstm_tok2vec.cfg | 2 +-
.../tok2vec-ner/charembed_tok2vec.cfg | 65 ++++++
.../tok2vec-ner/multihashembed_tok2vec.cfg | 65 ++++++
spacy/language.py | 9 +-
spacy/ml/_character_embed.py | 2 +-
spacy/ml/models/tok2vec.py | 199 +++++++-----------
spacy/ml/tok2vec.py | 0
spacy/pipeline/tok2vec.py | 7 +-
spacy/tests/pipeline/test_senter.py | 12 +-
spacy/util.py | 7 +-
10 files changed, 227 insertions(+), 141 deletions(-)
create mode 100644 examples/experiments/tok2vec-ner/charembed_tok2vec.cfg
create mode 100644 examples/experiments/tok2vec-ner/multihashembed_tok2vec.cfg
delete mode 100644 spacy/ml/tok2vec.py
diff --git a/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg b/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg
index 4f1a915c5..b6b4e82b6 100644
--- a/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg
+++ b/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg
@@ -62,4 +62,4 @@ width = 96
depth = 4
embed_size = 2000
subword_features = true
-char_embed = false
+maxout_pieces = 3
diff --git a/examples/experiments/tok2vec-ner/charembed_tok2vec.cfg b/examples/experiments/tok2vec-ner/charembed_tok2vec.cfg
new file mode 100644
index 000000000..b8219ad10
--- /dev/null
+++ b/examples/experiments/tok2vec-ner/charembed_tok2vec.cfg
@@ -0,0 +1,65 @@
+[training]
+use_gpu = -1
+limit = 0
+dropout = 0.2
+patience = 10000
+eval_frequency = 200
+scores = ["ents_f"]
+score_weights = {"ents_f": 1}
+orth_variant_level = 0.0
+gold_preproc = true
+max_length = 0
+batch_size = 25
+
+[optimizer]
+@optimizers = "Adam.v1"
+learn_rate = 0.001
+beta1 = 0.9
+beta2 = 0.999
+
+[nlp]
+lang = "en"
+vectors = null
+
+[nlp.pipeline.tok2vec]
+factory = "tok2vec"
+
+[nlp.pipeline.tok2vec.model]
+@architectures = "spacy.Tok2Vec.v1"
+
+[nlp.pipeline.tok2vec.model.extract]
+@architectures = "spacy.CharacterEmbed.v1"
+width = 96
+nM = 64
+nC = 8
+rows = 2000
+columns = ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"]
+
+[nlp.pipeline.tok2vec.model.extract.features]
+@architectures = "spacy.Doc2Feats.v1"
+columns = ${nlp.pipeline.tok2vec.model.extract:columns}
+
+[nlp.pipeline.tok2vec.model.embed]
+@architectures = "spacy.LayerNormalizedMaxout.v1"
+width = ${nlp.pipeline.tok2vec.model.extract:width}
+maxout_pieces = 4
+
+[nlp.pipeline.tok2vec.model.encode]
+@architectures = "spacy.MaxoutWindowEncoder.v1"
+width = ${nlp.pipeline.tok2vec.model.extract:width}
+window_size = 1
+maxout_pieces = 2
+depth = 2
+
+[nlp.pipeline.ner]
+factory = "ner"
+
+[nlp.pipeline.ner.model]
+@architectures = "spacy.TransitionBasedParser.v1"
+nr_feature_tokens = 6
+hidden_width = 64
+maxout_pieces = 2
+
+[nlp.pipeline.ner.model.tok2vec]
+@architectures = "spacy.Tok2VecTensors.v1"
+width = ${nlp.pipeline.tok2vec.model.extract:width}
diff --git a/examples/experiments/tok2vec-ner/multihashembed_tok2vec.cfg b/examples/experiments/tok2vec-ner/multihashembed_tok2vec.cfg
new file mode 100644
index 000000000..4678a7d6b
--- /dev/null
+++ b/examples/experiments/tok2vec-ner/multihashembed_tok2vec.cfg
@@ -0,0 +1,65 @@
+[training]
+use_gpu = -1
+limit = 0
+dropout = 0.2
+patience = 10000
+eval_frequency = 200
+scores = ["ents_f"]
+score_weights = {"ents_f": 1}
+orth_variant_level = 0.0
+gold_preproc = true
+max_length = 0
+batch_size = 25
+
+[optimizer]
+@optimizers = "Adam.v1"
+learn_rate = 0.001
+beta1 = 0.9
+beta2 = 0.999
+
+[nlp]
+lang = "en"
+vectors = null
+
+[nlp.pipeline.tok2vec]
+factory = "tok2vec"
+
+[nlp.pipeline.tok2vec.model]
+@architectures = "spacy.Tok2Vec.v1"
+
+[nlp.pipeline.tok2vec.model.extract]
+@architectures = "spacy.Doc2Feats.v1"
+columns = ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"]
+
+[nlp.pipeline.tok2vec.model.embed]
+@architectures = "spacy.MultiHashEmbed.v1"
+columns = ${nlp.pipeline.tok2vec.model.extract:columns}
+width = 96
+rows = 2000
+use_subwords = true
+pretrained_vectors = null
+
+[nlp.pipeline.tok2vec.model.embed.mix]
+@architectures = "spacy.LayerNormalizedMaxout.v1"
+width = ${nlp.pipeline.tok2vec.model.embed:width}
+maxout_pieces = 3
+
+[nlp.pipeline.tok2vec.model.encode]
+@architectures = "spacy.MaxoutWindowEncoder.v1"
+width = ${nlp.pipeline.tok2vec.model.embed:width}
+window_size = 1
+maxout_pieces = 3
+depth = 2
+
+[nlp.pipeline.ner]
+factory = "ner"
+
+[nlp.pipeline.ner.model]
+@architectures = "spacy.TransitionBasedParser.v1"
+nr_feature_tokens = 6
+hidden_width = 64
+maxout_pieces = 2
+
+[nlp.pipeline.ner.model.tok2vec]
+@architectures = "spacy.Tok2VecTensors.v1"
+width = ${nlp.pipeline.tok2vec.model.embed:width}
diff --git a/spacy/language.py b/spacy/language.py
index d0077b9d2..20e29c829 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -337,13 +337,14 @@ class Language(object):
default_config = self.defaults.get(name, None)
# transform the model's config to an actual Model
+ factory_cfg = dict(config)
model_cfg = None
- if "model" in config:
- model_cfg = config["model"]
+ if "model" in factory_cfg:
+ model_cfg = factory_cfg["model"]
if not isinstance(model_cfg, dict):
warnings.warn(Warnings.W099.format(type=type(model_cfg), pipe=name))
model_cfg = None
- del config["model"]
+ del factory_cfg["model"]
if model_cfg is None and default_config is not None:
warnings.warn(Warnings.W098.format(name=name))
model_cfg = default_config["model"]
@@ -353,7 +354,7 @@ class Language(object):
model = registry.make_from_config({"model": model_cfg}, validate=True)[
"model"
]
- return factory(self, model, **config)
+ return factory(self, model, **factory_cfg)
def add_pipe(
self, component, name=None, before=None, after=None, first=None, last=None
diff --git a/spacy/ml/_character_embed.py b/spacy/ml/_character_embed.py
index b366f67c6..f4890144a 100644
--- a/spacy/ml/_character_embed.py
+++ b/spacy/ml/_character_embed.py
@@ -21,7 +21,7 @@ def init(model, X=None, Y=None):
def forward(model, docs, is_train):
- if not docs:
+ if docs is None:
return []
ids = []
output = []
diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py
index 0d33d010d..d1a98c080 100644
--- a/spacy/ml/models/tok2vec.py
+++ b/spacy/ml/models/tok2vec.py
@@ -4,7 +4,7 @@ from thinc.api import HashEmbed, StaticVectors, PyTorchLSTM
from thinc.api import residual, LayerNorm, FeatureExtractor, Mish
from ... import util
-from ...util import registry, make_layer
+from ...util import registry
from ...ml import _character_embed
from ...pipeline.tok2vec import Tok2VecListener
from ...attrs import ID, ORTH, NORM, PREFIX, SUFFIX, SHAPE
@@ -23,15 +23,14 @@ def get_vocab_vectors(name):
@registry.architectures.register("spacy.Tok2Vec.v1")
-def Tok2Vec(config):
- doc2feats = make_layer(config["@doc2feats"])
- embed = make_layer(config["@embed"])
- encode = make_layer(config["@encode"])
+def Tok2Vec(extract, embed, encode):
field_size = 0
- if encode.has_attr("receptive_field"):
+ if encode.attrs.get("receptive_field", None):
field_size = encode.attrs["receptive_field"]
- tok2vec = chain(doc2feats, with_array(chain(embed, encode), pad=field_size))
- tok2vec.attrs["cfg"] = config
+ with Model.define_operators({">>": chain, "|": concatenate}):
+ if extract.has_dim("nO"):
+ _set_dims(embed, "nI", extract.get_dim("nO"))
+ tok2vec = extract >> with_array(embed >> encode, pad=field_size)
tok2vec.set_dim("nO", encode.get_dim("nO"))
tok2vec.set_ref("embed", embed)
tok2vec.set_ref("encode", encode)
@@ -39,8 +38,7 @@ def Tok2Vec(config):
@registry.architectures.register("spacy.Doc2Feats.v1")
-def Doc2Feats(config):
- columns = config["columns"]
+def Doc2Feats(columns):
return FeatureExtractor(columns)
@@ -79,8 +77,8 @@ def hash_charembed_cnn(
maxout_pieces,
window_size,
subword_features,
- nM=0,
- nC=0,
+ nM,
+ nC,
):
# Allows using character embeddings by setting nC, nM and char_embed=True
return build_Tok2Vec_model(
@@ -100,7 +98,7 @@ def hash_charembed_cnn(
@registry.architectures.register("spacy.HashEmbedBiLSTM.v1")
def hash_embed_bilstm_v1(
- pretrained_vectors, width, depth, embed_size, subword_features
+ pretrained_vectors, width, depth, embed_size, subword_features, maxout_pieces
):
# Does not use character embeddings: set to False by default
return build_Tok2Vec_model(
@@ -109,7 +107,7 @@ def hash_embed_bilstm_v1(
pretrained_vectors=pretrained_vectors,
bilstm_depth=depth,
conv_depth=0,
- maxout_pieces=0,
+ maxout_pieces=maxout_pieces,
window_size=1,
subword_features=subword_features,
char_embed=False,
@@ -120,7 +118,7 @@ def hash_embed_bilstm_v1(
@registry.architectures.register("spacy.HashCharEmbedBiLSTM.v1")
def hash_char_embed_bilstm_v1(
- pretrained_vectors, width, depth, embed_size, subword_features, nM=0, nC=0
+ pretrained_vectors, width, depth, embed_size, subword_features, nM, nC, maxout_pieces
):
# Allows using character embeddings by setting nC, nM and char_embed=True
return build_Tok2Vec_model(
@@ -129,7 +127,7 @@ def hash_char_embed_bilstm_v1(
pretrained_vectors=pretrained_vectors,
bilstm_depth=depth,
conv_depth=0,
- maxout_pieces=0,
+ maxout_pieces=maxout_pieces,
window_size=1,
subword_features=subword_features,
char_embed=True,
@@ -138,104 +136,99 @@ def hash_char_embed_bilstm_v1(
)
-@registry.architectures.register("spacy.MultiHashEmbed.v1")
-def MultiHashEmbed(config):
- # For backwards compatibility with models before the architecture registry,
- # we have to be careful to get exactly the same model structure. One subtle
- # trick is that when we define concatenation with the operator, the operator
- # is actually binary associative. So when we write (a | b | c), we're actually
- # getting concatenate(concatenate(a, b), c). That's why the implementation
- # is a bit ugly here.
- cols = config["columns"]
- width = config["width"]
- rows = config["rows"]
+@registry.architectures.register("spacy.LayerNormalizedMaxout.v1")
+def LayerNormalizedMaxout(width, maxout_pieces):
+ return Maxout(
+ nO=width,
+ nP=maxout_pieces,
+ dropout=0.0,
+ normalize=True,
+ )
- norm = HashEmbed(width, rows, column=cols.index("NORM"))
- if config["use_subwords"]:
- prefix = HashEmbed(width, rows // 2, column=cols.index("PREFIX"))
- suffix = HashEmbed(width, rows // 2, column=cols.index("SUFFIX"))
- shape = HashEmbed(width, rows // 2, column=cols.index("SHAPE"))
- if config.get("@pretrained_vectors"):
- glove = make_layer(config["@pretrained_vectors"])
- mix = make_layer(config["@mix"])
+
+@registry.architectures.register("spacy.MultiHashEmbed.v1")
+def MultiHashEmbed(columns, width, rows, use_subwords, pretrained_vectors, mix):
+ norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM"))
+ if use_subwords:
+ prefix = HashEmbed(nO=width, nV=rows // 2, column=columns.index("PREFIX"))
+ suffix = HashEmbed(nO=width, nV=rows // 2, column=columns.index("SUFFIX"))
+ shape = HashEmbed(nO=width, nV=rows // 2, column=columns.index("SHAPE"))
+
+ if pretrained_vectors:
+ glove = StaticVectors(
+ vectors=pretrained_vectors.data,
+ nO=width,
+ column=columns.index(ID),
+ dropout=0.0,
+ )
with Model.define_operators({">>": chain, "|": concatenate}):
- if config["use_subwords"] and config["@pretrained_vectors"]:
- mix._layers[0].set_dim("nI", width * 5)
- layer = uniqued(
- (glove | norm | prefix | suffix | shape) >> mix,
- column=cols.index("ORTH"),
- )
- elif config["use_subwords"]:
- mix._layers[0].set_dim("nI", width * 4)
- layer = uniqued(
- (norm | prefix | suffix | shape) >> mix, column=cols.index("ORTH")
- )
- elif config["@pretrained_vectors"]:
- mix._layers[0].set_dim("nI", width * 2)
- layer = uniqued((glove | norm) >> mix, column=cols.index("ORTH"))
+ if not use_subwords and not pretrained_vectors:
+ embed_layer = norm
else:
- layer = norm
- layer.attrs["cfg"] = config
- return layer
+ if use_subwords and pretrained_vectors:
+ nr_columns = 5
+ concat_columns = glove | norm | prefix | suffix | shape
+ elif use_subwords:
+ nr_columns = 4
+ concat_columns = norm | prefix | suffix | shape
+ else:
+ nr_columns = 2
+ concat_columns = glove | norm
+ _set_dims(mix, "nI", width * nr_columns)
+ embed_layer = uniqued(concat_columns >> mix, column=columns.index("ORTH"))
+
+ return embed_layer
+
+
+def _set_dims(model, name, value):
+ # Loop through the model to set a specific dimension if it's unset on any layer.
+ for node in model.walk():
+ if node.has_dim(name) is None:
+ node.set_dim(name, value)
@registry.architectures.register("spacy.CharacterEmbed.v1")
-def CharacterEmbed(config):
- width = config["width"]
- chars = config["chars"]
-
- chr_embed = _character_embed.CharacterEmbed(nM=width, nC=chars)
- other_tables = make_layer(config["@embed_features"])
- mix = make_layer(config["@mix"])
-
- model = chain(concatenate(chr_embed, other_tables), mix)
- model.attrs["cfg"] = config
- return model
+def CharacterEmbed(columns, width, rows, nM, nC, features):
+ norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM"))
+ chr_embed = _character_embed.CharacterEmbed(nM=nM, nC=nC)
+ with Model.define_operators({">>": chain, "|": concatenate}):
+ embed_layer = chr_embed | features >> with_array(norm)
+ embed_layer.set_dim("nO", nM * nC + width)
+ return embed_layer
@registry.architectures.register("spacy.MaxoutWindowEncoder.v1")
-def MaxoutWindowEncoder(config):
- nO = config["width"]
- nW = config["window_size"]
- nP = config["pieces"]
- depth = config["depth"]
-
- cnn = (
- expand_window(window_size=nW),
- Maxout(nO=nO, nI=nO * ((nW * 2) + 1), nP=nP, dropout=0.0, normalize=True),
+def MaxoutWindowEncoder(width, window_size, maxout_pieces, depth):
+ cnn = chain(
+ expand_window(window_size=window_size),
+ Maxout(nO=width, nI=width * ((window_size * 2) + 1), nP=maxout_pieces, dropout=0.0, normalize=True),
)
model = clone(residual(cnn), depth)
- model.set_dim("nO", nO)
- model.attrs["receptive_field"] = nW * depth
+ model.set_dim("nO", width)
+ model.attrs["receptive_field"] = window_size * depth
return model
@registry.architectures.register("spacy.MishWindowEncoder.v1")
-def MishWindowEncoder(config):
- nO = config["width"]
- nW = config["window_size"]
- depth = config["depth"]
-
+def MishWindowEncoder(width, window_size, depth):
cnn = chain(
- expand_window(window_size=nW),
- Mish(nO=nO, nI=nO * ((nW * 2) + 1)),
- LayerNorm(nO),
+ expand_window(window_size=window_size),
+ Mish(nO=width, nI=width * ((window_size * 2) + 1)),
+ LayerNorm(width),
)
model = clone(residual(cnn), depth)
- model.set_dim("nO", nO)
+ model.set_dim("nO", width)
return model
@registry.architectures.register("spacy.TorchBiLSTMEncoder.v1")
-def TorchBiLSTMEncoder(config):
+def TorchBiLSTMEncoder(width, depth):
import torch.nn
# TODO FIX
from thinc.api import PyTorchRNNWrapper
- width = config["width"]
- depth = config["depth"]
if depth == 0:
return noop()
return with_padded(
@@ -243,40 +236,6 @@ def TorchBiLSTMEncoder(config):
)
-# TODO: update
-_EXAMPLE_CONFIG = {
- "@doc2feats": {
- "arch": "Doc2Feats",
- "config": {"columns": ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"]},
- },
- "@embed": {
- "arch": "spacy.MultiHashEmbed.v1",
- "config": {
- "width": 96,
- "rows": 2000,
- "columns": ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"],
- "use_subwords": True,
- "@pretrained_vectors": {
- "arch": "TransformedStaticVectors",
- "config": {
- "vectors_name": "en_vectors_web_lg.vectors",
- "width": 96,
- "column": 0,
- },
- },
- "@mix": {
- "arch": "LayerNormalizedMaxout",
- "config": {"width": 96, "pieces": 3},
- },
- },
- },
- "@encode": {
- "arch": "MaxoutWindowEncode",
- "config": {"width": 96, "window_size": 1, "depth": 4, "pieces": 3},
- },
-}
-
-
def build_Tok2Vec_model(
width,
embed_size,
diff --git a/spacy/ml/tok2vec.py b/spacy/ml/tok2vec.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py
index 2fee6881a..4623f99b0 100644
--- a/spacy/pipeline/tok2vec.py
+++ b/spacy/pipeline/tok2vec.py
@@ -131,9 +131,10 @@ class Tok2Vec(Pipe):
get_examples (function): Function returning example training data.
pipeline (list): The pipeline the model is part of.
"""
- # TODO: use examples instead ?
- docs = [Doc(Vocab(), words=["hello"])]
- self.model.initialize(X=docs)
+ # TODO: charembed does not play nicely with dim inference yet
+ # docs = [Doc(Vocab(), words=["hello"])]
+ # self.model.initialize(X=docs)
+ self.model.initialize()
link_vectors_to_models(self.vocab)
diff --git a/spacy/tests/pipeline/test_senter.py b/spacy/tests/pipeline/test_senter.py
index 7a929a6a2..411768e5f 100644
--- a/spacy/tests/pipeline/test_senter.py
+++ b/spacy/tests/pipeline/test_senter.py
@@ -36,17 +36,17 @@ def test_overfitting_IO():
assert losses["senter"] < 0.0001
# test the trained model
- test_text = "I like eggs. There is ham. She likes ham."
+ test_text = "I like purple eggs. They eat ham. You like yellow eggs."
doc = nlp(test_text)
- gold_sent_starts = [0] * 12
+ gold_sent_starts = [0] * 14
gold_sent_starts[0] = 1
- gold_sent_starts[4] = 1
- gold_sent_starts[8] = 1
- assert gold_sent_starts == [int(t.is_sent_start) for t in doc]
+ gold_sent_starts[5] = 1
+ gold_sent_starts[9] = 1
+ assert [int(t.is_sent_start) for t in doc] == gold_sent_starts
# Also test the results are still the same after IO
with make_tempdir() as tmp_dir:
nlp.to_disk(tmp_dir)
nlp2 = util.load_model_from_path(tmp_dir)
doc2 = nlp2(test_text)
- assert gold_sent_starts == [int(t.is_sent_start) for t in doc2]
+ assert [int(t.is_sent_start) for t in doc2] == gold_sent_starts
diff --git a/spacy/util.py b/spacy/util.py
index 216158e52..37649c5e6 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -79,11 +79,6 @@ def set_lang_class(name, cls):
registry.languages.register(name, func=cls)
-def make_layer(arch_config):
- arch_func = registry.architectures.get(arch_config["arch"])
- return arch_func(arch_config["config"])
-
-
def ensure_path(path):
"""Ensure string is converted to a Path.
@@ -563,7 +558,7 @@ def minibatch_by_words(examples, size, tuples=True, count_words=len):
"""Create minibatches of a given number of words."""
if isinstance(size, int):
size_ = itertools.repeat(size)
- if isinstance(size, List):
+ elif isinstance(size, List):
size_ = iter(size)
else:
size_ = size
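The `elif` matters because, with a plain `if`, an integer `size` first set `size_` to a repeat iterator and then fell through to the `else` branch, rebinding `size_` to the raw int and breaking the later `next()` call. The standalone sketch below (not the actual `util.py` code) reproduces the before/after behaviour.

```python
# Standalone illustration of the control flow fixed in this hunk.
import itertools

def sizes_before_fix(size):
    if isinstance(size, int):
        size_ = itertools.repeat(size)
    if isinstance(size, list):   # plain `if`: an int falls through to the else branch
        size_ = iter(size)
    else:
        size_ = size             # size_ is rebound to the raw int
    return size_

def sizes_after_fix(size):
    if isinstance(size, int):
        size_ = itertools.repeat(size)
    elif isinstance(size, list):
        size_ = iter(size)
    else:
        size_ = size
    return size_

print(next(sizes_after_fix(8)))   # 8, as intended
# next(sizes_before_fix(8))       # TypeError: 'int' object is not an iterator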
From 59000ee21dcacb091fd3493bdfe4ea57e664e110 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Fri, 13 Mar 2020 16:07:56 +0100
Subject: [PATCH 094/496] fix serialization of empty doc + unit test
---
spacy/tests/regression/test_issue5141.py | 11 +++++++++++
spacy/tokens/_serialize.py | 7 +++++--
2 files changed, 16 insertions(+), 2 deletions(-)
create mode 100644 spacy/tests/regression/test_issue5141.py
diff --git a/spacy/tests/regression/test_issue5141.py b/spacy/tests/regression/test_issue5141.py
new file mode 100644
index 000000000..845454583
--- /dev/null
+++ b/spacy/tests/regression/test_issue5141.py
@@ -0,0 +1,11 @@
+from spacy.tokens import DocBin
+
+
+def test_issue5141(en_vocab):
+ """ Ensure an empty DocBin does not crash on serialization """
+ doc_bin = DocBin(attrs=["DEP", "HEAD"])
+ assert list(doc_bin.get_docs(en_vocab)) == []
+ doc_bin_bytes = doc_bin.to_bytes()
+
+ doc_bin_2 = DocBin().from_bytes(doc_bin_bytes)
+ assert list(doc_bin_2.get_docs(en_vocab)) == []
diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py
index 65b70d1b3..d3f49550c 100644
--- a/spacy/tokens/_serialize.py
+++ b/spacy/tokens/_serialize.py
@@ -135,10 +135,13 @@ class DocBin(object):
for tokens in self.tokens:
assert len(tokens.shape) == 2, tokens.shape # this should never happen
lengths = [len(tokens) for tokens in self.tokens]
+ tokens = numpy.vstack(self.tokens) if self.tokens else numpy.asarray([])
+ spaces = numpy.vstack(self.spaces) if self.spaces else numpy.asarray([])
+
msg = {
"attrs": self.attrs,
- "tokens": numpy.vstack(self.tokens).tobytes("C"),
- "spaces": numpy.vstack(self.spaces).tobytes("C"),
+ "tokens": tokens.tobytes("C"),
+ "spaces": spaces.tobytes("C"),
"lengths": numpy.asarray(lengths, dtype="int32").tobytes("C"),
"strings": list(self.strings),
"cats": self.cats,
From fba219f73765725afb7468c3c1b114df3e1a27f4 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Mon, 16 Mar 2020 08:31:36 +0100
Subject: [PATCH 095/496] remove unnecessary itertools call
---
spacy/language.py | 2 --
1 file changed, 2 deletions(-)
diff --git a/spacy/language.py b/spacy/language.py
index 20e29c829..6b3957deb 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -778,8 +778,6 @@ class Language(object):
DOCS: https://spacy.io/api/language#pipe
"""
- # raw_texts will be used later to stop iterator.
- texts, raw_texts = itertools.tee(texts)
if n_threads != -1:
warnings.warn(Warnings.W016, DeprecationWarning)
if n_process == -1:
From 02d87a8b2b7db3cfbe2649daf87ed61450fc7fbe Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Thu, 19 Mar 2020 10:30:20 +0100
Subject: [PATCH 096/496] fix showing dep arcs in streamlit script
---
examples/streamlit_spacy.py | 18 +++++++++++++++---
1 file changed, 15 insertions(+), 3 deletions(-)
diff --git a/examples/streamlit_spacy.py b/examples/streamlit_spacy.py
index a2da123c2..2b527b3df 100644
--- a/examples/streamlit_spacy.py
+++ b/examples/streamlit_spacy.py
@@ -1,7 +1,7 @@
# coding: utf-8
"""
Example of a Streamlit app for an interactive spaCy model visualizer. You can
-either download the script, or point streamlit run to the raw URL of this
+either download the script, or point `streamlit run` to the raw URL of this
file. For more details, see https://streamlit.io.
Installation:
@@ -15,6 +15,8 @@ streamlit run streamlit_spacy.py
"""
from __future__ import unicode_literals
+import base64
+
import streamlit as st
import spacy
from spacy import displacy
@@ -54,6 +56,14 @@ model_load_state.empty()
text = st.text_area("Text to analyze", DEFAULT_TEXT)
doc = process_text(spacy_model, text)
+
+def render_svg(svg):
+ """Renders the given svg string."""
+ b64 = base64.b64encode(svg.encode('utf-8')).decode("utf-8")
+ html = r'<img src="data:image/svg+xml;base64,%s"/>' % b64
+ st.write(html, unsafe_allow_html=True)
+
+
if "parser" in nlp.pipe_names:
st.header("Dependency Parse & Part-of-speech tags")
st.sidebar.header("Dependency Parse")
@@ -68,12 +78,14 @@ if "parser" in nlp.pipe_names:
}
docs = [span.as_doc() for span in doc.sents] if split_sents else [doc]
for sent in docs:
- html = displacy.render(sent, options=options)
+ html = displacy.render(sent, options=options, style="dep")
# Double newlines seem to mess with the rendering
html = html.replace("\n\n", "\n")
if split_sents and len(docs) > 1:
st.markdown(f"> {sent.text}")
- st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True)
+ render_svg(html)
+ # this didn't show the dep arc labels properly, cf #5089
+ # st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True)
if "ner" in nlp.pipe_names:
st.header("Named Entities")
From fcac1ace7839eb49721a4636b4f3687781d5a4ab Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Mon, 23 Mar 2020 22:55:47 +0100
Subject: [PATCH 097/496] Update macOS image on Azure Pipelines
---
azure-pipelines.yml | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index d34da39f7..f93dffaed 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -42,7 +42,7 @@ jobs:
imageName: 'vs2017-win2016'
python.version: '3.6'
Python36Mac:
- imageName: 'macos-10.13'
+ imageName: 'macos-10.14'
python.version: '3.6'
# Don't test on 3.7 for now to speed up builds
# Python37Linux:
@@ -52,7 +52,7 @@ jobs:
# imageName: 'vs2017-win2016'
# python.version: '3.7'
# Python37Mac:
- # imageName: 'macos-10.13'
+ # imageName: 'macos-10.14'
# python.version: '3.7'
Python38Linux:
imageName: 'ubuntu-16.04'
@@ -61,7 +61,7 @@ jobs:
imageName: 'vs2017-win2016'
python.version: '3.8'
Python38Mac:
- imageName: 'macos-10.13'
+ imageName: 'macos-10.14'
python.version: '3.8'
maxParallel: 4
pool:
From 218e1706ac97f276f8226531c0c942ed660b953e Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem
Date: Wed, 25 Mar 2020 10:20:11 +0100
Subject: [PATCH 098/496] Bugfix linking vectors (#5196)
* restore call to _load_vectors
* bump to thinc 8.0.0a3
* bump to 3.0.0.dev4
---
pyproject.toml | 2 +-
requirements.txt | 2 +-
setup.cfg | 4 ++--
spacy/about.py | 2 +-
spacy/cli/train.py | 2 ++
spacy/syntax/_parser_model.pyx | 2 +-
6 files changed, 8 insertions(+), 6 deletions(-)
diff --git a/pyproject.toml b/pyproject.toml
index ee28d5d42..9440c2d44 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,7 +6,7 @@ requires = [
"cymem>=2.0.2,<2.1.0",
"preshed>=3.0.2,<3.1.0",
"murmurhash>=0.28.0,<1.1.0",
- "thinc==8.0.0a1",
+ "thinc==8.0.0a3",
"blis>=0.4.0,<0.5.0"
]
build-backend = "setuptools.build_meta"
diff --git a/requirements.txt b/requirements.txt
index 09998cdc9..73e595daf 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
# Our libraries
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
-thinc==8.0.0a1
+thinc==8.0.0a3
blis>=0.4.0,<0.5.0
ml_datasets>=0.1.1
murmurhash>=0.28.0,<1.1.0
diff --git a/setup.cfg b/setup.cfg
index 7b3a468b6..d7d2be935 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -36,13 +36,13 @@ setup_requires =
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
murmurhash>=0.28.0,<1.1.0
- thinc==8.0.0a1
+ thinc==8.0.0a3
install_requires =
# Our libraries
murmurhash>=0.28.0,<1.1.0
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
- thinc==8.0.0a1
+ thinc==8.0.0a3
blis>=0.4.0,<0.5.0
wasabi>=0.4.0,<1.1.0
srsly>=2.0.0,<3.0.0
diff --git a/spacy/about.py b/spacy/about.py
index 6a3c680ab..0c0a2d002 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
# fmt: off
__title__ = "spacy"
-__version__ = "3.0.0.dev3"
+__version__ = "3.0.0.dev4"
__release__ = True
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 7eb9bbd3c..a40fdadb4 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -143,6 +143,7 @@ def train(
)
if vectors:
msg.text(f"Loading vectors from model '{vectors}'")
+ _load_vectors(nlp, vectors)
nlp.disable_pipes([p for p in nlp.pipe_names if p not in pipeline])
for pipe in pipeline:
@@ -210,6 +211,7 @@ def train(
if vectors:
msg.text(f"Loading vectors from model '{vectors}'")
+ _load_vectors(nlp, vectors)
for pipe in pipeline:
# first, create the model.
diff --git a/spacy/syntax/_parser_model.pyx b/spacy/syntax/_parser_model.pyx
index e36a2a28b..4a1014a09 100644
--- a/spacy/syntax/_parser_model.pyx
+++ b/spacy/syntax/_parser_model.pyx
@@ -250,7 +250,7 @@ class ParserModel(Model):
nI = smaller.get_dim("nI")
with use_ops('numpy'):
larger = Linear(nO=new_nO, nI=nI)
- larger._init = smaller._init
+ larger.init = smaller.init
# it could be that the model is not initialized yet, then skip this bit
if nI:
larger_W = larger.ops.alloc2f(new_nO, nI)
From aa0616bafa86cd1400ea97c020640722f64c347b Mon Sep 17 00:00:00 2001
From: Tiljander <35637838+Tiljander@users.noreply.github.com>
Date: Thu, 26 Mar 2020 13:13:22 +0100
Subject: [PATCH 099/496] Describing priority rules for overlapping matches
(#5197)
* Describing priority rules for overlapping matches
* Create Tiljander.md
* Describing priority rules for overlapping matches
* Update website/docs/api/entityruler.md
Co-Authored-By: Ines Montani
Co-authored-by: Ines Montani
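The behaviour documented in this commit can be seen with a small, invented example: when two patterns overlap, the one covering more tokens ends up in `doc.ents`. The sketch below assumes the v2-style `add_pipe` API used elsewhere in this series; the text and patterns are made up for illustration.

```python
# Hypothetical illustration of the overlap rule: the longer pattern wins.
import spacy
from spacy.pipeline import EntityRuler

nlp = spacy.blank("en")
ruler = EntityRuler(nlp)
ruler.add_patterns([
    {"label": "GPE", "pattern": "New York"},
    {"label": "GPE", "pattern": "New York City"},
])
nlp.add_pipe(ruler)

doc = nlp("She moved to New York City last year.")
print([(ent.text, ent.label_) for ent in doc.ents])
# Expected, per the rule documented here: [('New York City', 'GPE')]
```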
---
.github/contributors/Tiljander.md | 106 ++++++++++++++++++++++
website/docs/api/entityruler.md | 3 +-
website/docs/usage/rule-based-matching.md | 5 +-
3 files changed, 112 insertions(+), 2 deletions(-)
create mode 100644 .github/contributors/Tiljander.md
diff --git a/.github/contributors/Tiljander.md b/.github/contributors/Tiljander.md
new file mode 100644
index 000000000..89e70efa5
--- /dev/null
+++ b/.github/contributors/Tiljander.md
@@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+ * you hereby assign to us joint ownership, and to the extent that such
+ assignment is or becomes invalid, ineffective or unenforceable, you hereby
+ grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+ royalty-free, unrestricted license to exercise all rights under those
+ copyrights. This includes, at our option, the right to sublicense these same
+ rights to third parties through multiple levels of sublicensees or other
+ licensing arrangements;
+
+ * you agree that each of us can do all things in relation to your
+ contribution as if each of us were the sole owners, and if one of us makes
+ a derivative work of your contribution, the one who makes the derivative
+ work (or has it made will be the sole owner of that derivative work;
+
+ * you agree that you will not assert any moral rights in your contribution
+ against us, our licensees or transferees;
+
+ * you agree that we may register a copyright in your contribution and
+ exercise all ownership rights associated with it; and
+
+ * you agree that neither of us has any duty to consult with, obtain the
+ consent of, pay or render an accounting to the other for any use or
+ distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+ * make, have made, use, sell, offer to sell, import, and otherwise transfer
+ your contribution in whole or in part, alone or in combination with or
+ included in any product, work or materials arising out of the project to
+ which your contribution was submitted, and
+
+ * at our option, to sublicense these same rights to third parties through
+ multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+ * Each contribution that you submit is and shall be an original work of
+ authorship and you can legally grant the rights set out in this SCA;
+
+ * to the best of your knowledge, each contribution will not violate any
+ third party's copyrights, trademarks, patents, or other intellectual
+ property rights; and
+
+ * each contribution shall be in compliance with U.S. export control laws and
+ other applicable export and import laws. You agree to notify us if you
+ become aware of any circumstance which would make any of the foregoing
+ representations inaccurate in any respect. We may publicly disclose your
+ participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statement below. Please do NOT
+mark both statements:
+
+ * [x] I am signing on behalf of myself as an individual and no other person
+ or entity, including my employer, has or will have rights with respect to my
+ contributions.
+
+ * [ ] I am signing on behalf of my employer or a legal entity and I have the
+ actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field | Entry |
+|------------------------------- | -------------------- |
+| Name | Henrik Tiljander |
+| Company name (if applicable) | |
+| Title or role (if applicable) | |
+| Date | 24/3/2020 |
+| GitHub username | Tiljander |
+| Website (optional) | |
diff --git a/website/docs/api/entityruler.md b/website/docs/api/entityruler.md
index af3db0dcb..0fd24897d 100644
--- a/website/docs/api/entityruler.md
+++ b/website/docs/api/entityruler.md
@@ -83,7 +83,8 @@ Find matches in the `Doc` and add them to the `doc.ents`. Typically, this
happens automatically after the component has been added to the pipeline using
[`nlp.add_pipe`](/api/language#add_pipe). If the entity ruler was initialized
with `overwrite_ents=True`, existing entities will be replaced if they overlap
-with the matches.
+with the matches. When matches overlap in a Doc, the entity ruler prioritizes longer
+patterns over shorter ones; if they are equally long, the match occurring first in
+the Doc is chosen.
> #### Example
>
diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md
index 0ab74034e..1db2405d1 100644
--- a/website/docs/usage/rule-based-matching.md
+++ b/website/docs/usage/rule-based-matching.md
@@ -968,7 +968,10 @@ pattern. The entity ruler accepts two types of patterns:
The [`EntityRuler`](/api/entityruler) is a pipeline component that's typically
added via [`nlp.add_pipe`](/api/language#add_pipe). When the `nlp` object is
called on a text, it will find matches in the `doc` and add them as entities to
-the `doc.ents`, using the specified pattern label as the entity label.
+the `doc.ents`, using the specified pattern label as the entity label. If any
+matches overlap, the pattern matching the most tokens takes priority. If they
+are equally long, the match occurring first in the Doc is chosen.
```python
### {executable="true"}
From 70ee4ef4fdcbdb659fa84b7356c08dd910c44968 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Thu, 26 Mar 2020 13:47:31 +0100
Subject: [PATCH 100/496] Fix small errors
---
spacy/morphology.pyx | 4 ++--
spacy/tokens/doc.pyx | 2 +-
2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
index 946da141d..0b53b124c 100644
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -42,7 +42,7 @@ def _normalize_props(props):
elif isinstance(key, (int, str)) and isinstance(value, (int, str)):
out[key] = value
else:
- warnings.warn(Warnings.W028.format(feature={key: value}))
+ warnings.warn(Warnings.W029.format(feature={key: value}))
return out
@@ -112,7 +112,7 @@ cdef class Morphology:
return tag_ptr.key
features = self.feats_to_dict(features)
if not isinstance(features, dict):
- warnings.warn(Warnings.W028.format(feature=features))
+ warnings.warn(Warnings.W029.format(feature=features))
features = {}
features = _normalize_props(features)
string_features = {self.strings.as_string(field): self.strings.as_string(values) for field, values in features.items()}
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 6a139dd86..a6b1b171b 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -781,7 +781,7 @@ cdef class Doc:
attrs = [(IDS[id_.upper()] if hasattr(id_, "upper") else id_)
for id_ in attrs]
if array.dtype != numpy.uint64:
- user_warning(Warnings.W028.format(type=array.dtype))
+ warnings.warn(Warnings.W028.format(type=array.dtype))
if SENT_START in attrs and HEAD in attrs:
raise ValueError(Errors.E032)
From e7341db5dc16102625d9f0f90545596145968920 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Thu, 26 Mar 2020 14:05:40 +0100
Subject: [PATCH 101/496] Add sent_start to pattern schema
---
spacy/schemas.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/spacy/schemas.py b/spacy/schemas.py
index 2268bf100..3b6313db8 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -119,6 +119,7 @@ class TokenPattern(BaseModel):
is_currency: Optional[StrictBool] = None
is_stop: Optional[StrictBool] = None
is_sent_start: Optional[StrictBool] = None
+ sent_start: Optional[StrictBool] = None
like_num: Optional[StrictBool] = None
like_url: Optional[StrictBool] = None
like_email: Optional[StrictBool] = None
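Presumably this lets token patterns keyed on `SENT_START` (in addition to `IS_SENT_START`) pass schema validation; whether the Matcher treats the attribute identically is not shown in this patch. A minimal check against the pydantic model, with the field name taken directly from the diff:

```python
# Minimal validation sketch for the new field.
from spacy.schemas import TokenPattern

pattern = TokenPattern(sent_start=True)
print(pattern.dict(exclude_none=True))  # {'sent_start': True}
```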
From 7453df79d166b0441becc0296de2b691dd7afa06 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Thu, 26 Mar 2020 14:09:02 +0100
Subject: [PATCH 102/496] Fix argument
---
spacy/language.py | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/spacy/language.py b/spacy/language.py
index ce9412d85..5343df4b7 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1150,7 +1150,7 @@ def _pipe(examples, proc, kwargs):
yield ex
-def _apply_pipes(make_doc, pipes, receiver, sender, underscore_state, vectors):
+def _apply_pipes(make_doc, pipes, receiver, sender, underscore_state):
"""Worker for Language.pipe
receiver (multiprocessing.Connection): Pipe to receive text. Usually
@@ -1158,7 +1158,6 @@ def _apply_pipes(make_doc, pipes, receiver, sender, underscore_state, vectors):
sender (multiprocessing.Connection): Pipe to send doc. Usually created by
`multiprocessing.Pipe()`
underscore_state (tuple): The data in the Underscore class of the parent
- vectors (dict): The global vectors data, copied from the parent
"""
Underscore.load_state(underscore_state)
while True:
From f12a46472c6d5f5cf05a2576ccffe1ca82d2f37e Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Thu, 26 Mar 2020 15:18:32 +0100
Subject: [PATCH 103/496] Remove unicode declarations
---
spacy/lang/eu/__init__.py | 3 ---
spacy/lang/eu/examples.py | 3 ---
spacy/lang/eu/lex_attrs.py | 3 ---
spacy/lang/eu/punctuation.py | 3 ---
spacy/lang/eu/stop_words.py | 3 ---
spacy/lang/eu/tag_map.py | 3 ---
spacy/lang/lij/__init__.py | 3 ---
spacy/lang/lij/examples.py | 4 ----
spacy/lang/lij/punctuation.py | 3 ---
spacy/lang/lij/stop_words.py | 4 ----
spacy/lang/lij/tokenizer_exceptions.py | 2 --
spacy/lang/lt/punctuation.py | 3 ---
spacy/lang/ro/punctuation.py | 3 ---
spacy/tests/lang/eu/test_text.py | 3 ---
spacy/tests/regression/test_issue4725.py | 3 ---
spacy/tests/regression/test_issue4903.py | 3 ---
spacy/tests/regression/test_issue5048.py | 3 ---
spacy/tests/regression/test_issue5082.py | 3 ---
18 files changed, 55 deletions(-)
diff --git a/spacy/lang/eu/__init__.py b/spacy/lang/eu/__init__.py
index 4f3338c1d..352eb1548 100644
--- a/spacy/lang/eu/__init__.py
+++ b/spacy/lang/eu/__init__.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_SUFFIXES
diff --git a/spacy/lang/eu/examples.py b/spacy/lang/eu/examples.py
index 463494abd..3b9ef71b6 100644
--- a/spacy/lang/eu/examples.py
+++ b/spacy/lang/eu/examples.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/eu/lex_attrs.py b/spacy/lang/eu/lex_attrs.py
index 19b75c111..a3ab018ee 100644
--- a/spacy/lang/eu/lex_attrs.py
+++ b/spacy/lang/eu/lex_attrs.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...attrs import LIKE_NUM
# Source http://mylanguages.org/basque_numbers.php
diff --git a/spacy/lang/eu/punctuation.py b/spacy/lang/eu/punctuation.py
index b8b1a1c83..5d35d0a25 100644
--- a/spacy/lang/eu/punctuation.py
+++ b/spacy/lang/eu/punctuation.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ..punctuation import TOKENIZER_SUFFIXES
diff --git a/spacy/lang/eu/stop_words.py b/spacy/lang/eu/stop_words.py
index dda11a7fd..d213b5b81 100644
--- a/spacy/lang/eu/stop_words.py
+++ b/spacy/lang/eu/stop_words.py
@@ -1,6 +1,3 @@
-# encoding: utf8
-from __future__ import unicode_literals
-
# Source: https://github.com/stopwords-iso/stopwords-eu
# https://www.ranks.nl/stopwords/basque
# https://www.mustgo.com/worldlanguages/basque/
diff --git a/spacy/lang/eu/tag_map.py b/spacy/lang/eu/tag_map.py
index 2499d7e3e..e0940edb7 100644
--- a/spacy/lang/eu/tag_map.py
+++ b/spacy/lang/eu/tag_map.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...symbols import POS, PUNCT, SYM, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB
from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON
diff --git a/spacy/lang/lij/__init__.py b/spacy/lang/lij/__init__.py
index 9b4b29798..a75f081bf 100644
--- a/spacy/lang/lij/__init__.py
+++ b/spacy/lang/lij/__init__.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_INFIXES
diff --git a/spacy/lang/lij/examples.py b/spacy/lang/lij/examples.py
index c4034ae7e..ba7fe43fd 100644
--- a/spacy/lang/lij/examples.py
+++ b/spacy/lang/lij/examples.py
@@ -1,7 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
"""
Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/lij/punctuation.py b/spacy/lang/lij/punctuation.py
index 4439376c8..d50b75589 100644
--- a/spacy/lang/lij/punctuation.py
+++ b/spacy/lang/lij/punctuation.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ..punctuation import TOKENIZER_INFIXES
from ..char_classes import ALPHA
diff --git a/spacy/lang/lij/stop_words.py b/spacy/lang/lij/stop_words.py
index ffd53370d..1d6f09d27 100644
--- a/spacy/lang/lij/stop_words.py
+++ b/spacy/lang/lij/stop_words.py
@@ -1,7 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
STOP_WORDS = set(
"""
a à â a-a a-e a-i a-o aiva aloa an ancheu ancon apreuvo ascì atra atre atri atro avanti avei
diff --git a/spacy/lang/lij/tokenizer_exceptions.py b/spacy/lang/lij/tokenizer_exceptions.py
index 2109add62..2befabca3 100644
--- a/spacy/lang/lij/tokenizer_exceptions.py
+++ b/spacy/lang/lij/tokenizer_exceptions.py
@@ -1,5 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
from ...symbols import ORTH, LEMMA
_exc = {}
diff --git a/spacy/lang/lt/punctuation.py b/spacy/lang/lt/punctuation.py
index 5eedc8116..506aa8f32 100644
--- a/spacy/lang/lt/punctuation.py
+++ b/spacy/lang/lt/punctuation.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ..char_classes import LIST_ICONS, LIST_ELLIPSES
from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
from ..char_classes import HYPHENS
diff --git a/spacy/lang/ro/punctuation.py b/spacy/lang/ro/punctuation.py
index 87f9a1248..529e1c977 100644
--- a/spacy/lang/ro/punctuation.py
+++ b/spacy/lang/ro/punctuation.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import itertools
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
diff --git a/spacy/tests/lang/eu/test_text.py b/spacy/tests/lang/eu/test_text.py
index f448a7859..94d5ac91d 100644
--- a/spacy/tests/lang/eu/test_text.py
+++ b/spacy/tests/lang/eu/test_text.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
import pytest
diff --git a/spacy/tests/regression/test_issue4725.py b/spacy/tests/regression/test_issue4725.py
index 57675a202..624eefb2c 100644
--- a/spacy/tests/regression/test_issue4725.py
+++ b/spacy/tests/regression/test_issue4725.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import numpy
from spacy.lang.en import English
diff --git a/spacy/tests/regression/test_issue4903.py b/spacy/tests/regression/test_issue4903.py
index d467b1cd6..a3dff16aa 100644
--- a/spacy/tests/regression/test_issue4903.py
+++ b/spacy/tests/regression/test_issue4903.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from spacy.lang.en import English
from spacy.tokens import Span, Doc
diff --git a/spacy/tests/regression/test_issue5048.py b/spacy/tests/regression/test_issue5048.py
index 228322493..bc52ae82f 100644
--- a/spacy/tests/regression/test_issue5048.py
+++ b/spacy/tests/regression/test_issue5048.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import numpy
from spacy.tokens import Doc
from spacy.attrs import DEP, POS, TAG
diff --git a/spacy/tests/regression/test_issue5082.py b/spacy/tests/regression/test_issue5082.py
index efa5d39f2..52a52b177 100644
--- a/spacy/tests/regression/test_issue5082.py
+++ b/spacy/tests/regression/test_issue5082.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import numpy as np
from spacy.lang.en import English
from spacy.pipeline import EntityRuler
From 4fe2299586227496c2cd1c1649158bb0464ab0d7 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Thu, 26 Mar 2020 20:58:13 +0100
Subject: [PATCH 104/496] xfail hanging test
---
spacy/tests/regression/test_issue4725.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/spacy/tests/regression/test_issue4725.py b/spacy/tests/regression/test_issue4725.py
index 624eefb2c..a5087f0b2 100644
--- a/spacy/tests/regression/test_issue4725.py
+++ b/spacy/tests/regression/test_issue4725.py
@@ -4,6 +4,7 @@ from spacy.lang.en import English
from spacy.vocab import Vocab
+@pytest.mark.xfail(reason="currently hangs")
def test_issue4725():
# ensures that this runs correctly and doesn't hang or crash because of the global vectors
vocab = Vocab(vectors_name="test_vocab_add_vector")
From ee4bb0e3b6247429e05bf0e09599b98ed58c269a Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Thu, 26 Mar 2020 21:44:18 +0100
Subject: [PATCH 105/496] Fix import
---
spacy/tests/regression/test_issue4725.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/spacy/tests/regression/test_issue4725.py b/spacy/tests/regression/test_issue4725.py
index a5087f0b2..720da93e3 100644
--- a/spacy/tests/regression/test_issue4725.py
+++ b/spacy/tests/regression/test_issue4725.py
@@ -1,3 +1,4 @@
+import pytest
import numpy
from spacy.lang.en import English
From 92b9b631ef2efd834cfde471a1f95fe7a3707336 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Fri, 27 Mar 2020 10:51:32 +0100
Subject: [PATCH 106/496] xfail -> skip
---
spacy/tests/regression/test_issue4725.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/spacy/tests/regression/test_issue4725.py b/spacy/tests/regression/test_issue4725.py
index 720da93e3..ca6c3f767 100644
--- a/spacy/tests/regression/test_issue4725.py
+++ b/spacy/tests/regression/test_issue4725.py
@@ -5,7 +5,7 @@ from spacy.lang.en import English
from spacy.vocab import Vocab
-@pytest.mark.xfail(reason="currently hangs")
+@pytest.mark.skip(reason="currently hangs")
def test_issue4725():
# ensures that this runs correctly and doesn't hang or crash because of the global vectors
vocab = Vocab(vectors_name="test_vocab_add_vector")
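The `xfail` → `skip` switch above matters because the two markers behave differently: an `xfail`-marked test is still executed (its failure is merely expected), so a hanging test body still hangs the session, while `skip` prevents the body from running at all. A standalone sketch of the difference, not part of the spaCy test suite:

```python
import pytest


@pytest.mark.xfail(reason="expected to fail, but the body still runs")
def test_marked_xfail():
    # pytest executes this body; a hang here would still block the test run
    assert 1 + 1 == 3


@pytest.mark.skip(reason="currently hangs")
def test_marked_skip():
    # pytest never enters this body, which is why skip was the right call
    # for the hanging vectors test above
    raise RuntimeError("never reached")
```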
From 9b412516e7ccbd3cfd9010465fc0d4220fff7fc9 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem
Date: Fri, 27 Mar 2020 19:35:26 +0100
Subject: [PATCH 107/496] Fixing pickling of the parser (#5218)
* fix __reduce__ for pickling parser
* setting the move object as 'state' during pickling
* unskip test_issue4725 - works again
---
spacy/pipeline/pipes.pyx | 17 ++++++++++++++---
spacy/syntax/nn_parser.pyx | 8 +++++++-
spacy/tests/regression/test_issue4725.py | 1 -
website/docs/usage/saving-loading.md | 2 +-
4 files changed, 22 insertions(+), 6 deletions(-)
diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index 658de8a1f..9ea2507cb 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -1173,7 +1173,13 @@ cdef class DependencyParser(Parser):
tok2vec=tok2vec, sgd=sgd)
def __reduce__(self):
- return (DependencyParser, (self.vocab, self.moves, self.model), None, None)
+ return (DependencyParser, (self.vocab, self.model), self.moves)
+
+ def __getstate__(self):
+ return self.moves
+
+ def __setstate__(self, moves):
+ self.moves = moves
@property
def labels(self):
@@ -1214,8 +1220,13 @@ cdef class EntityRecognizer(Parser):
tok2vec=tok2vec)
def __reduce__(self):
- return (EntityRecognizer, (self.vocab, self.moves, self.model),
- None, None)
+ return (EntityRecognizer, (self.vocab, self.model), self.moves)
+
+ def __getstate__(self):
+ return self.moves
+
+ def __setstate__(self, moves):
+ self.moves = moves
@property
def labels(self):
diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index 2ba13507f..f480e3528 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -79,7 +79,13 @@ cdef class Parser:
return cls(nlp.vocab, model, **cfg)
def __reduce__(self):
- return (Parser, (self.vocab, self.moves, self.model), None, None)
+ return (Parser, (self.vocab, self.model), self.moves)
+
+ def __getstate__(self):
+ return self.moves
+
+ def __setstate__(self, moves):
+ self.moves = moves
@property
def move_names(self):
diff --git a/spacy/tests/regression/test_issue4725.py b/spacy/tests/regression/test_issue4725.py
index ca6c3f767..967db5d67 100644
--- a/spacy/tests/regression/test_issue4725.py
+++ b/spacy/tests/regression/test_issue4725.py
@@ -5,7 +5,6 @@ from spacy.lang.en import English
from spacy.vocab import Vocab
-@pytest.mark.skip(reason="currently hangs")
def test_issue4725():
# ensures that this runs correctly and doesn't hang or crash because of the global vectors
vocab = Vocab(vectors_name="test_vocab_add_vector")
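The `__reduce__`/`__getstate__`/`__setstate__` changes above follow the standard pickle protocol: the third element of the tuple returned by `__reduce__` is treated as state and handed to `__setstate__` after the object has been reconstructed from the first two elements. A minimal sketch of that pattern with a hypothetical `Component` class (plain Python, not the actual parser):

```python
import pickle


class Component:
    def __init__(self, vocab, model):
        self.vocab = vocab
        self.model = model
        self.moves = None  # stateful attribute restored separately

    def __reduce__(self):
        # (callable, args, state): unpickling calls Component(vocab, model)
        # and then passes `state` to __setstate__ on the new instance
        return (Component, (self.vocab, self.model), self.moves)

    def __getstate__(self):
        return self.moves

    def __setstate__(self, moves):
        self.moves = moves


comp = Component(vocab={"lexemes": 42}, model="dummy-model")
comp.moves = ["SHIFT", "REDUCE", "LEFT-ARC"]
restored = pickle.loads(pickle.dumps(comp))
assert restored.moves == ["SHIFT", "REDUCE", "LEFT-ARC"]
```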
diff --git a/website/docs/usage/saving-loading.md b/website/docs/usage/saving-loading.md
index 8e2c30d82..058204a5d 100644
--- a/website/docs/usage/saving-loading.md
+++ b/website/docs/usage/saving-loading.md
@@ -131,7 +131,7 @@ shared vocab it depends on.
If you need to pickle multiple objects, try to pickle them **together** instead
of separately. For instance, instead of pickling all pipeline components, pickle
the entire pipeline once. And instead of pickling several `Doc` objects
-separately, pickle a list of `Doc` objects. Since the all share a reference to
+separately, pickle a list of `Doc` objects. Since they all share a reference to
the _same_ `Vocab` object, it will only be included once.
```python
From 1f9852abc30fd61cdfd0edc494f9ba32ae404b31 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem
Date: Sat, 28 Mar 2020 23:09:35 +0100
Subject: [PATCH 108/496] Fix parser @ GPU (#5210)
* ensure self.bias is numpy array in parser model
* 2 more little bug fixes for parser on GPU
* removing testing GPU statement
* remove commented code
---
spacy/ml/_layers.py | 8 ++++++--
spacy/syntax/_parser_model.pyx | 5 +----
2 files changed, 7 insertions(+), 6 deletions(-)
diff --git a/spacy/ml/_layers.py b/spacy/ml/_layers.py
index 7e9150d8b..a752ef49a 100644
--- a/spacy/ml/_layers.py
+++ b/spacy/ml/_layers.py
@@ -79,7 +79,7 @@ def _backprop_precomputable_affine_padding(model, dY, ids):
# for b in range(nB):
# for f in range(nF):
# if ids[b, f] < 0:
- # d_padding[0, f] += dY[b]
+ # d_pad[0, f] += dY[b]
#
# Which can be rewritten as:
#
@@ -88,9 +88,13 @@ def _backprop_precomputable_affine_padding(model, dY, ids):
#
# I don't know how to avoid the loop without building a whole array :(.
# Cursed numpy.
+ #
+ # Note by Sofie: rewritten to longer loop because "CuPy only supports slices that consist of one boolean array."
d_pad = model.ops.alloc((1, nF, nO, nP))
for b in range(nB):
- d_pad[0, ids[b] < 0] += dY[b]
+ for f in range(nF):
+ if ids[b, f] < 0:
+ d_pad[0, f] += dY[b]
return d_pad
diff --git a/spacy/syntax/_parser_model.pyx b/spacy/syntax/_parser_model.pyx
index 4a1014a09..4f4e5e4b0 100644
--- a/spacy/syntax/_parser_model.pyx
+++ b/spacy/syntax/_parser_model.pyx
@@ -371,8 +371,6 @@ class ParserStepModel(Model):
self.ops.scatter_add(d_tokvecs, ids,
d_state_features)
# Padded -- see update()
- if isinstance(self.ops, CupyOps):
- d_tokvecs = self.ops.to_numpy(d_tokvecs)
self.bp_tokvecs(d_tokvecs[:-1])
return d_tokvecs
@@ -445,8 +443,7 @@ cdef class precompute_hiddens:
else:
cached = gpu_cached
if not isinstance(lower_model.get_param("b"), numpy.ndarray):
- # self.bias = lower_model.get_param("b").get(stream=cuda_stream) ???
- self.bias = lower_model.get_param("b")
+ self.bias = lower_model.get_param("b").get(stream=cuda_stream)
else:
self.bias = lower_model.get_param("b")
self.nF = cached.shape[1]
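The padding gradient above is rewritten from a boolean-mask assignment into an explicit loop because, as the added comment notes, CuPy restricts that kind of fancy indexing. Both formulations compute the same thing; a NumPy-only sketch with toy shapes and a made-up `ids` array:

```python
import numpy

nB, nF, nO, nP = 3, 4, 5, 2                     # batch, features, outputs, pieces
ids = numpy.array([[0, -1, 2, -1],
                   [1, 1, -1, 0],
                   [2, 0, 1, 3]])
dY = numpy.random.rand(nB, nO, nP).astype("f")

# Boolean-mask version (fine in NumPy, rejected by CuPy's restricted indexing)
d_pad_masked = numpy.zeros((1, nF, nO, nP), dtype="f")
for b in range(nB):
    d_pad_masked[0, ids[b] < 0] += dY[b]

# Explicit double loop, as in the patched _backprop_precomputable_affine_padding
d_pad_loop = numpy.zeros((1, nF, nO, nP), dtype="f")
for b in range(nB):
    for f in range(nF):
        if ids[b, f] < 0:
            d_pad_loop[0, f] += dY[b]

assert numpy.allclose(d_pad_masked, d_pad_loop)
```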
From d6d95674c15d36afa12b819217a722a3c14a7353 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem
Date: Sun, 29 Mar 2020 13:56:07 +0200
Subject: [PATCH 109/496] bugfix in span similarity (#5155)
* bugfix in span similarity
* also rewrite doc.pyx for clarity
* formatting
---
spacy/tests/regression/test_issue5152.py | 18 ++++++++++++++++++
spacy/tokens/doc.pyx | 15 ++++++++-------
spacy/tokens/span.pyx | 6 ++++--
3 files changed, 30 insertions(+), 9 deletions(-)
create mode 100644 spacy/tests/regression/test_issue5152.py
diff --git a/spacy/tests/regression/test_issue5152.py b/spacy/tests/regression/test_issue5152.py
new file mode 100644
index 000000000..a9a57746d
--- /dev/null
+++ b/spacy/tests/regression/test_issue5152.py
@@ -0,0 +1,18 @@
+from spacy.lang.en import English
+
+
+def test_issue5152():
+ # Test that the comparison between a Span and a Token goes well
+ # There was a bug when the number of tokens in the span equaled the number of characters in the token (!)
+ nlp = English()
+ text = nlp("Talk about being boring!")
+ text_var = nlp("Talk of being boring!")
+ y = nlp("Let")
+
+ span = text[0:3] # Talk about being
+ span_2 = text[0:3] # Talk about being
+ span_3 = text_var[0:3] # Talk of being
+ token = y[0] # Let
+ assert span.similarity(token) == 0.0
+ assert span.similarity(span_2) == 1.0
+ assert span_2.similarity(span_3) < 1.0
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index a6b1b171b..0716b2b3d 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -380,13 +380,14 @@ cdef class Doc:
if isinstance(other, (Lexeme, Token)) and self.length == 1:
if self.c[0].lex.orth == other.orth:
return 1.0
- elif isinstance(other, (Span, Doc)):
- if len(self) == len(other):
- for i in range(self.length):
- if self[i].orth != other[i].orth:
- break
- else:
- return 1.0
+ elif isinstance(other, (Span, Doc)) and len(self) == len(other):
+ similar = True
+ for i in range(self.length):
+ if self[i].orth != other[i].orth:
+ similar = False
+ break
+ if similar:
+ return 1.0
if self.vocab.vectors.n_keys == 0:
warnings.warn(Warnings.W007.format(obj="Doc"))
if self.vector_norm == 0 or other.vector_norm == 0:
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 53d1b9826..66e8d8c3e 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -320,11 +320,13 @@ cdef class Span:
if len(self) == 1 and hasattr(other, "orth"):
if self[0].orth == other.orth:
return 1.0
- elif hasattr(other, "__len__") and len(self) == len(other):
+ elif isinstance(other, (Doc, Span)) and len(self) == len(other):
+ similar = True
for i in range(len(self)):
if self[i].orth != getattr(other[i], "orth", None):
+ similar = False
break
- else:
+ if similar:
return 1.0
if self.vocab.vectors.n_keys == 0:
warnings.warn(Warnings.W007.format(obj="Span"))
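Both similarity methods above replace a `for ... else` construct with an explicit `similar` flag: the `else` branch of a loop only runs when the loop was *not* broken, which is easy to misread (and in `Span.similarity` the overly loose `hasattr(other, "__len__")` check compounded the bug). A standalone sketch of the two equivalent loop styles, using plain lists instead of spaCy objects:

```python
def all_items_match(a, b):
    # explicit-flag version, mirroring the rewritten Doc/Span checks
    if len(a) != len(b):
        return False
    similar = True
    for x, y in zip(a, b):
        if x != y:
            similar = False
            break
    return similar


def all_items_match_for_else(a, b):
    # for/else version: the else clause runs only if the loop never breaks
    if len(a) != len(b):
        return False
    for x, y in zip(a, b):
        if x != y:
            break
    else:
        return True
    return False


assert all_items_match([1, 2, 3], [1, 2, 3]) is True
assert all_items_match([1, 2, 3], [1, 9, 3]) is False
assert all_items_match_for_else([1, 2], [1, 2]) is True
```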
From ce0e5380684fd593e2839ad1d954e1218224246c Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Sun, 29 Mar 2020 13:57:00 +0200
Subject: [PATCH 110/496] Check whether doc is instantiated in
Example.get_gold_parses() (#5167)
* Check whether doc is instantiated
When creating docs to pair with gold parses, modify test to check
whether a doc is unset rather than whether it contains tokens.
* Restore test of evaluate on an empty doc
* Set a minimal gold.orig for the scorer
Without a minimal gold.orig the scorer can't evaluate empty docs. This
is the v3 equivalent of #4925.
---
spacy/gold.pyx | 7 +++++--
spacy/tests/regression/test_issue4924.py | 3 +--
spacy/tests/test_gold.py | 7 +++++++
3 files changed, 13 insertions(+), 4 deletions(-)
diff --git a/spacy/gold.pyx b/spacy/gold.pyx
index 37d092395..a9156c1a5 100644
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -834,7 +834,7 @@ cdef class Example:
if merge:
t = self.token_annotation
doc = self.doc
- if not self.doc:
+ if self.doc is None:
if not vocab:
raise ValueError(Errors.E998)
doc = Doc(vocab, words=t.words)
@@ -993,7 +993,10 @@ cdef class GoldParse:
self.links = {} if links is None else dict(links)
# avoid allocating memory if the doc does not contain any tokens
- if self.length > 0:
+ if self.length == 0:
+ # set a minimal orig so that the scorer can score an empty doc
+ self.orig = TokenAnnotation(ids=[])
+ else:
if not words:
words = [token.text for token in doc]
if not tags:
diff --git a/spacy/tests/regression/test_issue4924.py b/spacy/tests/regression/test_issue4924.py
index 1eb6afcf0..b240f6d4a 100644
--- a/spacy/tests/regression/test_issue4924.py
+++ b/spacy/tests/regression/test_issue4924.py
@@ -5,5 +5,4 @@ from spacy.language import Language
def test_issue4924():
nlp = Language()
docs_golds = [("", {})]
- with pytest.raises(ValueError):
- nlp.evaluate(docs_golds)
+ nlp.evaluate(docs_golds)
diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py
index 7fe8aab73..0754fb5bc 100644
--- a/spacy/tests/test_gold.py
+++ b/spacy/tests/test_gold.py
@@ -480,3 +480,10 @@ def test_tuples_to_example(merged_dict):
assert ex_dict["token_annotation"]["tags"] == merged_dict["tags"]
assert ex_dict["token_annotation"]["sent_starts"] == merged_dict["sent_starts"]
assert ex_dict["doc_annotation"]["cats"] == cats
+
+
+def test_empty_example_goldparse():
+ nlp = English()
+ doc = nlp("")
+ example = Example(doc=doc)
+ assert len(example.get_gold_parses()) == 1
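The `if not self.doc` → `if self.doc is None` change above is the whole fix: a `Doc` that exists but contains zero tokens is falsy, so the truthiness test wrongly treated an empty doc as a missing one. A generic sketch of the pitfall, with a plain empty list standing in for an instantiated empty `Doc`:

```python
doc = []  # stands in for a Doc that exists but has zero tokens

# Truthiness check: an empty container is falsy, so this branch fires even
# though the object was instantiated -- the behaviour the patch removes
treated_as_missing_by_truthiness = not doc

# Identity check: only fires when the doc was never created at all
treated_as_missing_by_identity = doc is None

assert treated_as_missing_by_truthiness is True
assert treated_as_missing_by_identity is False
```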
From 311133e579158a26f34379e44054762dac8d93fc Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem
Date: Sun, 29 Mar 2020 19:40:36 +0200
Subject: [PATCH 111/496] Train textcat with config (#5143)
* bring back default build_text_classifier method
* remove _set_dims_ hack in favor of proper dim inference
* add tok2vec initialize to unit test
* small fixes
* add unit test for various textcat config settings
* logistic output layer does not have nO
* fix window_size setting
* proper fix
* fix W initialization
* Update textcat training example
* Use ml_datasets
* Convert training data to `Example` format
* Use `n_texts` to set proportionate dev size
* fix _init renaming on latest thinc
* avoid setting a non-existing dim
* update to thinc==8.0.0a2
* add BOW and CNN defaults for easy testing
* various experiments with train_textcat script, fix softmax activation in textcat bow
* allow textcat train script to work on other datasets as well
* have dataset as a parameter
* train textcat from config, with example config
* add config for training textcat
* formatting
* fix exclusive_classes
* fixing BOW for GPU
* bump thinc to 8.0.0a3 (not published yet so CI will fail)
* add in link_vectors_to_models which got deleted
Co-authored-by: Adriane Boyd
---
examples/training/train_textcat.py | 100 ++++++++++------
examples/training/train_textcat_config.cfg | 19 +++
.../{_layers.py => _precomputable_affine.py} | 0
spacy/ml/extract_ngrams.py | 20 ++--
.../models/defaults/textcat_bow_defaults.cfg | 5 +
.../models/defaults/textcat_cnn_defaults.cfg | 13 ++
spacy/ml/models/defaults/textcat_defaults.cfg | 12 +-
spacy/ml/models/parser.py | 2 +-
spacy/ml/models/textcat.py | 112 ++++++++++++++++--
spacy/ml/models/tok2vec.py | 12 +-
spacy/ml/spacy_vectors.py | 27 +++++
spacy/pipeline/pipes.pyx | 4 +-
spacy/pipeline/tok2vec.py | 6 +-
spacy/tests/pipeline/test_textcat.py | 34 +++++-
spacy/tests/test_misc.py | 3 +-
spacy/tests/test_tok2vec.py | 15 +--
spacy/tests/util.py | 15 +++
17 files changed, 301 insertions(+), 98 deletions(-)
create mode 100644 examples/training/train_textcat_config.cfg
rename spacy/ml/{_layers.py => _precomputable_affine.py} (100%)
create mode 100644 spacy/ml/models/defaults/textcat_bow_defaults.cfg
create mode 100644 spacy/ml/models/defaults/textcat_cnn_defaults.cfg
create mode 100644 spacy/ml/spacy_vectors.py
diff --git a/examples/training/train_textcat.py b/examples/training/train_textcat.py
index 50c852ac1..dfb95b038 100644
--- a/examples/training/train_textcat.py
+++ b/examples/training/train_textcat.py
@@ -2,70 +2,71 @@
# coding: utf8
"""Train a convolutional neural network text classifier on the
IMDB dataset, using the TextCategorizer component. The dataset will be loaded
-automatically via Thinc's built-in dataset loader. The model is added to
+automatically via the package `ml_datasets`. The model is added to
spacy.pipeline, and predictions are available via `doc.cats`. For more details,
see the documentation:
* Training: https://spacy.io/usage/training
-Compatible with: spaCy v2.0.0+
+Compatible with: spaCy v3.0.0+
"""
from __future__ import unicode_literals, print_function
-import ml_datasets
import plac
import random
from pathlib import Path
+from ml_datasets import loaders
import spacy
+from spacy import util
from spacy.util import minibatch, compounding
+from spacy.gold import Example, GoldParse
@plac.annotations(
- model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
+ config_path=("Path to config file", "positional", None, Path),
output_dir=("Optional output directory", "option", "o", Path),
n_texts=("Number of texts to train from", "option", "t", int),
n_iter=("Number of training iterations", "option", "n", int),
init_tok2vec=("Pretrained tok2vec weights", "option", "t2v", Path),
+ dataset=("Dataset to train on (default: imdb)", "option", "d", str),
+ threshold=("Min. number of instances for a given label (default 20)", "option", "m", int)
)
-def main(model=None, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=None):
+def main(config_path, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=None, dataset="imdb", threshold=20):
+ if not config_path or not config_path.exists():
+ raise ValueError(f"Config file not found at {config_path}")
+
+ spacy.util.fix_random_seed()
if output_dir is not None:
output_dir = Path(output_dir)
if not output_dir.exists():
output_dir.mkdir()
- if model is not None:
- nlp = spacy.load(model) # load existing spaCy model
- print("Loaded model '%s'" % model)
- else:
- nlp = spacy.blank("en") # create blank Language class
- print("Created blank 'en' model")
+ print(f"Loading nlp model from {config_path}")
+ nlp_config = util.load_config(config_path, create_objects=False)["nlp"]
+ nlp = util.load_model_from_config(nlp_config)
- # add the text classifier to the pipeline if it doesn't exist
- # nlp.create_pipe works for built-ins that are registered with spaCy
+ # ensure the nlp object was defined with a textcat component
if "textcat" not in nlp.pipe_names:
- textcat = nlp.create_pipe(
- "textcat", config={"exclusive_classes": True, "architecture": "simple_cnn"}
- )
- nlp.add_pipe(textcat, last=True)
- # otherwise, get it, so we can add labels to it
- else:
- textcat = nlp.get_pipe("textcat")
+ raise ValueError(f"The nlp definition in the config does not contain a textcat component")
- # add label to text classifier
- textcat.add_label("POSITIVE")
- textcat.add_label("NEGATIVE")
+ textcat = nlp.get_pipe("textcat")
- # load the IMDB dataset
- print("Loading IMDB data...")
- (train_texts, train_cats), (dev_texts, dev_cats) = load_data()
- train_texts = train_texts[:n_texts]
- train_cats = train_cats[:n_texts]
+ # load the dataset
+ print(f"Loading dataset {dataset} ...")
+ (train_texts, train_cats), (dev_texts, dev_cats) = load_data(dataset=dataset, threshold=threshold, limit=n_texts)
print(
"Using {} examples ({} training, {} evaluation)".format(
n_texts, len(train_texts), len(dev_texts)
)
)
- train_data = list(zip(train_texts, [{"cats": cats} for cats in train_cats]))
+ train_examples = []
+ for text, cats in zip(train_texts, train_cats):
+ doc = nlp.make_doc(text)
+ gold = GoldParse(doc, cats=cats)
+ for cat in cats:
+ textcat.add_label(cat)
+ ex = Example.from_gold(gold, doc=doc)
+ train_examples.append(ex)
# get names of other pipes to disable them during training
pipe_exceptions = ["textcat", "trf_wordpiecer", "trf_tok2vec"]
@@ -81,8 +82,8 @@ def main(model=None, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=None
for i in range(n_iter):
losses = {}
# batch up the examples using spaCy's minibatch
- random.shuffle(train_data)
- batches = minibatch(train_data, size=batch_sizes)
+ random.shuffle(train_examples)
+ batches = minibatch(train_examples, size=batch_sizes)
for batch in batches:
nlp.update(batch, sgd=optimizer, drop=0.2, losses=losses)
with textcat.model.use_params(optimizer.averages):
@@ -97,7 +98,7 @@ def main(model=None, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=None
)
)
- # test the trained model
+ # test the trained model (only makes sense for sentiment analysis)
test_text = "This movie sucked"
doc = nlp(test_text)
print(test_text, doc.cats)
@@ -114,14 +115,39 @@ def main(model=None, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=None
print(test_text, doc2.cats)
-def load_data(limit=0, split=0.8):
- """Load data from the IMDB dataset."""
+def load_data(dataset, threshold, limit=0, split=0.8):
+ """Load data from the provided dataset."""
# Partition off part of the train data for evaluation
- train_data, _ = ml_datasets.imdb()
+ data_loader = loaders.get(dataset)
+ train_data, _ = data_loader(limit=int(limit/split))
random.shuffle(train_data)
- train_data = train_data[-limit:]
texts, labels = zip(*train_data)
- cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in labels]
+
+ unique_labels = sorted(set([l for label_set in labels for l in label_set]))
+ print(f"# of unique_labels: {len(unique_labels)}")
+
+ count_values_train = dict()
+ for text, annot_list in train_data:
+ for annot in annot_list:
+ count_values_train[annot] = count_values_train.get(annot, 0) + 1
+ for value, count in sorted(count_values_train.items(), key=lambda item: item[1]):
+ if count < threshold:
+ unique_labels.remove(value)
+
+ print(f"# of unique_labels after filtering with threshold {threshold}: {len(unique_labels)}")
+
+ if unique_labels == {0, 1}:
+ cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in labels]
+ else:
+ cats = []
+ for y in labels:
+ if isinstance(y, str):
+ cats.append({str(label): (label == y) for label in unique_labels})
+ elif isinstance(y, set):
+ cats.append({str(label): (label in y) for label in unique_labels})
+ else:
+ raise ValueError(f"Unrecognised type of labels: {type(y)}")
+
split = int(len(train_data) * split)
return (texts[:split], cats[:split]), (texts[split:], cats[split:])
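The reworked `load_data` above converts each example's raw labels into a `cats` dictionary with one boolean entry per surviving label, handling both single-label (string) and multi-label (set) annotations. A small standalone sketch of that construction with made-up labels, independent of `ml_datasets`:

```python
unique_labels = ["BUSINESS", "SPORTS", "TECH"]
labels = ["SPORTS", {"BUSINESS", "TECH"}, "TECH"]  # one str or set per example

cats = []
for y in labels:
    if isinstance(y, str):
        # single-label example: exactly one category is True
        cats.append({label: (label == y) for label in unique_labels})
    elif isinstance(y, set):
        # multi-label example: every listed category is True
        cats.append({label: (label in y) for label in unique_labels})

assert cats[0] == {"BUSINESS": False, "SPORTS": True, "TECH": False}
assert cats[1] == {"BUSINESS": True, "SPORTS": False, "TECH": True}
```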
diff --git a/examples/training/train_textcat_config.cfg b/examples/training/train_textcat_config.cfg
new file mode 100644
index 000000000..7c0f36b57
--- /dev/null
+++ b/examples/training/train_textcat_config.cfg
@@ -0,0 +1,19 @@
+[nlp]
+lang = "en"
+
+[nlp.pipeline.textcat]
+factory = "textcat"
+
+[nlp.pipeline.textcat.model]
+@architectures = "spacy.TextCatCNN.v1"
+exclusive_classes = false
+
+[nlp.pipeline.textcat.model.tok2vec]
+@architectures = "spacy.HashEmbedCNN.v1"
+pretrained_vectors = null
+width = 96
+depth = 4
+embed_size = 2000
+window_size = 1
+maxout_pieces = 3
+subword_features = true
diff --git a/spacy/ml/_layers.py b/spacy/ml/_precomputable_affine.py
similarity index 100%
rename from spacy/ml/_layers.py
rename to spacy/ml/_precomputable_affine.py
diff --git a/spacy/ml/extract_ngrams.py b/spacy/ml/extract_ngrams.py
index d4195b9a4..f9f691aae 100644
--- a/spacy/ml/extract_ngrams.py
+++ b/spacy/ml/extract_ngrams.py
@@ -11,26 +11,26 @@ def extract_ngrams(ngram_size, attr=LOWER) -> Model:
return model
-def forward(self, docs, is_train: bool):
+def forward(model, docs, is_train: bool):
batch_keys = []
batch_vals = []
for doc in docs:
- unigrams = doc.to_array([self.attrs["attr"]])
+ unigrams = model.ops.asarray(doc.to_array([model.attrs["attr"]]))
ngrams = [unigrams]
- for n in range(2, self.attrs["ngram_size"] + 1):
- ngrams.append(self.ops.ngrams(n, unigrams))
- keys = self.ops.xp.concatenate(ngrams)
- keys, vals = self.ops.xp.unique(keys, return_counts=True)
+ for n in range(2, model.attrs["ngram_size"] + 1):
+ ngrams.append(model.ops.ngrams(n, unigrams))
+ keys = model.ops.xp.concatenate(ngrams)
+ keys, vals = model.ops.xp.unique(keys, return_counts=True)
batch_keys.append(keys)
batch_vals.append(vals)
# The dtype here matches what thinc is expecting -- which differs per
# platform (by int definition). This should be fixed once the problem
# is fixed on Thinc's side.
- lengths = self.ops.asarray([arr.shape[0] for arr in batch_keys], dtype=numpy.int_)
- batch_keys = self.ops.xp.concatenate(batch_keys)
- batch_vals = self.ops.asarray(self.ops.xp.concatenate(batch_vals), dtype="f")
+ lengths = model.ops.asarray([arr.shape[0] for arr in batch_keys], dtype=numpy.int_)
+ batch_keys = model.ops.xp.concatenate(batch_keys)
+ batch_vals = model.ops.asarray(model.ops.xp.concatenate(batch_vals), dtype="f")
def backprop(dY):
- return dY
+ return []
return (batch_keys, batch_vals, lengths), backprop
diff --git a/spacy/ml/models/defaults/textcat_bow_defaults.cfg b/spacy/ml/models/defaults/textcat_bow_defaults.cfg
new file mode 100644
index 000000000..84472ea10
--- /dev/null
+++ b/spacy/ml/models/defaults/textcat_bow_defaults.cfg
@@ -0,0 +1,5 @@
+[model]
+@architectures = "spacy.TextCatBOW.v1"
+exclusive_classes = false
+ngram_size: 1
+no_output_layer: false
diff --git a/spacy/ml/models/defaults/textcat_cnn_defaults.cfg b/spacy/ml/models/defaults/textcat_cnn_defaults.cfg
new file mode 100644
index 000000000..cea1bfe54
--- /dev/null
+++ b/spacy/ml/models/defaults/textcat_cnn_defaults.cfg
@@ -0,0 +1,13 @@
+[model]
+@architectures = "spacy.TextCatCNN.v1"
+exclusive_classes = false
+
+[model.tok2vec]
+@architectures = "spacy.HashEmbedCNN.v1"
+pretrained_vectors = null
+width = 96
+depth = 4
+embed_size = 2000
+window_size = 1
+maxout_pieces = 3
+subword_features = true
diff --git a/spacy/ml/models/defaults/textcat_defaults.cfg b/spacy/ml/models/defaults/textcat_defaults.cfg
index cea1bfe54..9477b2995 100644
--- a/spacy/ml/models/defaults/textcat_defaults.cfg
+++ b/spacy/ml/models/defaults/textcat_defaults.cfg
@@ -1,13 +1,9 @@
[model]
-@architectures = "spacy.TextCatCNN.v1"
+@architectures = "spacy.TextCat.v1"
exclusive_classes = false
-
-[model.tok2vec]
-@architectures = "spacy.HashEmbedCNN.v1"
pretrained_vectors = null
-width = 96
-depth = 4
+width = 64
+conv_depth = 2
embed_size = 2000
window_size = 1
-maxout_pieces = 3
-subword_features = true
+ngram_size = 1
diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py
index d2de10a0e..f2d51c2ba 100644
--- a/spacy/ml/models/parser.py
+++ b/spacy/ml/models/parser.py
@@ -2,7 +2,7 @@ from pydantic import StrictInt
from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops
from ...util import registry
-from .._layers import PrecomputableAffine
+from .._precomputable_affine import PrecomputableAffine
from ...syntax._parser_model import ParserModel
diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py
index 49679c8cd..ce31d058c 100644
--- a/spacy/ml/models/textcat.py
+++ b/spacy/ml/models/textcat.py
@@ -1,7 +1,11 @@
-from thinc.api import Model, chain, reduce_mean, Linear, list2ragged, Logistic
-from thinc.api import SparseLinear, Softmax
+from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic, ParametricAttention
+from thinc.api import chain, concatenate, clone, Dropout
+from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum, Relu, residual, expand_window
+from thinc.api import HashEmbed, with_ragged, with_array, with_cpu, uniqued, FeatureExtractor
-from ...attrs import ORTH
+from ..spacy_vectors import SpacyVectors
+from ... import util
+from ...attrs import ID, ORTH, NORM, PREFIX, SUFFIX, SHAPE, LOWER
from ...util import registry
from ..extract_ngrams import extract_ngrams
@@ -20,7 +24,6 @@ def build_simple_cnn_text_classifier(tok2vec, exclusive_classes, nO=None):
model = tok2vec >> list2ragged() >> reduce_mean() >> output_layer
model.set_ref("output_layer", output_layer)
else:
- # TODO: experiment with init_w=zero_init
linear_layer = Linear(nO=nO, nI=tok2vec.get_dim("nO"))
model = (
tok2vec >> list2ragged() >> reduce_mean() >> linear_layer >> Logistic()
@@ -33,13 +36,100 @@ def build_simple_cnn_text_classifier(tok2vec, exclusive_classes, nO=None):
@registry.architectures.register("spacy.TextCatBOW.v1")
def build_bow_text_classifier(exclusive_classes, ngram_size, no_output_layer, nO=None):
- # Note: original defaults were ngram_size=1 and no_output_layer=False
with Model.define_operators({">>": chain}):
- model = extract_ngrams(ngram_size, attr=ORTH) >> SparseLinear(nO)
- model.to_cpu()
+ sparse_linear = SparseLinear(nO)
+ model = extract_ngrams(ngram_size, attr=ORTH) >> sparse_linear
+ model = with_cpu(model, model.ops)
if not no_output_layer:
- output_layer = Softmax(nO) if exclusive_classes else Logistic(nO)
- output_layer.to_cpu()
- model = model >> output_layer
- model.set_ref("output_layer", output_layer)
+ output_layer = softmax_activation() if exclusive_classes else Logistic()
+ model = model >> with_cpu(output_layer, output_layer.ops)
+ model.set_ref("output_layer", sparse_linear)
+ return model
+
+
+@registry.architectures.register("spacy.TextCat.v1")
+def build_text_classifier(width, embed_size, pretrained_vectors, exclusive_classes, ngram_size,
+ window_size, conv_depth, nO=None):
+ cols = [ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID]
+ with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
+ lower = HashEmbed(nO=width, nV=embed_size, column=cols.index(LOWER))
+ prefix = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(PREFIX))
+ suffix = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(SUFFIX))
+ shape = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(SHAPE))
+
+ width_nI = sum(layer.get_dim("nO") for layer in [lower, prefix, suffix, shape])
+ trained_vectors = FeatureExtractor(cols) >> with_array(
+ uniqued(
+ (lower | prefix | suffix | shape)
+ >> Maxout(nO=width, nI=width_nI, normalize=True),
+ column=cols.index(ORTH),
+ )
+ )
+
+ if pretrained_vectors:
+ nlp = util.load_model(pretrained_vectors)
+ vectors = nlp.vocab.vectors
+ vector_dim = vectors.data.shape[1]
+
+ static_vectors = SpacyVectors(vectors) >> with_array(
+ Linear(width, vector_dim)
+ )
+ vector_layer = trained_vectors | static_vectors
+ vectors_width = width * 2
+ else:
+ vector_layer = trained_vectors
+ vectors_width = width
+ tok2vec = vector_layer >> with_array(
+ Maxout(width, vectors_width, normalize=True)
+ >> residual((expand_window(window_size=window_size)
+ >> Maxout(nO=width, nI=width * ((window_size * 2) + 1), normalize=True))) ** conv_depth,
+ pad=conv_depth,
+ )
+ cnn_model = (
+ tok2vec
+ >> list2ragged()
+ >> ParametricAttention(width)
+ >> reduce_sum()
+ >> residual(Maxout(nO=width, nI=width))
+ >> Linear(nO=nO, nI=width)
+ >> Dropout(0.0)
+ )
+
+ linear_model = build_bow_text_classifier(
+ nO=nO, ngram_size=ngram_size, exclusive_classes=exclusive_classes, no_output_layer=False
+ )
+ nO_double = nO*2 if nO else None
+ if exclusive_classes:
+ output_layer = Softmax(nO=nO, nI=nO_double)
+ else:
+ output_layer = (
+ Linear(nO=nO, nI=nO_double) >> Dropout(0.0) >> Logistic()
+ )
+ model = (linear_model | cnn_model) >> output_layer
+ model.set_ref("tok2vec", tok2vec)
+ if model.has_dim("nO") is not False:
+ model.set_dim("nO", nO)
+ model.set_ref("output_layer", linear_model.get_ref("output_layer"))
+ return model
+
+
+@registry.architectures.register("spacy.TextCatLowData.v1")
+def build_text_classifier_lowdata(width, pretrained_vectors, nO=None):
+ nlp = util.load_model(pretrained_vectors)
+ vectors = nlp.vocab.vectors
+ vector_dim = vectors.data.shape[1]
+
+ # Note, before v.3, this was the default if setting "low_data" and "pretrained_dims"
+ with Model.define_operators({">>": chain, "**": clone}):
+ model = (
+ SpacyVectors(vectors)
+ >> list2ragged()
+ >> with_ragged(0, Linear(width, vector_dim))
+ >> ParametricAttention(width)
+ >> reduce_sum()
+ >> residual(Relu(width, width)) ** 2
+ >> Linear(nO, width)
+ >> Dropout(0.0)
+ >> Logistic()
+ )
return model
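The architectures above are built with Thinc's operator overloading: inside `Model.define_operators`, `>>` chains layers, `|` concatenates their outputs and `**` clones a layer several times. A toy composition in that style (made-up dimensions, not one of the spaCy architectures; assumes the Thinc 8 combinators used above):

```python
import numpy
from thinc.api import Model, chain, clone, concatenate, Linear, Relu, Logistic

with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
    # two parallel branches concatenated, two stacked Relu blocks, sigmoid output
    model = (
        (Linear(nO=8) | Linear(nO=8))
        >> Relu(nO=16) ** 2
        >> Linear(nO=1)
        >> Logistic()
    )

X = numpy.random.rand(4, 10).astype("f")
Y = numpy.random.rand(4, 1).astype("f")
model.initialize(X=X, Y=Y)  # remaining input dims are inferred from the data
assert model.predict(X).shape == (4, 1)
```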
diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py
index d1a98c080..81820e56b 100644
--- a/spacy/ml/models/tok2vec.py
+++ b/spacy/ml/models/tok2vec.py
@@ -28,8 +28,6 @@ def Tok2Vec(extract, embed, encode):
if encode.attrs.get("receptive_field", None):
field_size = encode.attrs["receptive_field"]
with Model.define_operators({">>": chain, "|": concatenate}):
- if extract.has_dim("nO"):
- _set_dims(embed, "nI", extract.get_dim("nO"))
tok2vec = extract >> with_array(embed >> encode, pad=field_size)
tok2vec.set_dim("nO", encode.get_dim("nO"))
tok2vec.set_ref("embed", embed)
@@ -176,18 +174,11 @@ def MultiHashEmbed(columns, width, rows, use_subwords, pretrained_vectors, mix):
nr_columns = 2
concat_columns = glove | norm
- _set_dims(mix, "nI", width * nr_columns)
embed_layer = uniqued(concat_columns >> mix, column=columns.index("ORTH"))
return embed_layer
-def _set_dims(model, name, value):
- # Loop through the model to set a specific dimension if its unset on any layer.
- for node in model.walk():
- if node.has_dim(name) is None:
- node.set_dim(name, value)
-
@registry.architectures.register("spacy.CharacterEmbed.v1")
def CharacterEmbed(columns, width, rows, nM, nC, features):
norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM"))
@@ -344,6 +335,7 @@ def build_Tok2Vec_model(
tok2vec = tok2vec >> PyTorchLSTM(
nO=width, nI=width, depth=bilstm_depth, bi=True
)
- tok2vec.set_dim("nO", width)
+ if tok2vec.has_dim("nO") is not False:
+ tok2vec.set_dim("nO", width)
tok2vec.set_ref("embed", embed)
return tok2vec
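Several hunks in this patch guard `set_dim` behind `has_dim(...) is not False`. As used here, Thinc's `Model.has_dim` is tri-state: `True` when the dimension has a value, `None` when it is declared but still unset, and `False` when the model has no such dimension at all, so the guard only skips models that genuinely lack the dim. A small sketch under that reading of the API:

```python
from thinc.api import Linear

layer = Linear()                      # declares nO and nI, values still unset
assert layer.has_dim("nO") is None    # known dimension, no value yet
layer.set_dim("nO", 16)
assert layer.has_dim("nO") is True    # value is now set
assert layer.has_dim("nP") is False   # Linear declares no nP dimension
```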
diff --git a/spacy/ml/spacy_vectors.py b/spacy/ml/spacy_vectors.py
new file mode 100644
index 000000000..2a4988494
--- /dev/null
+++ b/spacy/ml/spacy_vectors.py
@@ -0,0 +1,27 @@
+import numpy
+from thinc.api import Model, Unserializable
+
+
+def SpacyVectors(vectors) -> Model:
+ attrs = {"vectors": Unserializable(vectors)}
+ model = Model("spacy_vectors", forward, attrs=attrs)
+ return model
+
+
+def forward(model, docs, is_train: bool):
+ batch = []
+ vectors = model.attrs["vectors"].obj
+ for doc in docs:
+ indices = numpy.zeros((len(doc),), dtype="i")
+ for i, word in enumerate(doc):
+ if word.orth in vectors.key2row:
+ indices[i] = vectors.key2row[word.orth]
+ else:
+ indices[i] = 0
+ batch_vectors = vectors.data[indices]
+ batch.append(batch_vectors)
+
+ def backprop(dY):
+ return None
+
+ return batch, backprop
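`SpacyVectors.forward` above is a plain row gather: each token's orth ID is mapped through `key2row` to a row of the vectors table (row 0 for unknown tokens) and the rows are pulled out with one indexing operation. A NumPy-only sketch with a toy table:

```python
import numpy

key2row = {101: 1, 205: 2, 999: 3}                 # orth hash -> table row
data = numpy.arange(12, dtype="f").reshape(4, 3)   # row 0 serves as the OOV vector

doc_orths = [205, 42, 999]                         # 42 is unknown -> row 0
indices = numpy.zeros((len(doc_orths),), dtype="i")
for i, orth in enumerate(doc_orths):
    indices[i] = key2row.get(orth, 0)

batch_vectors = data[indices]                      # shape (n_tokens, vector_dim)
assert batch_vectors.shape == (3, 3)
assert numpy.array_equal(batch_vectors[1], data[0])  # unknown token got row 0
```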
diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index 9ea2507cb..296ad5089 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -148,7 +148,8 @@ class Pipe(object):
return sgd
def set_output(self, nO):
- self.model.set_dim("nO", nO)
+ if self.model.has_dim("nO") is not False:
+ self.model.set_dim("nO", nO)
if self.model.has_ref("output_layer"):
self.model.get_ref("output_layer").set_dim("nO", nO)
@@ -1133,6 +1134,7 @@ class TextCategorizer(Pipe):
docs = [Doc(Vocab(), words=["hello"])]
truths, _ = self._examples_to_truth(examples)
self.set_output(len(self.labels))
+ link_vectors_to_models(self.vocab)
self.model.initialize(X=docs, Y=truths)
if sgd is None:
sgd = self.create_optimizer()
diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py
index 4623f99b0..ef744a5da 100644
--- a/spacy/pipeline/tok2vec.py
+++ b/spacy/pipeline/tok2vec.py
@@ -131,10 +131,8 @@ class Tok2Vec(Pipe):
get_examples (function): Function returning example training data.
pipeline (list): The pipeline the model is part of.
"""
- # TODO: charembed does not play nicely with dim inference yet
- # docs = [Doc(Vocab(), words=["hello"])]
- # self.model.initialize(X=docs)
- self.model.initialize()
+ docs = [Doc(Vocab(), words=["hello"])]
+ self.model.initialize(X=docs)
link_vectors_to_models(self.vocab)
diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
index 1b5ca9a4c..38c980428 100644
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@@ -6,10 +6,12 @@ from spacy import util
from spacy.lang.en import English
from spacy.language import Language
from spacy.pipeline import TextCategorizer
-from spacy.tests.util import make_tempdir
from spacy.tokens import Doc
from spacy.gold import GoldParse
+from ..util import make_tempdir
+from ...ml.models.defaults import default_tok2vec
+
TRAIN_DATA = [
("I'm so happy.", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}),
("I'm so angry", {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}}),
@@ -109,3 +111,33 @@ def test_overfitting_IO():
cats2 = doc2.cats
assert cats2["POSITIVE"] > 0.9
assert cats2["POSITIVE"] + cats2["NEGATIVE"] == pytest.approx(1.0, 0.1)
+
+
+# fmt: off
+@pytest.mark.parametrize(
+ "textcat_config",
+ [
+ {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False},
+ {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False},
+ {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": True},
+ {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True},
+ {"@architectures": "spacy.TextCat.v1", "exclusive_classes": False, "ngram_size": 1, "pretrained_vectors": False, "width": 64, "conv_depth": 2, "embed_size": 2000, "window_size": 2},
+ {"@architectures": "spacy.TextCat.v1", "exclusive_classes": True, "ngram_size": 5, "pretrained_vectors": False, "width": 128, "conv_depth": 2, "embed_size": 2000, "window_size": 1},
+ {"@architectures": "spacy.TextCat.v1", "exclusive_classes": True, "ngram_size": 2, "pretrained_vectors": False, "width": 32, "conv_depth": 3, "embed_size": 500, "window_size": 3},
+ {"@architectures": "spacy.TextCatCNN.v1", "tok2vec": default_tok2vec(), "exclusive_classes": True},
+ {"@architectures": "spacy.TextCatCNN.v1", "tok2vec": default_tok2vec(), "exclusive_classes": False},
+ ],
+)
+# fmt: on
+def test_textcat_configs(textcat_config):
+ pipe_config = {"model": textcat_config}
+ nlp = English()
+ textcat = nlp.create_pipe("textcat", pipe_config)
+ for _, annotations in TRAIN_DATA:
+ for label, value in annotations.get("cats").items():
+ textcat.add_label(label)
+ nlp.add_pipe(textcat)
+ optimizer = nlp.begin_training()
+ for i in range(5):
+ losses = {}
+ nlp.update(TRAIN_DATA, sgd=optimizer, losses=losses)
diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py
index 6d4e75a31..1200407d7 100644
--- a/spacy/tests/test_misc.py
+++ b/spacy/tests/test_misc.py
@@ -4,8 +4,7 @@ import ctypes
from pathlib import Path
from spacy import util
from spacy import prefer_gpu, require_gpu
-from spacy.ml._layers import PrecomputableAffine
-from spacy.ml._layers import _backprop_precomputable_affine_padding
+from spacy.ml._precomputable_affine import PrecomputableAffine, _backprop_precomputable_affine_padding
@pytest.fixture
diff --git a/spacy/tests/test_tok2vec.py b/spacy/tests/test_tok2vec.py
index e1ad1f0fc..9c2e9004b 100644
--- a/spacy/tests/test_tok2vec.py
+++ b/spacy/tests/test_tok2vec.py
@@ -4,18 +4,7 @@ from spacy.ml.models.tok2vec import build_Tok2Vec_model
from spacy.vocab import Vocab
from spacy.tokens import Doc
-
-def get_batch(batch_size):
- vocab = Vocab()
- docs = []
- start = 0
- for size in range(1, batch_size + 1):
- # Make the words numbers, so that they're distinct
- # across the batch, and easy to track.
- numbers = [str(i) for i in range(start, start + size)]
- docs.append(Doc(vocab, words=numbers))
- start += size
- return docs
+from .util import get_batch
# This fails in Thinc v7.3.1. Need to push patch
@@ -75,7 +64,7 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size):
def test_tok2vec_configs(tok2vec_config):
docs = get_batch(3)
tok2vec = build_Tok2Vec_model(**tok2vec_config)
- tok2vec.initialize()
+ tok2vec.initialize(docs)
vectors, backprop = tok2vec.begin_update(docs)
assert len(vectors) == len(docs)
assert vectors[0].shape == (len(docs[0]), tok2vec_config["width"])
diff --git a/spacy/tests/util.py b/spacy/tests/util.py
index 958d51e11..e29342268 100644
--- a/spacy/tests/util.py
+++ b/spacy/tests/util.py
@@ -9,6 +9,8 @@ from spacy import Errors
from spacy.tokens import Doc, Span
from spacy.attrs import POS, TAG, HEAD, DEP, LEMMA
+from spacy.vocab import Vocab
+
@contextlib.contextmanager
def make_tempfile(mode="r"):
@@ -77,6 +79,19 @@ def get_doc(
return doc
+def get_batch(batch_size):
+ vocab = Vocab()
+ docs = []
+ start = 0
+ for size in range(1, batch_size + 1):
+ # Make the words numbers, so that they're distinct
+ # across the batch, and easy to track.
+ numbers = [str(i) for i in range(start, start + size)]
+ docs.append(Doc(vocab, words=numbers))
+ start += size
+ return docs
+
+
def apply_transition_sequence(parser, doc, sequence):
"""Perform a series of pre-specified transitions, to put the parser in a
desired state."""
From ab59f3124eca47ada6955b7954c04df14d5f5b9f Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem
Date: Thu, 2 Apr 2020 10:32:52 +0200
Subject: [PATCH 112/496] fix NEL overfitting test for GPU (#5236)
---
spacy/pipeline/pipes.pyx | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index 296ad5089..1a0812442 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -1456,7 +1456,7 @@ class EntityLinker(Pipe):
scores = prior_probs + sims - (prior_probs*sims)
# TODO: thresholding
- best_index = scores.argmax()
+ best_index = scores.argmax().item()
best_candidate = candidates[best_index]
final_kb_ids.append(best_candidate.entity_)
final_tensors.append(sentence_encoding)
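The `.item()` call above makes the candidate lookup backend-agnostic: on GPU the scores are a CuPy array, and its `argmax` returns a device scalar rather than a plain Python integer usable as a list index, whereas `.item()` returns an `int` on both backends. A NumPy illustration of the same conversion:

```python
import numpy

scores = numpy.array([0.1, 0.7, 0.2], dtype="f")
candidates = ["Q1", "Q2", "Q3"]

best_index = scores.argmax().item()   # plain Python int on NumPy and CuPy alike
assert isinstance(best_index, int)
assert candidates[best_index] == "Q2"
```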
From b71a11ff6dd7b47582fbffd45121c05ff3b89977 Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Thu, 2 Apr 2020 14:46:32 +0200
Subject: [PATCH 113/496] Update morphologizer (#5108)
* Add pos and morph scoring to Scorer
Add pos, morph, and morph_per_type to `Scorer`. Report pos and morph
accuracy in `spacy evaluate`.
* Update morphologizer for v3
* switch to tagger-based morphologizer
* use `spacy.HashCharEmbedCNN` for morphologizer defaults
* add `Doc.is_morphed` flag
* Add morphologizer to train CLI
* Add basic morphologizer pipeline tests
* Add simple morphologizer training example
* Remove subword_features from CharEmbed models
Remove `subword_features` argument from `spacy.HashCharEmbedCNN.v1` and
`spacy.HashCharEmbedBiLSTM.v1` since in these cases `subword_features`
is always `False`.
* Rename setting in morphologizer example
Use `with_pos_tags` instead of `without_pos_tags`.
* Fix kwargs for spacy.HashCharEmbedBiLSTM.v1
* Remove defaults for spacy.HashCharEmbedBiLSTM.v1
Remove default `nM/nC` for `spacy.HashCharEmbedBiLSTM.v1`.
* Set random seed for textcat overfitting test
---
examples/training/train_morphologizer.py | 133 ++++++++++
spacy/cli/evaluate.py | 4 +-
spacy/cli/train.py | 14 +-
.../defaults/morphologizer_defaults.cfg | 1 -
spacy/ml/models/tok2vec.py | 7 +-
spacy/pipeline/morphologizer.pyx | 237 +++++++++---------
spacy/scorer.py | 63 ++++-
spacy/tests/pipeline/test_morphologizer.py | 49 ++++
spacy/tests/pipeline/test_textcat.py | 2 +
spacy/tests/test_scorer.py | 75 ++++++
spacy/tokens/doc.pxd | 1 +
11 files changed, 458 insertions(+), 128 deletions(-)
create mode 100644 examples/training/train_morphologizer.py
create mode 100644 spacy/tests/pipeline/test_morphologizer.py
diff --git a/examples/training/train_morphologizer.py b/examples/training/train_morphologizer.py
new file mode 100644
index 000000000..aec114de7
--- /dev/null
+++ b/examples/training/train_morphologizer.py
@@ -0,0 +1,133 @@
+#!/usr/bin/env python
+# coding: utf8
+"""
+A simple example for training a morphologizer. For more details, see
+the documentation:
+* Training: https://spacy.io/usage/training
+
+Compatible with: spaCy v3.0.0+
+Last tested with: v3.0.0
+"""
+from __future__ import unicode_literals, print_function
+
+import plac
+import random
+from pathlib import Path
+import spacy
+from spacy.util import minibatch, compounding
+from spacy.morphology import Morphology
+
+
+# Usually you'll read this in, of course. Data formats vary. Ensure your
+# strings are unicode and that the number of tags assigned matches spaCy's
+# tokenization. If not, you can always add a 'words' key to the annotations
+# that specifies the gold-standard tokenization, e.g.:
+# ("Eatblueham", {'words': ['Eat', 'blue', 'ham'], 'tags': ['V', 'J', 'N']})
+TRAIN_DATA = [
+ (
+ "I like green eggs",
+ {
+ "morphs": [
+ "PronType=Prs|Person=1",
+ "VerbForm=Fin",
+ "Degree=Pos",
+ "Number=Plur",
+ ],
+ "pos": ["PRON", "VERB", "ADJ", "NOUN"],
+ },
+ ),
+ (
+ "Eat blue ham",
+ {
+ "morphs": ["VerbForm=Inf", "Degree=Pos", "Number=Sing"],
+ "pos": ["VERB", "ADJ", "NOUN"],
+ },
+ ),
+ (
+ "She was blue",
+ {
+ "morphs": ["PronType=Prs|Person=3", "VerbForm=Fin", "Degree=Pos"],
+ "pos": ["PRON", "VERB", "ADJ"],
+ },
+ ),
+ (
+ "He was blue today",
+ {
+ "morphs": ["PronType=Prs|Person=3", "VerbForm=Fin", "Degree=Pos", ""],
+ "pos": ["PRON", "VERB", "ADJ", "ADV"],
+ },
+ ),
+]
+
+# The POS tags are optional, set `with_pos_tags = False` to omit them for
+# this example:
+with_pos_tags = True
+
+if not with_pos_tags:
+ for i in range(len(TRAIN_DATA)):
+ del TRAIN_DATA[i][1]["pos"]
+
+
+@plac.annotations(
+ lang=("ISO Code of language to use", "option", "l", str),
+ output_dir=("Optional output directory", "option", "o", Path),
+ n_iter=("Number of training iterations", "option", "n", int),
+)
+def main(lang="en", output_dir=None, n_iter=25):
+ """Create a new model, set up the pipeline and train the tagger. In order to
+ train the tagger with a custom tag map, we're creating a new Language
+ instance with a custom vocab.
+ """
+ nlp = spacy.blank(lang)
+ # add the morphologizer to the pipeline
+ # nlp.create_pipe works for built-ins that are registered with spaCy
+ morphologizer = nlp.create_pipe("morphologizer")
+ nlp.add_pipe(morphologizer)
+
+ # add labels
+ for _, annotations in TRAIN_DATA:
+ morph_labels = annotations.get("morphs")
+ pos_labels = annotations.get("pos", [""] * len(annotations.get("morphs")))
+ assert len(morph_labels) == len(pos_labels)
+ for morph, pos in zip(morph_labels, pos_labels):
+ morph_dict = Morphology.feats_to_dict(morph)
+ if pos:
+ morph_dict["POS"] = pos
+ morph = Morphology.dict_to_feats(morph_dict)
+ morphologizer.add_label(morph)
+
+ optimizer = nlp.begin_training()
+ for i in range(n_iter):
+ random.shuffle(TRAIN_DATA)
+ losses = {}
+ # batch up the examples using spaCy's minibatch
+ batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
+ for batch in batches:
+ nlp.update(batch, sgd=optimizer, losses=losses)
+ print("Losses", losses)
+
+ # test the trained model
+ test_text = "I like blue eggs"
+ doc = nlp(test_text)
+ print("Morphs", [(t.text, t.morph) for t in doc])
+
+ # save model to output directory
+ if output_dir is not None:
+ output_dir = Path(output_dir)
+ if not output_dir.exists():
+ output_dir.mkdir()
+ nlp.to_disk(output_dir)
+ print("Saved model to", output_dir)
+
+ # test the saved model
+ print("Loading from", output_dir)
+ nlp2 = spacy.load(output_dir)
+ doc = nlp2(test_text)
+ print("Morphs", [(t.text, t.morph) for t in doc])
+
+
+if __name__ == "__main__":
+ plac.call(main)
+
+# Expected output:
+# Morphs [('I', POS=PRON|Person=1|PronType=Prs), ('like', POS=VERB|VerbForm=Fin), ('blue', Degree=Pos|POS=ADJ), ('eggs', Number=Plur|POS=NOUN)]
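The example above builds each label by parsing a UD-style FEATS string (e.g. `"PronType=Prs|Person=1"`) into a dict, optionally folding in the coarse POS tag, and serialising it back. Setting aside spaCy's own `Morphology.feats_to_dict`/`dict_to_feats` helpers, the conversion it relies on can be sketched in plain Python (details such as key ordering may differ from the real implementation):

```python
def feats_to_dict(feats):
    # "PronType=Prs|Person=1" -> {"PronType": "Prs", "Person": "1"}
    if not feats:
        return {}
    return dict(pair.split("=", 1) for pair in feats.split("|"))


def dict_to_feats(feats_dict):
    # sorted, pipe-joined Feature=Value pairs in the UD FEATS layout
    return "|".join(f"{k}={v}" for k, v in sorted(feats_dict.items()))


morph = feats_to_dict("PronType=Prs|Person=1")
morph["POS"] = "PRON"  # optionally fold the coarse tag into the label
assert dict_to_feats(morph) == "POS=PRON|Person=1|PronType=Prs"
```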
diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py
index e047f1283..94813e732 100644
--- a/spacy/cli/evaluate.py
+++ b/spacy/cli/evaluate.py
@@ -43,7 +43,9 @@ def evaluate(
"Words": nwords,
"Words/s": f"{nwords / (end - begin):.0f}",
"TOK": f"{scorer.token_acc:.2f}",
- "POS": f"{scorer.tags_acc:.2f}",
+ "TAG": f"{scorer.tags_acc:.2f}",
+ "POS": f"{scorer.pos_acc:.2f}",
+ "MORPH": f"{scorer.morphs_acc:.2f}",
"UAS": f"{scorer.uas:.2f}",
"LAS": f"{scorer.las:.2f}",
"NER P": f"{scorer.ents_p:.2f}",
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 56020e4ff..5fa09da78 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -221,6 +221,8 @@ def train(
config_loc = default_dir / "parser_defaults.cfg"
elif pipe == "tagger":
config_loc = default_dir / "tagger_defaults.cfg"
+ elif pipe == "morphologizer":
+ config_loc = default_dir / "morphologizer_defaults.cfg"
elif pipe == "ner":
config_loc = default_dir / "ner_defaults.cfg"
elif pipe == "textcat":
@@ -590,6 +592,8 @@ def _score_for_model(meta):
acc = meta["accuracy"]
if "tagger" in pipes:
mean_acc.append(acc["tags_acc"])
+ if "morphologizer" in pipes:
+ mean_acc.append((acc["morphs_acc"] + acc["pos_acc"]) / 2)
if "parser" in pipes:
mean_acc.append((acc["uas"] + acc["las"]) / 2)
if "ner" in pipes:
@@ -672,13 +676,15 @@ def _find_best(experiment_dir, component):
def _get_metrics(component):
if component == "parser":
- return ("las", "uas", "las_per_type", "token_acc", "sent_f")
+ return ("las", "uas", "las_per_type", "sent_f", "token_acc")
elif component == "tagger":
return ("tags_acc", "token_acc")
+ elif component == "morphologizer":
+ return ("morphs_acc", "pos_acc", "token_acc")
elif component == "ner":
return ("ents_f", "ents_p", "ents_r", "ents_per_type", "token_acc")
elif component == "senter":
- return ("sent_f", "sent_p", "sent_r")
+ return ("sent_f", "sent_p", "sent_r", "token_acc")
elif component == "textcat":
return ("textcat_score", "token_acc")
return ("token_acc",)
@@ -691,6 +697,9 @@ def _configure_training_output(pipeline, use_gpu, has_beam_widths):
if pipe == "tagger":
row_head.extend(["Tag Loss ", " Tag % "])
output_stats.extend(["tag_loss", "tags_acc"])
+ elif pipe == "morphologizer" or pipe == "morphologizertagger":
+ row_head.extend(["Morph Loss ", " Morph % ", " POS % "])
+ output_stats.extend(["morph_loss", "morphs_acc", "pos_acc"])
elif pipe == "parser":
row_head.extend(
["Dep Loss ", " UAS ", " LAS ", "Sent P", "Sent R", "Sent F"]
@@ -731,6 +740,7 @@ def _get_progress(
scores["dep_loss"] = losses.get("parser", 0.0)
scores["ner_loss"] = losses.get("ner", 0.0)
scores["tag_loss"] = losses.get("tagger", 0.0)
+ scores["morph_loss"] = losses.get("morphologizer", 0.0)
scores["textcat_loss"] = losses.get("textcat", 0.0)
scores["senter_loss"] = losses.get("senter", 0.0)
scores["cpu_wps"] = cpu_wps
diff --git a/spacy/ml/models/defaults/morphologizer_defaults.cfg b/spacy/ml/models/defaults/morphologizer_defaults.cfg
index 80e776c4f..150eca507 100644
--- a/spacy/ml/models/defaults/morphologizer_defaults.cfg
+++ b/spacy/ml/models/defaults/morphologizer_defaults.cfg
@@ -9,6 +9,5 @@ depth = 4
embed_size = 7000
window_size = 1
maxout_pieces = 3
-subword_features = true
nM = 64
nC = 8
diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py
index 81820e56b..a2e8f589a 100644
--- a/spacy/ml/models/tok2vec.py
+++ b/spacy/ml/models/tok2vec.py
@@ -74,7 +74,6 @@ def hash_charembed_cnn(
embed_size,
maxout_pieces,
window_size,
- subword_features,
nM,
nC,
):
@@ -87,7 +86,7 @@ def hash_charembed_cnn(
bilstm_depth=0,
maxout_pieces=maxout_pieces,
window_size=window_size,
- subword_features=subword_features,
+ subword_features=False,
char_embed=True,
nM=nM,
nC=nC,
@@ -116,7 +115,7 @@ def hash_embed_bilstm_v1(
@registry.architectures.register("spacy.HashCharEmbedBiLSTM.v1")
def hash_char_embed_bilstm_v1(
- pretrained_vectors, width, depth, embed_size, subword_features, nM, nC, maxout_pieces
+ pretrained_vectors, width, depth, embed_size, maxout_pieces, nM, nC
):
# Allows using character embeddings by setting nC, nM and char_embed=True
return build_Tok2Vec_model(
@@ -127,7 +126,7 @@ def hash_char_embed_bilstm_v1(
conv_depth=0,
maxout_pieces=maxout_pieces,
window_size=1,
- subword_features=subword_features,
+ subword_features=False,
char_embed=True,
nM=nM,
nC=nC,
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index be9b166bf..7a2bc3b17 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -1,166 +1,169 @@
+# cython: infer_types=True, profile=True
cimport numpy as np
import numpy
-from collections import defaultdict
-from thinc.api import chain, list2array, to_categorical, get_array_module
-from thinc.util import copy_array
+import srsly
+from thinc.api import to_categorical
from ..tokens.doc cimport Doc
from ..vocab cimport Vocab
from ..morphology cimport Morphology
+from ..parts_of_speech import IDS as POS_IDS
+from ..symbols import POS
from .. import util
from ..language import component
from ..util import link_vectors_to_models, create_default_optimizer
from ..errors import Errors, TempErrors
-from .pipes import Pipe
+from .pipes import Tagger, _load_cfg
+from .. import util
@component("morphologizer", assigns=["token.morph", "token.pos"])
-class Morphologizer(Pipe):
+class Morphologizer(Tagger):
def __init__(self, vocab, model, **cfg):
self.vocab = vocab
self.model = model
+ self._rehearsal_model = None
self.cfg = dict(sorted(cfg.items()))
- self._class_map = self.vocab.morphology.create_class_map() # Morphology.create_class_map() ?
+ self.cfg.setdefault("labels", {})
+ self.cfg.setdefault("morph_pos", {})
@property
def labels(self):
- return self.vocab.morphology.tag_names
+ return tuple(self.cfg["labels"].keys())
- @property
- def tok2vec(self):
- if self.model in (None, True, False):
- return None
- else:
- return chain(self.model.get_ref("tok2vec"), list2array())
-
- def __call__(self, doc):
- features, tokvecs = self.predict([doc])
- self.set_annotations([doc], features, tensors=tokvecs)
- return doc
-
- def pipe(self, stream, batch_size=128, n_threads=-1):
- for docs in util.minibatch(stream, size=batch_size):
- docs = list(docs)
- features, tokvecs = self.predict(docs)
- self.set_annotations(docs, features, tensors=tokvecs)
- yield from docs
+ def add_label(self, label):
+ if not isinstance(label, str):
+ raise ValueError(Errors.E187)
+ if label in self.labels:
+ return 0
+ morph = Morphology.feats_to_dict(label)
+ norm_morph_pos = self.vocab.strings[self.vocab.morphology.add(morph)]
+ pos = morph.get("POS", "")
+ if norm_morph_pos not in self.cfg["labels"]:
+ self.cfg["labels"][norm_morph_pos] = norm_morph_pos
+ self.cfg["morph_pos"][norm_morph_pos] = POS_IDS[pos]
+ return 1
def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None,
**kwargs):
+ for example in get_examples():
+ for i, morph in enumerate(example.token_annotation.morphs):
+ pos = example.token_annotation.get_pos(i)
+ morph = Morphology.feats_to_dict(morph)
+ norm_morph = self.vocab.strings[self.vocab.morphology.add(morph)]
+ if pos:
+ morph["POS"] = pos
+ norm_morph_pos = self.vocab.strings[self.vocab.morphology.add(morph)]
+ if norm_morph_pos not in self.cfg["labels"]:
+ self.cfg["labels"][norm_morph_pos] = norm_morph
+ self.cfg["morph_pos"][norm_morph_pos] = POS_IDS[pos]
self.set_output(len(self.labels))
self.model.initialize()
+ link_vectors_to_models(self.vocab)
if sgd is None:
sgd = self.create_optimizer()
return sgd
- def predict(self, docs):
- if not any(len(doc) for doc in docs):
- # Handle case where there are no tokens in any docs.
- n_labels = self.model.get_dim("nO")
- guesses = [self.model.ops.alloc((0, n_labels)) for doc in docs]
- tokvecs = self.model.ops.alloc((0, self.model.get_ref("tok2vec").get_dim("nO")))
- return guesses, tokvecs
- tokvecs = self.model.get_ref("tok2vec")(docs)
- scores = self.model.get_ref("softmax")(tokvecs)
- return scores, tokvecs
-
- def set_annotations(self, docs, batch_scores, tensors=None):
+ def set_annotations(self, docs, batch_tag_ids):
if isinstance(docs, Doc):
docs = [docs]
cdef Doc doc
cdef Vocab vocab = self.vocab
- offsets = [self._class_map.get_field_offset(field)
- for field in self._class_map.fields]
for i, doc in enumerate(docs):
- doc_scores = batch_scores[i]
- doc_guesses = scores_to_guesses(doc_scores, self.model.get_ref("softmax").attrs["nOs"])
- # Convert the neuron indices into feature IDs.
- doc_feat_ids = numpy.zeros((len(doc), len(self._class_map.fields)), dtype='i')
- for j in range(len(doc)):
- for k, offset in enumerate(offsets):
- if doc_guesses[j, k] == 0:
- doc_feat_ids[j, k] = 0
- else:
- doc_feat_ids[j, k] = offset + doc_guesses[j, k]
- # Get the set of feature names.
- feats = {self._class_map.col2info[f][2] for f in doc_feat_ids[j]}
- if "NIL" in feats:
- feats.remove("NIL")
- # Now add the analysis, and set the hash.
- doc.c[j].morph = self.vocab.morphology.add(feats)
- if doc[j].morph.pos != 0:
- doc.c[j].pos = doc[j].morph.pos
+ doc_tag_ids = batch_tag_ids[i]
+ if hasattr(doc_tag_ids, "get"):
+ doc_tag_ids = doc_tag_ids.get()
+ for j, tag_id in enumerate(doc_tag_ids):
+ morph = self.labels[tag_id]
+ doc.c[j].morph = self.vocab.morphology.add(self.cfg["labels"][morph])
+ doc.c[j].pos = self.cfg["morph_pos"][morph]
- def update(self, examples, drop=0., sgd=None, losses=None):
- if losses is not None and self.name not in losses:
- losses[self.name] = 0.
-
- docs = [self._get_doc(ex) for ex in examples]
- tag_scores, bp_tag_scores = self.model.begin_update(docs, drop=drop)
- loss, d_tag_scores = self.get_loss(examples, tag_scores)
- bp_tag_scores(d_tag_scores, sgd=sgd)
-
- if losses is not None:
- losses[self.name] += loss
+ doc.is_morphed = True
def get_loss(self, examples, scores):
- guesses = []
- for doc_scores in scores:
- guesses.append(scores_to_guesses(doc_scores, self.model.get_ref("softmax").attrs["nOs"]))
- guesses = self.model.ops.xp.vstack(guesses)
- scores = self.model.ops.xp.vstack(scores)
- if not isinstance(scores, numpy.ndarray):
- scores = scores.get()
- if not isinstance(guesses, numpy.ndarray):
- guesses = guesses.get()
+ scores = self.model.ops.flatten(scores)
+ tag_index = {tag: i for i, tag in enumerate(self.labels)}
cdef int idx = 0
- # Do this on CPU, as we can't vectorize easily.
- target = numpy.zeros(scores.shape, dtype='f')
- field_sizes = self.model.get_ref("softmax").attrs["nOs"]
- for example in examples:
- doc = example.doc
- gold = example.gold
- for t, features in enumerate(gold.morphology):
- if features is None:
- target[idx] = scores[idx]
+ correct = numpy.zeros((scores.shape[0],), dtype="i")
+ guesses = scores.argmax(axis=1)
+ known_labels = numpy.ones((scores.shape[0], 1), dtype="f")
+ for ex in examples:
+ gold = ex.gold
+ for i in range(len(gold.morphs)):
+ pos = gold.pos[i] if i < len(gold.pos) else ""
+ morph = gold.morphs[i]
+ feats = Morphology.feats_to_dict(morph)
+ if pos:
+ feats["POS"] = pos
+ if len(feats) > 0:
+ morph = self.vocab.strings[self.vocab.morphology.add(feats)]
+ if morph == "":
+ morph = Morphology.EMPTY_MORPH
+ if morph is None:
+ correct[idx] = guesses[idx]
+ elif morph in tag_index:
+ correct[idx] = tag_index[morph]
else:
- gold_fields = {}
- for feature in features:
- field = self._class_map.feat2field[feature]
- gold_fields[field] = self._class_map.feat2offset[feature]
- for field in self._class_map.fields:
- field_id = self._class_map.field2id[field]
- col_offset = self._class_map.field2col[field]
- if field_id in gold_fields:
- target[idx, col_offset + gold_fields[field_id]] = 1.
- else:
- target[idx, col_offset] = 1.
- #print(doc[t])
- #for col, info in enumerate(self._class_map.col2info):
- # print(col, info, scores[idx, col], target[idx, col])
+ correct[idx] = 0
+ known_labels[idx] = 0.
idx += 1
- target = self.model.ops.asarray(target, dtype='f')
- scores = self.model.ops.asarray(scores, dtype='f')
- d_scores = scores - target
+ correct = self.model.ops.xp.array(correct, dtype="i")
+ d_scores = scores - to_categorical(correct, n_classes=scores.shape[1])
+ d_scores *= self.model.ops.asarray(known_labels)
loss = (d_scores**2).sum()
- docs = [self._get_doc(ex) for ex in examples]
+ docs = [ex.doc for ex in examples]
d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
return float(loss), d_scores
- def use_params(self, params):
- with self.model.use_params(params):
- yield
+ def to_bytes(self, exclude=tuple(), **kwargs):
+ serialize = {}
+ serialize["model"] = self.model.to_bytes
+ serialize["vocab"] = self.vocab.to_bytes
+ serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
+ exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
+ return util.to_bytes(serialize, exclude)
-def scores_to_guesses(scores, out_sizes):
- xp = get_array_module(scores)
- guesses = xp.zeros((scores.shape[0], len(out_sizes)), dtype='i')
- offset = 0
- for i, size in enumerate(out_sizes):
- slice_ = scores[:, offset : offset + size]
- col_guesses = slice_.argmax(axis=1)
- guesses[:, i] = col_guesses
- offset += size
- return guesses
+ def from_bytes(self, bytes_data, exclude=tuple(), **kwargs):
+ def load_model(b):
+ try:
+ self.model.from_bytes(b)
+ except AttributeError:
+ raise ValueError(Errors.E149)
+
+ deserialize = {
+ "vocab": lambda b: self.vocab.from_bytes(b),
+ "cfg": lambda b: self.cfg.update(srsly.json_loads(b)),
+ "model": lambda b: load_model(b),
+ }
+ exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
+ util.from_bytes(bytes_data, deserialize, exclude)
+ return self
+
+ def to_disk(self, path, exclude=tuple(), **kwargs):
+ serialize = {
+ "vocab": lambda p: self.vocab.to_disk(p),
+ "model": lambda p: p.open("wb").write(self.model.to_bytes()),
+ "cfg": lambda p: srsly.write_json(p, self.cfg),
+ }
+ exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
+ util.to_disk(path, serialize, exclude)
+
+ def from_disk(self, path, exclude=tuple(), **kwargs):
+ def load_model(p):
+ with p.open("rb") as file_:
+ try:
+ self.model.from_bytes(file_.read())
+ except AttributeError:
+ raise ValueError(Errors.E149)
+
+ deserialize = {
+ "vocab": lambda p: self.vocab.from_disk(p),
+ "cfg": lambda p: self.cfg.update(_load_cfg(p)),
+ "model": load_model,
+ }
+ exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
+ util.from_disk(path, deserialize, exclude)
+ return self
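For reference, a minimal NumPy sketch (illustrative shapes and values, not spaCy code) of the masked squared-error gradient that the new `get_loss` above computes: rows whose gold morph tag is not in the label set are zeroed out via `known_labels`, so they contribute no gradient.

    import numpy as np

    # Toy example: 2 tokens, 3 morph labels. Token 1's gold tag is unknown,
    # so its row is masked out and contributes nothing to the gradient.
    scores = np.array([[0.7, 0.2, 0.1],
                       [0.1, 0.6, 0.3]], dtype="f")
    correct = np.array([0, 2], dtype="i")
    known_labels = np.array([[1.0], [0.0]], dtype="f")

    one_hot = np.eye(scores.shape[1], dtype="f")[correct]
    d_scores = (scores - one_hot) * known_labels
    loss = float((d_scores ** 2).sum())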
diff --git a/spacy/scorer.py b/spacy/scorer.py
index 82b10a77d..7e2466be7 100644
--- a/spacy/scorer.py
+++ b/spacy/scorer.py
@@ -81,6 +81,9 @@ class Scorer(object):
self.labelled = PRFScore()
self.labelled_per_dep = dict()
self.tags = PRFScore()
+ self.pos = PRFScore()
+ self.morphs = PRFScore()
+ self.morphs_per_feat = dict()
self.sent_starts = PRFScore()
self.ner = PRFScore()
self.ner_per_ents = dict()
@@ -111,6 +114,29 @@ class Scorer(object):
"""
return self.tags.fscore * 100
+ @property
+ def pos_acc(self):
+ """RETURNS (float): Part-of-speech tag accuracy (coarse grained pos,
+ i.e. `Token.pos`).
+ """
+ return self.pos.fscore * 100
+
+ @property
+ def morphs_acc(self):
+ """RETURNS (float): Morph tag accuracy (morphological features,
+ i.e. `Token.morph`).
+ """
+ return self.morphs.fscore * 100
+
+ @property
+ def morphs_per_type(self):
+ """RETURNS (dict): Scores per dependency label.
+ """
+ return {
+ k: {"p": v.precision * 100, "r": v.recall * 100, "f": v.fscore * 100}
+ for k, v in self.morphs_per_feat.items()
+ }
+
@property
def sent_p(self):
"""RETURNS (float): F-score for identification of sentence starts.
@@ -231,6 +257,9 @@ class Scorer(object):
"ents_f": self.ents_f,
"ents_per_type": self.ents_per_type,
"tags_acc": self.tags_acc,
+ "pos_acc": self.pos_acc,
+ "morphs_acc": self.morphs_acc,
+ "morphs_per_type": self.morphs_per_type,
"sent_p": self.sent_p,
"sent_r": self.sent_r,
"sent_f": self.sent_f,
@@ -264,12 +293,23 @@ class Scorer(object):
gold_deps = set()
gold_deps_per_dep = {}
gold_tags = set()
+ gold_pos = set()
+ gold_morphs = set()
+ gold_morphs_per_feat = {}
gold_sent_starts = set()
gold_ents = set(tags_to_entities(orig.entities))
- for id_, tag, head, dep, sent_start in zip(
- orig.ids, orig.tags, orig.heads, orig.deps, orig.sent_starts
- ):
+ for id_, tag, pos, morph, head, dep, sent_start in zip(orig.ids, orig.tags, orig.pos, orig.morphs, orig.heads, orig.deps, orig.sent_starts):
gold_tags.add((id_, tag))
+ gold_pos.add((id_, pos))
+ gold_morphs.add((id_, morph))
+ if morph:
+ for feat in morph.split("|"):
+ field, values = feat.split("=")
+ if field not in self.morphs_per_feat:
+ self.morphs_per_feat[field] = PRFScore()
+ if field not in gold_morphs_per_feat:
+ gold_morphs_per_feat[field] = set()
+ gold_morphs_per_feat[field].add((id_, feat))
if sent_start:
gold_sent_starts.add(id_)
if dep not in (None, "") and dep.lower() not in punct_labels:
@@ -282,6 +322,9 @@ class Scorer(object):
cand_deps = set()
cand_deps_per_dep = {}
cand_tags = set()
+ cand_pos = set()
+ cand_morphs = set()
+ cand_morphs_per_feat = {}
cand_sent_starts = set()
for token in doc:
if token.orth_.isspace():
@@ -292,6 +335,16 @@ class Scorer(object):
else:
self.tokens.tp += 1
cand_tags.add((gold_i, token.tag_))
+ cand_pos.add((gold_i, token.pos_))
+ cand_morphs.add((gold_i, token.morph_))
+ if token.morph_:
+ for feat in token.morph_.split("|"):
+ field, values = feat.split("=")
+ if field not in self.morphs_per_feat:
+ self.morphs_per_feat[field] = PRFScore()
+ if field not in cand_morphs_per_feat:
+ cand_morphs_per_feat[field] = set()
+ cand_morphs_per_feat[field].add((gold_i, feat))
if token.is_sent_start:
cand_sent_starts.add(gold_i)
if token.dep_.lower() not in punct_labels and token.orth_.strip():
@@ -340,6 +393,10 @@ class Scorer(object):
# Score for all ents
self.ner.score_set(cand_ents, gold_ents)
self.tags.score_set(cand_tags, gold_tags)
+ self.pos.score_set(cand_pos, gold_pos)
+ self.morphs.score_set(cand_morphs, gold_morphs)
+ for field in self.morphs_per_feat:
+ self.morphs_per_feat[field].score_set(cand_morphs_per_feat.get(field, set()), gold_morphs_per_feat.get(field, set()))
self.sent_starts.score_set(cand_sent_starts, gold_sent_starts)
self.labelled.score_set(cand_deps, gold_deps)
for dep in self.labelled_per_dep:
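The per-feature morph scores added above are plain set-based PRF computations over `(token_id, feature)` pairs. A small self-contained sketch (plain Python, not spaCy's `PRFScore` class, and assuming the usual tp/fp/fn bookkeeping) of what scoring one feature field such as `Number` looks like:

    # Illustrative only: set-based precision/recall/F for one feature field.
    gold = {(0, "Number=sing"), (2, "Number=sing"), (5, "Number=sing")}
    cand = {(0, "Number=sing"), (2, "Number=plur"), (5, "Number=sing")}

    tp = len(cand & gold)          # 2
    fp = len(cand - gold)          # 1
    fn = len(gold - cand)          # 1
    precision = tp / (tp + fp)     # 0.666...
    recall = tp / (tp + fn)        # 0.666...
    fscore = 2 * precision * recall / (precision + recall)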
diff --git a/spacy/tests/pipeline/test_morphologizer.py b/spacy/tests/pipeline/test_morphologizer.py
new file mode 100644
index 000000000..f9307afc2
--- /dev/null
+++ b/spacy/tests/pipeline/test_morphologizer.py
@@ -0,0 +1,49 @@
+import pytest
+
+from spacy import util
+from spacy.lang.en import English
+from spacy.language import Language
+from spacy.tests.util import make_tempdir
+
+
+def test_label_types():
+ nlp = Language()
+ nlp.add_pipe(nlp.create_pipe("morphologizer"))
+ nlp.get_pipe("morphologizer").add_label("Feat=A")
+ with pytest.raises(ValueError):
+ nlp.get_pipe("morphologizer").add_label(9)
+
+
+TRAIN_DATA = [
+ ("I like green eggs", {"morphs": ["Feat=N", "Feat=V", "Feat=J", "Feat=N"], "pos": ["NOUN", "VERB", "ADJ", "NOUN"]}),
+ ("Eat blue ham", {"morphs": ["Feat=V", "Feat=J", "Feat=N"], "pos": ["VERB", "ADJ", "NOUN"]}),
+]
+
+
+def test_overfitting_IO():
+ # Simple test to try and quickly overfit the morphologizer - ensuring the ML models work correctly
+ nlp = English()
+ morphologizer = nlp.create_pipe("morphologizer")
+ for inst in TRAIN_DATA:
+ for morph, pos in zip(inst[1]["morphs"], inst[1]["pos"]):
+ morphologizer.add_label(morph + "|POS=" + pos)
+ nlp.add_pipe(morphologizer)
+ optimizer = nlp.begin_training()
+
+ for i in range(50):
+ losses = {}
+ nlp.update(TRAIN_DATA, sgd=optimizer, losses=losses)
+ assert losses["morphologizer"] < 0.00001
+
+ # test the trained model
+ test_text = "I like blue eggs"
+ doc = nlp(test_text)
+ gold_morphs = ["Feat=N|POS=NOUN", "Feat=V|POS=VERB", "Feat=J|POS=ADJ", "Feat=N|POS=NOUN"]
+ assert gold_morphs == [t.morph_ for t in doc]
+
+ # Also test the results are still the same after IO
+ with make_tempdir() as tmp_dir:
+ nlp.to_disk(tmp_dir)
+ nlp2 = util.load_model_from_path(tmp_dir)
+ doc2 = nlp2(test_text)
+ assert gold_morphs == [t.morph_ for t in doc2]
diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
index 38c980428..b091ec0de 100644
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@@ -8,6 +8,7 @@ from spacy.language import Language
from spacy.pipeline import TextCategorizer
from spacy.tokens import Doc
from spacy.gold import GoldParse
+from spacy.util import fix_random_seed
from ..util import make_tempdir
from ...ml.models.defaults import default_tok2vec
@@ -82,6 +83,7 @@ def test_label_types():
def test_overfitting_IO():
# Simple test to try and quickly overfit the textcat component - ensuring the ML models work correctly
+ fix_random_seed(0)
nlp = English()
textcat = nlp.create_pipe("textcat")
for _, annotations in TRAIN_DATA:
diff --git a/spacy/tests/test_scorer.py b/spacy/tests/test_scorer.py
index efaf80b4f..d750a8202 100644
--- a/spacy/tests/test_scorer.py
+++ b/spacy/tests/test_scorer.py
@@ -5,6 +5,7 @@ from spacy.gold import Example, GoldParse
from spacy.scorer import Scorer, ROCAUCScore
from spacy.scorer import _roc_auc_score, _roc_curve
from .util import get_doc
+from spacy.lang.en import English
test_las_apple = [
[
@@ -39,6 +40,43 @@ test_ner_apple = [
]
]
+@pytest.fixture
+def tagged_doc():
+ text = "Sarah's sister flew to Silicon Valley via London."
+ tags = ["NNP", "POS", "NN", "VBD", "IN", "NNP", "NNP", "IN", "NNP", "."]
+ pos = [
+ "PROPN",
+ "PART",
+ "NOUN",
+ "VERB",
+ "ADP",
+ "PROPN",
+ "PROPN",
+ "ADP",
+ "PROPN",
+ "PUNCT",
+ ]
+ morphs = [
+ "NounType=prop|Number=sing",
+ "Poss=yes",
+ "Number=sing",
+ "Tense=past|VerbForm=fin",
+ "",
+ "NounType=prop|Number=sing",
+ "NounType=prop|Number=sing",
+ "",
+ "NounType=prop|Number=sing",
+ "PunctType=peri",
+ ]
+ nlp = English()
+ doc = nlp(text)
+ for i in range(len(tags)):
+ doc[i].tag_ = tags[i]
+ doc[i].pos_ = pos[i]
+ doc[i].morph_ = morphs[i]
+ doc.is_tagged = True
+ return doc
+
def test_las_per_type(en_vocab):
# Gold and Doc are identical
@@ -139,6 +177,43 @@ def test_ner_per_type(en_vocab):
assert results["ents_per_type"]["ORG"]["f"] == approx(66.66666)
+def test_tag_score(tagged_doc):
+ # Gold and Doc are identical
+ scorer = Scorer()
+ gold = GoldParse(
+ tagged_doc,
+ tags=[t.tag_ for t in tagged_doc],
+ pos=[t.pos_ for t in tagged_doc],
+ morphs=[t.morph_ for t in tagged_doc]
+ )
+ scorer.score((tagged_doc, gold))
+ results = scorer.scores
+
+ assert results["tags_acc"] == 100
+ assert results["pos_acc"] == 100
+ assert results["morphs_acc"] == 100
+ assert results["morphs_per_type"]["NounType"]["f"] == 100
+
+ # Gold and Doc are identical
+ scorer = Scorer()
+ tags = [t.tag_ for t in tagged_doc]
+ tags[0] = "NN"
+ pos = [t.pos_ for t in tagged_doc]
+ pos[1] = "X"
+ morphs = [t.morph_ for t in tagged_doc]
+ morphs[1] = "Number=sing"
+ morphs[2] = "Number=plur"
+ gold = GoldParse(tagged_doc, tags=tags, pos=pos, morphs=morphs)
+ scorer.score((tagged_doc, gold))
+ results = scorer.scores
+
+ assert results["tags_acc"] == 90
+ assert results["pos_acc"] == 90
+ assert results["morphs_acc"] == approx(80)
+ assert results["morphs_per_type"]["Poss"]["f"] == 0.0
+ assert results["morphs_per_type"]["Number"]["f"] == approx(72.727272)
+
+
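Where the 72.727 figure for `Number` comes from, worked out with plain arithmetic (token indices refer to the fixture above; only the comments and numbers below are illustrative):

    # Gold Number features after the perturbation: tokens 0, 1, 2, 5, 6, 8 (6 total);
    # candidate Number features from the fixture: tokens 0, 2, 5, 6, 8 (5 total);
    # they agree on tokens 0, 5, 6 and 8 (token 2 differs: sing vs. plur).
    tp, fp, fn = 4, 1, 2
    p = tp / (tp + fp)               # 0.8
    r = tp / (tp + fn)               # 0.666...
    f = 2 * p * r / (p + r) * 100    # 72.727...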
def test_roc_auc_score():
# Binary classification, toy tests from scikit-learn test suite
y_true = [0, 1]
diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd
index 7f231887f..050a6b898 100644
--- a/spacy/tokens/doc.pxd
+++ b/spacy/tokens/doc.pxd
@@ -50,6 +50,7 @@ cdef class Doc:
cdef public bint is_tagged
cdef public bint is_parsed
+ cdef public bint is_morphed
cdef public float sentiment
From b2e93be867be16acee8ccc6f95e4fb1ebf7d86cf Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem
Date: Fri, 3 Apr 2020 13:02:46 +0200
Subject: [PATCH 114/496] Optimizer defaults (#5244)
* set optimizer defaults to mimic thinc 7 + bump to dev6
* larger error range for senter overfitting test
---
spacy/about.py | 2 +-
spacy/tests/pipeline/test_senter.py | 2 +-
spacy/util.py | 4 +++-
3 files changed, 5 insertions(+), 3 deletions(-)
diff --git a/spacy/about.py b/spacy/about.py
index 0c0a2d002..6fa1f4c0b 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
# fmt: off
__title__ = "spacy"
-__version__ = "3.0.0.dev4"
+__version__ = "3.0.0.dev6"
__release__ = True
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
diff --git a/spacy/tests/pipeline/test_senter.py b/spacy/tests/pipeline/test_senter.py
index 411768e5f..197fdca6e 100644
--- a/spacy/tests/pipeline/test_senter.py
+++ b/spacy/tests/pipeline/test_senter.py
@@ -33,7 +33,7 @@ def test_overfitting_IO():
for i in range(200):
losses = {}
nlp.update(TRAIN_DATA, sgd=optimizer, losses=losses)
- assert losses["senter"] < 0.0001
+ assert losses["senter"] < 0.001
# test the trained model
test_text = "I like purple eggs. They eat ham. You like yellow eggs."
diff --git a/spacy/util.py b/spacy/util.py
index 37649c5e6..ef9082140 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -790,7 +790,8 @@ def create_default_optimizer():
beta2 = env_opt("optimizer_B2", 0.999)
eps = env_opt("optimizer_eps", 1e-8)
L2 = env_opt("L2_penalty", 1e-6)
- grad_clip = env_opt("grad_norm_clip", 1.0)
+ grad_clip = env_opt("grad_norm_clip", 10.0)
+ L2_is_weight_decay = env_opt("L2_is_weight_decay", False)
optimizer = Adam(
learn_rate,
L2=L2,
@@ -799,5 +800,6 @@ def create_default_optimizer():
eps=eps,
ops=ops,
grad_clip=grad_clip,
+ L2_is_weight_decay=L2_is_weight_decay,
)
return optimizer
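If the new defaults (for example the `grad_norm_clip` of 10.0) ever need to be changed back without editing code, the `env_opt` calls above suggest an environment-variable override. The variable naming in this sketch is an assumption (a `SPACY_`-prefixed, upper-cased option name) and should be verified against `env_opt` before relying on it:

    import os

    # Assumption: env_opt("grad_norm_clip", 10.0) falls back to an environment
    # variable named SPACY_GRAD_NORM_CLIP. Set it before the optimizer is created.
    os.environ["SPACY_GRAD_NORM_CLIP"] = "1.0"

    from spacy.util import create_default_optimizer
    optimizer = create_default_optimizer()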
From e47010bf3c9421bbe9e642ca68ae93455ea03d49 Mon Sep 17 00:00:00 2001
From: vincent d warmerdam
Date: Mon, 6 Apr 2020 11:29:30 +0200
Subject: [PATCH 115/496] add "whatlies" to spaCy universe (#5252)
* Add "whatlies"
We're releasing it on our side officially on the 16th of April. If possible, let's announce around the same time :)
* sign contributor thing
* Added fancy gif
as the image
* Update universe.json
Spellin error and spaCy clarification.
---
.github/contributors/koaning.md | 106 ++++++++++++++++++++++++++++++++
website/meta/universe.json | 28 +++++++++
2 files changed, 134 insertions(+)
create mode 100644 .github/contributors/koaning.md
diff --git a/.github/contributors/koaning.md b/.github/contributors/koaning.md
new file mode 100644
index 000000000..ddb28cab0
--- /dev/null
+++ b/.github/contributors/koaning.md
@@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+ * you hereby assign to us joint ownership, and to the extent that such
+ assignment is or becomes invalid, ineffective or unenforceable, you hereby
+ grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+ royalty-free, unrestricted license to exercise all rights under those
+ copyrights. This includes, at our option, the right to sublicense these same
+ rights to third parties through multiple levels of sublicensees or other
+ licensing arrangements;
+
+ * you agree that each of us can do all things in relation to your
+ contribution as if each of us were the sole owners, and if one of us makes
+ a derivative work of your contribution, the one who makes the derivative
+ work (or has it made) will be the sole owner of that derivative work;
+
+ * you agree that you will not assert any moral rights in your contribution
+ against us, our licensees or transferees;
+
+ * you agree that we may register a copyright in your contribution and
+ exercise all ownership rights associated with it; and
+
+ * you agree that neither of us has any duty to consult with, obtain the
+ consent of, pay or render an accounting to the other for any use or
+ distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+ * make, have made, use, sell, offer to sell, import, and otherwise transfer
+ your contribution in whole or in part, alone or in combination with or
+ included in any product, work or materials arising out of the project to
+ which your contribution was submitted, and
+
+ * at our option, to sublicense these same rights to third parties through
+ multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+ * Each contribution that you submit is and shall be an original work of
+ authorship and you can legally grant the rights set out in this SCA;
+
+ * to the best of your knowledge, each contribution will not violate any
+ third party's copyrights, trademarks, patents, or other intellectual
+ property rights; and
+
+ * each contribution shall be in compliance with U.S. export control laws and
+ other applicable export and import laws. You agree to notify us if you
+ become aware of any circumstance which would make any of the foregoing
+ representations inaccurate in any respect. We may publicly disclose your
+ participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statements below. Please do NOT
+mark both statements:
+
+ * [x] I am signing on behalf of myself as an individual and no other person
+ or entity, including my employer, has or will have rights with respect to my
+ contributions.
+
+ * [ ] I am signing on behalf of my employer or a legal entity and I have the
+ actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field | Entry |
+|------------------------------- | ------------------------ |
+| Name | Vincent D. Warmerdam |
+| Company name (if applicable) | |
+| Title or role (if applicable) | Data Person |
+| Date | 2020-03-01 |
+| GitHub username | koaning |
+| Website (optional) | https://koaning.io |
diff --git a/website/meta/universe.json b/website/meta/universe.json
index 23d052bb9..8071374f7 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -1,5 +1,33 @@
{
"resources": [
+ {
+ "id": "whatlies",
+ "title": "whatlies",
+ "slogan": "Make interactive visualisations to figure out 'what lies' in word embeddings.",
+ "description": "This small library offers tools to make visualisation easier of both word embeddings as well as operations on them. It has support for spaCy prebuilt models as a first class citizen but also offers support for sense2vec. There's a convenient API to perform linear algebra as well as support for popular transformations like PCA/UMAP/etc.",
+ "github": "rasahq/whatlies",
+ "pip": "whatlies",
+ "thumb": "https://i.imgur.com/rOkOiLv.png",
+ "image": "https://raw.githubusercontent.com/RasaHQ/whatlies/master/docs/gif-two.gif",
+ "code_example": [
+ "from whatlies import EmbeddingSet",
+ "from whatlies.language import SpacyLanguage",
+ "",
+ "lang = SpacyLanguage('en_core_web_md')",
+ "words = ['cat', 'dog', 'fish', 'kitten', 'man', 'woman', ',
+ 'king', 'queen', 'doctor', 'nurse']",
+ "",
+ "emb = lang[words]",
+ "emb.plot_interactive(x_axis='man', y_axis='woman')"
+ ],
+ "category": ["visualizers", "research"],
+ "author": "Vincent D. Warmerdam",
+ "author_links": {
+ "twitter": "fishnets88",
+ "github": "koaning",
+ "website": "https://koaning.io"
+ }
+ },
{
"id": "spacy-stanza",
"title": "spacy-stanza",
From 528c4f6b2ee950b82cfd0ead672d7620cddd1642 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem
Date: Fri, 3 Apr 2020 13:01:43 +0200
Subject: [PATCH 116/496] Small doc fixes (#5250)
* fix link
* torchtext instead of tochtext
---
website/docs/usage/linguistic-features.md | 2 +-
website/meta/universe.json | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md
index 685619c88..59712939a 100644
--- a/website/docs/usage/linguistic-features.md
+++ b/website/docs/usage/linguistic-features.md
@@ -1303,7 +1303,7 @@ with doc.retokenize() as retokenizer:
### Overwriting custom extension attributes {#retokenization-extensions}
If you've registered custom
-[extension attributes](/usage/processing-pipelines##custom-components-attributes),
+[extension attributes](/usage/processing-pipelines#custom-components-attributes),
you can overwrite them during tokenization by providing a dictionary of
attribute names mapped to new values as the `"_"` key in the `attrs`. For
merging, you need to provide one dictionary of attributes for the resulting
diff --git a/website/meta/universe.json b/website/meta/universe.json
index 8071374f7..bbd67e8a6 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -669,7 +669,7 @@
"tags": ["chatbots"]
},
{
- "id": "tochtext",
+ "id": "torchtext",
"title": "torchtext",
"slogan": "Data loaders and abstractions for text and NLP",
"github": "pytorch/text",
From 81d6aee6e791e35f54f9369f14bc2399d6c26380 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Tue, 7 Apr 2020 14:11:31 +0200
Subject: [PATCH 117/496] fix json
---
website/meta/universe.json | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/website/meta/universe.json b/website/meta/universe.json
index bbd67e8a6..b5e1dbde0 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -14,8 +14,7 @@
"from whatlies.language import SpacyLanguage",
"",
"lang = SpacyLanguage('en_core_web_md')",
- "words = ['cat', 'dog', 'fish', 'kitten', 'man', 'woman', ',
- 'king', 'queen', 'doctor', 'nurse']",
+ "words = ['cat', 'dog', 'fish', 'kitten', 'man', 'woman', 'king', 'queen', 'doctor', 'nurse']",
"",
"emb = lang[words]",
"emb.plot_interactive(x_axis='man', y_axis='woman')"
From 42364dcd9f7c243271416b068a7bc708f9ef6346 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem
Date: Thu, 9 Apr 2020 10:21:20 +0200
Subject: [PATCH 118/496] Remove "pala" tokenizer exception for Spanish (#5265)
---
spacy/lang/es/tokenizer_exceptions.py | 1 -
1 file changed, 1 deletion(-)
diff --git a/spacy/lang/es/tokenizer_exceptions.py b/spacy/lang/es/tokenizer_exceptions.py
index 5c7fcb15d..d5eb42e29 100644
--- a/spacy/lang/es/tokenizer_exceptions.py
+++ b/spacy/lang/es/tokenizer_exceptions.py
@@ -3,7 +3,6 @@ from ...symbols import ORTH, LEMMA, NORM, PRON_LEMMA
_exc = {
"pal": [{ORTH: "pa", LEMMA: "para"}, {ORTH: "l", LEMMA: "el", NORM: "el"}],
- "pala": [{ORTH: "pa", LEMMA: "para"}, {ORTH: "la", LEMMA: "la", NORM: "la"}],
}
From 8f431ad97ce954bed2365b2417cfda73785d5a29 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem
Date: Tue, 14 Apr 2020 14:53:47 +0200
Subject: [PATCH 119/496] tag-map-path since 2.2.4 instead of 2.2.3 (#5289)
---
website/docs/api/cli.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md
index f067ba5a7..4b1b37bc5 100644
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@@ -189,7 +189,7 @@ $ python -m spacy debug-data [lang] [train_path] [dev_path] [--base-model] [--pi
| `lang` | positional | Model language. |
| `train_path` | positional | Location of JSON-formatted training data. Can be a file or a directory of files. |
| `dev_path` | positional | Location of JSON-formatted development data for evaluation. Can be a file or a directory of files. |
-| `--tag-map-path`, `-tm` 2.2.3 | option | Location of JSON-formatted tag map. |
+| `--tag-map-path`, `-tm` 2.2.4 | option | Location of JSON-formatted tag map. |
| `--base-model`, `-b` | option | Optional name of base model to update. Can be any loadable spaCy model. |
| `--pipeline`, `-p` | option | Comma-separated names of pipeline components to train. Defaults to `'tagger,parser,ner'`. |
| `--ignore-warnings`, `-IW` | flag | Ignore warnings, only show stats and errors. |
From 688a3286689493d602db156edb0b768dc921eb64 Mon Sep 17 00:00:00 2001
From: Sébastien Harinck
Date: Wed, 15 Apr 2020 16:47:29 +0200
Subject: [PATCH 120/496] docs(website): fix issue in spacy-lookup example
---
website/meta/universe.json | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/website/meta/universe.json b/website/meta/universe.json
index 23d052bb9..70aace8c0 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -334,15 +334,16 @@
"from spacy_lookup import Entity",
"",
"nlp = spacy.load('en')",
- "entity = Entity(keywords_list=['python', 'java platform'])",
+ "entity = Entity(keywords_list=['python', 'product manager', 'java platform'])",
"nlp.add_pipe(entity, last=True)",
"",
"doc = nlp(u\"I am a product manager for a java and python.\")",
"assert doc._.has_entities == True",
- "assert doc[2:5]._.has_entities == True",
"assert doc[0]._.is_entity == False",
+ "assert doc[3]._.entity_desc == 'product manager'",
"assert doc[3]._.is_entity == True",
- "print(doc._.entities)"
+ "",
+ "print([(token.text, token._.canonical) for token in doc if token._.is_entity])"
],
"author": "Marc Puig",
"author_links": {
From 51207c9417028027ca84158f87f1e8671ec3d0fa Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Thu, 16 Apr 2020 14:45:25 +0200
Subject: [PATCH 121/496] Update netlify.toml [ci skip]
---
netlify.toml | 62 ++++++++++++++++++++++++++--------------------------
1 file changed, 31 insertions(+), 31 deletions(-)
diff --git a/netlify.toml b/netlify.toml
index 45bd2c3b6..be809f1d4 100644
--- a/netlify.toml
+++ b/netlify.toml
@@ -7,42 +7,42 @@ redirects = [
{from = "https://alpha.spacy.io/*", to = "https://spacy.io", force = true},
{from = "http://alpha.spacy.io/*", to = "https://spacy.io", force = true},
# Old demos
- {from = "/demos/*", to = "https://explosion.ai/demos/:splat"},
+ {from = "/demos/*", to = "https://explosion.ai/demos/:splat", force = true},
# Old blog
- {from = "/blog/*", to = "https://explosion.ai/blog/:splat"},
- {from = "/feed", to = "https://explosion.ai/feed"},
- {from = "/feed.xml", to = "https://explosion.ai/feed"},
+ {from = "/blog/*", to = "https://explosion.ai/blog/:splat", force = true},
+ {from = "/feed", to = "https://explosion.ai/feed", force = true},
+ {from = "/feed.xml", to = "https://explosion.ai/feed", force = true},
# Old documentation pages (1.x)
- {from = "/docs/usage/processing-text", to = "/usage/linguistic-features"},
- {from = "/docs/usage/deep-learning", to = "/usage/training"},
- {from = "/docs/usage/pos-tagging", to = "/usage/linguistic-features#pos-tagging"},
- {from = "/docs/usage/dependency-parse", to = "/usage/linguistic-features#dependency-parse"},
- {from = "/docs/usage/entity-recognition", to = "/usage/linguistic-features#named-entities"},
- {from = "/docs/usage/word-vectors-similarities", to = "/usage/vectors-similarity"},
- {from = "/docs/usage/customizing-tokenizer", to = "/usage/linguistic-features#tokenization"},
- {from = "/docs/usage/language-processing-pipeline", to = "/usage/processing-pipelines"},
- {from = "/docs/usage/customizing-pipeline", to = "/usage/processing-pipelines"},
- {from = "/docs/usage/training-ner", to = "/usage/training#ner"},
- {from = "/docs/usage/tutorials", to = "/usage/examples"},
- {from = "/docs/usage/data-model", to = "/api"},
- {from = "/docs/usage/cli", to = "/api/cli"},
- {from = "/docs/usage/lightning-tour", to = "/usage/spacy-101#lightning-tour"},
- {from = "/docs/api/language-models", to = "/usage/models#languages"},
- {from = "/docs/api/spacy", to = "/docs/api/top-level"},
- {from = "/docs/api/displacy", to = "/api/top-level#displacy"},
- {from = "/docs/api/util", to = "/api/top-level#util"},
- {from = "/docs/api/features", to = "/models/#architecture"},
- {from = "/docs/api/philosophy", to = "/usage/spacy-101"},
- {from = "/docs/usage/showcase", to = "/universe"},
- {from = "/tutorials/load-new-word-vectors", to = "/usage/vectors-similarity#custom"},
- {from = "/tutorials", to = "/usage/examples"},
+ {from = "/docs/usage/processing-text", to = "/usage/linguistic-features", force = true},
+ {from = "/docs/usage/deep-learning", to = "/usage/training", force = true},
+ {from = "/docs/usage/pos-tagging", to = "/usage/linguistic-features#pos-tagging", force = true},
+ {from = "/docs/usage/dependency-parse", to = "/usage/linguistic-features#dependency-parse", force = true},
+ {from = "/docs/usage/entity-recognition", to = "/usage/linguistic-features#named-entities", force = true},
+ {from = "/docs/usage/word-vectors-similarities", to = "/usage/vectors-similarity", force = true},
+ {from = "/docs/usage/customizing-tokenizer", to = "/usage/linguistic-features#tokenization", force = true},
+ {from = "/docs/usage/language-processing-pipeline", to = "/usage/processing-pipelines", force = true},
+ {from = "/docs/usage/customizing-pipeline", to = "/usage/processing-pipelines", force = true},
+ {from = "/docs/usage/training-ner", to = "/usage/training#ner", force = true},
+ {from = "/docs/usage/tutorials", to = "/usage/examples", force = true},
+ {from = "/docs/usage/data-model", to = "/api", force = true},
+ {from = "/docs/usage/cli", to = "/api/cli", force = true},
+ {from = "/docs/usage/lightning-tour", to = "/usage/spacy-101#lightning-tour", force = true},
+ {from = "/docs/api/language-models", to = "/usage/models#languages", force = true},
+ {from = "/docs/api/spacy", to = "/docs/api/top-level", force = true},
+ {from = "/docs/api/displacy", to = "/api/top-level#displacy", force = true},
+ {from = "/docs/api/util", to = "/api/top-level#util", force = true},
+ {from = "/docs/api/features", to = "/models/#architecture", force = true},
+ {from = "/docs/api/philosophy", to = "/usage/spacy-101", force = true},
+ {from = "/docs/usage/showcase", to = "/universe", force = true},
+ {from = "/tutorials/load-new-word-vectors", to = "/usage/vectors-similarity#custom", force = true},
+ {from = "/tutorials", to = "/usage/examples", force = true},
# Rewrite all other docs pages to /
{from = "/docs/*", to = "/:splat"},
# Updated documentation pages
- {from = "/usage/resources", to = "/universe"},
- {from = "/usage/lightning-tour", to = "/usage/spacy-101#lightning-tour"},
- {from = "/usage/linguistic-features#rule-based-matching", to = "/usage/rule-based-matching"},
- {from = "/models/comparison", to = "/models"},
+ {from = "/usage/resources", to = "/universe", force = true},
+ {from = "/usage/lightning-tour", to = "/usage/spacy-101#lightning-tour", force = true},
+ {from = "/usage/linguistic-features#rule-based-matching", to = "/usage/rule-based-matching", force = true},
+ {from = "/models/comparison", to = "/models", force = true},
{from = "/api/#section-cython", to = "/api/cython", force = true},
{from = "/api/#cython", to = "/api/cython", force = true},
{from = "/api/sentencesegmenter", to="/api/sentencizer"},
From 6918d99b6c631b5256aa24302050b085af841cc8 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 20 Apr 2020 22:06:28 +0200
Subject: [PATCH 122/496] Improve GPU usage for train-with-config (#5330)
* Adjust for no ops in Optimizer
* Fix gpu in train-from-config
* Update train-from-config script
* Fix parser
* Fix GPU efficiency of padding backprop
---
spacy/cli/train_from_config.py | 37 ++++++++++++++++++-------------
spacy/ml/_precomputable_affine.py | 19 +++++-----------
spacy/syntax/nn_parser.pyx | 1 +
spacy/util.py | 2 --
4 files changed, 27 insertions(+), 32 deletions(-)
diff --git a/spacy/cli/train_from_config.py b/spacy/cli/train_from_config.py
index 5b09909c7..933b275c4 100644
--- a/spacy/cli/train_from_config.py
+++ b/spacy/cli/train_from_config.py
@@ -1,4 +1,5 @@
from typing import Optional, Dict, List, Union, Sequence
+from timeit import default_timer as timer
from pydantic import BaseModel, FilePath
import plac
import tqdm
@@ -146,30 +147,29 @@ def train_from_config_cli(
if output_path is not None and not output_path.exists():
output_path.mkdir()
- try:
- train_from_config(
- config_path,
- {"train": train_path, "dev": dev_path},
- output_path=output_path,
- meta_path=meta_path,
- raw_text=raw_text,
- )
- except KeyboardInterrupt:
- msg.warn("Cancelled.")
+ train_from_config(
+ config_path,
+ {"train": train_path, "dev": dev_path},
+ output_path=output_path,
+ meta_path=meta_path,
+ raw_text=raw_text,
+ )
def train_from_config(
config_path, data_paths, raw_text=None, meta_path=None, output_path=None,
):
msg.info(f"Loading config from: {config_path}")
- config = util.load_config(config_path, create_objects=True)
+ config = util.load_config(config_path, create_objects=False)
+ nlp_config = config["nlp"]
use_gpu = config["training"]["use_gpu"]
if use_gpu >= 0:
msg.info("Using GPU")
+ util.use_gpu(use_gpu)
else:
msg.info("Using CPU")
+ config = util.load_config(config_path, create_objects=True)
msg.info("Creating nlp from config")
- nlp_config = util.load_config(config_path, create_objects=False)["nlp"]
nlp = util.load_model_from_config(nlp_config)
optimizer = config["optimizer"]
training = config["training"]
@@ -240,12 +240,17 @@ def create_evaluation_callback(nlp, optimizer, corpus, cfg):
nlp, gold_preproc=cfg["gold_preproc"], ignore_misaligned=True
)
)
+ n_words = sum(len(ex.doc) for ex in dev_examples)
+ start_time = timer()
scorer = nlp.evaluate(dev_examples)
+ end_time = timer()
+ wps = n_words / (end_time - start_time)
scores = scorer.scores
# Calculate a weighted sum based on score_weights for the main score
weights = cfg["score_weights"]
weighted_score = sum(scores[s] * weights.get(s, 0.0) for s in weights)
- return weighted_score, scorer.scores
+ scores["speed"] = wps
+ return weighted_score, scores
return evaluate
@@ -346,13 +351,13 @@ def setup_printer(training, nlp):
def print_row(info):
losses = [
- "{0:.2f}".format(info["losses"].get(pipe_name, 0.0))
+ "{0:.2f}".format(float(info["losses"].get(pipe_name, 0.0)))
for pipe_name in nlp.pipe_names
]
scores = [
- "{0:.2f}".format(info["other_scores"].get(col, 0.0)) for col in score_cols
+ "{0:.2f}".format(float(info["other_scores"].get(col, 0.0))) for col in score_cols
]
- data = [info["step"]] + losses + scores + ["{0:.2f}".format(info["score"])]
+ data = [info["step"]] + losses + scores + ["{0:.2f}".format(float(info["score"]))]
msg.row(data, widths=table_widths, aligns=table_aligns)
return print_row
diff --git a/spacy/ml/_precomputable_affine.py b/spacy/ml/_precomputable_affine.py
index a752ef49a..c7328bad9 100644
--- a/spacy/ml/_precomputable_affine.py
+++ b/spacy/ml/_precomputable_affine.py
@@ -79,23 +79,14 @@ def _backprop_precomputable_affine_padding(model, dY, ids):
# for b in range(nB):
# for f in range(nF):
# if ids[b, f] < 0:
- # d_pad[0, f] += dY[b]
+ # d_pad[f] += dY[b]
#
# Which can be rewritten as:
#
- # for b in range(nB):
- # d_pad[0, ids[b] < 0] += dY[b]
- #
- # I don't know how to avoid the loop without building a whole array :(.
- # Cursed numpy.
- #
- # Note by Sofie: rewritten to longer loop because "CuPy only supports slices that consist of one boolean array."
- d_pad = model.ops.alloc((1, nF, nO, nP))
- for b in range(nB):
- for f in range(nF):
- if ids[b, f] < 0:
- d_pad[0, f] += dY[b]
- return d_pad
+ # (ids < 0).T @ dY
+ mask = model.ops.asarray(ids < 0, dtype="f")
+ d_pad = model.ops.gemm(mask, dY.reshape(nB, nO*nP), trans1=True)
+ return d_pad.reshape((1, nF, nO, nP))
def init(model, X=None, Y=None):
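The rewritten comment above replaces the per-row masked accumulation with a single matrix product. A quick plain-NumPy check (illustrative, independent of spaCy's `ops`) that the two formulations agree:

    import numpy as np

    nB, nF, nO, nP = 4, 3, 2, 2
    ids = np.random.randint(-1, 5, size=(nB, nF))
    dY = np.random.rand(nB, nO * nP).astype("f")

    # Loop form: d_pad[f] += dY[b] wherever ids[b, f] < 0.
    loop = np.zeros((nF, nO * nP), dtype="f")
    for b in range(nB):
        for f in range(nF):
            if ids[b, f] < 0:
                loop[f] += dY[b]

    # Single-gemm form used in the patch: (ids < 0).T @ dY.
    gemm = (ids < 0).astype("f").T @ dY
    assert np.allclose(loop, gemm)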
diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index f480e3528..01d6d5bfe 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -216,6 +216,7 @@ cdef class Parser:
# expand our model output.
self._resize()
model = self.model.predict(docs)
+ W_param = model.vec2scores.get_param("W")
weights = get_c_weights(model)
for state in batch:
if not state.is_final():
diff --git a/spacy/util.py b/spacy/util.py
index ef9082140..ea3023629 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -784,7 +784,6 @@ VECTORS_KEY = "spacy_pretrained_vectors"
def create_default_optimizer():
- ops = get_current_ops()
learn_rate = env_opt("learn_rate", 0.001)
beta1 = env_opt("optimizer_B1", 0.9)
beta2 = env_opt("optimizer_B2", 0.999)
@@ -798,7 +797,6 @@ def create_default_optimizer():
beta1=beta1,
beta2=beta2,
eps=eps,
- ops=ops,
grad_clip=grad_clip,
L2_is_weight_decay=L2_is_weight_decay,
)
From b2ef6100af585942388930a14fa78e9762758f36 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Tue, 21 Apr 2020 19:30:41 +0200
Subject: [PATCH 123/496] Only run backprop once when shared tok2vec weights
(#5331)
Previously, pipelines with shared tok2vec weights would call the
tok2vec backprop callback multiple times, once for each pipeline
component. This caused errors for PyTorch, and was inefficient.
Instead, accumulate the gradient for all but one component, and just
call the callback once.
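A minimal sketch of that pattern (generic Python; the listener objects and the `bp_tokvecs` callback are hypothetical stand-ins, not spaCy's actual classes): every listener adds its gradient into a shared buffer, and only the last one triggers the backward pass.

    import numpy as np

    def distribute(listeners, batch_id, tokvecs, bp_tokvecs):
        d_tokvecs = [np.zeros_like(t) for t in tokvecs]

        def accumulate(one_d_tokvecs):
            # All listeners add their gradient into the shared buffer.
            for i, d in enumerate(one_d_tokvecs):
                d_tokvecs[i] += d

        def backprop(one_d_tokvecs):
            # Only the last listener calls the tok2vec backprop callback.
            accumulate(one_d_tokvecs)
            return bp_tokvecs(d_tokvecs)

        for listener in listeners[:-1]:
            listener.receive(batch_id, tokvecs, accumulate)
        listeners[-1].receive(batch_id, tokvecs, backprop)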
---
spacy/pipeline/tok2vec.py | 34 ++++++++++++++++++++++------------
1 file changed, 22 insertions(+), 12 deletions(-)
diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py
index ef744a5da..83a4454e3 100644
--- a/spacy/pipeline/tok2vec.py
+++ b/spacy/pipeline/tok2vec.py
@@ -103,20 +103,30 @@ class Tok2Vec(Pipe):
set_dropout_rate(self.model, drop)
tokvecs, bp_tokvecs = self.model.begin_update(docs)
- def capture_losses(d_tokvecs):
- """Accumulate tok2vec loss before doing backprop."""
- l2_loss = sum((d_t2v ** 2).sum() for d_t2v in d_tokvecs)
- if self.name in losses:
- losses[self.name] += l2_loss / len(d_tokvecs)
- else:
- losses[self.name] = l2_loss / len(d_tokvecs)
- return bp_tokvecs(d_tokvecs)
+ d_tokvecs = [self.model.ops.alloc2f(*t2v.shape) for t2v in tokvecs]
+ losses.setdefault(self.name, 0.0)
+
+ def accumulate_gradient(one_d_tokvecs):
+ """Accumulate tok2vec loss and gradient. This is passed as a callback
+ to all but the last listener. Only the last one does the backprop.
+ """
+ nonlocal d_tokvecs
+ for i in range(len(one_d_tokvecs)):
+ d_tokvecs[i] += one_d_tokvecs[i]
+ losses[self.name] += float((one_d_tokvecs[i] ** 2).sum())
+
+ def backprop(one_d_tokvecs):
+ """Callback to actually do the backprop. Passed to last listener."""
+ accumulate_gradient(one_d_tokvecs)
+ d_docs = bp_tokvecs(d_tokvecs)
+ if sgd is not None:
+ self.model.finish_update(sgd)
+ return d_docs
batch_id = Tok2VecListener.get_batch_id(docs)
- for listener in self.listeners:
- listener.receive(batch_id, tokvecs, capture_losses)
- if sgd is not None:
- self.model.finish_update(sgd)
+ for listener in self.listeners[:-1]:
+ listener.receive(batch_id, tokvecs, accumulate_gradient)
+ self.listeners[-1].receive(batch_id, tokvecs, backprop)
if set_annotations:
self.set_annotations(docs, tokvecs)
From 1bf2082ac48ab02300177c8a630e2fa5e74b7b7d Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem
Date: Wed, 29 Apr 2020 12:51:49 +0200
Subject: [PATCH 124/496] update is_new_osx function (#5376)
---
setup.py | 11 ++++++-----
1 file changed, 6 insertions(+), 5 deletions(-)
diff --git a/setup.py b/setup.py
index d9021836f..d16615f5f 100755
--- a/setup.py
+++ b/setup.py
@@ -1,5 +1,6 @@
#!/usr/bin/env python
import sys
+import platform
from distutils.command.build_ext import build_ext
from distutils.sysconfig import get_python_inc
import distutils.util
@@ -73,18 +74,18 @@ COPY_FILES = {
def is_new_osx():
- """Check whether we're on OSX >= 10.10"""
+ """Check whether we're on OSX >= 10.7"""
name = distutils.util.get_platform()
if sys.platform != "darwin":
return False
- elif name.startswith("macosx-10"):
- minor_version = int(name.split("-")[1].split(".")[1])
+ mac_ver = platform.mac_ver()[0]
+ if mac_ver.startswith("10"):
+ minor_version = int(mac_ver.split('.')[1])
if minor_version >= 7:
return True
else:
return False
- else:
- return False
+ return False
if is_new_osx():
From cafe94ee045fe9937cadbdc0e1c96d6eabde5dec Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem
Date: Wed, 29 Apr 2020 12:53:53 +0200
Subject: [PATCH 125/496] Update NEL examples and documentation (#5370)
* simplify creation of KB by skipping dim reduction
* small fixes to train EL example script
* add KB creation and NEL training example scripts to example section
* update descriptions of example scripts in the documentation
* moving wiki_entity_linking folder from bin to projects
* remove test for wiki NEL functionality that is being moved
# Conflicts:
# bin/wiki_entity_linking/wikipedia_processor.py
---
bin/wiki_entity_linking/README.md | 37 --
bin/wiki_entity_linking/__init__.py | 12 -
.../entity_linker_evaluation.py | 204 -------
bin/wiki_entity_linking/kb_creator.py | 161 -----
bin/wiki_entity_linking/train_descriptions.py | 152 -----
bin/wiki_entity_linking/wiki_io.py | 127 ----
bin/wiki_entity_linking/wiki_namespaces.py | 128 ----
.../wikidata_pretrain_kb.py | 179 ------
bin/wiki_entity_linking/wikidata_processor.py | 154 -----
.../wikidata_train_entity_linker.py | 172 ------
.../wikipedia_processor.py | 565 ------------------
.../training/{pretrain_kb.py => create_kb.py} | 43 +-
examples/training/train_entity_linker.py | 10 +-
website/docs/usage/examples.md | 21 +
website/docs/usage/linguistic-features.md | 4 +-
website/docs/usage/training.md | 22 +-
16 files changed, 50 insertions(+), 1941 deletions(-)
delete mode 100644 bin/wiki_entity_linking/README.md
delete mode 100644 bin/wiki_entity_linking/__init__.py
delete mode 100644 bin/wiki_entity_linking/entity_linker_evaluation.py
delete mode 100644 bin/wiki_entity_linking/kb_creator.py
delete mode 100644 bin/wiki_entity_linking/train_descriptions.py
delete mode 100644 bin/wiki_entity_linking/wiki_io.py
delete mode 100644 bin/wiki_entity_linking/wiki_namespaces.py
delete mode 100644 bin/wiki_entity_linking/wikidata_pretrain_kb.py
delete mode 100644 bin/wiki_entity_linking/wikidata_processor.py
delete mode 100644 bin/wiki_entity_linking/wikidata_train_entity_linker.py
rename examples/training/{pretrain_kb.py => create_kb.py} (75%)
diff --git a/bin/wiki_entity_linking/README.md b/bin/wiki_entity_linking/README.md
deleted file mode 100644
index 4e4af5c21..000000000
--- a/bin/wiki_entity_linking/README.md
+++ /dev/null
@@ -1,37 +0,0 @@
-## Entity Linking with Wikipedia and Wikidata
-
-### Step 1: Create a Knowledge Base (KB) and training data
-
-Run `wikidata_pretrain_kb.py`
-* This takes as input the locations of a **Wikipedia and a Wikidata dump**, and produces a **KB directory** + **training file**
- * WikiData: get `latest-all.json.bz2` from https://dumps.wikimedia.org/wikidatawiki/entities/
- * Wikipedia: get `enwiki-latest-pages-articles-multistream.xml.bz2` from https://dumps.wikimedia.org/enwiki/latest/ (or for any other language)
-* You can set the filtering parameters for KB construction:
- * `max_per_alias` (`-a`): (max) number of candidate entities in the KB per alias/synonym
- * `min_freq` (`-f`): threshold of number of times an entity should occur in the corpus to be included in the KB
- * `min_pair` (`-c`): threshold of number of times an entity+alias combination should occur in the corpus to be included in the KB
-* Further parameters to set:
- * `descriptions_from_wikipedia` (`-wp`): whether to parse descriptions from Wikipedia (`True`) or Wikidata (`False`)
- * `entity_vector_length` (`-v`): length of the pre-trained entity description vectors
- * `lang` (`-la`): language for which to fetch Wikidata information (as the dump contains all languages)
-
-Quick testing and rerunning:
-* When trying out the pipeline for a quick test, set `limit_prior` (`-lp`), `limit_train` (`-lt`) and/or `limit_wd` (`-lw`) to read only parts of the dumps instead of everything.
- * e.g. set `-lt 20000 -lp 2000 -lw 3000 -f 1`
-* If you only want to (re)run certain parts of the pipeline, just remove the corresponding files and they will be recalculated or reparsed.
-
-
-### Step 2: Train an Entity Linking model
-
-Run `wikidata_train_entity_linker.py`
-* This takes the **KB directory** produced by Step 1, and trains an **Entity Linking model**
-* Specify the output directory (`-o`) in which the final, trained model will be saved
-* You can set the learning parameters for the EL training:
- * `epochs` (`-e`): number of training iterations
- * `dropout` (`-p`): dropout rate
- * `lr` (`-n`): learning rate
- * `l2` (`-r`): L2 regularization
-* Specify the number of training and dev testing articles with `train_articles` (`-t`) and `dev_articles` (`-d`) respectively
- * If not specified, the full dataset will be processed - this may take a LONG time !
-* Further parameters to set:
- * `labels_discard` (`-l`): NER label types to discard during training
diff --git a/bin/wiki_entity_linking/__init__.py b/bin/wiki_entity_linking/__init__.py
deleted file mode 100644
index de486bbcf..000000000
--- a/bin/wiki_entity_linking/__init__.py
+++ /dev/null
@@ -1,12 +0,0 @@
-TRAINING_DATA_FILE = "gold_entities.jsonl"
-KB_FILE = "kb"
-KB_MODEL_DIR = "nlp_kb"
-OUTPUT_MODEL_DIR = "nlp"
-
-PRIOR_PROB_PATH = "prior_prob.csv"
-ENTITY_DEFS_PATH = "entity_defs.csv"
-ENTITY_FREQ_PATH = "entity_freq.csv"
-ENTITY_ALIAS_PATH = "entity_alias.csv"
-ENTITY_DESCR_PATH = "entity_descriptions.csv"
-
-LOG_FORMAT = '%(asctime)s - %(levelname)s - %(name)s - %(message)s'
diff --git a/bin/wiki_entity_linking/entity_linker_evaluation.py b/bin/wiki_entity_linking/entity_linker_evaluation.py
deleted file mode 100644
index 2aeffbfc2..000000000
--- a/bin/wiki_entity_linking/entity_linker_evaluation.py
+++ /dev/null
@@ -1,204 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import logging
-import random
-from tqdm import tqdm
-from collections import defaultdict
-
-logger = logging.getLogger(__name__)
-
-
-class Metrics(object):
- true_pos = 0
- false_pos = 0
- false_neg = 0
-
- def update_results(self, true_entity, candidate):
- candidate_is_correct = true_entity == candidate
-
- # Assume that we have no labeled negatives in the data (i.e. cases where true_entity is "NIL")
- # Therefore, if candidate_is_correct then we have a true positive and never a true negative.
- self.true_pos += candidate_is_correct
- self.false_neg += not candidate_is_correct
- if candidate and candidate not in {"", "NIL"}:
- # A wrong prediction (e.g. Q42 != Q3) counts both as a FP as well as a FN.
- self.false_pos += not candidate_is_correct
-
- def calculate_precision(self):
- if self.true_pos == 0:
- return 0.0
- else:
- return self.true_pos / (self.true_pos + self.false_pos)
-
- def calculate_recall(self):
- if self.true_pos == 0:
- return 0.0
- else:
- return self.true_pos / (self.true_pos + self.false_neg)
-
- def calculate_fscore(self):
- p = self.calculate_precision()
- r = self.calculate_recall()
- if p + r == 0:
- return 0.0
- else:
- return 2 * p * r / (p + r)
-
-
-class EvaluationResults(object):
- def __init__(self):
- self.metrics = Metrics()
- self.metrics_by_label = defaultdict(Metrics)
-
- def update_metrics(self, ent_label, true_entity, candidate):
- self.metrics.update_results(true_entity, candidate)
- self.metrics_by_label[ent_label].update_results(true_entity, candidate)
-
- def report_metrics(self, model_name):
- model_str = model_name.title()
- recall = self.metrics.calculate_recall()
- precision = self.metrics.calculate_precision()
- fscore = self.metrics.calculate_fscore()
- return (
- "{}: ".format(model_str)
- + "F-score = {} | ".format(round(fscore, 3))
- + "Recall = {} | ".format(round(recall, 3))
- + "Precision = {} | ".format(round(precision, 3))
- + "F-score by label = {}".format(
- {k: v.calculate_fscore() for k, v in sorted(self.metrics_by_label.items())}
- )
- )
-
-
-class BaselineResults(object):
- def __init__(self):
- self.random = EvaluationResults()
- self.prior = EvaluationResults()
- self.oracle = EvaluationResults()
-
- def report_performance(self, model):
- results = getattr(self, model)
- return results.report_metrics(model)
-
- def update_baselines(
- self,
- true_entity,
- ent_label,
- random_candidate,
- prior_candidate,
- oracle_candidate,
- ):
- self.oracle.update_metrics(ent_label, true_entity, oracle_candidate)
- self.prior.update_metrics(ent_label, true_entity, prior_candidate)
- self.random.update_metrics(ent_label, true_entity, random_candidate)
-
-
-def measure_performance(dev_data, kb, el_pipe, baseline=True, context=True, dev_limit=None):
- counts = dict()
- baseline_results = BaselineResults()
- context_results = EvaluationResults()
- combo_results = EvaluationResults()
-
- for doc, gold in tqdm(dev_data, total=dev_limit, leave=False, desc='Processing dev data'):
- if len(doc) > 0:
- correct_ents = dict()
- for entity, kb_dict in gold.links.items():
- start, end = entity
- for gold_kb, value in kb_dict.items():
- if value:
- # only evaluating on positive examples
- offset = _offset(start, end)
- correct_ents[offset] = gold_kb
-
- if baseline:
- _add_baseline(baseline_results, counts, doc, correct_ents, kb)
-
- if context:
- # using only context
- el_pipe.cfg["incl_context"] = True
- el_pipe.cfg["incl_prior"] = False
- _add_eval_result(context_results, doc, correct_ents, el_pipe)
-
- # measuring combined accuracy (prior + context)
- el_pipe.cfg["incl_context"] = True
- el_pipe.cfg["incl_prior"] = True
- _add_eval_result(combo_results, doc, correct_ents, el_pipe)
-
- if baseline:
- logger.info("Counts: {}".format({k: v for k, v in sorted(counts.items())}))
- logger.info(baseline_results.report_performance("random"))
- logger.info(baseline_results.report_performance("prior"))
- logger.info(baseline_results.report_performance("oracle"))
-
- if context:
- logger.info(context_results.report_metrics("context only"))
- logger.info(combo_results.report_metrics("context and prior"))
-
-
-def _add_eval_result(results, doc, correct_ents, el_pipe):
- """
- Evaluate the ent.kb_id_ annotations against the gold standard.
- Only evaluate entities that overlap between gold and NER, to isolate the performance of the NEL.
- """
- try:
- doc = el_pipe(doc)
- for ent in doc.ents:
- ent_label = ent.label_
- start = ent.start_char
- end = ent.end_char
- offset = _offset(start, end)
- gold_entity = correct_ents.get(offset, None)
- # the gold annotations are not complete so we can't evaluate missing annotations as 'wrong'
- if gold_entity is not None:
- pred_entity = ent.kb_id_
- results.update_metrics(ent_label, gold_entity, pred_entity)
-
- except Exception as e:
- logging.error("Error assessing accuracy " + str(e))
-
-
-def _add_baseline(baseline_results, counts, doc, correct_ents, kb):
- """
- Measure 3 performance baselines: random selection, prior probabilities, and 'oracle' prediction for upper bound.
- Only evaluate entities that overlap between gold and NER, to isolate the performance of the NEL.
- """
- for ent in doc.ents:
- ent_label = ent.label_
- start = ent.start_char
- end = ent.end_char
- offset = _offset(start, end)
- gold_entity = correct_ents.get(offset, None)
-
- # the gold annotations are not complete so we can't evaluate missing annotations as 'wrong'
- if gold_entity is not None:
- candidates = kb.get_candidates(ent.text)
- oracle_candidate = ""
- prior_candidate = ""
- random_candidate = ""
- if candidates:
- scores = []
-
- for c in candidates:
- scores.append(c.prior_prob)
- if c.entity_ == gold_entity:
- oracle_candidate = c.entity_
-
- best_index = scores.index(max(scores))
- prior_candidate = candidates[best_index].entity_
- random_candidate = random.choice(candidates).entity_
-
- current_count = counts.get(ent_label, 0)
- counts[ent_label] = current_count+1
-
- baseline_results.update_baselines(
- gold_entity,
- ent_label,
- random_candidate,
- prior_candidate,
- oracle_candidate,
- )
-
-
-def _offset(start, end):
- return "{}_{}".format(start, end)
diff --git a/bin/wiki_entity_linking/kb_creator.py b/bin/wiki_entity_linking/kb_creator.py
deleted file mode 100644
index 7778fc701..000000000
--- a/bin/wiki_entity_linking/kb_creator.py
+++ /dev/null
@@ -1,161 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import logging
-
-from spacy.kb import KnowledgeBase
-
-from bin.wiki_entity_linking.train_descriptions import EntityEncoder
-from bin.wiki_entity_linking import wiki_io as io
-
-
-logger = logging.getLogger(__name__)
-
-
-def create_kb(
- nlp,
- max_entities_per_alias,
- min_entity_freq,
- min_occ,
- entity_def_path,
- entity_descr_path,
- entity_alias_path,
- entity_freq_path,
- prior_prob_path,
- entity_vector_length,
-):
- # Create the knowledge base from Wikidata entries
- kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=entity_vector_length)
- entity_list, filtered_title_to_id = _define_entities(nlp, kb, entity_def_path, entity_descr_path, min_entity_freq, entity_freq_path, entity_vector_length)
- _define_aliases(kb, entity_alias_path, entity_list, filtered_title_to_id, max_entities_per_alias, min_occ, prior_prob_path)
- return kb
-
-
-def _define_entities(nlp, kb, entity_def_path, entity_descr_path, min_entity_freq, entity_freq_path, entity_vector_length):
- # read the mappings from file
- title_to_id = io.read_title_to_id(entity_def_path)
- id_to_descr = io.read_id_to_descr(entity_descr_path)
-
- # check the length of the nlp vectors
- if "vectors" in nlp.meta and nlp.vocab.vectors.size:
- input_dim = nlp.vocab.vectors_length
- logger.info("Loaded pretrained vectors of size %s" % input_dim)
- else:
- raise ValueError(
- "The `nlp` object should have access to pretrained word vectors, "
- " cf. https://spacy.io/usage/models#languages."
- )
-
- logger.info("Filtering entities with fewer than {} mentions".format(min_entity_freq))
- entity_frequencies = io.read_entity_to_count(entity_freq_path)
- # filter the entities for in the KB by frequency, because there's just too much data (8M entities) otherwise
- filtered_title_to_id, entity_list, description_list, frequency_list = get_filtered_entities(
- title_to_id,
- id_to_descr,
- entity_frequencies,
- min_entity_freq
- )
- logger.info("Kept {} entities from the set of {}".format(len(description_list), len(title_to_id.keys())))
-
- logger.info("Training entity encoder")
- encoder = EntityEncoder(nlp, input_dim, entity_vector_length)
- encoder.train(description_list=description_list, to_print=True)
-
- logger.info("Getting entity embeddings")
- embeddings = encoder.apply_encoder(description_list)
-
- logger.info("Adding {} entities".format(len(entity_list)))
- kb.set_entities(
- entity_list=entity_list, freq_list=frequency_list, vector_list=embeddings
- )
- return entity_list, filtered_title_to_id
-
-
-def _define_aliases(kb, entity_alias_path, entity_list, filtered_title_to_id, max_entities_per_alias, min_occ, prior_prob_path):
- logger.info("Adding aliases from Wikipedia and Wikidata")
- _add_aliases(
- kb,
- entity_list=entity_list,
- title_to_id=filtered_title_to_id,
- max_entities_per_alias=max_entities_per_alias,
- min_occ=min_occ,
- prior_prob_path=prior_prob_path,
- )
-
-
-def get_filtered_entities(title_to_id, id_to_descr, entity_frequencies,
- min_entity_freq: int = 10):
- filtered_title_to_id = dict()
- entity_list = []
- description_list = []
- frequency_list = []
- for title, entity in title_to_id.items():
- freq = entity_frequencies.get(title, 0)
- desc = id_to_descr.get(entity, None)
- if desc and freq > min_entity_freq:
- entity_list.append(entity)
- description_list.append(desc)
- frequency_list.append(freq)
- filtered_title_to_id[title] = entity
- return filtered_title_to_id, entity_list, description_list, frequency_list
-
-
-def _add_aliases(kb, entity_list, title_to_id, max_entities_per_alias, min_occ, prior_prob_path):
- wp_titles = title_to_id.keys()
-
- # adding aliases with prior probabilities
- # we can read this file sequentially, it's sorted by alias, and then by count
- logger.info("Adding WP aliases")
- with prior_prob_path.open("r", encoding="utf8") as prior_file:
- # skip header
- prior_file.readline()
- line = prior_file.readline()
- previous_alias = None
- total_count = 0
- counts = []
- entities = []
- while line:
- splits = line.replace("\n", "").split(sep="|")
- new_alias = splits[0]
- count = int(splits[1])
- entity = splits[2]
-
- if new_alias != previous_alias and previous_alias:
- # done reading the previous alias --> output
- if len(entities) > 0:
- selected_entities = []
- prior_probs = []
- for ent_count, ent_string in zip(counts, entities):
- if ent_string in wp_titles:
- wd_id = title_to_id[ent_string]
- p_entity_givenalias = ent_count / total_count
- selected_entities.append(wd_id)
- prior_probs.append(p_entity_givenalias)
-
- if selected_entities:
- try:
- kb.add_alias(
- alias=previous_alias,
- entities=selected_entities,
- probabilities=prior_probs,
- )
- except ValueError as e:
- logger.error(e)
- total_count = 0
- counts = []
- entities = []
-
- total_count += count
-
- if len(entities) < max_entities_per_alias and count >= min_occ:
- counts.append(count)
- entities.append(entity)
- previous_alias = new_alias
-
- line = prior_file.readline()
-
-
-def read_kb(nlp, kb_file):
- kb = KnowledgeBase(vocab=nlp.vocab)
- kb.load_bulk(kb_file)
- return kb
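
The alias step above reduces to estimating p(entity | alias) as a per-alias count ratio, which is what ends up in `kb.add_alias`. A minimal sketch of that calculation, using made-up counts in the same `alias|count|entity` layout as the prior-probability file:

```python
from collections import defaultdict

# hypothetical rows in the "alias|count|entity" layout written by the Wikipedia processor
rows = [
    ("Russ Cochran", 12, "Russ Cochran (golfer)"),
    ("Russ Cochran", 3, "Russ Cochran (publisher)"),
]

by_alias = defaultdict(list)
for alias, count, entity in rows:
    by_alias[alias].append((entity, count))

for alias, entity_counts in by_alias.items():
    total = sum(c for _, c in entity_counts)
    # prior probability of each entity given the alias, as passed to kb.add_alias()
    priors = {entity: count / total for entity, count in entity_counts}
    print(alias, priors)  # Russ Cochran {'Russ Cochran (golfer)': 0.8, 'Russ Cochran (publisher)': 0.2}
```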
diff --git a/bin/wiki_entity_linking/train_descriptions.py b/bin/wiki_entity_linking/train_descriptions.py
deleted file mode 100644
index af08d6b8f..000000000
--- a/bin/wiki_entity_linking/train_descriptions.py
+++ /dev/null
@@ -1,152 +0,0 @@
-# coding: utf-8
-from random import shuffle
-
-import logging
-import numpy as np
-
-from spacy._ml import zero_init, create_default_optimizer
-from spacy.cli.pretrain import get_cossim_loss
-
-from thinc.v2v import Model
-from thinc.api import chain
-from thinc.neural._classes.affine import Affine
-
-logger = logging.getLogger(__name__)
-
-
-class EntityEncoder:
- """
- Train the embeddings of entity descriptions to fit a fixed-size entity vector (e.g. 64D).
- This entity vector will be stored in the KB, for further downstream use in the entity model.
- """
-
- DROP = 0
- BATCH_SIZE = 1000
-
- # Set min. acceptable loss to avoid a 'mean of empty slice' warning by numpy
- MIN_LOSS = 0.01
-
- # Reasonable default to stop training when things are not improving
- MAX_NO_IMPROVEMENT = 20
-
- def __init__(self, nlp, input_dim, desc_width, epochs=5):
- self.nlp = nlp
- self.input_dim = input_dim
- self.desc_width = desc_width
- self.epochs = epochs
- self.encoder = None  # set by _build_network() when training starts
-
- def apply_encoder(self, description_list):
- if self.encoder is None:
- raise ValueError("Can not apply encoder before training it")
-
- batch_size = 100000
-
- start = 0
- stop = min(batch_size, len(description_list))
- encodings = []
-
- while start < len(description_list):
- docs = list(self.nlp.pipe(description_list[start:stop]))
- doc_embeddings = [self._get_doc_embedding(doc) for doc in docs]
- enc = self.encoder(np.asarray(doc_embeddings))
- encodings.extend(enc.tolist())
-
- start = start + batch_size
- stop = min(stop + batch_size, len(description_list))
- logger.info("Encoded: {} entities".format(stop))
-
- return encodings
-
- def train(self, description_list, to_print=False):
- processed, loss = self._train_model(description_list)
- if to_print:
- logger.info(
- "Trained the entity encoder on {} (non-unique) descriptions "
- "across {} epochs".format(processed, self.epochs)
- )
- logger.info("Final loss: {}".format(loss))
-
- def _train_model(self, description_list):
- best_loss = 1.0
- iter_since_best = 0
- self._build_network(self.input_dim, self.desc_width)
-
- processed = 0
- loss = 1
- # copy this list so that shuffling does not affect other functions
- descriptions = description_list.copy()
- to_continue = True
-
- for i in range(self.epochs):
- shuffle(descriptions)
-
- batch_nr = 0
- start = 0
- stop = min(self.BATCH_SIZE, len(descriptions))
-
- while to_continue and start < len(descriptions):
- batch = []
- for descr in descriptions[start:stop]:
- doc = self.nlp(descr)
- doc_vector = self._get_doc_embedding(doc)
- batch.append(doc_vector)
-
- loss = self._update(batch)
- if batch_nr % 25 == 0:
- logger.info("loss: {} ".format(loss))
- processed += len(batch)
-
- # in general, continue training if we haven't reached our ideal min yet
- to_continue = loss > self.MIN_LOSS
-
- # store the best loss and track how long it's been
- if loss < best_loss:
- best_loss = loss
- iter_since_best = 0
- else:
- iter_since_best += 1
-
- # stop learning if we haven't seen improvement since the last few iterations
- if iter_since_best > self.MAX_NO_IMPROVEMENT:
- to_continue = False
-
- batch_nr += 1
- start = start + self.BATCH_SIZE
- stop = min(stop + self.BATCH_SIZE, len(descriptions))
-
- return processed, loss
-
- @staticmethod
- def _get_doc_embedding(doc):
- indices = np.zeros((len(doc),), dtype="i")
- for i, word in enumerate(doc):
- if word.orth in doc.vocab.vectors.key2row:
- indices[i] = doc.vocab.vectors.key2row[word.orth]
- else:
- indices[i] = 0
- word_vectors = doc.vocab.vectors.data[indices]
- doc_vector = np.mean(word_vectors, axis=0)
- return doc_vector
-
- def _build_network(self, orig_width, hidden_width):
- with Model.define_operators({">>": chain}):
- # very simple encoder-decoder model
- self.encoder = Affine(hidden_width, orig_width)
- self.model = self.encoder >> zero_init(
- Affine(orig_width, hidden_width, drop_factor=0.0)
- )
- self.sgd = create_default_optimizer(self.model.ops)
-
- def _update(self, vectors):
- predictions, bp_model = self.model.begin_update(
- np.asarray(vectors), drop=self.DROP
- )
- loss, d_scores = self._get_loss(scores=predictions, golds=np.asarray(vectors))
- bp_model(d_scores, sgd=self.sgd)
- return loss / len(vectors)
-
- @staticmethod
- def _get_loss(golds, scores):
- loss, gradients = get_cossim_loss(scores, golds)
- return loss, gradients
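
The encoder input produced by `_get_doc_embedding` is just the mean of the token vectors of a description. A rough standalone equivalent, assuming any model with word vectors (e.g. `en_core_web_md`); the exact OOV handling above differs slightly:

```python
import numpy as np
import spacy

nlp = spacy.load("en_core_web_md")  # any model with pretrained word vectors
doc = nlp("American golfer")
# average the per-token vectors to get one fixed-size vector per description
doc_vector = np.mean([token.vector for token in doc], axis=0)
print(doc_vector.shape)  # (300,) for en_core_web_md
```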
diff --git a/bin/wiki_entity_linking/wiki_io.py b/bin/wiki_entity_linking/wiki_io.py
deleted file mode 100644
index 43ae87f0f..000000000
--- a/bin/wiki_entity_linking/wiki_io.py
+++ /dev/null
@@ -1,127 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import sys
-import csv
-
-# min() needed to prevent error on windows, cf https://stackoverflow.com/questions/52404416/
-csv.field_size_limit(min(sys.maxsize, 2147483646))
-
-""" This class provides reading/writing methods for temp files """
-
-
-# Entity definition: WP title -> WD ID #
-def write_title_to_id(entity_def_output, title_to_id):
- with entity_def_output.open("w", encoding="utf8") as id_file:
- id_file.write("WP_title" + "|" + "WD_id" + "\n")
- for title, qid in title_to_id.items():
- id_file.write(title + "|" + str(qid) + "\n")
-
-
-def read_title_to_id(entity_def_output):
- title_to_id = dict()
- with entity_def_output.open("r", encoding="utf8") as id_file:
- csvreader = csv.reader(id_file, delimiter="|")
- # skip header
- next(csvreader)
- for row in csvreader:
- title_to_id[row[0]] = row[1]
- return title_to_id
-
-
-# Entity aliases from WD: WD ID -> WD alias #
-def write_id_to_alias(entity_alias_path, id_to_alias):
- with entity_alias_path.open("w", encoding="utf8") as alias_file:
- alias_file.write("WD_id" + "|" + "alias" + "\n")
- for qid, alias_list in id_to_alias.items():
- for alias in alias_list:
- alias_file.write(str(qid) + "|" + alias + "\n")
-
-
-def read_id_to_alias(entity_alias_path):
- id_to_alias = dict()
- with entity_alias_path.open("r", encoding="utf8") as alias_file:
- csvreader = csv.reader(alias_file, delimiter="|")
- # skip header
- next(csvreader)
- for row in csvreader:
- qid = row[0]
- alias = row[1]
- alias_list = id_to_alias.get(qid, [])
- alias_list.append(alias)
- id_to_alias[qid] = alias_list
- return id_to_alias
-
-
-def read_alias_to_id_generator(entity_alias_path):
- """ Read (aliases, qid) tuples """
-
- with entity_alias_path.open("r", encoding="utf8") as alias_file:
- csvreader = csv.reader(alias_file, delimiter="|")
- # skip header
- next(csvreader)
- for row in csvreader:
- qid = row[0]
- alias = row[1]
- yield alias, qid
-
-
-# Entity descriptions from WD: WD ID -> WD description #
-def write_id_to_descr(entity_descr_output, id_to_descr):
- with entity_descr_output.open("w", encoding="utf8") as descr_file:
- descr_file.write("WD_id" + "|" + "description" + "\n")
- for qid, descr in id_to_descr.items():
- descr_file.write(str(qid) + "|" + descr + "\n")
-
-
-def read_id_to_descr(entity_desc_path):
- id_to_desc = dict()
- with entity_desc_path.open("r", encoding="utf8") as descr_file:
- csvreader = csv.reader(descr_file, delimiter="|")
- # skip header
- next(csvreader)
- for row in csvreader:
- id_to_desc[row[0]] = row[1]
- return id_to_desc
-
-
-# Entity counts from WP: WP title -> count #
-def write_entity_to_count(prior_prob_input, count_output):
- # Write entity counts for quick access later
- entity_to_count = dict()
- total_count = 0
-
- with prior_prob_input.open("r", encoding="utf8") as prior_file:
- # skip header
- prior_file.readline()
- line = prior_file.readline()
-
- while line:
- splits = line.replace("\n", "").split(sep="|")
- # alias = splits[0]
- count = int(splits[1])
- entity = splits[2]
-
- current_count = entity_to_count.get(entity, 0)
- entity_to_count[entity] = current_count + count
-
- total_count += count
-
- line = prior_file.readline()
-
- with count_output.open("w", encoding="utf8") as entity_file:
- entity_file.write("entity" + "|" + "count" + "\n")
- for entity, count in entity_to_count.items():
- entity_file.write(entity + "|" + str(count) + "\n")
-
-
-def read_entity_to_count(count_input):
- entity_to_count = dict()
- with count_input.open("r", encoding="utf8") as csvfile:
- csvreader = csv.reader(csvfile, delimiter="|")
- # skip header
- next(csvreader)
- for row in csvreader:
- entity_to_count[row[0]] = int(row[1])
-
- return entity_to_count
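
All of these intermediate files share the same pipe-delimited layout, so a round trip looks roughly like the sketch below (the file name and rows are made up):

```python
import csv
from pathlib import Path

path = Path("entity_defs.csv")  # hypothetical file in the "WP_title|WD_id" layout

with path.open("w", encoding="utf8") as out_file:
    out_file.write("WP_title" + "|" + "WD_id" + "\n")
    out_file.write("Douglas Adams" + "|" + "Q42" + "\n")

# read it back the same way read_title_to_id() does
with path.open("r", encoding="utf8") as in_file:
    csvreader = csv.reader(in_file, delimiter="|")
    next(csvreader)  # skip header
    title_to_id = {row[0]: row[1] for row in csvreader}

print(title_to_id)  # {'Douglas Adams': 'Q42'}
```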
diff --git a/bin/wiki_entity_linking/wiki_namespaces.py b/bin/wiki_entity_linking/wiki_namespaces.py
deleted file mode 100644
index e8f099ccd..000000000
--- a/bin/wiki_entity_linking/wiki_namespaces.py
+++ /dev/null
@@ -1,128 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-# List of meta pages in Wikidata, should be kept out of the Knowledge base
-WD_META_ITEMS = [
- "Q163875",
- "Q191780",
- "Q224414",
- "Q4167836",
- "Q4167410",
- "Q4663903",
- "Q11266439",
- "Q13406463",
- "Q15407973",
- "Q18616576",
- "Q19887878",
- "Q22808320",
- "Q23894233",
- "Q33120876",
- "Q42104522",
- "Q47460393",
- "Q64875536",
- "Q66480449",
-]
-
-
-# TODO: add more cases from non-English WP's
-
-# List of prefixes that refer to Wikipedia "file" pages
-WP_FILE_NAMESPACE = ["Bestand", "File"]
-
-# List of prefixes that refer to Wikipedia "category" pages
-WP_CATEGORY_NAMESPACE = ["Kategori", "Category", "Categorie"]
-
-# List of prefixes that refer to Wikipedia "meta" pages
-# these will/should be matched ignoring case
-WP_META_NAMESPACE = (
- WP_FILE_NAMESPACE
- + WP_CATEGORY_NAMESPACE
- + [
- "b",
- "betawikiversity",
- "Book",
- "c",
- "Commons",
- "d",
- "dbdump",
- "download",
- "Draft",
- "Education",
- "Foundation",
- "Gadget",
- "Gadget definition",
- "Gebruiker",
- "gerrit",
- "Help",
- "Image",
- "Incubator",
- "m",
- "mail",
- "mailarchive",
- "media",
- "MediaWiki",
- "MediaWiki talk",
- "Mediawikiwiki",
- "MediaZilla",
- "Meta",
- "Metawikipedia",
- "Module",
- "mw",
- "n",
- "nost",
- "oldwikisource",
- "otrs",
- "OTRSwiki",
- "Overleg gebruiker",
- "outreach",
- "outreachwiki",
- "Portal",
- "phab",
- "Phabricator",
- "Project",
- "q",
- "quality",
- "rev",
- "s",
- "spcom",
- "Special",
- "species",
- "Strategy",
- "sulutil",
- "svn",
- "Talk",
- "Template",
- "Template talk",
- "Testwiki",
- "ticket",
- "TimedText",
- "Toollabs",
- "tools",
- "tswiki",
- "User",
- "User talk",
- "v",
- "voy",
- "w",
- "Wikibooks",
- "Wikidata",
- "wikiHow",
- "Wikinvest",
- "wikilivres",
- "Wikimedia",
- "Wikinews",
- "Wikipedia",
- "Wikipedia talk",
- "Wikiquote",
- "Wikisource",
- "Wikispecies",
- "Wikitech",
- "Wikiversity",
- "Wikivoyage",
- "wikt",
- "wiktionary",
- "wmf",
- "wmania",
- "WP",
- ]
-)
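
These prefixes are matched case-insensitively against link targets so that "meta" pages are skipped. One way to build such a matcher, sketched here rather than taken verbatim from the processing scripts:

```python
import re

# excerpt of the prefixes above, just for illustration
META_PREFIXES = ["Bestand", "File", "Kategori", "Category", "Categorie", "Help", "Talk"]

# optional leading ":", then any meta prefix, then ":"
ns_regex = re.compile(r"^:?(" + "|".join(re.escape(ns) for ns in META_PREFIXES) + r"):", re.IGNORECASE)

print(bool(ns_regex.match("file:Example.jpg")))  # True -> drop this link
print(bool(ns_regex.match("Douglas Adams")))     # False -> keep it
```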
diff --git a/bin/wiki_entity_linking/wikidata_pretrain_kb.py b/bin/wiki_entity_linking/wikidata_pretrain_kb.py
deleted file mode 100644
index 003074feb..000000000
--- a/bin/wiki_entity_linking/wikidata_pretrain_kb.py
+++ /dev/null
@@ -1,179 +0,0 @@
-# coding: utf-8
-"""Script to process Wikipedia and Wikidata dumps and create a knowledge base (KB)
-with specific parameters. Intermediate files are written to disk.
-
-Running the full pipeline on a standard laptop may take up to 13 hours of processing.
-Use the -p, -d and -s options to speed up processing using the intermediate files
-from a previous run.
-
-For the Wikidata dump: get the latest-all.json.bz2 from https://dumps.wikimedia.org/wikidatawiki/entities/
-For the Wikipedia dump: get enwiki-latest-pages-articles-multistream.xml.bz2
-from https://dumps.wikimedia.org/enwiki/latest/
-
-"""
-from __future__ import unicode_literals
-
-import logging
-from pathlib import Path
-import plac
-
-from bin.wiki_entity_linking import wikipedia_processor as wp, wikidata_processor as wd
-from bin.wiki_entity_linking import wiki_io as io
-from bin.wiki_entity_linking import kb_creator
-from bin.wiki_entity_linking import TRAINING_DATA_FILE, KB_FILE, ENTITY_DESCR_PATH, KB_MODEL_DIR, LOG_FORMAT
-from bin.wiki_entity_linking import ENTITY_FREQ_PATH, PRIOR_PROB_PATH, ENTITY_DEFS_PATH, ENTITY_ALIAS_PATH
-import spacy
-from bin.wiki_entity_linking.kb_creator import read_kb
-
-logger = logging.getLogger(__name__)
-
-
-@plac.annotations(
- wd_json=("Path to the downloaded WikiData JSON dump.", "positional", None, Path),
- wp_xml=("Path to the downloaded Wikipedia XML dump.", "positional", None, Path),
- output_dir=("Output directory", "positional", None, Path),
- model=("Model name or path, should include pretrained vectors.", "positional", None, str),
- max_per_alias=("Max. # entities per alias (default 10)", "option", "a", int),
- min_freq=("Min. count of an entity in the corpus (default 20)", "option", "f", int),
- min_pair=("Min. count of entity-alias pairs (default 5)", "option", "c", int),
- entity_vector_length=("Length of entity vectors (default 64)", "option", "v", int),
- loc_prior_prob=("Location to file with prior probabilities", "option", "p", Path),
- loc_entity_defs=("Location to file with entity definitions", "option", "d", Path),
- loc_entity_desc=("Location to file with entity descriptions", "option", "s", Path),
- descr_from_wp=("Flag for using descriptions from WP instead of WD (default False)", "flag", "wp"),
- limit_prior=("Threshold to limit lines read from WP for prior probabilities", "option", "lp", int),
- limit_train=("Threshold to limit lines read from WP for training set", "option", "lt", int),
- limit_wd=("Threshold to limit lines read from WD", "option", "lw", int),
- lang=("Optional language for which to get Wikidata titles. Defaults to 'en'", "option", "la", str),
-)
-def main(
- wd_json,
- wp_xml,
- output_dir,
- model,
- max_per_alias=10,
- min_freq=20,
- min_pair=5,
- entity_vector_length=64,
- loc_prior_prob=None,
- loc_entity_defs=None,
- loc_entity_alias=None,
- loc_entity_desc=None,
- descr_from_wp=False,
- limit_prior=None,
- limit_train=None,
- limit_wd=None,
- lang="en",
-):
- entity_defs_path = loc_entity_defs if loc_entity_defs else output_dir / ENTITY_DEFS_PATH
- entity_alias_path = loc_entity_alias if loc_entity_alias else output_dir / ENTITY_ALIAS_PATH
- entity_descr_path = loc_entity_desc if loc_entity_desc else output_dir / ENTITY_DESCR_PATH
- entity_freq_path = output_dir / ENTITY_FREQ_PATH
- prior_prob_path = loc_prior_prob if loc_prior_prob else output_dir / PRIOR_PROB_PATH
- training_entities_path = output_dir / TRAINING_DATA_FILE
- kb_path = output_dir / KB_FILE
-
- logger.info("Creating KB with Wikipedia and WikiData")
-
- # STEP 0: set up IO
- if not output_dir.exists():
- output_dir.mkdir(parents=True)
-
- # STEP 1: Load the NLP object
- logger.info("STEP 1: Loading NLP model {}".format(model))
- nlp = spacy.load(model)
-
- # check the length of the nlp vectors
- if "vectors" not in nlp.meta or not nlp.vocab.vectors.size:
- raise ValueError(
- "The `nlp` object should have access to pretrained word vectors, "
- " cf. https://spacy.io/usage/models#languages."
- )
-
- # STEP 2: create prior probabilities from WP
- if not prior_prob_path.exists():
- # It takes about 2h to process 1000M lines of Wikipedia XML dump
- logger.info("STEP 2: Writing prior probabilities to {}".format(prior_prob_path))
- if limit_prior is not None:
- logger.warning("Warning: reading only {} lines of Wikipedia dump".format(limit_prior))
- wp.read_prior_probs(wp_xml, prior_prob_path, limit=limit_prior)
- else:
- logger.info("STEP 2: Reading prior probabilities from {}".format(prior_prob_path))
-
- # STEP 3: calculate entity frequencies
- if not entity_freq_path.exists():
- logger.info("STEP 3: Calculating and writing entity frequencies to {}".format(entity_freq_path))
- io.write_entity_to_count(prior_prob_path, entity_freq_path)
- else:
- logger.info("STEP 3: Reading entity frequencies from {}".format(entity_freq_path))
-
- # STEP 4: reading definitions and (possibly) descriptions from WikiData or from file
- if (not entity_defs_path.exists()) or (not descr_from_wp and not entity_descr_path.exists()):
- # It takes about 10h to process 55M lines of Wikidata JSON dump
- logger.info("STEP 4: Parsing and writing Wikidata entity definitions to {}".format(entity_defs_path))
- if limit_wd is not None:
- logger.warning("Warning: reading only {} lines of Wikidata dump".format(limit_wd))
- title_to_id, id_to_descr, id_to_alias = wd.read_wikidata_entities_json(
- wd_json,
- limit_wd,
- to_print=False,
- lang=lang,
- parse_descr=(not descr_from_wp),
- )
- io.write_title_to_id(entity_defs_path, title_to_id)
-
- logger.info("STEP 4b: Writing Wikidata entity aliases to {}".format(entity_alias_path))
- io.write_id_to_alias(entity_alias_path, id_to_alias)
-
- if not descr_from_wp:
- logger.info("STEP 4c: Writing Wikidata entity descriptions to {}".format(entity_descr_path))
- io.write_id_to_descr(entity_descr_path, id_to_descr)
- else:
- logger.info("STEP 4: Reading entity definitions from {}".format(entity_defs_path))
- logger.info("STEP 4b: Reading entity aliases from {}".format(entity_alias_path))
- if not descr_from_wp:
- logger.info("STEP 4c: Reading entity descriptions from {}".format(entity_descr_path))
-
- # STEP 5: Getting gold entities from Wikipedia
- if (not training_entities_path.exists()) or (descr_from_wp and not entity_descr_path.exists()):
- logger.info("STEP 5: Parsing and writing Wikipedia gold entities to {}".format(training_entities_path))
- if limit_train is not None:
- logger.warning("Warning: reading only {} lines of Wikipedia dump".format(limit_train))
- wp.create_training_and_desc(wp_xml, entity_defs_path, entity_descr_path,
- training_entities_path, descr_from_wp, limit_train)
- if descr_from_wp:
- logger.info("STEP 5b: Parsing and writing Wikipedia descriptions to {}".format(entity_descr_path))
- else:
- logger.info("STEP 5: Reading gold entities from {}".format(training_entities_path))
- if descr_from_wp:
- logger.info("STEP 5b: Reading entity descriptions from {}".format(entity_descr_path))
-
- # STEP 6: creating the actual KB
- # It takes ca. 30 minutes to pretrain the entity embeddings
- if not kb_path.exists():
- logger.info("STEP 6: Creating the KB at {}".format(kb_path))
- kb = kb_creator.create_kb(
- nlp=nlp,
- max_entities_per_alias=max_per_alias,
- min_entity_freq=min_freq,
- min_occ=min_pair,
- entity_def_path=entity_defs_path,
- entity_descr_path=entity_descr_path,
- entity_alias_path=entity_alias_path,
- entity_freq_path=entity_freq_path,
- prior_prob_path=prior_prob_path,
- entity_vector_length=entity_vector_length,
- )
- kb.dump(kb_path)
- logger.info("kb entities: {}".format(kb.get_size_entities()))
- logger.info("kb aliases: {}".format(kb.get_size_aliases()))
- nlp.to_disk(output_dir / KB_MODEL_DIR)
- else:
- logger.info("STEP 6: KB already exists at {}".format(kb_path))
-
- logger.info("Done!")
-
-
-if __name__ == "__main__":
- logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)
- plac.call(main)
diff --git a/bin/wiki_entity_linking/wikidata_processor.py b/bin/wiki_entity_linking/wikidata_processor.py
deleted file mode 100644
index 8a070f567..000000000
--- a/bin/wiki_entity_linking/wikidata_processor.py
+++ /dev/null
@@ -1,154 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import bz2
-import json
-import logging
-
-from bin.wiki_entity_linking.wiki_namespaces import WD_META_ITEMS
-
-logger = logging.getLogger(__name__)
-
-
-def read_wikidata_entities_json(wikidata_file, limit=None, to_print=False, lang="en", parse_descr=True):
- # Read the JSON wiki data and parse out the entities. Takes about 7-10h to parse 55M lines.
- # get latest-all.json.bz2 from https://dumps.wikimedia.org/wikidatawiki/entities/
-
- site_filter = '{}wiki'.format(lang)
-
- # filter: these conditions are OR'ed together, so a single hit suffices to exclude the item from further processing
- exclude_list = WD_META_ITEMS
-
- # punctuation
- exclude_list.extend(["Q1383557", "Q10617810"])
-
- # letters etc
- exclude_list.extend(["Q188725", "Q19776628", "Q3841820", "Q17907810", "Q9788", "Q9398093"])
-
- neg_prop_filter = {
- 'P31': exclude_list, # instance of
- 'P279': exclude_list # subclass
- }
-
- title_to_id = dict()
- id_to_descr = dict()
- id_to_alias = dict()
-
- # parse appropriate fields - depending on what we need in the KB
- parse_properties = False
- parse_sitelinks = True
- parse_labels = False
- parse_aliases = True
- parse_claims = True
-
- with bz2.open(wikidata_file, mode='rb') as file:
- for cnt, line in enumerate(file):
- if limit and cnt >= limit:
- break
- if cnt % 500000 == 0 and cnt > 0:
- logger.info("processed {} lines of WikiData JSON dump".format(cnt))
- clean_line = line.strip()
- if clean_line.endswith(b","):
- clean_line = clean_line[:-1]
- if len(clean_line) > 1:
- obj = json.loads(clean_line)
- entry_type = obj["type"]
-
- if entry_type == "item":
- keep = True
-
- claims = obj["claims"]
- if parse_claims:
- for prop, value_set in neg_prop_filter.items():
- claim_property = claims.get(prop, None)
- if claim_property:
- for cp in claim_property:
- cp_id = (
- cp["mainsnak"]
- .get("datavalue", {})
- .get("value", {})
- .get("id")
- )
- cp_rank = cp["rank"]
- if cp_rank != "deprecated" and cp_id in value_set:
- keep = False
-
- if keep:
- unique_id = obj["id"]
-
- if to_print:
- print("ID:", unique_id)
- print("type:", entry_type)
-
- # parsing all properties that refer to other entities
- if parse_properties:
- for prop, claim_property in claims.items():
- cp_dicts = [
- cp["mainsnak"]["datavalue"].get("value")
- for cp in claim_property
- if cp["mainsnak"].get("datavalue")
- ]
- cp_values = [
- cp_dict.get("id")
- for cp_dict in cp_dicts
- if isinstance(cp_dict, dict)
- if cp_dict.get("id") is not None
- ]
- if cp_values:
- if to_print:
- print("prop:", prop, cp_values)
-
- found_link = False
- if parse_sitelinks:
- site_value = obj["sitelinks"].get(site_filter, None)
- if site_value:
- site = site_value["title"]
- if to_print:
- print(site_filter, ":", site)
- title_to_id[site] = unique_id
- found_link = True
-
- if parse_labels:
- labels = obj["labels"]
- if labels:
- lang_label = labels.get(lang, None)
- if lang_label:
- if to_print:
- print(
- "label (" + lang + "):", lang_label["value"]
- )
-
- if found_link and parse_descr:
- descriptions = obj["descriptions"]
- if descriptions:
- lang_descr = descriptions.get(lang, None)
- if lang_descr:
- if to_print:
- print(
- "description (" + lang + "):",
- lang_descr["value"],
- )
- id_to_descr[unique_id] = lang_descr["value"]
-
- if parse_aliases:
- aliases = obj["aliases"]
- if aliases:
- lang_aliases = aliases.get(lang, None)
- if lang_aliases:
- for item in lang_aliases:
- if to_print:
- print(
- "alias (" + lang + "):", item["value"]
- )
- alias_list = id_to_alias.get(unique_id, [])
- alias_list.append(item["value"])
- id_to_alias[unique_id] = alias_list
-
- if to_print:
- print()
-
- # log final number of lines processed
- logger.info("Finished. Processed {} lines of WikiData JSON dump".format(cnt))
- return title_to_id, id_to_descr, id_to_alias
-
-
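
The dump this parser expects is a bz2-compressed JSON array with one entity per line, which is why each line is stripped of its trailing comma before `json.loads`. A minimal reader along the same lines (the path is a placeholder):

```python
import bz2
import json

def iter_wikidata_items(path, limit=None):
    """Yield parsed 'item' entries from a latest-all.json.bz2 style dump."""
    with bz2.open(path, mode="rb") as dump_file:
        for cnt, line in enumerate(dump_file):
            if limit and cnt >= limit:
                break
            clean_line = line.strip()
            if clean_line.endswith(b","):
                clean_line = clean_line[:-1]
            if len(clean_line) > 1:  # skip the opening "[" and closing "]" lines
                obj = json.loads(clean_line)
                if obj.get("type") == "item":
                    yield obj

# for item in iter_wikidata_items("latest-all.json.bz2", limit=1000):
#     print(item["id"])
```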
diff --git a/bin/wiki_entity_linking/wikidata_train_entity_linker.py b/bin/wiki_entity_linking/wikidata_train_entity_linker.py
deleted file mode 100644
index 54f00fc6f..000000000
--- a/bin/wiki_entity_linking/wikidata_train_entity_linker.py
+++ /dev/null
@@ -1,172 +0,0 @@
-# coding: utf-8
-"""Script that takes a previously created Knowledge Base and trains an entity linking
-pipeline. The provided KB directory should hold the kb, the original nlp object and
-its vocab used to create the KB, and a few auxiliary files such as the entity definitions,
-as created by the script `wikidata_pretrain_kb.py`.
-
-For the Wikipedia dump: get enwiki-latest-pages-articles-multistream.xml.bz2
-from https://dumps.wikimedia.org/enwiki/latest/
-"""
-from __future__ import unicode_literals
-
-import random
-import logging
-import spacy
-from pathlib import Path
-import plac
-from tqdm import tqdm
-
-from bin.wiki_entity_linking import wikipedia_processor
-from bin.wiki_entity_linking import TRAINING_DATA_FILE, KB_MODEL_DIR, KB_FILE, LOG_FORMAT, OUTPUT_MODEL_DIR
-from bin.wiki_entity_linking.entity_linker_evaluation import measure_performance
-from bin.wiki_entity_linking.kb_creator import read_kb
-
-from spacy.util import minibatch, compounding
-
-logger = logging.getLogger(__name__)
-
-
-@plac.annotations(
- dir_kb=("Directory with KB, NLP and related files", "positional", None, Path),
- output_dir=("Output directory", "option", "o", Path),
- loc_training=("Location to training data", "option", "k", Path),
- epochs=("Number of training iterations (default 10)", "option", "e", int),
- dropout=("Dropout to prevent overfitting (default 0.5)", "option", "p", float),
- lr=("Learning rate (default 0.005)", "option", "n", float),
- l2=("L2 regularization", "option", "r", float),
- train_articles=("# training articles (default 90% of all)", "option", "t", int),
- dev_articles=("# dev test articles (default 10% of all)", "option", "d", int),
- labels_discard=("NER labels to discard (default None)", "option", "l", str),
-)
-def main(
- dir_kb,
- output_dir=None,
- loc_training=None,
- epochs=10,
- dropout=0.5,
- lr=0.005,
- l2=1e-6,
- train_articles=None,
- dev_articles=None,
- labels_discard=None
-):
- if not output_dir:
- logger.warning("No output dir specified so no results will be written, are you sure about this ?")
-
- logger.info("Creating Entity Linker with Wikipedia and WikiData")
-
- output_dir = Path(output_dir) if output_dir else dir_kb
- training_path = loc_training if loc_training else dir_kb / TRAINING_DATA_FILE
- nlp_dir = dir_kb / KB_MODEL_DIR
- kb_path = dir_kb / KB_FILE
- nlp_output_dir = output_dir / OUTPUT_MODEL_DIR
-
- # STEP 0: set up IO
- if not output_dir.exists():
- output_dir.mkdir()
-
- # STEP 1 : load the NLP object
- logger.info("STEP 1a: Loading model from {}".format(nlp_dir))
- nlp = spacy.load(nlp_dir)
- logger.info("Original NLP pipeline has following pipeline components: {}".format(nlp.pipe_names))
-
- # check that there is a NER component in the pipeline
- if "ner" not in nlp.pipe_names:
- raise ValueError("The `nlp` object should have a pretrained `ner` component.")
-
- logger.info("STEP 1b: Loading KB from {}".format(kb_path))
- kb = read_kb(nlp, kb_path)
-
- # STEP 2: read the training dataset previously created from WP
- logger.info("STEP 2: Reading training & dev dataset from {}".format(training_path))
- train_indices, dev_indices = wikipedia_processor.read_training_indices(training_path)
- logger.info("Training set has {} articles, limit set to roughly {} articles per epoch"
- .format(len(train_indices), train_articles if train_articles else "all"))
- logger.info("Dev set has {} articles, limit set to rougly {} articles for evaluation"
- .format(len(dev_indices), dev_articles if dev_articles else "all"))
- if dev_articles:
- dev_indices = dev_indices[0:dev_articles]
-
- # STEP 3: create and train an entity linking pipe
- logger.info("STEP 3: Creating and training an Entity Linking pipe for {} epochs".format(epochs))
- if labels_discard:
- labels_discard = [x.strip() for x in labels_discard.split(",")]
- logger.info("Discarding {} NER types: {}".format(len(labels_discard), labels_discard))
- else:
- labels_discard = []
-
- el_pipe = nlp.create_pipe(
- name="entity_linker", config={"pretrained_vectors": nlp.vocab.vectors.name,
- "labels_discard": labels_discard}
- )
- el_pipe.set_kb(kb)
- nlp.add_pipe(el_pipe, last=True)
-
- other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "entity_linker"]
- with nlp.disable_pipes(*other_pipes): # only train Entity Linking
- optimizer = nlp.begin_training()
- optimizer.learn_rate = lr
- optimizer.L2 = l2
-
- logger.info("Dev Baseline Accuracies:")
- dev_data = wikipedia_processor.read_el_docs_golds(nlp=nlp, entity_file_path=training_path,
- dev=True, line_ids=dev_indices,
- kb=kb, labels_discard=labels_discard)
-
- measure_performance(dev_data, kb, el_pipe, baseline=True, context=False, dev_limit=len(dev_indices))
-
- for itn in range(epochs):
- random.shuffle(train_indices)
- losses = {}
- batches = minibatch(train_indices, size=compounding(8.0, 128.0, 1.001))
- batchnr = 0
- articles_processed = 0
-
- # we either process the whole training file, or just a part each epoch
- bar_total = len(train_indices)
- if train_articles:
- bar_total = train_articles
-
- with tqdm(total=bar_total, leave=False, desc='Epoch ' + str(itn)) as pbar:
- for batch in batches:
- if not train_articles or articles_processed < train_articles:
- with nlp.disable_pipes("entity_linker"):
- train_batch = wikipedia_processor.read_el_docs_golds(nlp=nlp, entity_file_path=training_path,
- dev=False, line_ids=batch,
- kb=kb, labels_discard=labels_discard)
- docs, golds = zip(*train_batch)
- try:
- with nlp.disable_pipes(*other_pipes):
- nlp.update(
- docs=docs,
- golds=golds,
- sgd=optimizer,
- drop=dropout,
- losses=losses,
- )
- batchnr += 1
- articles_processed += len(docs)
- pbar.update(len(docs))
- except Exception as e:
- logger.error("Error updating batch:" + str(e))
- if batchnr > 0:
- logging.info("Epoch {} trained on {} articles, train loss {}"
- .format(itn, articles_processed, round(losses["entity_linker"] / batchnr, 2)))
- # re-read the dev_data (data is returned as a generator)
- dev_data = wikipedia_processor.read_el_docs_golds(nlp=nlp, entity_file_path=training_path,
- dev=True, line_ids=dev_indices,
- kb=kb, labels_discard=labels_discard)
- measure_performance(dev_data, kb, el_pipe, baseline=False, context=True, dev_limit=len(dev_indices))
-
- if output_dir:
- # STEP 4: write the NLP pipeline (now including an EL model) to file
- logger.info("Final NLP pipeline has following pipeline components: {}".format(nlp.pipe_names))
- logger.info("STEP 4: Writing trained NLP to {}".format(nlp_output_dir))
- nlp.to_disk(nlp_output_dir)
-
- logger.info("Done!")
-
-
-if __name__ == "__main__":
- logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)
- plac.call(main)
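
The gold signal consumed by `nlp.update` in this loop is a `GoldParse` whose `links` map character offsets to candidate QIDs with gold probabilities (built in `wikipedia_processor._get_gold_parse`). Roughly, for a single mention, assuming spaCy v2.2 and the QIDs from the examples further down:

```python
import spacy
from spacy.gold import GoldParse

nlp = spacy.blank("en")
doc = nlp("Russ Cochran his reprints include EC Comics.")
# (0, 12) are the character offsets of "Russ Cochran"; 1.0 marks the gold entity,
# 0.0 a negative candidate returned by the KB for the same alias
links = {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}}
gold = GoldParse(doc, links=links)
print(gold.links)
```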
diff --git a/bin/wiki_entity_linking/wikipedia_processor.py b/bin/wiki_entity_linking/wikipedia_processor.py
index 315b1e916..e69de29bb 100644
--- a/bin/wiki_entity_linking/wikipedia_processor.py
+++ b/bin/wiki_entity_linking/wikipedia_processor.py
@@ -1,565 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import re
-import bz2
-import logging
-import random
-import json
-
-from spacy.gold import GoldParse
-from bin.wiki_entity_linking import wiki_io as io
-from bin.wiki_entity_linking.wiki_namespaces import (
- WP_META_NAMESPACE,
- WP_FILE_NAMESPACE,
- WP_CATEGORY_NAMESPACE,
-)
-
-"""
-Process a Wikipedia dump to calculate entity frequencies and prior probabilities in combination with certain mentions.
-Write these results to file for downstream KB and training data generation.
-
-Process Wikipedia interlinks to generate a training dataset for the EL algorithm.
-"""
-
-ENTITY_FILE = "gold_entities.csv"
-
-map_alias_to_link = dict()
-
-logger = logging.getLogger(__name__)
-
-title_regex = re.compile(r"(?<=<title>).*(?=</title>)")
-id_regex = re.compile(r"(?<=<id>)\d*(?=</id>)")
-text_regex = re.compile(r"(?<=<text xml:space=\"preserve\">).*(?=</text)")
-link_regex = re.compile(r"\[\[[^\[\]]*\]\]")
-
-
-def read_prior_probs(wikipedia_input, prior_prob_output, limit=None):
- # Read the XML Wikipedia dump line by line and collect counts of (alias, entity) link pairs
- cnt = 0
- read_id = False
- current_article_id = None
- with bz2.open(wikipedia_input, mode="rb") as file:
- line = file.readline()
- while line and (not limit or cnt < limit):
- if cnt % 25000000 == 0 and cnt > 0:
- logger.info("processed {} lines of Wikipedia XML dump".format(cnt))
- clean_line = line.strip().decode("utf-8")
-
- # we attempt to read the article's ID (but not the revision or contributor ID)
- if "<revision>" in clean_line or "<contributor>" in clean_line:
- read_id = False
- if "<page>" in clean_line:
- read_id = True
-
- if read_id:
- ids = id_regex.search(clean_line)
- if ids:
- current_article_id = ids[0]
-
- # only processing prior probabilities from true training (non-dev) articles
- if not is_dev(current_article_id):
- aliases, entities, normalizations = get_wp_links(clean_line)
- for alias, entity, norm in zip(aliases, entities, normalizations):
- _store_alias(
- alias, entity, normalize_alias=norm, normalize_entity=True
- )
-
- line = file.readline()
- cnt += 1
- logger.info("processed {} lines of Wikipedia XML dump".format(cnt))
- logger.info("Finished. processed {} lines of Wikipedia XML dump".format(cnt))
-
- # write all aliases and their entities and count occurrences to file
- with prior_prob_output.open("w", encoding="utf8") as outputfile:
- outputfile.write("alias" + "|" + "count" + "|" + "entity" + "\n")
- for alias, alias_dict in sorted(map_alias_to_link.items(), key=lambda x: x[0]):
- s_dict = sorted(alias_dict.items(), key=lambda x: x[1], reverse=True)
- for entity, count in s_dict:
- outputfile.write(alias + "|" + str(count) + "|" + entity + "\n")
-
-
-def _store_alias(alias, entity, normalize_alias=False, normalize_entity=True):
- alias = alias.strip()
- entity = entity.strip()
-
- # remove everything after # as this is not part of the title but refers to a specific paragraph
- if normalize_entity:
- # wikipedia titles are always capitalized
- entity = _capitalize_first(entity.split("#")[0])
- if normalize_alias:
- alias = alias.split("#")[0]
-
- if alias and entity:
- alias_dict = map_alias_to_link.get(alias, dict())
- entity_count = alias_dict.get(entity, 0)
- alias_dict[entity] = entity_count + 1
- map_alias_to_link[alias] = alias_dict
-
-
-def get_wp_links(text):
- aliases = []
- entities = []
- normalizations = []
-
- matches = link_regex.findall(text)
- for match in matches:
- match = match[2:][:-2].replace("_", " ").strip()
-
- if ns_regex.match(match):
- pass # ignore the entity if it points to a "meta" page
-
- # this is a simple [[link]], with the alias the same as the mention
- elif "|" not in match:
- aliases.append(match)
- entities.append(match)
- normalizations.append(True)
-
- # in wiki format, the link is written as [[entity|alias]]
- else:
- splits = match.split("|")
- entity = splits[0].strip()
- alias = splits[1].strip()
- # specific wiki format [[alias (specification)|]]
- if len(alias) == 0 and "(" in entity:
- alias = entity.split("(")[0]
- aliases.append(alias)
- entities.append(entity)
- normalizations.append(False)
- else:
- aliases.append(alias)
- entities.append(entity)
- normalizations.append(False)
-
- return aliases, entities, normalizations
-
-
-def _capitalize_first(text):
- if not text:
- return None
- result = text[0].capitalize()
- if len(result) > 0:
- result += text[1:]
- return result
-
-
-def create_training_and_desc(
- wp_input, def_input, desc_output, training_output, parse_desc, limit=None
-):
- wp_to_id = io.read_title_to_id(def_input)
- _process_wikipedia_texts(
- wp_input, wp_to_id, desc_output, training_output, parse_desc, limit
- )
-
-
-def _process_wikipedia_texts(
- wikipedia_input, wp_to_id, output, training_output, parse_descriptions, limit=None
-):
- """
- Read the XML wikipedia data to parse out training data:
- raw text data + positive instances
- """
-
- read_ids = set()
-
- with output.open("a", encoding="utf8") as descr_file, training_output.open(
- "w", encoding="utf8"
- ) as entity_file:
- if parse_descriptions:
- _write_training_description(descr_file, "WD_id", "description")
- with bz2.open(wikipedia_input, mode="rb") as file:
- article_count = 0
- article_text = ""
- article_title = None
- article_id = None
- reading_text = False
- reading_revision = False
-
- for line in file:
- clean_line = line.strip().decode("utf-8")
-
- if clean_line == "":
- reading_revision = True
- elif clean_line == " ":
- reading_revision = False
-
- # Start reading new page
- if clean_line == "":
- article_text = ""
- article_title = None
- article_id = None
- # finished reading this page
- elif clean_line == " ":
- if article_id:
- clean_text, entities = _process_wp_text(
- article_title, article_text, wp_to_id
- )
- if clean_text is not None and entities is not None:
- _write_training_entities(
- entity_file, article_id, clean_text, entities
- )
-
- if article_title in wp_to_id and parse_descriptions:
- description = " ".join(
- clean_text[:1000].split(" ")[:-1]
- )
- _write_training_description(
- descr_file, wp_to_id[article_title], description
- )
- article_count += 1
- if article_count % 10000 == 0 and article_count > 0:
- logger.info(
- "Processed {} articles".format(article_count)
- )
- if limit and article_count >= limit:
- break
- article_text = ""
- article_title = None
- article_id = None
- reading_text = False
- reading_revision = False
-
- # start reading text within a page
- if "")
- clean_text = clean_text.replace(r""", '"')
- clean_text = clean_text.replace(r" ", " ")
- clean_text = clean_text.replace(r"&", "&")
-
- # remove multiple spaces
- while " " in clean_text:
- clean_text = clean_text.replace(" ", " ")
-
- return clean_text.strip()
-
-
-def _remove_links(clean_text, wp_to_id):
- # read the text char by char to get the right offsets for the interwiki links
- entities = []
- final_text = ""
- open_read = 0
- reading_text = True
- reading_entity = False
- reading_mention = False
- reading_special_case = False
- entity_buffer = ""
- mention_buffer = ""
- for index, letter in enumerate(clean_text):
- if letter == "[":
- open_read += 1
- elif letter == "]":
- open_read -= 1
- elif letter == "|":
- if reading_text:
- final_text += letter
- # switch from reading entity to mention in the [[entity|mention]] pattern
- elif reading_entity:
- reading_text = False
- reading_entity = False
- reading_mention = True
- else:
- reading_special_case = True
- else:
- if reading_entity:
- entity_buffer += letter
- elif reading_mention:
- mention_buffer += letter
- elif reading_text:
- final_text += letter
- else:
- raise ValueError("Not sure at point", clean_text[index - 2 : index + 2])
-
- if open_read > 2:
- reading_special_case = True
-
- if open_read == 2 and reading_text:
- reading_text = False
- reading_entity = True
- reading_mention = False
-
- # we just finished reading an entity
- if open_read == 0 and not reading_text:
- if "#" in entity_buffer or entity_buffer.startswith(":"):
- reading_special_case = True
- # Ignore cases with nested structures like File: handles etc
- if not reading_special_case:
- if not mention_buffer:
- mention_buffer = entity_buffer
- start = len(final_text)
- end = start + len(mention_buffer)
- qid = wp_to_id.get(entity_buffer, None)
- if qid:
- entities.append((mention_buffer, qid, start, end))
- final_text += mention_buffer
-
- entity_buffer = ""
- mention_buffer = ""
-
- reading_text = True
- reading_entity = False
- reading_mention = False
- reading_special_case = False
- return final_text, entities
-
-
-def _write_training_description(outputfile, qid, description):
- if description is not None:
- line = str(qid) + "|" + description + "\n"
- outputfile.write(line)
-
-
-def _write_training_entities(outputfile, article_id, clean_text, entities):
- entities_data = [
- {"alias": ent[0], "entity": ent[1], "start": ent[2], "end": ent[3]}
- for ent in entities
- ]
- line = (
- json.dumps(
- {
- "article_id": article_id,
- "clean_text": clean_text,
- "entities": entities_data,
- },
- ensure_ascii=False,
- )
- + "\n"
- )
- outputfile.write(line)
-
-
-def read_training_indices(entity_file_path):
- """ This method creates two lists of indices into the training file: one with indices for the
- training examples, and one for the dev examples."""
- train_indices = []
- dev_indices = []
-
- with entity_file_path.open("r", encoding="utf8") as file:
- for i, line in enumerate(file):
- example = json.loads(line)
- article_id = example["article_id"]
- clean_text = example["clean_text"]
-
- if is_valid_article(clean_text):
- if is_dev(article_id):
- dev_indices.append(i)
- else:
- train_indices.append(i)
-
- return train_indices, dev_indices
-
-
-def read_el_docs_golds(nlp, entity_file_path, dev, line_ids, kb, labels_discard=None):
- """ This method provides training/dev examples that correspond to the entity annotations found by the nlp object.
- For training, it will include both positive and negative examples by using the candidate generator from the kb.
- For testing (kb=None), it will include all positive examples only."""
- if not labels_discard:
- labels_discard = []
-
- texts = []
- entities_list = []
-
- with entity_file_path.open("r", encoding="utf8") as file:
- for i, line in enumerate(file):
- if i in line_ids:
- example = json.loads(line)
- article_id = example["article_id"]
- clean_text = example["clean_text"]
- entities = example["entities"]
-
- if dev != is_dev(article_id) or not is_valid_article(clean_text):
- continue
-
- texts.append(clean_text)
- entities_list.append(entities)
-
- docs = nlp.pipe(texts, batch_size=50)
-
- for doc, entities in zip(docs, entities_list):
- gold = _get_gold_parse(doc, entities, dev=dev, kb=kb, labels_discard=labels_discard)
- if gold and len(gold.links) > 0:
- yield doc, gold
-
-
-def _get_gold_parse(doc, entities, dev, kb, labels_discard):
- gold_entities = {}
- tagged_ent_positions = {
- (ent.start_char, ent.end_char): ent
- for ent in doc.ents
- if ent.label_ not in labels_discard
- }
-
- for entity in entities:
- entity_id = entity["entity"]
- alias = entity["alias"]
- start = entity["start"]
- end = entity["end"]
-
- candidate_ids = []
- if kb and not dev:
- candidates = kb.get_candidates(alias)
- candidate_ids = [cand.entity_ for cand in candidates]
-
- tagged_ent = tagged_ent_positions.get((start, end), None)
- if tagged_ent:
- # TODO: check that alias == doc.text[start:end]
- should_add_ent = (dev or entity_id in candidate_ids) and is_valid_sentence(
- tagged_ent.sent.text
- )
-
- if should_add_ent:
- value_by_id = {entity_id: 1.0}
- if not dev:
- random.shuffle(candidate_ids)
- value_by_id.update(
- {kb_id: 0.0 for kb_id in candidate_ids if kb_id != entity_id}
- )
- gold_entities[(start, end)] = value_by_id
-
- return GoldParse(doc, links=gold_entities)
-
-
-def is_dev(article_id):
- if not article_id:
- return False
- return article_id.endswith("3")
-
-
-def is_valid_article(doc_text):
- # custom length cut-off
- return 10 < len(doc_text) < 30000
-
-
-def is_valid_sentence(sent_text):
- if not 10 < len(sent_text) < 3000:
- # custom length cut-off
- return False
-
- if sent_text.strip().startswith("*") or sent_text.strip().startswith("#"):
- # remove 'enumeration' sentences (occurs often on Wikipedia)
- return False
-
- return True
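
For reference, the `[[entity|alias]]` markup handled by `get_wp_links` boils down to the following, shown here as a small standalone sketch rather than the exact pipeline code:

```python
import re

link_regex = re.compile(r"\[\[[^\[\]]*\]\]")

text = "He was born in [[New York City|New York]] and lived in [[Boston]]."
for match in link_regex.findall(text):
    inner = match[2:-2].replace("_", " ").strip()
    if "|" in inner:
        entity, alias = inner.split("|", 1)
    else:
        entity = alias = inner
    print(alias, "->", entity)
# New York -> New York City
# Boston -> Boston
```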
diff --git a/examples/training/pretrain_kb.py b/examples/training/create_kb.py
similarity index 75%
rename from examples/training/pretrain_kb.py
rename to examples/training/create_kb.py
index 54c68f653..cbdb5c05b 100644
--- a/examples/training/pretrain_kb.py
+++ b/examples/training/create_kb.py
@@ -1,15 +1,15 @@
#!/usr/bin/env python
# coding: utf8
-"""Example of defining and (pre)training spaCy's knowledge base,
+"""Example of defining a knowledge base in spaCy,
which is needed to implement entity linking functionality.
For more details, see the documentation:
* Knowledge base: https://spacy.io/api/kb
* Entity Linking: https://spacy.io/usage/linguistic-features#entity-linking
-Compatible with: spaCy v2.2.3
-Last tested with: v2.2.3
+Compatible with: spaCy v2.2.4
+Last tested with: v2.2.4
"""
from __future__ import unicode_literals, print_function
@@ -20,24 +20,18 @@ from spacy.vocab import Vocab
import spacy
from spacy.kb import KnowledgeBase
-from bin.wiki_entity_linking.train_descriptions import EntityEncoder
-
# Q2146908 (Russ Cochran): American golfer
# Q7381115 (Russ Cochran): publisher
ENTITIES = {"Q2146908": ("American golfer", 342), "Q7381115": ("publisher", 17)}
-INPUT_DIM = 300 # dimension of pretrained input vectors
-DESC_WIDTH = 64 # dimension of output entity vectors
-
@plac.annotations(
model=("Model name, should have pretrained word embeddings", "positional", None, str),
output_dir=("Optional output directory", "option", "o", Path),
- n_iter=("Number of training iterations", "option", "n", int),
)
-def main(model=None, output_dir=None, n_iter=50):
- """Load the model, create the KB and pretrain the entity encodings.
+def main(model=None, output_dir=None):
+ """Load the model and create the KB with pre-defined entity encodings.
If an output_dir is provided, the KB will be stored there in a file 'kb'.
The updated vocab will also be written to a directory in the output_dir."""
@@ -51,33 +45,23 @@ def main(model=None, output_dir=None, n_iter=50):
" cf. https://spacy.io/usage/models#languages."
)
- kb = KnowledgeBase(vocab=nlp.vocab)
+ # You can change the dimension of vectors in your KB by using an encoder that changes the dimensionality.
+ # For simplicity, we'll just use the original vector dimension here instead.
+ vectors_dim = nlp.vocab.vectors.shape[1]
+ kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=vectors_dim)
# set up the data
entity_ids = []
- descriptions = []
+ descr_embeddings = []
freqs = []
for key, value in ENTITIES.items():
desc, freq = value
entity_ids.append(key)
- descriptions.append(desc)
+ descr_embeddings.append(nlp(desc).vector)
freqs.append(freq)
- # training entity description encodings
- # this part can easily be replaced with a custom entity encoder
- encoder = EntityEncoder(
- nlp=nlp,
- input_dim=INPUT_DIM,
- desc_width=DESC_WIDTH,
- epochs=n_iter,
- )
- encoder.train(description_list=descriptions, to_print=True)
-
- # get the pretrained entity vectors
- embeddings = encoder.apply_encoder(descriptions)
-
# set the entities, can also be done by calling `kb.add_entity` for each entity
- kb.set_entities(entity_list=entity_ids, freq_list=freqs, vector_list=embeddings)
+ kb.set_entities(entity_list=entity_ids, freq_list=freqs, vector_list=descr_embeddings)
# adding aliases, the entities need to be defined in the KB beforehand
kb.add_alias(
@@ -113,8 +97,8 @@ def main(model=None, output_dir=None, n_iter=50):
vocab2 = Vocab().from_disk(vocab_path)
kb2 = KnowledgeBase(vocab=vocab2)
kb2.load_bulk(kb_path)
- _print_kb(kb2)
print()
+ _print_kb(kb2)
def _print_kb(kb):
@@ -126,6 +110,5 @@ if __name__ == "__main__":
plac.call(main)
# Expected output:
-
# 2 kb entities: ['Q2146908', 'Q7381115']
# 1 kb aliases: ['Russ Cochran']
diff --git a/examples/training/train_entity_linker.py b/examples/training/train_entity_linker.py
index dd7c3a1b2..c7eba8a30 100644
--- a/examples/training/train_entity_linker.py
+++ b/examples/training/train_entity_linker.py
@@ -1,15 +1,15 @@
#!/usr/bin/env python
# coding: utf8
-"""Example of training spaCy's entity linker, starting off with an
-existing model and a pre-defined knowledge base.
+"""Example of training spaCy's entity linker, starting off with a predefined
+knowledge base and corresponding vocab, and a blank English model.
For more details, see the documentation:
* Training: https://spacy.io/usage/training
* Entity Linking: https://spacy.io/usage/linguistic-features#entity-linking
-Compatible with: spaCy v2.2.3
-Last tested with: v2.2.3
+Compatible with: spaCy v2.2.4
+Last tested with: v2.2.4
"""
from __future__ import unicode_literals, print_function
@@ -17,13 +17,11 @@ import plac
import random
from pathlib import Path
-from spacy.symbols import PERSON
from spacy.vocab import Vocab
import spacy
from spacy.kb import KnowledgeBase
from spacy.pipeline import EntityRuler
-from spacy.tokens import Span
from spacy.util import minibatch, compounding
diff --git a/website/docs/usage/examples.md b/website/docs/usage/examples.md
index 180b02ff4..9b210a69a 100644
--- a/website/docs/usage/examples.md
+++ b/website/docs/usage/examples.md
@@ -111,6 +111,27 @@ start.
https://github.com/explosion/spaCy/tree/master/examples/training/train_new_entity_type.py
```
+### Creating a Knowledge Base for Named Entity Linking {#kb}
+
+This example shows how to create a knowledge base in spaCy,
+which is needed to implement entity linking functionality.
+It requires as input a spaCy model with pretrained word vectors,
+and it stores the KB to file (if an `output_dir` is provided).
+
+```python
+https://github.com/explosion/spaCy/tree/master/examples/training/create_kb.py
+```
+
+### Training spaCy's Named Entity Linker {#nel}
+
+This example shows how to train spaCy's entity linker with your own custom
+examples, starting off with a predefined knowledge base and its vocab,
+and using a blank `English` class.
+
+```python
+https://github.com/explosion/spaCy/tree/master/examples/training/train_entity_linker.py
+```
+
### Training spaCy's Dependency Parser {#parser}
This example shows how to update spaCy's dependency parser, starting off with an
diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md
index 59712939a..d17e5a661 100644
--- a/website/docs/usage/linguistic-features.md
+++ b/website/docs/usage/linguistic-features.md
@@ -579,9 +579,7 @@ import DisplacyEntHtml from 'images/displacy-ent2.html'
To ground the named entities into the "real world", spaCy provides functionality
to perform entity linking, which resolves a textual entity to a unique
-identifier from a knowledge base (KB). The
-[processing scripts](https://github.com/explosion/spaCy/tree/master/bin/wiki_entity_linking)
-we provide use WikiData identifiers, but you can create your own
+identifier from a knowledge base (KB). You can create your own
[`KnowledgeBase`](/api/kb) and
[train a new Entity Linking model](/usage/training#entity-linker) using that
custom-made KB.
diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md
index 479441edf..ecdc6720b 100644
--- a/website/docs/usage/training.md
+++ b/website/docs/usage/training.md
@@ -347,9 +347,9 @@ your data** to find a solution that works best for you.
### Updating the Named Entity Recognizer {#example-train-ner}
This example shows how to update spaCy's entity recognizer with your own
-examples, starting off with an existing, pretrained model, or from scratch
-using a blank `Language` class. To do this, you'll need **example texts** and
-the **character offsets** and **labels** of each entity contained in the texts.
+examples, starting off with an existing, pretrained model, or from scratch using
+a blank `Language` class. To do this, you'll need **example texts** and the
+**character offsets** and **labels** of each entity contained in the texts.
```python
https://github.com/explosion/spaCy/tree/master/examples/training/train_ner.py
@@ -440,8 +440,8 @@ https://github.com/explosion/spaCy/tree/master/examples/training/train_parser.py
training the parser.
2. **Add the dependency labels** to the parser using the
[`add_label`](/api/dependencyparser#add_label) method. If you're starting off
- with a pretrained spaCy model, this is usually not necessary – but it
- doesn't hurt either, just to be safe.
+ with a pretrained spaCy model, this is usually not necessary – but it doesn't
+ hurt either, just to be safe.
3. **Shuffle and loop over** the examples. For each example, **update the
model** by calling [`nlp.update`](/api/language#update), which steps through
the words of the input. At each word, it makes a **prediction**. It then
@@ -605,16 +605,16 @@ To train an entity linking model, you first need to define a knowledge base
A KB consists of a list of entities with unique identifiers. Each such entity
has an entity vector that will be used to measure similarity with the context in
-which an entity is used. These vectors are pretrained and stored in the KB
-before the entity linking model will be trained.
+which an entity is used. These vectors have a fixed length and are stored in the
+KB.
The following example shows how to build a knowledge base from scratch, given a
-list of entities and potential aliases. The script further demonstrates how to
-pretrain and store the entity vectors. To run this example, the script needs
-access to a `vocab` instance or an `nlp` model with pretrained word embeddings.
+list of entities and potential aliases. The script requires an `nlp` model with
+pretrained word vectors to obtain an encoding of an entity's description as its
+vector.
```python
-https://github.com/explosion/spaCy/tree/master/examples/training/pretrain_kb.py
+https://github.com/explosion/spaCy/tree/master/examples/training/create_kb.py
```
#### Step by step guide {#step-by-step-kb}
From eb117e2fce9d1029670f52690d30d17e6edbd24e Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 2 May 2020 14:09:21 +0200
Subject: [PATCH 126/496] Add load_config_from_str helper
---
spacy/util.py | 17 +++++++++++++++++
1 file changed, 17 insertions(+)
diff --git a/spacy/util.py b/spacy/util.py
index ea3023629..b4ecc8b03 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -219,6 +219,23 @@ def load_config(path, create_objects=False):
return config
+def load_config_from_str(string, create_objects=False):
+ """Load a Thinc-formatted config, optionally filling in objects where
+ the config references registry entries. See "Thinc config files" for details.
+
+ string (unicode): Text contents of the config file.
+ create_objects (bool): Whether to automatically create objects when the config
+ references registry entries. Defaults to False.
+
+ RETURNS (dict): The objects from the config file.
+ """
+ config = thinc.config.Config().from_str(string)
+ if create_objects:
+ return registry.make_from_config(config, validate=True)
+ else:
+ return config
+
+
def get_model_meta(path):
"""Get model meta.json from a directory path and validate its contents.
From b3969c14796d95b4419655f63cebcbde8fee4521 Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Fri, 8 May 2020 10:36:25 +0200
Subject: [PATCH 127/496] Clarify Token.pos as UPOS (#5419)
---
website/docs/api/token.md | 4 ++--
website/docs/usage/101/_pos-deps.md | 2 +-
2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/website/docs/api/token.md b/website/docs/api/token.md
index c30c01c20..b397efc55 100644
--- a/website/docs/api/token.md
+++ b/website/docs/api/token.md
@@ -461,8 +461,8 @@ The L2 norm of the token's vector representation.
| `like_email` | bool | Does the token resemble an email address? |
| `is_oov` | bool | Is the token out-of-vocabulary? |
| `is_stop` | bool | Is the token part of a "stop list"? |
-| `pos` | int | Coarse-grained part-of-speech. |
-| `pos_` | unicode | Coarse-grained part-of-speech. |
+| `pos` | int | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). |
+| `pos_` | unicode | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). |
| `tag` | int | Fine-grained part-of-speech. |
| `tag_` | unicode | Fine-grained part-of-speech. |
| `dep` | int | Syntactic dependency relation. |
diff --git a/website/docs/usage/101/_pos-deps.md b/website/docs/usage/101/_pos-deps.md
index 9d04d6ffc..1a438e424 100644
--- a/website/docs/usage/101/_pos-deps.md
+++ b/website/docs/usage/101/_pos-deps.md
@@ -25,7 +25,7 @@ for token in doc:
> - **Text:** The original word text.
> - **Lemma:** The base form of the word.
-> - **POS:** The simple part-of-speech tag.
+> - **POS:** The simple [UPOS](https://universaldependencies.org/docs/u/pos/) part-of-speech tag.
> - **Tag:** The detailed part-of-speech tag.
> - **Dep:** Syntactic dependency, i.e. the relation between tokens.
> - **Shape:** The word shape – capitalization, punctuation, digits.
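
For example, with an English model (the model name is just an example), `pos_` holds the coarse UPOS tag while `tag_` keeps the fine-grained treebank tag:

```python
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
for token in doc:
    # token.pos_ is the Universal POS tag, token.tag_ the fine-grained (Penn Treebank) tag
    print(token.text, token.pos_, token.tag_)
```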
From afb26d788f954f68f347b1db81927d8ccebb4b71 Mon Sep 17 00:00:00 2001
From: Travis Hoppe
Date: Fri, 8 May 2020 02:28:54 -0700
Subject: [PATCH 128/496] Added author information for NLPre (#5414)
* Add author links for NLPre and update category
* Add contributor statement
---
.github/contributors/thoppe.md | 106 +++++++++++++++++++++++++++++++++
website/meta/universe.json | 8 ++-
2 files changed, 113 insertions(+), 1 deletion(-)
create mode 100644 .github/contributors/thoppe.md
diff --git a/.github/contributors/thoppe.md b/.github/contributors/thoppe.md
new file mode 100644
index 000000000..9271a2601
--- /dev/null
+++ b/.github/contributors/thoppe.md
@@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+ * you hereby assign to us joint ownership, and to the extent that such
+ assignment is or becomes invalid, ineffective or unenforceable, you hereby
+ grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+ royalty-free, unrestricted license to exercise all rights under those
+ copyrights. This includes, at our option, the right to sublicense these same
+ rights to third parties through multiple levels of sublicensees or other
+ licensing arrangements;
+
+ * you agree that each of us can do all things in relation to your
+ contribution as if each of us were the sole owners, and if one of us makes
+ a derivative work of your contribution, the one who makes the derivative
+ work (or has it made will be the sole owner of that derivative work;
+
+ * you agree that you will not assert any moral rights in your contribution
+ against us, our licensees or transferees;
+
+ * you agree that we may register a copyright in your contribution and
+ exercise all ownership rights associated with it; and
+
+ * you agree that neither of us has any duty to consult with, obtain the
+ consent of, pay or render an accounting to the other for any use or
+ distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+ * make, have made, use, sell, offer to sell, import, and otherwise transfer
+ your contribution in whole or in part, alone or in combination with or
+ included in any product, work or materials arising out of the project to
+ which your contribution was submitted, and
+
+ * at our option, to sublicense these same rights to third parties through
+ multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+ * Each contribution that you submit is and shall be an original work of
+ authorship and you can legally grant the rights set out in this SCA;
+
+ * to the best of your knowledge, each contribution will not violate any
+ third party's copyrights, trademarks, patents, or other intellectual
+ property rights; and
+
+ * each contribution shall be in compliance with U.S. export control laws and
+ other applicable export and import laws. You agree to notify us if you
+ become aware of any circumstance which would make any of the foregoing
+ representations inaccurate in any respect. We may publicly disclose your
+ participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statement below. Please do NOT
+mark both statements:
+
+ * [x] I am signing on behalf of myself as an individual and no other person
+ or entity, including my employer, has or will have rights with respect to my
+ contributions.
+
+ * [ ] I am signing on behalf of my employer or a legal entity and I have the
+ actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field | Entry |
+|------------------------------- | -------------------- |
+| Name | Travis Hoppe |
+| Company name (if applicable) | |
+| Title or role (if applicable) | Data Scientist |
+| Date | 07 May 2020 |
+| GitHub username | thoppe |
+| Website (optional) | http://thoppe.github.io/ |
diff --git a/website/meta/universe.json b/website/meta/universe.json
index b5e1dbde0..22673834a 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -114,7 +114,13 @@
" text = f(text)",
"print(text)"
],
- "category": ["scientific"]
+ "category": ["scientific", "biomedical"],
+ "author": "Travis Hoppe",
+ "author_links": {
+ "github": "thoppe",
+ "twitter":"metasemantic",
+ "website" : "http://thoppe.github.io/"
+ }
},
{
"id": "Chatterbot",
From 9fe1e235127f4b6e870a3180af879fefc4c33e90 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Tue, 12 May 2020 13:51:25 +0200
Subject: [PATCH 129/496] update to thinc 8.0.0a6
---
pyproject.toml | 2 +-
requirements.txt | 2 +-
setup.cfg | 4 ++--
3 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/pyproject.toml b/pyproject.toml
index 9440c2d44..a7b4c825e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,7 +6,7 @@ requires = [
"cymem>=2.0.2,<2.1.0",
"preshed>=3.0.2,<3.1.0",
"murmurhash>=0.28.0,<1.1.0",
- "thinc==8.0.0a3",
+ "thinc==8.0.0a6",
"blis>=0.4.0,<0.5.0"
]
build-backend = "setuptools.build_meta"
diff --git a/requirements.txt b/requirements.txt
index 73e595daf..814eaf3dc 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
# Our libraries
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
-thinc==8.0.0a3
+thinc==8.0.0a6
blis>=0.4.0,<0.5.0
ml_datasets>=0.1.1
murmurhash>=0.28.0,<1.1.0
diff --git a/setup.cfg b/setup.cfg
index 2ff13e3e1..80ceed207 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -36,13 +36,13 @@ setup_requires =
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
murmurhash>=0.28.0,<1.1.0
- thinc==8.0.0a3
+ thinc==8.0.0a6
install_requires =
# Our libraries
murmurhash>=0.28.0,<1.1.0
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
- thinc==8.0.0a3
+ thinc==8.0.0a6
blis>=0.4.0,<0.5.0
wasabi>=0.4.0,<1.1.0
srsly>=2.0.0,<3.0.0
From 102c8c7e2f482b67d8fea8e4b9b341365da38565 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Tue, 12 May 2020 13:56:10 +0200
Subject: [PATCH 130/496] fix fan_in renaming
---
spacy/ml/_precomputable_affine.py | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/spacy/ml/_precomputable_affine.py b/spacy/ml/_precomputable_affine.py
index c7328bad9..ec95cdafd 100644
--- a/spacy/ml/_precomputable_affine.py
+++ b/spacy/ml/_precomputable_affine.py
@@ -110,7 +110,8 @@ def init(model, X=None, Y=None):
pad = model.ops.alloc4f(1, nF, nO, nP)
ops = model.ops
- W = normal_init(ops, W.shape, fan_in=nF * nI)
+ scale = float(ops.xp.sqrt(1.0 / (nF * nI)))
+ W = normal_init(ops, W.shape, mean=scale)
model.set_param("W", W)
model.set_param("b", b)
model.set_param("pad", pad)
From e0fda2bd81bd7e7b1a9006c403d52c470954701b Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Fri, 15 May 2020 11:02:10 +0200
Subject: [PATCH 131/496] throw warning when model_cfg is None
---
spacy/errors.py | 2 ++
spacy/language.py | 3 +++
2 files changed, 5 insertions(+)
diff --git a/spacy/errors.py b/spacy/errors.py
index 23139e10a..99a0081c0 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -104,6 +104,8 @@ class Warnings(object):
"string \"Field1=Value1,Value2|Field2=Value3\".")
# TODO: fix numbering after merging develop into master
+ W097 = ("No Model config was provided to create the '{name}' component, "
+ "and no default configuration could be found either.")
W098 = ("No Model config was provided to create the '{name}' component, "
"so a default configuration was used.")
W099 = ("Expected 'dict' type for the 'model' argument of pipe '{pipe}', "
diff --git a/spacy/language.py b/spacy/language.py
index 5343df4b7..2dd7ce406 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -184,6 +184,7 @@ class Language(object):
self.max_length = max_length
self._optimizer = None
+ # TODO: de-uglify (incorporating into component decorator didn't work because of circular imports)
from .ml.models.defaults import (
default_tagger_config,
default_parser_config,
@@ -349,6 +350,8 @@ class Language(object):
if model_cfg is None and default_config is not None:
warnings.warn(Warnings.W098.format(name=name))
model_cfg = default_config["model"]
+ if model_cfg is None:
+ warnings.warn(Warnings.W097.format(name=name))
model = None
if model_cfg is not None:
self.config[name] = {"model": model_cfg}
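A plain-Python sketch of the fallback order this patch completes: fall back to the default model config with W098, and warn with W097 if no default exists either. Names and messages here are illustrative, not the exact spaCy warning strings:
```python
# Illustrative sketch of the W098/W097 fallback order; not the spaCy error strings.
import warnings

def resolve_model_cfg(name, model_cfg, default_config):
    if model_cfg is None and default_config is not None:
        warnings.warn(f"W098: no model config provided for '{name}', using default")
        model_cfg = default_config["model"]
    if model_cfg is None:
        warnings.warn(f"W097: no model config provided for '{name}' and no default found")
    return model_cfg
```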
From 79d4f196e54cce1b85bb6e741714e1a89ed4689c Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Fri, 15 May 2020 11:53:01 +0200
Subject: [PATCH 132/496] pin flake8 to 3.5.0
---
azure-pipelines.yml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index f93dffaed..4dfb51296 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -27,7 +27,7 @@ jobs:
inputs:
versionSpec: '3.7'
- script: |
- pip install flake8
+ pip install flake8==3.5.0
python -m flake8 spacy --count --select=E901,E999,F821,F822,F823 --show-source --statistics
displayName: 'flake8'
From 047f3d7d94a6ef9dec904a8a468497c9dcab7506 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Fri, 15 May 2020 13:25:00 +0200
Subject: [PATCH 133/496] remove ops argument for Adam
---
spacy/tests/parser/test_add_label.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/spacy/tests/parser/test_add_label.py b/spacy/tests/parser/test_add_label.py
index fb43458ae..647c9720c 100644
--- a/spacy/tests/parser/test_add_label.py
+++ b/spacy/tests/parser/test_add_label.py
@@ -29,7 +29,7 @@ def _train_parser(parser):
fix_random_seed(1)
parser.add_label("left")
parser.begin_training([], **parser.cfg)
- sgd = Adam(0.001, ops=NumpyOps())
+ sgd = Adam(0.001)
for i in range(5):
losses = {}
@@ -42,7 +42,7 @@ def _train_parser(parser):
def test_add_label(parser):
parser = _train_parser(parser)
parser.add_label("right")
- sgd = Adam(0.001, ops=NumpyOps())
+ sgd = Adam(0.001)
for i in range(100):
losses = {}
doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
From 6fb6a8518c014f10bb07aab386503b2ee6540ec4 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Fri, 15 May 2020 13:25:54 +0200
Subject: [PATCH 134/496] bump to 3.0.0.dev7 and thinc to 8.0.0a8
---
pyproject.toml | 2 +-
requirements.txt | 2 +-
setup.cfg | 4 ++--
spacy/about.py | 2 +-
4 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/pyproject.toml b/pyproject.toml
index a7b4c825e..548664e89 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,7 +6,7 @@ requires = [
"cymem>=2.0.2,<2.1.0",
"preshed>=3.0.2,<3.1.0",
"murmurhash>=0.28.0,<1.1.0",
- "thinc==8.0.0a6",
+ "thinc==8.0.0a8",
"blis>=0.4.0,<0.5.0"
]
build-backend = "setuptools.build_meta"
diff --git a/requirements.txt b/requirements.txt
index 814eaf3dc..08b4c228a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
# Our libraries
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
-thinc==8.0.0a6
+thinc==8.0.0a8
blis>=0.4.0,<0.5.0
ml_datasets>=0.1.1
murmurhash>=0.28.0,<1.1.0
diff --git a/setup.cfg b/setup.cfg
index 80ceed207..9fe02018b 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -36,13 +36,13 @@ setup_requires =
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
murmurhash>=0.28.0,<1.1.0
- thinc==8.0.0a6
+ thinc==8.0.0a8
install_requires =
# Our libraries
murmurhash>=0.28.0,<1.1.0
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
- thinc==8.0.0a6
+ thinc==8.0.0a8
blis>=0.4.0,<0.5.0
wasabi>=0.4.0,<1.1.0
srsly>=2.0.0,<3.0.0
diff --git a/spacy/about.py b/spacy/about.py
index 6fa1f4c0b..3f87c8dbc 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
# fmt: off
__title__ = "spacy"
-__version__ = "3.0.0.dev6"
+__version__ = "3.0.0.dev7"
__release__ = True
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
From e8ff4c1e6a2b92eb0194d343a6f1f212172f4bb8 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Mon, 18 May 2020 10:50:21 +0200
Subject: [PATCH 135/496] Pin flake8 version
---
azure-pipelines.yml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index f93dffaed..4dfb51296 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -27,7 +27,7 @@ jobs:
inputs:
versionSpec: '3.7'
- script: |
- pip install flake8
+ pip install flake8==3.5.0
python -m flake8 spacy --count --select=E901,E999,F821,F822,F823 --show-source --statistics
displayName: 'flake8'
From 333b1a308b8edd91e06ce914e49b10834e2de3ce Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 18 May 2020 22:23:33 +0200
Subject: [PATCH 136/496] Adapt parser and NER for transformers (#5449)
* Draft layer for BILUO actions
* Fixes to biluo layer
* WIP on BILUO layer
* Add tests for BILUO layer
* Format
* Fix transitions
* Update test
* Link in the simple_ner
* Update BILUO tagger
* Update __init__
* Import simple_ner
* Update test
* Import
* Add files
* Add config
* Fix label passing for BILUO and tagger
* Fix label handling for simple_ner component
* Update simple NER test
* Update config
* Hack train script
* Update BILUO layer
* Fix SimpleNER component
* Update train_from_config
* Add biluo_to_iob helper
* Add IOB layer
* Add IOBTagger model
* Update biluo layer
* Update SimpleNER tagger
* Update BILUO
* Read random seed in train-from-config
* Update use of normal_init
* Fix normalization of gradient in SimpleNER
* Update IOBTagger
* Remove print
* Tweak masking in BILUO
* Add dropout in SimpleNER
* Update thinc
* Tidy up simple_ner
* Fix biluo model
* Unhack train-from-config
* Update setup.cfg and requirements
* Add tb_framework.py for parser model
* Try to avoid memory leak in BILUO
* Move ParserModel into spacy.ml, avoid need for subclass.
* Use updated parser model
* Remove incorrect call to model.initialize in PrecomputableAffine

* Update parser model
* Avoid divide by zero in tagger
* Add extra dropout layer in tagger
* Refine minibatch_by_words function to avoid oom
* Fix parser model after refactor
* Try to avoid div-by-zero in SimpleNER
* Fix infinite loop in minibatch_by_words
* Use SequenceCategoricalCrossentropy in Tagger
* Fix parser model when hidden layer
* Remove extra dropout from tagger
* Add extra nan check in tagger
* Fix thinc version
* Update tests and imports
* Fix test
* Update test
* Update tests
* Fix tests
* Fix test
Co-authored-by: Ines Montani
---
.../tok2vec-ner/multihashembed_tok2vec.cfg | 57 +--
examples/training/train_ner.py | 12 +-
spacy/cli/train_from_config.py | 81 ++--
spacy/gold.pyx | 8 +
spacy/language.py | 2 +
spacy/ml/_biluo.py | 109 +++++
spacy/ml/_iob.py | 92 ++++
spacy/ml/_precomputable_affine.py | 4 +-
spacy/ml/models/__init__.py | 1 +
spacy/ml/models/defaults/__init__.py | 10 +
.../models/defaults/simple_ner_defaults.cfg | 12 +
spacy/ml/models/parser.py | 30 +-
spacy/ml/models/simple_ner.py | 82 ++++
spacy/ml/models/tagger.py | 5 +-
spacy/ml/tb_framework.py | 86 ++++
spacy/pipeline/__init__.py | 2 +
spacy/pipeline/pipes.pyx | 33 +-
spacy/pipeline/simple_ner.py | 149 +++++++
spacy/syntax/_parser_model.pyx | 104 +----
spacy/syntax/nn_parser.pyx | 71 ++-
spacy/tests/parser/test_add_label.py | 2 +-
spacy/tests/parser/test_neural_parser.py | 6 +-
spacy/tests/pipeline/test_simple_ner.py | 417 ++++++++++++++++++
spacy/tests/regression/test_issue2001-2500.py | 3 +-
spacy/tests/regression/test_issue3001-3500.py | 3 +-
.../tests/serialize/test_serialize_config.py | 6 +-
.../serialize/test_serialize_pipeline.py | 2 +-
spacy/tests/test_misc.py | 2 +-
spacy/util.py | 36 +-
29 files changed, 1180 insertions(+), 247 deletions(-)
create mode 100644 spacy/ml/_biluo.py
create mode 100644 spacy/ml/_iob.py
create mode 100644 spacy/ml/models/defaults/simple_ner_defaults.cfg
create mode 100644 spacy/ml/models/simple_ner.py
create mode 100644 spacy/ml/tb_framework.py
create mode 100644 spacy/pipeline/simple_ner.py
create mode 100644 spacy/tests/pipeline/test_simple_ner.py
diff --git a/examples/experiments/tok2vec-ner/multihashembed_tok2vec.cfg b/examples/experiments/tok2vec-ner/multihashembed_tok2vec.cfg
index 4678a7d6b..dc25a1c3b 100644
--- a/examples/experiments/tok2vec-ner/multihashembed_tok2vec.cfg
+++ b/examples/experiments/tok2vec-ner/multihashembed_tok2vec.cfg
@@ -4,12 +4,18 @@ limit = 0
dropout = 0.2
patience = 10000
eval_frequency = 200
-scores = ["ents_f"]
+scores = ["ents_p", "ents_r", "ents_f"]
score_weights = {"ents_f": 1}
orth_variant_level = 0.0
gold_preproc = true
max_length = 0
-batch_size = 25
+
+[training.batch_size]
+@schedules = "compounding.v1"
+start = 3000
+stop = 3000
+compound = 1.001
+
[optimizer]
@optimizers = "Adam.v1"
@@ -21,45 +27,18 @@ beta2 = 0.999
lang = "en"
vectors = null
-[nlp.pipeline.tok2vec]
-factory = "tok2vec"
-
-[nlp.pipeline.tok2vec.model]
-@architectures = "spacy.Tok2Vec.v1"
-
-[nlp.pipeline.tok2vec.model.extract]
-@architectures = "spacy.Doc2Feats.v1"
-columns = ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"]
-
-[nlp.pipeline.tok2vec.model.embed]
-@architectures = "spacy.MultiHashEmbed.v1"
-columns = ${nlp.pipeline.tok2vec.model.extract:columns}
-width = 96
-rows = 2000
-use_subwords = true
-pretrained_vectors = null
-
-[nlp.pipeline.tok2vec.model.embed.mix]
-@architectures = "spacy.LayerNormalizedMaxout.v1"
-width = ${nlp.pipeline.tok2vec.model.embed:width}
-maxout_pieces = 3
-
-[nlp.pipeline.tok2vec.model.encode]
-@architectures = "spacy.MaxoutWindowEncoder.v1"
-width = ${nlp.pipeline.tok2vec.model.embed:width}
-window_size = 1
-maxout_pieces = 3
-depth = 2
-
[nlp.pipeline.ner]
-factory = "ner"
+factory = "simple_ner"
[nlp.pipeline.ner.model]
-@architectures = "spacy.TransitionBasedParser.v1"
-nr_feature_tokens = 6
-hidden_width = 64
-maxout_pieces = 2
+@architectures = "spacy.BiluoTagger.v1"
[nlp.pipeline.ner.model.tok2vec]
-@architectures = "spacy.Tok2VecTensors.v1"
-width = ${nlp.pipeline.tok2vec.model.embed:width}
+@architectures = "spacy.HashEmbedCNN.v1"
+width = 128
+depth = 4
+embed_size = 7000
+maxout_pieces = 3
+window_size = 1
+subword_features = true
+pretrained_vectors = null
diff --git a/examples/training/train_ner.py b/examples/training/train_ner.py
index d5d034616..d4e0bf794 100644
--- a/examples/training/train_ner.py
+++ b/examples/training/train_ner.py
@@ -42,26 +42,28 @@ def main(model=None, output_dir=None, n_iter=100):
# create the built-in pipeline components and add them to the pipeline
# nlp.create_pipe works for built-ins that are registered with spaCy
- if "ner" not in nlp.pipe_names:
- ner = nlp.create_pipe("ner")
+ if "simple_ner" not in nlp.pipe_names:
+ ner = nlp.create_pipe("simple_ner")
nlp.add_pipe(ner, last=True)
# otherwise, get it so we can add labels
else:
- ner = nlp.get_pipe("ner")
+ ner = nlp.get_pipe("simple_ner")
# add labels
for _, annotations in TRAIN_DATA:
for ent in annotations.get("entities"):
+ print("Add label", ent[2])
ner.add_label(ent[2])
# get names of other pipes to disable them during training
- pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
+ pipe_exceptions = ["simple_ner"]
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
with nlp.disable_pipes(*other_pipes): # only train NER
# reset and initialize the weights randomly – but only if we're
# training a new model
if model is None:
nlp.begin_training()
+ print("Transitions", list(enumerate(nlp.get_pipe("simple_ner").get_tag_names())))
for itn in range(n_iter):
random.shuffle(TRAIN_DATA)
losses = {}
@@ -70,7 +72,7 @@ def main(model=None, output_dir=None, n_iter=100):
for batch in batches:
nlp.update(
batch,
- drop=0.5, # dropout - make it harder to memorise data
+ drop=0.0, # dropout disabled in this simple_ner example
losses=losses,
)
print("Losses", losses)
diff --git a/spacy/cli/train_from_config.py b/spacy/cli/train_from_config.py
index 933b275c4..bd83deb04 100644
--- a/spacy/cli/train_from_config.py
+++ b/spacy/cli/train_from_config.py
@@ -8,6 +8,7 @@ from wasabi import msg
import thinc
import thinc.schedules
from thinc.api import Model
+import random
from ..gold import GoldCorpus
from .. import util
@@ -119,6 +120,7 @@ class ConfigSchema(BaseModel):
output_path=("Output directory to store model in", "option", "o", Path),
meta_path=("Optional path to meta.json to use as base.", "option", "m", Path),
raw_text=("Path to jsonl file with unlabelled text documents.", "option", "rt", Path),
+ use_gpu=("Use GPU", "option", "g", int),
# fmt: on
)
def train_from_config_cli(
@@ -130,6 +132,7 @@ def train_from_config_cli(
raw_text=None,
debug=False,
verbose=False,
+ use_gpu=-1
):
"""
Train or update a spaCy model. Requires data to be formatted in spaCy's
@@ -147,6 +150,12 @@ def train_from_config_cli(
if output_path is not None and not output_path.exists():
output_path.mkdir()
+ if use_gpu >= 0:
+ msg.info("Using GPU")
+ util.use_gpu(use_gpu)
+ else:
+ msg.info("Using CPU")
+
train_from_config(
config_path,
{"train": train_path, "dev": dev_path},
@@ -161,13 +170,8 @@ def train_from_config(
):
msg.info(f"Loading config from: {config_path}")
config = util.load_config(config_path, create_objects=False)
+ util.fix_random_seed(config["training"]["seed"])
nlp_config = config["nlp"]
- use_gpu = config["training"]["use_gpu"]
- if use_gpu >= 0:
- msg.info("Using GPU")
- util.use_gpu(use_gpu)
- else:
- msg.info("Using CPU")
config = util.load_config(config_path, create_objects=True)
msg.info("Creating nlp from config")
nlp = util.load_model_from_config(nlp_config)
@@ -177,7 +181,7 @@ def train_from_config(
msg.info("Loading training corpus")
corpus = GoldCorpus(data_paths["train"], data_paths["dev"], limit=limit)
msg.info("Initializing the nlp pipeline")
- nlp.begin_training(lambda: corpus.train_examples, device=use_gpu)
+ nlp.begin_training(lambda: corpus.train_examples)
train_batches = create_train_batches(nlp, corpus, training)
evaluate = create_evaluation_callback(nlp, optimizer, corpus, training)
@@ -192,6 +196,7 @@ def train_from_config(
training["dropout"],
training["patience"],
training["eval_frequency"],
+ training["accumulate_gradient"]
)
msg.info(f"Training. Initial learn rate: {optimizer.learn_rate}")
@@ -220,43 +225,50 @@ def train_from_config(
def create_train_batches(nlp, corpus, cfg):
while True:
- train_examples = corpus.train_dataset(
+ train_examples = list(corpus.train_dataset(
nlp,
noise_level=0.0,
orth_variant_level=cfg["orth_variant_level"],
gold_preproc=cfg["gold_preproc"],
max_length=cfg["max_length"],
ignore_misaligned=True,
- )
- for batch in util.minibatch_by_words(train_examples, size=cfg["batch_size"]):
+ ))
+ random.shuffle(train_examples)
+ batches = util.minibatch_by_words(train_examples, size=cfg["batch_size"])
+ for batch in batches:
yield batch
def create_evaluation_callback(nlp, optimizer, corpus, cfg):
def evaluate():
- with nlp.use_params(optimizer.averages):
- dev_examples = list(
- corpus.dev_dataset(
- nlp, gold_preproc=cfg["gold_preproc"], ignore_misaligned=True
- )
+ dev_examples = list(
+ corpus.dev_dataset(
+ nlp, gold_preproc=cfg["gold_preproc"], ignore_misaligned=True
)
- n_words = sum(len(ex.doc) for ex in dev_examples)
- start_time = timer()
- scorer = nlp.evaluate(dev_examples)
- end_time = timer()
- wps = n_words / (end_time - start_time)
- scores = scorer.scores
- # Calculate a weighted sum based on score_weights for the main score
- weights = cfg["score_weights"]
- weighted_score = sum(scores[s] * weights.get(s, 0.0) for s in weights)
- scores["speed"] = wps
+ )
+ n_words = sum(len(ex.doc) for ex in dev_examples)
+ start_time = timer()
+
+ if optimizer.averages:
+ with nlp.use_params(optimizer.averages):
+ scorer = nlp.evaluate(dev_examples, batch_size=32)
+ else:
+ scorer = nlp.evaluate(dev_examples, batch_size=32)
+ end_time = timer()
+ wps = n_words / (end_time - start_time)
+ scores = scorer.scores
+ # Calculate a weighted sum based on score_weights for the main score
+ weights = cfg["score_weights"]
+ weighted_score = sum(scores[s] * weights.get(s, 0.0) for s in weights)
+ scores["speed"] = wps
return weighted_score, scores
return evaluate
def train_while_improving(
- nlp, optimizer, train_data, evaluate, dropout, patience, eval_frequency
+ nlp, optimizer, train_data, evaluate, dropout, patience, eval_frequency,
+ accumulate_gradient
):
"""Train until an evaluation stops improving. Works as a generator,
with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`,
@@ -303,7 +315,7 @@ def train_while_improving(
losses = {}
for step, batch in enumerate(train_data):
dropout = next(dropouts)
- for subbatch in subdivide_batch(batch):
+ for subbatch in subdivide_batch(batch, accumulate_gradient):
nlp.update(subbatch, drop=dropout, losses=losses, sgd=False)
for name, proc in nlp.pipeline:
if hasattr(proc, "model"):
@@ -332,8 +344,19 @@ def train_while_improving(
break
-def subdivide_batch(batch):
- return [batch]
+def subdivide_batch(batch, accumulate_gradient):
+ batch = list(batch)
+ batch.sort(key=lambda eg: len(eg.doc))
+ sub_len = len(batch) // accumulate_gradient
+ start = 0
+ for i in range(accumulate_gradient):
+ subbatch = batch[start : start + sub_len]
+ if subbatch:
+ yield subbatch
+ start += len(subbatch)
+ subbatch = batch[start : ]
+ if subbatch:
+ yield subbatch
def setup_printer(training, nlp):
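For reference, a standalone sketch of the `subdivide_batch()` logic added above: sort the batch by length and split it into `accumulate_gradient` sub-batches (plus a tail), so gradients from several sub-batches accumulate before the optimizer step. Plain strings stand in for `Example` objects here:
```python
# Standalone restatement of subdivide_batch(); strings stand in for Examples.
def subdivide_batch(batch, accumulate_gradient):
    batch = sorted(batch, key=len)
    sub_len = len(batch) // accumulate_gradient
    start = 0
    for _ in range(accumulate_gradient):
        subbatch = batch[start : start + sub_len]
        if subbatch:
            yield subbatch
        start += len(subbatch)
    tail = batch[start:]
    if tail:
        yield tail

docs = ["a" * n for n in (5, 2, 9, 1, 4, 7, 3)]
print([len(sb) for sb in subdivide_batch(docs, accumulate_gradient=3)])  # [2, 2, 2, 1]
```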
diff --git a/spacy/gold.pyx b/spacy/gold.pyx
index a9156c1a5..6647e41b4 100644
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -608,6 +608,14 @@ def iob_to_biluo(tags):
return out
+def biluo_to_iob(tags):
+ out = []
+ for tag in tags:
+ tag = tag.replace("U-", "B-", 1).replace("L-", "I-", 1)
+ out.append(tag)
+ return out
+
+
def _consume_os(tags):
while tags and tags[0] == "O":
yield tags.pop(0)
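A pure-Python restatement of the `biluo_to_iob()` helper added above, with a quick check: `U-` maps to `B-` and `L-` maps to `I-`, while other tags pass through unchanged:
```python
# Restatement of biluo_to_iob() for illustration; mirrors the helper in the diff.
def biluo_to_iob(tags):
    return [tag.replace("U-", "B-", 1).replace("L-", "I-", 1) for tag in tags]

assert biluo_to_iob(["O", "B-ORG", "L-ORG", "U-PERSON"]) == ["O", "B-ORG", "I-ORG", "B-PERSON"]
```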
diff --git a/spacy/language.py b/spacy/language.py
index 2dd7ce406..a7db5ef20 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -195,6 +195,7 @@ class Language(object):
default_senter_config,
default_tensorizer_config,
default_tok2vec_config,
+ default_simple_ner_config
)
self.defaults = {
@@ -205,6 +206,7 @@ class Language(object):
"entity_linker": default_nel_config(),
"morphologizer": default_morphologizer_config(),
"senter": default_senter_config(),
+ "simple_ner": default_simple_ner_config(),
"tensorizer": default_tensorizer_config(),
"tok2vec": default_tok2vec_config(),
}
diff --git a/spacy/ml/_biluo.py b/spacy/ml/_biluo.py
new file mode 100644
index 000000000..28339089a
--- /dev/null
+++ b/spacy/ml/_biluo.py
@@ -0,0 +1,109 @@
+"""Thinc layer to do simpler transition-based parsing, NER, etc."""
+from typing import List, Tuple, Dict, Optional
+import numpy
+from thinc.api import Ops, Model, with_array, softmax_activation, padded2list
+from thinc.api import to_numpy
+from thinc.types import Padded, Ints1d, Ints3d, Floats2d, Floats3d
+
+from ..tokens import Doc
+
+
+def BILUO() -> Model[Padded, Padded]:
+ return Model(
+ "biluo",
+ forward,
+ init=init,
+ dims={"nO": None},
+ attrs={"get_num_actions": get_num_actions}
+ )
+
+
+def init(model, X: Optional[Padded]=None, Y: Optional[Padded]=None):
+ if X is not None and Y is not None:
+ if X.data.shape != Y.data.shape:
+ # TODO: Fix error
+ raise ValueError("Mismatched shapes (TODO: Fix message)")
+ model.set_dim("nO", X.data.shape[2])
+ elif X is not None:
+ model.set_dim("nO", X.data.shape[2])
+ elif Y is not None:
+ model.set_dim("nO", Y.data.shape[2])
+ elif model.get_dim("nO") is None:
+ raise ValueError("Dimension unset for BILUO: nO")
+
+
+def forward(model: Model[Padded, Padded], Xp: Padded, is_train: bool):
+ n_labels = (model.get_dim("nO") - 1) // 4
+ n_tokens, n_docs, n_actions = Xp.data.shape
+ # At each timestep, we make a validity mask of shape (n_docs, n_actions)
+ # to indicate which actions are valid next for each sequence. To construct
+ # the mask, we have a state of shape (2, n_actions) and a validity table of
+ # shape (2, n_actions+1, n_actions). The first dimension of the state indicates
+ # whether it's the last token, the second dimension indicates the previous
+ # action, plus a special 'null action' for the first entry.
+ valid_transitions = model.ops.asarray(_get_transition_table(n_labels))
+ prev_actions = model.ops.alloc1i(n_docs)
+ # Initialize as though prev action was O
+ prev_actions.fill(n_actions - 1)
+ Y = model.ops.alloc3f(*Xp.data.shape)
+ masks = model.ops.alloc3f(*Y.shape)
+ max_value = Xp.data.max()
+ for t in range(Xp.data.shape[0]):
+ is_last = (Xp.lengths < (t+2)).astype("i")
+ masks[t] = valid_transitions[is_last, prev_actions]
+ # Don't train the out-of-bounds sequences.
+ masks[t, Xp.size_at_t[t]:] = 0
+ # Valid actions get 0*10e8, invalid get large negative value
+ Y[t] = Xp.data[t] + ((masks[t]-1) * max_value * 10)
+ prev_actions = Y[t].argmax(axis=-1)
+
+ def backprop_biluo(dY: Padded) -> Padded:
+ dY.data *= masks
+ return dY
+
+ return Padded(Y, Xp.size_at_t, Xp.lengths, Xp.indices), backprop_biluo
+
+
+def get_num_actions(n_labels: int) -> int:
+ # One BEGIN action per label
+ # One IN action per label
+ # One LAST action per label
+ # One UNIT action per label
+ # One OUT action
+ return n_labels + n_labels + n_labels + n_labels + 1
+
+
+def _get_transition_table(
+ n_labels: int, *, _cache: Dict[int, Floats3d] = {}
+) -> Floats3d:
+ n_actions = get_num_actions(n_labels)
+ if n_actions in _cache:
+ return _cache[n_actions]
+ table = numpy.zeros((2, n_actions, n_actions), dtype="f")
+ B_start, B_end = (0, n_labels)
+ I_start, I_end = (B_end, B_end + n_labels)
+ L_start, L_end = (I_end, I_end + n_labels)
+ U_start, U_end = (L_end, L_end + n_labels)
+ # Using ranges allows us to set specific cells, which is necessary to express
+ # that only actions of the same label are valid continuations.
+ B_range = numpy.arange(B_start, B_end)
+ I_range = numpy.arange(I_start, I_end)
+ L_range = numpy.arange(L_start, L_end)
+ O_action = U_end
+ # If this is the last token and the previous action was B or I, only L
+ # of that label is valid
+ table[1, B_range, L_range] = 1
+ table[1, I_range, L_range] = 1
+ # If this isn't the last token and the previous action was B or I, only I or
+ # L of that label are valid.
+ table[0, B_range, I_range] = 1
+ table[0, B_range, L_range] = 1
+ table[0, I_range, I_range] = 1
+ table[0, I_range, L_range] = 1
+ # If this isn't the last token and the previous was L, U or O, B is valid
+ table[0, L_start:, :B_end] = 1
+ # Regardless of whether this is the last token, if the previous action was
+ # {L, U, O}, U and O are valid.
+ table[:, L_start:, U_start:] = 1
+ _cache[n_actions] = table
+ return table
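A quick numerical check of the BILUO action layout described in the comments above: four per-label actions (B, I, L, U) plus a single O action, ordered B..., I..., L..., U..., O:
```python
# Check of the BILUO action inventory: 4 * n_labels + 1 actions in B, I, L, U, O order.
labels = ["ORG", "PERSON"]
actions = (
    [f"B-{label}" for label in labels]
    + [f"I-{label}" for label in labels]
    + [f"L-{label}" for label in labels]
    + [f"U-{label}" for label in labels]
    + ["O"]
)
assert len(actions) == 4 * len(labels) + 1 == 9
```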
diff --git a/spacy/ml/_iob.py b/spacy/ml/_iob.py
new file mode 100644
index 000000000..0ce9a71e6
--- /dev/null
+++ b/spacy/ml/_iob.py
@@ -0,0 +1,92 @@
+"""Thinc layer to do simpler transition-based parsing, NER, etc."""
+from typing import List, Tuple, Dict, Optional
+from thinc.api import Ops, Model, with_array, softmax_activation, padded2list
+from thinc.types import Padded, Ints1d, Ints3d, Floats2d, Floats3d
+
+from ..tokens import Doc
+
+
+def IOB() -> Model[Padded, Padded]:
+ return Model(
+ "biluo",
+ forward,
+ init=init,
+ dims={"nO": None},
+ attrs={"get_num_actions": get_num_actions}
+ )
+
+
+def init(model, X: Optional[Padded]=None, Y: Optional[Padded]=None):
+ if X is not None and Y is not None:
+ if X.data.shape != Y.data.shape:
+ # TODO: Fix error
+ raise ValueError("Mismatched shapes (TODO: Fix message)")
+ model.set_dim("nO", X.data.shape[2])
+ elif X is not None:
+ model.set_dim("nO", X.data.shape[2])
+ elif Y is not None:
+ model.set_dim("nO", Y.data.shape[2])
+ elif model.get_dim("nO") is None:
+ raise ValueError("Dimension unset for BILUO: nO")
+
+
+def forward(model: Model[Padded, Padded], Xp: Padded, is_train: bool):
+ n_labels = (model.get_dim("nO") - 1) // 2
+ n_tokens, n_docs, n_actions = Xp.data.shape
+ # At each timestep, we make a validity mask of shape (n_docs, n_actions)
+ # to indicate which actions are valid next for each sequence. To construct
+ # the mask, we have a state of shape (2, n_actions) and a validity table of
+ # shape (2, n_actions+1, n_actions). The first dimension of the state indicates
+ # whether it's the last token, the second dimension indicates the previous
+ # action, plus a special 'null action' for the first entry.
+ valid_transitions = _get_transition_table(model.ops, n_labels)
+ prev_actions = model.ops.alloc1i(n_docs)
+ # Initialize as though prev action was O
+ prev_actions.fill(n_actions - 1)
+ Y = model.ops.alloc3f(*Xp.data.shape)
+ masks = model.ops.alloc3f(*Y.shape)
+ for t in range(Xp.data.shape[0]):
+ masks[t] = valid_transitions[prev_actions]
+ # Don't train the out-of-bounds sequences.
+ masks[t, Xp.size_at_t[t]:] = 0
+ # Valid actions get 0*10e8, invalid get -1*10e8
+ Y[t] = Xp.data[t] + ((masks[t]-1) * 10e8)
+ prev_actions = Y[t].argmax(axis=-1)
+
+ def backprop_biluo(dY: Padded) -> Padded:
+ # Masking the gradient seems to do poorly here. But why?
+ #dY.data *= masks
+ return dY
+
+ return Padded(Y, Xp.size_at_t, Xp.lengths, Xp.indices), backprop_biluo
+
+
+def get_num_actions(n_labels: int) -> int:
+ # For IOB tagging there are only two per-label actions:
+ # One BEGIN action per label
+ # One IN action per label
+ # plus a single OUT action, giving n_labels * 2 + 1.
+ # (The LAST and UNIT actions from the BILUO scheme do not apply here.)
+ return n_labels * 2 + 1
+
+
+def _get_transition_table(
+ ops: Ops, n_labels: int, _cache: Dict[int, Floats3d] = {}
+) -> Floats3d:
+ n_actions = get_num_actions(n_labels)
+ if n_actions in _cache:
+ return ops.asarray(_cache[n_actions])
+ table = ops.alloc2f(n_actions, n_actions)
+ B_start, B_end = (0, n_labels)
+ I_start, I_end = (B_end, B_end + n_labels)
+ O_action = I_end
+ B_range = ops.xp.arange(B_start, B_end)
+ I_range = ops.xp.arange(I_start, I_end)
+ # B and O are always valid
+ table[:, B_start : B_end] = 1
+ table[:, O_action] = 1
+ # I can only follow a matching B
+ table[B_range, I_range] = 1
+
+ _cache[n_actions] = table
+ return table
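A numpy rendering of the IOB validity table built above, mirroring the rules encoded in the diff: `B-*` and `O` are always valid, and `I-X` is marked valid only directly after `B-X`:
```python
# Numpy mirror of the IOB transition table from the diff, for inspection only.
import numpy as np

def iob_transition_table(n_labels):
    n_actions = n_labels * 2 + 1
    table = np.zeros((n_actions, n_actions), dtype="f")
    B_range = np.arange(0, n_labels)
    I_range = np.arange(n_labels, 2 * n_labels)
    table[:, :n_labels] = 1       # B-* always valid
    table[:, n_actions - 1] = 1   # O always valid
    table[B_range, I_range] = 1   # I-X valid after B-X
    return table

print(iob_transition_table(2))
```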
diff --git a/spacy/ml/_precomputable_affine.py b/spacy/ml/_precomputable_affine.py
index ec95cdafd..f4b5b16fe 100644
--- a/spacy/ml/_precomputable_affine.py
+++ b/spacy/ml/_precomputable_affine.py
@@ -9,7 +9,6 @@ def PrecomputableAffine(nO, nI, nF, nP):
dims={"nO": nO, "nI": nI, "nF": nF, "nP": nP},
params={"W": None, "b": None, "pad": None},
)
- model.initialize()
return model
@@ -110,8 +109,7 @@ def init(model, X=None, Y=None):
pad = model.ops.alloc4f(1, nF, nO, nP)
ops = model.ops
- scale = float(ops.xp.sqrt(1.0 / (nF * nI)))
- W = normal_init(ops, W.shape, mean=scale)
+ W = normal_init(ops, W.shape, mean=float(ops.xp.sqrt(1.0 / nF * nI)))
model.set_param("W", W)
model.set_param("b", b)
model.set_param("pad", pad)
diff --git a/spacy/ml/models/__init__.py b/spacy/ml/models/__init__.py
index d44c7cb2e..ef1e8efca 100644
--- a/spacy/ml/models/__init__.py
+++ b/spacy/ml/models/__init__.py
@@ -1,5 +1,6 @@
from .entity_linker import * # noqa
from .parser import * # noqa
+from .simple_ner import *
from .tagger import * # noqa
from .tensorizer import * # noqa
from .textcat import * # noqa
diff --git a/spacy/ml/models/defaults/__init__.py b/spacy/ml/models/defaults/__init__.py
index d5490fd16..850d9fce0 100644
--- a/spacy/ml/models/defaults/__init__.py
+++ b/spacy/ml/models/defaults/__init__.py
@@ -91,3 +91,13 @@ def default_tok2vec_config():
def default_tok2vec():
loc = Path(__file__).parent / "tok2vec_defaults.cfg"
return util.load_config(loc, create_objects=True)["model"]
+
+
+def default_simple_ner_config():
+ loc = Path(__file__).parent / "simple_ner_defaults.cfg"
+ return util.load_config(loc, create_objects=False)
+
+
+def default_simple_ner():
+ loc = Path(__file__).parent / "simple_ner_defaults.cfg"
+ return util.load_config(loc, create_objects=True)["model"]
diff --git a/spacy/ml/models/defaults/simple_ner_defaults.cfg b/spacy/ml/models/defaults/simple_ner_defaults.cfg
new file mode 100644
index 000000000..4e3b640df
--- /dev/null
+++ b/spacy/ml/models/defaults/simple_ner_defaults.cfg
@@ -0,0 +1,12 @@
+[model]
+@architectures = "spacy.BiluoTagger.v1"
+
+[model.tok2vec]
+@architectures = "spacy.HashEmbedCNN.v1"
+pretrained_vectors = null
+width = 128
+depth = 4
+embed_size = 7000
+window_size = 1
+maxout_pieces = 3
+subword_features = true
diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py
index f2d51c2ba..710d36a1d 100644
--- a/spacy/ml/models/parser.py
+++ b/spacy/ml/models/parser.py
@@ -1,9 +1,9 @@
from pydantic import StrictInt
-from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops
+from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops, with_array
from ...util import registry
from .._precomputable_affine import PrecomputableAffine
-from ...syntax._parser_model import ParserModel
+from ..tb_framework import TransitionModel
@registry.architectures.register("spacy.TransitionBasedParser.v1")
@@ -12,21 +12,27 @@ def build_tb_parser_model(
nr_feature_tokens: StrictInt,
hidden_width: StrictInt,
maxout_pieces: StrictInt,
+ use_upper=True,
nO=None,
):
token_vector_width = tok2vec.get_dim("nO")
- tok2vec = chain(tok2vec, list2array())
- tok2vec.set_dim("nO", token_vector_width)
+ tok2vec = chain(
+ tok2vec,
+ with_array(Linear(hidden_width, token_vector_width)),
+ list2array(),
+ )
+ tok2vec.set_dim("nO", hidden_width)
lower = PrecomputableAffine(
- nO=hidden_width,
+ nO=hidden_width if use_upper else nO,
nF=nr_feature_tokens,
nI=tok2vec.get_dim("nO"),
- nP=maxout_pieces,
+ nP=maxout_pieces
)
- lower.set_dim("nP", maxout_pieces)
- with use_ops("numpy"):
- # Initialize weights at zero, as it's a classification layer.
- upper = Linear(nO=nO, init_W=zero_init)
- model = ParserModel(tok2vec, lower, upper)
- return model
+ if use_upper:
+ with use_ops("numpy"):
+ # Initialize weights at zero, as it's a classification layer.
+ upper = Linear(nO=nO, init_W=zero_init)
+ else:
+ upper = None
+ return TransitionModel(tok2vec, lower, upper)
diff --git a/spacy/ml/models/simple_ner.py b/spacy/ml/models/simple_ner.py
new file mode 100644
index 000000000..01661f55b
--- /dev/null
+++ b/spacy/ml/models/simple_ner.py
@@ -0,0 +1,82 @@
+import functools
+from typing import List, Tuple, Dict, Optional
+from thinc.api import Ops, Model, Linear, Softmax, with_array, softmax_activation, padded2list
+from thinc.api import chain, list2padded, configure_normal_init
+from thinc.api import Dropout
+from thinc.types import Padded, Ints1d, Ints3d, Floats2d, Floats3d
+
+from ...tokens import Doc
+from .._biluo import BILUO
+from .._iob import IOB
+from ...util import registry
+
+
+@registry.architectures.register("spacy.BiluoTagger.v1")
+def BiluoTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], List[Floats2d]]:
+ biluo = BILUO()
+ linear = Linear(
+ nO=None,
+ nI=tok2vec.get_dim("nO"),
+ init_W=configure_normal_init(mean=0.02)
+ )
+ model = chain(
+ tok2vec,
+ list2padded(),
+ with_array(chain(Dropout(0.1), linear)),
+ biluo,
+ with_array(softmax_activation()),
+ padded2list()
+ )
+
+ return Model(
+ "biluo-tagger",
+ forward,
+ init=init,
+ layers=[model, linear],
+ refs={"tok2vec": tok2vec, "linear": linear, "biluo": biluo},
+ dims={"nO": None},
+ attrs={"get_num_actions": biluo.attrs["get_num_actions"]}
+ )
+
+@registry.architectures.register("spacy.IOBTagger.v1")
+def IOBTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], List[Floats2d]]:
+ biluo = IOB()
+ linear = Linear(nO=None, nI=tok2vec.get_dim("nO"))
+ model = chain(
+ tok2vec,
+ list2padded(),
+ with_array(linear),
+ biluo,
+ with_array(softmax_activation()),
+ padded2list()
+ )
+
+ return Model(
+ "iob-tagger",
+ forward,
+ init=init,
+ layers=[model],
+ refs={"tok2vec": tok2vec, "linear": linear, "biluo": biluo},
+ dims={"nO": None},
+ attrs={"get_num_actions": biluo.attrs["get_num_actions"]}
+ )
+
+
+
+def init(model: Model[List[Doc], List[Floats2d]], X=None, Y=None) -> None:
+ if model.get_dim("nO") is None and Y:
+ model.set_dim("nO", Y[0].shape[1])
+ nO = model.get_dim("nO")
+ biluo = model.get_ref("biluo")
+ linear = model.get_ref("linear")
+ biluo.set_dim("nO", nO)
+ if linear.has_dim("nO") is None:
+ linear.set_dim("nO", nO)
+ model.layers[0].initialize(X=X, Y=Y)
+
+
+def forward(model: Model, X: List[Doc], is_train: bool):
+ return model.layers[0](X, is_train)
+
+
+__all__ = ["BiluoTagger"]
diff --git a/spacy/ml/models/tagger.py b/spacy/ml/models/tagger.py
index baca325bd..683c8b518 100644
--- a/spacy/ml/models/tagger.py
+++ b/spacy/ml/models/tagger.py
@@ -1,4 +1,5 @@
-from thinc.api import zero_init, with_array, Softmax, chain, Model
+from thinc.api import zero_init, with_array, Softmax, chain, Model, Dropout
+from thinc.api import glorot_uniform_init
from ...util import registry
@@ -11,6 +12,6 @@ def build_tagger_model(tok2vec, nO=None) -> Model:
softmax = with_array(output_layer)
model = chain(tok2vec, softmax)
model.set_ref("tok2vec", tok2vec)
- model.set_ref("softmax", softmax)
+ model.set_ref("softmax", output_layer)
model.set_ref("output_layer", output_layer)
return model
diff --git a/spacy/ml/tb_framework.py b/spacy/ml/tb_framework.py
new file mode 100644
index 000000000..e4301a644
--- /dev/null
+++ b/spacy/ml/tb_framework.py
@@ -0,0 +1,86 @@
+from thinc.api import Model, noop, use_ops, Linear
+from ..syntax._parser_model import ParserStepModel
+
+
+def TransitionModel(tok2vec, lower, upper, unseen_classes=set()):
+ """Set up a stepwise transition-based model"""
+ if upper is None:
+ has_upper = False
+ upper = noop()
+ else:
+ has_upper = True
+ # don't define nO for this object, because we can't dynamically change it
+ return Model(
+ name="parser_model",
+ forward=forward,
+ dims={"nI": tok2vec.get_dim("nI") if tok2vec.has_dim("nI") else None},
+ layers=[tok2vec, lower, upper],
+ refs={"tok2vec": tok2vec, "lower": lower, "upper": upper},
+ init=init,
+ attrs={
+ "has_upper": has_upper,
+ "unseen_classes": set(unseen_classes),
+ "resize_output": resize_output
+ }
+ )
+
+
+def forward(model, X, is_train):
+ step_model = ParserStepModel(
+ X,
+ model.layers,
+ unseen_classes=model.attrs["unseen_classes"],
+ train=is_train,
+ has_upper=model.attrs["has_upper"]
+ )
+
+ return step_model, step_model.finish_steps
+
+
+def init(model, X=None, Y=None):
+ tok2vec = model.get_ref("tok2vec").initialize()
+ lower = model.get_ref("lower").initialize(X=X)
+ if model.attrs["has_upper"]:
+ statevecs = model.ops.alloc2f(2, lower.get_dim("nO"))
+ model.get_ref("upper").initialize(X=statevecs)
+
+
+def resize_output(model, new_nO):
+ tok2vec = model.get_ref("tok2vec")
+ lower = model.get_ref("lower")
+ upper = model.get_ref("upper")
+ if not model.attrs["has_upper"]:
+ if lower.has_dim("nO") is None:
+ lower.set_dim("nO", new_nO)
+ return
+ elif upper.has_dim("nO") is None:
+ upper.set_dim("nO", new_nO)
+ return
+ elif new_nO == upper.get_dim("nO"):
+ return
+ smaller = upper
+ nI = None
+ if smaller.has_dim("nI"):
+ nI = smaller.get_dim("nI")
+ with use_ops('numpy'):
+ larger = Linear(nO=new_nO, nI=nI)
+ larger.init = smaller.init
+ # it could be that the model is not initialized yet, then skip this bit
+ if nI:
+ larger_W = larger.ops.alloc2f(new_nO, nI)
+ larger_b = larger.ops.alloc1f(new_nO)
+ smaller_W = smaller.get_param("W")
+ smaller_b = smaller.get_param("b")
+ # Weights are stored in (nr_out, nr_in) format, so we're basically
+ # just adding rows here.
+ if smaller.has_dim("nO"):
+ larger_W[:smaller.get_dim("nO")] = smaller_W
+ larger_b[:smaller.get_dim("nO")] = smaller_b
+ for i in range(smaller.get_dim("nO"), new_nO):
+ model.attrs["unseen_classes"].add(i)
+
+ larger.set_param("W", larger_W)
+ larger.set_param("b", larger_b)
+ model._layers[-1] = larger
+ model.set_ref("upper", larger)
+ return model
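For reference, a standalone numpy sketch of the row-growing trick in `resize_output()` above: when the class inventory grows, copy the old `(nO, nI)` weights into the top rows of a larger matrix and record the new rows as unseen classes:
```python
# Standalone illustration of growing a linear layer's output rows, as in resize_output().
import numpy as np

def grow_output(W, b, new_nO):
    old_nO, nI = W.shape
    larger_W = np.zeros((new_nO, nI), dtype=W.dtype)
    larger_b = np.zeros((new_nO,), dtype=b.dtype)
    larger_W[:old_nO] = W
    larger_b[:old_nO] = b
    unseen = set(range(old_nO, new_nO))
    return larger_W, larger_b, unseen

W2, b2, unseen = grow_output(np.ones((3, 4), dtype="f"), np.ones(3, dtype="f"), 5)
assert W2.shape == (5, 4) and unseen == {3, 4}
```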
diff --git a/spacy/pipeline/__init__.py b/spacy/pipeline/__init__.py
index 6a90de81c..b2866bad2 100644
--- a/spacy/pipeline/__init__.py
+++ b/spacy/pipeline/__init__.py
@@ -1,6 +1,7 @@
from .pipes import Tagger, DependencyParser, EntityRecognizer, EntityLinker
from .pipes import TextCategorizer, Tensorizer, Pipe, Sentencizer
from .pipes import SentenceRecognizer
+from .simple_ner import SimpleNER
from .morphologizer import Morphologizer
from .entityruler import EntityRuler
from .tok2vec import Tok2Vec
@@ -22,6 +23,7 @@ __all__ = [
"SentenceSegmenter",
"SentenceRecognizer",
"SimilarityHook",
+ "SimpleNER",
"merge_entities",
"merge_noun_chunks",
"merge_subtokens",
diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index 1a0812442..61db11baa 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -3,7 +3,7 @@ import numpy
import srsly
import random
from thinc.api import CosineDistance, to_categorical, get_array_module
-from thinc.api import set_dropout_rate
+from thinc.api import set_dropout_rate, SequenceCategoricalCrossentropy
import warnings
from ..tokens.doc cimport Doc
@@ -464,6 +464,9 @@ class Tagger(Pipe):
return
set_dropout_rate(self.model, drop)
tag_scores, bp_tag_scores = self.model.begin_update([ex.doc for ex in examples])
+ for sc in tag_scores:
+ if self.model.ops.xp.isnan(sc.sum()):
+ raise ValueError("nan value in scores")
loss, d_tag_scores = self.get_loss(examples, tag_scores)
bp_tag_scores(d_tag_scores)
if sgd not in (None, False):
@@ -497,29 +500,11 @@ class Tagger(Pipe):
losses[self.name] += (gradient**2).sum()
def get_loss(self, examples, scores):
- scores = self.model.ops.flatten(scores)
- tag_index = {tag: i for i, tag in enumerate(self.labels)}
- cdef int idx = 0
- correct = numpy.zeros((scores.shape[0],), dtype="i")
- guesses = scores.argmax(axis=1)
- known_labels = numpy.ones((scores.shape[0], 1), dtype="f")
- for ex in examples:
- gold = ex.gold
- for tag in gold.tags:
- if tag is None:
- correct[idx] = guesses[idx]
- elif tag in tag_index:
- correct[idx] = tag_index[tag]
- else:
- correct[idx] = 0
- known_labels[idx] = 0.
- idx += 1
- correct = self.model.ops.xp.array(correct, dtype="i")
- d_scores = scores - to_categorical(correct, n_classes=scores.shape[1])
- d_scores *= self.model.ops.asarray(known_labels)
- loss = (d_scores**2).sum()
- docs = [ex.doc for ex in examples]
- d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
+ loss_func = SequenceCategoricalCrossentropy(names=self.labels)
+ truths = [eg.gold.tags for eg in examples]
+ d_scores, loss = loss_func(scores, truths)
+ if self.model.ops.xp.isnan(loss):
+ raise ValueError("nan value when computing loss")
return float(loss), d_scores
def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None,
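A tiny illustration of the new loss path in `Tagger.get_loss()` above, following the call pattern used in this patch: thinc's `SequenceCategoricalCrossentropy` maps string labels to columns via `names` and returns `(gradients, loss)` for a list of per-doc score arrays. The exact thinc signature may differ in other versions; this mirrors the usage in the diff:
```python
# Mirrors the loss call pattern from the diff; values are illustrative.
import numpy
from thinc.api import SequenceCategoricalCrossentropy

names = ["NOUN", "VERB", "PUNCT"]
loss_func = SequenceCategoricalCrossentropy(names=names)
scores = [numpy.asarray([[0.7, 0.2, 0.1], [0.1, 0.8, 0.1]], dtype="f")]
truths = [["NOUN", "VERB"]]
d_scores, loss = loss_func(scores, truths)
```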
diff --git a/spacy/pipeline/simple_ner.py b/spacy/pipeline/simple_ner.py
new file mode 100644
index 000000000..8d53152d8
--- /dev/null
+++ b/spacy/pipeline/simple_ner.py
@@ -0,0 +1,149 @@
+from typing import List
+from thinc.types import Floats2d
+from thinc.api import SequenceCategoricalCrossentropy, set_dropout_rate
+from thinc.util import to_numpy
+from ..gold import Example, spans_from_biluo_tags, iob_to_biluo, biluo_to_iob
+from ..tokens import Doc
+from ..language import component
+from ..util import link_vectors_to_models
+from .pipes import Pipe
+
+
+@component("simple_ner", assigns=["doc.ents"])
+class SimpleNER(Pipe):
+ """Named entity recognition with a tagging model. The model should include
+ validity constraints to ensure that only valid tag sequences are returned."""
+
+ def __init__(self, vocab, model):
+ self.vocab = vocab
+ self.model = model
+ self.cfg = {"labels": []}
+ self.loss_func = SequenceCategoricalCrossentropy(
+ names=self.get_tag_names(),
+ normalize=True,
+ missing_value=None
+ )
+ assert self.model is not None
+
+ @property
+ def labels(self):
+ return self.cfg["labels"]
+
+ @property
+ def is_biluo(self):
+ return self.model.name.startswith("biluo")
+
+ def add_label(self, label):
+ if label not in self.cfg["labels"]:
+ self.cfg["labels"].append(label)
+
+ def get_tag_names(self):
+ if self.is_biluo:
+ return (
+ [f"B-{label}" for label in self.labels] +
+ [f"I-{label}" for label in self.labels] +
+ [f"L-{label}" for label in self.labels] +
+ [f"U-{label}" for label in self.labels] +
+ ["O"]
+ )
+ else:
+ return (
+ [f"B-{label}" for label in self.labels] +
+ [f"I-{label}" for label in self.labels] +
+ ["O"]
+ )
+
+ def predict(self, docs: List[Doc]) -> List[Floats2d]:
+ scores = self.model.predict(docs)
+ return scores
+
+ def set_annotations(self, docs: List[Doc], scores: List[Floats2d], tensors=None):
+ """Set entities on a batch of documents from a batch of scores."""
+ tag_names = self.get_tag_names()
+ for i, doc in enumerate(docs):
+ actions = to_numpy(scores[i].argmax(axis=1))
+ tags = [tag_names[actions[j]] for j in range(len(doc))]
+ if not self.is_biluo:
+ tags = iob_to_biluo(tags)
+ doc.ents = spans_from_biluo_tags(doc, tags)
+
+ def update(self, examples, set_annotations=False, drop=0.0, sgd=None, losses=None):
+ if not any(_has_ner(eg) for eg in examples):
+ return 0
+ examples = Example.to_example_objects(examples)
+ docs = [ex.doc for ex in examples]
+ set_dropout_rate(self.model, drop)
+ scores, bp_scores = self.model.begin_update(docs)
+ loss, d_scores = self.get_loss(examples, scores)
+ bp_scores(d_scores)
+ if set_annotations:
+ self.set_annotations(docs, scores)
+ if sgd is not None:
+ self.model.finish_update(sgd)
+ if losses is not None:
+ losses.setdefault("ner", 0.0)
+ losses["ner"] += loss
+ return loss
+
+ def get_loss(self, examples, scores):
+ loss = 0
+ d_scores = []
+ truths = []
+ for eg in examples:
+ gold_tags = [(tag if tag != "-" else None) for tag in eg.gold.ner]
+ if not self.is_biluo:
+ gold_tags = biluo_to_iob(gold_tags)
+ truths.append(gold_tags)
+ for i in range(len(scores)):
+ if len(scores[i]) != len(truths[i]):
+ raise ValueError(
+ f"Mismatched output and gold sizes.\n"
+ f"Output: {len(scores[i])}, gold: {len(truths[i])}."
+ f"Input: {len(examples[i].doc)}"
+ )
+ d_scores, loss = self.loss_func(scores, truths)
+ return loss, d_scores
+
+ def begin_training(self, get_examples, pipeline=None, sgd=None, **kwargs):
+ self.cfg.update(kwargs)
+ if not hasattr(get_examples, '__call__'):
+ gold_tuples = get_examples
+ get_examples = lambda: gold_tuples
+ labels = _get_labels(get_examples())
+ for label in _get_labels(get_examples()):
+ self.add_label(label)
+ labels = self.labels
+ n_actions = self.model.attrs["get_num_actions"](len(labels))
+ self.model.set_dim("nO", n_actions)
+ self.model.initialize()
+ if pipeline is not None:
+ self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **self.cfg)
+ link_vectors_to_models(self.vocab)
+ self.loss_func = SequenceCategoricalCrossentropy(
+ names=self.get_tag_names(),
+ normalize=True,
+ missing_value=None
+ )
+
+ return sgd
+
+ def init_multitask_objectives(self, *args, **kwargs):
+ pass
+
+
+def _has_ner(eg):
+ for ner_tag in eg.gold.ner:
+ if ner_tag != "-" and ner_tag != None:
+ return True
+ else:
+ return False
+
+
+def _get_labels(examples):
+ labels = set()
+ for eg in examples:
+ for ner_tag in eg.token_annotation.entities:
+ if ner_tag != 'O' and ner_tag != '-':
+ _, label = ner_tag.split('-', 1)
+ labels.add(label)
+ return list(sorted(labels))
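A minimal end-to-end sketch of the new `simple_ner` component, following the pattern in `examples/training/train_ner.py` earlier in this patch. It assumes the nightly v3 training API at this point in the series and is not a released interface:
```python
# Hedged end-to-end sketch of the simple_ner pipe (nightly v3 API at this point).
import random
import spacy
from spacy.util import minibatch

TRAIN_DATA = [
    ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
    ("I like London and Berlin.", {"entities": [(7, 13, "LOC"), (18, 24, "LOC")]}),
]

nlp = spacy.blank("en")
ner = nlp.create_pipe("simple_ner")
nlp.add_pipe(ner)
for _, annotations in TRAIN_DATA:
    for _, _, label in annotations["entities"]:
        ner.add_label(label)
nlp.begin_training()
for itn in range(20):
    random.shuffle(TRAIN_DATA)
    losses = {}
    for batch in minibatch(TRAIN_DATA, size=2):
        nlp.update(batch, drop=0.0, losses=losses)
    print("Losses", losses)
```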
diff --git a/spacy/syntax/_parser_model.pyx b/spacy/syntax/_parser_model.pyx
index 4f4e5e4b0..69f5bd6f6 100644
--- a/spacy/syntax/_parser_model.pyx
+++ b/spacy/syntax/_parser_model.pyx
@@ -12,7 +12,7 @@ cimport blis.cy
import numpy
import numpy.random
-from thinc.api import Linear, Model, CupyOps, NumpyOps, use_ops
+from thinc.api import Linear, Model, CupyOps, NumpyOps, use_ops, noop
from ..typedefs cimport weight_t, class_t, hash_t
from ..tokens.doc cimport Doc
@@ -219,112 +219,27 @@ cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) no
return best
-class ParserModel(Model):
- def __init__(self, tok2vec, lower_model, upper_model, unseen_classes=None):
- # don't define nO for this object, because we can't dynamically change it
- Model.__init__(self, name="parser_model", forward=forward, dims={"nI": None})
- if tok2vec.has_dim("nI"):
- self.set_dim("nI", tok2vec.get_dim("nI"))
- self._layers = [tok2vec, lower_model]
- if upper_model is not None:
- self._layers.append(upper_model)
- self.unseen_classes = set()
- if unseen_classes:
- for class_ in unseen_classes:
- self.unseen_classes.add(class_)
- self.set_ref("tok2vec", tok2vec)
-
- def predict(self, docs):
- step_model = ParserStepModel(docs, self._layers,
- unseen_classes=self.unseen_classes, train=False)
- return step_model
-
- def resize_output(self, new_nO):
- if len(self._layers) == 2:
- return
- if self.upper.has_dim("nO") and (new_nO == self.upper.get_dim("nO")):
- return
- smaller = self.upper
- nI = None
- if smaller.has_dim("nI"):
- nI = smaller.get_dim("nI")
- with use_ops('numpy'):
- larger = Linear(nO=new_nO, nI=nI)
- larger.init = smaller.init
- # it could be that the model is not initialized yet, then skip this bit
- if nI:
- larger_W = larger.ops.alloc2f(new_nO, nI)
- larger_b = larger.ops.alloc1f(new_nO)
- smaller_W = smaller.get_param("W")
- smaller_b = smaller.get_param("b")
- # Weights are stored in (nr_out, nr_in) format, so we're basically
- # just adding rows here.
- if smaller.has_dim("nO"):
- larger_W[:smaller.get_dim("nO")] = smaller_W
- larger_b[:smaller.get_dim("nO")] = smaller_b
- for i in range(smaller.get_dim("nO"), new_nO):
- self.unseen_classes.add(i)
-
- larger.set_param("W", larger_W)
- larger.set_param("b", larger_b)
- self._layers[-1] = larger
-
- def initialize(self, X=None, Y=None):
- self.tok2vec.initialize()
- self.lower.initialize(X=X, Y=Y)
- if self.upper is not None:
- # In case we need to trigger the callbacks
- statevecs = self.ops.alloc((2, self.lower.get_dim("nO")))
- self.upper.initialize(X=statevecs)
-
- def finish_update(self, optimizer):
- self.tok2vec.finish_update(optimizer)
- self.lower.finish_update(optimizer)
- if self.upper is not None:
- self.upper.finish_update(optimizer)
-
- @property
- def tok2vec(self):
- return self._layers[0]
-
- @property
- def lower(self):
- return self._layers[1]
-
- @property
- def upper(self):
- return self._layers[2]
-
-
-def forward(model:ParserModel, X, is_train):
- step_model = ParserStepModel(X, model._layers, unseen_classes=model.unseen_classes,
- train=is_train)
-
- return step_model, step_model.finish_steps
-
class ParserStepModel(Model):
- def __init__(self, docs, layers, unseen_classes=None, train=True):
+ def __init__(self, docs, layers, *, has_upper, unseen_classes=None, train=True):
Model.__init__(self, name="parser_step_model", forward=step_forward)
+ self.attrs["has_upper"] = has_upper
self.tokvecs, self.bp_tokvecs = layers[0](docs, is_train=train)
if layers[1].get_dim("nP") >= 2:
activation = "maxout"
- elif len(layers) == 2:
+ elif has_upper:
activation = None
else:
activation = "relu"
self.state2vec = precompute_hiddens(len(docs), self.tokvecs, layers[1],
activation=activation, train=train)
- if len(layers) == 3:
+ if has_upper:
self.vec2scores = layers[-1]
else:
self.vec2scores = None
self.cuda_stream = util.get_cuda_stream(non_blocking=True)
self.backprops = []
- if self.vec2scores is None:
- self._class_mask = numpy.zeros((self.state2vec.nO,), dtype='f')
- else:
- self._class_mask = numpy.zeros((self.vec2scores.get_dim("nO"),), dtype='f')
+ self._class_mask = numpy.zeros((self.nO,), dtype='f')
self._class_mask.fill(1)
if unseen_classes is not None:
for class_ in unseen_classes:
@@ -332,7 +247,10 @@ class ParserStepModel(Model):
@property
def nO(self):
- return self.state2vec.nO
+ if self.attrs["has_upper"]:
+ return self.vec2scores.get_dim("nO")
+ else:
+ return self.state2vec.get_dim("nO")
def class_is_unseen(self, class_):
return self._class_mask[class_]
@@ -378,7 +296,7 @@ class ParserStepModel(Model):
def step_forward(model: ParserStepModel, states, is_train):
token_ids = model.get_token_ids(states)
vector, get_d_tokvecs = model.state2vec(token_ids, is_train)
- if model.vec2scores is not None:
+ if model.attrs["has_upper"]:
scores, get_d_vector = model.vec2scores(vector, is_train)
else:
scores = NumpyOps().asarray(vector)
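Editor's note: the hunk above drops the dedicated ParserModel wrapper and switches the step model's behaviour through attrs["has_upper"] instead. A minimal sketch of that dispatch (illustrative only; the function name and the backward lambda are assumptions, not the spaCy code):

import numpy

def step_forward_sketch(model, vector, is_train):
    # The step model is assumed to carry attrs["has_upper"] and, when it
    # is True, a vec2scores output layer; otherwise the hidden vectors
    # are used as the scores directly, as in the diff above.
    if model.attrs["has_upper"]:
        scores, get_d_vector = model.vec2scores(vector, is_train)
    else:
        scores = numpy.asarray(vector)
        # Backward pass is assumed to pass the gradient straight through.
        get_d_vector = lambda d_scores: d_scores
    return scores, get_d_vector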
diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index 01d6d5bfe..31aa4d413 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -36,7 +36,6 @@ from ..util import link_vectors_to_models, create_default_optimizer, registry
from ..compat import copy_array
from ..errors import Errors, Warnings
from .. import util
-from ._parser_model import ParserModel
from . import _beam_utils
from . import nonproj
@@ -69,7 +68,8 @@ cdef class Parser:
cfg.setdefault('beam_width', 1)
cfg.setdefault('beam_update_prob', 1.0) # or 0.5 (both defaults were previously used)
self.model = model
- self.set_output(self.moves.n_moves)
+ if self.moves.n_moves != 0:
+ self.set_output(self.moves.n_moves)
self.cfg = cfg
self._multitasks = []
self._rehearsal_model = None
@@ -105,7 +105,7 @@ cdef class Parser:
@property
def tok2vec(self):
'''Return the embedding and convolutional layer of the model.'''
- return self.model.tok2vec
+ return self.model.get_ref("tok2vec")
@property
def postprocesses(self):
@@ -122,9 +122,11 @@ cdef class Parser:
self._resize()
def _resize(self):
- self.model.resize_output(self.moves.n_moves)
+ self.model.attrs["resize_output"](self.model, self.moves.n_moves)
if self._rehearsal_model not in (True, False, None):
- self._rehearsal_model.resize_output(self.moves.n_moves)
+ self._rehearsal_model.attrs["resize_output"](
+ self._rehearsal_model, self.moves.n_moves
+ )
def add_multitask_objective(self, target):
# Defined in subclasses, to avoid circular import
@@ -216,7 +218,6 @@ cdef class Parser:
# expand our model output.
self._resize()
model = self.model.predict(docs)
- W_param = model.vec2scores.get_param("W")
weights = get_c_weights(model)
for state in batch:
if not state.is_final():
@@ -237,7 +238,7 @@ cdef class Parser:
# if labels are missing. We therefore have to check whether we need to
# expand our model output.
self._resize()
- cdef int nr_feature = self.model.lower.get_dim("nF")
+ cdef int nr_feature = self.model.get_ref("lower").get_dim("nF")
model = self.model.predict(docs)
token_ids = numpy.zeros((len(docs) * beam_width, nr_feature),
dtype='i', order='C')
@@ -370,13 +371,16 @@ cdef class Parser:
beam_density=self.cfg.get('beam_density', 0.001))
set_dropout_rate(self.model, drop)
- # Chop sequences into lengths of this many transitions, to make the
- # batch uniform length.
- cut_gold = numpy.random.choice(range(20, 100))
- states, golds, max_steps = self._init_gold_batch(examples, max_length=cut_gold)
+ cut_gold = True
+ if cut_gold:
+ # Chop sequences into lengths of this many transitions, to make the
+ # batch uniform length.
+ cut_gold = numpy.random.choice(range(20, 100))
+ states, golds, max_steps = self._init_gold_batch(examples, max_length=cut_gold)
+ else:
+ states, golds, max_steps = self._init_gold_batch_no_cut(examples)
states_golds = [(s, g) for (s, g) in zip(states, golds)
if not s.is_final() and g is not None]
-
# Prepare the stepwise model, and get the callback for finishing the batch
model, backprop_tok2vec = self.model.begin_update([ex.doc for ex in examples])
all_states = list(states)
@@ -456,9 +460,17 @@ cdef class Parser:
set_dropout_rate(self.model, drop)
model, backprop_tok2vec = self.model.begin_update(docs)
states_d_scores, backprops, beams = _beam_utils.update_beam(
- self.moves, self.model.lower.get_dim("nF"), 10000, states, golds,
- model.state2vec, model.vec2scores, width, losses=losses,
- beam_density=beam_density)
+ self.moves,
+ self.model.get_ref("lower").get_dim("nF"),
+ 10000,
+ states,
+ golds,
+ model.state2vec,
+ model.vec2scores,
+ width,
+ losses=losses,
+ beam_density=beam_density
+ )
for i, d_scores in enumerate(states_d_scores):
losses[self.name] += (d_scores**2).mean()
ids, bp_vectors, bp_scores = backprops[i]
@@ -497,6 +509,24 @@ cdef class Parser:
queue.extend(node._layers)
return gradients
+ def _init_gold_batch_no_cut(self, whole_examples):
+ states = self.moves.init_batch([eg.doc for eg in whole_examples])
+ good_docs = []
+ good_golds = []
+ good_states = []
+ for i, eg in enumerate(whole_examples):
+ doc = eg.doc
+ gold = self.moves.preprocess_gold(eg.gold)
+ if gold is not None and self.moves.has_gold(gold):
+ good_docs.append(doc)
+ good_golds.append(gold)
+ good_states.append(states[i])
+ n_moves = []
+ for doc, gold in zip(good_docs, good_golds):
+ oracle_actions = self.moves.get_oracle_sequence(doc, gold)
+ n_moves.append(len(oracle_actions))
+ return good_states, good_golds, max(n_moves, default=0) * 2
+
def _init_gold_batch(self, whole_examples, min_length=5, max_length=500):
"""Make a square batch, of length equal to the shortest doc. A long
doc will get multiple states. Let's say we have a doc of length 2*N,
@@ -550,16 +580,19 @@ cdef class Parser:
cdef np.ndarray d_scores = numpy.zeros((len(states), self.moves.n_moves),
dtype='f', order='C')
c_d_scores = d_scores.data
+ unseen_classes = self.model.attrs["unseen_classes"]
for i, (state, gold) in enumerate(zip(states, golds)):
memset(is_valid, 0, self.moves.n_moves * sizeof(int))
memset(costs, 0, self.moves.n_moves * sizeof(float))
self.moves.set_costs(is_valid, costs, state, gold)
for j in range(self.moves.n_moves):
- if costs[j] <= 0.0 and j in self.model.unseen_classes:
- self.model.unseen_classes.remove(j)
+ if costs[j] <= 0.0 and j in unseen_classes:
+ unseen_classes.remove(j)
cpu_log_loss(c_d_scores,
costs, is_valid, &scores[i, 0], d_scores.shape[1])
c_d_scores += d_scores.shape[1]
+ if len(states):
+ d_scores /= len(states)
if losses is not None:
losses.setdefault(self.name, 0.)
losses[self.name] += (d_scores**2).sum()
@@ -569,8 +602,7 @@ cdef class Parser:
return create_default_optimizer()
def set_output(self, nO):
- if self.model.upper.has_dim("nO") is None:
- self.model.upper.set_dim("nO", nO)
+ self.model.attrs["resize_output"](self.model, nO)
def begin_training(self, get_examples, pipeline=None, sgd=None, **kwargs):
self.cfg.update(kwargs)
@@ -597,7 +629,6 @@ cdef class Parser:
for doc, gold in parses:
doc_sample.append(doc)
gold_sample.append(gold)
-
self.model.initialize(doc_sample, gold_sample)
if pipeline is not None:
self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **self.cfg)
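Editor's note: throughout nn_parser.pyx the parser no longer calls a resize_output method on a ParserModel subclass; it looks up a callback stored on the generic thinc Model and calls it as model.attrs["resize_output"](model, nO). A hedged sketch of that pattern, assuming the output layer is the last layer of the model (resize_output_sketch is an illustrative name, not the registered spaCy function, and the real callback also copies existing weights across and tracks unseen classes):

from thinc.api import Linear, chain

def resize_output_sketch(model, new_nO):
    # Swap the final Linear for a larger one with the same input width.
    old = model.layers[-1]
    nI = old.get_dim("nI") if old.has_dim("nI") else None
    model.layers[-1] = Linear(nO=new_nO, nI=nI)
    return model

model = chain(Linear(nO=8, nI=4), Linear(nO=3, nI=8))
model.attrs["resize_output"] = resize_output_sketch
model.attrs["resize_output"](model, 5)   # how the parser invokes it

The same call shows up in the test updates below, e.g. resizing a freshly created pipe's model before from_bytes so that its output dimension matches the serialized one.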
diff --git a/spacy/tests/parser/test_add_label.py b/spacy/tests/parser/test_add_label.py
index 647c9720c..39682ba3d 100644
--- a/spacy/tests/parser/test_add_label.py
+++ b/spacy/tests/parser/test_add_label.py
@@ -65,7 +65,7 @@ def test_add_label_deserializes_correctly():
ner2 = EntityRecognizer(Vocab(), default_ner())
# the second model needs to be resized before we can call from_bytes
- ner2.model.resize_output(ner1.moves.n_moves)
+ ner2.model.attrs["resize_output"](ner2.model, ner1.moves.n_moves)
ner2.from_bytes(ner1.to_bytes())
assert ner1.moves.n_moves == ner2.moves.n_moves
for i in range(ner1.moves.n_moves):
diff --git a/spacy/tests/parser/test_neural_parser.py b/spacy/tests/parser/test_neural_parser.py
index 984af4d6b..c985cf87a 100644
--- a/spacy/tests/parser/test_neural_parser.py
+++ b/spacy/tests/parser/test_neural_parser.py
@@ -3,9 +3,9 @@ from spacy.ml.models.defaults import default_parser, default_tok2vec
from spacy.vocab import Vocab
from spacy.syntax.arc_eager import ArcEager
from spacy.syntax.nn_parser import Parser
-from spacy.syntax._parser_model import ParserModel
from spacy.tokens.doc import Doc
from spacy.gold import GoldParse
+from thinc.api import Model
@pytest.fixture
@@ -34,7 +34,7 @@ def parser(vocab, arc_eager):
@pytest.fixture
def model(arc_eager, tok2vec, vocab):
model = default_parser()
- model.resize_output(arc_eager.n_moves)
+ model.attrs["resize_output"](model, arc_eager.n_moves)
model.initialize()
return model
@@ -50,7 +50,7 @@ def gold(doc):
def test_can_init_nn_parser(parser):
- assert isinstance(parser.model, ParserModel)
+ assert isinstance(parser.model, Model)
def test_build_model(parser, vocab):
diff --git a/spacy/tests/pipeline/test_simple_ner.py b/spacy/tests/pipeline/test_simple_ner.py
new file mode 100644
index 000000000..9d4acf2fd
--- /dev/null
+++ b/spacy/tests/pipeline/test_simple_ner.py
@@ -0,0 +1,417 @@
+import pytest
+from collections import namedtuple
+
+from thinc.api import NumpyOps
+from spacy.ml._biluo import BILUO, _get_transition_table
+from spacy.pipeline.simple_ner import SimpleNER
+import spacy
+
+
+@pytest.fixture(params=[
+ ["PER", "ORG", "LOC", "MISC"],
+ ["GPE", "PERSON", "NUMBER", "CURRENCY", "EVENT"]
+])
+def labels(request):
+ return request.param
+
+@pytest.fixture
+def ops():
+ return NumpyOps()
+
+def _get_actions(labels):
+ action_names = (
+ [f"B{label}" for label in labels] + \
+ [f"I{label}" for label in labels] + \
+ [f"L{label}" for label in labels] + \
+ [f"U{label}" for label in labels] + \
+ ["O"]
+ )
+ A = namedtuple("actions", action_names)
+ return A(**{name: i for i, name in enumerate(action_names)})
+
+
+def test_init_biluo_layer(labels):
+ model = BILUO()
+ model.set_dim("nO", model.attrs["get_num_actions"](len(labels)))
+ model.initialize()
+ assert model.get_dim("nO") == len(labels) * 4 + 1
+
+
+def test_transition_table(ops):
+ labels = ["per", "loc", "org"]
+ table = _get_transition_table(len(labels))
+ a = _get_actions(labels)
+ assert table.shape == (2, len(a), len(a))
+ # Not last token, prev action was B
+ assert table[0, a.Bper, a.Bper] == 0
+ assert table[0, a.Bper, a.Bloc] == 0
+ assert table[0, a.Bper, a.Borg] == 0
+ assert table[0, a.Bper, a.Iper] == 1
+ assert table[0, a.Bper, a.Iloc] == 0
+ assert table[0, a.Bper, a.Iorg] == 0
+ assert table[0, a.Bper, a.Lper] == 1
+ assert table[0, a.Bper, a.Lloc] == 0
+ assert table[0, a.Bper, a.Lorg] == 0
+ assert table[0, a.Bper, a.Uper] == 0
+ assert table[0, a.Bper, a.Uloc] == 0
+ assert table[0, a.Bper, a.Uorg] == 0
+ assert table[0, a.Bper, a.O] == 0
+
+ assert table[0, a.Bloc, a.Bper] == 0
+ assert table[0, a.Bloc, a.Bloc] == 0
+ assert table[0, a.Bloc, a.Borg] == 0
+ assert table[0, a.Bloc, a.Iper] == 0
+ assert table[0, a.Bloc, a.Iloc] == 1
+ assert table[0, a.Bloc, a.Iorg] == 0
+ assert table[0, a.Bloc, a.Lper] == 0
+ assert table[0, a.Bloc, a.Lloc] == 1
+ assert table[0, a.Bloc, a.Lorg] == 0
+ assert table[0, a.Bloc, a.Uper] == 0
+ assert table[0, a.Bloc, a.Uloc] == 0
+ assert table[0, a.Bloc, a.Uorg] == 0
+ assert table[0, a.Bloc, a.O] == 0
+
+ assert table[0, a.Borg, a.Bper] == 0
+ assert table[0, a.Borg, a.Bloc] == 0
+ assert table[0, a.Borg, a.Borg] == 0
+ assert table[0, a.Borg, a.Iper] == 0
+ assert table[0, a.Borg, a.Iloc] == 0
+ assert table[0, a.Borg, a.Iorg] == 1
+ assert table[0, a.Borg, a.Lper] == 0
+ assert table[0, a.Borg, a.Lloc] == 0
+ assert table[0, a.Borg, a.Lorg] == 1
+ assert table[0, a.Borg, a.Uper] == 0
+ assert table[0, a.Borg, a.Uloc] == 0
+ assert table[0, a.Borg, a.Uorg] == 0
+ assert table[0, a.Borg, a.O] == 0
+
+ # Not last token, prev action was I
+ assert table[0, a.Iper, a.Bper] == 0
+ assert table[0, a.Iper, a.Bloc] == 0
+ assert table[0, a.Iper, a.Borg] == 0
+ assert table[0, a.Iper, a.Iper] == 1
+ assert table[0, a.Iper, a.Iloc] == 0
+ assert table[0, a.Iper, a.Iorg] == 0
+ assert table[0, a.Iper, a.Lper] == 1
+ assert table[0, a.Iper, a.Lloc] == 0
+ assert table[0, a.Iper, a.Lorg] == 0
+ assert table[0, a.Iper, a.Uper] == 0
+ assert table[0, a.Iper, a.Uloc] == 0
+ assert table[0, a.Iper, a.Uorg] == 0
+ assert table[0, a.Iper, a.O] == 0
+
+ assert table[0, a.Iloc, a.Bper] == 0
+ assert table[0, a.Iloc, a.Bloc] == 0
+ assert table[0, a.Iloc, a.Borg] == 0
+ assert table[0, a.Iloc, a.Iper] == 0
+ assert table[0, a.Iloc, a.Iloc] == 1
+ assert table[0, a.Iloc, a.Iorg] == 0
+ assert table[0, a.Iloc, a.Lper] == 0
+ assert table[0, a.Iloc, a.Lloc] == 1
+ assert table[0, a.Iloc, a.Lorg] == 0
+ assert table[0, a.Iloc, a.Uper] == 0
+ assert table[0, a.Iloc, a.Uloc] == 0
+ assert table[0, a.Iloc, a.Uorg] == 0
+ assert table[0, a.Iloc, a.O] == 0
+
+ assert table[0, a.Iorg, a.Bper] == 0
+ assert table[0, a.Iorg, a.Bloc] == 0
+ assert table[0, a.Iorg, a.Borg] == 0
+ assert table[0, a.Iorg, a.Iper] == 0
+ assert table[0, a.Iorg, a.Iloc] == 0
+ assert table[0, a.Iorg, a.Iorg] == 1
+ assert table[0, a.Iorg, a.Lper] == 0
+ assert table[0, a.Iorg, a.Lloc] == 0
+ assert table[0, a.Iorg, a.Lorg] == 1
+ assert table[0, a.Iorg, a.Uper] == 0
+ assert table[0, a.Iorg, a.Uloc] == 0
+ assert table[0, a.Iorg, a.Uorg] == 0
+ assert table[0, a.Iorg, a.O] == 0
+
+ # Not last token, prev action was L
+ assert table[0, a.Lper, a.Bper] == 1
+ assert table[0, a.Lper, a.Bloc] == 1
+ assert table[0, a.Lper, a.Borg] == 1
+ assert table[0, a.Lper, a.Iper] == 0
+ assert table[0, a.Lper, a.Iloc] == 0
+ assert table[0, a.Lper, a.Iorg] == 0
+ assert table[0, a.Lper, a.Lper] == 0
+ assert table[0, a.Lper, a.Lloc] == 0
+ assert table[0, a.Lper, a.Lorg] == 0
+ assert table[0, a.Lper, a.Uper] == 1
+ assert table[0, a.Lper, a.Uloc] == 1
+ assert table[0, a.Lper, a.Uorg] == 1
+ assert table[0, a.Lper, a.O] == 1
+
+ assert table[0, a.Lloc, a.Bper] == 1
+ assert table[0, a.Lloc, a.Bloc] == 1
+ assert table[0, a.Lloc, a.Borg] == 1
+ assert table[0, a.Lloc, a.Iper] == 0
+ assert table[0, a.Lloc, a.Iloc] == 0
+ assert table[0, a.Lloc, a.Iorg] == 0
+ assert table[0, a.Lloc, a.Lper] == 0
+ assert table[0, a.Lloc, a.Lloc] == 0
+ assert table[0, a.Lloc, a.Lorg] == 0
+ assert table[0, a.Lloc, a.Uper] == 1
+ assert table[0, a.Lloc, a.Uloc] == 1
+ assert table[0, a.Lloc, a.Uorg] == 1
+ assert table[0, a.Lloc, a.O] == 1
+
+ assert table[0, a.Lorg, a.Bper] == 1
+ assert table[0, a.Lorg, a.Bloc] == 1
+ assert table[0, a.Lorg, a.Borg] == 1
+ assert table[0, a.Lorg, a.Iper] == 0
+ assert table[0, a.Lorg, a.Iloc] == 0
+ assert table[0, a.Lorg, a.Iorg] == 0
+ assert table[0, a.Lorg, a.Lper] == 0
+ assert table[0, a.Lorg, a.Lloc] == 0
+ assert table[0, a.Lorg, a.Lorg] == 0
+ assert table[0, a.Lorg, a.Uper] == 1
+ assert table[0, a.Lorg, a.Uloc] == 1
+ assert table[0, a.Lorg, a.Uorg] == 1
+ assert table[0, a.Lorg, a.O] == 1
+
+ # Not last token, prev action was U
+ assert table[0, a.Uper, a.Bper] == 1
+ assert table[0, a.Uper, a.Bloc] == 1
+ assert table[0, a.Uper, a.Borg] == 1
+ assert table[0, a.Uper, a.Iper] == 0
+ assert table[0, a.Uper, a.Iloc] == 0
+ assert table[0, a.Uper, a.Iorg] == 0
+ assert table[0, a.Uper, a.Lper] == 0
+ assert table[0, a.Uper, a.Lloc] == 0
+ assert table[0, a.Uper, a.Lorg] == 0
+ assert table[0, a.Uper, a.Uper] == 1
+ assert table[0, a.Uper, a.Uloc] == 1
+ assert table[0, a.Uper, a.Uorg] == 1
+ assert table[0, a.Uper, a.O] == 1
+
+ assert table[0, a.Uloc, a.Bper] == 1
+ assert table[0, a.Uloc, a.Bloc] == 1
+ assert table[0, a.Uloc, a.Borg] == 1
+ assert table[0, a.Uloc, a.Iper] == 0
+ assert table[0, a.Uloc, a.Iloc] == 0
+ assert table[0, a.Uloc, a.Iorg] == 0
+ assert table[0, a.Uloc, a.Lper] == 0
+ assert table[0, a.Uloc, a.Lloc] == 0
+ assert table[0, a.Uloc, a.Lorg] == 0
+ assert table[0, a.Uloc, a.Uper] == 1
+ assert table[0, a.Uloc, a.Uloc] == 1
+ assert table[0, a.Uloc, a.Uorg] == 1
+ assert table[0, a.Uloc, a.O] == 1
+
+ assert table[0, a.Uorg, a.Bper] == 1
+ assert table[0, a.Uorg, a.Bloc] == 1
+ assert table[0, a.Uorg, a.Borg] == 1
+ assert table[0, a.Uorg, a.Iper] == 0
+ assert table[0, a.Uorg, a.Iloc] == 0
+ assert table[0, a.Uorg, a.Iorg] == 0
+ assert table[0, a.Uorg, a.Lper] == 0
+ assert table[0, a.Uorg, a.Lloc] == 0
+ assert table[0, a.Uorg, a.Lorg] == 0
+ assert table[0, a.Uorg, a.Uper] == 1
+ assert table[0, a.Uorg, a.Uloc] == 1
+ assert table[0, a.Uorg, a.Uorg] == 1
+ assert table[0, a.Uorg, a.O] == 1
+
+ # Not last token, prev action was O
+ assert table[0, a.O, a.Bper] == 1
+ assert table[0, a.O, a.Bloc] == 1
+ assert table[0, a.O, a.Borg] == 1
+ assert table[0, a.O, a.Iper] == 0
+ assert table[0, a.O, a.Iloc] == 0
+ assert table[0, a.O, a.Iorg] == 0
+ assert table[0, a.O, a.Lper] == 0
+ assert table[0, a.O, a.Lloc] == 0
+ assert table[0, a.O, a.Lorg] == 0
+ assert table[0, a.O, a.Uper] == 1
+ assert table[0, a.O, a.Uloc] == 1
+ assert table[0, a.O, a.Uorg] == 1
+ assert table[0, a.O, a.O] == 1
+
+ # Last token, prev action was B
+ assert table[1, a.Bper, a.Bper] == 0
+ assert table[1, a.Bper, a.Bloc] == 0
+ assert table[1, a.Bper, a.Borg] == 0
+ assert table[1, a.Bper, a.Iper] == 0
+ assert table[1, a.Bper, a.Iloc] == 0
+ assert table[1, a.Bper, a.Iorg] == 0
+ assert table[1, a.Bper, a.Lper] == 1
+ assert table[1, a.Bper, a.Lloc] == 0
+ assert table[1, a.Bper, a.Lorg] == 0
+ assert table[1, a.Bper, a.Uper] == 0
+ assert table[1, a.Bper, a.Uloc] == 0
+ assert table[1, a.Bper, a.Uorg] == 0
+ assert table[1, a.Bper, a.O] == 0
+
+ assert table[1, a.Bloc, a.Bper] == 0
+ assert table[1, a.Bloc, a.Bloc] == 0
+ assert table[1, a.Bloc, a.Borg] == 0
+ assert table[1, a.Bloc, a.Iper] == 0
+ assert table[1, a.Bloc, a.Iloc] == 0
+ assert table[1, a.Bloc, a.Iorg] == 0
+ assert table[1, a.Bloc, a.Lper] == 0
+ assert table[1, a.Bloc, a.Lloc] == 1
+ assert table[1, a.Bloc, a.Lorg] == 0
+ assert table[1, a.Bloc, a.Uper] == 0
+ assert table[1, a.Bloc, a.Uloc] == 0
+ assert table[1, a.Bloc, a.Uorg] == 0
+ assert table[1, a.Bloc, a.O] == 0
+
+ assert table[1, a.Borg, a.Bper] == 0
+ assert table[1, a.Borg, a.Bloc] == 0
+ assert table[1, a.Borg, a.Borg] == 0
+ assert table[1, a.Borg, a.Iper] == 0
+ assert table[1, a.Borg, a.Iloc] == 0
+ assert table[1, a.Borg, a.Iorg] == 0
+ assert table[1, a.Borg, a.Lper] == 0
+ assert table[1, a.Borg, a.Lloc] == 0
+ assert table[1, a.Borg, a.Lorg] == 1
+ assert table[1, a.Borg, a.Uper] == 0
+ assert table[1, a.Borg, a.Uloc] == 0
+ assert table[1, a.Borg, a.Uorg] == 0
+ assert table[1, a.Borg, a.O] == 0
+
+ # Last token, prev action was I
+ assert table[1, a.Iper, a.Bper] == 0
+ assert table[1, a.Iper, a.Bloc] == 0
+ assert table[1, a.Iper, a.Borg] == 0
+ assert table[1, a.Iper, a.Iper] == 0
+ assert table[1, a.Iper, a.Iloc] == 0
+ assert table[1, a.Iper, a.Iorg] == 0
+ assert table[1, a.Iper, a.Lper] == 1
+ assert table[1, a.Iper, a.Lloc] == 0
+ assert table[1, a.Iper, a.Lorg] == 0
+ assert table[1, a.Iper, a.Uper] == 0
+ assert table[1, a.Iper, a.Uloc] == 0
+ assert table[1, a.Iper, a.Uorg] == 0
+ assert table[1, a.Iper, a.O] == 0
+
+ assert table[1, a.Iloc, a.Bper] == 0
+ assert table[1, a.Iloc, a.Bloc] == 0
+ assert table[1, a.Iloc, a.Borg] == 0
+ assert table[1, a.Iloc, a.Iper] == 0
+ assert table[1, a.Iloc, a.Iloc] == 0
+ assert table[1, a.Iloc, a.Iorg] == 0
+ assert table[1, a.Iloc, a.Lper] == 0
+ assert table[1, a.Iloc, a.Lloc] == 1
+ assert table[1, a.Iloc, a.Lorg] == 0
+ assert table[1, a.Iloc, a.Uper] == 0
+ assert table[1, a.Iloc, a.Uloc] == 0
+ assert table[1, a.Iloc, a.Uorg] == 0
+ assert table[1, a.Iloc, a.O] == 0
+
+ assert table[1, a.Iorg, a.Bper] == 0
+ assert table[1, a.Iorg, a.Bloc] == 0
+ assert table[1, a.Iorg, a.Borg] == 0
+ assert table[1, a.Iorg, a.Iper] == 0
+ assert table[1, a.Iorg, a.Iloc] == 0
+ assert table[1, a.Iorg, a.Iorg] == 0
+ assert table[1, a.Iorg, a.Lper] == 0
+ assert table[1, a.Iorg, a.Lloc] == 0
+ assert table[1, a.Iorg, a.Lorg] == 1
+ assert table[1, a.Iorg, a.Uper] == 0
+ assert table[1, a.Iorg, a.Uloc] == 0
+ assert table[1, a.Iorg, a.Uorg] == 0
+ assert table[1, a.Iorg, a.O] == 0
+
+ # Last token, prev action was L
+ assert table[1, a.Lper, a.Bper] == 0
+ assert table[1, a.Lper, a.Bloc] == 0
+ assert table[1, a.Lper, a.Borg] == 0
+ assert table[1, a.Lper, a.Iper] == 0
+ assert table[1, a.Lper, a.Iloc] == 0
+ assert table[1, a.Lper, a.Iorg] == 0
+ assert table[1, a.Lper, a.Lper] == 0
+ assert table[1, a.Lper, a.Lloc] == 0
+ assert table[1, a.Lper, a.Lorg] == 0
+ assert table[1, a.Lper, a.Uper] == 1
+ assert table[1, a.Lper, a.Uloc] == 1
+ assert table[1, a.Lper, a.Uorg] == 1
+ assert table[1, a.Lper, a.O] == 1
+
+ assert table[1, a.Lloc, a.Bper] == 0
+ assert table[1, a.Lloc, a.Bloc] == 0
+ assert table[1, a.Lloc, a.Borg] == 0
+ assert table[1, a.Lloc, a.Iper] == 0
+ assert table[1, a.Lloc, a.Iloc] == 0
+ assert table[1, a.Lloc, a.Iorg] == 0
+ assert table[1, a.Lloc, a.Lper] == 0
+ assert table[1, a.Lloc, a.Lloc] == 0
+ assert table[1, a.Lloc, a.Lorg] == 0
+ assert table[1, a.Lloc, a.Uper] == 1
+ assert table[1, a.Lloc, a.Uloc] == 1
+ assert table[1, a.Lloc, a.Uorg] == 1
+ assert table[1, a.Lloc, a.O] == 1
+
+ assert table[1, a.Lorg, a.Bper] == 0
+ assert table[1, a.Lorg, a.Bloc] == 0
+ assert table[1, a.Lorg, a.Borg] == 0
+ assert table[1, a.Lorg, a.Iper] == 0
+ assert table[1, a.Lorg, a.Iloc] == 0
+ assert table[1, a.Lorg, a.Iorg] == 0
+ assert table[1, a.Lorg, a.Lper] == 0
+ assert table[1, a.Lorg, a.Lloc] == 0
+ assert table[1, a.Lorg, a.Lorg] == 0
+ assert table[1, a.Lorg, a.Uper] == 1
+ assert table[1, a.Lorg, a.Uloc] == 1
+ assert table[1, a.Lorg, a.Uorg] == 1
+ assert table[1, a.Lorg, a.O] == 1
+
+ # Last token, prev action was U
+ assert table[1, a.Uper, a.Bper] == 0
+ assert table[1, a.Uper, a.Bloc] == 0
+ assert table[1, a.Uper, a.Borg] == 0
+ assert table[1, a.Uper, a.Iper] == 0
+ assert table[1, a.Uper, a.Iloc] == 0
+ assert table[1, a.Uper, a.Iorg] == 0
+ assert table[1, a.Uper, a.Lper] == 0
+ assert table[1, a.Uper, a.Lloc] == 0
+ assert table[1, a.Uper, a.Lorg] == 0
+ assert table[1, a.Uper, a.Uper] == 1
+ assert table[1, a.Uper, a.Uloc] == 1
+ assert table[1, a.Uper, a.Uorg] == 1
+ assert table[1, a.Uper, a.O] == 1
+
+ assert table[1, a.Uloc, a.Bper] == 0
+ assert table[1, a.Uloc, a.Bloc] == 0
+ assert table[1, a.Uloc, a.Borg] == 0
+ assert table[1, a.Uloc, a.Iper] == 0
+ assert table[1, a.Uloc, a.Iloc] == 0
+ assert table[1, a.Uloc, a.Iorg] == 0
+ assert table[1, a.Uloc, a.Lper] == 0
+ assert table[1, a.Uloc, a.Lloc] == 0
+ assert table[1, a.Uloc, a.Lorg] == 0
+ assert table[1, a.Uloc, a.Uper] == 1
+ assert table[1, a.Uloc, a.Uloc] == 1
+ assert table[1, a.Uloc, a.Uorg] == 1
+ assert table[1, a.Uloc, a.O] == 1
+
+ assert table[1, a.Uorg, a.Bper] == 0
+ assert table[1, a.Uorg, a.Bloc] == 0
+ assert table[1, a.Uorg, a.Borg] == 0
+ assert table[1, a.Uorg, a.Iper] == 0
+ assert table[1, a.Uorg, a.Iloc] == 0
+ assert table[1, a.Uorg, a.Iorg] == 0
+ assert table[1, a.Uorg, a.Lper] == 0
+ assert table[1, a.Uorg, a.Lloc] == 0
+ assert table[1, a.Uorg, a.Lorg] == 0
+ assert table[1, a.Uorg, a.Uper] == 1
+ assert table[1, a.Uorg, a.Uloc] == 1
+ assert table[1, a.Uorg, a.Uorg] == 1
+ assert table[1, a.Uorg, a.O] == 1
+
+ # Last token, prev action was O
+ assert table[1, a.O, a.Bper] == 0
+ assert table[1, a.O, a.Bloc] == 0
+ assert table[1, a.O, a.Borg] == 0
+ assert table[1, a.O, a.Iper] == 0
+ assert table[1, a.O, a.Iloc] == 0
+ assert table[1, a.O, a.Iorg] == 0
+ assert table[1, a.O, a.Lper] == 0
+ assert table[1, a.O, a.Lloc] == 0
+ assert table[1, a.O, a.Lorg] == 0
+ assert table[1, a.O, a.Uper] == 1
+ assert table[1, a.O, a.Uloc] == 1
+ assert table[1, a.O, a.Uorg] == 1
+ assert table[1, a.O, a.O] == 1
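Editor's note: the assertions above pin down the layout of the transition table: shape (2, n_actions, n_actions) with n_actions = 4 * n_labels + 1, where the first index selects non-final (0) versus final (1) token positions and an entry of 1 marks a legal previous-action/next-action pair. A small illustrative helper (not part of spaCy) showing how such a table can be used to mask scores:

import numpy

def mask_next_actions(table, prev_action, scores, is_last_token):
    # Row of legal transitions for this position (1.0 = allowed).
    valid = table[int(is_last_token), prev_action]
    # Send the scores of illegal actions to -inf before taking an argmax.
    return numpy.where(valid > 0, scores, -numpy.inf)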
diff --git a/spacy/tests/regression/test_issue2001-2500.py b/spacy/tests/regression/test_issue2001-2500.py
index d9a3e16b6..67966f70e 100644
--- a/spacy/tests/regression/test_issue2001-2500.py
+++ b/spacy/tests/regression/test_issue2001-2500.py
@@ -34,7 +34,8 @@ def test_issue2179():
nlp2.add_pipe(nlp2.create_pipe("ner"))
assert len(nlp2.get_pipe("ner").labels) == 0
- nlp2.get_pipe("ner").model.resize_output(nlp.get_pipe("ner").moves.n_moves)
+ model = nlp2.get_pipe("ner").model
+ model.attrs["resize_output"](model, nlp.get_pipe("ner").moves.n_moves)
nlp2.from_bytes(nlp.to_bytes())
assert "extra_labels" not in nlp2.get_pipe("ner").cfg
assert nlp2.get_pipe("ner").labels == ("CITIZENSHIP",)
diff --git a/spacy/tests/regression/test_issue3001-3500.py b/spacy/tests/regression/test_issue3001-3500.py
index df23efa4f..06ba6c4ac 100644
--- a/spacy/tests/regression/test_issue3001-3500.py
+++ b/spacy/tests/regression/test_issue3001-3500.py
@@ -104,7 +104,8 @@ def test_issue3209():
assert ner.move_names == move_names
nlp2 = English()
nlp2.add_pipe(nlp2.create_pipe("ner"))
- nlp2.get_pipe("ner").model.resize_output(ner.moves.n_moves)
+ model = nlp2.get_pipe("ner").model
+ model.attrs["resize_output"](model, ner.moves.n_moves)
nlp2.from_bytes(nlp.to_bytes())
assert nlp2.get_pipe("ner").move_names == move_names
diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py
index 298cddc74..ba63adfa4 100644
--- a/spacy/tests/serialize/test_serialize_config.py
+++ b/spacy/tests/serialize/test_serialize_config.py
@@ -110,10 +110,9 @@ def test_serialize_custom_nlp():
nlp2 = spacy.load(d)
model = nlp2.get_pipe("parser").model
tok2vec = model.get_ref("tok2vec")
- upper = model.upper
+ upper = model.get_ref("upper")
# check that we have the correct settings, not the default ones
- assert tok2vec.get_dim("nO") == 321
assert upper.get_dim("nI") == 65
@@ -131,8 +130,7 @@ def test_serialize_parser():
nlp2 = spacy.load(d)
model = nlp2.get_pipe("parser").model
tok2vec = model.get_ref("tok2vec")
- upper = model.upper
+ upper = model.get_ref("upper")
# check that we have the correct settings, not the default ones
assert upper.get_dim("nI") == 66
- assert tok2vec.get_dim("nO") == 333
diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py
index a3381cb2f..475181c7b 100644
--- a/spacy/tests/serialize/test_serialize_pipeline.py
+++ b/spacy/tests/serialize/test_serialize_pipeline.py
@@ -63,7 +63,7 @@ def test_to_from_bytes(parser, blank_parser):
bytes_data = parser.to_bytes(exclude=["vocab"])
# the blank parser needs to be resized before we can call from_bytes
- blank_parser.model.resize_output(parser.moves.n_moves)
+ blank_parser.model.attrs["resize_output"](blank_parser.model, parser.moves.n_moves)
blank_parser.from_bytes(bytes_data)
assert blank_parser.model is not True
assert blank_parser.moves.n_moves == parser.moves.n_moves
diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py
index 1200407d7..c320b19c0 100644
--- a/spacy/tests/test_misc.py
+++ b/spacy/tests/test_misc.py
@@ -38,7 +38,7 @@ def test_util_get_package_path(package):
def test_PrecomputableAffine(nO=4, nI=5, nF=3, nP=2):
- model = PrecomputableAffine(nO=nO, nI=nI, nF=nF, nP=nP)
+ model = PrecomputableAffine(nO=nO, nI=nI, nF=nF, nP=nP).initialize()
assert model.get_param("W").shape == (nF, nO, nP, nI)
tensor = model.ops.alloc((10, nI))
Y, get_dX = model.begin_update(tensor)
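Editor's note: the added .initialize() call reflects Thinc v8 semantics, where parameters are only allocated during initialization, so get_param("W") on an uninitialized model raises an error. A minimal illustration with a plain Linear layer (assuming Thinc v8):

from thinc.api import Linear

layer = Linear(nO=4, nI=5)
# layer.get_param("W")     # would fail: parameter not allocated yet
layer.initialize()
assert layer.get_param("W").shape == (4, 5)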
diff --git a/spacy/util.py b/spacy/util.py
index b4ecc8b03..048d923ee 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -571,8 +571,10 @@ def decaying(start, stop, decay):
curr -= decay
-def minibatch_by_words(examples, size, tuples=True, count_words=len):
- """Create minibatches of a given number of words."""
+def minibatch_by_words(examples, size, tuples=True, count_words=len, tolerance=0.2):
+ """Create minibatches of roughly a given number of words. If any examples
+ are longer than the specified batch length, they will appear in a batch by
+ themselves."""
if isinstance(size, int):
size_ = itertools.repeat(size)
elif isinstance(size, List):
@@ -580,18 +582,36 @@ def minibatch_by_words(examples, size, tuples=True, count_words=len):
else:
size_ = size
examples = iter(examples)
+ oversize = []
while True:
batch_size = next(size_)
+ tol_size = batch_size * tolerance
batch = []
- while batch_size >= 0:
+ if oversize:
+ example = oversize.pop(0)
+ n_words = count_words(example.doc)
+ batch.append(example)
+ batch_size -= n_words
+ while batch_size >= 1:
try:
example = next(examples)
except StopIteration:
- if batch:
- yield batch
- return
- batch_size -= count_words(example.doc)
- batch.append(example)
+ if oversize:
+ examples = iter(oversize)
+ oversize = []
+ if batch:
+ yield batch
+ break
+ else:
+ if batch:
+ yield batch
+ return
+ n_words = count_words(example.doc)
+ if n_words < (batch_size + tol_size):
+ batch_size -= n_words
+ batch.append(example)
+ else:
+ oversize.append(example)
if batch:
yield batch
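Editor's note: the reworked minibatch_by_words fills each batch up to the requested word budget, allows a small tolerance above it, and defers examples that do not fit to an oversize list so that they end up in a batch of their own. A simplified sketch of that policy (not the exact implementation above; it assumes plain iterables and a count_words callable):

def minibatch_by_words_sketch(examples, size, count_words=len, tolerance=0.2):
    budget = size
    tol = size * tolerance
    batch, oversize = [], []
    for eg in examples:
        n_words = count_words(eg)
        if n_words < budget + tol:
            batch.append(eg)
            budget -= n_words
        else:
            oversize.append(eg)
        if budget < 1 and batch:
            yield batch
            batch, budget = [], size
    if batch:
        yield batch
    # Examples that never fit the budget are emitted one per batch.
    for eg in oversize:
        yield [eg]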
From 0d94737857d443bbce230605bb98492d063c6e80 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem
Date: Mon, 18 May 2020 22:27:10 +0200
Subject: [PATCH 137/496] Feature toggle_pipes (#5378)
* make disable_pipes deprecated in favour of the new toggle_pipes
* rewrite disable_pipes statements
* update documentation
* remove bin/wiki_entity_linking folder
* one more fix
* remove deprecated link to documentation
* few more doc fixes
* add note about name change to the docs
* restore original disable_pipes
* small fixes
* fix typo
* fix error number to W096
* rename to select_pipes
* also make changes to the documentation
Co-authored-by: Matthew Honnibal
---
bin/wiki_entity_linking/README.md | 37 --
bin/wiki_entity_linking/__init__.py | 12 -
.../entity_linker_evaluation.py | 204 -------
bin/wiki_entity_linking/kb_creator.py | 161 -----
bin/wiki_entity_linking/train_descriptions.py | 145 -----
bin/wiki_entity_linking/wiki_io.py | 127 ----
bin/wiki_entity_linking/wiki_namespaces.py | 128 ----
.../wikidata_pretrain_kb.py | 179 ------
bin/wiki_entity_linking/wikidata_processor.py | 154 -----
.../wikidata_train_entity_linker.py | 230 -------
.../wikipedia_processor.py | 565 ------------------
examples/training/pretrain_textcat.py | 5 +-
examples/training/rehearsal.py | 5 +-
.../textcatjsonl_to_trainjson.py | 6 +-
examples/training/train_entity_linker.py | 7 +-
examples/training/train_intent_parser.py | 4 +-
examples/training/train_ner.py | 5 +-
examples/training/train_new_entity_type.py | 6 +-
examples/training/train_parser.py | 5 +-
examples/training/train_textcat.py | 5 +-
spacy/cli/train.py | 6 +-
spacy/errors.py | 11 +-
spacy/language.py | 32 +-
spacy/pipeline/entityruler.py | 2 +-
spacy/tests/pipeline/test_pipe_methods.py | 55 +-
spacy/tests/regression/test_issue3611.py | 2 +-
spacy/tests/regression/test_issue4030.py | 2 +-
website/docs/api/language.md | 38 +-
website/docs/usage/processing-pipelines.md | 18 +-
website/docs/usage/rule-based-matching.md | 5 +-
website/docs/usage/spacy-101.md | 3 +-
website/docs/usage/training.md | 14 +-
32 files changed, 154 insertions(+), 2024 deletions(-)
delete mode 100644 bin/wiki_entity_linking/README.md
delete mode 100644 bin/wiki_entity_linking/__init__.py
delete mode 100644 bin/wiki_entity_linking/entity_linker_evaluation.py
delete mode 100644 bin/wiki_entity_linking/kb_creator.py
delete mode 100644 bin/wiki_entity_linking/train_descriptions.py
delete mode 100644 bin/wiki_entity_linking/wiki_io.py
delete mode 100644 bin/wiki_entity_linking/wiki_namespaces.py
delete mode 100644 bin/wiki_entity_linking/wikidata_pretrain_kb.py
delete mode 100644 bin/wiki_entity_linking/wikidata_processor.py
delete mode 100644 bin/wiki_entity_linking/wikidata_train_entity_linker.py
delete mode 100644 bin/wiki_entity_linking/wikipedia_processor.py
diff --git a/bin/wiki_entity_linking/README.md b/bin/wiki_entity_linking/README.md
deleted file mode 100644
index 4e4af5c21..000000000
--- a/bin/wiki_entity_linking/README.md
+++ /dev/null
@@ -1,37 +0,0 @@
-## Entity Linking with Wikipedia and Wikidata
-
-### Step 1: Create a Knowledge Base (KB) and training data
-
-Run `wikidata_pretrain_kb.py`
-* This takes as input the locations of a **Wikipedia and a Wikidata dump**, and produces a **KB directory** + **training file**
- * WikiData: get `latest-all.json.bz2` from https://dumps.wikimedia.org/wikidatawiki/entities/
- * Wikipedia: get `enwiki-latest-pages-articles-multistream.xml.bz2` from https://dumps.wikimedia.org/enwiki/latest/ (or for any other language)
-* You can set the filtering parameters for KB construction:
- * `max_per_alias` (`-a`): (max) number of candidate entities in the KB per alias/synonym
- * `min_freq` (`-f`): threshold of number of times an entity should occur in the corpus to be included in the KB
- * `min_pair` (`-c`): threshold of number of times an entity+alias combination should occur in the corpus to be included in the KB
-* Further parameters to set:
- * `descriptions_from_wikipedia` (`-wp`): whether to parse descriptions from Wikipedia (`True`) or Wikidata (`False`)
- * `entity_vector_length` (`-v`): length of the pre-trained entity description vectors
- * `lang` (`-la`): language for which to fetch Wikidata information (as the dump contains all languages)
-
-Quick testing and rerunning:
-* When trying out the pipeline for a quick test, set `limit_prior` (`-lp`), `limit_train` (`-lt`) and/or `limit_wd` (`-lw`) to read only parts of the dumps instead of everything.
- * e.g. set `-lt 20000 -lp 2000 -lw 3000 -f 1`
-* If you only want to (re)run certain parts of the pipeline, just remove the corresponding files and they will be recalculated or reparsed.
-
-
-### Step 2: Train an Entity Linking model
-
-Run `wikidata_train_entity_linker.py`
-* This takes the **KB directory** produced by Step 1, and trains an **Entity Linking model**
-* Specify the output directory (`-o`) in which the final, trained model will be saved
-* You can set the learning parameters for the EL training:
- * `epochs` (`-e`): number of training iterations
- * `dropout` (`-p`): dropout rate
- * `lr` (`-n`): learning rate
- * `l2` (`-r`): L2 regularization
-* Specify the number of training and dev testing articles with `train_articles` (`-t`) and `dev_articles` (`-d`) respectively
- * If not specified, the full dataset will be processed - this may take a LONG time !
-* Further parameters to set:
- * `labels_discard` (`-l`): NER label types to discard during training
diff --git a/bin/wiki_entity_linking/__init__.py b/bin/wiki_entity_linking/__init__.py
deleted file mode 100644
index de486bbcf..000000000
--- a/bin/wiki_entity_linking/__init__.py
+++ /dev/null
@@ -1,12 +0,0 @@
-TRAINING_DATA_FILE = "gold_entities.jsonl"
-KB_FILE = "kb"
-KB_MODEL_DIR = "nlp_kb"
-OUTPUT_MODEL_DIR = "nlp"
-
-PRIOR_PROB_PATH = "prior_prob.csv"
-ENTITY_DEFS_PATH = "entity_defs.csv"
-ENTITY_FREQ_PATH = "entity_freq.csv"
-ENTITY_ALIAS_PATH = "entity_alias.csv"
-ENTITY_DESCR_PATH = "entity_descriptions.csv"
-
-LOG_FORMAT = '%(asctime)s - %(levelname)s - %(name)s - %(message)s'
diff --git a/bin/wiki_entity_linking/entity_linker_evaluation.py b/bin/wiki_entity_linking/entity_linker_evaluation.py
deleted file mode 100644
index 2aeffbfc2..000000000
--- a/bin/wiki_entity_linking/entity_linker_evaluation.py
+++ /dev/null
@@ -1,204 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import logging
-import random
-from tqdm import tqdm
-from collections import defaultdict
-
-logger = logging.getLogger(__name__)
-
-
-class Metrics(object):
- true_pos = 0
- false_pos = 0
- false_neg = 0
-
- def update_results(self, true_entity, candidate):
- candidate_is_correct = true_entity == candidate
-
- # Assume that we have no labeled negatives in the data (i.e. cases where true_entity is "NIL")
- # Therefore, if candidate_is_correct then we have a true positive and never a true negative.
- self.true_pos += candidate_is_correct
- self.false_neg += not candidate_is_correct
- if candidate and candidate not in {"", "NIL"}:
- # A wrong prediction (e.g. Q42 != Q3) counts both as a FP as well as a FN.
- self.false_pos += not candidate_is_correct
-
- def calculate_precision(self):
- if self.true_pos == 0:
- return 0.0
- else:
- return self.true_pos / (self.true_pos + self.false_pos)
-
- def calculate_recall(self):
- if self.true_pos == 0:
- return 0.0
- else:
- return self.true_pos / (self.true_pos + self.false_neg)
-
- def calculate_fscore(self):
- p = self.calculate_precision()
- r = self.calculate_recall()
- if p + r == 0:
- return 0.0
- else:
- return 2 * p * r / (p + r)
-
-
-class EvaluationResults(object):
- def __init__(self):
- self.metrics = Metrics()
- self.metrics_by_label = defaultdict(Metrics)
-
- def update_metrics(self, ent_label, true_entity, candidate):
- self.metrics.update_results(true_entity, candidate)
- self.metrics_by_label[ent_label].update_results(true_entity, candidate)
-
- def report_metrics(self, model_name):
- model_str = model_name.title()
- recall = self.metrics.calculate_recall()
- precision = self.metrics.calculate_precision()
- fscore = self.metrics.calculate_fscore()
- return (
- "{}: ".format(model_str)
- + "F-score = {} | ".format(round(fscore, 3))
- + "Recall = {} | ".format(round(recall, 3))
- + "Precision = {} | ".format(round(precision, 3))
- + "F-score by label = {}".format(
- {k: v.calculate_fscore() for k, v in sorted(self.metrics_by_label.items())}
- )
- )
-
-
-class BaselineResults(object):
- def __init__(self):
- self.random = EvaluationResults()
- self.prior = EvaluationResults()
- self.oracle = EvaluationResults()
-
- def report_performance(self, model):
- results = getattr(self, model)
- return results.report_metrics(model)
-
- def update_baselines(
- self,
- true_entity,
- ent_label,
- random_candidate,
- prior_candidate,
- oracle_candidate,
- ):
- self.oracle.update_metrics(ent_label, true_entity, oracle_candidate)
- self.prior.update_metrics(ent_label, true_entity, prior_candidate)
- self.random.update_metrics(ent_label, true_entity, random_candidate)
-
-
-def measure_performance(dev_data, kb, el_pipe, baseline=True, context=True, dev_limit=None):
- counts = dict()
- baseline_results = BaselineResults()
- context_results = EvaluationResults()
- combo_results = EvaluationResults()
-
- for doc, gold in tqdm(dev_data, total=dev_limit, leave=False, desc='Processing dev data'):
- if len(doc) > 0:
- correct_ents = dict()
- for entity, kb_dict in gold.links.items():
- start, end = entity
- for gold_kb, value in kb_dict.items():
- if value:
- # only evaluating on positive examples
- offset = _offset(start, end)
- correct_ents[offset] = gold_kb
-
- if baseline:
- _add_baseline(baseline_results, counts, doc, correct_ents, kb)
-
- if context:
- # using only context
- el_pipe.cfg["incl_context"] = True
- el_pipe.cfg["incl_prior"] = False
- _add_eval_result(context_results, doc, correct_ents, el_pipe)
-
- # measuring combined accuracy (prior + context)
- el_pipe.cfg["incl_context"] = True
- el_pipe.cfg["incl_prior"] = True
- _add_eval_result(combo_results, doc, correct_ents, el_pipe)
-
- if baseline:
- logger.info("Counts: {}".format({k: v for k, v in sorted(counts.items())}))
- logger.info(baseline_results.report_performance("random"))
- logger.info(baseline_results.report_performance("prior"))
- logger.info(baseline_results.report_performance("oracle"))
-
- if context:
- logger.info(context_results.report_metrics("context only"))
- logger.info(combo_results.report_metrics("context and prior"))
-
-
-def _add_eval_result(results, doc, correct_ents, el_pipe):
- """
- Evaluate the ent.kb_id_ annotations against the gold standard.
- Only evaluate entities that overlap between gold and NER, to isolate the performance of the NEL.
- """
- try:
- doc = el_pipe(doc)
- for ent in doc.ents:
- ent_label = ent.label_
- start = ent.start_char
- end = ent.end_char
- offset = _offset(start, end)
- gold_entity = correct_ents.get(offset, None)
- # the gold annotations are not complete so we can't evaluate missing annotations as 'wrong'
- if gold_entity is not None:
- pred_entity = ent.kb_id_
- results.update_metrics(ent_label, gold_entity, pred_entity)
-
- except Exception as e:
- logging.error("Error assessing accuracy " + str(e))
-
-
-def _add_baseline(baseline_results, counts, doc, correct_ents, kb):
- """
- Measure 3 performance baselines: random selection, prior probabilities, and 'oracle' prediction for upper bound.
- Only evaluate entities that overlap between gold and NER, to isolate the performance of the NEL.
- """
- for ent in doc.ents:
- ent_label = ent.label_
- start = ent.start_char
- end = ent.end_char
- offset = _offset(start, end)
- gold_entity = correct_ents.get(offset, None)
-
- # the gold annotations are not complete so we can't evaluate missing annotations as 'wrong'
- if gold_entity is not None:
- candidates = kb.get_candidates(ent.text)
- oracle_candidate = ""
- prior_candidate = ""
- random_candidate = ""
- if candidates:
- scores = []
-
- for c in candidates:
- scores.append(c.prior_prob)
- if c.entity_ == gold_entity:
- oracle_candidate = c.entity_
-
- best_index = scores.index(max(scores))
- prior_candidate = candidates[best_index].entity_
- random_candidate = random.choice(candidates).entity_
-
- current_count = counts.get(ent_label, 0)
- counts[ent_label] = current_count+1
-
- baseline_results.update_baselines(
- gold_entity,
- ent_label,
- random_candidate,
- prior_candidate,
- oracle_candidate,
- )
-
-
-def _offset(start, end):
- return "{}_{}".format(start, end)
diff --git a/bin/wiki_entity_linking/kb_creator.py b/bin/wiki_entity_linking/kb_creator.py
deleted file mode 100644
index 8691308e0..000000000
--- a/bin/wiki_entity_linking/kb_creator.py
+++ /dev/null
@@ -1,161 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import logging
-
-from spacy.kb import KnowledgeBase
-
-from bin.wiki_entity_linking.train_descriptions import EntityEncoder
-from bin.wiki_entity_linking import wiki_io as io
-
-
-logger = logging.getLogger(__name__)
-
-
-def create_kb(
- nlp,
- max_entities_per_alias,
- min_entity_freq,
- min_occ,
- entity_def_path,
- entity_descr_path,
- entity_alias_path,
- entity_freq_path,
- prior_prob_path,
- entity_vector_length,
-):
- # Create the knowledge base from Wikidata entries
- kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=entity_vector_length)
- entity_list, filtered_title_to_id = _define_entities(nlp, kb, entity_def_path, entity_descr_path, min_entity_freq, entity_freq_path, entity_vector_length)
- _define_aliases(kb, entity_alias_path, entity_list, filtered_title_to_id, max_entities_per_alias, min_occ, prior_prob_path)
- return kb
-
-
-def _define_entities(nlp, kb, entity_def_path, entity_descr_path, min_entity_freq, entity_freq_path, entity_vector_length):
- # read the mappings from file
- title_to_id = io.read_title_to_id(entity_def_path)
- id_to_descr = io.read_id_to_descr(entity_descr_path)
-
- # check the length of the nlp vectors
- if "vectors" in nlp.meta and nlp.vocab.vectors.size:
- input_dim = nlp.vocab.vectors_length
- logger.info("Loaded pretrained vectors of size %s" % input_dim)
- else:
- raise ValueError(
- "The `nlp` object should have access to pretrained word vectors, "
- " cf. https://spacy.io/usage/models#languages."
- )
-
- logger.info("Filtering entities with fewer than {} mentions or no description".format(min_entity_freq))
- entity_frequencies = io.read_entity_to_count(entity_freq_path)
- # filter the entities for in the KB by frequency, because there's just too much data (8M entities) otherwise
- filtered_title_to_id, entity_list, description_list, frequency_list = get_filtered_entities(
- title_to_id,
- id_to_descr,
- entity_frequencies,
- min_entity_freq
- )
- logger.info("Kept {} entities from the set of {}".format(len(description_list), len(title_to_id.keys())))
-
- logger.info("Training entity encoder")
- encoder = EntityEncoder(nlp, input_dim, entity_vector_length)
- encoder.train(description_list=description_list, to_print=True)
-
- logger.info("Getting entity embeddings")
- embeddings = encoder.apply_encoder(description_list)
-
- logger.info("Adding {} entities".format(len(entity_list)))
- kb.set_entities(
- entity_list=entity_list, freq_list=frequency_list, vector_list=embeddings
- )
- return entity_list, filtered_title_to_id
-
-
-def _define_aliases(kb, entity_alias_path, entity_list, filtered_title_to_id, max_entities_per_alias, min_occ, prior_prob_path):
- logger.info("Adding aliases from Wikipedia and Wikidata")
- _add_aliases(
- kb,
- entity_list=entity_list,
- title_to_id=filtered_title_to_id,
- max_entities_per_alias=max_entities_per_alias,
- min_occ=min_occ,
- prior_prob_path=prior_prob_path,
- )
-
-
-def get_filtered_entities(title_to_id, id_to_descr, entity_frequencies,
- min_entity_freq: int = 10):
- filtered_title_to_id = dict()
- entity_list = []
- description_list = []
- frequency_list = []
- for title, entity in title_to_id.items():
- freq = entity_frequencies.get(title, 0)
- desc = id_to_descr.get(entity, None)
- if desc and freq > min_entity_freq:
- entity_list.append(entity)
- description_list.append(desc)
- frequency_list.append(freq)
- filtered_title_to_id[title] = entity
- return filtered_title_to_id, entity_list, description_list, frequency_list
-
-
-def _add_aliases(kb, entity_list, title_to_id, max_entities_per_alias, min_occ, prior_prob_path):
- wp_titles = title_to_id.keys()
-
- # adding aliases with prior probabilities
- # we can read this file sequentially, it's sorted by alias, and then by count
- logger.info("Adding WP aliases")
- with prior_prob_path.open("r", encoding="utf8") as prior_file:
- # skip header
- prior_file.readline()
- line = prior_file.readline()
- previous_alias = None
- total_count = 0
- counts = []
- entities = []
- while line:
- splits = line.replace("\n", "").split(sep="|")
- new_alias = splits[0]
- count = int(splits[1])
- entity = splits[2]
-
- if new_alias != previous_alias and previous_alias:
- # done reading the previous alias --> output
- if len(entities) > 0:
- selected_entities = []
- prior_probs = []
- for ent_count, ent_string in zip(counts, entities):
- if ent_string in wp_titles:
- wd_id = title_to_id[ent_string]
- p_entity_givenalias = ent_count / total_count
- selected_entities.append(wd_id)
- prior_probs.append(p_entity_givenalias)
-
- if selected_entities:
- try:
- kb.add_alias(
- alias=previous_alias,
- entities=selected_entities,
- probabilities=prior_probs,
- )
- except ValueError as e:
- logger.error(e)
- total_count = 0
- counts = []
- entities = []
-
- total_count += count
-
- if len(entities) < max_entities_per_alias and count >= min_occ:
- counts.append(count)
- entities.append(entity)
- previous_alias = new_alias
-
- line = prior_file.readline()
-
-
-def read_kb(nlp, kb_file):
- kb = KnowledgeBase(vocab=nlp.vocab)
- kb.load_bulk(kb_file)
- return kb
diff --git a/bin/wiki_entity_linking/train_descriptions.py b/bin/wiki_entity_linking/train_descriptions.py
deleted file mode 100644
index b0cfbb4c6..000000000
--- a/bin/wiki_entity_linking/train_descriptions.py
+++ /dev/null
@@ -1,145 +0,0 @@
-from random import shuffle
-
-import logging
-import numpy as np
-
-from thinc.api import Model, chain, CosineDistance, Linear
-
-from spacy.util import create_default_optimizer
-
-logger = logging.getLogger(__name__)
-
-
-class EntityEncoder:
- """
- Train the embeddings of entity descriptions to fit a fixed-size entity vector (e.g. 64D).
- This entity vector will be stored in the KB, for further downstream use in the entity model.
- """
-
- DROP = 0
- BATCH_SIZE = 1000
-
- # Set min. acceptable loss to avoid a 'mean of empty slice' warning by numpy
- MIN_LOSS = 0.01
-
- # Reasonable default to stop training when things are not improving
- MAX_NO_IMPROVEMENT = 20
-
- def __init__(self, nlp, input_dim, desc_width, epochs=5):
- self.nlp = nlp
- self.input_dim = input_dim
- self.desc_width = desc_width
- self.epochs = epochs
- self.distance = CosineDistance(ignore_zeros=True, normalize=False)
-
- def apply_encoder(self, description_list):
- if self.encoder is None:
- raise ValueError("Can not apply encoder before training it")
-
- batch_size = 100000
-
- start = 0
- stop = min(batch_size, len(description_list))
- encodings = []
-
- while start < len(description_list):
- docs = list(self.nlp.pipe(description_list[start:stop]))
- doc_embeddings = [self._get_doc_embedding(doc) for doc in docs]
- enc = self.encoder(np.asarray(doc_embeddings))
- encodings.extend(enc.tolist())
-
- start = start + batch_size
- stop = min(stop + batch_size, len(description_list))
- logger.info("Encoded: {} entities".format(stop))
-
- return encodings
-
- def train(self, description_list, to_print=False):
- processed, loss = self._train_model(description_list)
- if to_print:
- logger.info(
- "Trained entity descriptions on {} ".format(processed) +
- "(non-unique) descriptions across {} ".format(self.epochs) +
- "epochs"
- )
- logger.info("Final loss: {}".format(loss))
-
- def _train_model(self, description_list):
- best_loss = 1.0
- iter_since_best = 0
- self._build_network(self.input_dim, self.desc_width)
-
- processed = 0
- loss = 1
- # copy this list so that shuffling does not affect other functions
- descriptions = description_list.copy()
- to_continue = True
-
- for i in range(self.epochs):
- shuffle(descriptions)
-
- batch_nr = 0
- start = 0
- stop = min(self.BATCH_SIZE, len(descriptions))
-
- while to_continue and start < len(descriptions):
- batch = []
- for descr in descriptions[start:stop]:
- doc = self.nlp(descr)
- doc_vector = self._get_doc_embedding(doc)
- batch.append(doc_vector)
-
- loss = self._update(batch)
- if batch_nr % 25 == 0:
- logger.info("loss: {} ".format(loss))
- processed += len(batch)
-
- # in general, continue training if we haven't reached our ideal min yet
- to_continue = loss > self.MIN_LOSS
-
- # store the best loss and track how long it's been
- if loss < best_loss:
- best_loss = loss
- iter_since_best = 0
- else:
- iter_since_best += 1
-
- # stop learning if we haven't seen improvement since the last few iterations
- if iter_since_best > self.MAX_NO_IMPROVEMENT:
- to_continue = False
-
- batch_nr += 1
- start = start + self.BATCH_SIZE
- stop = min(stop + self.BATCH_SIZE, len(descriptions))
-
- return processed, loss
-
- @staticmethod
- def _get_doc_embedding(doc):
- indices = np.zeros((len(doc),), dtype="i")
- for i, word in enumerate(doc):
- if word.orth in doc.vocab.vectors.key2row:
- indices[i] = doc.vocab.vectors.key2row[word.orth]
- else:
- indices[i] = 0
- word_vectors = doc.vocab.vectors.data[indices]
- doc_vector = np.mean(word_vectors, axis=0)
- return doc_vector
-
- def _build_network(self, orig_width, hidden_with):
- with Model.define_operators({">>": chain}):
- # very simple encoder-decoder model
- self.encoder = Linear(hidden_with, orig_width)
- # TODO: removed the zero_init here - is oK?
- self.model = self.encoder >> Linear(orig_width, hidden_with)
- self.sgd = create_default_optimizer()
-
- def _update(self, vectors):
- truths = self.model.ops.asarray(vectors)
- predictions, bp_model = self.model.begin_update(
- truths, drop=self.DROP
- )
- d_scores, loss = self.distance(predictions, truths)
- bp_model(d_scores, sgd=self.sgd)
- return loss / len(vectors)
-
diff --git a/bin/wiki_entity_linking/wiki_io.py b/bin/wiki_entity_linking/wiki_io.py
deleted file mode 100644
index 43ae87f0f..000000000
--- a/bin/wiki_entity_linking/wiki_io.py
+++ /dev/null
@@ -1,127 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import sys
-import csv
-
-# min() needed to prevent error on windows, cf https://stackoverflow.com/questions/52404416/
-csv.field_size_limit(min(sys.maxsize, 2147483646))
-
-""" This class provides reading/writing methods for temp files """
-
-
-# Entity definition: WP title -> WD ID #
-def write_title_to_id(entity_def_output, title_to_id):
- with entity_def_output.open("w", encoding="utf8") as id_file:
- id_file.write("WP_title" + "|" + "WD_id" + "\n")
- for title, qid in title_to_id.items():
- id_file.write(title + "|" + str(qid) + "\n")
-
-
-def read_title_to_id(entity_def_output):
- title_to_id = dict()
- with entity_def_output.open("r", encoding="utf8") as id_file:
- csvreader = csv.reader(id_file, delimiter="|")
- # skip header
- next(csvreader)
- for row in csvreader:
- title_to_id[row[0]] = row[1]
- return title_to_id
-
-
-# Entity aliases from WD: WD ID -> WD alias #
-def write_id_to_alias(entity_alias_path, id_to_alias):
- with entity_alias_path.open("w", encoding="utf8") as alias_file:
- alias_file.write("WD_id" + "|" + "alias" + "\n")
- for qid, alias_list in id_to_alias.items():
- for alias in alias_list:
- alias_file.write(str(qid) + "|" + alias + "\n")
-
-
-def read_id_to_alias(entity_alias_path):
- id_to_alias = dict()
- with entity_alias_path.open("r", encoding="utf8") as alias_file:
- csvreader = csv.reader(alias_file, delimiter="|")
- # skip header
- next(csvreader)
- for row in csvreader:
- qid = row[0]
- alias = row[1]
- alias_list = id_to_alias.get(qid, [])
- alias_list.append(alias)
- id_to_alias[qid] = alias_list
- return id_to_alias
-
-
-def read_alias_to_id_generator(entity_alias_path):
- """ Read (aliases, qid) tuples """
-
- with entity_alias_path.open("r", encoding="utf8") as alias_file:
- csvreader = csv.reader(alias_file, delimiter="|")
- # skip header
- next(csvreader)
- for row in csvreader:
- qid = row[0]
- alias = row[1]
- yield alias, qid
-
-
-# Entity descriptions from WD: WD ID -> WD alias #
-def write_id_to_descr(entity_descr_output, id_to_descr):
- with entity_descr_output.open("w", encoding="utf8") as descr_file:
- descr_file.write("WD_id" + "|" + "description" + "\n")
- for qid, descr in id_to_descr.items():
- descr_file.write(str(qid) + "|" + descr + "\n")
-
-
-def read_id_to_descr(entity_desc_path):
- id_to_desc = dict()
- with entity_desc_path.open("r", encoding="utf8") as descr_file:
- csvreader = csv.reader(descr_file, delimiter="|")
- # skip header
- next(csvreader)
- for row in csvreader:
- id_to_desc[row[0]] = row[1]
- return id_to_desc
-
-
-# Entity counts from WP: WP title -> count #
-def write_entity_to_count(prior_prob_input, count_output):
- # Write entity counts for quick access later
- entity_to_count = dict()
- total_count = 0
-
- with prior_prob_input.open("r", encoding="utf8") as prior_file:
- # skip header
- prior_file.readline()
- line = prior_file.readline()
-
- while line:
- splits = line.replace("\n", "").split(sep="|")
- # alias = splits[0]
- count = int(splits[1])
- entity = splits[2]
-
- current_count = entity_to_count.get(entity, 0)
- entity_to_count[entity] = current_count + count
-
- total_count += count
-
- line = prior_file.readline()
-
- with count_output.open("w", encoding="utf8") as entity_file:
- entity_file.write("entity" + "|" + "count" + "\n")
- for entity, count in entity_to_count.items():
- entity_file.write(entity + "|" + str(count) + "\n")
-
-
-def read_entity_to_count(count_input):
- entity_to_count = dict()
- with count_input.open("r", encoding="utf8") as csvfile:
- csvreader = csv.reader(csvfile, delimiter="|")
- # skip header
- next(csvreader)
- for row in csvreader:
- entity_to_count[row[0]] = int(row[1])
-
- return entity_to_count
diff --git a/bin/wiki_entity_linking/wiki_namespaces.py b/bin/wiki_entity_linking/wiki_namespaces.py
deleted file mode 100644
index e8f099ccd..000000000
--- a/bin/wiki_entity_linking/wiki_namespaces.py
+++ /dev/null
@@ -1,128 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-# List of meta pages in Wikidata, should be kept out of the Knowledge base
-WD_META_ITEMS = [
- "Q163875",
- "Q191780",
- "Q224414",
- "Q4167836",
- "Q4167410",
- "Q4663903",
- "Q11266439",
- "Q13406463",
- "Q15407973",
- "Q18616576",
- "Q19887878",
- "Q22808320",
- "Q23894233",
- "Q33120876",
- "Q42104522",
- "Q47460393",
- "Q64875536",
- "Q66480449",
-]
-
-
-# TODO: add more cases from non-English WP's
-
-# List of prefixes that refer to Wikipedia "file" pages
-WP_FILE_NAMESPACE = ["Bestand", "File"]
-
-# List of prefixes that refer to Wikipedia "category" pages
-WP_CATEGORY_NAMESPACE = ["Kategori", "Category", "Categorie"]
-
-# List of prefixes that refer to Wikipedia "meta" pages
-# these will/should be matched ignoring case
-WP_META_NAMESPACE = (
- WP_FILE_NAMESPACE
- + WP_CATEGORY_NAMESPACE
- + [
- "b",
- "betawikiversity",
- "Book",
- "c",
- "Commons",
- "d",
- "dbdump",
- "download",
- "Draft",
- "Education",
- "Foundation",
- "Gadget",
- "Gadget definition",
- "Gebruiker",
- "gerrit",
- "Help",
- "Image",
- "Incubator",
- "m",
- "mail",
- "mailarchive",
- "media",
- "MediaWiki",
- "MediaWiki talk",
- "Mediawikiwiki",
- "MediaZilla",
- "Meta",
- "Metawikipedia",
- "Module",
- "mw",
- "n",
- "nost",
- "oldwikisource",
- "otrs",
- "OTRSwiki",
- "Overleg gebruiker",
- "outreach",
- "outreachwiki",
- "Portal",
- "phab",
- "Phabricator",
- "Project",
- "q",
- "quality",
- "rev",
- "s",
- "spcom",
- "Special",
- "species",
- "Strategy",
- "sulutil",
- "svn",
- "Talk",
- "Template",
- "Template talk",
- "Testwiki",
- "ticket",
- "TimedText",
- "Toollabs",
- "tools",
- "tswiki",
- "User",
- "User talk",
- "v",
- "voy",
- "w",
- "Wikibooks",
- "Wikidata",
- "wikiHow",
- "Wikinvest",
- "wikilivres",
- "Wikimedia",
- "Wikinews",
- "Wikipedia",
- "Wikipedia talk",
- "Wikiquote",
- "Wikisource",
- "Wikispecies",
- "Wikitech",
- "Wikiversity",
- "Wikivoyage",
- "wikt",
- "wiktionary",
- "wmf",
- "wmania",
- "WP",
- ]
-)
diff --git a/bin/wiki_entity_linking/wikidata_pretrain_kb.py b/bin/wiki_entity_linking/wikidata_pretrain_kb.py
deleted file mode 100644
index 003074feb..000000000
--- a/bin/wiki_entity_linking/wikidata_pretrain_kb.py
+++ /dev/null
@@ -1,179 +0,0 @@
-# coding: utf-8
-"""Script to process Wikipedia and Wikidata dumps and create a knowledge base (KB)
-with specific parameters. Intermediate files are written to disk.
-
-Running the full pipeline on a standard laptop, may take up to 13 hours of processing.
-Use the -p, -d and -s options to speed up processing using the intermediate files
-from a previous run.
-
-For the Wikidata dump: get the latest-all.json.bz2 from https://dumps.wikimedia.org/wikidatawiki/entities/
-For the Wikipedia dump: get enwiki-latest-pages-articles-multistream.xml.bz2
-from https://dumps.wikimedia.org/enwiki/latest/
-
-"""
-from __future__ import unicode_literals
-
-import logging
-from pathlib import Path
-import plac
-
-from bin.wiki_entity_linking import wikipedia_processor as wp, wikidata_processor as wd
-from bin.wiki_entity_linking import wiki_io as io
-from bin.wiki_entity_linking import kb_creator
-from bin.wiki_entity_linking import TRAINING_DATA_FILE, KB_FILE, ENTITY_DESCR_PATH, KB_MODEL_DIR, LOG_FORMAT
-from bin.wiki_entity_linking import ENTITY_FREQ_PATH, PRIOR_PROB_PATH, ENTITY_DEFS_PATH, ENTITY_ALIAS_PATH
-import spacy
-from bin.wiki_entity_linking.kb_creator import read_kb
-
-logger = logging.getLogger(__name__)
-
-
-@plac.annotations(
- wd_json=("Path to the downloaded WikiData JSON dump.", "positional", None, Path),
- wp_xml=("Path to the downloaded Wikipedia XML dump.", "positional", None, Path),
- output_dir=("Output directory", "positional", None, Path),
- model=("Model name or path, should include pretrained vectors.", "positional", None, str),
- max_per_alias=("Max. # entities per alias (default 10)", "option", "a", int),
- min_freq=("Min. count of an entity in the corpus (default 20)", "option", "f", int),
- min_pair=("Min. count of entity-alias pairs (default 5)", "option", "c", int),
- entity_vector_length=("Length of entity vectors (default 64)", "option", "v", int),
- loc_prior_prob=("Location to file with prior probabilities", "option", "p", Path),
- loc_entity_defs=("Location to file with entity definitions", "option", "d", Path),
- loc_entity_desc=("Location to file with entity descriptions", "option", "s", Path),
- descr_from_wp=("Flag for using descriptions from WP instead of WD (default False)", "flag", "wp"),
- limit_prior=("Threshold to limit lines read from WP for prior probabilities", "option", "lp", int),
- limit_train=("Threshold to limit lines read from WP for training set", "option", "lt", int),
- limit_wd=("Threshold to limit lines read from WD", "option", "lw", int),
- lang=("Optional language for which to get Wikidata titles. Defaults to 'en'", "option", "la", str),
-)
-def main(
- wd_json,
- wp_xml,
- output_dir,
- model,
- max_per_alias=10,
- min_freq=20,
- min_pair=5,
- entity_vector_length=64,
- loc_prior_prob=None,
- loc_entity_defs=None,
- loc_entity_alias=None,
- loc_entity_desc=None,
- descr_from_wp=False,
- limit_prior=None,
- limit_train=None,
- limit_wd=None,
- lang="en",
-):
- entity_defs_path = loc_entity_defs if loc_entity_defs else output_dir / ENTITY_DEFS_PATH
- entity_alias_path = loc_entity_alias if loc_entity_alias else output_dir / ENTITY_ALIAS_PATH
- entity_descr_path = loc_entity_desc if loc_entity_desc else output_dir / ENTITY_DESCR_PATH
- entity_freq_path = output_dir / ENTITY_FREQ_PATH
- prior_prob_path = loc_prior_prob if loc_prior_prob else output_dir / PRIOR_PROB_PATH
- training_entities_path = output_dir / TRAINING_DATA_FILE
- kb_path = output_dir / KB_FILE
-
- logger.info("Creating KB with Wikipedia and WikiData")
-
- # STEP 0: set up IO
- if not output_dir.exists():
- output_dir.mkdir(parents=True)
-
- # STEP 1: Load the NLP object
- logger.info("STEP 1: Loading NLP model {}".format(model))
- nlp = spacy.load(model)
-
- # check the length of the nlp vectors
- if "vectors" not in nlp.meta or not nlp.vocab.vectors.size:
- raise ValueError(
- "The `nlp` object should have access to pretrained word vectors, "
- " cf. https://spacy.io/usage/models#languages."
- )
-
- # STEP 2: create prior probabilities from WP
- if not prior_prob_path.exists():
- # It takes about 2h to process 1000M lines of Wikipedia XML dump
- logger.info("STEP 2: Writing prior probabilities to {}".format(prior_prob_path))
- if limit_prior is not None:
- logger.warning("Warning: reading only {} lines of Wikipedia dump".format(limit_prior))
- wp.read_prior_probs(wp_xml, prior_prob_path, limit=limit_prior)
- else:
- logger.info("STEP 2: Reading prior probabilities from {}".format(prior_prob_path))
-
- # STEP 3: calculate entity frequencies
- if not entity_freq_path.exists():
- logger.info("STEP 3: Calculating and writing entity frequencies to {}".format(entity_freq_path))
- io.write_entity_to_count(prior_prob_path, entity_freq_path)
- else:
- logger.info("STEP 3: Reading entity frequencies from {}".format(entity_freq_path))
-
- # STEP 4: reading definitions and (possibly) descriptions from WikiData or from file
- if (not entity_defs_path.exists()) or (not descr_from_wp and not entity_descr_path.exists()):
- # It takes about 10h to process 55M lines of Wikidata JSON dump
- logger.info("STEP 4: Parsing and writing Wikidata entity definitions to {}".format(entity_defs_path))
- if limit_wd is not None:
- logger.warning("Warning: reading only {} lines of Wikidata dump".format(limit_wd))
- title_to_id, id_to_descr, id_to_alias = wd.read_wikidata_entities_json(
- wd_json,
- limit_wd,
- to_print=False,
- lang=lang,
- parse_descr=(not descr_from_wp),
- )
- io.write_title_to_id(entity_defs_path, title_to_id)
-
- logger.info("STEP 4b: Writing Wikidata entity aliases to {}".format(entity_alias_path))
- io.write_id_to_alias(entity_alias_path, id_to_alias)
-
- if not descr_from_wp:
- logger.info("STEP 4c: Writing Wikidata entity descriptions to {}".format(entity_descr_path))
- io.write_id_to_descr(entity_descr_path, id_to_descr)
- else:
- logger.info("STEP 4: Reading entity definitions from {}".format(entity_defs_path))
- logger.info("STEP 4b: Reading entity aliases from {}".format(entity_alias_path))
- if not descr_from_wp:
- logger.info("STEP 4c: Reading entity descriptions from {}".format(entity_descr_path))
-
- # STEP 5: Getting gold entities from Wikipedia
- if (not training_entities_path.exists()) or (descr_from_wp and not entity_descr_path.exists()):
- logger.info("STEP 5: Parsing and writing Wikipedia gold entities to {}".format(training_entities_path))
- if limit_train is not None:
- logger.warning("Warning: reading only {} lines of Wikipedia dump".format(limit_train))
- wp.create_training_and_desc(wp_xml, entity_defs_path, entity_descr_path,
- training_entities_path, descr_from_wp, limit_train)
- if descr_from_wp:
- logger.info("STEP 5b: Parsing and writing Wikipedia descriptions to {}".format(entity_descr_path))
- else:
- logger.info("STEP 5: Reading gold entities from {}".format(training_entities_path))
- if descr_from_wp:
- logger.info("STEP 5b: Reading entity descriptions from {}".format(entity_descr_path))
-
- # STEP 6: creating the actual KB
- # It takes ca. 30 minutes to pretrain the entity embeddings
- if not kb_path.exists():
- logger.info("STEP 6: Creating the KB at {}".format(kb_path))
- kb = kb_creator.create_kb(
- nlp=nlp,
- max_entities_per_alias=max_per_alias,
- min_entity_freq=min_freq,
- min_occ=min_pair,
- entity_def_path=entity_defs_path,
- entity_descr_path=entity_descr_path,
- entity_alias_path=entity_alias_path,
- entity_freq_path=entity_freq_path,
- prior_prob_path=prior_prob_path,
- entity_vector_length=entity_vector_length,
- )
- kb.dump(kb_path)
- logger.info("kb entities: {}".format(kb.get_size_entities()))
- logger.info("kb aliases: {}".format(kb.get_size_aliases()))
- nlp.to_disk(output_dir / KB_MODEL_DIR)
- else:
- logger.info("STEP 6: KB already exists at {}".format(kb_path))
-
- logger.info("Done!")
-
-
-if __name__ == "__main__":
- logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)
- plac.call(main)
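Since the deleted script was driven by `plac`, its `main` function could also be invoked directly from Python. A minimal sketch with hypothetical dump locations and limits (not taken from the patch; the module path refers to the pre-patch layout under `bin/wiki_entity_linking/`):

```python
# before this patch, the entry point lived at bin/wiki_entity_linking/wikidata_pretrain_kb.py
import logging
from pathlib import Path

from bin.wiki_entity_linking import LOG_FORMAT
from bin.wiki_entity_linking.wikidata_pretrain_kb import main

logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)

main(
    Path("/data/wikidata/latest-all.json.bz2"),      # hypothetical dump locations
    Path("/data/wikipedia/enwiki-latest-pages-articles-multistream.xml.bz2"),
    Path("/data/kb_output"),
    "en_core_web_lg",            # a model that ships pretrained vectors
    limit_prior=20_000_000,      # optional caps for a faster trial run
    limit_train=20_000_000,
    limit_wd=5_000_000,
)
```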
diff --git a/bin/wiki_entity_linking/wikidata_processor.py b/bin/wiki_entity_linking/wikidata_processor.py
deleted file mode 100644
index 8a070f567..000000000
--- a/bin/wiki_entity_linking/wikidata_processor.py
+++ /dev/null
@@ -1,154 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import bz2
-import json
-import logging
-
-from bin.wiki_entity_linking.wiki_namespaces import WD_META_ITEMS
-
-logger = logging.getLogger(__name__)
-
-
-def read_wikidata_entities_json(wikidata_file, limit=None, to_print=False, lang="en", parse_descr=True):
- # Read the JSON wiki data and parse out the entities. Takes about 7-10h to parse 55M lines.
- # get latest-all.json.bz2 from https://dumps.wikimedia.org/wikidatawiki/entities/
-
- site_filter = '{}wiki'.format(lang)
-
- # filter: currently defined as OR: one hit suffices to be removed from further processing
- exclude_list = WD_META_ITEMS
-
- # punctuation
- exclude_list.extend(["Q1383557", "Q10617810"])
-
- # letters etc
- exclude_list.extend(["Q188725", "Q19776628", "Q3841820", "Q17907810", "Q9788", "Q9398093"])
-
- neg_prop_filter = {
- 'P31': exclude_list, # instance of
- 'P279': exclude_list # subclass
- }
-
- title_to_id = dict()
- id_to_descr = dict()
- id_to_alias = dict()
-
- # parse appropriate fields - depending on what we need in the KB
- parse_properties = False
- parse_sitelinks = True
- parse_labels = False
- parse_aliases = True
- parse_claims = True
-
- with bz2.open(wikidata_file, mode='rb') as file:
- for cnt, line in enumerate(file):
- if limit and cnt >= limit:
- break
- if cnt % 500000 == 0 and cnt > 0:
- logger.info("processed {} lines of WikiData JSON dump".format(cnt))
- clean_line = line.strip()
- if clean_line.endswith(b","):
- clean_line = clean_line[:-1]
- if len(clean_line) > 1:
- obj = json.loads(clean_line)
- entry_type = obj["type"]
-
- if entry_type == "item":
- keep = True
-
- claims = obj["claims"]
- if parse_claims:
- for prop, value_set in neg_prop_filter.items():
- claim_property = claims.get(prop, None)
- if claim_property:
- for cp in claim_property:
- cp_id = (
- cp["mainsnak"]
- .get("datavalue", {})
- .get("value", {})
- .get("id")
- )
- cp_rank = cp["rank"]
- if cp_rank != "deprecated" and cp_id in value_set:
- keep = False
-
- if keep:
- unique_id = obj["id"]
-
- if to_print:
- print("ID:", unique_id)
- print("type:", entry_type)
-
- # parsing all properties that refer to other entities
- if parse_properties:
- for prop, claim_property in claims.items():
- cp_dicts = [
- cp["mainsnak"]["datavalue"].get("value")
- for cp in claim_property
- if cp["mainsnak"].get("datavalue")
- ]
- cp_values = [
- cp_dict.get("id")
- for cp_dict in cp_dicts
- if isinstance(cp_dict, dict)
- if cp_dict.get("id") is not None
- ]
- if cp_values:
- if to_print:
- print("prop:", prop, cp_values)
-
- found_link = False
- if parse_sitelinks:
- site_value = obj["sitelinks"].get(site_filter, None)
- if site_value:
- site = site_value["title"]
- if to_print:
- print(site_filter, ":", site)
- title_to_id[site] = unique_id
- found_link = True
-
- if parse_labels:
- labels = obj["labels"]
- if labels:
- lang_label = labels.get(lang, None)
- if lang_label:
- if to_print:
- print(
- "label (" + lang + "):", lang_label["value"]
- )
-
- if found_link and parse_descr:
- descriptions = obj["descriptions"]
- if descriptions:
- lang_descr = descriptions.get(lang, None)
- if lang_descr:
- if to_print:
- print(
- "description (" + lang + "):",
- lang_descr["value"],
- )
- id_to_descr[unique_id] = lang_descr["value"]
-
- if parse_aliases:
- aliases = obj["aliases"]
- if aliases:
- lang_aliases = aliases.get(lang, None)
- if lang_aliases:
- for item in lang_aliases:
- if to_print:
- print(
- "alias (" + lang + "):", item["value"]
- )
- alias_list = id_to_alias.get(unique_id, [])
- alias_list.append(item["value"])
- id_to_alias[unique_id] = alias_list
-
- if to_print:
- print()
-
- # log final number of lines processed
- logger.info("Finished. Processed {} lines of WikiData JSON dump".format(cnt))
- return title_to_id, id_to_descr, id_to_alias
-
-
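The parser above walks the Wikidata dump line by line: the `latest-all.json.bz2` file is one huge JSON array in which each entity sits on its own line, terminated by a comma. A small self-contained sketch of just that reading pattern (generic stdlib code, not part of the deleted module):

```python
import bz2
import json

def iter_wikidata_items(path, limit=None):
    """Yield parsed Wikidata objects from a latest-all.json.bz2 dump, one per line.

    Mirrors the line-by-line pattern of read_wikidata_entities_json above.
    """
    with bz2.open(path, mode="rb") as f:
        for cnt, line in enumerate(f):
            if limit and cnt >= limit:
                break
            clean_line = line.strip()
            if clean_line.endswith(b","):
                clean_line = clean_line[:-1]     # drop the trailing comma of the JSON array
            if len(clean_line) > 1:              # skip the surrounding "[" and "]" lines
                yield json.loads(clean_line)

# usage (hypothetical path):
# for obj in iter_wikidata_items("latest-all.json.bz2", limit=1000):
#     if obj["type"] == "item":
#         print(obj["id"])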
diff --git a/bin/wiki_entity_linking/wikidata_train_entity_linker.py b/bin/wiki_entity_linking/wikidata_train_entity_linker.py
deleted file mode 100644
index af0e68768..000000000
--- a/bin/wiki_entity_linking/wikidata_train_entity_linker.py
+++ /dev/null
@@ -1,230 +0,0 @@
-# coding: utf-8
-"""Script that takes a previously created Knowledge Base and trains an entity linking
-pipeline. The provided KB directory should hold the kb, the original nlp object and
-its vocab used to create the KB, and a few auxiliary files such as the entity definitions,
-as created by the script `wikidata_create_kb`.
-
-For the Wikipedia dump: get enwiki-latest-pages-articles-multistream.xml.bz2
-from https://dumps.wikimedia.org/enwiki/latest/
-"""
-from __future__ import unicode_literals
-
-import random
-import logging
-import spacy
-from pathlib import Path
-import plac
-from tqdm import tqdm
-
-from bin.wiki_entity_linking import wikipedia_processor
-from bin.wiki_entity_linking import (
- TRAINING_DATA_FILE,
- KB_MODEL_DIR,
- KB_FILE,
- LOG_FORMAT,
- OUTPUT_MODEL_DIR,
-)
-from bin.wiki_entity_linking.entity_linker_evaluation import measure_performance
-from bin.wiki_entity_linking.kb_creator import read_kb
-
-from spacy.util import minibatch, compounding
-
-logger = logging.getLogger(__name__)
-
-
-@plac.annotations(
- dir_kb=("Directory with KB, NLP and related files", "positional", None, Path),
- output_dir=("Output directory", "option", "o", Path),
- loc_training=("Location to training data", "option", "k", Path),
- epochs=("Number of training iterations (default 10)", "option", "e", int),
- dropout=("Dropout to prevent overfitting (default 0.5)", "option", "p", float),
- lr=("Learning rate (default 0.005)", "option", "n", float),
- l2=("L2 regularization", "option", "r", float),
- train_articles=("# training articles (default 90% of all)", "option", "t", int),
- dev_articles=("# dev test articles (default 10% of all)", "option", "d", int),
- labels_discard=("NER labels to discard (default None)", "option", "l", str),
-)
-def main(
- dir_kb,
- output_dir=None,
- loc_training=None,
- epochs=10,
- dropout=0.5,
- lr=0.005,
- l2=1e-6,
- train_articles=None,
- dev_articles=None,
- labels_discard=None,
-):
- if not output_dir:
- logger.warning(
- "No output dir specified so no results will be written, are you sure about this ?"
- )
-
- logger.info("Creating Entity Linker with Wikipedia and WikiData")
-
- output_dir = Path(output_dir) if output_dir else dir_kb
- training_path = loc_training if loc_training else dir_kb / TRAINING_DATA_FILE
- nlp_dir = dir_kb / KB_MODEL_DIR
- kb_path = dir_kb / KB_FILE
- nlp_output_dir = output_dir / OUTPUT_MODEL_DIR
-
- # STEP 0: set up IO
- if not output_dir.exists():
- output_dir.mkdir()
-
- # STEP 1 : load the NLP object
- logger.info("STEP 1a: Loading model from {}".format(nlp_dir))
- nlp = spacy.load(nlp_dir)
- logger.info(
- "Original NLP pipeline has following pipeline components: {}".format(
- nlp.pipe_names
- )
- )
-
- # check that there is a NER component in the pipeline
- if "ner" not in nlp.pipe_names:
- raise ValueError("The `nlp` object should have a pretrained `ner` component.")
-
- logger.info("STEP 1b: Loading KB from {}".format(kb_path))
- kb = read_kb(nlp, kb_path)
-
- # STEP 2: read the training dataset previously created from WP
- logger.info("STEP 2: Reading training & dev dataset from {}".format(training_path))
- train_indices, dev_indices = wikipedia_processor.read_training_indices(
- training_path
- )
- logger.info(
- "Training set has {} articles, limit set to roughly {} articles per epoch".format(
- len(train_indices), train_articles if train_articles else "all"
- )
- )
- logger.info(
- "Dev set has {} articles, limit set to rougly {} articles for evaluation".format(
- len(dev_indices), dev_articles if dev_articles else "all"
- )
- )
- if dev_articles:
- dev_indices = dev_indices[0:dev_articles]
-
- # STEP 3: create and train an entity linking pipe
- logger.info(
- "STEP 3: Creating and training an Entity Linking pipe for {} epochs".format(
- epochs
- )
- )
- if labels_discard:
- labels_discard = [x.strip() for x in labels_discard.split(",")]
- logger.info(
- "Discarding {} NER types: {}".format(len(labels_discard), labels_discard)
- )
- else:
- labels_discard = []
-
- el_pipe = nlp.create_pipe(
- name="entity_linker",
- config={
- "pretrained_vectors": nlp.vocab.vectors,
- "labels_discard": labels_discard,
- },
- )
- el_pipe.set_kb(kb)
- nlp.add_pipe(el_pipe, last=True)
-
- other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "entity_linker"]
- with nlp.disable_pipes(*other_pipes): # only train Entity Linking
- optimizer = nlp.begin_training()
- optimizer.learn_rate = lr
- optimizer.L2 = l2
-
- logger.info("Dev Baseline Accuracies:")
- dev_data = wikipedia_processor.read_el_docs_golds(
- nlp=nlp,
- entity_file_path=training_path,
- dev=True,
- line_ids=dev_indices,
- kb=kb,
- labels_discard=labels_discard,
- )
-
- measure_performance(
- dev_data, kb, el_pipe, baseline=True, context=False, dev_limit=len(dev_indices)
- )
-
- for itn in range(epochs):
- random.shuffle(train_indices)
- losses = {}
- batches = minibatch(train_indices, size=compounding(8.0, 128.0, 1.001))
- batchnr = 0
- articles_processed = 0
-
- # we either process the whole training file, or just a part each epoch
- bar_total = len(train_indices)
- if train_articles:
- bar_total = train_articles
-
- with tqdm(total=bar_total, leave=False, desc=f"Epoch {itn}") as pbar:
- for batch in batches:
- if not train_articles or articles_processed < train_articles:
- with nlp.disable_pipes("entity_linker"):
- train_batch = wikipedia_processor.read_el_docs_golds(
- nlp=nlp,
- entity_file_path=training_path,
- dev=False,
- line_ids=batch,
- kb=kb,
- labels_discard=labels_discard,
- )
- try:
- with nlp.disable_pipes(*other_pipes):
- nlp.update(
- examples=train_batch,
- sgd=optimizer,
- drop=dropout,
- losses=losses,
- )
- batchnr += 1
- articles_processed += len(docs)
- pbar.update(len(docs))
- except Exception as e:
- logger.error("Error updating batch:" + str(e))
- if batchnr > 0:
- logging.info(
- "Epoch {} trained on {} articles, train loss {}".format(
- itn, articles_processed, round(losses["entity_linker"] / batchnr, 2)
- )
- )
- # re-read the dev_data (data is returned as a generator)
- dev_data = wikipedia_processor.read_el_docs_golds(
- nlp=nlp,
- entity_file_path=training_path,
- dev=True,
- line_ids=dev_indices,
- kb=kb,
- labels_discard=labels_discard,
- )
- measure_performance(
- dev_data,
- kb,
- el_pipe,
- baseline=False,
- context=True,
- dev_limit=len(dev_indices),
- )
-
- if output_dir:
- # STEP 4: write the NLP pipeline (now including an EL model) to file
- logger.info(
- "Final NLP pipeline has following pipeline components: {}".format(
- nlp.pipe_names
- )
- )
- logger.info("STEP 4: Writing trained NLP to {}".format(nlp_output_dir))
- nlp.to_disk(nlp_output_dir)
-
- logger.info("Done!")
-
-
-if __name__ == "__main__":
- logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)
- plac.call(main)
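The training loop above batches the line indices of the gold file with `minibatch` and a `compounding` size schedule from `spacy.util`. A minimal stand-alone sketch of just that batching scheme (toy data, not from the patch):

```python
import random
from spacy.util import minibatch, compounding

# toy stand-in for the list of training-line indices read from the gold file
train_indices = list(range(1000))

random.shuffle(train_indices)
# batch sizes grow from 8 to 128, multiplied by 1.001 after every batch,
# matching the schedule used in the loop above
batches = minibatch(train_indices, size=compounding(8.0, 128.0, 1.001))

for batch in batches:
    # each batch is a list of line ids; the real script turns them into
    # (doc, gold) pairs via wikipedia_processor.read_el_docs_golds before nlp.update
    pass
```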
diff --git a/bin/wiki_entity_linking/wikipedia_processor.py b/bin/wiki_entity_linking/wikipedia_processor.py
deleted file mode 100644
index 315b1e916..000000000
--- a/bin/wiki_entity_linking/wikipedia_processor.py
+++ /dev/null
@@ -1,565 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import re
-import bz2
-import logging
-import random
-import json
-
-from spacy.gold import GoldParse
-from bin.wiki_entity_linking import wiki_io as io
-from bin.wiki_entity_linking.wiki_namespaces import (
- WP_META_NAMESPACE,
- WP_FILE_NAMESPACE,
- WP_CATEGORY_NAMESPACE,
-)
-
-"""
-Process a Wikipedia dump to calculate entity frequencies and prior probabilities in combination with certain mentions.
-Write these results to file for downstream KB and training data generation.
-
-Process Wikipedia interlinks to generate a training dataset for the EL algorithm.
-"""
-
-ENTITY_FILE = "gold_entities.csv"
-
-map_alias_to_link = dict()
-
-logger = logging.getLogger(__name__)
-
-title_regex = re.compile(r"(?<=<title>).*(?=</title>)")
-id_regex = re.compile(r"(?<=<id>)\d*(?=</id>)")
-text_regex = re.compile(r"(?<=).*(?= 0:
- logger.info("processed {} lines of Wikipedia XML dump".format(cnt))
- clean_line = line.strip().decode("utf-8")
-
- # we attempt at reading the article's ID (but not the revision or contributor ID)
- if "" in clean_line or "" in clean_line:
- read_id = False
- if "" in clean_line:
- read_id = True
-
- if read_id:
- ids = id_regex.search(clean_line)
- if ids:
- current_article_id = ids[0]
-
- # only processing prior probabilities from true training (non-dev) articles
- if not is_dev(current_article_id):
- aliases, entities, normalizations = get_wp_links(clean_line)
- for alias, entity, norm in zip(aliases, entities, normalizations):
- _store_alias(
- alias, entity, normalize_alias=norm, normalize_entity=True
- )
-
- line = file.readline()
- cnt += 1
- logger.info("processed {} lines of Wikipedia XML dump".format(cnt))
- logger.info("Finished. processed {} lines of Wikipedia XML dump".format(cnt))
-
- # write all aliases and their entities and count occurrences to file
- with prior_prob_output.open("w", encoding="utf8") as outputfile:
- outputfile.write("alias" + "|" + "count" + "|" + "entity" + "\n")
- for alias, alias_dict in sorted(map_alias_to_link.items(), key=lambda x: x[0]):
- s_dict = sorted(alias_dict.items(), key=lambda x: x[1], reverse=True)
- for entity, count in s_dict:
- outputfile.write(alias + "|" + str(count) + "|" + entity + "\n")
-
-
-def _store_alias(alias, entity, normalize_alias=False, normalize_entity=True):
- alias = alias.strip()
- entity = entity.strip()
-
- # remove everything after # as this is not part of the title but refers to a specific paragraph
- if normalize_entity:
- # wikipedia titles are always capitalized
- entity = _capitalize_first(entity.split("#")[0])
- if normalize_alias:
- alias = alias.split("#")[0]
-
- if alias and entity:
- alias_dict = map_alias_to_link.get(alias, dict())
- entity_count = alias_dict.get(entity, 0)
- alias_dict[entity] = entity_count + 1
- map_alias_to_link[alias] = alias_dict
-
-
-def get_wp_links(text):
- aliases = []
- entities = []
- normalizations = []
-
- matches = link_regex.findall(text)
- for match in matches:
- match = match[2:][:-2].replace("_", " ").strip()
-
- if ns_regex.match(match):
- pass # ignore the entity if it points to a "meta" page
-
- # this is a simple [[link]], with the alias the same as the mention
- elif "|" not in match:
- aliases.append(match)
- entities.append(match)
- normalizations.append(True)
-
- # in wiki format, the link is written as [[entity|alias]]
- else:
- splits = match.split("|")
- entity = splits[0].strip()
- alias = splits[1].strip()
- # specific wiki format [[alias (specification)|]]
- if len(alias) == 0 and "(" in entity:
- alias = entity.split("(")[0]
- aliases.append(alias)
- entities.append(entity)
- normalizations.append(False)
- else:
- aliases.append(alias)
- entities.append(entity)
- normalizations.append(False)
-
- return aliases, entities, normalizations
-
-
-def _capitalize_first(text):
- if not text:
- return None
- result = text[0].capitalize()
- if len(result) > 0:
- result += text[1:]
- return result
-
-
-def create_training_and_desc(
- wp_input, def_input, desc_output, training_output, parse_desc, limit=None
-):
- wp_to_id = io.read_title_to_id(def_input)
- _process_wikipedia_texts(
- wp_input, wp_to_id, desc_output, training_output, parse_desc, limit
- )
-
-
-def _process_wikipedia_texts(
- wikipedia_input, wp_to_id, output, training_output, parse_descriptions, limit=None
-):
- """
- Read the XML wikipedia data to parse out training data:
- raw text data + positive instances
- """
-
- read_ids = set()
-
- with output.open("a", encoding="utf8") as descr_file, training_output.open(
- "w", encoding="utf8"
- ) as entity_file:
- if parse_descriptions:
- _write_training_description(descr_file, "WD_id", "description")
- with bz2.open(wikipedia_input, mode="rb") as file:
- article_count = 0
- article_text = ""
- article_title = None
- article_id = None
- reading_text = False
- reading_revision = False
-
- for line in file:
- clean_line = line.strip().decode("utf-8")
-
- if clean_line == "":
- reading_revision = True
- elif clean_line == " ":
- reading_revision = False
-
- # Start reading new page
-                if clean_line == "<page>":
- article_text = ""
- article_title = None
- article_id = None
- # finished reading this page
-                elif clean_line == "</page>":
- if article_id:
- clean_text, entities = _process_wp_text(
- article_title, article_text, wp_to_id
- )
- if clean_text is not None and entities is not None:
- _write_training_entities(
- entity_file, article_id, clean_text, entities
- )
-
- if article_title in wp_to_id and parse_descriptions:
- description = " ".join(
- clean_text[:1000].split(" ")[:-1]
- )
- _write_training_description(
- descr_file, wp_to_id[article_title], description
- )
- article_count += 1
- if article_count % 10000 == 0 and article_count > 0:
- logger.info(
- "Processed {} articles".format(article_count)
- )
- if limit and article_count >= limit:
- break
- article_text = ""
- article_title = None
- article_id = None
- reading_text = False
- reading_revision = False
-
- # start reading text within a page
- if "")
- clean_text = clean_text.replace(r""", '"')
- clean_text = clean_text.replace(r" ", " ")
- clean_text = clean_text.replace(r"&", "&")
-
- # remove multiple spaces
- while " " in clean_text:
- clean_text = clean_text.replace(" ", " ")
-
- return clean_text.strip()
-
-
-def _remove_links(clean_text, wp_to_id):
- # read the text char by char to get the right offsets for the interwiki links
- entities = []
- final_text = ""
- open_read = 0
- reading_text = True
- reading_entity = False
- reading_mention = False
- reading_special_case = False
- entity_buffer = ""
- mention_buffer = ""
- for index, letter in enumerate(clean_text):
- if letter == "[":
- open_read += 1
- elif letter == "]":
- open_read -= 1
- elif letter == "|":
- if reading_text:
- final_text += letter
- # switch from reading entity to mention in the [[entity|mention]] pattern
- elif reading_entity:
- reading_text = False
- reading_entity = False
- reading_mention = True
- else:
- reading_special_case = True
- else:
- if reading_entity:
- entity_buffer += letter
- elif reading_mention:
- mention_buffer += letter
- elif reading_text:
- final_text += letter
- else:
- raise ValueError("Not sure at point", clean_text[index - 2 : index + 2])
-
- if open_read > 2:
- reading_special_case = True
-
- if open_read == 2 and reading_text:
- reading_text = False
- reading_entity = True
- reading_mention = False
-
- # we just finished reading an entity
- if open_read == 0 and not reading_text:
- if "#" in entity_buffer or entity_buffer.startswith(":"):
- reading_special_case = True
- # Ignore cases with nested structures like File: handles etc
- if not reading_special_case:
- if not mention_buffer:
- mention_buffer = entity_buffer
- start = len(final_text)
- end = start + len(mention_buffer)
- qid = wp_to_id.get(entity_buffer, None)
- if qid:
- entities.append((mention_buffer, qid, start, end))
- final_text += mention_buffer
-
- entity_buffer = ""
- mention_buffer = ""
-
- reading_text = True
- reading_entity = False
- reading_mention = False
- reading_special_case = False
- return final_text, entities
-
-
-def _write_training_description(outputfile, qid, description):
- if description is not None:
- line = str(qid) + "|" + description + "\n"
- outputfile.write(line)
-
-
-def _write_training_entities(outputfile, article_id, clean_text, entities):
- entities_data = [
- {"alias": ent[0], "entity": ent[1], "start": ent[2], "end": ent[3]}
- for ent in entities
- ]
- line = (
- json.dumps(
- {
- "article_id": article_id,
- "clean_text": clean_text,
- "entities": entities_data,
- },
- ensure_ascii=False,
- )
- + "\n"
- )
- outputfile.write(line)
-
-
-def read_training_indices(entity_file_path):
- """ This method creates two lists of indices into the training file: one with indices for the
- training examples, and one for the dev examples."""
- train_indices = []
- dev_indices = []
-
- with entity_file_path.open("r", encoding="utf8") as file:
- for i, line in enumerate(file):
- example = json.loads(line)
- article_id = example["article_id"]
- clean_text = example["clean_text"]
-
- if is_valid_article(clean_text):
- if is_dev(article_id):
- dev_indices.append(i)
- else:
- train_indices.append(i)
-
- return train_indices, dev_indices
-
-
-def read_el_docs_golds(nlp, entity_file_path, dev, line_ids, kb, labels_discard=None):
- """ This method provides training/dev examples that correspond to the entity annotations found by the nlp object.
- For training, it will include both positive and negative examples by using the candidate generator from the kb.
- For testing (kb=None), it will include all positive examples only."""
- if not labels_discard:
- labels_discard = []
-
- texts = []
- entities_list = []
-
- with entity_file_path.open("r", encoding="utf8") as file:
- for i, line in enumerate(file):
- if i in line_ids:
- example = json.loads(line)
- article_id = example["article_id"]
- clean_text = example["clean_text"]
- entities = example["entities"]
-
- if dev != is_dev(article_id) or not is_valid_article(clean_text):
- continue
-
- texts.append(clean_text)
- entities_list.append(entities)
-
- docs = nlp.pipe(texts, batch_size=50)
-
- for doc, entities in zip(docs, entities_list):
- gold = _get_gold_parse(doc, entities, dev=dev, kb=kb, labels_discard=labels_discard)
- if gold and len(gold.links) > 0:
- yield doc, gold
-
-
-def _get_gold_parse(doc, entities, dev, kb, labels_discard):
- gold_entities = {}
- tagged_ent_positions = {
- (ent.start_char, ent.end_char): ent
- for ent in doc.ents
- if ent.label_ not in labels_discard
- }
-
- for entity in entities:
- entity_id = entity["entity"]
- alias = entity["alias"]
- start = entity["start"]
- end = entity["end"]
-
- candidate_ids = []
- if kb and not dev:
- candidates = kb.get_candidates(alias)
- candidate_ids = [cand.entity_ for cand in candidates]
-
- tagged_ent = tagged_ent_positions.get((start, end), None)
- if tagged_ent:
- # TODO: check that alias == doc.text[start:end]
- should_add_ent = (dev or entity_id in candidate_ids) and is_valid_sentence(
- tagged_ent.sent.text
- )
-
- if should_add_ent:
- value_by_id = {entity_id: 1.0}
- if not dev:
- random.shuffle(candidate_ids)
- value_by_id.update(
- {kb_id: 0.0 for kb_id in candidate_ids if kb_id != entity_id}
- )
- gold_entities[(start, end)] = value_by_id
-
- return GoldParse(doc, links=gold_entities)
-
-
-def is_dev(article_id):
- if not article_id:
- return False
- return article_id.endswith("3")
-
-
-def is_valid_article(doc_text):
- # custom length cut-off
- return 10 < len(doc_text) < 30000
-
-
-def is_valid_sentence(sent_text):
- if not 10 < len(sent_text) < 3000:
- # custom length cut-off
- return False
-
- if sent_text.strip().startswith("*") or sent_text.strip().startswith("#"):
- # remove 'enumeration' sentences (occurs often on Wikipedia)
- return False
-
- return True
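The prior-probability step above boils down to counting, for every alias seen in an interwiki link, how often it points at each entity title, and writing the counts out as a pipe-delimited file. A hedged, self-contained sketch of that bookkeeping (a simplified stand-in for `_store_alias` and the writer in `read_prior_probs`, not the deleted code itself):

```python
# in-memory structure built by _store_alias above: alias -> {entity title -> count}
map_alias_to_link = {}

def store_alias(alias, entity):
    alias_dict = map_alias_to_link.get(alias, {})
    alias_dict[entity] = alias_dict.get(entity, 0) + 1
    map_alias_to_link[alias] = alias_dict

store_alias("Adams", "Douglas Adams")
store_alias("Adams", "Douglas Adams")
store_alias("Adams", "John Adams")

# serialised as "alias|count|entity", most frequent entity first, which is the
# prior-probability file consumed later by the KB creation step
for alias, alias_dict in sorted(map_alias_to_link.items()):
    for entity, count in sorted(alias_dict.items(), key=lambda x: x[1], reverse=True):
        print(f"{alias}|{count}|{entity}")
```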
diff --git a/examples/training/pretrain_textcat.py b/examples/training/pretrain_textcat.py
index 0aefec9ef..5c41c0e92 100644
--- a/examples/training/pretrain_textcat.py
+++ b/examples/training/pretrain_textcat.py
@@ -129,10 +129,7 @@ def train_textcat(nlp, n_texts, n_iter=10):
)
train_data = list(zip(train_texts, [{"cats": cats} for cats in train_cats]))
- # get names of other pipes to disable them during training
- pipe_exceptions = ["textcat", "trf_wordpiecer", "trf_tok2vec"]
- other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
- with nlp.disable_pipes(*other_pipes): # only train textcat
+ with nlp.select_pipes(enable="textcat"): # only train textcat
optimizer = nlp.begin_training()
textcat.model.get_ref("tok2vec").from_bytes(tok2vec_weights)
print("Training the model...")
diff --git a/examples/training/rehearsal.py b/examples/training/rehearsal.py
index a0455c0a9..24fc67ebb 100644
--- a/examples/training/rehearsal.py
+++ b/examples/training/rehearsal.py
@@ -62,11 +62,8 @@ def main(model_name, unlabelled_loc):
optimizer.b1 = 0.0
optimizer.b2 = 0.0
- # get names of other pipes to disable them during training
- pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
- other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
sizes = compounding(1.0, 4.0, 1.001)
- with nlp.disable_pipes(*other_pipes):
+ with nlp.select_pipes(enable="ner"):
for itn in range(n_iter):
random.shuffle(TRAIN_DATA)
random.shuffle(raw_docs)
diff --git a/examples/training/textcat_example_data/textcatjsonl_to_trainjson.py b/examples/training/textcat_example_data/textcatjsonl_to_trainjson.py
index 339ce39be..66d96ff68 100644
--- a/examples/training/textcat_example_data/textcatjsonl_to_trainjson.py
+++ b/examples/training/textcat_example_data/textcatjsonl_to_trainjson.py
@@ -5,16 +5,17 @@ from spacy.gold import docs_to_json
import srsly
import sys
+
@plac.annotations(
model=("Model name. Defaults to 'en'.", "option", "m", str),
input_file=("Input file (jsonl)", "positional", None, Path),
output_dir=("Output directory", "positional", None, Path),
n_texts=("Number of texts to convert", "option", "t", int),
)
-def convert(model='en', input_file=None, output_dir=None, n_texts=0):
+def convert(model="en", input_file=None, output_dir=None, n_texts=0):
# Load model with tokenizer + sentencizer only
nlp = spacy.load(model)
- nlp.disable_pipes(*nlp.pipe_names)
+ nlp.select_pipes(disable=nlp.pipe_names)
sentencizer = nlp.create_pipe("sentencizer")
nlp.add_pipe(sentencizer, first=True)
@@ -49,5 +50,6 @@ def convert(model='en', input_file=None, output_dir=None, n_texts=0):
srsly.write_json(output_dir / input_file.with_suffix(".json"), [docs_to_json(docs)])
+
if __name__ == "__main__":
plac.call(convert)
diff --git a/examples/training/train_entity_linker.py b/examples/training/train_entity_linker.py
index 9776ad351..a22f255e7 100644
--- a/examples/training/train_entity_linker.py
+++ b/examples/training/train_entity_linker.py
@@ -97,7 +97,7 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
kb_ids = nlp.get_pipe("entity_linker").kb.get_entity_strings()
TRAIN_DOCS = []
for text, annotation in TRAIN_DATA:
- with nlp.disable_pipes("entity_linker"):
+ with nlp.select_pipes(disable="entity_linker"):
doc = nlp(text)
annotation_clean = annotation
for offset, kb_id_dict in annotation["links"].items():
@@ -112,10 +112,7 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
annotation_clean["links"][offset] = new_dict
TRAIN_DOCS.append((doc, annotation_clean))
- # get names of other pipes to disable them during training
- pipe_exceptions = ["entity_linker", "trf_wordpiecer", "trf_tok2vec"]
- other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
- with nlp.disable_pipes(*other_pipes): # only train entity linker
+ with nlp.select_pipes(enable="entity_linker"): # only train entity linker
# reset and initialize the weights randomly
optimizer = nlp.begin_training()
diff --git a/examples/training/train_intent_parser.py b/examples/training/train_intent_parser.py
index bfec23d09..c3d5a279b 100644
--- a/examples/training/train_intent_parser.py
+++ b/examples/training/train_intent_parser.py
@@ -124,9 +124,7 @@ def main(model=None, output_dir=None, n_iter=15):
for dep in annotations.get("deps", []):
parser.add_label(dep)
- pipe_exceptions = ["parser", "trf_wordpiecer", "trf_tok2vec"]
- other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
- with nlp.disable_pipes(*other_pipes): # only train parser
+ with nlp.select_pipes(enable="parser"): # only train parser
optimizer = nlp.begin_training()
for itn in range(n_iter):
random.shuffle(TRAIN_DATA)
diff --git a/examples/training/train_ner.py b/examples/training/train_ner.py
index d4e0bf794..f0f3affe7 100644
--- a/examples/training/train_ner.py
+++ b/examples/training/train_ner.py
@@ -55,10 +55,7 @@ def main(model=None, output_dir=None, n_iter=100):
print("Add label", ent[2])
ner.add_label(ent[2])
- # get names of other pipes to disable them during training
- pipe_exceptions = ["simple_ner"]
- other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
- with nlp.disable_pipes(*other_pipes): # only train NER
+ with nlp.select_pipes(enable="ner"): # only train NER
# reset and initialize the weights randomly – but only if we're
# training a new model
if model is None:
diff --git a/examples/training/train_new_entity_type.py b/examples/training/train_new_entity_type.py
index 47420e524..445c3fc27 100644
--- a/examples/training/train_new_entity_type.py
+++ b/examples/training/train_new_entity_type.py
@@ -94,10 +94,8 @@ def main(model=None, new_model_name="animal", output_dir=None, n_iter=30):
else:
optimizer = nlp.resume_training()
move_names = list(ner.move_names)
- # get names of other pipes to disable them during training
- pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
- other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
- with nlp.disable_pipes(*other_pipes): # only train NER
+
+ with nlp.select_pipes(enable="ner"): # only train NER
sizes = compounding(1.0, 4.0, 1.001)
# batch up the examples using spaCy's minibatch
for itn in range(n_iter):
diff --git a/examples/training/train_parser.py b/examples/training/train_parser.py
index 7bb3e8586..4f4409e31 100644
--- a/examples/training/train_parser.py
+++ b/examples/training/train_parser.py
@@ -64,10 +64,7 @@ def main(model=None, output_dir=None, n_iter=15):
for dep in annotations.get("deps", []):
parser.add_label(dep)
- # get names of other pipes to disable them during training
- pipe_exceptions = ["parser", "trf_wordpiecer", "trf_tok2vec"]
- other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
- with nlp.disable_pipes(*other_pipes): # only train parser
+ with nlp.select_pipes(enable="parser"): # only train parser
optimizer = nlp.begin_training()
for itn in range(n_iter):
random.shuffle(TRAIN_DATA)
diff --git a/examples/training/train_textcat.py b/examples/training/train_textcat.py
index dfb95b038..65acadb07 100644
--- a/examples/training/train_textcat.py
+++ b/examples/training/train_textcat.py
@@ -68,10 +68,7 @@ def main(config_path, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=Non
ex = Example.from_gold(gold, doc=doc)
train_examples.append(ex)
- # get names of other pipes to disable them during training
- pipe_exceptions = ["textcat", "trf_wordpiecer", "trf_tok2vec"]
- other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
- with nlp.disable_pipes(*other_pipes): # only train textcat
+ with nlp.select_pipes(enable="textcat"): # only train textcat
optimizer = nlp.begin_training()
if init_tok2vec is not None:
with init_tok2vec.open("rb") as file_:
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 5fa09da78..19e0a81e0 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -145,7 +145,7 @@ def train(
msg.text(f"Loading vectors from model '{vectors}'")
_load_vectors(nlp, vectors)
- nlp.disable_pipes([p for p in nlp.pipe_names if p not in pipeline])
+ nlp.select_pipes(disable=[p for p in nlp.pipe_names if p not in pipeline])
for pipe in pipeline:
# first, create the model.
# Bit of a hack after the refactor to get the vectors into a default config
@@ -201,8 +201,8 @@ def train(
exits=1,
)
msg.text(f"Extending component from base model '{pipe}'")
- disabled_pipes = nlp.disable_pipes(
- [p for p in nlp.pipe_names if p not in pipeline]
+ disabled_pipes = nlp.select_pipes(
+ disable=[p for p in nlp.pipe_names if p not in pipeline]
)
else:
msg.text(f"Starting with blank model '{lang}'")
diff --git a/spacy/errors.py b/spacy/errors.py
index 99a0081c0..7a7b44731 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -104,6 +104,8 @@ class Warnings(object):
"string \"Field1=Value1,Value2|Field2=Value3\".")
# TODO: fix numbering after merging develop into master
+ W096 = ("The method 'disable_pipes' has become deprecated - use 'select_pipes' "
+ "instead.")
W097 = ("No Model config was provided to create the '{name}' component, "
"and no default configuration could be found either.")
W098 = ("No Model config was provided to create the '{name}' component, "
@@ -132,7 +134,7 @@ class Errors(object):
E007 = ("'{name}' already exists in pipeline. Existing names: {opts}")
E008 = ("Some current components would be lost when restoring previous "
"pipeline state. If you added components after calling "
- "`nlp.disable_pipes()`, you should remove them explicitly with "
+ "`nlp.select_pipes()`, you should remove them explicitly with "
"`nlp.remove_pipe()` before the pipeline is restored. Names of "
"the new components: {names}")
E009 = ("The `update` method expects same number of docs and golds, but "
@@ -546,6 +548,13 @@ class Errors(object):
"token itself.")
# TODO: fix numbering after merging develop into master
+ E991 = ("The function 'select_pipes' should be called with either a "
+ "'disable' argument to list the names of the pipe components "
+ "that should be disabled, or with an 'enable' argument that "
+ "specifies which pipes should not be disabled.")
+ E992 = ("The function `select_pipes` was called with `enable`={enable} "
+ "and `disable`={disable} but that information is conflicting "
+ "for the `nlp` pipeline with components {names}.")
E993 = ("The config for 'nlp' should include either a key 'name' to "
"refer to an existing model by name or path, or a key 'lang' "
"to create a new blank model.")
diff --git a/spacy/language.py b/spacy/language.py
index a7db5ef20..5f617b1f6 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -511,11 +511,37 @@ class Language(object):
of the block. Otherwise, a DisabledPipes object is returned, that has
a `.restore()` method you can use to undo your changes.
- DOCS: https://spacy.io/api/language#disable_pipes
+ This method has been deprecated since 3.0
"""
+ warnings.warn(Warnings.W096, DeprecationWarning)
if len(names) == 1 and isinstance(names[0], (list, tuple)):
names = names[0] # support list of names instead of spread
- return DisabledPipes(self, *names)
+ return DisabledPipes(self, names)
+
+ def select_pipes(self, disable=None, enable=None):
+ """Disable one or more pipeline components. If used as a context
+ manager, the pipeline will be restored to the initial state at the end
+ of the block. Otherwise, a DisabledPipes object is returned, that has
+ a `.restore()` method you can use to undo your changes.
+
+ disable (str or iterable): The name(s) of the pipes to disable
+ enable (str or iterable): The name(s) of the pipes to enable - all others will be disabled
+
+ DOCS: https://spacy.io/api/language#select_pipes
+ """
+ if enable is None and disable is None:
+ raise ValueError(Errors.E991)
+ if disable is not None and isinstance(disable, str):
+ disable = [disable]
+ if enable is not None:
+ if isinstance(enable, str):
+ enable = [enable]
+ to_disable = [pipe for pipe in self.pipe_names if pipe not in enable]
+ # raise an error if the enable and disable keywords are not consistent
+ if disable is not None and disable != to_disable:
+ raise ValueError(Errors.E992.format(enable=enable, disable=disable, names=self.pipe_names))
+ disable = to_disable
+ return DisabledPipes(self, disable)
def make_doc(self, text):
return self.tokenizer(text)
@@ -1117,7 +1143,7 @@ def _fix_pretrained_vectors_name(nlp):
class DisabledPipes(list):
"""Manager for temporary pipeline disabling."""
- def __init__(self, nlp, *names):
+ def __init__(self, nlp, names):
self.nlp = nlp
self.names = names
# Important! Not deep copy -- we just want the container (but we also
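The new `select_pipes` above normalizes both keywords to a single list of names to disable and rejects inconsistent input. A minimal sketch of the intended call patterns against the API added in this patch, using dummy components rather than real models (mirroring the test file below):

```python
import spacy

nlp = spacy.blank("en")
for name in ("tagger", "parser", "ner"):
    nlp.add_pipe(lambda doc: doc, name=name)   # dummy components, enough to show the selection

# enable and disable are two views of the same selection
with nlp.select_pipes(enable="ner"):
    assert nlp.pipe_names == ["ner"]

with nlp.select_pipes(disable=["tagger", "parser"]):
    assert nlp.pipe_names == ["ner"]

# passing both is only accepted when they describe the same split (E992 otherwise);
# passing neither raises E991
disabled = nlp.select_pipes(enable="ner", disable=["tagger", "parser"])
disabled.restore()
```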
diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py
index 06c568ac9..58160c2e9 100644
--- a/spacy/pipeline/entityruler.py
+++ b/spacy/pipeline/entityruler.py
@@ -200,7 +200,7 @@ class EntityRuler(object):
]
except ValueError:
subsequent_pipes = []
- with self.nlp.disable_pipes(subsequent_pipes):
+ with self.nlp.select_pipes(disable=subsequent_pipes):
token_patterns = []
phrase_pattern_labels = []
phrase_pattern_texts = []
diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py
index e2fb02a2a..d42216655 100644
--- a/spacy/tests/pipeline/test_pipe_methods.py
+++ b/spacy/tests/pipeline/test_pipe_methods.py
@@ -88,7 +88,16 @@ def test_remove_pipe(nlp, name):
def test_disable_pipes_method(nlp, name):
nlp.add_pipe(new_pipe, name=name)
assert nlp.has_pipe(name)
- disabled = nlp.disable_pipes(name)
+ disabled = nlp.select_pipes(disable=name)
+ assert not nlp.has_pipe(name)
+ disabled.restore()
+
+
+@pytest.mark.parametrize("name", ["my_component"])
+def test_enable_pipes_method(nlp, name):
+ nlp.add_pipe(new_pipe, name=name)
+ assert nlp.has_pipe(name)
+ disabled = nlp.select_pipes(enable=[])
assert not nlp.has_pipe(name)
disabled.restore()
@@ -97,19 +106,57 @@ def test_disable_pipes_method(nlp, name):
def test_disable_pipes_context(nlp, name):
nlp.add_pipe(new_pipe, name=name)
assert nlp.has_pipe(name)
- with nlp.disable_pipes(name):
+ with nlp.select_pipes(disable=name):
assert not nlp.has_pipe(name)
assert nlp.has_pipe(name)
-def test_disable_pipes_list_arg(nlp):
+def test_select_pipes_list_arg(nlp):
for name in ["c1", "c2", "c3"]:
nlp.add_pipe(new_pipe, name=name)
assert nlp.has_pipe(name)
- with nlp.disable_pipes(["c1", "c2"]):
+ with nlp.select_pipes(disable=["c1", "c2"]):
assert not nlp.has_pipe("c1")
assert not nlp.has_pipe("c2")
assert nlp.has_pipe("c3")
+ with nlp.select_pipes(enable="c3"):
+ assert not nlp.has_pipe("c1")
+ assert not nlp.has_pipe("c2")
+ assert nlp.has_pipe("c3")
+ with nlp.select_pipes(enable=["c1", "c2"], disable="c3"):
+ assert nlp.has_pipe("c1")
+ assert nlp.has_pipe("c2")
+ assert not nlp.has_pipe("c3")
+ with nlp.select_pipes(enable=[]):
+ assert not nlp.has_pipe("c1")
+ assert not nlp.has_pipe("c2")
+ assert not nlp.has_pipe("c3")
+ with nlp.select_pipes(enable=["c1", "c2", "c3"], disable=[]):
+ assert nlp.has_pipe("c1")
+ assert nlp.has_pipe("c2")
+ assert nlp.has_pipe("c3")
+ with nlp.select_pipes(disable=["c1", "c2", "c3"], enable=[]):
+ assert not nlp.has_pipe("c1")
+ assert not nlp.has_pipe("c2")
+ assert not nlp.has_pipe("c3")
+
+
+def test_select_pipes_errors(nlp):
+ for name in ["c1", "c2", "c3"]:
+ nlp.add_pipe(new_pipe, name=name)
+ assert nlp.has_pipe(name)
+
+ with pytest.raises(ValueError):
+ nlp.select_pipes()
+
+ with pytest.raises(ValueError):
+ nlp.select_pipes(enable=["c1", "c2"], disable=["c1"])
+
+ with pytest.raises(ValueError):
+ nlp.select_pipes(enable=["c1", "c2"], disable=[])
+
+ with pytest.raises(ValueError):
+ nlp.select_pipes(enable=[], disable=["c3"])
@pytest.mark.parametrize("n_pipes", [100])
diff --git a/spacy/tests/regression/test_issue3611.py b/spacy/tests/regression/test_issue3611.py
index 120cea1d2..cab68793c 100644
--- a/spacy/tests/regression/test_issue3611.py
+++ b/spacy/tests/regression/test_issue3611.py
@@ -31,7 +31,7 @@ def test_issue3611():
nlp.add_pipe(textcat, last=True)
# training the network
- with nlp.disable_pipes([p for p in nlp.pipe_names if p != "textcat"]):
+ with nlp.select_pipes(enable="textcat"):
optimizer = nlp.begin_training(X=x_train, Y=y_train)
for i in range(3):
losses = {}
diff --git a/spacy/tests/regression/test_issue4030.py b/spacy/tests/regression/test_issue4030.py
index 7158d9b21..b641213ad 100644
--- a/spacy/tests/regression/test_issue4030.py
+++ b/spacy/tests/regression/test_issue4030.py
@@ -31,7 +31,7 @@ def test_issue4030():
nlp.add_pipe(textcat, last=True)
# training the network
- with nlp.disable_pipes([p for p in nlp.pipe_names if p != "textcat"]):
+ with nlp.select_pipes(enable="textcat"):
optimizer = nlp.begin_training()
for i in range(3):
losses = {}
diff --git a/website/docs/api/language.md b/website/docs/api/language.md
index d548a1f64..703a0f678 100644
--- a/website/docs/api/language.md
+++ b/website/docs/api/language.md
@@ -314,45 +314,47 @@ component function.
| `name` | unicode | Name of the component to remove. |
| **RETURNS** | tuple | A `(name, component)` tuple of the removed component. |
-## Language.disable_pipes {#disable_pipes tag="contextmanager, method" new="2"}
+## Language.select_pipes {#select_pipes tag="contextmanager, method" new="3"}
Disable one or more pipeline components. If used as a context manager, the
pipeline will be restored to the initial state at the end of the block.
Otherwise, a `DisabledPipes` object is returned, that has a `.restore()` method
you can use to undo your changes.
+You can specify either `disable` (as a list or string) or `enable`. In the
+latter case, all components not in the `enable` list will be disabled.
+
> #### Example
>
> ```python
-> # New API as of v2.2.2
-> with nlp.disable_pipes(["tagger", "parser"]):
+> # New API as of v3.0
+> with nlp.select_pipes(disable=["tagger", "parser"]):
> nlp.begin_training()
>
-> with nlp.disable_pipes("tagger", "parser"):
+> with nlp.select_pipes(enable="ner"):
> nlp.begin_training()
>
-> disabled = nlp.disable_pipes("tagger", "parser")
+> disabled = nlp.select_pipes(disable=["tagger", "parser"])
> nlp.begin_training()
> disabled.restore()
> ```
-| Name | Type | Description |
-| ----------------------------------------- | --------------- | ------------------------------------------------------------------------------------ |
-| `disabled` 2.2.2 | list | Names of pipeline components to disable. |
-| `*disabled` | unicode | Names of pipeline components to disable. |
-| **RETURNS** | `DisabledPipes` | The disabled pipes that can be restored by calling the object's `.restore()` method. |
+| Name | Type | Description |
+| ----------- | --------------- | ------------------------------------------------------------------------------------ |
+| `disable` | list | Names of pipeline components to disable. |
+| `disable` | unicode | Name of pipeline component to disable. |
+| `enable` | list | Names of pipeline components that will not be disabled. |
+| `enable` | unicode | Name of pipeline component that will not be disabled. |
+| **RETURNS** | `DisabledPipes` | The disabled pipes that can be restored by calling the object's `.restore()` method. |
-
-As of spaCy v2.2.2, the `Language.disable_pipes` method can also take a list of
-component names as its first argument (instead of a variable number of
-arguments). This is especially useful if you're generating the component names
-to disable programmatically. The new syntax will become the default in the
-future.
+
+
+As of spaCy v3.0, the `disable_pipes` method has been renamed to `select_pipes`:
```diff
-- disabled = nlp.disable_pipes("tagger", "parser")
-+ disabled = nlp.disable_pipes(["tagger", "parser"])
+- nlp.disable_pipes(["tagger", "parser"])
++ nlp.select_pipes(disable=["tagger", "parser"])
```
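As a small aside for readers migrating, `enable` is simply the complement of `disable`, which saves computing "all other pipes" by hand. A sketch assuming an `nlp` object whose pipeline contains `"tagger"`, `"parser"` and `"ner"`:

```python
# these two calls select exactly the same components to disable
disabled = nlp.select_pipes(disable=[p for p in nlp.pipe_names if p != "ner"])
disabled.restore()

disabled = nlp.select_pipes(enable="ner")
disabled.restore()
```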
diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md
index 7382f2b8c..696e11106 100644
--- a/website/docs/usage/processing-pipelines.md
+++ b/website/docs/usage/processing-pipelines.md
@@ -252,9 +252,9 @@ for doc in nlp.pipe(texts, disable=["tagger", "parser"]):
If you need to **execute more code** with components disabled – e.g. to reset
the weights or update only some components during training – you can use the
-[`nlp.disable_pipes`](/api/language#disable_pipes) contextmanager. At the end of
+[`nlp.select_pipes`](/api/language#select_pipes) contextmanager. At the end of
the `with` block, the disabled pipeline components will be restored
-automatically. Alternatively, `disable_pipes` returns an object that lets you
+automatically. Alternatively, `select_pipes` returns an object that lets you
call its `restore()` method to restore the disabled components when needed. This
can be useful if you want to prevent unnecessary code indentation of large
blocks.
@@ -262,16 +262,26 @@ blocks.
```python
### Disable for block
# 1. Use as a contextmanager
-with nlp.disable_pipes("tagger", "parser"):
+with nlp.select_pipes(disable=["tagger", "parser"]):
doc = nlp("I won't be tagged and parsed")
doc = nlp("I will be tagged and parsed")
# 2. Restore manually
-disabled = nlp.disable_pipes("ner")
+disabled = nlp.select_pipes(disable="ner")
doc = nlp("I won't have named entities")
disabled.restore()
```
+If you want to disable all pipes except for one or a few, you can use the `enable`
+keyword. Just like the `disable` keyword, it takes a list of pipe names, or a string
+defining just one pipe.
+```python
+# Enable only the parser
+with nlp.select_pipes(enable="parser"):
+ doc = nlp("I will only be parsed")
+```
+
+
Finally, you can also use the [`remove_pipe`](/api/language#remove_pipe) method
to remove pipeline components from an existing pipeline, the
[`rename_pipe`](/api/language#rename_pipe) method to rename them, or the
diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md
index 1db2405d1..5f47bd2e3 100644
--- a/website/docs/usage/rule-based-matching.md
+++ b/website/docs/usage/rule-based-matching.md
@@ -906,7 +906,7 @@ pipeline component, **make sure that the pipeline component runs** when you
create the pattern. For example, to match on `POS` or `LEMMA`, the pattern `Doc`
objects need to have part-of-speech tags set by the `tagger`. You can either
call the `nlp` object on your pattern texts instead of `nlp.make_doc`, or use
-[`nlp.disable_pipes`](/api/language#disable_pipes) to disable components
+[`nlp.select_pipes`](/api/language#select_pipes) to disable components
selectively.
@@ -1121,8 +1121,7 @@ while adding the phrase patterns.
entityruler = EntityRuler(nlp)
patterns = [{"label": "TEST", "pattern": str(i)} for i in range(100000)]
-other_pipes = [p for p in nlp.pipe_names if p != "tagger"]
-with nlp.disable_pipes(*other_pipes):
+with nlp.select_pipes(enable="tagger"):
entityruler.add_patterns(patterns)
```
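As the surrounding docs explain, attribute-based matching on `POS` or `LEMMA` needs pattern `Doc` objects that have actually been tagged. A hedged sketch of one way to do that with the new API, assuming a loaded English model bound to `nlp` (names and pattern texts are placeholders):

```python
from spacy.matcher import PhraseMatcher

matcher = PhraseMatcher(nlp.vocab, attr="LEMMA")

# process the pattern texts with only the tagger enabled, so lemmas are set
# without paying for the parser or NER
with nlp.select_pipes(enable="tagger"):
    patterns = list(nlp.pipe(["Barack Obama", "Angela Merkel"]))

matcher.add("NAMES", patterns)
```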
diff --git a/website/docs/usage/spacy-101.md b/website/docs/usage/spacy-101.md
index 479bdd264..39d732724 100644
--- a/website/docs/usage/spacy-101.md
+++ b/website/docs/usage/spacy-101.md
@@ -647,8 +647,7 @@ import random
nlp = spacy.load("en_core_web_sm")
train_data = [("Uber blew through $1 million", {"entities": [(0, 4, "ORG")]})]
-other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
-with nlp.disable_pipes(*other_pipes):
+with nlp.select_pipes(enable="ner"):
optimizer = nlp.begin_training()
for i in range(10):
random.shuffle(train_data)
diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md
index 479441edf..a10c60357 100644
--- a/website/docs/usage/training.md
+++ b/website/docs/usage/training.md
@@ -362,7 +362,7 @@ https://github.com/explosion/spaCy/tree/master/examples/training/train_ner.py
you're using a blank model, don't forget to add the entity recognizer to the
pipeline. If you're using an existing model, make sure to disable all other
pipeline components during training using
- [`nlp.disable_pipes`](/api/language#disable_pipes). This way, you'll only be
+ [`nlp.select_pipes`](/api/language#select_pipes). This way, you'll only be
training the entity recognizer.
2. **Shuffle and loop over** the examples. For each example, **update the
model** by calling [`nlp.update`](/api/language#update), which steps through
@@ -403,7 +403,7 @@ referred to as the "catastrophic forgetting" problem.
you're using a blank model, don't forget to add the entity recognizer to the
pipeline. If you're using an existing model, make sure to disable all other
pipeline components during training using
- [`nlp.disable_pipes`](/api/language#disable_pipes). This way, you'll only be
+ [`nlp.select_pipes`](/api/language#select_pipes). This way, you'll only be
training the entity recognizer.
2. **Add the new entity label** to the entity recognizer using the
[`add_label`](/api/entityrecognizer#add_label) method. You can access the
@@ -436,7 +436,7 @@ https://github.com/explosion/spaCy/tree/master/examples/training/train_parser.py
you're using a blank model, don't forget to add the parser to the pipeline.
If you're using an existing model, make sure to disable all other pipeline
components during training using
- [`nlp.disable_pipes`](/api/language#disable_pipes). This way, you'll only be
+ [`nlp.select_pipes`](/api/language#select_pipes). This way, you'll only be
training the parser.
2. **Add the dependency labels** to the parser using the
[`add_label`](/api/dependencyparser#add_label) method. If you're starting off
@@ -470,7 +470,7 @@ https://github.com/explosion/spaCy/tree/master/examples/training/train_tagger.py
you're using a blank model, don't forget to add the tagger to the pipeline.
If you're using an existing model, make sure to disable all other pipeline
components during training using
- [`nlp.disable_pipes`](/api/language#disable_pipes). This way, you'll only be
+ [`nlp.select_pipes`](/api/language#select_pipes). This way, you'll only be
training the tagger.
2. **Add the tag map** to the tagger using the
[`add_label`](/api/tagger#add_label) method. The first argument is the new
@@ -544,7 +544,7 @@ https://github.com/explosion/spaCy/tree/master/examples/training/train_intent_pa
you're using a blank model, don't forget to add the custom parser to the
pipeline. If you're using an existing model, make sure to **remove the old
parser** from the pipeline, and disable all other pipeline components during
- training using [`nlp.disable_pipes`](/api/language#disable_pipes). This way,
+ training using [`nlp.select_pipes`](/api/language#select_pipes). This way,
you'll only be training the parser.
3. **Add the dependency labels** to the parser using the
[`add_label`](/api/dependencyparser#add_label) method.
@@ -576,7 +576,7 @@ https://github.com/explosion/spaCy/tree/master/examples/training/train_textcat.p
[`spacy.blank`](/api/top-level#spacy.blank) with the ID of your language. If
you're using an existing model, make sure to disable all other pipeline
components during training using
- [`nlp.disable_pipes`](/api/language#disable_pipes). This way, you'll only be
+ [`nlp.select_pipes`](/api/language#select_pipes). This way, you'll only be
training the text classifier.
2. **Add the text classifier** to the pipeline, and add the labels you want to
train – for example, `POSITIVE`.
@@ -653,7 +653,7 @@ https://github.com/explosion/spaCy/tree/master/examples/training/train_entity_li
pipeline including also a component for
[named entity recognition](/usage/training#ner). If you're using a model with
additional components, make sure to disable all other pipeline components
- during training using [`nlp.disable_pipes`](/api/language#disable_pipes).
+ during training using [`nlp.select_pipes`](/api/language#select_pipes).
This way, you'll only be training the entity linker.
2. **Shuffle and loop over** the examples. For each example, **update the
model** by calling [`nlp.update`](/api/language#update), which steps through
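Taken together, the doc updates in this patch move the training examples from `nlp.disable_pipes` to the new `nlp.select_pipes` API. A minimal usage sketch, assuming the small English model (`en_core_web_sm`) is installed; both the `enable` and `disable` forms shown here come from the updated docs and the E991 error text:

```python
import spacy

nlp = spacy.load("en_core_web_sm")  # assumes the small English model is installed

# Keep only the component you are training enabled...
with nlp.select_pipes(enable="ner"):
    optimizer = nlp.begin_training()

# ...or, equivalently, name the components to disable.
with nlp.select_pipes(disable=["tagger", "parser"]):
    doc = nlp("Uber blew through $1 million")
```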
From f00de445dd04b61bc55a0fe010c9cd3862d38aef Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem
Date: Tue, 19 May 2020 16:20:03 +0200
Subject: [PATCH 138/496] default models defined in component decorator (#5452)
* move defaults to pipeline and use in component decorator
* black formatting
* relative import
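
A minimal, self-contained sketch (not the actual spaCy source) of the pattern this patch introduces: the default model factory is declared on the `@component` decorator itself, and the generated factory falls back to it when no model is supplied:

```python
import warnings


class component:
    """Toy version of the decorator: it records a default_model callable and
    registers a factory that falls back to it when model is None."""

    factories = {}

    def __init__(self, name, default_model=lambda: None):
        self.name = name
        self.default_model = default_model

    def __call__(self, cls):
        def factory(nlp, model=None, **cfg):
            if model is None:
                # No model config was given: use the component's declared default.
                model = self.default_model()
                warnings.warn(f"No model supplied for '{self.name}'; using the default.")
            return cls(model, **cfg)

        component.factories[self.name] = factory
        return cls


@component("toy_tagger", default_model=lambda: {"arch": "toy"})
class ToyTagger:
    def __init__(self, model, **cfg):
        self.model = model


tagger = component.factories["toy_tagger"](nlp=None)
print(tagger.model)  # {'arch': 'toy'}
```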
---
spacy/language.py | 54 +++++++------------
.../models => pipeline}/defaults/__init__.py | 2 +-
.../defaults/entity_linker_defaults.cfg | 0
.../defaults/morphologizer_defaults.cfg | 0
.../defaults/ner_defaults.cfg | 0
.../defaults/parser_defaults.cfg | 0
.../defaults/senter_defaults.cfg | 0
.../defaults/simple_ner_defaults.cfg | 0
.../defaults/tagger_defaults.cfg | 0
.../defaults/tensorizer_defaults.cfg | 0
.../defaults/textcat_bow_defaults.cfg | 0
.../defaults/textcat_cnn_defaults.cfg | 0
.../defaults/textcat_defaults.cfg | 0
.../defaults/tok2vec_defaults.cfg | 0
spacy/pipeline/morphologizer.pyx | 3 +-
spacy/pipeline/pipes.pyx | 29 +++++++---
spacy/pipeline/simple_ner.py | 4 +-
spacy/pipeline/tok2vec.py | 3 +-
spacy/tests/doc/test_add_entities.py | 2 +-
spacy/tests/parser/test_add_label.py | 2 +-
spacy/tests/parser/test_arc_eager_oracle.py | 2 +-
spacy/tests/parser/test_ner.py | 2 +-
spacy/tests/parser/test_neural_parser.py | 2 +-
spacy/tests/parser/test_nn_beam.py | 2 +-
spacy/tests/parser/test_preset_sbd.py | 2 +-
spacy/tests/pipeline/test_textcat.py | 2 +-
spacy/tests/regression/test_issue1501-2000.py | 2 +-
spacy/tests/regression/test_issue3001-3500.py | 2 +-
spacy/tests/regression/test_issue3830.py | 2 +-
spacy/tests/regression/test_issue4042.py | 2 +-
spacy/tests/regression/test_issue4313.py | 2 +-
.../serialize/test_serialize_pipeline.py | 4 +-
32 files changed, 64 insertions(+), 61 deletions(-)
rename spacy/{ml/models => pipeline}/defaults/__init__.py (99%)
rename spacy/{ml/models => pipeline}/defaults/entity_linker_defaults.cfg (100%)
rename spacy/{ml/models => pipeline}/defaults/morphologizer_defaults.cfg (100%)
rename spacy/{ml/models => pipeline}/defaults/ner_defaults.cfg (100%)
rename spacy/{ml/models => pipeline}/defaults/parser_defaults.cfg (100%)
rename spacy/{ml/models => pipeline}/defaults/senter_defaults.cfg (100%)
rename spacy/{ml/models => pipeline}/defaults/simple_ner_defaults.cfg (100%)
rename spacy/{ml/models => pipeline}/defaults/tagger_defaults.cfg (100%)
rename spacy/{ml/models => pipeline}/defaults/tensorizer_defaults.cfg (100%)
rename spacy/{ml/models => pipeline}/defaults/textcat_bow_defaults.cfg (100%)
rename spacy/{ml/models => pipeline}/defaults/textcat_cnn_defaults.cfg (100%)
rename spacy/{ml/models => pipeline}/defaults/textcat_defaults.cfg (100%)
rename spacy/{ml/models => pipeline}/defaults/tok2vec_defaults.cfg (100%)
diff --git a/spacy/language.py b/spacy/language.py
index 5f617b1f6..2b8fa129e 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -184,33 +184,6 @@ class Language(object):
self.max_length = max_length
self._optimizer = None
- # TODO: de-uglify (incorporating into component decorator didn't work because of circular imports)
- from .ml.models.defaults import (
- default_tagger_config,
- default_parser_config,
- default_ner_config,
- default_textcat_config,
- default_nel_config,
- default_morphologizer_config,
- default_senter_config,
- default_tensorizer_config,
- default_tok2vec_config,
- default_simple_ner_config
- )
-
- self.defaults = {
- "tagger": default_tagger_config(),
- "parser": default_parser_config(),
- "ner": default_ner_config(),
- "textcat": default_textcat_config(),
- "entity_linker": default_nel_config(),
- "morphologizer": default_morphologizer_config(),
- "senter": default_senter_config(),
- "simple_ner": default_simple_ner_config(),
- "tensorizer": default_tensorizer_config(),
- "tok2vec": default_tok2vec_config(),
- }
-
@property
def path(self):
return self._path
@@ -338,7 +311,6 @@ class Language(object):
else:
raise KeyError(Errors.E002.format(name=name))
factory = self.factories[name]
- default_config = self.defaults.get(name, None)
# transform the model's config to an actual Model
factory_cfg = dict(config)
@@ -349,11 +321,6 @@ class Language(object):
warnings.warn(Warnings.W099.format(type=type(model_cfg), pipe=name))
model_cfg = None
del factory_cfg["model"]
- if model_cfg is None and default_config is not None:
- warnings.warn(Warnings.W098.format(name=name))
- model_cfg = default_config["model"]
- if model_cfg is None:
- warnings.warn(Warnings.W097.format(name=name))
model = None
if model_cfg is not None:
self.config[name] = {"model": model_cfg}
@@ -539,7 +506,11 @@ class Language(object):
to_disable = [pipe for pipe in self.pipe_names if pipe not in enable]
# raise an error if the enable and disable keywords are not consistent
if disable is not None and disable != to_disable:
- raise ValueError(Errors.E992.format(enable=enable, disable=disable, names=self.pipe_names))
+ raise ValueError(
+ Errors.E992.format(
+ enable=enable, disable=disable, names=self.pipe_names
+ )
+ )
disable = to_disable
return DisabledPipes(self, disable)
@@ -1085,7 +1056,14 @@ class component(object):
# NB: This decorator needs to live here, because it needs to write to
# Language.factories. All other solutions would cause circular import.
- def __init__(self, name=None, assigns=tuple(), requires=tuple(), retokenizes=False):
+ def __init__(
+ self,
+ name=None,
+ assigns=tuple(),
+ requires=tuple(),
+ retokenizes=False,
+ default_model=lambda: None,
+ ):
"""Decorate a pipeline component.
name (unicode): Default component and factory name.
@@ -1097,6 +1075,7 @@ class component(object):
self.assigns = validate_attrs(assigns)
self.requires = validate_attrs(requires)
self.retokenizes = retokenizes
+ self.default_model = default_model
def __call__(self, *args, **kwargs):
obj = args[0]
@@ -1109,6 +1088,11 @@ class component(object):
obj.retokenizes = self.retokenizes
def factory(nlp, model, **cfg):
+ if model is None:
+ model = self.default_model()
+ warnings.warn(Warnings.W098.format(name=self.name))
+ if model is None:
+ warnings.warn(Warnings.W097.format(name=self.name))
if hasattr(obj, "from_nlp"):
return obj.from_nlp(nlp, model, **cfg)
elif isinstance(obj, type):
diff --git a/spacy/ml/models/defaults/__init__.py b/spacy/pipeline/defaults/__init__.py
similarity index 99%
rename from spacy/ml/models/defaults/__init__.py
rename to spacy/pipeline/defaults/__init__.py
index 850d9fce0..e17e2d3b4 100644
--- a/spacy/ml/models/defaults/__init__.py
+++ b/spacy/pipeline/defaults/__init__.py
@@ -1,6 +1,6 @@
from pathlib import Path
-from .... import util
+from ... import util
def default_nel_config():
diff --git a/spacy/ml/models/defaults/entity_linker_defaults.cfg b/spacy/pipeline/defaults/entity_linker_defaults.cfg
similarity index 100%
rename from spacy/ml/models/defaults/entity_linker_defaults.cfg
rename to spacy/pipeline/defaults/entity_linker_defaults.cfg
diff --git a/spacy/ml/models/defaults/morphologizer_defaults.cfg b/spacy/pipeline/defaults/morphologizer_defaults.cfg
similarity index 100%
rename from spacy/ml/models/defaults/morphologizer_defaults.cfg
rename to spacy/pipeline/defaults/morphologizer_defaults.cfg
diff --git a/spacy/ml/models/defaults/ner_defaults.cfg b/spacy/pipeline/defaults/ner_defaults.cfg
similarity index 100%
rename from spacy/ml/models/defaults/ner_defaults.cfg
rename to spacy/pipeline/defaults/ner_defaults.cfg
diff --git a/spacy/ml/models/defaults/parser_defaults.cfg b/spacy/pipeline/defaults/parser_defaults.cfg
similarity index 100%
rename from spacy/ml/models/defaults/parser_defaults.cfg
rename to spacy/pipeline/defaults/parser_defaults.cfg
diff --git a/spacy/ml/models/defaults/senter_defaults.cfg b/spacy/pipeline/defaults/senter_defaults.cfg
similarity index 100%
rename from spacy/ml/models/defaults/senter_defaults.cfg
rename to spacy/pipeline/defaults/senter_defaults.cfg
diff --git a/spacy/ml/models/defaults/simple_ner_defaults.cfg b/spacy/pipeline/defaults/simple_ner_defaults.cfg
similarity index 100%
rename from spacy/ml/models/defaults/simple_ner_defaults.cfg
rename to spacy/pipeline/defaults/simple_ner_defaults.cfg
diff --git a/spacy/ml/models/defaults/tagger_defaults.cfg b/spacy/pipeline/defaults/tagger_defaults.cfg
similarity index 100%
rename from spacy/ml/models/defaults/tagger_defaults.cfg
rename to spacy/pipeline/defaults/tagger_defaults.cfg
diff --git a/spacy/ml/models/defaults/tensorizer_defaults.cfg b/spacy/pipeline/defaults/tensorizer_defaults.cfg
similarity index 100%
rename from spacy/ml/models/defaults/tensorizer_defaults.cfg
rename to spacy/pipeline/defaults/tensorizer_defaults.cfg
diff --git a/spacy/ml/models/defaults/textcat_bow_defaults.cfg b/spacy/pipeline/defaults/textcat_bow_defaults.cfg
similarity index 100%
rename from spacy/ml/models/defaults/textcat_bow_defaults.cfg
rename to spacy/pipeline/defaults/textcat_bow_defaults.cfg
diff --git a/spacy/ml/models/defaults/textcat_cnn_defaults.cfg b/spacy/pipeline/defaults/textcat_cnn_defaults.cfg
similarity index 100%
rename from spacy/ml/models/defaults/textcat_cnn_defaults.cfg
rename to spacy/pipeline/defaults/textcat_cnn_defaults.cfg
diff --git a/spacy/ml/models/defaults/textcat_defaults.cfg b/spacy/pipeline/defaults/textcat_defaults.cfg
similarity index 100%
rename from spacy/ml/models/defaults/textcat_defaults.cfg
rename to spacy/pipeline/defaults/textcat_defaults.cfg
diff --git a/spacy/ml/models/defaults/tok2vec_defaults.cfg b/spacy/pipeline/defaults/tok2vec_defaults.cfg
similarity index 100%
rename from spacy/ml/models/defaults/tok2vec_defaults.cfg
rename to spacy/pipeline/defaults/tok2vec_defaults.cfg
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index 7a2bc3b17..c45a72b25 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -17,9 +17,10 @@ from ..util import link_vectors_to_models, create_default_optimizer
from ..errors import Errors, TempErrors
from .pipes import Tagger, _load_cfg
from .. import util
+from .defaults import default_morphologizer
-@component("morphologizer", assigns=["token.morph", "token.pos"])
+@component("morphologizer", assigns=["token.morph", "token.pos"], default_model=default_morphologizer)
class Morphologizer(Tagger):
def __init__(self, vocab, model, **cfg):
diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index 61db11baa..4ff956e1d 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -2,6 +2,7 @@
import numpy
import srsly
import random
+
from thinc.api import CosineDistance, to_categorical, get_array_module
from thinc.api import set_dropout_rate, SequenceCategoricalCrossentropy
import warnings
@@ -13,6 +14,8 @@ from ..syntax.arc_eager cimport ArcEager
from ..morphology cimport Morphology
from ..vocab cimport Vocab
+from .defaults import default_tagger, default_parser, default_ner, default_textcat
+from .defaults import default_nel, default_senter, default_tensorizer
from .functions import merge_subtokens
from ..language import Language, component
from ..syntax import nonproj
@@ -234,7 +237,7 @@ class Pipe(object):
return self
-@component("tensorizer", assigns=["doc.tensor"])
+@component("tensorizer", assigns=["doc.tensor"], default_model=default_tensorizer)
class Tensorizer(Pipe):
"""Pre-train position-sensitive vectors for tokens."""
@@ -366,7 +369,7 @@ class Tensorizer(Pipe):
return sgd
-@component("tagger", assigns=["token.tag", "token.pos", "token.lemma"])
+@component("tagger", assigns=["token.tag", "token.pos", "token.lemma"], default_model=default_tagger)
class Tagger(Pipe):
"""Pipeline component for part-of-speech tagging.
@@ -636,7 +639,7 @@ class Tagger(Pipe):
return self
-@component("senter", assigns=["token.is_sent_start"])
+@component("senter", assigns=["token.is_sent_start"], default_model=default_senter)
class SentenceRecognizer(Tagger):
"""Pipeline component for sentence segmentation.
@@ -976,7 +979,7 @@ class ClozeMultitask(Pipe):
losses[self.name] += loss
-@component("textcat", assigns=["doc.cats"])
+@component("textcat", assigns=["doc.cats"], default_model=default_textcat)
class TextCategorizer(Pipe):
"""Pipeline component for text classification.
@@ -1227,7 +1230,8 @@ cdef class EntityRecognizer(Parser):
@component(
"entity_linker",
requires=["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"],
- assigns=["token.ent_kb_id"]
+ assigns=["token.ent_kb_id"],
+ default_model=default_nel,
)
class EntityLinker(Pipe):
"""Pipeline component for named entity linking.
@@ -1673,8 +1677,19 @@ class Sentencizer(Pipe):
# Cython classes can't be decorated, so we need to add the factories here
-Language.factories["parser"] = lambda nlp, model, **cfg: DependencyParser.from_nlp(nlp, model, **cfg)
-Language.factories["ner"] = lambda nlp, model, **cfg: EntityRecognizer.from_nlp(nlp, model, **cfg)
+Language.factories["parser"] = lambda nlp, model, **cfg: parser_factory(nlp, model, **cfg)
+Language.factories["ner"] = lambda nlp, model, **cfg: ner_factory(nlp, model, **cfg)
+def parser_factory(nlp, model, **cfg):
+ if model is None:
+ model = default_parser()
+ warnings.warn(Warnings.W098.format(name="parser"))
+ return DependencyParser.from_nlp(nlp, model, **cfg)
+
+def ner_factory(nlp, model, **cfg):
+ if model is None:
+ model = default_ner()
+ warnings.warn(Warnings.W098.format(name="ner"))
+ return EntityRecognizer.from_nlp(nlp, model, **cfg)
__all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "Tensorizer", "TextCategorizer", "EntityLinker", "Sentencizer", "SentenceRecognizer"]
diff --git a/spacy/pipeline/simple_ner.py b/spacy/pipeline/simple_ner.py
index 8d53152d8..c674046af 100644
--- a/spacy/pipeline/simple_ner.py
+++ b/spacy/pipeline/simple_ner.py
@@ -2,6 +2,8 @@ from typing import List
from thinc.types import Floats2d
from thinc.api import SequenceCategoricalCrossentropy, set_dropout_rate
from thinc.util import to_numpy
+
+from .defaults import default_simple_ner
from ..gold import Example, spans_from_biluo_tags, iob_to_biluo, biluo_to_iob
from ..tokens import Doc
from ..language import component
@@ -9,7 +11,7 @@ from ..util import link_vectors_to_models
from .pipes import Pipe
-@component("simple_ner", assigns=["doc.ents"])
+@component("simple_ner", assigns=["doc.ents"], default_model=default_simple_ner)
class SimpleNER(Pipe):
"""Named entity recognition with a tagging model. The model should include
validity constraints to ensure that only valid tag sequences are returned."""
diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py
index 83a4454e3..5882fa266 100644
--- a/spacy/pipeline/tok2vec.py
+++ b/spacy/pipeline/tok2vec.py
@@ -6,9 +6,10 @@ from ..tokens import Doc
from ..vocab import Vocab
from ..language import component
from ..util import link_vectors_to_models, minibatch, eg2doc
+from .defaults import default_tok2vec
-@component("tok2vec", assigns=["doc.tensor"])
+@component("tok2vec", assigns=["doc.tensor"], default_model=default_tok2vec)
class Tok2Vec(Pipe):
@classmethod
def from_nlp(cls, nlp, model, **cfg):
diff --git a/spacy/tests/doc/test_add_entities.py b/spacy/tests/doc/test_add_entities.py
index 3a466b24c..c92fc1ff9 100644
--- a/spacy/tests/doc/test_add_entities.py
+++ b/spacy/tests/doc/test_add_entities.py
@@ -3,7 +3,7 @@ from spacy.tokens import Span
import pytest
from ..util import get_doc
-from ...ml.models.defaults import default_ner
+from spacy.pipeline.defaults import default_ner
def test_doc_add_entities_set_ents_iob(en_vocab):
diff --git a/spacy/tests/parser/test_add_label.py b/spacy/tests/parser/test_add_label.py
index 39682ba3d..ee1bba886 100644
--- a/spacy/tests/parser/test_add_label.py
+++ b/spacy/tests/parser/test_add_label.py
@@ -4,7 +4,7 @@ from spacy.attrs import NORM
from spacy.gold import GoldParse
from spacy.vocab import Vocab
-from spacy.ml.models.defaults import default_parser, default_ner
+from spacy.pipeline.defaults import default_parser, default_ner
from spacy.tokens import Doc
from spacy.pipeline import DependencyParser, EntityRecognizer
from spacy.util import fix_random_seed
diff --git a/spacy/tests/parser/test_arc_eager_oracle.py b/spacy/tests/parser/test_arc_eager_oracle.py
index 2426805d2..30b4a6f6d 100644
--- a/spacy/tests/parser/test_arc_eager_oracle.py
+++ b/spacy/tests/parser/test_arc_eager_oracle.py
@@ -1,7 +1,7 @@
import pytest
from spacy.vocab import Vocab
-from spacy.ml.models.defaults import default_parser
+from spacy.pipeline.defaults import default_parser
from spacy.pipeline import DependencyParser
from spacy.tokens import Doc
from spacy.gold import GoldParse
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index 2fd87ead3..9656d3a10 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -2,7 +2,7 @@ import pytest
from spacy import util
from spacy.lang.en import English
-from spacy.ml.models.defaults import default_ner
+from spacy.pipeline.defaults import default_ner
from spacy.pipeline import EntityRecognizer, EntityRuler
from spacy.vocab import Vocab
diff --git a/spacy/tests/parser/test_neural_parser.py b/spacy/tests/parser/test_neural_parser.py
index c985cf87a..b648e9a00 100644
--- a/spacy/tests/parser/test_neural_parser.py
+++ b/spacy/tests/parser/test_neural_parser.py
@@ -1,5 +1,5 @@
import pytest
-from spacy.ml.models.defaults import default_parser, default_tok2vec
+from spacy.pipeline.defaults import default_parser, default_tok2vec
from spacy.vocab import Vocab
from spacy.syntax.arc_eager import ArcEager
from spacy.syntax.nn_parser import Parser
diff --git a/spacy/tests/parser/test_nn_beam.py b/spacy/tests/parser/test_nn_beam.py
index 619e0cc0b..db9eb5e6f 100644
--- a/spacy/tests/parser/test_nn_beam.py
+++ b/spacy/tests/parser/test_nn_beam.py
@@ -2,7 +2,7 @@ import pytest
import numpy
from spacy.vocab import Vocab
from spacy.language import Language
-from spacy.ml.models.defaults import default_parser
+from spacy.pipeline.defaults import default_parser
from spacy.pipeline import DependencyParser
from spacy.syntax.arc_eager import ArcEager
from spacy.tokens import Doc
diff --git a/spacy/tests/parser/test_preset_sbd.py b/spacy/tests/parser/test_preset_sbd.py
index af777aa6b..dc13fcdf1 100644
--- a/spacy/tests/parser/test_preset_sbd.py
+++ b/spacy/tests/parser/test_preset_sbd.py
@@ -4,7 +4,7 @@ from spacy.attrs import NORM
from spacy.gold import GoldParse
from spacy.vocab import Vocab
-from spacy.ml.models.defaults import default_parser
+from spacy.pipeline.defaults import default_parser
from spacy.tokens import Doc
from spacy.pipeline import DependencyParser
diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
index b091ec0de..725a4fd69 100644
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@@ -11,7 +11,7 @@ from spacy.gold import GoldParse
from spacy.util import fix_random_seed
from ..util import make_tempdir
-from ...ml.models.defaults import default_tok2vec
+from spacy.pipeline.defaults import default_tok2vec
TRAIN_DATA = [
("I'm so happy.", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}),
diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py
index 5f5f0c9eb..5a76697bc 100644
--- a/spacy/tests/regression/test_issue1501-2000.py
+++ b/spacy/tests/regression/test_issue1501-2000.py
@@ -10,7 +10,7 @@ from spacy.lang.lex_attrs import is_stop
from spacy.vectors import Vectors
from spacy.vocab import Vocab
from spacy.language import Language
-from spacy.ml.models.defaults import default_ner, default_tagger
+from spacy.pipeline.defaults import default_ner, default_tagger
from spacy.tokens import Doc, Span, Token
from spacy.pipeline import Tagger, EntityRecognizer
from spacy.attrs import HEAD, DEP
diff --git a/spacy/tests/regression/test_issue3001-3500.py b/spacy/tests/regression/test_issue3001-3500.py
index 06ba6c4ac..240163d6e 100644
--- a/spacy/tests/regression/test_issue3001-3500.py
+++ b/spacy/tests/regression/test_issue3001-3500.py
@@ -1,7 +1,7 @@
import pytest
from spacy.lang.en import English
from spacy.lang.de import German
-from spacy.ml.models.defaults import default_ner
+from spacy.pipeline.defaults import default_ner
from spacy.pipeline import EntityRuler, EntityRecognizer
from spacy.matcher import Matcher, PhraseMatcher
from spacy.tokens import Doc
diff --git a/spacy/tests/regression/test_issue3830.py b/spacy/tests/regression/test_issue3830.py
index 9752f70df..3d8e80847 100644
--- a/spacy/tests/regression/test_issue3830.py
+++ b/spacy/tests/regression/test_issue3830.py
@@ -1,7 +1,7 @@
from spacy.pipeline.pipes import DependencyParser
from spacy.vocab import Vocab
-from spacy.ml.models.defaults import default_parser
+from spacy.pipeline.defaults import default_parser
def test_issue3830_no_subtok():
diff --git a/spacy/tests/regression/test_issue4042.py b/spacy/tests/regression/test_issue4042.py
index 75a1c23b7..30081543b 100644
--- a/spacy/tests/regression/test_issue4042.py
+++ b/spacy/tests/regression/test_issue4042.py
@@ -3,7 +3,7 @@ from spacy.pipeline import EntityRecognizer, EntityRuler
from spacy.lang.en import English
from spacy.tokens import Span
from spacy.util import ensure_path
-from spacy.ml.models.defaults import default_ner
+from spacy.pipeline.defaults import default_ner
from ..util import make_tempdir
diff --git a/spacy/tests/regression/test_issue4313.py b/spacy/tests/regression/test_issue4313.py
index 30688601f..ba4d2deab 100644
--- a/spacy/tests/regression/test_issue4313.py
+++ b/spacy/tests/regression/test_issue4313.py
@@ -1,6 +1,6 @@
from collections import defaultdict
-from spacy.ml.models.defaults import default_ner
+from spacy.pipeline.defaults import default_ner
from spacy.pipeline import EntityRecognizer
from spacy.lang.en import English
diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py
index 475181c7b..4fc277c4f 100644
--- a/spacy/tests/serialize/test_serialize_pipeline.py
+++ b/spacy/tests/serialize/test_serialize_pipeline.py
@@ -1,8 +1,8 @@
import pytest
from spacy.pipeline import Tagger, DependencyParser, EntityRecognizer
from spacy.pipeline import Tensorizer, TextCategorizer, SentenceRecognizer
-from spacy.ml.models.defaults import default_parser, default_tensorizer, default_tagger
-from spacy.ml.models.defaults import default_textcat, default_senter
+from spacy.pipeline.defaults import default_parser, default_tensorizer, default_tagger
+from spacy.pipeline.defaults import default_textcat, default_senter
from ..util import make_tempdir
From a2830c3ef52167e7e99cb44d5ebd21a75e461146 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Tue, 19 May 2020 16:23:11 +0200
Subject: [PATCH 139/496] Use thinc 8.0.0a9
---
requirements.txt | 2 +-
setup.cfg | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/requirements.txt b/requirements.txt
index 08b4c228a..e5f1ae10b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
# Our libraries
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
-thinc==8.0.0a8
+thinc==8.0.0a9
blis>=0.4.0,<0.5.0
ml_datasets>=0.1.1
murmurhash>=0.28.0,<1.1.0
diff --git a/setup.cfg b/setup.cfg
index 9fe02018b..df1658fd0 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -42,7 +42,7 @@ install_requires =
murmurhash>=0.28.0,<1.1.0
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
- thinc==8.0.0a8
+ thinc==8.0.0a9
blis>=0.4.0,<0.5.0
wasabi>=0.4.0,<1.1.0
srsly>=2.0.0,<3.0.0
From 664a3603b0313b650b3b43e2897f381f1e3598df Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Tue, 19 May 2020 17:15:39 +0200
Subject: [PATCH 140/496] Set version to v3.0.0.dev8
---
spacy/about.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/spacy/about.py b/spacy/about.py
index 3f87c8dbc..3af1b77a0 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
# fmt: off
__title__ = "spacy"
-__version__ = "3.0.0.dev7"
+__version__ = "3.0.0.dev8"
__release__ = True
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
From 7f5715a08159c06c249c3efe4d8934df2c98544d Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem
Date: Wed, 20 May 2020 11:41:12 +0200
Subject: [PATCH 141/496] Various fixes to NEL functionality, Example class etc
(#5460)
* setting KB in the EL constructor, similar to how the model is passed on
* removing wikipedia example files - moved to projects
* throw an error when nlp.update is called with 2 positional arguments
* rewriting the config logic in create_pipe to accommodate other objects (e.g. KB) in the config
* update config files with new parameters
* avoid training pipeline components that don't have a model (like sentencizer)
* various small fixes + UX improvements
* small fixes
* set thinc to 8.0.0a9 everywhere
* remove outdated comment
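
A short usage sketch of the new entity linker wiring, pieced together from the example script and tests in this patch (the KB contents here are toy values):

```python
import spacy
from spacy.kb import KnowledgeBase

nlp = spacy.blank("en")

# Build a tiny KB; the example script instead loads one with kb.load_bulk(kb_path).
kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
kb.add_entity(entity="Q2146908", freq=12, entity_vector=[1, 2, 3])
kb.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[1.0])

# The KB is now passed through the component config instead of calling set_kb():
entity_linker = nlp.create_pipe("entity_linker", config={"kb": kb, "incl_prior": False})
nlp.add_pipe(entity_linker, last=True)
```

Note also that `nlp.update` now takes only keyword arguments after the batch of examples (e.g. `nlp.update(batch, drop=0.2, losses=losses)`); the old two-positional-argument `(text, annotations)` form raises the new E989 error.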
---
.../ptb-joint-pos-dep/bilstm_tok2vec.cfg | 2 ++
.../ptb-joint-pos-dep/defaults.cfg | 2 ++
.../tok2vec-ner/charembed_tok2vec.cfg | 2 ++
.../tok2vec-ner/multihashembed_tok2vec.cfg | 2 ++
examples/training/train_entity_linker.py | 8 ++---
pyproject.toml | 2 +-
setup.cfg | 2 +-
spacy/cli/train_from_config.py | 14 ++++++---
spacy/errors.py | 21 ++++++++++---
spacy/gold.pyx | 25 ++++++++++++---
spacy/language.py | 31 +++++++++++--------
spacy/ml/models/entity_linker.py | 12 +++++++
spacy/pipeline/pipes.pyx | 22 ++++++++-----
spacy/tests/pipeline/test_entity_linker.py | 7 ++---
spacy/util.py | 2 ++
15 files changed, 108 insertions(+), 46 deletions(-)
diff --git a/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg b/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg
index b6b4e82b6..e152fa5e0 100644
--- a/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg
+++ b/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg
@@ -12,6 +12,8 @@ use_gpu = 0
scores = ["tags_acc", "uas", "las"]
score_weights = {"las": 0.8, "tags_acc": 0.2}
limit = 0
+seed = 0
+accumulate_gradient = 2
[training.batch_size]
@schedules = "compounding.v1"
diff --git a/examples/experiments/ptb-joint-pos-dep/defaults.cfg b/examples/experiments/ptb-joint-pos-dep/defaults.cfg
index 2ceaab0be..9a10c45f0 100644
--- a/examples/experiments/ptb-joint-pos-dep/defaults.cfg
+++ b/examples/experiments/ptb-joint-pos-dep/defaults.cfg
@@ -12,6 +12,8 @@ use_gpu = -1
scores = ["tags_acc", "uas", "las"]
score_weights = {"las": 0.8, "tags_acc": 0.2}
limit = 0
+seed = 0
+accumulate_gradient = 2
[training.batch_size]
@schedules = "compounding.v1"
diff --git a/examples/experiments/tok2vec-ner/charembed_tok2vec.cfg b/examples/experiments/tok2vec-ner/charembed_tok2vec.cfg
index b8219ad10..796c8670f 100644
--- a/examples/experiments/tok2vec-ner/charembed_tok2vec.cfg
+++ b/examples/experiments/tok2vec-ner/charembed_tok2vec.cfg
@@ -10,6 +10,8 @@ orth_variant_level = 0.0
gold_preproc = true
max_length = 0
batch_size = 25
+seed = 0
+accumulate_gradient = 2
[optimizer]
@optimizers = "Adam.v1"
diff --git a/examples/experiments/tok2vec-ner/multihashembed_tok2vec.cfg b/examples/experiments/tok2vec-ner/multihashembed_tok2vec.cfg
index dc25a1c3b..3ac70675b 100644
--- a/examples/experiments/tok2vec-ner/multihashembed_tok2vec.cfg
+++ b/examples/experiments/tok2vec-ner/multihashembed_tok2vec.cfg
@@ -9,6 +9,8 @@ score_weights = {"ents_f": 1}
orth_variant_level = 0.0
gold_preproc = true
max_length = 0
+seed = 0
+accumulate_gradient = 2
[training.batch_size]
@schedules = "compounding.v1"
diff --git a/examples/training/train_entity_linker.py b/examples/training/train_entity_linker.py
index a22f255e7..2da1db26d 100644
--- a/examples/training/train_entity_linker.py
+++ b/examples/training/train_entity_linker.py
@@ -83,13 +83,13 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
# Create the Entity Linker component and add it to the pipeline.
if "entity_linker" not in nlp.pipe_names:
- # use only the predicted EL score and not the prior probability (for demo purposes)
- cfg = {"incl_prior": False}
- entity_linker = nlp.create_pipe("entity_linker", cfg)
kb = KnowledgeBase(vocab=nlp.vocab)
kb.load_bulk(kb_path)
print("Loaded Knowledge Base from '%s'" % kb_path)
- entity_linker.set_kb(kb)
+
+ # use only the predicted EL score and not the prior probability (for demo purposes)
+ cfg = {"kb": kb, "incl_prior": False}
+ entity_linker = nlp.create_pipe("entity_linker", cfg)
nlp.add_pipe(entity_linker, last=True)
# Convert the texts to docs to make sure we have doc.ents set for the training examples.
diff --git a/pyproject.toml b/pyproject.toml
index 548664e89..66a06c1d9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,7 +6,7 @@ requires = [
"cymem>=2.0.2,<2.1.0",
"preshed>=3.0.2,<3.1.0",
"murmurhash>=0.28.0,<1.1.0",
- "thinc==8.0.0a8",
+ "thinc==8.0.0a9",
"blis>=0.4.0,<0.5.0"
]
build-backend = "setuptools.build_meta"
diff --git a/setup.cfg b/setup.cfg
index df1658fd0..1cd088279 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -36,7 +36,7 @@ setup_requires =
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
murmurhash>=0.28.0,<1.1.0
- thinc==8.0.0a8
+ thinc==8.0.0a9
install_requires =
# Our libraries
murmurhash>=0.28.0,<1.1.0
diff --git a/spacy/cli/train_from_config.py b/spacy/cli/train_from_config.py
index bd83deb04..96c5b676e 100644
--- a/spacy/cli/train_from_config.py
+++ b/spacy/cli/train_from_config.py
@@ -12,7 +12,7 @@ import random
from ..gold import GoldCorpus
from .. import util
-
+from ..errors import Errors
registry = util.registry
@@ -233,6 +233,8 @@ def create_train_batches(nlp, corpus, cfg):
max_length=cfg["max_length"],
ignore_misaligned=True,
))
+ if len(train_examples) == 0:
+ raise ValueError(Errors.E988)
random.shuffle(train_examples)
batches = util.minibatch_by_words(train_examples, size=cfg["batch_size"])
for batch in batches:
@@ -313,12 +315,14 @@ def train_while_improving(
dropouts = dropout
results = []
losses = {}
+ to_enable = [name for name, proc in nlp.pipeline if hasattr(proc, "model")]
+
for step, batch in enumerate(train_data):
dropout = next(dropouts)
- for subbatch in subdivide_batch(batch, accumulate_gradient):
- nlp.update(subbatch, drop=dropout, losses=losses, sgd=False)
- for name, proc in nlp.pipeline:
- if hasattr(proc, "model"):
+ with nlp.select_pipes(enable=to_enable):
+ for subbatch in subdivide_batch(batch, accumulate_gradient):
+ nlp.update(subbatch, drop=dropout, losses=losses, sgd=False)
+ for name, proc in nlp.pipeline:
proc.model.finish_update(optimizer)
optimizer.step_schedules()
if not (step % eval_frequency):
diff --git a/spacy/errors.py b/spacy/errors.py
index 7a7b44731..4d38ab586 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -195,7 +195,7 @@ class Errors(object):
"the documentation:\nhttps://spacy.io/usage/models")
E030 = ("Sentence boundaries unset. You can add the 'sentencizer' "
"component to the pipeline with: "
- "nlp.add_pipe(nlp.create_pipe('sentencizer')) "
+ "nlp.add_pipe(nlp.create_pipe('sentencizer')). "
"Alternatively, add the dependency parser, or set sentence "
"boundaries by setting doc[i].is_sent_start.")
E031 = ("Invalid token: empty string ('') at position {i}.")
@@ -430,8 +430,7 @@ class Errors(object):
E138 = ("Invalid JSONL format for raw text '{text}'. Make sure the input "
"includes either the `text` or `tokens` key. For more info, see "
"the docs:\nhttps://spacy.io/api/cli#pretrain-jsonl")
- E139 = ("Knowledge Base for component '{name}' not initialized. Did you "
- "forget to call set_kb()?")
+ E139 = ("Knowledge Base for component '{name}' is empty.")
E140 = ("The list of entities, prior probabilities and entity vectors "
"should be of equal length.")
E141 = ("Entity vectors should be of length {required} instead of the "
@@ -548,6 +547,18 @@ class Errors(object):
"token itself.")
# TODO: fix numbering after merging develop into master
+
+ E987 = ("The text of an example training instance is either a Doc or "
+ "a string, but found {type} instead.")
+ E988 = ("Could not parse any training examples. Ensure the data is "
+ "formatted correctly.")
+ E989 = ("'nlp.update()' was called with two positional arguments. This "
+ "may be due to a backwards-incompatible change to the format "
+ "of the training data in spaCy 3.0 onwards. The 'update' "
+ "function should now be called with a batch of 'Example' "
+ "objects, instead of (text, annotation) tuples. ")
+ E990 = ("An entity linking component needs to be initialized with a "
+ "KnowledgeBase object, but found {type} instead.")
E991 = ("The function 'select_pipes' should be called with either a "
"'disable' argument to list the names of the pipe components "
"that should be disabled, or with an 'enable' argument that "
@@ -562,8 +573,8 @@ class Errors(object):
E997 = ("Tokenizer special cases are not allowed to modify the text. "
"This would map '{chunk}' to '{orth}' given token attributes "
"'{token_attrs}'.")
- E998 = ("Can only create GoldParse objects from Example objects without a "
- "Doc if get_gold_parses() is called with a Vocab object.")
+ E998 = ("To create GoldParse objects from Example objects without a "
+ "Doc, get_gold_parses() should be called with a Vocab object.")
E999 = ("Encountered an unexpected format for the dictionary holding "
"gold annotations: {gold_dict}")
diff --git a/spacy/gold.pyx b/spacy/gold.pyx
index 6647e41b4..46a6ae583 100644
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -212,6 +212,8 @@ class GoldCorpus(object):
doc = ex_dict.get("doc", None)
if doc is None:
doc = ex_dict.get("text", None)
+ if not (doc is None or isinstance(doc, Doc) or isinstance(doc, str)):
+ raise ValueError(Errors.E987.format(type=type(doc)))
examples.append(Example.from_dict(ex_dict, doc=doc))
elif file_name.endswith("msg"):
@@ -288,7 +290,6 @@ class GoldCorpus(object):
""" Setting gold_preproc will result in creating a doc per sentence """
for example in examples:
if gold_preproc:
- example.doc = None
split_examples = example.split_sents()
example_golds = []
for split_example in split_examples:
@@ -716,6 +717,12 @@ cdef class TokenAnnotation:
def get_sent_start(self, i):
return self.sent_starts[i] if i < len(self.sent_starts) else None
+ def __str__(self):
+ return str(self.to_dict())
+
+ def __repr__(self):
+ return self.__str__()
+
cdef class DocAnnotation:
def __init__(self, cats=None, links=None):
@@ -729,6 +736,12 @@ cdef class DocAnnotation:
def to_dict(self):
return {"cats": self.cats, "links": self.links}
+ def __str__(self):
+ return str(self.to_dict())
+
+ def __repr__(self):
+ return self.__str__()
+
cdef class Example:
def __init__(self, doc_annotation=None, token_annotation=None, doc=None,
@@ -747,9 +760,9 @@ cdef class Example:
@classmethod
def from_dict(cls, example_dict, doc=None):
- token_dict = example_dict["token_annotation"]
+ token_dict = example_dict.get("token_annotation", {})
token_annotation = TokenAnnotation.from_dict(token_dict)
- doc_dict = example_dict["doc_annotation"]
+ doc_dict = example_dict.get("doc_annotation", {})
doc_annotation = DocAnnotation.from_dict(doc_dict)
return cls(doc_annotation, token_annotation, doc)
@@ -791,6 +804,8 @@ cdef class Example:
def split_sents(self):
""" Split the token annotations into multiple Examples based on
sent_starts and return a list of the new Examples"""
+ if not self.token_annotation.words:
+ return [self]
s_example = Example(doc=None, doc_annotation=self.doc_annotation)
s_ids, s_words, s_tags, s_pos, s_morphs = [], [], [], [], []
s_lemmas, s_heads, s_deps, s_ents, s_sent_starts = [], [], [], [], []
@@ -842,7 +857,7 @@ cdef class Example:
if merge:
t = self.token_annotation
doc = self.doc
- if self.doc is None:
+ if doc is None or not isinstance(doc, Doc):
if not vocab:
raise ValueError(Errors.E998)
doc = Doc(vocab, words=t.words)
@@ -1052,7 +1067,7 @@ cdef class GoldParse:
self.sent_starts = [None] * len(doc)
# This needs to be done before we align the words
- if make_projective and heads is not None and deps is not None:
+ if make_projective and any(heads) and any(deps) :
heads, deps = nonproj.projectivize(heads, deps)
# Do many-to-one alignment for misaligned tokens.
diff --git a/spacy/language.py b/spacy/language.py
index 2b8fa129e..d71c27406 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -314,19 +314,20 @@ class Language(object):
# transform the model's config to an actual Model
factory_cfg = dict(config)
- model_cfg = None
+
+ # check whether we have a proper model config, or load a default one
+ if "model" in factory_cfg and not isinstance(factory_cfg["model"], dict):
+ warnings.warn(Warnings.W099.format(type=type(factory_cfg["model"]), pipe=name))
+
+ # refer to the model configuration in the cfg settings for this component
if "model" in factory_cfg:
- model_cfg = factory_cfg["model"]
- if not isinstance(model_cfg, dict):
- warnings.warn(Warnings.W099.format(type=type(model_cfg), pipe=name))
- model_cfg = None
+ self.config[name] = {"model": factory_cfg["model"]}
+
+ # create all objects in the config
+ factory_cfg = registry.make_from_config({"config": factory_cfg}, validate=True)["config"]
+ model = factory_cfg.get("model", None)
+ if model is not None:
del factory_cfg["model"]
- model = None
- if model_cfg is not None:
- self.config[name] = {"model": model_cfg}
- model = registry.make_from_config({"model": model_cfg}, validate=True)[
- "model"
- ]
return factory(self, model, **factory_cfg)
def add_pipe(
@@ -517,10 +518,11 @@ class Language(object):
def make_doc(self, text):
return self.tokenizer(text)
- def update(self, examples, drop=0.0, sgd=None, losses=None, component_cfg=None):
+ def update(self, examples, dummy=None, *, drop=0.0, sgd=None, losses=None, component_cfg=None):
"""Update the models in the pipeline.
examples (iterable): A batch of `Example` or `Doc` objects.
+ dummy: Should not be set - serves to catch backwards-incompatible scripts.
drop (float): The dropout rate.
sgd (callable): An optimizer.
losses (dict): Dictionary to update with the loss, keyed by component.
@@ -529,6 +531,9 @@ class Language(object):
DOCS: https://spacy.io/api/language#update
"""
+ if dummy is not None:
+ raise ValueError(Errors.E989)
+
if len(examples) == 0:
return
examples = Example.to_example_objects(examples, make_doc=self.make_doc)
@@ -735,7 +740,7 @@ class Language(object):
contexts = [
pipe.use_params(params)
for name, pipe in self.pipeline
- if hasattr(pipe, "use_params")
+ if hasattr(pipe, "use_params") and hasattr(pipe, "model")
]
# TODO: Having trouble with contextlib
# Workaround: these aren't actually context managers atm.
diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py
index 9cbaba984..00689e85b 100644
--- a/spacy/ml/models/entity_linker.py
+++ b/spacy/ml/models/entity_linker.py
@@ -1,7 +1,11 @@
+from pathlib import Path
+
from thinc.api import chain, clone, list2ragged, reduce_mean, residual
from thinc.api import Model, Maxout, Linear
from ...util import registry
+from ...kb import KnowledgeBase
+from ...vocab import Vocab
@registry.architectures.register("spacy.EntityLinker.v1")
@@ -19,3 +23,11 @@ def build_nel_encoder(tok2vec, nO=None):
model.set_ref("output_layer", output_layer)
model.set_ref("tok2vec", tok2vec)
return model
+
+
+@registry.assets.register("spacy.KBFromFile.v1")
+def load_kb(nlp_path, kb_path) -> KnowledgeBase:
+ vocab = Vocab().from_disk(Path(nlp_path) / "vocab")
+ kb = KnowledgeBase(vocab=vocab)
+ kb.load_bulk(kb_path)
+ return kb
diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index 4ff956e1d..56fe54664 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -2,6 +2,7 @@
import numpy
import srsly
import random
+from ast import literal_eval
from thinc.api import CosineDistance, to_categorical, get_array_module
from thinc.api import set_dropout_rate, SequenceCategoricalCrossentropy
@@ -1244,15 +1245,20 @@ class EntityLinker(Pipe):
self.vocab = vocab
self.model = model
self.kb = None
+ self.kb = cfg.get("kb", None)
+ if self.kb is None:
+ # create an empty KB that should be filled by calling from_disk
+ self.kb = KnowledgeBase(vocab=vocab)
+ else:
+ del cfg["kb"] # we don't want to duplicate its serialization
+ if not isinstance(self.kb, KnowledgeBase):
+ raise ValueError(Errors.E990.format(type=type(self.kb)))
self.cfg = dict(cfg)
self.distance = CosineDistance(normalize=False)
- def set_kb(self, kb):
- self.kb = kb
-
def require_kb(self):
# Raise an error if the knowledge base is not initialized.
- if getattr(self, "kb", None) in (None, True, False):
+ if len(self.kb) == 0:
raise ValueError(Errors.E139.format(name=self.name))
def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs):
@@ -1285,6 +1291,8 @@ class EntityLinker(Pipe):
ents_by_offset[(ent.start_char, ent.end_char)] = ent
for entity, kb_dict in gold.links.items():
+ if isinstance(entity, str):
+ entity = literal_eval(entity)
start, end = entity
mention = doc.text[start:end]
@@ -1375,7 +1383,6 @@ class EntityLinker(Pipe):
def predict(self, docs):
""" Return the KB IDs for each entity in each doc, including NIL if there is no prediction """
self.require_kb()
-
entity_count = 0
final_kb_ids = []
final_tensors = []
@@ -1486,9 +1493,8 @@ class EntityLinker(Pipe):
raise ValueError(Errors.E149)
def load_kb(p):
- kb = KnowledgeBase(vocab=self.vocab, entity_vector_length=self.cfg["entity_width"])
- kb.load_bulk(p)
- self.set_kb(kb)
+ self.kb = KnowledgeBase(vocab=self.vocab, entity_vector_length=self.cfg["entity_width"])
+ self.kb.load_bulk(p)
deserialize = {}
deserialize["vocab"] = lambda p: self.vocab.from_disk(p)
diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index cdd8451fd..32b434e04 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -203,8 +203,8 @@ def test_preserving_links_asdoc(nlp):
ruler.add_patterns(patterns)
nlp.add_pipe(ruler)
- el_pipe = nlp.create_pipe(name="entity_linker")
- el_pipe.set_kb(mykb)
+ cfg = {"kb": mykb, "incl_prior": False}
+ el_pipe = nlp.create_pipe(name="entity_linker", config=cfg)
el_pipe.begin_training()
el_pipe.incl_context = False
el_pipe.incl_prior = True
@@ -288,8 +288,7 @@ def test_overfitting_IO():
mykb.add_alias(alias="Russ Cochran", entities=["Q2146908", "Q7381115"], probabilities=[0.5, 0.5])
# Create the Entity Linker component and add it to the pipeline
- entity_linker = nlp.create_pipe("entity_linker")
- entity_linker.set_kb(mykb)
+ entity_linker = nlp.create_pipe("entity_linker", config={"kb": mykb})
nlp.add_pipe(entity_linker, last=True)
# train the NEL pipe
diff --git a/spacy/util.py b/spacy/util.py
index 048d923ee..f39813694 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -34,6 +34,7 @@ class registry(thinc.registry):
lookups = catalogue.create("spacy", "lookups", entry_points=True)
factories = catalogue.create("spacy", "factories", entry_points=True)
displacy_colors = catalogue.create("spacy", "displacy_colors", entry_points=True)
+ assets = catalogue.create("spacy", "assets", entry_points=True)
def set_env_log(value):
@@ -160,6 +161,7 @@ def load_model_from_path(model_path, meta=False, **overrides):
for name in pipeline:
if name not in disable:
config = meta.get("pipeline_args", {}).get(name, {})
+ config.update(overrides)
factory = factories.get(name, name)
if nlp_config.get(name, None):
model_config = nlp_config[name]["model"]
From 24efd54a42e7e5f22b040018f222d24867e83a87 Mon Sep 17 00:00:00 2001
From: Matthw Honnibal
Date: Wed, 20 May 2020 12:27:31 +0200
Subject: [PATCH 142/496] Merge from develop
---
spacy/cli/train_from_config.py | 5 ++++-
spacy/syntax/_parser_model.pyx | 6 +++++-
spacy/util.py | 13 +++----------
3 files changed, 12 insertions(+), 12 deletions(-)
diff --git a/spacy/cli/train_from_config.py b/spacy/cli/train_from_config.py
index 96c5b676e..54eedf69e 100644
--- a/spacy/cli/train_from_config.py
+++ b/spacy/cli/train_from_config.py
@@ -224,8 +224,9 @@ def train_from_config(
def create_train_batches(nlp, corpus, cfg):
+ is_first = True
while True:
- train_examples = list(corpus.train_dataset(
+ train_examples = corpus.train_dataset(
nlp,
noise_level=0.0,
orth_variant_level=cfg["orth_variant_level"],
@@ -323,6 +324,8 @@ def train_while_improving(
for subbatch in subdivide_batch(batch, accumulate_gradient):
nlp.update(subbatch, drop=dropout, losses=losses, sgd=False)
for name, proc in nlp.pipeline:
+ for name, proc in nlp.pipeline:
+ if hasattr(proc, "model"):
proc.model.finish_update(optimizer)
optimizer.step_schedules()
if not (step % eval_frequency):
diff --git a/spacy/syntax/_parser_model.pyx b/spacy/syntax/_parser_model.pyx
index 69f5bd6f6..60d22a1ab 100644
--- a/spacy/syntax/_parser_model.pyx
+++ b/spacy/syntax/_parser_model.pyx
@@ -474,7 +474,11 @@ cdef class precompute_hiddens:
# This will usually be on GPU
d_best = ops.asarray(d_best)
# Fix nans (which can occur from unseen classes.)
- d_best[ops.xp.isnan(d_best)] = 0.
+ try:
+ d_best[ops.xp.isnan(d_best)] = 0.
+ except:
+ print(ops.xp.isnan(d_best))
+ raise
if self.activation == "maxout":
mask_ = ops.asarray(mask)
return ops.backprop_maxout(d_best, mask_, self.nP)
diff --git a/spacy/util.py b/spacy/util.py
index f39813694..7f35c2f7c 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -598,16 +598,9 @@ def minibatch_by_words(examples, size, tuples=True, count_words=len, tolerance=0
try:
example = next(examples)
except StopIteration:
- if oversize:
- examples = iter(oversize)
- oversize = []
- if batch:
- yield batch
- break
- else:
- if batch:
- yield batch
- return
+ if batch:
+ yield batch
+ return
n_words = count_words(example.doc)
if n_words < (batch_size + tol_size):
batch_size -= n_words
From fda7355508cdb246f0ca12da1fd76b9c35cd8fa2 Mon Sep 17 00:00:00 2001
From: Matthw Honnibal
Date: Wed, 20 May 2020 12:30:21 +0200
Subject: [PATCH 143/496] Fix train-from-config
---
spacy/cli/train_from_config.py | 7 +++----
1 file changed, 3 insertions(+), 4 deletions(-)
diff --git a/spacy/cli/train_from_config.py b/spacy/cli/train_from_config.py
index 54eedf69e..429a3cf49 100644
--- a/spacy/cli/train_from_config.py
+++ b/spacy/cli/train_from_config.py
@@ -226,7 +226,7 @@ def train_from_config(
def create_train_batches(nlp, corpus, cfg):
is_first = True
while True:
- train_examples = corpus.train_dataset(
+ train_examples = list(corpus.train_dataset(
nlp,
noise_level=0.0,
orth_variant_level=cfg["orth_variant_level"],
@@ -324,9 +324,8 @@ def train_while_improving(
for subbatch in subdivide_batch(batch, accumulate_gradient):
nlp.update(subbatch, drop=dropout, losses=losses, sgd=False)
for name, proc in nlp.pipeline:
- for name, proc in nlp.pipeline:
- if hasattr(proc, "model"):
- proc.model.finish_update(optimizer)
+ if hasattr(proc, "model"):
+ proc.model.finish_update(optimizer)
optimizer.step_schedules()
if not (step % eval_frequency):
score, other_scores = evaluate()
From 60e8da481300da3540138d2689f73324a07b071b Mon Sep 17 00:00:00 2001
From: Matthw Honnibal
Date: Wed, 20 May 2020 12:56:27 +0200
Subject: [PATCH 144/496] Tidy up train-from-config a bit
---
spacy/cli/train_from_config.py | 38 +++++++++++++++++++++-------------
1 file changed, 24 insertions(+), 14 deletions(-)
diff --git a/spacy/cli/train_from_config.py b/spacy/cli/train_from_config.py
index 429a3cf49..c75c861cc 100644
--- a/spacy/cli/train_from_config.py
+++ b/spacy/cli/train_from_config.py
@@ -193,10 +193,11 @@ def train_from_config(
optimizer,
train_batches,
evaluate,
- training["dropout"],
- training["patience"],
- training["eval_frequency"],
- training["accumulate_gradient"]
+ dropout=training["dropout"],
+ accumulate_gradient=training["accumulate_gradient"],
+ patience=training.get("patience", 0),
+ max_steps=training.get("max_steps", 0),
+ eval_frequency=training["eval_frequency"],
)
msg.info(f"Training. Initial learn rate: {optimizer.learn_rate}")
@@ -214,17 +215,17 @@ def train_from_config(
progress = tqdm.tqdm(total=training["eval_frequency"], leave=False)
finally:
if output_path is not None:
- with nlp.use_params(optimizer.averages):
- final_model_path = output_path / "model-final"
+ final_model_path = output_path / "model-final"
+ if optimizer.averages:
+ with nlp.use_params(optimizer.averages):
+ nlp.to_disk(final_model_path)
+ else:
nlp.to_disk(final_model_path)
msg.good("Saved model to output directory", final_model_path)
- # with msg.loading("Creating best model..."):
- # best_model_path = _collate_best_model(meta, output_path, nlp.pipe_names)
- # msg.good("Created best model", best_model_path)
def create_train_batches(nlp, corpus, cfg):
- is_first = True
+ epochs_todo = cfg.get("max_epochs", 0)
while True:
train_examples = list(corpus.train_dataset(
nlp,
@@ -240,6 +241,11 @@ def create_train_batches(nlp, corpus, cfg):
batches = util.minibatch_by_words(train_examples, size=cfg["batch_size"])
for batch in batches:
yield batch
+ epochs_todo -= 1
+ # We intentionally compare exactly to 0 here, so that max_epochs < 1
+ # will not break.
+ if epochs_todo == 0:
+ break
def create_evaluation_callback(nlp, optimizer, corpus, cfg):
@@ -270,8 +276,8 @@ def create_evaluation_callback(nlp, optimizer, corpus, cfg):
def train_while_improving(
- nlp, optimizer, train_data, evaluate, dropout, patience, eval_frequency,
- accumulate_gradient
+ nlp, optimizer, train_data, evaluate, *, dropout, eval_frequency,
+ accumulate_gradient=1, patience=0, max_steps=0
):
"""Train until an evaluation stops improving. Works as a generator,
with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`,
@@ -281,6 +287,7 @@ def train_while_improving(
Positional arguments:
nlp: The spaCy pipeline to evaluate.
+ optimizer: The optimizer callable.
train_data (Iterable[Batch]): A generator of batches, with the training
data. Each batch should be a Sized[Tuple[Input, Annot]]. The training
data iterable needs to take care of iterating over the epochs and
@@ -344,9 +351,12 @@ def train_while_improving(
yield batch, info, is_best_checkpoint
if is_best_checkpoint is not None:
losses = {}
- # Stop if no improvement in `patience` updates
+ # Stop if no improvement in `patience` updates (if specified)
best_score, best_step = max(results)
- if (step - best_step) >= patience:
+ if patience and (step - best_step) >= patience:
+ break
+ # Stop if we've exhausted our max steps (if specified)
+ if max_steps and (step * accumulate_gradient) >= max_steps:
break
From 609c0ba557964f4b3111c4c253571b6d57377d18 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Wed, 20 May 2020 18:48:18 +0200
Subject: [PATCH 145/496] Fix accidentally quadratic runtime in
Example.split_sents (#5464)
* Tidy up train-from-config a bit
* Fix accidentally quadratic perf in TokenAnnotation.brackets
When we're reading in the gold data, we had a nested loop where
we looped over the brackets for each token, looking for brackets
that start on that word. This is accidentally quadratic, because
we have one bracket per word (for the POS tags). So we had
an O(N**2) behaviour here that ended up being pretty slow.
To solve this I'm indexing the brackets by their starting word
on the TokenAnnotation object, and adding a brackets property that
reconstructs the previous list view.
* Fixes
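
The shape of the fix, as a standalone sketch with made-up bracket tuples (start, end, label): index the brackets by their start token once, then each token does a constant-time lookup instead of rescanning the whole list:

```python
brackets = [(0, 2, "NP"), (1, 1, "NN"), (3, 5, "VP")]  # (start, end, label)

# Old shape: every token rescans the full bracket list -> O(tokens * brackets).
def per_token_old(n_tokens, brackets):
    return [[b for b in brackets if b[0] == i] for i in range(n_tokens)]

# New shape: group brackets by their start token once, then do O(1) lookups.
def per_token_new(n_tokens, brackets):
    by_start = {}
    for start, end, label in brackets:
        by_start.setdefault(start, []).append((end, label))
    return [[(i, end, label) for end, label in by_start.get(i, [])]
            for i in range(n_tokens)]

assert per_token_old(6, brackets) == per_token_new(6, brackets)
```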
---
spacy/cli/train_from_config.py | 38 +++++++++++++++++++++-------------
spacy/gold.pxd | 2 +-
spacy/gold.pyx | 28 +++++++++++++++++++------
3 files changed, 47 insertions(+), 21 deletions(-)
diff --git a/spacy/cli/train_from_config.py b/spacy/cli/train_from_config.py
index 429a3cf49..c75c861cc 100644
--- a/spacy/cli/train_from_config.py
+++ b/spacy/cli/train_from_config.py
@@ -193,10 +193,11 @@ def train_from_config(
optimizer,
train_batches,
evaluate,
- training["dropout"],
- training["patience"],
- training["eval_frequency"],
- training["accumulate_gradient"]
+ dropout=training["dropout"],
+ accumulate_gradient=training["accumulate_gradient"],
+ patience=training.get("patience", 0),
+ max_steps=training.get("max_steps", 0),
+ eval_frequency=training["eval_frequency"],
)
msg.info(f"Training. Initial learn rate: {optimizer.learn_rate}")
@@ -214,17 +215,17 @@ def train_from_config(
progress = tqdm.tqdm(total=training["eval_frequency"], leave=False)
finally:
if output_path is not None:
- with nlp.use_params(optimizer.averages):
- final_model_path = output_path / "model-final"
+ final_model_path = output_path / "model-final"
+ if optimizer.averages:
+ with nlp.use_params(optimizer.averages):
+ nlp.to_disk(final_model_path)
+ else:
nlp.to_disk(final_model_path)
msg.good("Saved model to output directory", final_model_path)
- # with msg.loading("Creating best model..."):
- # best_model_path = _collate_best_model(meta, output_path, nlp.pipe_names)
- # msg.good("Created best model", best_model_path)
def create_train_batches(nlp, corpus, cfg):
- is_first = True
+ epochs_todo = cfg.get("max_epochs", 0)
while True:
train_examples = list(corpus.train_dataset(
nlp,
@@ -240,6 +241,11 @@ def create_train_batches(nlp, corpus, cfg):
batches = util.minibatch_by_words(train_examples, size=cfg["batch_size"])
for batch in batches:
yield batch
+ epochs_todo -= 1
+ # We intentionally compare exactly to 0 here, so that max_epochs < 1
+ # will not break.
+ if epochs_todo == 0:
+ break
def create_evaluation_callback(nlp, optimizer, corpus, cfg):
@@ -270,8 +276,8 @@ def create_evaluation_callback(nlp, optimizer, corpus, cfg):
def train_while_improving(
- nlp, optimizer, train_data, evaluate, dropout, patience, eval_frequency,
- accumulate_gradient
+ nlp, optimizer, train_data, evaluate, *, dropout, eval_frequency,
+ accumulate_gradient=1, patience=0, max_steps=0
):
"""Train until an evaluation stops improving. Works as a generator,
with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`,
@@ -281,6 +287,7 @@ def train_while_improving(
Positional arguments:
nlp: The spaCy pipeline to evaluate.
+ optimizer: The optimizer callable.
train_data (Iterable[Batch]): A generator of batches, with the training
data. Each batch should be a Sized[Tuple[Input, Annot]]. The training
data iterable needs to take care of iterating over the epochs and
@@ -344,9 +351,12 @@ def train_while_improving(
yield batch, info, is_best_checkpoint
if is_best_checkpoint is not None:
losses = {}
- # Stop if no improvement in `patience` updates
+ # Stop if no improvement in `patience` updates (if specified)
best_score, best_step = max(results)
- if (step - best_step) >= patience:
+ if patience and (step - best_step) >= patience:
+ break
+ # Stop if we've exhausted our max steps (if specified)
+ if max_steps and (step * accumulate_gradient) >= max_steps:
break
diff --git a/spacy/gold.pxd b/spacy/gold.pxd
index c5ab6ebbe..bf724868f 100644
--- a/spacy/gold.pxd
+++ b/spacy/gold.pxd
@@ -53,7 +53,7 @@ cdef class TokenAnnotation:
cdef public list deps
cdef public list entities
cdef public list sent_starts
- cdef public list brackets
+ cdef public dict brackets_by_start
cdef class DocAnnotation:
diff --git a/spacy/gold.pyx b/spacy/gold.pyx
index 46a6ae583..1864b7a04 100644
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -658,7 +658,18 @@ cdef class TokenAnnotation:
self.deps = deps if deps else []
self.entities = entities if entities else []
self.sent_starts = sent_starts if sent_starts else []
- self.brackets = brackets if brackets else []
+ self.brackets_by_start = {}
+ if brackets:
+ for b_start, b_end, b_label in brackets:
+ self.brackets_by_start.setdefault(b_start, []).append((b_end, b_label))
+
+ @property
+ def brackets(self):
+ brackets = []
+ for start, ends_labels in self.brackets_by_start.items():
+ for end, label in ends_labels:
+ brackets.append((start, end, label))
+ return brackets
@classmethod
def from_dict(cls, token_dict):
@@ -811,8 +822,10 @@ cdef class Example:
s_lemmas, s_heads, s_deps, s_ents, s_sent_starts = [], [], [], [], []
s_brackets = []
sent_start_i = 0
- t = self.token_annotation
+ cdef TokenAnnotation t = self.token_annotation
split_examples = []
+ cdef int b_start, b_end
+ cdef unicode b_label
for i in range(len(t.words)):
if i > 0 and t.sent_starts[i] == 1:
s_example.set_token_annotation(ids=s_ids,
@@ -836,9 +849,10 @@ cdef class Example:
s_deps.append(t.get_dep(i))
s_ents.append(t.get_entity(i))
s_sent_starts.append(t.get_sent_start(i))
- s_brackets.extend((b[0] - sent_start_i,
- b[1] - sent_start_i, b[2])
- for b in t.brackets if b[0] == i)
+ for b_end, b_label in t.brackets_by_start.get(i, []):
+ s_brackets.append(
+ (i - sent_start_i, b_end - sent_start_i, b_label)
+ )
i += 1
s_example.set_token_annotation(ids=s_ids, words=s_words, tags=s_tags,
pos=s_pos, morphs=s_morphs, lemmas=s_lemmas, heads=s_heads,
@@ -904,8 +918,10 @@ cdef class Example:
examples = [examples]
converted_examples = []
for ex in examples:
+ if isinstance(ex, Example):
+ converted_examples.append(ex)
# convert string to Doc to Example
- if isinstance(ex, str):
+ elif isinstance(ex, str):
if keep_raw_text:
converted_examples.append(Example(doc=ex))
else:
From 4b229bfc220f1c8ab63ac2fa9b17365689d4c5a2 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Wed, 20 May 2020 18:48:51 +0200
Subject: [PATCH 146/496] Improve handling of NER in CoNLL-U MISC
---
spacy/cli/converters/conllu2json.py | 45 +++++++++++++++--------------
spacy/tests/test_cli.py | 28 +++++++++++++-----
2 files changed, 43 insertions(+), 30 deletions(-)
diff --git a/spacy/cli/converters/conllu2json.py b/spacy/cli/converters/conllu2json.py
index ecdc2ae66..0b2920802 100644
--- a/spacy/cli/converters/conllu2json.py
+++ b/spacy/cli/converters/conllu2json.py
@@ -26,7 +26,7 @@ def conllu2json(
Extract NER tags if available and convert them so that they follow
BILUO and the Wikipedia scheme
"""
- MISC_NER_PATTERN = "\|?(?:name=)?(([A-Z_]+)-([A-Z_]+)|O)\|?"
+ MISC_NER_PATTERN = "^((?:name|NE)=)?([BILU])-([A-Z_]+)|O$"
msg = Printer(no_print=no_print)
n_sents_info(msg, n_sents)
docs = []
@@ -39,7 +39,7 @@ def conllu2json(
ner_map=ner_map,
merge_subtokens=merge_subtokens,
)
- has_ner_tags = has_ner(input_data, ner_tag_pattern=MISC_NER_PATTERN)
+ has_ner_tags = has_ner(input_data, MISC_NER_PATTERN)
for i, example in enumerate(conll_data):
raw += example.text
sentences.append(
@@ -65,21 +65,20 @@ def conllu2json(
def has_ner(input_data, ner_tag_pattern):
"""
- Check the 10th column of the first token to determine if the file contains
- NER tags
+ Check the MISC column for NER tags.
"""
for sent in input_data.strip().split("\n\n"):
lines = sent.strip().split("\n")
if lines:
while lines[0].startswith("#"):
lines.pop(0)
- if lines:
- parts = lines[0].split("\t")
+ for line in lines:
+ parts = line.split("\t")
id_, word, lemma, pos, tag, morph, head, dep, _1, misc = parts
- if re.search(ner_tag_pattern, misc):
- return True
- else:
- return False
+ for misc_part in misc.split("|"):
+ if re.match(ner_tag_pattern, misc_part):
+ return True
+ return False
def read_conllx(
@@ -127,19 +126,21 @@ def get_entities(lines, tag_pattern, ner_map=None):
iob = []
for misc in miscs:
- tag_match = re.search(tag_pattern, misc)
iob_tag = "O"
- if tag_match:
- prefix = tag_match.group(2)
- suffix = tag_match.group(3)
- if prefix and suffix:
- iob_tag = prefix + "-" + suffix
- if ner_map:
- suffix = ner_map.get(suffix, suffix)
- if suffix == "":
- iob_tag = "O"
- else:
- iob_tag = prefix + "-" + suffix
+ for misc_part in misc.split("|"):
+ tag_match = re.match(tag_pattern, misc_part)
+ if tag_match:
+ prefix = tag_match.group(2)
+ suffix = tag_match.group(3)
+ if prefix and suffix:
+ iob_tag = prefix + "-" + suffix
+ if ner_map:
+ suffix = ner_map.get(suffix, suffix)
+ if suffix == "":
+ iob_tag = "O"
+ else:
+ iob_tag = prefix + "-" + suffix
+ break
iob.append(iob_tag)
return iob_to_biluo(iob)
diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py
index 306adc881..132f7ac9f 100644
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@@ -29,14 +29,26 @@ def test_cli_converters_conllu2json():
assert [t["ner"] for t in tokens] == ["O", "B-PER", "L-PER", "O"]
-def test_cli_converters_conllu2json_name_ner_map():
- lines = [
- "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tname=O",
- "2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tSpaceAfter=No|name=B-PER",
- "3\tEilertsen\tEilertsen\tPROPN\t_\t_\t2\tname\t_\tname=I-PER",
- "4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tSpaceAfter=No|name=O",
- "5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=B-BAD",
- ]
+@pytest.mark.parametrize(
+ "lines",
+ [
+ (
+ "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tname=O",
+ "2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tSpaceAfter=No|name=B-PER",
+ "3\tEilertsen\tEilertsen\tPROPN\t_\t_\t2\tname\t_\tname=I-PER",
+ "4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tSpaceAfter=No|name=O",
+ "5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=B-BAD",
+ ),
+ (
+ "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\t_",
+ "2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tSpaceAfter=No|NE=B-PER",
+ "3\tEilertsen\tEilertsen\tPROPN\t_\t_\t2\tname\t_\tNE=L-PER",
+ "4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tSpaceAfter=No",
+ "5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tNE=B-BAD",
+ ),
+ ],
+)
+def test_cli_converters_conllu2json_name_ner_map(lines):
input_data = "\n".join(lines)
converted = conllu2json(input_data, n_sents=1, ner_map={"PER": "PERSON", "BAD": ""})
assert len(converted) == 1
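As a quick sanity check of the new MISC pattern, here is a standalone sketch (not part of the patch) showing how it behaves on typical MISC parts:

import re

MISC_NER_PATTERN = "^((?:name|NE)=)?([BILU])-([A-Z_]+)|O$"

for part in ["NE=B-PER", "name=O", "SpaceAfter=No", "O"]:
    m = re.match(MISC_NER_PATTERN, part)
    print(part, m.groups() if m else "no match")
# NE=B-PER      -> ('NE=', 'B', 'PER'): prefix "B" and suffix "PER" give "B-PER"
# name=O        -> no match: iob_tag keeps its default "O"
# SpaceAfter=No -> no match: unrelated MISC parts are ignored
# O             -> (None, None, None): matches, but with no prefix/suffix it stays "O"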
From a3b7ae4f984bc7244402d50bbd6850f421fa29f7 Mon Sep 17 00:00:00 2001
From: Kevin Lu
Date: Wed, 20 May 2020 09:11:32 -0700
Subject: [PATCH 147/496] Update universe.json
---
website/meta/universe.json | 102 +++++++++++++++++++++++++++++++++++--
1 file changed, 99 insertions(+), 3 deletions(-)
diff --git a/website/meta/universe.json b/website/meta/universe.json
index 22673834a..8aaabf408 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -115,11 +115,11 @@
"print(text)"
],
"category": ["scientific", "biomedical"],
- "author": "Travis Hoppe",
+ "author": "Travis Hoppe",
"author_links": {
"github": "thoppe",
- "twitter":"metasemantic",
- "website" : "http://thoppe.github.io/"
+ "twitter": "metasemantic",
+ "website": "http://thoppe.github.io/"
}
},
{
@@ -2099,6 +2099,102 @@
"predict_output = clf.predict(predict_input)"
],
"category": ["standalone"]
+ },
+ {
+ "id": "spacy_fastlang",
+ "title": "Spacy FastLang",
+ "slogan": "Language detection done fast",
+ "description": "Fast language detection using FastText and Spacy.",
+ "github": "thomasthiebaud/spacy-fastlang",
+ "pip": "spacy_fastlang",
+ "code_example": [
+ "import spacy",
+ "from spacy_fastlang import LanguageDetector",
+ "",
+ "nlp = spacy.load('en_core_web_sm')",
+ "nlp.add_pipe(LanguageDetector())",
+ "doc = nlp('Life is like a box of chocolates. You never know what you're gonna get.')",
+ "",
+ "assert doc._.language == 'en'",
+ "assert doc._.language_score >= 0.8"
+ ],
+ "author": "Thomas Thiebaud",
+ "author_links": {
+ "github": "thomasthiebaud"
+ },
+ "category": ["pipeline"]
+ },
+ {
+ "id": "mlflow",
+ "title": "MLflow",
+ "slogan": "An open source platform for the machine learning lifecycle",
+ "description": "MLflow is an open source platform to manage the ML lifecycle, including experimentation, reproducibility, deployment, and a central model registry. MLflow currently offers four components: Tracking, Projects, Models and Registry.",
+ "github": "mlflow/mlflow",
+ "pip": "mlflow",
+ "thumb": "https://www.mlflow.org/docs/latest/_static/MLflow-logo-final-black.png",
+ "image": "",
+ "url": "https://mlflow.org/",
+ "author": "Databricks",
+ "author_links": {
+ "github": "databricks",
+ "twitter": "databricks",
+ "website": "https://databricks.com/"
+ },
+ "category": ["standalone", "apis"],
+ "code_example": [
+ "import mlflow",
+ "import mlflow.spacy",
+ "",
+ "# MLflow Tracking",
+ "nlp = spacy.load('my_best_model_path/output/model-best')",
+ "with mlflow.start_run(run_name='Spacy'):",
+ " mlflow.set_tag('model_flavor', 'spacy')",
+ " mlflow.spacy.log_model(spacy_model=nlp, artifact_path='model')",
+ " mlflow.log_metric(('accuracy', 0.72))",
+ " my_run_id = mlflow.active_run().info.run_id",
+ "",
+ "",
+ "# MLflow Models",
+ "model_uri = f'runs:/{my_run_id}/model'",
+ "nlp2 = mlflow.spacy.load_model(model_uri=model_uri)"
+ ]
+ },
+ {
+ "id": "pyate",
+ "title": "PyATE",
+ "slogan": "Python Automated Term Extraction",
+ "description": "PyATE is a term extraction library written in Python using Spacy POS tagging with Basic, Combo Basic, C-Value, TermExtractor, and Weirdness.",
+ "github": "kevinlu1248/pyate",
+ "pip": "pyate",
+ "code_example": [
+ "import spacy",
+ "from pyate.term_extraction_pipeline import TermExtractionPipeline",
+ "",
+ "nlp = spacy.load('en_core_web_sm')",
+ "nlp.add_pipe(TermExtractionPipeline())",
+ "# source: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1994795/",
+ "string = 'Central to the development of cancer are genetic changes that endow these “cancer cells” with many of the hallmarks of cancer, such as self-sufficient growth and resistance to anti-growth and pro-death signals. However, while the genetic changes that occur within cancer cells themselves, such as activated oncogenes or dysfunctional tumor suppressors, are responsible for many aspects of cancer development, they are not sufficient. Tumor promotion and progression are dependent on ancillary processes provided by cells of the tumor environment but that are not necessarily cancerous themselves. Inflammation has long been associated with the development of cancer. This review will discuss the reflexive relationship between cancer and inflammation with particular focus on how considering the role of inflammation in physiologic processes such as the maintenance of tissue homeostasis and repair may provide a logical framework for understanding the connection between the inflammatory response and cancer.'",
+ "",
+ "doc = nlp(string)",
+ "print(doc._.combo_basic.sort_values(ascending=False).head(5))",
+ "\"\"\"\"\"\"",
+ "dysfunctional tumor 1.443147",
+ "tumor suppressors 1.443147",
+ "genetic changes 1.386294",
+ "cancer cells 1.386294",
+ "dysfunctional tumor suppressors 1.298612",
+ "\"\"\"\"\"\""
+ ],
+ "code_language": "python",
+ "url": "https://github.com/kevinlu1248/pyate",
+ "author": "Kevin Lu",
+ "author_links": {
+ "twitter": "kevinlu1248",
+ "github": "kevinlu1248",
+ "website": "https://github.com/kevinlu1248/pyate"
+ },
+ "category": ["pipeline", "research"],
+ "tags": ["term_extraction"]
}
],
From f44897e4c67e26228c143afd5edcf716b1e4912f Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Thu, 21 May 2020 18:39:11 +0200
Subject: [PATCH 148/496] Update warning IDs
---
spacy/morphology.pyx | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
index 0b53b124c..f7e38bbea 100644
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -42,7 +42,7 @@ def _normalize_props(props):
elif isinstance(key, (int, str)) and isinstance(value, (int, str)):
out[key] = value
else:
- warnings.warn(Warnings.W029.format(feature={key: value}))
+ warnings.warn(Warnings.W095.format(feature={key: value}))
return out
@@ -112,7 +112,7 @@ cdef class Morphology:
return tag_ptr.key
features = self.feats_to_dict(features)
if not isinstance(features, dict):
- warnings.warn(Warnings.W029.format(feature=features))
+ warnings.warn(Warnings.W095.format(feature=features))
features = {}
features = _normalize_props(features)
string_features = {self.strings.as_string(field): self.strings.as_string(values) for field, values in features.items()}
From d34fc0915eda68114d088394a7fee304039d0486 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Thu, 21 May 2020 18:48:21 +0200
Subject: [PATCH 149/496] Remove serialization getter
---
spacy/vocab.pyx | 1 -
1 file changed, 1 deletion(-)
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index ab240df90..19896f07b 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -496,7 +496,6 @@ cdef class Vocab:
getters = {
"strings": lambda: self.strings.to_bytes(),
- "lexemes": lambda: self.lexemes_to_bytes(),
"vectors": deserialize_vectors,
"lookups": lambda: self.lookups.to_bytes(),
"lookups_extra": lambda: self.lookups_extra.to_bytes()
From 631e20d0c64635dc8d2512ddd068293325ef5ebe Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Thu, 21 May 2020 19:01:02 +0200
Subject: [PATCH 150/496] Fix test and schemas
---
spacy/schemas.py | 1 +
spacy/tests/parser/test_ner.py | 6 +-----
2 files changed, 2 insertions(+), 5 deletions(-)
diff --git a/spacy/schemas.py b/spacy/schemas.py
index 3b6313db8..3024326dd 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -62,6 +62,7 @@ class TokenPatternNumber(BaseModel):
IN: Optional[List[StrictInt]] = None
NOT_IN: Optional[List[StrictInt]] = None
EQ: Union[StrictInt, StrictFloat] = Field(None, alias="==")
+ NEQ: Union[StrictInt, StrictFloat] = Field(None, alias="!=")
GEQ: Union[StrictInt, StrictFloat] = Field(None, alias=">=")
LEQ: Union[StrictInt, StrictFloat] = Field(None, alias="<=")
GT: Union[StrictInt, StrictFloat] = Field(None, alias=">")
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index e78cac757..8e41a16c0 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -198,11 +198,7 @@ def test_train_empty():
batches = util.minibatch(train_data)
for batch in batches:
texts, annotations = zip(*batch)
- nlp.update(
- texts, # batch of texts
- annotations, # batch of annotations
- losses=losses,
- )
+ nlp.update(train_data, losses=losses)
def test_overwrite_token():
From f075655debdd35e2cd648bd845b8b966edb5c733 Mon Sep 17 00:00:00 2001
From: Matthw Honnibal
Date: Thu, 21 May 2020 19:26:29 +0200
Subject: [PATCH 151/496] Fix shape inference in begin_training
---
spacy/ml/models/parser.py | 3 +--
spacy/ml/models/tagger.py | 3 +--
spacy/ml/tb_framework.py | 4 ++--
spacy/pipeline/pipes.pyx | 7 ++++++-
spacy/syntax/nn_parser.pyx | 4 ++++
5 files changed, 14 insertions(+), 7 deletions(-)
diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py
index 710d36a1d..0e0857ca8 100644
--- a/spacy/ml/models/parser.py
+++ b/spacy/ml/models/parser.py
@@ -15,10 +15,9 @@ def build_tb_parser_model(
use_upper=True,
nO=None,
):
- token_vector_width = tok2vec.get_dim("nO")
tok2vec = chain(
tok2vec,
- with_array(Linear(hidden_width, token_vector_width)),
+ with_array(Linear(hidden_width)),
list2array(),
)
tok2vec.set_dim("nO", hidden_width)
diff --git a/spacy/ml/models/tagger.py b/spacy/ml/models/tagger.py
index 683c8b518..87256cb5c 100644
--- a/spacy/ml/models/tagger.py
+++ b/spacy/ml/models/tagger.py
@@ -6,9 +6,8 @@ from ...util import registry
@registry.architectures.register("spacy.Tagger.v1")
def build_tagger_model(tok2vec, nO=None) -> Model:
- token_vector_width = tok2vec.get_dim("nO")
# TODO: glorot_uniform_init seems to work a bit better than zero_init here?!
- output_layer = Softmax(nO, nI=token_vector_width, init_W=zero_init)
+ output_layer = Softmax(nO, init_W=zero_init)
softmax = with_array(output_layer)
model = chain(tok2vec, softmax)
model.set_ref("tok2vec", tok2vec)
diff --git a/spacy/ml/tb_framework.py b/spacy/ml/tb_framework.py
index e4301a644..251189389 100644
--- a/spacy/ml/tb_framework.py
+++ b/spacy/ml/tb_framework.py
@@ -38,8 +38,8 @@ def forward(model, X, is_train):
def init(model, X=None, Y=None):
- tok2vec = model.get_ref("tok2vec").initialize()
- lower = model.get_ref("lower").initialize(X=X)
+ tok2vec = model.get_ref("tok2vec").initialize(X=X)
+ lower = model.get_ref("lower").initialize()
if model.attrs["has_upper"]:
statevecs = model.ops.alloc2f(2, lower.get_dim("nO"))
model.get_ref("upper").initialize(X=statevecs)
diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index 56fe54664..00c8894fd 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -531,7 +531,12 @@ class Tagger(Pipe):
vocab.morphology.lemmatizer,
exc=vocab.morphology.exc)
self.set_output(len(self.labels))
- self.model.initialize()
+ doc_sample = [Doc(self.vocab, words=["hello", "world"])]
+ for name, component in pipeline:
+ if component is self:
+ break
+ doc_sample = list(component.pipe(doc_sample))
+ self.model.initialize(X=doc_sample)
# Get batch of example docs, example outputs to call begin_training().
# This lets the model infer shapes.
link_vectors_to_models(self.vocab)
diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index 31aa4d413..94369a828 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -629,6 +629,10 @@ cdef class Parser:
for doc, gold in parses:
doc_sample.append(doc)
gold_sample.append(gold)
+ for name, component in pipeline:
+ if component is self:
+ break
+ doc_sample = list(component.pipe(doc_sample))
self.model.initialize(doc_sample, gold_sample)
if pipeline is not None:
self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **self.cfg)
From 3b5cfec1fcf34e45d86fd2b133120be13141488a Mon Sep 17 00:00:00 2001
From: Matthw Honnibal
Date: Thu, 21 May 2020 19:32:04 +0200
Subject: [PATCH 152/496] Tweak memory management in train_from_config
---
spacy/cli/train_from_config.py | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/spacy/cli/train_from_config.py b/spacy/cli/train_from_config.py
index c75c861cc..eeb21c10c 100644
--- a/spacy/cli/train_from_config.py
+++ b/spacy/cli/train_from_config.py
@@ -213,6 +213,12 @@ def train_from_config(
if is_best_checkpoint and output_path is not None:
nlp.to_disk(output_path)
progress = tqdm.tqdm(total=training["eval_frequency"], leave=False)
+            # Clean up the objects to facilitate garbage collection.
+ for eg in batch:
+ eg.doc = None
+ eg.goldparse = None
+ eg.doc_annotation = None
+ eg.token_annotation = None
finally:
if output_path is not None:
final_model_path = output_path / "model-final"
From 245f91df78e2fd3977ec5b937bac67d3689dd41c Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Thu, 21 May 2020 19:42:13 +0200
Subject: [PATCH 153/496] Fix merge issues
---
spacy/gold.pyx | 6 ++++++
spacy/tests/regression/test_issue5137.py | 3 ++-
2 files changed, 8 insertions(+), 1 deletion(-)
diff --git a/spacy/gold.pyx b/spacy/gold.pyx
index 13e448342..5aa7da456 100644
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -1050,6 +1050,12 @@ cdef class GoldParse:
# avoid allocating memory if the doc does not contain any tokens
if self.length == 0:
+ self.words = []
+ self.tags = []
+ self.heads = []
+ self.labels = []
+ self.ner = []
+ self.morphs = []
# set a minimal orig so that the scorer can score an empty doc
self.orig = TokenAnnotation(ids=[])
else:
diff --git a/spacy/tests/regression/test_issue5137.py b/spacy/tests/regression/test_issue5137.py
index 4b4e597d3..e9fd268c8 100644
--- a/spacy/tests/regression/test_issue5137.py
+++ b/spacy/tests/regression/test_issue5137.py
@@ -21,7 +21,8 @@ def test_issue5137():
def from_disk(self, path, **cfg):
pass
- Language.factories["my_component"] = lambda nlp, **cfg: MyComponent(nlp, **cfg)
+ factory = lambda nlp, model, **cfg: MyComponent(nlp, **cfg)
+ Language.factories["my_component"] = factory
nlp = English()
nlp.add_pipe(nlp.create_pipe("my_component"))
From 17ee9ab53acd5f39a2684e3442490201b66d2be4 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Thu, 21 May 2020 19:49:08 +0200
Subject: [PATCH 154/496] Fix _SP/POS=SPACE in strings serialization tests
---
.../serialize/test_serialize_vocab_strings.py | 19 ++++++++++---------
1 file changed, 10 insertions(+), 9 deletions(-)
diff --git a/spacy/tests/serialize/test_serialize_vocab_strings.py b/spacy/tests/serialize/test_serialize_vocab_strings.py
index f44426a1a..d3e82296e 100644
--- a/spacy/tests/serialize/test_serialize_vocab_strings.py
+++ b/spacy/tests/serialize/test_serialize_vocab_strings.py
@@ -8,6 +8,7 @@ from ..util import make_tempdir
test_strings = [([], []), (["rats", "are", "cute"], ["i", "like", "rats"])]
test_strings_attrs = [(["rats", "are", "cute"], "Hello")]
+default_strings = ("_SP", "POS=SPACE")
@pytest.mark.xfail
@@ -34,8 +35,8 @@ def test_serialize_vocab_roundtrip_bytes(strings1, strings2):
assert vocab1.to_bytes() == vocab1_b
new_vocab1 = Vocab().from_bytes(vocab1_b)
assert new_vocab1.to_bytes() == vocab1_b
- assert len(new_vocab1.strings) == len(strings1) + 1 # adds _SP
- assert sorted([s for s in new_vocab1.strings]) == sorted(strings1 + ["_SP"])
+ assert len(new_vocab1.strings) == len(strings1) + 2 # adds _SP and POS=SPACE
+ assert sorted([s for s in new_vocab1.strings]) == sorted(strings1 + list(default_strings))
@pytest.mark.parametrize("strings1,strings2", test_strings)
@@ -50,15 +51,15 @@ def test_serialize_vocab_roundtrip_disk(strings1, strings2):
vocab1_d = Vocab().from_disk(file_path1)
vocab2_d = Vocab().from_disk(file_path2)
# check strings rather than lexemes, which are only reloaded on demand
- assert strings1 == [s for s in vocab1_d.strings if s != "_SP"]
- assert strings2 == [s for s in vocab2_d.strings if s != "_SP"]
+ assert strings1 == [s for s in vocab1_d.strings if s not in default_strings]
+ assert strings2 == [s for s in vocab2_d.strings if s not in default_strings]
if strings1 == strings2:
- assert [s for s in vocab1_d.strings if s != "_SP"] == [
- s for s in vocab2_d.strings if s != "_SP"
+ assert [s for s in vocab1_d.strings if s not in default_strings] == [
+ s for s in vocab2_d.strings if s not in default_strings
]
else:
- assert [s for s in vocab1_d.strings if s != "_SP"] != [
- s for s in vocab2_d.strings if s != "_SP"
+ assert [s for s in vocab1_d.strings if s not in default_strings] != [
+ s for s in vocab2_d.strings if s not in default_strings
]
@@ -78,7 +79,7 @@ def test_deserialize_vocab_seen_entries(strings, lex_attr):
# Reported in #2153
vocab = Vocab(strings=strings)
vocab.from_bytes(vocab.to_bytes())
- assert len(vocab.strings) == len(strings) + 1 # adds _SP
+ assert len(vocab.strings) == len(strings) + 2 # adds _SP and POS=SPACE
@pytest.mark.parametrize("strings,lex_attr", test_strings_attrs)
From 581bda9f985eba04e01c69c2c2f0a978ae6e6684 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Thu, 21 May 2020 20:17:14 +0200
Subject: [PATCH 155/496] Update senter test and auto-format
---
spacy/tests/pipeline/test_senter.py | 13 ++++++++++---
1 file changed, 10 insertions(+), 3 deletions(-)
diff --git a/spacy/tests/pipeline/test_senter.py b/spacy/tests/pipeline/test_senter.py
index 197fdca6e..041da2c9f 100644
--- a/spacy/tests/pipeline/test_senter.py
+++ b/spacy/tests/pipeline/test_senter.py
@@ -12,14 +12,21 @@ def test_label_types():
with pytest.raises(NotImplementedError):
nlp.get_pipe("senter").add_label("A")
+
SENT_STARTS = [0] * 14
SENT_STARTS[0] = 1
SENT_STARTS[5] = 1
SENT_STARTS[9] = 1
TRAIN_DATA = [
- ("I like green eggs. Eat blue ham. I like purple eggs.", {"sent_starts": SENT_STARTS}),
- ("She likes purple eggs. They hate ham. You like yellow eggs.", {"sent_starts": SENT_STARTS}),
+ (
+ "I like green eggs. Eat blue ham. I like purple eggs.",
+ {"sent_starts": SENT_STARTS},
+ ),
+ (
+ "She likes purple eggs. They hate ham. You like yellow eggs.",
+ {"sent_starts": SENT_STARTS},
+ ),
]
@@ -36,7 +43,7 @@ def test_overfitting_IO():
assert losses["senter"] < 0.001
# test the trained model
- test_text = "I like purple eggs. They eat ham. You like yellow eggs."
+ test_text = TRAIN_DATA[0][0]
doc = nlp(test_text)
gold_sent_starts = [0] * 14
gold_sent_starts[0] = 1
From df87c32a4068484471f5ce53b1f4eb7e4f9e4c43 Mon Sep 17 00:00:00 2001
From: Matthw Honnibal
Date: Thu, 21 May 2020 20:17:24 +0200
Subject: [PATCH 156/496] Pass smaller doc sample into model initialize
---
spacy/syntax/nn_parser.pyx | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index 94369a828..ed4697302 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -624,11 +624,12 @@ cdef class Parser:
sgd = self.create_optimizer()
doc_sample = []
gold_sample = []
- for example in islice(get_examples(), 1000):
+ for example in islice(get_examples(), 10):
parses = example.get_gold_parses(merge=False, vocab=self.vocab)
for doc, gold in parses:
- doc_sample.append(doc)
- gold_sample.append(gold)
+ if len(doc):
+ doc_sample.append(doc)
+ gold_sample.append(gold)
for name, component in pipeline:
if component is self:
break
From d507ac28d8db197e8eac6b8c420ef3502af0a006 Mon Sep 17 00:00:00 2001
From: Matthw Honnibal
Date: Thu, 21 May 2020 20:46:10 +0200
Subject: [PATCH 157/496] Fix shape inference
---
spacy/ml/models/parser.py | 3 ++-
spacy/ml/models/tagger.py | 3 ++-
2 files changed, 4 insertions(+), 2 deletions(-)
diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py
index 0e0857ca8..bdcd709b1 100644
--- a/spacy/ml/models/parser.py
+++ b/spacy/ml/models/parser.py
@@ -15,9 +15,10 @@ def build_tb_parser_model(
use_upper=True,
nO=None,
):
+ t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
tok2vec = chain(
tok2vec,
- with_array(Linear(hidden_width)),
+ with_array(Linear(hidden_width, t2v_width)),
list2array(),
)
tok2vec.set_dim("nO", hidden_width)
diff --git a/spacy/ml/models/tagger.py b/spacy/ml/models/tagger.py
index 87256cb5c..00e268ede 100644
--- a/spacy/ml/models/tagger.py
+++ b/spacy/ml/models/tagger.py
@@ -7,7 +7,8 @@ from ...util import registry
@registry.architectures.register("spacy.Tagger.v1")
def build_tagger_model(tok2vec, nO=None) -> Model:
# TODO: glorot_uniform_init seems to work a bit better than zero_init here?!
- output_layer = Softmax(nO, init_W=zero_init)
+ t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
+ output_layer = Softmax(nO, t2v_width, init_W=zero_init)
softmax = with_array(output_layer)
model = chain(tok2vec, softmax)
model.set_ref("tok2vec", tok2vec)
From bc94fdabd0ec7362a68f38aa8cbb0b80f818f243 Mon Sep 17 00:00:00 2001
From: Matthw Honnibal
Date: Thu, 21 May 2020 20:46:21 +0200
Subject: [PATCH 158/496] Fix begin_training
---
spacy/pipeline/pipes.pyx | 12 ++++++++----
spacy/syntax/nn_parser.pyx | 18 +++++++++++++-----
2 files changed, 21 insertions(+), 9 deletions(-)
diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index 00c8894fd..f75ed1659 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -532,10 +532,14 @@ class Tagger(Pipe):
exc=vocab.morphology.exc)
self.set_output(len(self.labels))
doc_sample = [Doc(self.vocab, words=["hello", "world"])]
- for name, component in pipeline:
- if component is self:
- break
- doc_sample = list(component.pipe(doc_sample))
+ if pipeline is not None:
+ for name, component in pipeline:
+ if component is self:
+ break
+ if hasattr(component, "pipe"):
+ doc_sample = list(component.pipe(doc_sample))
+ else:
+ doc_sample = [component(doc) for doc in doc_sample]
self.model.initialize(X=doc_sample)
# Get batch of example docs, example outputs to call begin_training().
# This lets the model infer shapes.
diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index ed4697302..f8e819268 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -630,11 +630,19 @@ cdef class Parser:
if len(doc):
doc_sample.append(doc)
gold_sample.append(gold)
- for name, component in pipeline:
- if component is self:
- break
- doc_sample = list(component.pipe(doc_sample))
- self.model.initialize(doc_sample, gold_sample)
+
+ if pipeline is not None:
+ for name, component in pipeline:
+ if component is self:
+ break
+ if hasattr(component, "pipe"):
+ doc_sample = list(component.pipe(doc_sample))
+ else:
+ doc_sample = [component(doc) for doc in doc_sample]
+ if doc_sample:
+ self.model.initialize(doc_sample)
+ else:
+ self.model.initialize()
if pipeline is not None:
self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **self.cfg)
link_vectors_to_models(self.vocab)
From 25b51f4fc8a102fd1c83d62d078f071823f222eb Mon Sep 17 00:00:00 2001
From: Matthw Honnibal
Date: Thu, 21 May 2020 20:47:52 +0200
Subject: [PATCH 159/496] Set version to v3.0.0.dev9
---
spacy/about.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/spacy/about.py b/spacy/about.py
index 3af1b77a0..04a660ad1 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
# fmt: off
__title__ = "spacy"
-__version__ = "3.0.0.dev8"
+__version__ = "3.0.0.dev9"
__release__ = True
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
From 32c2bb3d99606d4516f9db3e6c3b8d00d5d99d2b Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Thu, 21 May 2020 20:45:33 +0200
Subject: [PATCH 160/496] Add course to landing [ci skip]
---
website/src/styles/landing.module.sass | 1 +
website/src/widgets/landing.js | 47 ++++++++++++++------------
2 files changed, 26 insertions(+), 22 deletions(-)
diff --git a/website/src/styles/landing.module.sass b/website/src/styles/landing.module.sass
index d7340229b..fab07ce9b 100644
--- a/website/src/styles/landing.module.sass
+++ b/website/src/styles/landing.module.sass
@@ -81,6 +81,7 @@
.banner-content-small
display: block
+ margin-bottom: 0 !important
.banner-title
display: block
diff --git a/website/src/widgets/landing.js b/website/src/widgets/landing.js
index 2dc5d40dc..77d32a6ad 100644
--- a/website/src/widgets/landing.js
+++ b/website/src/widgets/landing.js
@@ -9,7 +9,6 @@ import {
LandingGrid,
LandingCard,
LandingCol,
- LandingButton,
LandingDemo,
LandingBannerGrid,
LandingBanner,
@@ -19,7 +18,8 @@ import { H2 } from '../components/typography'
import { Ul, Li } from '../components/list'
import Button from '../components/button'
import Link from '../components/link'
-import irlBackground from '../images/spacy-irl.jpg'
+
+import courseImage from '../../docs/images/course.jpg'
import BenchmarksChoi from 'usage/_benchmarks-choi.md'
@@ -154,13 +154,35 @@ const Landing = ({ data }) => {
+
+
+
+
+
+
+ In this free and interactive online course you’ll learn how to
+ use spaCy to build advanced natural language understanding systems, using both
+ rule-based and machine learning approaches. It includes{' '}
+ 55 exercises featuring videos, slide decks, multiple-choice
+ questions and interactive coding practice in the browser.
+
+
Prodigy is an annotation tool so efficient that data scientists
@@ -171,25 +193,6 @@ const Landing = ({ data }) => {
update your model in real-time and chain models together to build more complex
systems.
-
-
- We were pleased to invite the spaCy community and other folks working on Natural
- Language Processing to Berlin this summer for a small and intimate event{' '}
- July 6, 2019 . We booked a beautiful venue, hand-picked an
- awesome lineup of speakers and scheduled plenty of social time to get to know
- each other and exchange ideas. The YouTube playlist includes 12 talks about NLP
- research, development and applications, with keynotes by Sebastian Ruder
- (DeepMind) and Yoav Goldberg (Allen AI).
-
From 5753b43e60a50e411fb1c92540dbb137e74f333f Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Mon, 20 Apr 2020 20:33:13 +0200
Subject: [PATCH 161/496] Tidy up and fix alignment of landing cards (#5317)
---
website/src/components/landing.js | 13 ++++++--
website/src/styles/landing.module.sass | 5 +++
website/src/widgets/landing.js | 44 +++++++++++---------------
3 files changed, 34 insertions(+), 28 deletions(-)
diff --git a/website/src/components/landing.js b/website/src/components/landing.js
index 16c342e3f..fb03d2845 100644
--- a/website/src/components/landing.js
+++ b/website/src/components/landing.js
@@ -46,10 +46,17 @@ export const LandingGrid = ({ cols = 3, blocks = false, children }) => (
export const LandingCol = ({ children }) => {children}
-export const LandingCard = ({ title, children }) => (
+export const LandingCard = ({ title, button, url, children }) => (
- {title &&
{title} }
- {children}
+
+ {title && {title} }
+ {children}
+
+ {button && url && (
+
+ )}
)
diff --git a/website/src/styles/landing.module.sass b/website/src/styles/landing.module.sass
index fab07ce9b..c29c0fffb 100644
--- a/website/src/styles/landing.module.sass
+++ b/website/src/styles/landing.module.sass
@@ -49,12 +49,17 @@
margin-bottom: -25rem
.card
+ display: flex
+ flex-direction: column
padding: 3rem 2.5rem
background: var(--color-back)
border-radius: var(--border-radius)
box-shadow: var(--box-shadow)
margin-bottom: 3rem
+.card-text
+ flex: 100%
+
.button
width: 100%
diff --git a/website/src/widgets/landing.js b/website/src/widgets/landing.js
index 77d32a6ad..c96905733 100644
--- a/website/src/widgets/landing.js
+++ b/website/src/widgets/landing.js
@@ -79,34 +79,28 @@ const Landing = ({ data }) => {
in Python
-
-
- spaCy is designed to help you do real work — to build real products, or
- gather real insights. The library respects your time, and tries to avoid
- wasting it. It's easy to install, and its API is simple and productive. We
- like to think of spaCy as the Ruby on Rails of Natural Language Processing.
-
- Get started
+
+ spaCy is designed to help you do real work — to build real products, or gather
+ real insights. The library respects your time, and tries to avoid wasting it.
+ It's easy to install, and its API is simple and productive. We like to think of
+ spaCy as the Ruby on Rails of Natural Language Processing.
-
-
- spaCy excels at large-scale information extraction tasks. It's written from
- the ground up in carefully memory-managed Cython. Independent research in
- 2015 found spaCy to be the fastest in the world. If your application needs
- to process entire web dumps, spaCy is the library you want to be using.
-
- Facts & Figures
+
+ spaCy excels at large-scale information extraction tasks. It's written from the
+ ground up in carefully memory-managed Cython. Independent research in 2015 found
+ spaCy to be the fastest in the world. If your application needs to process
+ entire web dumps, spaCy is the library you want to be using.
-
-
- spaCy is the best way to prepare text for deep learning. It interoperates
- seamlessly with TensorFlow, PyTorch, scikit-learn, Gensim and the rest of
- Python's awesome AI ecosystem. With spaCy, you can easily construct
- linguistically sophisticated statistical models for a variety of NLP
- problems.
-
- Read more
+
+ spaCy is the best way to prepare text for deep learning. It interoperates
+ seamlessly with TensorFlow, PyTorch, scikit-learn, Gensim and the rest of
+ Python's awesome AI ecosystem. With spaCy, you can easily construct
+ linguistically sophisticated statistical models for a variety of NLP problems.
From 6e6db6afb62a0377bcd5f0c64220ad05f512c073 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Fri, 22 May 2020 15:42:46 +0200
Subject: [PATCH 162/496] Better model compatibility and validation
---
requirements.txt | 1 +
setup.cfg | 1 +
spacy/cli/info.py | 4 ++-
spacy/cli/package.py | 5 ++--
spacy/cli/train.py | 2 +-
spacy/cli/validate.py | 39 +++++++++++++++++----------
spacy/language.py | 2 +-
spacy/util.py | 61 +++++++++++++++++++++++++++++++++++++++++++
8 files changed, 96 insertions(+), 19 deletions(-)
diff --git a/requirements.txt b/requirements.txt
index e5f1ae10b..c43ffa7bb 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -13,6 +13,7 @@ numpy>=1.15.0
requests>=2.13.0,<3.0.0
plac>=0.9.6,<1.2.0
tqdm>=4.38.0,<5.0.0
+importlib_metadata>=0.20; python_version < "3.8"
# Optional dependencies
jsonschema>=2.6.0,<3.1.0
pydantic>=1.3.0,<2.0.0
diff --git a/setup.cfg b/setup.cfg
index 1cd088279..eb7608c4e 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -56,6 +56,7 @@ install_requires =
requests>=2.13.0,<3.0.0
pydantic>=1.3.0,<2.0.0
tqdm>=4.38.0,<5.0.0
+ importlib_metadata>=0.20; python_version < "3.8"
[options.extras_require]
lookups =
diff --git a/spacy/cli/info.py b/spacy/cli/info.py
index 23f766368..d779eb2b3 100644
--- a/spacy/cli/info.py
+++ b/spacy/cli/info.py
@@ -48,7 +48,9 @@ def info(
"Location": str(Path(__file__).parent.parent),
"Platform": platform.platform(),
"Python version": platform.python_version(),
- "Models": ", ".join(model["name"] for model in all_models.values()),
+ "Models": ", ".join(
+ f"{m['name']} ({m['version']})" for m in all_models.values()
+ ),
}
if not silent:
title = "Info about spaCy"
diff --git a/spacy/cli/package.py b/spacy/cli/package.py
index 8e27e44d0..cf93c872f 100644
--- a/spacy/cli/package.py
+++ b/spacy/cli/package.py
@@ -83,14 +83,14 @@ def generate_meta(model_path, existing_meta, msg):
("lang", "Model language", meta.get("lang", "en")),
("name", "Model name", meta.get("name", "model")),
("version", "Model version", meta.get("version", "0.0.0")),
- ("spacy_version", "Required spaCy version", f">={about.__version__},<3.0.0"),
("description", "Model description", meta.get("description", False)),
("author", "Author", meta.get("author", False)),
("email", "Author email", meta.get("email", False)),
("url", "Author website", meta.get("url", False)),
- ("license", "License", meta.get("license", "CC BY-SA 3.0")),
+ ("license", "License", meta.get("license", "MIT")),
]
nlp = util.load_model_from_path(Path(model_path))
+ meta["spacy_version"] = about.__version__
meta["pipeline"] = nlp.pipe_names
meta["vectors"] = {
"width": nlp.vocab.vectors_length,
@@ -168,6 +168,7 @@ def setup_package():
package_data={model_name: list_files(model_dir)},
install_requires=list_requirements(meta),
zip_safe=False,
+ entry_points={'spacy_models': ['{m} = {m}'.format(m=model_name)]}
)
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 19e0a81e0..c205fa5b2 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -467,7 +467,7 @@ def train(
# Update model meta.json
meta["lang"] = nlp.lang
meta["pipeline"] = nlp.pipe_names
- meta["spacy_version"] = f">={about.__version__}"
+ meta["spacy_version"] = about.__version__
if beam_width == 1:
meta["speed"] = {
"nwords": nwords,
diff --git a/spacy/cli/validate.py b/spacy/cli/validate.py
index a23ce3453..c39cadc7b 100644
--- a/spacy/cli/validate.py
+++ b/spacy/cli/validate.py
@@ -4,6 +4,8 @@ import requests
from wasabi import msg
from .. import about
+from ..util import get_package_version, get_installed_models, split_version
+from ..util import get_package_path, get_model_meta, is_compatible_model
def validate():
@@ -25,7 +27,7 @@ def validate():
msg.info(f"spaCy installation: {spacy_dir}")
if model_pkgs:
- header = ("NAME", "VERSION", "")
+ header = ("NAME", "SPACY", "VERSION", "")
rows = []
for name, data in model_pkgs.items():
if data["compat"]:
@@ -34,7 +36,7 @@ def validate():
else:
version = msg.text(data["version"], color="red", no_print=True)
comp = f"--> {compat.get(data['name'], ['n/a'])[0]}"
- rows.append((data["name"], version, comp))
+ rows.append((data["name"], data["spacy"], version, comp))
msg.table(rows, header=header)
else:
msg.text("No models found in your current environment.", exits=0)
@@ -44,8 +46,9 @@ def validate():
cmd = "python -m spacy download {}"
print("\n".join([cmd.format(pkg) for pkg in update_models]) + "\n")
if na_models:
- msg.warn(
- f"The following models are not available for spaCy v{about.__version__}:",
+ msg.info(
+ f"The following models are custom spaCy models or not "
+ f"available for spaCy v{about.__version__}:",
", ".join(na_models),
)
if incompat_models:
@@ -53,8 +56,6 @@ def validate():
def get_model_pkgs():
- import pkg_resources
-
with msg.loading("Loading compatibility table..."):
r = requests.get(about.__compatibility__)
if r.status_code != 200:
@@ -66,20 +67,30 @@ def get_model_pkgs():
msg.good("Loaded compatibility table")
compat = r.json()["spacy"]
all_models = set()
+ installed_models = get_installed_models()
for spacy_v, models in dict(compat).items():
all_models.update(models.keys())
for model, model_vs in models.items():
compat[spacy_v][model] = [reformat_version(v) for v in model_vs]
pkgs = {}
- for pkg_name, pkg_data in pkg_resources.working_set.by_key.items():
+ for pkg_name in installed_models:
package = pkg_name.replace("-", "_")
- if package in all_models:
- version = pkg_data.version
- pkgs[pkg_name] = {
- "name": package,
- "version": version,
- "compat": package in compat and version in compat[package],
- }
+ version = get_package_version(pkg_name)
+ if package in compat:
+ is_compat = version in compat[package]
+ v_maj, v_min = split_version(about.__version__)
+ spacy_version = f"{v_maj}.{v_min}"
+ else:
+ model_path = get_package_path(package)
+ model_meta = get_model_meta(model_path)
+ is_compat = is_compatible_model(model_meta)
+ spacy_version = model_meta.get("spacy_version", "n/a")
+ pkgs[pkg_name] = {
+ "name": package,
+ "version": version,
+ "spacy": spacy_version,
+ "compat": is_compat,
+ }
return pkgs, compat
diff --git a/spacy/language.py b/spacy/language.py
index d71c27406..f770cda2c 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -196,7 +196,7 @@ class Language(object):
self._meta.setdefault("lang", self.lang)
self._meta.setdefault("name", "model")
self._meta.setdefault("version", "0.0.0")
- self._meta.setdefault("spacy_version", f">={about.__version__}")
+ self._meta.setdefault("spacy_version", about.__version__)
self._meta.setdefault("description", "")
self._meta.setdefault("author", "")
self._meta.setdefault("email", "")
diff --git a/spacy/util.py b/spacy/util.py
index 7f35c2f7c..5a7c633fa 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -21,9 +21,16 @@ try:
except ImportError:
cupy = None
+try: # Python 3.8
+ import importlib.metadata as importlib_metadata
+except ImportError:
+ import importlib_metadata
+
from .symbols import ORTH
from .compat import cupy, CudaStream
from .errors import Errors, Warnings
+from . import about
+
_PRINT_ENV = False
@@ -35,6 +42,10 @@ class registry(thinc.registry):
factories = catalogue.create("spacy", "factories", entry_points=True)
displacy_colors = catalogue.create("spacy", "displacy_colors", entry_points=True)
assets = catalogue.create("spacy", "assets", entry_points=True)
+ # This is mostly used to get a list of all installed models in the current
+ # environment. spaCy models packaged with `spacy package` will "advertise"
+ # themselves via entry points.
+ models = catalogue.create("spacy", "models", entry_points=True)
def set_env_log(value):
@@ -204,6 +215,56 @@ def load_model_from_init_py(init_file, **overrides):
return load_model_from_path(data_path, meta, **overrides)
+def get_installed_models():
+ """List all model packages currently installed in the environment.
+
+ RETURNS (list): The string names of the models.
+ """
+ return list(registry.models.get_all().keys())
+
+
+def get_package_version(name):
+ """Get the version of an installed package. Typically used to get model
+ package versions.
+
+ name (unicode): The name of the installed Python package.
+ RETURNS (unicode / None): The version or None if package not installed.
+ """
+ try:
+ return importlib_metadata.version(name)
+ except importlib_metadata.PackageNotFoundError:
+ return None
+
+
+def split_version(version):
+ """RETURNS (tuple): Two integers, the major and minor spaCy version."""
+ pieces = version.split(".", 3)
+ return int(pieces[0]), int(pieces[1])
+
+
+def is_compatible_model(meta):
+ """Check if a model is compatible with the current version of spaCy, based
+    on its meta.json. We compare the version of spaCy the model was created
+    with to the current version. If the minor version is different, it's considered
+ incompatible.
+
+ meta (dict): The model's meta.
+ RETURNS (bool / None): Whether the model is compatible with the current
+ spaCy or None if we don't have enough info.
+ """
+ cur_v = about.__version__
+ pkg_v = meta.get("spacy_version")
+ if not pkg_v or not isinstance(pkg_v, str):
+ return None
+ # Handle spacy_version values like >=x,
Date: Fri, 22 May 2020 15:55:45 +0200
Subject: [PATCH 163/496] Guess set_annotations=True in nlp.update
During `nlp.update`, components can be passed a boolean set_annotations
flag to indicate whether they should assign annotations to the `Doc`.
This flag needs to be set if downstream components expect to use the
annotations during training, e.g. if we want to use tagger features in
the parser.
Components can specify their assignments and requirements, so we can
figure out which components have these inter-dependencies. After
figuring this out, we can guess whether to pass set_annotations=True.
We could also call set_annotations=True always, or even just have this
as the only behaviour. The downside of this is that it would require the
`Doc` objects to be created afresh to avoid problematic modifications.
One approach would be to make a fresh copy of the `Doc` objects within
`nlp.update()`, so that we can write to the objects without any
problems. If we do that, we can drop this logic and also drop the
`set_annotations` mechanism. I would be fine with that approach,
although it runs the risk of introducing some performance overhead, and
we'd have to take care to copy all extension attributes, etc.
---
spacy/language.py | 24 +++++++++++++++++++++--
spacy/tests/pipeline/test_pipe_methods.py | 18 ++++++++++++++++-
2 files changed, 39 insertions(+), 3 deletions(-)
diff --git a/spacy/language.py b/spacy/language.py
index d71c27406..afc988583 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -545,13 +545,14 @@ class Language(object):
if component_cfg is None:
component_cfg = {}
+ component_deps = _count_pipeline_inter_dependencies(self.pipeline)
# Determine whether component should set annotations. In theory I guess
# we should do this by inspecting the meta? Or we could just always
# say "yes"
- for name, proc in self.pipeline:
+ for i, (name, proc) in enumerate(self.pipeline):
component_cfg.setdefault(name, {})
component_cfg[name].setdefault("drop", drop)
- component_cfg[name].setdefault("set_annotations", False)
+ component_cfg[name]["set_annotations"] = bool(component_deps[i])
for name, proc in self.pipeline:
if not hasattr(proc, "update"):
continue
@@ -1159,6 +1160,25 @@ class DisabledPipes(list):
self[:] = []
+def _count_pipeline_inter_dependencies(pipeline):
+ """Count how many subsequent components require an annotation set by each
+ component in the pipeline.
+ """
+ pipe_assigns = []
+ pipe_requires = []
+ for name, pipe in pipeline:
+ pipe_assigns.append(set(getattr(pipe, "assigns", [])))
+ pipe_requires.append(set(getattr(pipe, "requires", [])))
+ counts = []
+ for i, assigns in enumerate(pipe_assigns):
+ count = 0
+ for requires in pipe_requires[i+1:]:
+ if assigns.intersection(requires):
+ count += 1
+ counts.append(count)
+ return counts
+
+
def _pipe(examples, proc, kwargs):
# We added some args for pipe that __call__ doesn't expect.
kwargs = dict(kwargs)
diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py
index d42216655..0397d490d 100644
--- a/spacy/tests/pipeline/test_pipe_methods.py
+++ b/spacy/tests/pipeline/test_pipe_methods.py
@@ -1,5 +1,5 @@
import pytest
-from spacy.language import Language
+from spacy.language import Language, _count_pipeline_inter_dependencies
@pytest.fixture
@@ -198,3 +198,19 @@ def test_pipe_labels(nlp):
assert len(nlp.pipe_labels) == len(input_labels)
for name, labels in nlp.pipe_labels.items():
assert sorted(input_labels[name]) == sorted(labels)
+
+
+def test_pipe_inter_dependencies():
+ class Fancifier:
+ name = "fancifier"
+ assigns = ("doc._.fancy",)
+ requires = tuple()
+
+ class FancyNeeder:
+ name = "needer"
+ assigns = tuple()
+ requires = ("doc._.fancy",)
+
+ pipeline = [("fancifier", Fancifier()), ("needer", FancyNeeder())]
+ counts = _count_pipeline_inter_dependencies(pipeline)
+ assert counts == [1, 0]
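For orientation, a small standalone sketch of how the counts from the test above feed the set_annotations guess in nlp.update; the dict comprehension mirrors the new logic, and the component names are taken from the test:

component_deps = [1, 0]                 # counts for [fancifier, needer]
names = ["fancifier", "needer"]
set_annotations = {
    name: bool(component_deps[i]) for i, name in enumerate(names)
}
assert set_annotations == {"fancifier": True, "needer": False}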
From f7f6df7275ea2884fc47fa7823c6bcba1caa5cb4 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Fri, 22 May 2020 16:43:18 +0200
Subject: [PATCH 164/496] Move to spacy.analysis
---
spacy/analysis.py | 21 +++++++++++++++++++++
spacy/language.py | 22 ++--------------------
spacy/tests/pipeline/test_analysis.py | 17 +++++++++++++++++
spacy/tests/pipeline/test_pipe_methods.py | 18 +-----------------
4 files changed, 41 insertions(+), 37 deletions(-)
diff --git a/spacy/analysis.py b/spacy/analysis.py
index c2600048f..41591661c 100644
--- a/spacy/analysis.py
+++ b/spacy/analysis.py
@@ -173,3 +173,24 @@ def print_summary(nlp, pretty=True, no_print=False):
msg.good("No problems found.")
if no_print:
return {"overview": overview, "problems": problems}
+
+
+def count_pipeline_interdependencies(pipeline):
+ """Count how many subsequent components require an annotation set by each
+ component in the pipeline.
+ """
+ pipe_assigns = []
+ pipe_requires = []
+ for name, pipe in pipeline:
+ pipe_assigns.append(set(getattr(pipe, "assigns", [])))
+ pipe_requires.append(set(getattr(pipe, "requires", [])))
+ counts = []
+ for i, assigns in enumerate(pipe_assigns):
+ count = 0
+ for requires in pipe_requires[i+1:]:
+ if assigns.intersection(requires):
+ count += 1
+ counts.append(count)
+ return counts
+
+
diff --git a/spacy/language.py b/spacy/language.py
index afc988583..b228c2155 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -18,6 +18,7 @@ from .vocab import Vocab
from .lemmatizer import Lemmatizer
from .lookups import Lookups
from .analysis import analyze_pipes, analyze_all_pipes, validate_attrs
+from .analysis import count_pipeline_interdependencies
from .gold import Example
from .scorer import Scorer
from .util import link_vectors_to_models, create_default_optimizer, registry
@@ -545,7 +546,7 @@ class Language(object):
if component_cfg is None:
component_cfg = {}
- component_deps = _count_pipeline_inter_dependencies(self.pipeline)
+ component_deps = count_pipeline_interdependencies(self.pipeline)
# Determine whether component should set annotations. In theory I guess
# we should do this by inspecting the meta? Or we could just always
# say "yes"
@@ -1160,25 +1161,6 @@ class DisabledPipes(list):
self[:] = []
-def _count_pipeline_inter_dependencies(pipeline):
- """Count how many subsequent components require an annotation set by each
- component in the pipeline.
- """
- pipe_assigns = []
- pipe_requires = []
- for name, pipe in pipeline:
- pipe_assigns.append(set(getattr(pipe, "assigns", [])))
- pipe_requires.append(set(getattr(pipe, "requires", [])))
- counts = []
- for i, assigns in enumerate(pipe_assigns):
- count = 0
- for requires in pipe_requires[i+1:]:
- if assigns.intersection(requires):
- count += 1
- counts.append(count)
- return counts
-
-
def _pipe(examples, proc, kwargs):
# We added some args for pipe that __call__ doesn't expect.
kwargs = dict(kwargs)
diff --git a/spacy/tests/pipeline/test_analysis.py b/spacy/tests/pipeline/test_analysis.py
index cda39f6ee..e608f2c34 100644
--- a/spacy/tests/pipeline/test_analysis.py
+++ b/spacy/tests/pipeline/test_analysis.py
@@ -2,6 +2,7 @@ import spacy.language
from spacy.language import Language, component
from spacy.analysis import print_summary, validate_attrs
from spacy.analysis import get_assigns_for_attr, get_requires_for_attr
+from spacy.analysis import count_pipeline_interdependencies
from mock import Mock, ANY
import pytest
@@ -161,3 +162,19 @@ def test_analysis_validate_attrs_remove_pipe():
with pytest.warns(None) as record:
nlp.remove_pipe("c2")
assert not record.list
+
+
+def test_pipe_interdependencies():
+ class Fancifier:
+ name = "fancifier"
+ assigns = ("doc._.fancy",)
+ requires = tuple()
+
+ class FancyNeeder:
+ name = "needer"
+ assigns = tuple()
+ requires = ("doc._.fancy",)
+
+ pipeline = [("fancifier", Fancifier()), ("needer", FancyNeeder())]
+ counts = count_pipeline_interdependencies(pipeline)
+ assert counts == [1, 0]
diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py
index 0397d490d..d42216655 100644
--- a/spacy/tests/pipeline/test_pipe_methods.py
+++ b/spacy/tests/pipeline/test_pipe_methods.py
@@ -1,5 +1,5 @@
import pytest
-from spacy.language import Language, _count_pipeline_inter_dependencies
+from spacy.language import Language
@pytest.fixture
@@ -198,19 +198,3 @@ def test_pipe_labels(nlp):
assert len(nlp.pipe_labels) == len(input_labels)
for name, labels in nlp.pipe_labels.items():
assert sorted(input_labels[name]) == sorted(labels)
-
-
-def test_pipe_inter_dependencies():
- class Fancifier:
- name = "fancifier"
- assigns = ("doc._.fancy",)
- requires = tuple()
-
- class FancyNeeder:
- name = "needer"
- assigns = tuple()
- requires = ("doc._.fancy",)
-
- pipeline = [("fancifier", Fancifier()), ("needer", FancyNeeder())]
- counts = _count_pipeline_inter_dependencies(pipeline)
- assert counts == [1, 0]
From 12b7be1d9874048c1f3f20dffb833a88308544c4 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Fri, 22 May 2020 16:49:26 +0200
Subject: [PATCH 165/496] Remove jsonschema from dependencies
---
Makefile | 4 ++--
requirements.txt | 2 --
spacy/tests/package/test_requirements.py | 1 -
3 files changed, 2 insertions(+), 5 deletions(-)
diff --git a/Makefile b/Makefile
index cf96d6294..9916e3cf5 100644
--- a/Makefile
+++ b/Makefile
@@ -5,7 +5,7 @@ VENV := ./env$(PYVER)
version := $(shell "bin/get-version.sh")
dist/spacy-$(version).pex : wheelhouse/spacy-$(version).stamp
- $(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -m spacy -o $@ spacy==$(version) jsonschema spacy_lookups_data
+ $(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -m spacy -o $@ spacy==$(version) spacy_lookups_data
chmod a+rx $@
dist/pytest.pex : wheelhouse/pytest-*.whl
@@ -14,7 +14,7 @@ dist/pytest.pex : wheelhouse/pytest-*.whl
wheelhouse/spacy-$(version).stamp : $(VENV)/bin/pex setup.py spacy/*.py* spacy/*/*.py*
$(VENV)/bin/pip wheel . -w ./wheelhouse
- $(VENV)/bin/pip wheel jsonschema spacy_lookups_data -w ./wheelhouse
+ $(VENV)/bin/pip wheel spacy_lookups_data -w ./wheelhouse
touch $@
wheelhouse/pytest-%.whl : $(VENV)/bin/pex
diff --git a/requirements.txt b/requirements.txt
index c43ffa7bb..add083a05 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -14,8 +14,6 @@ requests>=2.13.0,<3.0.0
plac>=0.9.6,<1.2.0
tqdm>=4.38.0,<5.0.0
importlib_metadata>=0.20; python_version < "3.8"
-# Optional dependencies
-jsonschema>=2.6.0,<3.1.0
pydantic>=1.3.0,<2.0.0
# Development dependencies
cython>=0.25
diff --git a/spacy/tests/package/test_requirements.py b/spacy/tests/package/test_requirements.py
index 59a8569ee..0dc0f9d6c 100644
--- a/spacy/tests/package/test_requirements.py
+++ b/spacy/tests/package/test_requirements.py
@@ -9,7 +9,6 @@ def test_build_dependencies():
"pytest-timeout",
"mock",
"flake8",
- "jsonschema",
]
libs_ignore_setup = ["fugashi", "natto-py", "pythainlp"]
From d844528c5f62f27904d6925f16cc7d1ee3e16949 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Fri, 22 May 2020 16:55:15 +0200
Subject: [PATCH 166/496] Add test for is_compatible_model
---
spacy/tests/test_misc.py | 9 +++++++++
1 file changed, 9 insertions(+)
diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py
index c320b19c0..0a0f4c7be 100644
--- a/spacy/tests/test_misc.py
+++ b/spacy/tests/test_misc.py
@@ -2,6 +2,7 @@ import pytest
import os
import ctypes
from pathlib import Path
+from spacy.about import __version__ as spacy_version
from spacy import util
from spacy import prefer_gpu, require_gpu
from spacy.ml._precomputable_affine import PrecomputableAffine, _backprop_precomputable_affine_padding
@@ -87,3 +88,11 @@ def test_ascii_filenames():
root = Path(__file__).parent.parent
for path in root.glob("**/*"):
assert all(ord(c) < 128 for c in path.name), path.name
+
+
+@pytest.mark.parametrize(
+ "version,compatible",
+ [(spacy_version, True), ("2.0.0", False), (">=1.2.3,<4.5.6", False)],
+)
+def test_is_compatible_model(version, compatible):
+ assert util.is_compatible_model({"spacy_version": version}) is compatible
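Since the util.is_compatible_model implementation is cut off earlier in this series, here is a hedged sketch of a minimal check consistent with its docstring and with these test cases. It is an assumption, not the shipped code, and sketch_is_compatible is a hypothetical name:

from spacy.about import __version__ as spacy_version

def sketch_is_compatible(meta):
    # Assumption: specifier strings like ">=1.2.3,<4.5.6" are treated as incompatible
    pkg_v = meta.get("spacy_version")
    if not pkg_v or not isinstance(pkg_v, str):
        return None
    if not pkg_v[0].isdigit():
        return False
    cur_major, cur_minor = spacy_version.split(".")[:2]
    pkg_major, pkg_minor = pkg_v.split(".")[:2]
    return (cur_major, cur_minor) == (pkg_major, pkg_minor)

assert sketch_is_compatible({"spacy_version": spacy_version}) is True
assert sketch_is_compatible({"spacy_version": "2.0.0"}) is False      # assumes current spaCy is not 2.0.x
assert sketch_is_compatible({"spacy_version": ">=1.2.3,<4.5.6"}) is False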
From 569a65b60e4205311817d1a8add57fa16b407de7 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Fri, 22 May 2020 16:55:42 +0200
Subject: [PATCH 167/496] Auto-format
---
spacy/tests/test_misc.py | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py
index 0a0f4c7be..ddf1bb332 100644
--- a/spacy/tests/test_misc.py
+++ b/spacy/tests/test_misc.py
@@ -5,7 +5,8 @@ from pathlib import Path
from spacy.about import __version__ as spacy_version
from spacy import util
from spacy import prefer_gpu, require_gpu
-from spacy.ml._precomputable_affine import PrecomputableAffine, _backprop_precomputable_affine_padding
+from spacy.ml._precomputable_affine import PrecomputableAffine
+from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding
@pytest.fixture
From 4465cad6c5bc188f628dc92183e2e855e26bcfc4 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Fri, 22 May 2020 17:42:06 +0200
Subject: [PATCH 168/496] Rename spacy.analysis to spacy.pipe_analysis
---
spacy/language.py | 23 ++++++++++++++++++-----
spacy/{analysis.py => pipe_analysis.py} | 4 +---
spacy/tests/pipeline/test_analysis.py | 8 ++++----
3 files changed, 23 insertions(+), 12 deletions(-)
rename spacy/{analysis.py => pipe_analysis.py} (99%)
diff --git a/spacy/language.py b/spacy/language.py
index 8c44cf26b..5286bd3b9 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -17,8 +17,8 @@ from .tokens.underscore import Underscore
from .vocab import Vocab
from .lemmatizer import Lemmatizer
from .lookups import Lookups
-from .analysis import analyze_pipes, analyze_all_pipes, validate_attrs
-from .analysis import count_pipeline_interdependencies
+from .pipe_analysis import analyze_pipes, analyze_all_pipes, validate_attrs
+from .pipe_analysis import count_pipeline_interdependencies
from .gold import Example
from .scorer import Scorer
from .util import link_vectors_to_models, create_default_optimizer, registry
@@ -318,14 +318,18 @@ class Language(object):
# check whether we have a proper model config, or load a default one
if "model" in factory_cfg and not isinstance(factory_cfg["model"], dict):
- warnings.warn(Warnings.W099.format(type=type(factory_cfg["model"]), pipe=name))
+ warnings.warn(
+ Warnings.W099.format(type=type(factory_cfg["model"]), pipe=name)
+ )
# refer to the model configuration in the cfg settings for this component
if "model" in factory_cfg:
self.config[name] = {"model": factory_cfg["model"]}
# create all objects in the config
- factory_cfg = registry.make_from_config({"config": factory_cfg}, validate=True)["config"]
+ factory_cfg = registry.make_from_config({"config": factory_cfg}, validate=True)[
+ "config"
+ ]
model = factory_cfg.get("model", None)
if model is not None:
del factory_cfg["model"]
@@ -519,7 +523,16 @@ class Language(object):
def make_doc(self, text):
return self.tokenizer(text)
- def update(self, examples, dummy=None, *, drop=0.0, sgd=None, losses=None, component_cfg=None):
+ def update(
+ self,
+ examples,
+ dummy=None,
+ *,
+ drop=0.0,
+ sgd=None,
+ losses=None,
+ component_cfg=None,
+ ):
"""Update the models in the pipeline.
examples (iterable): A batch of `Example` or `Doc` objects.
diff --git a/spacy/analysis.py b/spacy/pipe_analysis.py
similarity index 99%
rename from spacy/analysis.py
rename to spacy/pipe_analysis.py
index 41591661c..4c0950453 100644
--- a/spacy/analysis.py
+++ b/spacy/pipe_analysis.py
@@ -187,10 +187,8 @@ def count_pipeline_interdependencies(pipeline):
counts = []
for i, assigns in enumerate(pipe_assigns):
count = 0
- for requires in pipe_requires[i+1:]:
+ for requires in pipe_requires[i + 1 :]:
if assigns.intersection(requires):
count += 1
counts.append(count)
return counts
-
-
diff --git a/spacy/tests/pipeline/test_analysis.py b/spacy/tests/pipeline/test_analysis.py
index e608f2c34..b826438f5 100644
--- a/spacy/tests/pipeline/test_analysis.py
+++ b/spacy/tests/pipeline/test_analysis.py
@@ -1,8 +1,8 @@
import spacy.language
from spacy.language import Language, component
-from spacy.analysis import print_summary, validate_attrs
-from spacy.analysis import get_assigns_for_attr, get_requires_for_attr
-from spacy.analysis import count_pipeline_interdependencies
+from spacy.pipe_analysis import print_summary, validate_attrs
+from spacy.pipe_analysis import get_assigns_for_attr, get_requires_for_attr
+from spacy.pipe_analysis import count_pipeline_interdependencies
from mock import Mock, ANY
import pytest
@@ -169,7 +169,7 @@ def test_pipe_interdependencies():
name = "fancifier"
assigns = ("doc._.fancy",)
requires = tuple()
-
+
class FancyNeeder:
name = "needer"
assigns = tuple()
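A small usage sketch for the renamed helper, mirroring the test above and assuming count_pipeline_interdependencies reads the assigns/requires attributes from each (name, component) pair. It counts, for each component, how many later components require an attribute it assigns:

from spacy.pipe_analysis import count_pipeline_interdependencies

class Fancifier:
    name = "fancifier"
    assigns = ("doc._.fancy",)
    requires = tuple()

class FancyNeeder:
    name = "needer"
    assigns = tuple()
    requires = ("doc._.fancy",)

# The fancifier's output is required by one later component; the needer
# assigns nothing, so the expected counts are [1, 0].
pipeline = [("fancifier", Fancifier()), ("needer", FancyNeeder())]
print(count_pipeline_interdependencies(pipeline))  # [1, 0]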
From 2d9de8684df7d28477986eb497e13b403c03d9d9 Mon Sep 17 00:00:00 2001
From: Matthw Honnibal
Date: Fri, 22 May 2020 23:10:40 +0200
Subject: [PATCH 169/496] Support use_pytorch_for_gpu_memory config
---
spacy/cli/train_from_config.py | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/spacy/cli/train_from_config.py b/spacy/cli/train_from_config.py
index eeb21c10c..c0e3bd169 100644
--- a/spacy/cli/train_from_config.py
+++ b/spacy/cli/train_from_config.py
@@ -7,7 +7,7 @@ from pathlib import Path
from wasabi import msg
import thinc
import thinc.schedules
-from thinc.api import Model
+from thinc.api import Model, use_pytorch_for_gpu_memory
import random
from ..gold import GoldCorpus
@@ -171,6 +171,8 @@ def train_from_config(
msg.info(f"Loading config from: {config_path}")
config = util.load_config(config_path, create_objects=False)
util.fix_random_seed(config["training"]["seed"])
+ if config["training"]["use_pytorch_for_gpu_memory"]:
+ use_pytorch_for_gpu_memory()
nlp_config = config["nlp"]
config = util.load_config(config_path, create_objects=True)
msg.info("Creating nlp from config")
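For orientation, a minimal sketch of how the new flag is consumed at runtime; the config dict is an illustrative stand-in for the loaded training config, and the GPU guard is added here only so the snippet is safe on CPU-only machines:

from thinc.api import prefer_gpu, use_pytorch_for_gpu_memory

# Illustrative stand-in for the loaded training config.
config = {"training": {"use_pytorch_for_gpu_memory": True, "seed": 0}}
if prefer_gpu() and config["training"]["use_pytorch_for_gpu_memory"]:
    # Route cupy's CUDA allocations through PyTorch's memory pool so the
    # two libraries don't compete for GPU memory during training.
    use_pytorch_for_gpu_memory()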
From f9786d765edf16afa092cf378a0a45fb321efe22 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Sun, 24 May 2020 14:48:56 +0200
Subject: [PATCH 170/496] Simplify is_package check
---
spacy/cli/download.py | 18 ++----------------
spacy/util.py | 13 +++++--------
2 files changed, 7 insertions(+), 24 deletions(-)
diff --git a/spacy/cli/download.py b/spacy/cli/download.py
index 0230e272d..af132bbbe 100644
--- a/spacy/cli/download.py
+++ b/spacy/cli/download.py
@@ -5,6 +5,7 @@ import sys
from wasabi import msg
from .. import about
+from ..util import is_package
def download(
@@ -17,7 +18,7 @@ def download(
flag is set, the command expects the full model name with version.
For direct downloads, the compatibility check will be skipped.
"""
- if not require_package("spacy") and "--no-deps" not in pip_args:
+ if not is_package("spacy") and "--no-deps" not in pip_args:
msg.warn(
"Skipping model package dependencies and setting `--no-deps`. "
"You don't seem to have the spaCy package itself installed "
@@ -45,21 +46,6 @@ def download(
"Download and installation successful",
f"You can now load the model via spacy.load('{model_name}')",
)
- # If a model is downloaded and then loaded within the same process, our
- # is_package check currently fails, because pkg_resources.working_set
- # is not refreshed automatically (see #3923). We're trying to work
- # around this here by requiring the package explicitly.
- require_package(model_name)
-
-
-def require_package(name):
- try:
- import pkg_resources
-
- pkg_resources.working_set.require(name)
- return True
- except: # noqa: E722
- return False
def get_json(url, desc):
diff --git a/spacy/util.py b/spacy/util.py
index 5a7c633fa..41af881c9 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -341,14 +341,11 @@ def is_package(name):
name (unicode): Name of package.
RETURNS (bool): True if installed package, False if not.
"""
- import pkg_resources
-
- name = name.lower() # compare package name against lowercase name
- packages = pkg_resources.working_set.by_key.keys()
- for package in packages:
- if package.lower().replace("-", "_") == name:
- return True
- return False
+ try:
+ importlib_metadata.distribution(name)
+ return True
+ except: # noqa: E722
+ return False
def get_package_path(name):
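The same idea as the simplified util.is_package, sketched against the standard library's importlib.metadata (Python 3.8+) so it can run outside spaCy; PackageNotFoundError is the failure mode the broad except in the patch is mainly guarding against:

import importlib.metadata as importlib_metadata

def is_package(name):
    try:
        importlib_metadata.distribution(name)
        return True
    except importlib_metadata.PackageNotFoundError:
        return False

print(is_package("numpy"))                      # True if numpy is installed
print(is_package("sfkodskfosdkfpsdpofkspdof"))  # False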
From 387c7aba15228557cdbbfae0ee3ab90009769584 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Sun, 24 May 2020 14:55:16 +0200
Subject: [PATCH 171/496] Update test
---
spacy/tests/test_misc.py | 8 +++++---
1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py
index ddf1bb332..9e67ae83b 100644
--- a/spacy/tests/test_misc.py
+++ b/spacy/tests/test_misc.py
@@ -26,10 +26,12 @@ def test_util_ensure_path_succeeds(text):
assert isinstance(path, Path)
-@pytest.mark.parametrize("package", ["numpy"])
-def test_util_is_package(package):
+@pytest.mark.parametrize(
+ "package,result", [("numpy", True), ("sfkodskfosdkfpsdpofkspdof", False)]
+)
+def test_util_is_package(package, result):
"""Test that an installed package via pip is recognised by util.is_package."""
- assert util.is_package(package)
+ assert util.is_package(package) is result
@pytest.mark.parametrize("package", ["thinc"])
From 5d3806e059178c9516fb6cf57064cb10cfbf0f29 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Sun, 24 May 2020 17:20:58 +0200
Subject: [PATCH 172/496] unicode -> str consistency
---
spacy/cli/converters/conllu2json.py | 8 ++--
spacy/displacy/__init__.py | 8 ++--
spacy/displacy/render.py | 26 +++++------
spacy/errors.py | 2 +-
spacy/glossary.py | 4 +-
spacy/kb.pyx | 4 +-
spacy/language.py | 26 +++++------
spacy/lemmatizer.py | 10 ++---
spacy/lexeme.pyx | 20 ++++-----
spacy/lookups.py | 28 ++++++------
spacy/matcher/dependencymatcher.pyx | 2 +-
spacy/matcher/matcher.pyx | 6 +--
spacy/matcher/phrasematcher.pyx | 6 +--
spacy/morphology.pyx | 4 +-
spacy/pipe_analysis.py | 6 +--
spacy/pipeline/entityruler.py | 4 +-
spacy/pipeline/functions.py | 2 +-
spacy/strings.pyx | 6 +--
spacy/tokenizer.pyx | 16 +++----
spacy/tokens/doc.pyx | 14 +++---
spacy/tokens/span.pyx | 22 +++++-----
spacy/tokens/token.pyx | 44 +++++++++----------
spacy/util.py | 36 +++++++--------
spacy/vectors.pyx | 8 ++--
spacy/vocab.pyx | 6 +--
website/docs/api/lexeme.md | 16 +++----
website/docs/api/vocab.md | 30 ++++++-------
website/docs/usage/rule-based-matching.md | 53 +++++++++++++----------
28 files changed, 212 insertions(+), 205 deletions(-)
diff --git a/spacy/cli/converters/conllu2json.py b/spacy/cli/converters/conllu2json.py
index 0b2920802..1ece755b8 100644
--- a/spacy/cli/converters/conllu2json.py
+++ b/spacy/cli/converters/conllu2json.py
@@ -111,8 +111,8 @@ def get_entities(lines, tag_pattern, ner_map=None):
final entity type with `ner_map` if mapping present. Entity tag is 'O' if
the pattern is not matched.
- lines (unicode): CONLL-U lines for one sentences
- tag_pattern (unicode): Regex pattern for entity tag
+ lines (str): CoNLL-U lines for one sentence
+ tag_pattern (str): Regex pattern for entity tag
ner_map (dict): Map old NER tag names to new ones, '' maps to O.
RETURNS (list): List of BILUO entity tags
"""
@@ -187,8 +187,8 @@ def example_from_conllu_sentence(
"""Create an Example from the lines for one CoNLL-U sentence, merging
subtokens and appending morphology to tags if required.
- lines (unicode): The non-comment lines for a CoNLL-U sentence
- ner_tag_pattern (unicode): The regex pattern for matching NER in MISC col
+ lines (str): The non-comment lines for a CoNLL-U sentence
+ ner_tag_pattern (str): The regex pattern for matching NER in MISC col
RETURNS (Example): An example containing the annotation
"""
# create a Doc with each subtoken as its own token
diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py
index 3f84dabce..2c377a043 100644
--- a/spacy/displacy/__init__.py
+++ b/spacy/displacy/__init__.py
@@ -22,13 +22,13 @@ def render(
"""Render displaCy visualisation.
docs (list or Doc): Document(s) to visualise.
- style (unicode): Visualisation style, 'dep' or 'ent'.
+ style (str): Visualisation style, 'dep' or 'ent'.
page (bool): Render markup as full HTML page.
minify (bool): Minify HTML markup.
jupyter (bool): Override Jupyter auto-detection.
options (dict): Visualiser-specific options, e.g. colors.
manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
- RETURNS (unicode): Rendered HTML markup.
+ RETURNS (str): Rendered HTML markup.
DOCS: https://spacy.io/api/top-level#displacy.render
USAGE: https://spacy.io/usage/visualizers
@@ -73,13 +73,13 @@ def serve(
"""Serve displaCy visualisation.
docs (list or Doc): Document(s) to visualise.
- style (unicode): Visualisation style, 'dep' or 'ent'.
+ style (str): Visualisation style, 'dep' or 'ent'.
page (bool): Render markup as full HTML page.
minify (bool): Minify HTML markup.
options (dict): Visualiser-specific options, e.g. colors.
manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
port (int): Port to serve visualisation.
- host (unicode): Host to serve visualisation.
+ host (str): Host to serve visualisation.
DOCS: https://spacy.io/api/top-level#displacy.serve
USAGE: https://spacy.io/usage/visualizers
diff --git a/spacy/displacy/render.py b/spacy/displacy/render.py
index 0d4cdb77f..d3572ce78 100644
--- a/spacy/displacy/render.py
+++ b/spacy/displacy/render.py
@@ -47,7 +47,7 @@ class DependencyRenderer(object):
parsed (list): Dependency parses to render.
page (bool): Render parses wrapped as full HTML page.
minify (bool): Minify HTML markup.
- RETURNS (unicode): Rendered SVG or HTML markup.
+ RETURNS (str): Rendered SVG or HTML markup.
"""
# Create a random ID prefix to make sure parses don't receive the
# same ID, even if they're identical
@@ -78,7 +78,7 @@ class DependencyRenderer(object):
render_id (int): Unique ID, typically index of document.
words (list): Individual words and their tags.
arcs (list): Individual arcs and their start, end, direction and label.
- RETURNS (unicode): Rendered SVG markup.
+ RETURNS (str): Rendered SVG markup.
"""
self.levels = self.get_levels(arcs)
self.highest_level = len(self.levels)
@@ -112,10 +112,10 @@ class DependencyRenderer(object):
):
"""Render individual word.
- text (unicode): Word text.
- tag (unicode): Part-of-speech tag.
+ text (str): Word text.
+ tag (str): Part-of-speech tag.
i (int): Unique ID, typically word index.
- RETURNS (unicode): Rendered SVG markup.
+ RETURNS (str): Rendered SVG markup.
"""
y = self.offset_y + self.word_spacing
x = self.offset_x + i * self.distance
@@ -131,12 +131,12 @@ class DependencyRenderer(object):
def render_arrow(self, label, start, end, direction, i):
"""Render individual arrow.
- label (unicode): Dependency label.
+ label (str): Dependency label.
start (int): Index of start word.
end (int): Index of end word.
- direction (unicode): Arrow direction, 'left' or 'right'.
+ direction (str): Arrow direction, 'left' or 'right'.
i (int): Unique ID, typically arrow index.
- RETURNS (unicode): Rendered SVG markup.
+ RETURNS (str): Rendered SVG markup.
"""
if start < 0 or end < 0:
error_args = dict(start=start, end=end, label=label, dir=direction)
@@ -179,7 +179,7 @@ class DependencyRenderer(object):
y (int): Y-coordinate of arrow start and end point.
y_curve (int): Y-coordinate of Cubic Bézier y_curve point.
x_end (int): X-coordinate of arrow end point.
- RETURNS (unicode): Definition of the arc path ('d' attribute).
+ RETURNS (str): Definition of the arc path ('d' attribute).
"""
template = "M{x},{y} C{x},{c} {e},{c} {e},{y}"
if self.compact:
@@ -189,11 +189,11 @@ class DependencyRenderer(object):
def get_arrowhead(self, direction, x, y, end):
"""Render individual arrow head.
- direction (unicode): Arrow direction, 'left' or 'right'.
+ direction (str): Arrow direction, 'left' or 'right'.
x (int): X-coordinate of arrow start point.
y (int): Y-coordinate of arrow start and end point.
end (int): X-coordinate of arrow end point.
- RETURNS (unicode): Definition of the arrow head path ('d' attribute).
+ RETURNS (str): Definition of the arrow head path ('d' attribute).
"""
if direction == "left":
pos1, pos2, pos3 = (x, x - self.arrow_width + 2, x + self.arrow_width - 2)
@@ -279,7 +279,7 @@ class EntityRenderer(object):
parsed (list): Dependency parses to render.
page (bool): Render parses wrapped as full HTML page.
minify (bool): Minify HTML markup.
- RETURNS (unicode): Rendered HTML markup.
+ RETURNS (str): Rendered HTML markup.
"""
rendered = []
for i, p in enumerate(parsed):
@@ -300,7 +300,7 @@ class EntityRenderer(object):
def render_ents(self, text, spans, title):
"""Render entities in text.
- text (unicode): Original text.
+ text (str): Original text.
spans (list): Individual entity spans and their start, end and label.
title (unicode or None): Document title set in Doc.user_data['title'].
"""
diff --git a/spacy/errors.py b/spacy/errors.py
index 4d38ab586..932bb1eff 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -598,7 +598,7 @@ class MatchPatternError(ValueError):
def __init__(self, key, errors):
"""Custom error for validating match patterns.
- key (unicode): The name of the matcher rule.
+ key (str): The name of the matcher rule.
errors (dict): Validation errors (sequence of strings) mapped to pattern
ID, i.e. the index of the added pattern.
"""
diff --git a/spacy/glossary.py b/spacy/glossary.py
index 938a575cd..c4a6a5c45 100644
--- a/spacy/glossary.py
+++ b/spacy/glossary.py
@@ -1,8 +1,8 @@
def explain(term):
"""Get a description for a given POS tag, dependency label or entity type.
- term (unicode): The term to explain.
- RETURNS (unicode): The explanation, or `None` if not found in the glossary.
+ term (str): The term to explain.
+ RETURNS (str): The explanation, or `None` if not found in the glossary.
EXAMPLE:
>>> spacy.explain(u'NORP')
diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index 86a8d49b8..8d8464f3c 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -38,7 +38,7 @@ cdef class Candidate:
@property
def entity_(self):
- """RETURNS (unicode): ID/name of this entity in the KB"""
+ """RETURNS (str): ID/name of this entity in the KB"""
return self.kb.vocab.strings[self.entity_hash]
@property
@@ -48,7 +48,7 @@ cdef class Candidate:
@property
def alias_(self):
- """RETURNS (unicode): ID of the original alias"""
+ """RETURNS (str): ID of the original alias"""
return self.kb.vocab.strings[self.alias_hash]
@property
diff --git a/spacy/language.py b/spacy/language.py
index 5286bd3b9..e3b770723 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -122,7 +122,7 @@ class Language(object):
Defaults (class): Settings, data and factory methods for creating the `nlp`
object and processing pipeline.
- lang (unicode): Two-letter language ID, i.e. ISO code.
+ lang (str): Two-letter language ID, i.e. ISO code.
DOCS: https://spacy.io/api/language
"""
@@ -287,7 +287,7 @@ class Language(object):
def get_pipe(self, name):
"""Get a pipeline component for a given component name.
- name (unicode): Name of pipeline component to get.
+ name (str): Name of pipeline component to get.
RETURNS (callable): The pipeline component.
DOCS: https://spacy.io/api/language#get_pipe
@@ -300,7 +300,7 @@ class Language(object):
def create_pipe(self, name, config=dict()):
"""Create a pipeline component from a factory.
- name (unicode): Factory name to look up in `Language.factories`.
+ name (str): Factory name to look up in `Language.factories`.
config (dict): Configuration parameters to initialise component.
RETURNS (callable): Pipeline component.
@@ -343,12 +343,12 @@ class Language(object):
of before/after/first/last can be set. Default behaviour is "last".
component (callable): The pipeline component.
- name (unicode): Name of pipeline component. Overwrites existing
+ name (str): Name of pipeline component. Overwrites existing
component.name attribute if available. If no name is set and
the component exposes no name attribute, component.__name__ is
used. An error is raised if a name already exists in the pipeline.
- before (unicode): Component name to insert component directly before.
- after (unicode): Component name to insert component directly after.
+ before (str): Component name to insert component directly before.
+ after (str): Component name to insert component directly after.
first (bool): Insert component first / not first in the pipeline.
last (bool): Insert component last / not last in the pipeline.
@@ -389,7 +389,7 @@ class Language(object):
"""Check if a component name is present in the pipeline. Equivalent to
`name in nlp.pipe_names`.
- name (unicode): Name of the component.
+ name (str): Name of the component.
RETURNS (bool): Whether a component of the name exists in the pipeline.
DOCS: https://spacy.io/api/language#has_pipe
@@ -399,7 +399,7 @@ class Language(object):
def replace_pipe(self, name, component):
"""Replace a component in the pipeline.
- name (unicode): Name of the component to replace.
+ name (str): Name of the component to replace.
component (callable): Pipeline component.
DOCS: https://spacy.io/api/language#replace_pipe
@@ -418,8 +418,8 @@ class Language(object):
def rename_pipe(self, old_name, new_name):
"""Rename a pipeline component.
- old_name (unicode): Name of the component to rename.
- new_name (unicode): New name of the component.
+ old_name (str): Name of the component to rename.
+ new_name (str): New name of the component.
DOCS: https://spacy.io/api/language#rename_pipe
"""
@@ -433,7 +433,7 @@ class Language(object):
def remove_pipe(self, name):
"""Remove a component from the pipeline.
- name (unicode): Name of the component to remove.
+ name (str): Name of the component to remove.
RETURNS (tuple): A `(name, component)` tuple of the removed component.
DOCS: https://spacy.io/api/language#remove_pipe
@@ -450,7 +450,7 @@ class Language(object):
and can contain arbitrary whitespace. Alignment into the original string
is preserved.
- text (unicode): The text to be processed.
+ text (str): The text to be processed.
disable (list): Names of the pipeline components to disable.
component_cfg (dict): An optional dictionary with extra keyword arguments
for specific components.
@@ -1086,7 +1086,7 @@ class component(object):
):
"""Decorate a pipeline component.
- name (unicode): Default component and factory name.
+ name (str): Default component and factory name.
assigns (list): Attributes assigned by component, e.g. `["token.pos"]`.
requires (list): Attributes required by component, e.g. `["token.dep"]`.
retokenizes (bool): Whether the component changes the tokenization.
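A short example of the add_pipe options documented above, assuming the callable-based signature used at this point in the codebase (components are passed directly rather than by factory name); the component itself is a no-op placeholder:

import spacy

def custom_component(doc):
    # No-op component, included only to demonstrate positioning.
    return doc

nlp = spacy.blank("en")
nlp.add_pipe(custom_component, name="fancifier", last=True)
nlp.add_pipe(custom_component, name="needer", after="fancifier")
print(nlp.pipe_names)  # ["fancifier", "needer"]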
diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py
index 3ba86c169..aeedbde84 100644
--- a/spacy/lemmatizer.py
+++ b/spacy/lemmatizer.py
@@ -29,8 +29,8 @@ class Lemmatizer(object):
def __call__(self, string, univ_pos, morphology=None):
"""Lemmatize a string.
- string (unicode): The string to lemmatize, e.g. the token text.
- univ_pos (unicode / int): The token's universal part-of-speech tag.
+ string (str): The string to lemmatize, e.g. the token text.
+ univ_pos (str / int): The token's universal part-of-speech tag.
morphology (dict): The token's morphological features following the
Universal Dependencies scheme.
RETURNS (list): The available lemmas for the string.
@@ -69,7 +69,7 @@ class Lemmatizer(object):
Check whether we're dealing with an uninflected paradigm, so we can
avoid lemmatization entirely.
- univ_pos (unicode / int): The token's universal part-of-speech tag.
+ univ_pos (str / int): The token's universal part-of-speech tag.
morphology (dict): The token's morphological features following the
Universal Dependencies scheme.
"""
@@ -128,10 +128,10 @@ class Lemmatizer(object):
"""Look up a lemma in the table, if available. If no lemma is found,
the original string is returned.
- string (unicode): The original string.
+ string (str): The original string.
orth (int): Optional hash of the string to look up. If not set, the
string will be used and hashed.
- RETURNS (unicode): The lemma if the string was found, otherwise the
+ RETURNS (str): The lemma if the string was found, otherwise the
original string.
"""
lookup_table = self.lookups.get_table("lemma_lookup", {})
diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx
index 20e175f03..911112d50 100644
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@@ -190,7 +190,7 @@ cdef class Lexeme:
self.vocab.set_vector(self.c.orth, vector)
property rank:
- """RETURNS (unicode): Sequential ID of the lexemes's lexical type, used
+ """RETURNS (str): Sequential ID of the lexemes's lexical type, used
to index into tables, e.g. for word vectors."""
def __get__(self):
return self.c.id
@@ -209,18 +209,18 @@ cdef class Lexeme:
@property
def orth_(self):
- """RETURNS (unicode): The original verbatim text of the lexeme
+ """RETURNS (str): The original verbatim text of the lexeme
(identical to `Lexeme.text`). Exists mostly for consistency with
the other attributes."""
return self.vocab.strings[self.c.orth]
@property
def text(self):
- """RETURNS (unicode): The original verbatim text of the lexeme."""
+ """RETURNS (str): The original verbatim text of the lexeme."""
return self.orth_
property lower:
- """RETURNS (unicode): Lowercase form of the lexeme."""
+ """RETURNS (str): Lowercase form of the lexeme."""
def __get__(self):
return self.c.lower
@@ -293,7 +293,7 @@ cdef class Lexeme:
self.c.prob = x
property lower_:
- """RETURNS (unicode): Lowercase form of the word."""
+ """RETURNS (str): Lowercase form of the word."""
def __get__(self):
return self.vocab.strings[self.c.lower]
@@ -301,7 +301,7 @@ cdef class Lexeme:
self.c.lower = self.vocab.strings.add(x)
property norm_:
- """RETURNS (unicode): The lexemes's norm, i.e. a normalised form of the
+ """RETURNS (str): The lexemes's norm, i.e. a normalised form of the
lexeme text.
"""
def __get__(self):
@@ -311,7 +311,7 @@ cdef class Lexeme:
self.c.norm = self.vocab.strings.add(x)
property shape_:
- """RETURNS (unicode): Transform of the word's string, to show
+ """RETURNS (str): Transform of the word's string, to show
orthographic features.
"""
def __get__(self):
@@ -321,7 +321,7 @@ cdef class Lexeme:
self.c.shape = self.vocab.strings.add(x)
property prefix_:
- """RETURNS (unicode): Length-N substring from the start of the word.
+ """RETURNS (str): Length-N substring from the start of the word.
Defaults to `N=1`.
"""
def __get__(self):
@@ -331,7 +331,7 @@ cdef class Lexeme:
self.c.prefix = self.vocab.strings.add(x)
property suffix_:
- """RETURNS (unicode): Length-N substring from the end of the word.
+ """RETURNS (str): Length-N substring from the end of the word.
Defaults to `N=3`.
"""
def __get__(self):
@@ -341,7 +341,7 @@ cdef class Lexeme:
self.c.suffix = self.vocab.strings.add(x)
property lang_:
- """RETURNS (unicode): Language of the parent vocabulary."""
+ """RETURNS (str): Language of the parent vocabulary."""
def __get__(self):
return self.vocab.strings[self.c.lang]
diff --git a/spacy/lookups.py b/spacy/lookups.py
index a9d371b79..5661897e1 100644
--- a/spacy/lookups.py
+++ b/spacy/lookups.py
@@ -31,7 +31,7 @@ class Lookups(object):
"""Check if the lookups contain a table of a given name. Delegates to
Lookups.has_table.
- name (unicode): Name of the table.
+ name (str): Name of the table.
RETURNS (bool): Whether a table of that name is in the lookups.
"""
return self.has_table(name)
@@ -48,7 +48,7 @@ class Lookups(object):
def add_table(self, name, data=SimpleFrozenDict()):
"""Add a new table to the lookups. Raises an error if the table exists.
- name (unicode): Unique name of table.
+ name (str): Unique name of table.
data (dict): Optional data to add to the table.
RETURNS (Table): The newly added table.
@@ -64,7 +64,7 @@ class Lookups(object):
"""Get a table. Raises an error if the table doesn't exist and no
default value is provided.
- name (unicode): Name of the table.
+ name (str): Name of the table.
default: Optional default value to return if table doesn't exist.
RETURNS (Table): The table.
@@ -79,7 +79,7 @@ class Lookups(object):
def remove_table(self, name):
"""Remove a table. Raises an error if the table doesn't exist.
- name (unicode): Name of the table to remove.
+ name (str): Name of the table to remove.
RETURNS (Table): The removed table.
DOCS: https://spacy.io/api/lookups#remove_table
@@ -91,7 +91,7 @@ class Lookups(object):
def has_table(self, name):
"""Check if the lookups contain a table of a given name.
- name (unicode): Name of the table.
+ name (str): Name of the table.
RETURNS (bool): Whether a table of that name exists.
DOCS: https://spacy.io/api/lookups#has_table
@@ -125,7 +125,7 @@ class Lookups(object):
"""Save the lookups to a directory as lookups.bin. Expects a path to a
directory, which will be created if it doesn't exist.
- path (unicode / Path): The file path.
+ path (str / Path): The file path.
DOCS: https://spacy.io/api/lookups#to_disk
"""
@@ -141,7 +141,7 @@ class Lookups(object):
"""Load lookups from a directory containing a lookups.bin. Will skip
loading if the file doesn't exist.
- path (unicode / Path): The directory path.
+ path (str / Path): The directory path.
RETURNS (Lookups): The loaded lookups.
DOCS: https://spacy.io/api/lookups#from_disk
@@ -167,7 +167,7 @@ class Table(OrderedDict):
"""Initialize a new table from a dict.
data (dict): The dictionary.
- name (unicode): Optional table name for reference.
+ name (str): Optional table name for reference.
RETURNS (Table): The newly created object.
DOCS: https://spacy.io/api/lookups#table.from_dict
@@ -179,7 +179,7 @@ class Table(OrderedDict):
def __init__(self, name=None, data=None):
"""Initialize a new table.
- name (unicode): Optional table name for reference.
+ name (str): Optional table name for reference.
data (dict): Initial data, used to hint Bloom Filter.
RETURNS (Table): The newly created object.
@@ -197,7 +197,7 @@ class Table(OrderedDict):
def __setitem__(self, key, value):
"""Set new key/value pair. String keys will be hashed.
- key (unicode / int): The key to set.
+ key (str / int): The key to set.
value: The value to set.
"""
key = get_string_id(key)
@@ -208,7 +208,7 @@ class Table(OrderedDict):
"""Set new key/value pair. String keys will be hashed.
Same as table[key] = value.
- key (unicode / int): The key to set.
+ key (str / int): The key to set.
value: The value to set.
"""
self[key] = value
@@ -216,7 +216,7 @@ class Table(OrderedDict):
def __getitem__(self, key):
"""Get the value for a given key. String keys will be hashed.
- key (unicode / int): The key to get.
+ key (str / int): The key to get.
RETURNS: The value.
"""
key = get_string_id(key)
@@ -225,7 +225,7 @@ class Table(OrderedDict):
def get(self, key, default=None):
"""Get the value for a given key. String keys will be hashed.
- key (unicode / int): The key to get.
+ key (str / int): The key to get.
default: The default value to return.
RETURNS: The value.
"""
@@ -235,7 +235,7 @@ class Table(OrderedDict):
def __contains__(self, key):
"""Check whether a key is in the table. String keys will be hashed.
- key (unicode / int): The key to check.
+ key (str / int): The key to check.
RETURNS (bool): Whether the key is in the table.
"""
key = get_string_id(key)
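A brief usage sketch for the Lookups/Table API described in these docstrings; the table name and contents are arbitrary examples, and string keys are hashed on access as noted above:

from spacy.lookups import Lookups

lookups = Lookups()
table = lookups.add_table("lemma_lookup", {"dogs": "dog"})
print("lemma_lookup" in lookups)   # True, delegates to has_table
print(table.get("dogs"))           # "dog"
table["cats"] = "cat"              # string key is hashed on insert
print(lookups.get_table("lemma_lookup")["cats"])  # "cat"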
diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx
index ff707a71c..732931380 100644
--- a/spacy/matcher/dependencymatcher.pyx
+++ b/spacy/matcher/dependencymatcher.pyx
@@ -66,7 +66,7 @@ cdef class DependencyMatcher:
def __contains__(self, key):
"""Check whether the matcher contains rules for a match ID.
- key (unicode): The match ID.
+ key (str): The match ID.
RETURNS (bool): Whether the matcher contains rules for this match ID.
"""
return self._normalize_key(key) in self._patterns
diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx
index 2bcb82a2a..225eba9a9 100644
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@@ -63,7 +63,7 @@ cdef class Matcher:
def __contains__(self, key):
"""Check whether the matcher contains rules for a match ID.
- key (unicode): The match ID.
+ key (str): The match ID.
RETURNS (bool): Whether the matcher contains rules for this match ID.
"""
return self._normalize_key(key) in self._patterns
@@ -97,7 +97,7 @@ cdef class Matcher:
number of arguments). The on_match callback becomes an optional keyword
argument.
- key (unicode): The match ID.
+ key (str): The match ID.
patterns (list): The patterns to add for the given key.
on_match (callable): Optional callback executed on match.
*_patterns (list): For backwards compatibility: list of patterns to add
@@ -138,7 +138,7 @@ cdef class Matcher:
"""Remove a rule from the matcher. A KeyError is raised if the key does
not exist.
- key (unicode): The ID of the match rule.
+ key (str): The ID of the match rule.
"""
norm_key = self._normalize_key(key)
if not norm_key in self._patterns:
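An illustrative Matcher example using the signature documented above (a match ID plus a list of patterns, with on_match optional):

import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
pattern = [{"LOWER": "hello"}, {"LOWER": "world"}]
matcher.add("HELLO_WORLD", [pattern])
doc = nlp("Hello world, again!")
for match_id, start, end in matcher(doc):
    # Resolve the hashed match ID back to its string name.
    print(nlp.vocab.strings[match_id], doc[start:end].text)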
diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx
index 14cc39787..f7ce44ece 100644
--- a/spacy/matcher/phrasematcher.pyx
+++ b/spacy/matcher/phrasematcher.pyx
@@ -70,7 +70,7 @@ cdef class PhraseMatcher:
def __contains__(self, key):
"""Check whether the matcher contains rules for a match ID.
- key (unicode): The match ID.
+ key (str): The match ID.
RETURNS (bool): Whether the matcher contains rules for this match ID.
DOCS: https://spacy.io/api/phrasematcher#contains
@@ -85,7 +85,7 @@ cdef class PhraseMatcher:
"""Remove a rule from the matcher by match ID. A KeyError is raised if
the key does not exist.
- key (unicode): The match ID.
+ key (str): The match ID.
DOCS: https://spacy.io/api/phrasematcher#remove
"""
@@ -159,7 +159,7 @@ cdef class PhraseMatcher:
number of arguments). The on_match callback becomes an optional keyword
argument.
- key (unicode): The match ID.
+ key (str): The match ID.
docs (list): List of `Doc` objects representing match patterns.
on_match (callable): Callback executed on match.
*_docs (Doc): For backwards compatibility: list of patterns to add
diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
index 0b53b124c..5dcf81ea7 100644
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -198,8 +198,8 @@ cdef class Morphology:
"""Add a special-case rule to the morphological analyser. Tokens whose
tag and orth match the rule will receive the specified properties.
- tag (unicode): The part-of-speech tag to key the exception.
- orth (unicode): The word-form to key the exception.
+ tag (str): The part-of-speech tag to key the exception.
+ orth (str): The word-form to key the exception.
"""
attrs = dict(attrs)
attrs = _normalize_props(attrs)
diff --git a/spacy/pipe_analysis.py b/spacy/pipe_analysis.py
index 4c0950453..971ebe518 100644
--- a/spacy/pipe_analysis.py
+++ b/spacy/pipe_analysis.py
@@ -11,7 +11,7 @@ def analyze_pipes(pipeline, name, pipe, index, warn=True):
fulfilled (e.g. if previous components assign the attributes).
pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
- name (unicode): The name of the pipeline component to analyze.
+ name (str): The name of the pipeline component to analyze.
pipe (callable): The pipeline component function to analyze.
index (int): The index of the component in the pipeline.
warn (bool): Show user warning if problem is found.
@@ -125,7 +125,7 @@ def get_assigns_for_attr(pipeline, attr):
"""Get all pipeline components that assign an attr, e.g. "doc.tensor".
pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
- attr (unicode): The attribute to check.
+ attr (str): The attribute to check.
RETURNS (list): (name, pipeline) tuples of components that assign the attr.
"""
return _get_feature_for_attr(pipeline, attr, "assigns")
@@ -135,7 +135,7 @@ def get_requires_for_attr(pipeline, attr):
"""Get all pipeline components that require an attr, e.g. "doc.tensor".
pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
- attr (unicode): The attribute to check.
+ attr (str): The attribute to check.
RETURNS (list): (name, pipeline) tuples of components that require the attr.
"""
return _get_feature_for_attr(pipeline, attr, "requires")
diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py
index 58160c2e9..cdacc82f6 100644
--- a/spacy/pipeline/entityruler.py
+++ b/spacy/pipeline/entityruler.py
@@ -315,7 +315,7 @@ class EntityRuler(object):
"""Load the entity ruler from a file. Expects a file containing
newline-delimited JSON (JSONL) with one entry per line.
- path (unicode / Path): The JSONL file to load.
+ path (str / Path): The JSONL file to load.
**kwargs: Other config parameters, mostly for consistency.
RETURNS (EntityRuler): The loaded entity ruler.
@@ -351,7 +351,7 @@ class EntityRuler(object):
"""Save the entity ruler patterns to a directory. The patterns will be
saved as newline-delimited JSON (JSONL).
- path (unicode / Path): The JSONL file to save.
+ path (str / Path): The JSONL file to save.
**kwargs: Other config parameters, mostly for consistency.
DOCS: https://spacy.io/api/entityruler#to_disk
diff --git a/spacy/pipeline/functions.py b/spacy/pipeline/functions.py
index 6e9d4197c..622791512 100644
--- a/spacy/pipeline/functions.py
+++ b/spacy/pipeline/functions.py
@@ -50,7 +50,7 @@ def merge_subtokens(doc, label="subtok"):
"""Merge subtokens into a single token.
doc (Doc): The Doc object.
- label (unicode): The subtoken dependency label.
+ label (str): The subtoken dependency label.
RETURNS (Doc): The Doc object with merged subtokens.
DOCS: https://spacy.io/api/pipeline-functions#merge_subtokens
diff --git a/spacy/strings.pyx b/spacy/strings.pyx
index a30f11729..9fe5af154 100644
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@@ -152,7 +152,7 @@ cdef class StringStore:
def add(self, string):
"""Add a string to the StringStore.
- string (unicode): The string to add.
+ string (str): The string to add.
RETURNS (uint64): The string's hash value.
"""
if isinstance(string, unicode):
@@ -179,7 +179,7 @@ cdef class StringStore:
def __contains__(self, string not None):
"""Check whether a string is in the store.
- string (unicode): The string to check.
+ string (str): The string to check.
RETURNS (bool): Whether the store contains the string.
"""
cdef hash_t key
@@ -205,7 +205,7 @@ cdef class StringStore:
def __iter__(self):
"""Iterate over the strings in the store, in order.
- YIELDS (unicode): A string in the store.
+ YIELDS (str): A string in the store.
"""
cdef int i
cdef hash_t key
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 7e75052f7..b628b1171 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -134,7 +134,7 @@ cdef class Tokenizer:
def __call__(self, unicode string):
"""Tokenize a string.
- string (unicode): The string to tokenize.
+ string (str): The string to tokenize.
RETURNS (Doc): A container for linguistic annotations.
DOCS: https://spacy.io/api/tokenizer#call
@@ -147,7 +147,7 @@ cdef class Tokenizer:
cdef Doc _tokenize_affixes(self, unicode string, bint with_special_cases):
"""Tokenize according to affix and token_match settings.
- string (unicode): The string to tokenize.
+ string (str): The string to tokenize.
RETURNS (Doc): A container for linguistic annotations.
"""
if len(string) >= (2 ** 30):
@@ -527,7 +527,7 @@ cdef class Tokenizer:
def find_infix(self, unicode string):
"""Find internal split points of the string, such as hyphens.
- string (unicode): The string to segment.
+ string (str): The string to segment.
RETURNS (list): A list of `re.MatchObject` objects that have `.start()`
and `.end()` methods, denoting the placement of internal segment
separators, e.g. hyphens.
@@ -542,7 +542,7 @@ cdef class Tokenizer:
"""Find the length of a prefix that should be segmented from the
string, or None if no prefix rules match.
- string (unicode): The string to segment.
+ string (str): The string to segment.
RETURNS (int): The length of the prefix if present, otherwise `None`.
DOCS: https://spacy.io/api/tokenizer#find_prefix
@@ -556,7 +556,7 @@ cdef class Tokenizer:
"""Find the length of a suffix that should be segmented from the
string, or None if no suffix rules match.
- string (unicode): The string to segment.
+ string (str): The string to segment.
Returns (int): The length of the suffix if present, otherwise `None`.
DOCS: https://spacy.io/api/tokenizer#find_suffix
@@ -576,7 +576,7 @@ cdef class Tokenizer:
def _validate_special_case(self, chunk, substrings):
"""Check whether the `ORTH` fields match the string.
- string (unicode): The string to specially tokenize.
+ string (str): The string to specially tokenize.
substrings (iterable): A sequence of dicts, where each dict describes
a token and its attributes.
"""
@@ -588,7 +588,7 @@ cdef class Tokenizer:
def add_special_case(self, unicode string, substrings):
"""Add a special-case tokenization rule.
- string (unicode): The string to specially tokenize.
+ string (str): The string to specially tokenize.
substrings (iterable): A sequence of dicts, where each dict describes
a token and its attributes. The `ORTH` fields of the attributes
must exactly match the string when they are concatenated.
@@ -629,7 +629,7 @@ cdef class Tokenizer:
produced are identical to `nlp.tokenizer()` except for whitespace
tokens.
- string (unicode): The string to tokenize.
+ string (str): The string to tokenize.
RETURNS (list): A list of (pattern_string, token_string) tuples
DOCS: https://spacy.io/api/tokenizer#explain
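A quick sketch of add_special_case as documented above; note that the ORTH values of the substrings must concatenate back to exactly the original string:

import spacy
from spacy.attrs import ORTH

nlp = spacy.blank("en")
# "gim" + "me" == "gimme", so the special case is valid.
nlp.tokenizer.add_special_case("gimme", [{ORTH: "gim"}, {ORTH: "me"}])
print([t.text for t in nlp("gimme that")])  # ["gim", "me", "that"]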
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 0716b2b3d..f6d0dbf4a 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -107,7 +107,7 @@ cdef class Doc:
def set_extension(cls, name, **kwargs):
"""Define a custom attribute which becomes available as `Doc._`.
- name (unicode): Name of the attribute to set.
+ name (str): Name of the attribute to set.
default: Optional default value of the attribute.
getter (callable): Optional getter function.
setter (callable): Optional setter function.
@@ -125,7 +125,7 @@ cdef class Doc:
def get_extension(cls, name):
"""Look up a previously registered extension by name.
- name (unicode): Name of the extension.
+ name (str): Name of the extension.
RETURNS (tuple): A `(default, method, getter, setter)` tuple.
DOCS: https://spacy.io/api/doc#get_extension
@@ -136,7 +136,7 @@ cdef class Doc:
def has_extension(cls, name):
"""Check whether an extension has been registered.
- name (unicode): Name of the extension.
+ name (str): Name of the extension.
RETURNS (bool): Whether the extension has been registered.
DOCS: https://spacy.io/api/doc#has_extension
@@ -147,7 +147,7 @@ cdef class Doc:
def remove_extension(cls, name):
"""Remove a previously registered extension.
- name (unicode): Name of the extension.
+ name (str): Name of the extension.
RETURNS (tuple): A `(default, method, getter, setter)` tuple of the
removed extension.
@@ -473,7 +473,7 @@ cdef class Doc:
def text(self):
"""A unicode representation of the document text.
- RETURNS (unicode): The original verbatim text of the document.
+ RETURNS (str): The original verbatim text of the document.
"""
return "".join(t.text_with_ws for t in self)
@@ -482,7 +482,7 @@ cdef class Doc:
"""An alias of `Doc.text`, provided for duck-type compatibility with
`Span` and `Token`.
- RETURNS (unicode): The original verbatim text of the document.
+ RETURNS (str): The original verbatim text of the document.
"""
return self.text
@@ -628,7 +628,7 @@ cdef class Doc:
@property
def lang_(self):
- """RETURNS (unicode): Language of the doc's vocabulary, e.g. 'en'."""
+ """RETURNS (str): Language of the doc's vocabulary, e.g. 'en'."""
return self.vocab.lang
cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1:
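A minimal example of the extension API these docstrings describe, using a hypothetical is_greeting attribute on Doc:

import spacy
from spacy.tokens import Doc

# Register the custom attribute once; it becomes available as doc._.is_greeting.
Doc.set_extension("is_greeting", default=False)
nlp = spacy.blank("en")
doc = nlp("hello there")
doc._.is_greeting = doc[0].lower_ == "hello"
print(doc._.is_greeting)  # True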
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 66e8d8c3e..59323c393 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -33,7 +33,7 @@ cdef class Span:
def set_extension(cls, name, **kwargs):
"""Define a custom attribute which becomes available as `Span._`.
- name (unicode): Name of the attribute to set.
+ name (str): Name of the attribute to set.
default: Optional default value of the attribute.
getter (callable): Optional getter function.
setter (callable): Optional setter function.
@@ -51,7 +51,7 @@ cdef class Span:
def get_extension(cls, name):
"""Look up a previously registered extension by name.
- name (unicode): Name of the extension.
+ name (str): Name of the extension.
RETURNS (tuple): A `(default, method, getter, setter)` tuple.
DOCS: https://spacy.io/api/span#get_extension
@@ -62,7 +62,7 @@ cdef class Span:
def has_extension(cls, name):
"""Check whether an extension has been registered.
- name (unicode): Name of the extension.
+ name (str): Name of the extension.
RETURNS (bool): Whether the extension has been registered.
DOCS: https://spacy.io/api/span#has_extension
@@ -73,7 +73,7 @@ cdef class Span:
def remove_extension(cls, name):
"""Remove a previously registered extension.
- name (unicode): Name of the extension.
+ name (str): Name of the extension.
RETURNS (tuple): A `(default, method, getter, setter)` tuple of the
removed extension.
@@ -501,7 +501,7 @@ cdef class Span:
@property
def text(self):
- """RETURNS (unicode): The original verbatim text of the span."""
+ """RETURNS (str): The original verbatim text of the span."""
text = self.text_with_ws
if self[-1].whitespace_:
text = text[:-1]
@@ -512,7 +512,7 @@ cdef class Span:
"""The text content of the span with a trailing whitespace character if
the last token has one.
- RETURNS (unicode): The text content of the span (with trailing
+ RETURNS (str): The text content of the span (with trailing
whitespace).
"""
return "".join([t.text_with_ws for t in self])
@@ -688,7 +688,7 @@ cdef class Span:
raise NotImplementedError(TempErrors.T007.format(attr="ent_id"))
property ent_id_:
- """RETURNS (unicode): The (string) entity ID."""
+ """RETURNS (str): The (string) entity ID."""
def __get__(self):
return self.root.ent_id_
@@ -700,12 +700,12 @@ cdef class Span:
"""Verbatim text content (identical to `Span.text`). Exists mostly for
consistency with other attributes.
- RETURNS (unicode): The span's text."""
+ RETURNS (str): The span's text."""
return self.text
@property
def lemma_(self):
- """RETURNS (unicode): The span's lemma."""
+ """RETURNS (str): The span's lemma."""
return " ".join([t.lemma_ for t in self]).strip()
@property
@@ -724,7 +724,7 @@ cdef class Span:
return "".join([t.text_with_ws for t in self])
property label_:
- """RETURNS (unicode): The span's label."""
+ """RETURNS (str): The span's label."""
def __get__(self):
return self.doc.vocab.strings[self.label]
@@ -734,7 +734,7 @@ cdef class Span:
raise NotImplementedError(Errors.E129.format(start=self.start, end=self.end, label=label_))
property kb_id_:
- """RETURNS (unicode): The named entity's KB ID."""
+ """RETURNS (str): The named entity's KB ID."""
def __get__(self):
return self.doc.vocab.strings[self.kb_id]
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index 2486ed991..0d1e82322 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -36,7 +36,7 @@ cdef class Token:
def set_extension(cls, name, **kwargs):
"""Define a custom attribute which becomes available as `Token._`.
- name (unicode): Name of the attribute to set.
+ name (str): Name of the attribute to set.
default: Optional default value of the attribute.
getter (callable): Optional getter function.
setter (callable): Optional setter function.
@@ -54,7 +54,7 @@ cdef class Token:
def get_extension(cls, name):
"""Look up a previously registered extension by name.
- name (unicode): Name of the extension.
+ name (str): Name of the extension.
RETURNS (tuple): A `(default, method, getter, setter)` tuple.
DOCS: https://spacy.io/api/token#get_extension
@@ -65,7 +65,7 @@ cdef class Token:
def has_extension(cls, name):
"""Check whether an extension has been registered.
- name (unicode): Name of the extension.
+ name (str): Name of the extension.
RETURNS (bool): Whether the extension has been registered.
DOCS: https://spacy.io/api/token#has_extension
@@ -76,7 +76,7 @@ cdef class Token:
def remove_extension(cls, name):
"""Remove a previously registered extension.
- name (unicode): Name of the extension.
+ name (str): Name of the extension.
RETURNS (tuple): A `(default, method, getter, setter)` tuple of the
removed extension.
@@ -244,12 +244,12 @@ cdef class Token:
@property
def text(self):
- """RETURNS (unicode): The original verbatim text of the token."""
+ """RETURNS (str): The original verbatim text of the token."""
return self.orth_
@property
def text_with_ws(self):
- """RETURNS (unicode): The text content of the span (with trailing
+ """RETURNS (str): The text content of the span (with trailing
whitespace).
"""
cdef unicode orth = self.vocab.strings[self.c.lex.orth]
@@ -740,7 +740,7 @@ cdef class Token:
self.c.ent_type = ent_type
property ent_type_:
- """RETURNS (unicode): Named entity type."""
+ """RETURNS (str): Named entity type."""
def __get__(self):
return self.vocab.strings[self.c.ent_type]
@@ -763,7 +763,7 @@ cdef class Token:
and "" means no entity tag is set. "B" with an empty ent_type
means that the token is blocked from further processing by NER.
- RETURNS (unicode): IOB code of named entity tag.
+ RETURNS (str): IOB code of named entity tag.
"""
iob_strings = ("", "I", "O", "B")
return iob_strings[self.c.ent_iob]
@@ -779,7 +779,7 @@ cdef class Token:
self.c.ent_id = key
property ent_id_:
- """RETURNS (unicode): ID of the entity the token is an instance of,
+ """RETURNS (str): ID of the entity the token is an instance of,
if any.
"""
def __get__(self):
@@ -797,7 +797,7 @@ cdef class Token:
self.c.ent_kb_id = ent_kb_id
property ent_kb_id_:
- """RETURNS (unicode): Named entity KB ID."""
+ """RETURNS (str): Named entity KB ID."""
def __get__(self):
return self.vocab.strings[self.c.ent_kb_id]
@@ -806,12 +806,12 @@ cdef class Token:
@property
def whitespace_(self):
- """RETURNS (unicode): The trailing whitespace character, if present."""
+ """RETURNS (str): The trailing whitespace character, if present."""
return " " if self.c.spacy else ""
@property
def orth_(self):
- """RETURNS (unicode): Verbatim text content (identical to
+ """RETURNS (str): Verbatim text content (identical to
`Token.text`). Exists mostly for consistency with the other
attributes.
"""
@@ -819,13 +819,13 @@ cdef class Token:
@property
def lower_(self):
- """RETURNS (unicode): The lowercase token text. Equivalent to
+ """RETURNS (str): The lowercase token text. Equivalent to
`Token.text.lower()`.
"""
return self.vocab.strings[self.c.lex.lower]
property norm_:
- """RETURNS (unicode): The token's norm, i.e. a normalised form of the
+ """RETURNS (str): The token's norm, i.e. a normalised form of the
token text. Usually set in the language's tokenizer exceptions or
norm exceptions.
"""
@@ -837,34 +837,34 @@ cdef class Token:
@property
def shape_(self):
- """RETURNS (unicode): Transform of the tokens's string, to show
+ """RETURNS (str): Transform of the tokens's string, to show
orthographic features. For example, "Xxxx" or "dd".
"""
return self.vocab.strings[self.c.lex.shape]
@property
def prefix_(self):
- """RETURNS (unicode): A length-N substring from the start of the token.
+ """RETURNS (str): A length-N substring from the start of the token.
Defaults to `N=1`.
"""
return self.vocab.strings[self.c.lex.prefix]
@property
def suffix_(self):
- """RETURNS (unicode): A length-N substring from the end of the token.
+ """RETURNS (str): A length-N substring from the end of the token.
Defaults to `N=3`.
"""
return self.vocab.strings[self.c.lex.suffix]
@property
def lang_(self):
- """RETURNS (unicode): Language of the parent document's vocabulary,
+ """RETURNS (str): Language of the parent document's vocabulary,
e.g. 'en'.
"""
return self.vocab.strings[self.c.lex.lang]
property lemma_:
- """RETURNS (unicode): The token lemma, i.e. the base form of the word,
+ """RETURNS (str): The token lemma, i.e. the base form of the word,
with no inflectional suffixes.
"""
def __get__(self):
@@ -877,7 +877,7 @@ cdef class Token:
self.c.lemma = self.vocab.strings.add(lemma_)
property pos_:
- """RETURNS (unicode): Coarse-grained part-of-speech tag."""
+ """RETURNS (str): Coarse-grained part-of-speech tag."""
def __get__(self):
return parts_of_speech.NAMES[self.c.pos]
@@ -885,7 +885,7 @@ cdef class Token:
self.c.pos = parts_of_speech.IDS[pos_name]
property tag_:
- """RETURNS (unicode): Fine-grained part-of-speech tag."""
+ """RETURNS (str): Fine-grained part-of-speech tag."""
def __get__(self):
return self.vocab.strings[self.c.tag]
@@ -893,7 +893,7 @@ cdef class Token:
self.tag = self.vocab.strings.add(tag)
property dep_:
- """RETURNS (unicode): The syntactic dependency label."""
+ """RETURNS (str): The syntactic dependency label."""
def __get__(self):
return self.vocab.strings[self.c.dep]
diff --git a/spacy/util.py b/spacy/util.py
index 41af881c9..fc5837755 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -58,7 +58,7 @@ def lang_class_is_loaded(lang):
loaded lazily, to avoid expensive setup code associated with the language
data.
- lang (unicode): Two-letter language code, e.g. 'en'.
+ lang (str): Two-letter language code, e.g. 'en'.
RETURNS (bool): Whether a Language class has been loaded.
"""
return lang in registry.languages
@@ -67,7 +67,7 @@ def lang_class_is_loaded(lang):
def get_lang_class(lang):
"""Import and load a Language class.
- lang (unicode): Two-letter language code, e.g. 'en'.
+ lang (str): Two-letter language code, e.g. 'en'.
RETURNS (Language): Language class.
"""
# Check if language is registered / entry point is available
@@ -85,7 +85,7 @@ def get_lang_class(lang):
def set_lang_class(name, cls):
"""Set a custom Language class name that can be loaded via get_lang_class.
- name (unicode): Name of Language class.
+ name (str): Name of Language class.
cls (Language): Language class.
"""
registry.languages.register(name, func=cls)
@@ -107,7 +107,7 @@ def load_language_data(path):
"""Load JSON language data using the given path as a base. If the provided
path isn't present, will attempt to load a gzipped version before giving up.
- path (unicode / Path): The data to load.
+ path (str / Path): The data to load.
RETURNS: The loaded data.
"""
path = ensure_path(path)
@@ -128,7 +128,7 @@ def get_module_path(module):
def load_model(name, **overrides):
"""Load a model from a package or data path.
- name (unicode): Package name or model path.
+ name (str): Package name or model path.
**overrides: Specific overrides, like pipeline components to disable.
RETURNS (Language): `Language` class with the loaded model.
"""
@@ -202,7 +202,7 @@ def load_model_from_init_py(init_file, **overrides):
"""Helper function to use in the `load()` method of a model package's
__init__.py.
- init_file (unicode): Path to model's __init__.py, i.e. `__file__`.
+ init_file (str): Path to model's __init__.py, i.e. `__file__`.
**overrides: Specific overrides, like pipeline components to disable.
RETURNS (Language): `Language` class with loaded model.
"""
@@ -227,8 +227,8 @@ def get_package_version(name):
"""Get the version of an installed package. Typically used to get model
package versions.
- name (unicode): The name of the installed Python package.
- RETURNS (unicode / None): The version or None if package not installed.
+ name (str): The name of the installed Python package.
+ RETURNS (str / None): The version or None if package not installed.
"""
try:
return importlib_metadata.version(name)
@@ -338,7 +338,7 @@ def get_model_config(path):
def is_package(name):
"""Check if string maps to a package installed via pip.
- name (unicode): Name of package.
+ name (str): Name of package.
RETURNS (bool): True if installed package, False if not.
"""
try:
@@ -351,7 +351,7 @@ def is_package(name):
def get_package_path(name):
"""Get the path to an installed package.
- name (unicode): Package name.
+ name (str): Package name.
RETURNS (Path): Path to installed package.
"""
name = name.lower() # use lowercase version to be safe
@@ -526,8 +526,8 @@ def expand_exc(excs, search, replace):
For example, to add additional versions with typographic apostrophes.
excs (dict): Tokenizer exceptions.
- search (unicode): String to find and replace.
- replace (unicode): Replacement.
+ search (str): String to find and replace.
+ replace (str): Replacement.
RETURNS (dict): Combined tokenizer exceptions.
"""
@@ -761,8 +761,8 @@ def from_disk(path, readers, exclude):
def import_file(name, loc):
"""Import module from a file. Used to load models from a directory.
- name (unicode): Name of module to load.
- loc (unicode / Path): Path to the file.
+ name (str): Name of module to load.
+ loc (str / Path): Path to the file.
RETURNS: The loaded module.
"""
loc = str(loc)
@@ -777,8 +777,8 @@ def minify_html(html):
Disclaimer: NOT a general-purpose solution, only removes indentation and
newlines.
- html (unicode): Markup to minify.
- RETURNS (unicode): "Minified" HTML.
+ html (str): Markup to minify.
+ RETURNS (str): "Minified" HTML.
"""
return html.strip().replace(" ", "").replace("\n", "")
@@ -787,8 +787,8 @@ def escape_html(text):
"""Replace <, >, &, " with their HTML encoded representation. Intended to
prevent HTML errors in rendered displaCy markup.
- text (unicode): The original text.
- RETURNS (unicode): Equivalent text to be safely used within HTML.
+ text (str): The original text.
+ RETURNS (str): Equivalent text to be safely used within HTML.
"""
text = text.replace("&", "&")
text = text.replace("<", "<")
diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx
index e100ae915..0ed2462c6 100644
--- a/spacy/vectors.pyx
+++ b/spacy/vectors.pyx
@@ -57,7 +57,7 @@ cdef class Vectors:
shape (tuple): Size of the table, as (# entries, # columns)
data (numpy.ndarray): The vector data.
keys (iterable): A sequence of keys, aligned with the data.
- name (unicode): A name to identify the vectors table.
+ name (str): A name to identify the vectors table.
RETURNS (Vectors): The newly created object.
DOCS: https://spacy.io/api/vectors#init
@@ -237,7 +237,7 @@ cdef class Vectors:
def find(self, *, key=None, keys=None, row=None, rows=None):
"""Look up one or more keys by row, or vice versa.
- key (unicode / int): Find the row that the given key points to.
+ key (str / int): Find the row that the given key points to.
Returns int, -1 if missing.
keys (iterable): Find rows that the keys point to.
Returns ndarray.
@@ -352,7 +352,7 @@ cdef class Vectors:
def to_disk(self, path, **kwargs):
"""Save the current state to a directory.
- path (unicode / Path): A path to a directory, which will be created if
+ path (str / Path): A path to a directory, which will be created if
it doesn't exist.
DOCS: https://spacy.io/api/vectors#to_disk
@@ -372,7 +372,7 @@ cdef class Vectors:
"""Loads state from a directory. Modifies the object in place and
returns it.
- path (unicode / Path): Directory path, string or Path-like object.
+ path (str / Path): Directory path, string or Path-like object.
RETURNS (Vectors): The modified object.
DOCS: https://spacy.io/api/vectors#from_disk
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index a1929559f..ed37f6e98 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -41,7 +41,7 @@ cdef class Vocab:
strings (StringStore): StringStore that maps strings to integers, and
vice versa.
lookups (Lookups): Container for large lookup tables and dictionaries.
- name (unicode): Optional name to identify the vectors table.
+ name (str): Optional name to identify the vectors table.
RETURNS (Vocab): The newly constructed object.
"""
lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {}
@@ -97,7 +97,7 @@ cdef class Vocab:
See also: `Lexeme.set_flag`, `Lexeme.check_flag`, `Token.set_flag`,
`Token.check_flag`.
- flag_getter (callable): A function `f(unicode) -> bool`, to get the
+ flag_getter (callable): A function `f(str) -> bool`, to get the
flag value.
flag_id (int): An integer between 1 and 63 (inclusive), specifying
the bit at which the flag will be stored. If -1, the lowest
@@ -187,7 +187,7 @@ cdef class Vocab:
def __contains__(self, key):
"""Check whether the string or int key has an entry in the vocabulary.
- string (unicode): The ID string.
+ string (str): The ID string.
RETURNS (bool): Whether the string has an entry in the vocabulary.
DOCS: https://spacy.io/api/vocab#contains
diff --git a/website/docs/api/lexeme.md b/website/docs/api/lexeme.md
index feb167a9d..39148e476 100644
--- a/website/docs/api/lexeme.md
+++ b/website/docs/api/lexeme.md
@@ -125,21 +125,21 @@ The L2 norm of the lexeme's vector representation.
| Name | Type | Description |
| -------------------------------------------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `vocab` | `Vocab` | The lexeme's vocabulary. |
-| `text` | unicode | Verbatim text content. |
+| `text` | str | Verbatim text content. |
| `orth` | int | ID of the verbatim text content. |
-| `orth_` | unicode | Verbatim text content (identical to `Lexeme.text`). Exists mostly for consistency with the other attributes. |
+| `orth_` | str | Verbatim text content (identical to `Lexeme.text`). Exists mostly for consistency with the other attributes. |
| `rank` | int | Sequential ID of the lexeme's lexical type, used to index into tables, e.g. for word vectors. |
| `flags` | int | Container of the lexeme's binary flags. |
| `norm` | int | The lexeme's norm, i.e. a normalized form of the lexeme text. |
-| `norm_` | unicode | The lexemes's norm, i.e. a normalized form of the lexeme text. |
+| `norm_` | str | The lexeme's norm, i.e. a normalized form of the lexeme text. |
| `lower` | int | Lowercase form of the word. |
-| `lower_` | unicode | Lowercase form of the word. |
+| `lower_` | str | Lowercase form of the word. |
| `shape` | int | Transform of the word's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example, `"Xxxx"` or `"dd"`. |
-| `shape_` | unicode | Transform of the word's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. |
+| `shape_` | str | Transform of the word's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example, `"Xxxx"` or `"dd"`. |
| `prefix` | int | Length-N substring from the start of the word. Defaults to `N=1`. |
-| `prefix_` | unicode | Length-N substring from the start of the word. Defaults to `N=1`. |
+| `prefix_` | str | Length-N substring from the start of the word. Defaults to `N=1`. |
| `suffix` | int | Length-N substring from the end of the word. Defaults to `N=3`. |
-| `suffix_` | unicode | Length-N substring from the start of the word. Defaults to `N=3`. |
+| `suffix_` | str | Length-N substring from the end of the word. Defaults to `N=3`. |
| `is_alpha` | bool | Does the lexeme consist of alphabetic characters? Equivalent to `lexeme.text.isalpha()`. |
| `is_ascii` | bool | Does the lexeme consist of ASCII characters? Equivalent to `all(ord(c) < 128 for c in lexeme.text)`. |
| `is_digit` | bool | Does the lexeme consist of digits? Equivalent to `lexeme.text.isdigit()`. |
@@ -159,7 +159,7 @@ The L2 norm of the lexeme's vector representation.
| `is_oov` | bool | Is the lexeme out-of-vocabulary? |
| `is_stop` | bool | Is the lexeme part of a "stop list"? |
| `lang` | int | Language of the parent vocabulary. |
-| `lang_` | unicode | Language of the parent vocabulary. |
+| `lang_` | str | Language of the parent vocabulary. |
| `prob` | float | Smoothed log probability estimate of the lexeme's word type (context-independent entry in the vocabulary). |
| `cluster` | int | Brown cluster ID. |
| `sentiment` | float | A scalar value indicating the positivity or negativity of the lexeme. |
diff --git a/website/docs/api/vocab.md b/website/docs/api/vocab.md
index e024ab54a..b851f6882 100644
--- a/website/docs/api/vocab.md
+++ b/website/docs/api/vocab.md
@@ -27,7 +27,7 @@ Create the vocabulary.
| `tag_map` | dict | A dictionary mapping fine-grained tags to coarse-grained parts-of-speech, and optionally morphological attributes. |
| `lemmatizer` | object | A lemmatizer. Defaults to `None`. |
| `strings` | `StringStore` / list | A [`StringStore`](/api/stringstore) that maps strings to hash values, and vice versa, or a list of strings. |
-| `vectors_name` 2.2 | unicode | A name to identify the vectors table. |
+| `vectors_name` 2.2 | str | A name to identify the vectors table. |
| **RETURNS** | `Vocab` | The newly constructed object. |
## Vocab.\_\_len\_\_ {#len tag="method"}
@@ -91,10 +91,10 @@ given string, you need to look it up in
> assert oov not in nlp.vocab
> ```
-| Name | Type | Description |
-| ----------- | ------- | -------------------------------------------------- |
-| `string` | unicode | The ID string. |
-| **RETURNS** | bool | Whether the string has an entry in the vocabulary. |
+| Name | Type | Description |
+| ----------- | ---- | -------------------------------------------------- |
+| `string` | str | The ID string. |
+| **RETURNS** | bool | Whether the string has an entry in the vocabulary. |
## Vocab.add_flag {#add_flag tag="method"}
@@ -117,7 +117,7 @@ using `token.check_flag(flag_id)`.
| Name | Type | Description |
| ------------- | ---- | ----------------------------------------------------------------------------------------------------------------------------------------------- |
-| `flag_getter` | dict | A function `f(unicode) -> bool`, to get the flag value. |
+| `flag_getter` | callable | A function `f(str) -> bool`, to get the flag value. |
| `flag_id` | int | An integer between 1 and 63 (inclusive), specifying the bit at which the flag will be stored. If `-1`, the lowest available bit will be chosen. |
| **RETURNS** | int | The integer ID by which the flag value can be checked. |
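
As a rough sketch of the signature above, assuming the v2.x API (the flag itself is hypothetical):

```python
from spacy.lang.en import English

nlp = English()
# Hypothetical flag: text is fully upper-case and longer than two characters
IS_SHOUTED = nlp.vocab.add_flag(lambda text: text.isupper() and len(text) > 2)
doc = nlp("WOW this is LOUD")
print([t.text for t in doc if t.check_flag(IS_SHOUTED)])  # ['WOW', 'LOUD']
```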
@@ -227,10 +227,10 @@ Save the current state to a directory.
> nlp.vocab.to_disk("/path/to/vocab")
> ```
-| Name | Type | Description |
-| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
-| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
-| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
+| Name | Type | Description |
+| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- |
+| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
+| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
## Vocab.from_disk {#from_disk tag="method" new="2"}
@@ -243,11 +243,11 @@ Loads state from a directory. Modifies the object in place and returns it.
> vocab = Vocab().from_disk("/path/to/vocab")
> ```
-| Name | Type | Description |
-| ----------- | ---------------- | -------------------------------------------------------------------------- |
-| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
-| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
-| **RETURNS** | `Vocab` | The modified `Vocab` object. |
+| Name | Type | Description |
+| ----------- | ------------ | -------------------------------------------------------------------------- |
+| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
+| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
+| **RETURNS** | `Vocab` | The modified `Vocab` object. |
## Vocab.to_bytes {#to_bytes tag="method"}
diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md
index 5f47bd2e3..a84399312 100644
--- a/website/docs/usage/rule-based-matching.md
+++ b/website/docs/usage/rule-based-matching.md
@@ -157,19 +157,19 @@ The available token pattern keys correspond to a number of
[`Token` attributes](/api/token#attributes). The supported attributes for
rule-based matching are:
-| Attribute | Type | Description |
-| -------------------------------------- | ------- | ------------------------------------------------------------------------------------------------------ |
-| `ORTH` | unicode | The exact verbatim text of a token. |
-| `TEXT` 2.1 | unicode | The exact verbatim text of a token. |
-| `LOWER` | unicode | The lowercase form of the token text. |
-| `LENGTH` | int | The length of the token text. |
-| `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | bool | Token text consists of alphabetic characters, ASCII characters, digits. |
-| `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | bool | Token text is in lowercase, uppercase, titlecase. |
-| `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | bool | Token is punctuation, whitespace, stop word. |
-| `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | bool | Token text resembles a number, URL, email. |
-| `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE` | unicode | The token's simple and extended part-of-speech tag, dependency label, lemma, shape. |
-| `ENT_TYPE` | unicode | The token's entity label. |
-| `_` 2.1 | dict | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). |
+| Attribute | Type | Description |
+| -------------------------------------- | ---- | ------------------------------------------------------------------------------------------------------ |
+| `ORTH` | str | The exact verbatim text of a token. |
+| `TEXT` 2.1 | str | The exact verbatim text of a token. |
+| `LOWER` | str | The lowercase form of the token text. |
+| `LENGTH` | int | The length of the token text. |
+| `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | bool | Token text consists of alphabetic characters, ASCII characters, digits. |
+| `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | bool | Token text is in lowercase, uppercase, titlecase. |
+| `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | bool | Token is punctuation, whitespace, stop word. |
+| `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | bool | Token text resembles a number, URL, email. |
+| `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE` | str | The token's simple and extended part-of-speech tag, dependency label, lemma, shape. |
+| `ENT_TYPE` | str | The token's entity label. |
+| `_` 2.1 | dict | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). |
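
As a quick, hedged illustration of how a few of these attributes combine into a token pattern (using the list-of-patterns `Matcher.add` signature available in recent versions):

```python
from spacy.lang.en import English
from spacy.matcher import Matcher

nlp = English()
matcher = Matcher(nlp.vocab)
# One token pattern: lowercase "hello", any punctuation token, lowercase "world"
pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}]
matcher.add("HelloWorld", [pattern])  # older versions: matcher.add("HelloWorld", None, pattern)
doc = nlp("Hello, world!")
for match_id, start, end in matcher(doc):
    print(doc[start:end].text)
```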
@@ -1101,21 +1101,28 @@ powerful model packages with binary weights _and_ rules included!
### Using a large number of phrase patterns {#entityruler-large-phrase-patterns new="2.2.4"}
-When using a large amount of **phrase patterns** (roughly > 10000) it's useful to understand how the `add_patterns` function of the EntityRuler works. For each **phrase pattern**,
-the EntityRuler calls the nlp object to construct a doc object. This happens in case you try
-to add the EntityRuler at the end of an existing pipeline with, for example, a POS tagger and want to
-extract matches based on the pattern's POS signature.
+When using a large number of **phrase patterns** (roughly > 10000), it's useful
+to understand how the `add_patterns` function of the EntityRuler works. For
+each **phrase pattern**, the EntityRuler calls the nlp object to construct a
+Doc object. This is done so that, if you add the EntityRuler at the end of an
+existing pipeline with, for example, a POS tagger, you can still extract
+matches based on the pattern's POS signature.
-In this case you would pass a config value of `phrase_matcher_attr="POS"` for the EntityRuler.
+In this case you would pass a config value of `phrase_matcher_attr="POS"` for
+the EntityRuler.
-Running the full language pipeline across every pattern in a large list scales linearly and can therefore take a long time on large amounts of phrase patterns.
+Running the full language pipeline across every pattern in a large list scales
+linearly and can therefore take a long time for large lists of phrase patterns.
-As of spaCy 2.2.4 the `add_patterns` function has been refactored to use nlp.pipe on all phrase patterns resulting in about a 10x-20x speed up with 5,000-100,000 phrase patterns respectively.
+As of spaCy 2.2.4, the `add_patterns` function has been refactored to use
+`nlp.pipe` on all phrase patterns, resulting in about a 10x-20x speedup with
+5,000-100,000 phrase patterns respectively.
-Even with this speedup (but especially if you're using an older version) the `add_patterns` function can still take a long time.
+Even with this speedup (but especially if you're using an older version) the
+`add_patterns` function can still take a long time.
-An easy workaround to make this function run faster is disabling the other language pipes
-while adding the phrase patterns.
+An easy workaround to make this function run faster is disabling the other
+language pipes while adding the phrase patterns.
```python
entityruler = EntityRuler(nlp)
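# Rough sketch of the workaround described above; the `patterns` list and the
# pipe names here are placeholders, not part of the original example.
patterns = [{"label": "ORG", "pattern": "Apple"}]
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "tagger"]
with nlp.disable_pipes(*other_pipes):
    entityruler.add_patterns(patterns)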
From 262d306eaa5a8715ca5905c8fde341ba65771d09 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Sun, 24 May 2020 17:23:00 +0200
Subject: [PATCH 173/496] unicode -> str consistency
---
website/docs/api/cli.md | 8 +-
website/docs/api/cython-classes.md | 2 +-
website/docs/api/dependencyparser.md | 16 +-
website/docs/api/doc.md | 50 +++---
website/docs/api/entitylinker.md | 18 +--
website/docs/api/entityrecognizer.md | 16 +-
website/docs/api/entityruler.md | 27 ++--
website/docs/api/goldcorpus.md | 10 +-
website/docs/api/goldparse.md | 5 +-
website/docs/api/kb.md | 170 +++++++++++----------
website/docs/api/language.md | 79 +++++-----
website/docs/api/lemmatizer.md | 24 +--
website/docs/api/lookups.md | 50 +++---
website/docs/api/matcher.md | 24 +--
website/docs/api/phrasematcher.md | 16 +-
website/docs/api/pipeline-functions.md | 10 +-
website/docs/api/sentencizer.md | 14 +-
website/docs/api/span.md | 40 ++---
website/docs/api/stringstore.md | 46 +++---
website/docs/api/tagger.md | 26 ++--
website/docs/api/textcategorizer.md | 18 +--
website/docs/api/token.md | 160 +++++++++----------
website/docs/api/tokenizer.md | 88 +++++------
website/docs/api/top-level.md | 126 +++++++--------
website/docs/api/vectors.md | 20 +--
website/docs/usage/linguistic-features.md | 18 +--
website/docs/usage/processing-pipelines.md | 20 +--
website/docs/usage/saving-loading.md | 12 +-
website/docs/usage/visualizers.md | 12 +-
29 files changed, 564 insertions(+), 561 deletions(-)
diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md
index f067ba5a7..d507e13ec 100644
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@@ -504,10 +504,10 @@ tokenization can be provided.
> srsly.write_jsonl("/path/to/text.jsonl", data)
> ```
-| Key | Type | Description |
-| -------- | ------- | ---------------------------------------------------------- |
-| `text` | unicode | The raw input text. Is not required if `tokens` available. |
-| `tokens` | list | Optional tokenization, one string per token. |
+| Key | Type | Description |
+| -------- | ---- | ---------------------------------------------------------- |
+| `text` | str | The raw input text. Not required if `tokens` is available. |
+| `tokens` | list | Optional tokenization, one string per token. |
```json
### Example
diff --git a/website/docs/api/cython-classes.md b/website/docs/api/cython-classes.md
index 77d6fdd10..9dea04284 100644
--- a/website/docs/api/cython-classes.md
+++ b/website/docs/api/cython-classes.md
@@ -170,7 +170,7 @@ vocabulary.
| Name | Type | Description |
| ----------- | ---------------- | ------------------------------------------------------------------------------------------- |
| `mem` | `cymem.Pool` | A memory pool. Allocated memory will be freed once the `Vocab` object is garbage collected. |
-| `string` | unicode | The string of the word to look up. |
+| `string` | str | The string of the word to look up. |
| **RETURNS** | `const LexemeC*` | The lexeme in the vocabulary. |
### Vocab.get_by_orth {#vocab_get_by_orth tag="method"}
diff --git a/website/docs/api/dependencyparser.md b/website/docs/api/dependencyparser.md
index df0df3e38..0980dc2e0 100644
--- a/website/docs/api/dependencyparser.md
+++ b/website/docs/api/dependencyparser.md
@@ -229,9 +229,9 @@ Add a new label to the pipe.
> parser.add_label("MY_LABEL")
> ```
-| Name | Type | Description |
-| ------- | ------- | ----------------- |
-| `label` | unicode | The label to add. |
+| Name | Type | Description |
+| ------- | ---- | ----------------- |
+| `label` | str | The label to add. |
## DependencyParser.to_disk {#to_disk tag="method"}
@@ -244,10 +244,10 @@ Serialize the pipe to disk.
> parser.to_disk("/path/to/parser")
> ```
-| Name | Type | Description |
-| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
-| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
-| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
+| Name | Type | Description |
+| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- |
+| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
+| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
## DependencyParser.from_disk {#from_disk tag="method"}
@@ -262,7 +262,7 @@ Load the pipe from disk. Modifies the object in place and returns it.
| Name | Type | Description |
| ----------- | ------------------ | -------------------------------------------------------------------------- |
-| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
+| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | `DependencyParser` | The modified `DependencyParser` object. |
diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md
index ab85c1deb..75491358d 100644
--- a/website/docs/api/doc.md
+++ b/website/docs/api/doc.md
@@ -123,7 +123,7 @@ details, see the documentation on
| Name | Type | Description |
| --------- | -------- | ----------------------------------------------------------------------------------------------------------------------------------- |
-| `name` | unicode | Name of the attribute to set by the extension. For example, `'my_attr'` will be available as `doc._.my_attr`. |
+| `name` | str | Name of the attribute to set by the extension. For example, `'my_attr'` will be available as `doc._.my_attr`. |
| `default` | - | Optional default value of the attribute if no getter or method is defined. |
| `method` | callable | Set a custom method on the object, for example `doc._.compare(other_doc)`. |
| `getter` | callable | Getter function that takes the object and returns an attribute value. Is called when the user accesses the `._` attribute. |
@@ -145,10 +145,10 @@ Look up a previously registered extension by name. Returns a 4-tuple
> assert extension == (False, None, None, None)
> ```
-| Name | Type | Description |
-| ----------- | ------- | ------------------------------------------------------------- |
-| `name` | unicode | Name of the extension. |
-| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the extension. |
+| Name | Type | Description |
+| ----------- | ----- | ------------------------------------------------------------- |
+| `name` | str | Name of the extension. |
+| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the extension. |
## Doc.has_extension {#has_extension tag="classmethod" new="2"}
@@ -162,10 +162,10 @@ Check whether an extension has been registered on the `Doc` class.
> assert Doc.has_extension('has_city')
> ```
-| Name | Type | Description |
-| ----------- | ------- | ------------------------------------------ |
-| `name` | unicode | Name of the extension to check. |
-| **RETURNS** | bool | Whether the extension has been registered. |
+| Name | Type | Description |
+| ----------- | ---- | ------------------------------------------ |
+| `name` | str | Name of the extension to check. |
+| **RETURNS** | bool | Whether the extension has been registered. |
## Doc.remove_extension {#remove_extension tag="classmethod" new="2.0.12"}
@@ -180,10 +180,10 @@ Remove a previously registered extension.
> assert not Doc.has_extension('has_city')
> ```
-| Name | Type | Description |
-| ----------- | ------- | --------------------------------------------------------------------- |
-| `name` | unicode | Name of the extension. |
-| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the removed extension. |
+| Name | Type | Description |
+| ----------- | ----- | --------------------------------------------------------------------- |
+| `name` | str | Name of the extension. |
+| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the removed extension. |
## Doc.char_span {#char_span tag="method" new="2"}
@@ -368,10 +368,10 @@ Save the current state to a directory.
> doc.to_disk("/path/to/doc")
> ```
-| Name | Type | Description |
-| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
-| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
-| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
+| Name | Type | Description |
+| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- |
+| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
+| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
## Doc.from_disk {#from_disk tag="method" new="2"}
@@ -385,11 +385,11 @@ Loads state from a directory. Modifies the object in place and returns it.
> doc = Doc(Vocab()).from_disk("/path/to/doc")
> ```
-| Name | Type | Description |
-| ----------- | ---------------- | -------------------------------------------------------------------------- |
-| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
-| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
-| **RETURNS** | `Doc` | The modified `Doc` object. |
+| Name | Type | Description |
+| ----------- | ------------ | -------------------------------------------------------------------------- |
+| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
+| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
+| **RETURNS** | `Doc` | The modified `Doc` object. |
## Doc.to_bytes {#to_bytes tag="method"}
@@ -648,15 +648,15 @@ The L2 norm of the document's vector representation.
| Name | Type | Description |
| --------------------------------------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `text` | unicode | A unicode representation of the document text. |
-| `text_with_ws` | unicode | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. |
+| `text` | str | A unicode representation of the document text. |
+| `text_with_ws` | str | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. |
| `mem` | `Pool` | The document's local memory heap, for all C data it owns. |
| `vocab` | `Vocab` | The store of lexical types. |
| `tensor` 2 | `ndarray` | Container for dense vector representations. |
| `cats` 2 | dictionary | Maps either a label to a score for categories applied to whole document, or `(start_char, end_char, label)` to score for categories applied to spans. `start_char` and `end_char` should be character offsets, label can be either a string or an integer ID, and score should be a float. |
| `user_data` | - | A generic storage area, for user custom data. |
| `lang` 2.1 | int | Language of the document's vocabulary. |
-| `lang_` 2.1 | unicode | Language of the document's vocabulary. |
+| `lang_` 2.1 | str | Language of the document's vocabulary. |
| `is_tagged` | bool | A flag indicating that the document has been part-of-speech tagged. Returns `True` if the `Doc` is empty. |
| `is_parsed` | bool | A flag indicating that the document has been syntactically parsed. Returns `True` if the `Doc` is empty. |
| `is_sentenced` | bool | A flag indicating that sentence boundaries have been applied to the document. Returns `True` if the `Doc` is empty. |
diff --git a/website/docs/api/entitylinker.md b/website/docs/api/entitylinker.md
index a9d6a31a5..d7f25ed56 100644
--- a/website/docs/api/entitylinker.md
+++ b/website/docs/api/entitylinker.md
@@ -258,10 +258,10 @@ Serialize the pipe to disk.
> entity_linker.to_disk("/path/to/entity_linker")
> ```
-| Name | Type | Description |
-| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
-| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
-| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
+| Name | Type | Description |
+| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- |
+| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
+| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
## EntityLinker.from_disk {#from_disk tag="method"}
@@ -274,11 +274,11 @@ Load the pipe from disk. Modifies the object in place and returns it.
> entity_linker.from_disk("/path/to/entity_linker")
> ```
-| Name | Type | Description |
-| ----------- | ---------------- | -------------------------------------------------------------------------- |
-| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
-| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
-| **RETURNS** | `EntityLinker` | The modified `EntityLinker` object. |
+| Name | Type | Description |
+| ----------- | -------------- | -------------------------------------------------------------------------- |
+| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
+| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
+| **RETURNS** | `EntityLinker` | The modified `EntityLinker` object. |
## Serialization fields {#serialization-fields}
diff --git a/website/docs/api/entityrecognizer.md b/website/docs/api/entityrecognizer.md
index 9a2766c07..1d0c1de3a 100644
--- a/website/docs/api/entityrecognizer.md
+++ b/website/docs/api/entityrecognizer.md
@@ -230,9 +230,9 @@ Add a new label to the pipe.
> ner.add_label("MY_LABEL")
> ```
-| Name | Type | Description |
-| ------- | ------- | ----------------- |
-| `label` | unicode | The label to add. |
+| Name | Type | Description |
+| ------- | ---- | ----------------- |
+| `label` | str | The label to add. |
## EntityRecognizer.to_disk {#to_disk tag="method"}
@@ -245,10 +245,10 @@ Serialize the pipe to disk.
> ner.to_disk("/path/to/ner")
> ```
-| Name | Type | Description |
-| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
-| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
-| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
+| Name | Type | Description |
+| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- |
+| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
+| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
## EntityRecognizer.from_disk {#from_disk tag="method"}
@@ -263,7 +263,7 @@ Load the pipe from disk. Modifies the object in place and returns it.
| Name | Type | Description |
| ----------- | ------------------ | -------------------------------------------------------------------------- |
-| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
+| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | `EntityRecognizer` | The modified `EntityRecognizer` object. |
diff --git a/website/docs/api/entityruler.md b/website/docs/api/entityruler.md
index 0fd24897d..7bee3a77a 100644
--- a/website/docs/api/entityruler.md
+++ b/website/docs/api/entityruler.md
@@ -72,10 +72,10 @@ Whether a label is present in the patterns.
> assert not "PERSON" in ruler
> ```
-| Name | Type | Description |
-| ----------- | ------- | -------------------------------------------- |
-| `label` | unicode | The label to check. |
-| **RETURNS** | bool | Whether the entity ruler contains the label. |
+| Name | Type | Description |
+| ----------- | ---- | -------------------------------------------- |
+| `label` | str | The label to check. |
+| **RETURNS** | bool | Whether the entity ruler contains the label. |
## EntityRuler.\_\_call\_\_ {#call tag="method"}
@@ -83,8 +83,9 @@ Find matches in the `Doc` and add them to the `doc.ents`. Typically, this
happens automatically after the component has been added to the pipeline using
[`nlp.add_pipe`](/api/language#add_pipe). If the entity ruler was initialized
with `overwrite_ents=True`, existing entities will be replaced if they overlap
-with the matches. When matches overlap in a Doc, the entity ruler prioritizes longer
-patterns over shorter, and if equal the match occuring first in the Doc is chosen.
+with the matches. When matches overlap in a Doc, the entity ruler prioritizes
+longer patterns over shorter ones, and if they are equal in length, the match
+occurring first in the Doc is chosen.
> #### Example
>
@@ -139,9 +140,9 @@ only the patterns are saved as JSONL. If a directory name is provided, a
> ruler.to_disk("/path/to/entity_ruler") # saves patterns and config
> ```
-| Name | Type | Description |
-| ------ | ---------------- | ----------------------------------------------------------------------------------------------------------------------------------- |
-| `path` | unicode / `Path` | A path to a JSONL file or directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
+| Name | Type | Description |
+| ------ | ------------ | ----------------------------------------------------------------------------------------------------------------------------------- |
+| `path` | str / `Path` | A path to a JSONL file or directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
## EntityRuler.from_disk {#from_disk tag="method"}
@@ -158,10 +159,10 @@ configuration.
> ruler.from_disk("/path/to/entity_ruler") # loads patterns and config
> ```
-| Name | Type | Description |
-| ----------- | ---------------- | ---------------------------------------------------------------------------------------- |
-| `path` | unicode / `Path` | A path to a JSONL file or directory. Paths may be either strings or `Path`-like objects. |
-| **RETURNS** | `EntityRuler` | The modified `EntityRuler` object. |
+| Name | Type | Description |
+| ----------- | ------------- | ---------------------------------------------------------------------------------------- |
+| `path` | str / `Path` | A path to a JSONL file or directory. Paths may be either strings or `Path`-like objects. |
+| **RETURNS** | `EntityRuler` | The modified `EntityRuler` object. |
## EntityRuler.to_bytes {#to_bytes tag="method"}
diff --git a/website/docs/api/goldcorpus.md b/website/docs/api/goldcorpus.md
index a18ef4d32..7767b28bd 100644
--- a/website/docs/api/goldcorpus.md
+++ b/website/docs/api/goldcorpus.md
@@ -17,8 +17,8 @@ Create a `GoldCorpus`. IF the input data is an iterable, each item should be a
[`gold.read_json_file`](https://github.com/explosion/spaCy/tree/master/spacy/gold.pyx)
for further details.
-| Name | Type | Description |
-| ----------- | --------------------------- | ------------------------------------------------------------ |
-| `train` | unicode / `Path` / iterable | Training data, as a path (file or directory) or iterable. |
-| `dev` | unicode / `Path` / iterable | Development data, as a path (file or directory) or iterable. |
-| **RETURNS** | `GoldCorpus` | The newly constructed object. |
+| Name | Type | Description |
+| ----------- | ----------------------- | ------------------------------------------------------------ |
+| `train` | str / `Path` / iterable | Training data, as a path (file or directory) or iterable. |
+| `dev` | str / `Path` / iterable | Development data, as a path (file or directory) or iterable. |
+| **RETURNS** | `GoldCorpus` | The newly constructed object. |
diff --git a/website/docs/api/goldparse.md b/website/docs/api/goldparse.md
index 1ef6f0362..2f841eedd 100644
--- a/website/docs/api/goldparse.md
+++ b/website/docs/api/goldparse.md
@@ -62,7 +62,8 @@ Whether the provided syntactic annotations form a projective dependency tree.
Convert a list of Doc objects into the
[JSON-serializable format](/api/annotation#json-input) used by the
-[`spacy train`](/api/cli#train) command. Each input doc will be treated as a 'paragraph' in the output doc.
+[`spacy train`](/api/cli#train) command. Each input doc will be treated as a
+'paragraph' in the output doc.
> #### Example
>
@@ -160,7 +161,7 @@ single-token entity.
| ----------- | -------- | ----------------------------------------------------------------------------------------------------------------------------------------------- |
| `doc` | `Doc` | The document that the entity offsets refer to. The output tags will refer to the token boundaries within the document. |
| `entities` | iterable | A sequence of `(start, end, label)` triples. `start` and `end` should be character-offset integers denoting the slice into the original string. |
-| **RETURNS** | list | Unicode strings, describing the [BILUO](/api/annotation#biluo) tags. |
+| **RETURNS** | list | Strings describing the [BILUO](/api/annotation#biluo) tags, one per token. |
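
A minimal end-to-end sketch of the conversion, assuming the v2.x `spacy.gold` helpers:

```python
from spacy.gold import biluo_tags_from_offsets
from spacy.lang.en import English

nlp = English()
doc = nlp.make_doc("I like London.")
tags = biluo_tags_from_offsets(doc, [(7, 13, "LOC")])
print(tags)  # ['O', 'O', 'U-LOC', 'O']
```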
### gold.offsets_from_biluo_tags {#offsets_from_biluo_tags tag="function"}
diff --git a/website/docs/api/kb.md b/website/docs/api/kb.md
index eeba85e84..f088815fd 100644
--- a/website/docs/api/kb.md
+++ b/website/docs/api/kb.md
@@ -1,16 +1,19 @@
---
title: KnowledgeBase
-teaser: A storage class for entities and aliases of a specific knowledge base (ontology)
+teaser:
+ A storage class for entities and aliases of a specific knowledge base
+ (ontology)
tag: class
source: spacy/kb.pyx
new: 2.2
---
-The `KnowledgeBase` object provides a method to generate [`Candidate`](/api/kb/#candidate_init)
-objects, which are plausible external identifiers given a certain textual mention.
-Each such `Candidate` holds information from the relevant KB entities,
-such as its frequency in text and possible aliases.
-Each entity in the knowledge base also has a pretrained entity vector of a fixed size.
+The `KnowledgeBase` object provides a method to generate
+[`Candidate`](/api/kb/#candidate_init) objects, which are plausible external
+identifiers given a certain textual mention. Each such `Candidate` holds
+information from the relevant KB entities, such as its frequency in text and
+possible aliases. Each entity in the knowledge base also has a pretrained entity
+vector of a fixed size.
## KnowledgeBase.\_\_init\_\_ {#init tag="method"}
@@ -24,25 +27,25 @@ Create the knowledge base.
> kb = KnowledgeBase(vocab=vocab, entity_vector_length=64)
> ```
-| Name | Type | Description |
-| ----------------------- | ---------------- | ----------------------------------------- |
-| `vocab` | `Vocab` | A `Vocab` object. |
-| `entity_vector_length` | int | Length of the fixed-size entity vectors. |
-| **RETURNS** | `KnowledgeBase` | The newly constructed object. |
-
+| Name | Type | Description |
+| ---------------------- | --------------- | ---------------------------------------- |
+| `vocab` | `Vocab` | A `Vocab` object. |
+| `entity_vector_length` | int | Length of the fixed-size entity vectors. |
+| **RETURNS** | `KnowledgeBase` | The newly constructed object. |
## KnowledgeBase.entity_vector_length {#entity_vector_length tag="property"}
The length of the fixed-size entity vectors in the knowledge base.
-| Name | Type | Description |
-| ----------- | ---- | ----------------------------------------- |
-| **RETURNS** | int | Length of the fixed-size entity vectors. |
+| Name | Type | Description |
+| ----------- | ---- | ---------------------------------------- |
+| **RETURNS** | int | Length of the fixed-size entity vectors. |
## KnowledgeBase.add_entity {#add_entity tag="method"}
-Add an entity to the knowledge base, specifying its corpus frequency
-and entity vector, which should be of length [`entity_vector_length`](/api/kb#entity_vector_length).
+Add an entity to the knowledge base, specifying its corpus frequency and entity
+vector, which should be of length
+[`entity_vector_length`](/api/kb#entity_vector_length).
> #### Example
>
@@ -51,16 +54,16 @@ and entity vector, which should be of length [`entity_vector_length`](/api/kb#en
> kb.add_entity(entity="Q463035", freq=111, entity_vector=vector2)
> ```
-| Name | Type | Description |
-| --------------- | ------------- | ------------------------------------------------- |
-| `entity` | unicode | The unique entity identifier |
-| `freq` | float | The frequency of the entity in a typical corpus |
-| `entity_vector` | vector | The pretrained vector of the entity |
+| Name | Type | Description |
+| --------------- | ------ | ----------------------------------------------- |
+| `entity` | str | The unique entity identifier |
+| `freq` | float | The frequency of the entity in a typical corpus |
+| `entity_vector` | vector | The pretrained vector of the entity |
## KnowledgeBase.set_entities {#set_entities tag="method"}
-Define the full list of entities in the knowledge base, specifying the corpus frequency
-and entity vector for each entity.
+Define the full list of entities in the knowledge base, specifying the corpus
+frequency and entity vector for each entity.
> #### Example
>
@@ -68,18 +71,19 @@ and entity vector for each entity.
> kb.set_entities(entity_list=["Q42", "Q463035"], freq_list=[32, 111], vector_list=[vector1, vector2])
> ```
-| Name | Type | Description |
-| ------------- | ------------- | ------------------------------------------------- |
-| `entity_list` | iterable | List of unique entity identifiers |
-| `freq_list` | iterable | List of entity frequencies |
-| `vector_list` | iterable | List of entity vectors |
+| Name | Type | Description |
+| ------------- | -------- | --------------------------------- |
+| `entity_list` | iterable | List of unique entity identifiers |
+| `freq_list` | iterable | List of entity frequencies |
+| `vector_list` | iterable | List of entity vectors |
## KnowledgeBase.add_alias {#add_alias tag="method"}
-Add an alias or mention to the knowledge base, specifying its potential KB identifiers
-and their prior probabilities. The entity identifiers should refer to entities previously
-added with [`add_entity`](/api/kb#add_entity) or [`set_entities`](/api/kb#set_entities).
-The sum of the prior probabilities should not exceed 1.
+Add an alias or mention to the knowledge base, specifying its potential KB
+identifiers and their prior probabilities. The entity identifiers should refer
+to entities previously added with [`add_entity`](/api/kb#add_entity) or
+[`set_entities`](/api/kb#set_entities). The sum of the prior probabilities
+should not exceed 1.
> #### Example
>
@@ -87,11 +91,11 @@ The sum of the prior probabilities should not exceed 1.
> kb.add_alias(alias="Douglas", entities=["Q42", "Q463035"], probabilities=[0.6, 0.3])
> ```
-| Name | Type | Description |
-| -------------- | ------------- | -------------------------------------------------- |
-| `alias` | unicode | The textual mention or alias |
-| `entities` | iterable | The potential entities that the alias may refer to |
-| `probabilities`| iterable | The prior probabilities of each entity |
+| Name | Type | Description |
+| --------------- | -------- | -------------------------------------------------- |
+| `alias` | str | The textual mention or alias |
+| `entities` | iterable | The potential entities that the alias may refer to |
+| `probabilities` | iterable | The prior probabilities of each entity |
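
Putting the methods above together, a small hedged sketch of building a toy knowledge base (the IDs, frequencies and vectors are made up):

```python
from spacy.kb import KnowledgeBase
from spacy.lang.en import English

nlp = English()
kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
kb.add_entity(entity="Q42", freq=32, entity_vector=[1.0, 2.0, 3.0])
kb.add_entity(entity="Q463035", freq=111, entity_vector=[4.0, 5.0, 6.0])
kb.add_alias(alias="Douglas", entities=["Q42", "Q463035"], probabilities=[0.6, 0.3])
candidates = kb.get_candidates("Douglas")
print([(c.entity_, c.prior_prob) for c in candidates])
```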
## KnowledgeBase.\_\_len\_\_ {#len tag="method"}
@@ -117,9 +121,9 @@ Get a list of all entity IDs in the knowledge base.
> all_entities = kb.get_entity_strings()
> ```
-| Name | Type | Description |
-| ----------- | ---- | --------------------------------------------- |
-| **RETURNS** | list | The list of entities in the knowledge base. |
+| Name | Type | Description |
+| ----------- | ---- | ------------------------------------------- |
+| **RETURNS** | list | The list of entities in the knowledge base. |
## KnowledgeBase.get_size_aliases {#get_size_aliases tag="method"}
@@ -131,9 +135,9 @@ Get the total number of aliases in the knowledge base.
> total_aliases = kb.get_size_aliases()
> ```
-| Name | Type | Description |
-| ----------- | ---- | --------------------------------------------- |
-| **RETURNS** | int | The number of aliases in the knowledge base. |
+| Name | Type | Description |
+| ----------- | ---- | -------------------------------------------- |
+| **RETURNS** | int | The number of aliases in the knowledge base. |
## KnowledgeBase.get_alias_strings {#get_alias_strings tag="method"}
@@ -145,9 +149,9 @@ Get a list of all aliases in the knowledge base.
> all_aliases = kb.get_alias_strings()
> ```
-| Name | Type | Description |
-| ----------- | ---- | --------------------------------------------- |
-| **RETURNS** | list | The list of aliases in the knowledge base. |
+| Name | Type | Description |
+| ----------- | ---- | ------------------------------------------ |
+| **RETURNS** | list | The list of aliases in the knowledge base. |
## KnowledgeBase.get_candidates {#get_candidates tag="method"}
@@ -160,10 +164,10 @@ of type [`Candidate`](/api/kb/#candidate_init).
> candidates = kb.get_candidates("Douglas")
> ```
-| Name | Type | Description |
-| ------------- | ------------- | -------------------------------------------------- |
-| `alias` | unicode | The textual mention or alias |
-| **RETURNS** | iterable | The list of relevant `Candidate` objects |
+| Name | Type | Description |
+| ----------- | -------- | ---------------------------------------- |
+| `alias` | str | The textual mention or alias |
+| **RETURNS** | iterable | The list of relevant `Candidate` objects |
## KnowledgeBase.get_vector {#get_vector tag="method"}
@@ -175,15 +179,15 @@ Given a certain entity ID, retrieve its pretrained entity vector.
> vector = kb.get_vector("Q42")
> ```
-| Name | Type | Description |
-| ------------- | ------------- | -------------------------------------------------- |
-| `entity` | unicode | The entity ID |
-| **RETURNS** | vector | The entity vector |
+| Name | Type | Description |
+| ----------- | ------ | ----------------- |
+| `entity` | str | The entity ID |
+| **RETURNS** | vector | The entity vector |
## KnowledgeBase.get_prior_prob {#get_prior_prob tag="method"}
-Given a certain entity ID and a certain textual mention, retrieve
-the prior probability of the fact that the mention links to the entity ID.
+Given a certain entity ID and a certain textual mention, retrieve the prior
+probability that the mention links to the entity ID.
> #### Example
>
@@ -191,11 +195,11 @@ the prior probability of the fact that the mention links to the entity ID.
> probability = kb.get_prior_prob("Q42", "Douglas")
> ```
-| Name | Type | Description |
-| ------------- | ------------- | --------------------------------------------------------------- |
-| `entity` | unicode | The entity ID |
-| `alias` | unicode | The textual mention or alias |
-| **RETURNS** | float | The prior probability of the `alias` referring to the `entity` |
+| Name | Type | Description |
+| ----------- | ----- | -------------------------------------------------------------- |
+| `entity` | str | The entity ID |
+| `alias` | str | The textual mention or alias |
+| **RETURNS** | float | The prior probability of the `alias` referring to the `entity` |
## KnowledgeBase.dump {#dump tag="method"}
@@ -207,14 +211,14 @@ Save the current state of the knowledge base to a directory.
> kb.dump(loc)
> ```
-| Name | Type | Description |
-| ------------- | ---------------- | ------------------------------------------------------------------------------------------------------------------------ |
-| `loc` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
+| Name | Type | Description |
+| ----- | ------------ | --------------------------------------------------------------------------------------------------------------------- |
+| `loc` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
## KnowledgeBase.load_bulk {#load_bulk tag="method"}
-Restore the state of the knowledge base from a given directory. Note that the [`Vocab`](/api/vocab)
-should also be the same as the one used to create the KB.
+Restore the state of the knowledge base from a given directory. Note that the
+[`Vocab`](/api/vocab) should also be the same as the one used to create the KB.
> #### Example
>
@@ -226,18 +230,16 @@ should also be the same as the one used to create the KB.
> kb.load_bulk("/path/to/kb")
> ```
-
-| Name | Type | Description |
-| ----------- | ---------------- | ----------------------------------------------------------------------------------------- |
-| `loc` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
-| **RETURNS** | `KnowledgeBase` | The modified `KnowledgeBase` object. |
-
+| Name | Type | Description |
+| ----------- | --------------- | -------------------------------------------------------------------------- |
+| `loc` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
+| **RETURNS** | `KnowledgeBase` | The modified `KnowledgeBase` object. |
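
Continuing the sketch further up, persistence might look roughly like this (the path is a placeholder):

```python
kb.dump("/path/to/kb")
kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
kb2.load_bulk("/path/to/kb")
```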
## Candidate.\_\_init\_\_ {#candidate_init tag="method"}
Construct a `Candidate` object. Usually this constructor is not called directly,
-but instead these objects are returned by the [`get_candidates`](/api/kb#get_candidates) method
-of a `KnowledgeBase`.
+but instead these objects are returned by the
+[`get_candidates`](/api/kb#get_candidates) method of a `KnowledgeBase`.
> #### Example
>
@@ -257,12 +259,12 @@ of a `KnowledgeBase`.
## Candidate attributes {#candidate_attributes}
-| Name | Type | Description |
-| ---------------------- | ------------ | ------------------------------------------------------------------ |
-| `entity` | int | The entity's unique KB identifier |
-| `entity_` | unicode | The entity's unique KB identifier |
-| `alias` | int | The alias or textual mention |
-| `alias_` | unicode | The alias or textual mention |
-| `prior_prob` | long | The prior probability of the `alias` referring to the `entity` |
-| `entity_freq` | long | The frequency of the entity in a typical corpus |
-| `entity_vector` | vector | The pretrained vector of the entity |
+| Name | Type | Description |
+| --------------- | ------ | -------------------------------------------------------------- |
+| `entity` | int | The entity's unique KB identifier |
+| `entity_` | str | The entity's unique KB identifier |
+| `alias` | int | The alias or textual mention |
+| `alias_` | str | The alias or textual mention |
+| `prior_prob` | long | The prior probability of the `alias` referring to the `entity` |
+| `entity_freq` | long | The frequency of the entity in a typical corpus |
+| `entity_vector` | vector | The pretrained vector of the entity |
diff --git a/website/docs/api/language.md b/website/docs/api/language.md
index 703a0f678..496c89776 100644
--- a/website/docs/api/language.md
+++ b/website/docs/api/language.md
@@ -49,11 +49,11 @@ contain arbitrary whitespace. Alignment into the original string is preserved.
> assert (doc[0].text, doc[0].head.tag_) == ("An", "NN")
> ```
-| Name | Type | Description |
-| ----------- | ------- | --------------------------------------------------------------------------------- |
-| `text` | unicode | The text to be processed. |
-| `disable` | list | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). |
-| **RETURNS** | `Doc` | A container for accessing the annotations. |
+| Name | Type | Description |
+| ----------- | ----- | --------------------------------------------------------------------------------- |
+| `text` | str | The text to be processed. |
+| `disable` | list | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). |
+| **RETURNS** | `Doc` | A container for accessing the annotations. |
@@ -201,7 +201,7 @@ Create a pipeline component from a factory.
| Name | Type | Description |
| ----------- | -------- | ---------------------------------------------------------------------------------- |
-| `name` | unicode | Factory name to look up in [`Language.factories`](/api/language#class-attributes). |
+| `name` | str | Factory name to look up in [`Language.factories`](/api/language#class-attributes). |
| `config` | dict | Configuration parameters to initialize component. |
| **RETURNS** | callable | The pipeline component. |
@@ -224,9 +224,9 @@ take a `Doc` object, modify it and return it. Only one of `before`, `after`,
| Name | Type | Description |
| ----------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `component` | callable | The pipeline component. |
-| `name` | unicode | Name of pipeline component. Overwrites existing `component.name` attribute if available. If no `name` is set and the component exposes no name attribute, `component.__name__` is used. An error is raised if the name already exists in the pipeline. |
-| `before` | unicode | Component name to insert component directly before. |
-| `after` | unicode | Component name to insert component directly after: |
+| `name` | str | Name of pipeline component. Overwrites existing `component.name` attribute if available. If no `name` is set and the component exposes no name attribute, `component.__name__` is used. An error is raised if the name already exists in the pipeline. |
+| `before` | str | Component name to insert component directly before. |
+| `after` | str | Component name to insert component directly after. |
| `first` | bool | Insert component first / not first in the pipeline. |
| `last` | bool | Insert component last / not last in the pipeline. |
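
For example, a minimal custom component added with the pre-v3 callable-based API documented above (the component itself is hypothetical):

```python
from spacy.lang.en import English

def print_length(doc):
    # Toy component: report the number of tokens and pass the Doc through
    print("tokens:", len(doc))
    return doc

nlp = English()
nlp.add_pipe(print_length, name="print_length", last=True)
doc = nlp("This is a sentence.")
```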
@@ -243,10 +243,10 @@ Check whether a component is present in the pipeline. Equivalent to
> assert nlp.has_pipe("component")
> ```
-| Name | Type | Description |
-| ----------- | ------- | -------------------------------------------------------- |
-| `name` | unicode | Name of the pipeline component to check. |
-| **RETURNS** | bool | Whether a component of that name exists in the pipeline. |
+| Name | Type | Description |
+| ----------- | ---- | -------------------------------------------------------- |
+| `name` | str | Name of the pipeline component to check. |
+| **RETURNS** | bool | Whether a component of that name exists in the pipeline. |
## Language.get_pipe {#get_pipe tag="method" new="2"}
@@ -261,7 +261,7 @@ Get a pipeline component for a given component name.
| Name | Type | Description |
| ----------- | -------- | -------------------------------------- |
-| `name` | unicode | Name of the pipeline component to get. |
+| `name` | str | Name of the pipeline component to get. |
| **RETURNS** | callable | The pipeline component. |
## Language.replace_pipe {#replace_pipe tag="method" new="2"}
@@ -276,7 +276,7 @@ Replace a component in the pipeline.
| Name | Type | Description |
| ----------- | -------- | --------------------------------- |
-| `name` | unicode | Name of the component to replace. |
+| `name` | str | Name of the component to replace. |
| `component` | callable | The pipeline component to insert. |
## Language.rename_pipe {#rename_pipe tag="method" new="2"}
@@ -292,10 +292,10 @@ added to the pipeline, you can also use the `name` argument on
> nlp.rename_pipe("parser", "spacy_parser")
> ```
-| Name | Type | Description |
-| ---------- | ------- | -------------------------------- |
-| `old_name` | unicode | Name of the component to rename. |
-| `new_name` | unicode | New name of the component. |
+| Name | Type | Description |
+| ---------- | ---- | -------------------------------- |
+| `old_name` | str | Name of the component to rename. |
+| `new_name` | str | New name of the component. |
## Language.remove_pipe {#remove_pipe tag="method" new="2"}
@@ -309,10 +309,10 @@ component function.
> assert name == "parser"
> ```
-| Name | Type | Description |
-| ----------- | ------- | ----------------------------------------------------- |
-| `name` | unicode | Name of the component to remove. |
-| **RETURNS** | tuple | A `(name, component)` tuple of the removed component. |
+| Name | Type | Description |
+| ----------- | ----- | ----------------------------------------------------- |
+| `name` | str | Name of the component to remove. |
+| **RETURNS** | tuple | A `(name, component)` tuple of the removed component. |
## Language.select_pipes {#select_pipes tag="contextmanager, method" new="3"}
@@ -342,12 +342,11 @@ latter case, all components not in the `enable` list, will be disabled.
| Name | Type | Description |
| ----------- | --------------- | ------------------------------------------------------------------------------------ |
| `disable` | list | Names of pipeline components to disable. |
-| `disable` | unicode | Name of pipeline component to disable. |
+| `disable` | str | Name of pipeline component to disable. |
| `enable` | list | Names of pipeline components that will not be disabled. |
-| `enable` | unicode | Name of pipeline component that will not be disabled. |
+| `enable` | str | Name of pipeline component that will not be disabled. |
| **RETURNS** | `DisabledPipes` | The disabled pipes that can be restored by calling the object's `.restore()` method. |
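The two calling styles above can be sketched as follows, assuming a pipeline that has `tagger`, `parser` and `ner` components:

```python
import spacy

nlp = spacy.load("en_core_web_sm")  # assumed installed

# as a context manager: the disabled components are restored on exit
with nlp.select_pipes(disable=["tagger", "parser"]):
    doc = nlp("Only the remaining components run here.")

# or imperatively, restoring by hand later
disabled = nlp.select_pipes(enable="ner")
nlp("Only the entity recognizer runs on this text.")
disabled.restore()
```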
-
As of spaCy v3.0, the `disable_pipes` method has been renamed to `select_pipes`:
@@ -370,10 +369,10 @@ the model**.
> nlp.to_disk("/path/to/models")
> ```
-| Name | Type | Description |
-| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
-| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
-| `exclude` | list | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. |
+| Name | Type | Description |
+| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- |
+| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
+| `exclude` | list | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. |
## Language.from_disk {#from_disk tag="method" new="2"}
@@ -395,11 +394,11 @@ loaded object.
> nlp = English().from_disk("/path/to/en_model")
> ```
-| Name | Type | Description |
-| ----------- | ---------------- | ----------------------------------------------------------------------------------------- |
-| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
-| `exclude` | list | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. |
-| **RETURNS** | `Language` | The modified `Language` object. |
+| Name | Type | Description |
+| ----------- | ------------ | ----------------------------------------------------------------------------------------- |
+| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
+| `exclude` | list | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. |
+| **RETURNS** | `Language` | The modified `Language` object. |
@@ -480,11 +479,11 @@ per component.
## Class attributes {#class-attributes}
-| Name | Type | Description |
-| -------------------------------------- | ------- | ----------------------------------------------------------------------------------------------------------------------------------- |
-| `Defaults` | class | Settings, data and factory methods for creating the `nlp` object and processing pipeline. |
-| `lang` | unicode | Two-letter language ID, i.e. [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). |
-| `factories` 2 | dict | Factories that create pre-defined pipeline components, e.g. the tagger, parser or entity recognizer, keyed by their component name. |
+| Name | Type | Description |
+| -------------------------------------- | ----- | ----------------------------------------------------------------------------------------------------------------------------------- |
+| `Defaults` | class | Settings, data and factory methods for creating the `nlp` object and processing pipeline. |
+| `lang` | str | Two-letter language ID, i.e. [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). |
+| `factories` 2 | dict | Factories that create pre-defined pipeline components, e.g. the tagger, parser or entity recognizer, keyed by their component name. |
## Serialization fields {#serialization-fields}
diff --git a/website/docs/api/lemmatizer.md b/website/docs/api/lemmatizer.md
index f43e17fd3..16cd624f5 100644
--- a/website/docs/api/lemmatizer.md
+++ b/website/docs/api/lemmatizer.md
@@ -63,8 +63,8 @@ Lemmatize a string.
| Name | Type | Description |
| ------------ | ------------- | -------------------------------------------------------------------------------------------------------- |
-| `string` | unicode | The string to lemmatize, e.g. the token text. |
-| `univ_pos` | unicode / int | The token's universal part-of-speech tag. |
+| `string` | str | The string to lemmatize, e.g. the token text. |
+| `univ_pos` | str / int | The token's universal part-of-speech tag. |
| `morphology` | dict / `None` | Morphological features following the [Universal Dependencies](http://universaldependencies.org/) scheme. |
| **RETURNS** | list | The available lemmas for the string. |
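As a sketch of the call signature above, here is a lookup-only lemmatizer built from illustrative data (the table contents are made up for the example):

```python
from spacy.lemmatizer import Lemmatizer
from spacy.lookups import Lookups

lookups = Lookups()
lookups.add_table("lemma_lookup", {"ducks": "duck"})
lemmatizer = Lemmatizer(lookups)

# the string plus its universal POS tag
lemmas = lemmatizer("ducks", "NOUN")
# with only a lookup table, this is expected to yield ["duck"]
```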
@@ -82,11 +82,11 @@ original string is returned. Languages can provide a
> assert lemmatizer.lookup("going") == "go"
> ```
-| Name | Type | Description |
-| ----------- | ------- | ----------------------------------------------------------------------------------------------------------- |
-| `string` | unicode | The string to look up. |
-| `orth` | int | Optional hash of the string to look up. If not set, the string will be used and hashed. Defaults to `None`. |
-| **RETURNS** | unicode | The lemma if the string was found, otherwise the original string. |
+| Name | Type | Description |
+| ----------- | ---- | ----------------------------------------------------------------------------------------------------------- |
+| `string` | str | The string to look up. |
+| `orth` | int | Optional hash of the string to look up. If not set, the string will be used and hashed. Defaults to `None`. |
+| **RETURNS** | str | The lemma if the string was found, otherwise the original string. |
## Lemmatizer.is_base_form {#is_base_form tag="method"}
@@ -102,11 +102,11 @@ lemmatization entirely.
> assert is_base_form == True
> ```
-| Name | Type | Description |
-| ------------ | ------------- | --------------------------------------------------------------------------------------- |
-| `univ_pos` | unicode / int | The token's universal part-of-speech tag. |
-| `morphology` | dict | The token's morphological features. |
-| **RETURNS** | bool | Whether the token's part-of-speech tag and morphological features describe a base form. |
+| Name | Type | Description |
+| ------------ | --------- | --------------------------------------------------------------------------------------- |
+| `univ_pos` | str / int | The token's universal part-of-speech tag. |
+| `morphology` | dict | The token's morphological features. |
+| **RETURNS** | bool | Whether the token's part-of-speech tag and morphological features describe a base form. |
## Attributes {#attributes}
diff --git a/website/docs/api/lookups.md b/website/docs/api/lookups.md
index bd3b38303..b91d92646 100644
--- a/website/docs/api/lookups.md
+++ b/website/docs/api/lookups.md
@@ -56,10 +56,10 @@ Check if the lookups contain a table of a given name. Delegates to
> assert "some_table" in lookups
> ```
-| Name | Type | Description |
-| ----------- | ------- | ----------------------------------------------- |
-| `name` | unicode | Name of the table. |
-| **RETURNS** | bool | Whether a table of that name is in the lookups. |
+| Name | Type | Description |
+| ----------- | ---- | ----------------------------------------------- |
+| `name` | str | Name of the table. |
+| **RETURNS** | bool | Whether a table of that name is in the lookups. |
## Lookups.tables {#tables tag="property"}
@@ -91,7 +91,7 @@ exists.
| Name | Type | Description |
| ----------- | ----------------------------- | ---------------------------------- |
-| `name` | unicode | Unique name of the table. |
+| `name` | str | Unique name of the table. |
| `data` | dict | Optional data to add to the table. |
| **RETURNS** | [`Table`](/api/lookups#table) | The newly added table. |
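A short sketch tying `add_table` and `get_table` together; the table name and data are illustrative:

```python
from spacy.lookups import Lookups

lookups = Lookups()
table = lookups.add_table("my_table", {"going": "go"})  # name + optional data
assert "my_table" in lookups
assert lookups.get_table("my_table")["going"] == "go"
```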
@@ -110,7 +110,7 @@ Get a table from the lookups. Raises an error if the table doesn't exist.
| Name | Type | Description |
| ----------- | ----------------------------- | ------------------ |
-| `name` | unicode | Name of the table. |
+| `name` | str | Name of the table. |
| **RETURNS** | [`Table`](/api/lookups#table) | The table. |
## Lookups.remove_table {#remove_table tag="method"}
@@ -128,7 +128,7 @@ Remove a table from the lookups. Raises an error if the table doesn't exist.
| Name | Type | Description |
| ----------- | ----------------------------- | ---------------------------- |
-| `name` | unicode | Name of the table to remove. |
+| `name` | str | Name of the table to remove. |
| **RETURNS** | [`Table`](/api/lookups#table) | The removed table. |
## Lookups.has_table {#has_table tag="method"}
@@ -144,10 +144,10 @@ Check if the lookups contain a table of a given name. Equivalent to
> assert lookups.has_table("some_table")
> ```
-| Name | Type | Description |
-| ----------- | ------- | ----------------------------------------------- |
-| `name` | unicode | Name of the table. |
-| **RETURNS** | bool | Whether a table of that name is in the lookups. |
+| Name | Type | Description |
+| ----------- | ---- | ----------------------------------------------- |
+| `name` | str | Name of the table. |
+| **RETURNS** | bool | Whether a table of that name is in the lookups. |
## Lookups.to_bytes {#to_bytes tag="method"}
@@ -191,9 +191,9 @@ which will be created if it doesn't exist.
> lookups.to_disk("/path/to/lookups")
> ```
-| Name | Type | Description |
-| ------ | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
-| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
+| Name | Type | Description |
+| ------ | ------------ | --------------------------------------------------------------------------------------------------------------------- |
+| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
## Lookups.from_disk {#from_disk tag="method"}
@@ -208,10 +208,10 @@ the file doesn't exist.
> lookups.from_disk("/path/to/lookups")
> ```
-| Name | Type | Description |
-| ----------- | ---------------- | -------------------------------------------------------------------------- |
-| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
-| **RETURNS** | `Lookups` | The loaded lookups. |
+| Name | Type | Description |
+| ----------- | ------------ | -------------------------------------------------------------------------- |
+| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
+| **RETURNS** | `Lookups` | The loaded lookups. |
## Table {#table tag="class, ordereddict"}
@@ -238,7 +238,7 @@ Initialize a new table.
| Name | Type | Description |
| ----------- | ------- | ---------------------------------- |
-| `name` | unicode | Optional table name for reference. |
+| `name` | str | Optional table name for reference. |
| **RETURNS** | `Table` | The newly constructed object. |
### Table.from_dict {#table.from_dict tag="classmethod"}
@@ -256,7 +256,7 @@ Initialize a new table from a dict.
| Name | Type | Description |
| ----------- | ------- | ---------------------------------- |
| `data` | dict | The dictionary. |
-| `name` | unicode | Optional table name for reference. |
+| `name` | str | Optional table name for reference. |
| **RETURNS** | `Table` | The newly constructed object. |
### Table.set {#table.set tag="method"}
@@ -273,10 +273,10 @@ Set a new key / value pair. String keys will be hashed. Same as
> assert table["foo"] == "bar"
> ```
-| Name | Type | Description |
-| ------- | ------------- | ----------- |
-| `key` | unicode / int | The key. |
-| `value` | - | The value. |
+| Name | Type | Description |
+| ------- | --------- | ----------- |
+| `key` | str / int | The key. |
+| `value` | - | The value. |
### Table.to_bytes {#table.to_bytes tag="method"}
@@ -313,6 +313,6 @@ Load a table from a bytestring.
| Name | Type | Description |
| -------------- | --------------------------- | ----------------------------------------------------- |
-| `name` | unicode | Table name. |
+| `name` | str | Table name. |
| `default_size` | int | Default size of bloom filters if no data is provided. |
| `bloom` | `preshed.bloom.BloomFilter` | The bloom filters. |
diff --git a/website/docs/api/matcher.md b/website/docs/api/matcher.md
index bfd4fb0ec..8a872558c 100644
--- a/website/docs/api/matcher.md
+++ b/website/docs/api/matcher.md
@@ -125,10 +125,10 @@ Check whether the matcher contains rules for a match ID.
> assert 'Rule' in matcher
> ```
-| Name | Type | Description |
-| ----------- | ------- | ----------------------------------------------------- |
-| `key` | unicode | The match ID. |
-| **RETURNS** | bool | Whether the matcher contains rules for this match ID. |
+| Name | Type | Description |
+| ----------- | ---- | ----------------------------------------------------- |
+| `key` | str | The match ID. |
+| **RETURNS** | bool | Whether the matcher contains rules for this match ID. |
## Matcher.add {#add tag="method" new="2"}
@@ -153,7 +153,7 @@ overwritten.
| Name | Type | Description |
| ----------- | ------------------ | --------------------------------------------------------------------------------------------- |
-| `match_id` | unicode | An ID for the thing you're matching. |
+| `match_id` | str | An ID for the thing you're matching. |
| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
| `*patterns` | list | Match pattern. A pattern consists of a list of dicts, where each dict describes a token. |
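A minimal sketch of the signature documented above (match ID, optional `on_match` callback, then one or more patterns); the pattern itself is illustrative:

```python
from spacy.matcher import Matcher
from spacy.lang.en import English

nlp = English()  # a blank pipeline is enough for token-pattern matching
matcher = Matcher(nlp.vocab)
pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}]
matcher.add("HelloWorld", None, pattern)

doc = nlp("Hello, world!")
matches = matcher(doc)  # list of (match_id, start, end) tuples
```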
@@ -188,9 +188,9 @@ exist.
> assert "Rule" not in matcher
> ```
-| Name | Type | Description |
-| ----- | ------- | ------------------------- |
-| `key` | unicode | The ID of the match rule. |
+| Name | Type | Description |
+| ----- | ---- | ------------------------- |
+| `key` | str | The ID of the match rule. |
## Matcher.get {#get tag="method" new="2"}
@@ -204,7 +204,7 @@ Retrieve the pattern stored for a key. Returns the rule as an
> on_match, patterns = matcher.get("Rule")
> ```
-| Name | Type | Description |
-| ----------- | ------- | --------------------------------------------- |
-| `key` | unicode | The ID of the match rule. |
-| **RETURNS** | tuple | The rule, as an `(on_match, patterns)` tuple. |
+| Name | Type | Description |
+| ----------- | ----- | --------------------------------------------- |
+| `key` | str | The ID of the match rule. |
+| **RETURNS** | tuple | The rule, as an `(on_match, patterns)` tuple. |
diff --git a/website/docs/api/phrasematcher.md b/website/docs/api/phrasematcher.md
index a72277420..fa6729f41 100644
--- a/website/docs/api/phrasematcher.md
+++ b/website/docs/api/phrasematcher.md
@@ -133,10 +133,10 @@ Check whether the matcher contains rules for a match ID.
> assert "OBAMA" in matcher
> ```
-| Name | Type | Description |
-| ----------- | ------- | ----------------------------------------------------- |
-| `key` | unicode | The match ID. |
-| **RETURNS** | bool | Whether the matcher contains rules for this match ID. |
+| Name | Type | Description |
+| ----------- | ---- | ----------------------------------------------------- |
+| `key` | str | The match ID. |
+| **RETURNS** | bool | Whether the matcher contains rules for this match ID. |
## PhraseMatcher.add {#add tag="method"}
@@ -162,7 +162,7 @@ overwritten.
| Name | Type | Description |
| ---------- | ------------------ | --------------------------------------------------------------------------------------------- |
-| `match_id` | unicode | An ID for the thing you're matching. |
+| `match_id` | str | An ID for the thing you're matching. |
| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
| `*docs` | `Doc` | `Doc` objects of the phrases to match. |
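The same pattern applies here, except that the rules are `Doc` objects of the phrases to match; a minimal sketch with an illustrative phrase:

```python
from spacy.matcher import PhraseMatcher
from spacy.lang.en import English

nlp = English()
matcher = PhraseMatcher(nlp.vocab)
# signature as documented above: match ID, on_match callback (or None), *docs
matcher.add("OBAMA", None, nlp("Barack Obama"))
matches = matcher(nlp("Barack Obama visited Berlin"))  # (match_id, start, end) tuples
```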
@@ -198,6 +198,6 @@ does not exist.
> assert "OBAMA" not in matcher
> ```
-| Name | Type | Description |
-| ----- | ------- | ------------------------- |
-| `key` | unicode | The ID of the match rule. |
+| Name | Type | Description |
+| ----- | ---- | ------------------------- |
+| `key` | str | The ID of the match rule. |
diff --git a/website/docs/api/pipeline-functions.md b/website/docs/api/pipeline-functions.md
index 6e2b473b1..fc417845c 100644
--- a/website/docs/api/pipeline-functions.md
+++ b/website/docs/api/pipeline-functions.md
@@ -112,8 +112,8 @@ end of the pipeline and after all other components.
-| Name | Type | Description |
-| ----------- | ------- | ------------------------------------------------------------ |
-| `doc` | `Doc` | The `Doc` object to process, e.g. the `Doc` in the pipeline. |
-| `label` | unicode | The subtoken dependency label. Defaults to `"subtok"`. |
-| **RETURNS** | `Doc` | The modified `Doc` with merged subtokens. |
+| Name | Type | Description |
+| ----------- | ----- | ------------------------------------------------------------ |
+| `doc` | `Doc` | The `Doc` object to process, e.g. the `Doc` in the pipeline. |
+| `label` | str | The subtoken dependency label. Defaults to `"subtok"`. |
+| **RETURNS** | `Doc` | The modified `Doc` with merged subtokens. |
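A hedged sketch of calling the function directly on a processed `Doc`; the import path is assumed to follow the spaCy v2.x layout, and `en_core_web_sm` is assumed to be installed:

```python
import spacy
from spacy.pipeline import merge_subtokens  # assumed import path (v2.x layout)

nlp = spacy.load("en_core_web_sm")  # any pipeline with a parser
doc = nlp("I like New York")
# merges spans of tokens connected by the given dependency label;
# if the parser predicts no "subtok" relations, the Doc is returned unchanged
doc = merge_subtokens(doc, label="subtok")
```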
diff --git a/website/docs/api/sentencizer.md b/website/docs/api/sentencizer.md
index c9b935f22..03e843fcc 100644
--- a/website/docs/api/sentencizer.md
+++ b/website/docs/api/sentencizer.md
@@ -81,9 +81,9 @@ a file `sentencizer.json`. This also happens automatically when you save an
> sentencizer.to_disk("/path/to/sentencizer.json")
> ```
-| Name | Type | Description |
-| ------ | ---------------- | ---------------------------------------------------------------------------------------------------------------- |
-| `path` | unicode / `Path` | A path to a file, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
+| Name | Type | Description |
+| ------ | ------------ | ---------------------------------------------------------------------------------------------------------------- |
+| `path` | str / `Path` | A path to a file, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
## Sentencizer.from_disk {#from_disk tag="method"}
@@ -98,10 +98,10 @@ added to its pipeline.
> sentencizer.from_disk("/path/to/sentencizer.json")
> ```
-| Name | Type | Description |
-| ----------- | ---------------- | -------------------------------------------------------------------------- |
-| `path` | unicode / `Path` | A path to a JSON file. Paths may be either strings or `Path`-like objects. |
-| **RETURNS** | `Sentencizer` | The modified `Sentencizer` object. |
+| Name | Type | Description |
+| ----------- | ------------- | -------------------------------------------------------------------------- |
+| `path` | str / `Path` | A path to a JSON file. Paths may be either strings or `Path`-like objects. |
+| **RETURNS** | `Sentencizer` | The modified `Sentencizer` object. |
## Sentencizer.to_bytes {#to_bytes tag="method"}
diff --git a/website/docs/api/span.md b/website/docs/api/span.md
index 3833bbca9..c41d9aa03 100644
--- a/website/docs/api/span.md
+++ b/website/docs/api/span.md
@@ -110,7 +110,7 @@ For details, see the documentation on
| Name | Type | Description |
| --------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------- |
-| `name` | unicode | Name of the attribute to set by the extension. For example, `'my_attr'` will be available as `span._.my_attr`. |
+| `name` | str | Name of the attribute to set by the extension. For example, `'my_attr'` will be available as `span._.my_attr`. |
| `default` | - | Optional default value of the attribute if no getter or method is defined. |
| `method` | callable | Set a custom method on the object, for example `span._.compare(other_span)`. |
| `getter` | callable | Getter function that takes the object and returns an attribute value. Is called when the user accesses the `._` attribute. |
@@ -132,10 +132,10 @@ Look up a previously registered extension by name. Returns a 4-tuple
> assert extension == (False, None, None, None)
> ```
-| Name | Type | Description |
-| ----------- | ------- | ------------------------------------------------------------- |
-| `name` | unicode | Name of the extension. |
-| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the extension. |
+| Name | Type | Description |
+| ----------- | ----- | ------------------------------------------------------------- |
+| `name` | str | Name of the extension. |
+| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the extension. |
## Span.has_extension {#has_extension tag="classmethod" new="2"}
@@ -149,10 +149,10 @@ Check whether an extension has been registered on the `Span` class.
> assert Span.has_extension("is_city")
> ```
-| Name | Type | Description |
-| ----------- | ------- | ------------------------------------------ |
-| `name` | unicode | Name of the extension to check. |
-| **RETURNS** | bool | Whether the extension has been registered. |
+| Name | Type | Description |
+| ----------- | ---- | ------------------------------------------ |
+| `name` | str | Name of the extension to check. |
+| **RETURNS** | bool | Whether the extension has been registered. |
## Span.remove_extension {#remove_extension tag="classmethod" new="2.0.12"}
@@ -167,10 +167,10 @@ Remove a previously registered extension.
> assert not Span.has_extension("is_city")
> ```
-| Name | Type | Description |
-| ----------- | ------- | --------------------------------------------------------------------- |
-| `name` | unicode | Name of the extension. |
-| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the removed extension. |
+| Name | Type | Description |
+| ----------- | ----- | --------------------------------------------------------------------- |
+| `name` | str | Name of the extension. |
+| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the removed extension. |
## Span.char_span {#char_span tag="method" new="2.2.4"}
@@ -497,16 +497,16 @@ The L2 norm of the span's vector representation.
| `end` | int | The token offset for the end of the span. |
| `start_char` | int | The character offset for the start of the span. |
| `end_char` | int | The character offset for the end of the span. |
-| `text` | unicode | A unicode representation of the span text. |
-| `text_with_ws` | unicode | The text content of the span with a trailing whitespace character if the last token has one. |
+| `text` | str | A unicode representation of the span text. |
+| `text_with_ws` | str | The text content of the span with a trailing whitespace character if the last token has one. |
| `orth` | int | ID of the verbatim text content. |
-| `orth_` | unicode | Verbatim text content (identical to `Span.text`). Exists mostly for consistency with the other attributes. |
+| `orth_` | str | Verbatim text content (identical to `Span.text`). Exists mostly for consistency with the other attributes. |
| `label` | int | The hash value of the span's label. |
-| `label_` | unicode | The span's label. |
-| `lemma_` | unicode | The span's lemma. |
+| `label_` | str | The span's label. |
+| `lemma_` | str | The span's lemma. |
| `kb_id` | int | The hash value of the knowledge base ID referred to by the span. |
-| `kb_id_` | unicode | The knowledge base ID referred to by the span. |
+| `kb_id_` | str | The knowledge base ID referred to by the span. |
| `ent_id` | int | The hash value of the named entity the token is an instance of. |
-| `ent_id_` | unicode | The string ID of the named entity the token is an instance of. |
+| `ent_id_` | str | The string ID of the named entity the token is an instance of. |
| `sentiment` | float | A scalar value indicating the positivity or negativity of the span. |
| `_` | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). |
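To illustrate the hash/string attribute pairs in the table above, here is a small sketch; it assumes a pipeline with an entity recognizer such as `en_core_web_sm`:

```python
import spacy

nlp = spacy.load("en_core_web_sm")  # assumed installed
doc = nlp("Apple is opening a new office in London")
for ent in doc.ents:  # entity spans
    # integer hash IDs vs. their trailing-underscore string counterparts
    print(ent.text, ent.label, ent.label_)
```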
diff --git a/website/docs/api/stringstore.md b/website/docs/api/stringstore.md
index 268f19125..922174c78 100644
--- a/website/docs/api/stringstore.md
+++ b/website/docs/api/stringstore.md
@@ -55,7 +55,7 @@ Retrieve a string from a given hash, or vice versa.
| Name | Type | Description |
| -------------- | ------------------------ | -------------------------- |
| `string_or_id` | bytes, unicode or uint64 | The value to encode. |
-| **RETURNS** | unicode or int | The value to be retrieved. |
+| **RETURNS** | str or int | The value to be retrieved. |
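A quick sketch of the two-way lookup: passing a string returns its hash, and passing the hash returns the string.

```python
from spacy.strings import StringStore

stringstore = StringStore(["apple", "orange"])
apple_hash = stringstore["apple"]          # str -> uint64 hash
assert stringstore[apple_hash] == "apple"  # hash -> str
```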
## StringStore.\_\_contains\_\_ {#contains tag="method"}
@@ -69,10 +69,10 @@ Check whether a string is in the store.
> assert not "cherry" in stringstore
> ```
-| Name | Type | Description |
-| ----------- | ------- | -------------------------------------- |
-| `string` | unicode | The string to check. |
-| **RETURNS** | bool | Whether the store contains the string. |
+| Name | Type | Description |
+| ----------- | ---- | -------------------------------------- |
+| `string` | str | The string to check. |
+| **RETURNS** | bool | Whether the store contains the string. |
## StringStore.\_\_iter\_\_ {#iter tag="method"}
@@ -87,9 +87,9 @@ store will always include an empty string `''` at position `0`.
> assert all_strings == ["apple", "orange"]
> ```
-| Name | Type | Description |
-| ---------- | ------- | ---------------------- |
-| **YIELDS** | unicode | A string in the store. |
+| Name | Type | Description |
+| ---------- | ---- | ---------------------- |
+| **YIELDS** | str | A string in the store. |
## StringStore.add {#add tag="method" new="2"}
@@ -106,10 +106,10 @@ Add a string to the `StringStore`.
> assert stringstore["banana"] == banana_hash
> ```
-| Name | Type | Description |
-| ----------- | ------- | ------------------------ |
-| `string` | unicode | The string to add. |
-| **RETURNS** | uint64 | The string's hash value. |
+| Name | Type | Description |
+| ----------- | ------ | ------------------------ |
+| `string` | str | The string to add. |
+| **RETURNS** | uint64 | The string's hash value. |
## StringStore.to_disk {#to_disk tag="method" new="2"}
@@ -121,9 +121,9 @@ Save the current state to a directory.
> stringstore.to_disk("/path/to/strings")
> ```
-| Name | Type | Description |
-| ------ | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
-| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
+| Name | Type | Description |
+| ------ | ------------ | --------------------------------------------------------------------------------------------------------------------- |
+| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
## StringStore.from_disk {#from_disk tag="method" new="2"}
@@ -136,10 +136,10 @@ Loads state from a directory. Modifies the object in place and returns it.
> stringstore = StringStore().from_disk("/path/to/strings")
> ```
-| Name | Type | Description |
-| ----------- | ---------------- | -------------------------------------------------------------------------- |
-| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
-| **RETURNS** | `StringStore` | The modified `StringStore` object. |
+| Name | Type | Description |
+| ----------- | ------------- | -------------------------------------------------------------------------- |
+| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
+| **RETURNS** | `StringStore` | The modified `StringStore` object. |
## StringStore.to_bytes {#to_bytes tag="method"}
@@ -185,7 +185,7 @@ Get a 64-bit hash for a given string.
> assert hash_string("apple") == 8566208034543834098
> ```
-| Name | Type | Description |
-| ----------- | ------- | ------------------- |
-| `string` | unicode | The string to hash. |
-| **RETURNS** | uint64 | The hash. |
+| Name | Type | Description |
+| ----------- | ------ | ------------------- |
+| `string` | str | The string to hash. |
+| **RETURNS** | uint64 | The hash. |
diff --git a/website/docs/api/tagger.md b/website/docs/api/tagger.md
index bd3382f89..f14da3ac5 100644
--- a/website/docs/api/tagger.md
+++ b/website/docs/api/tagger.md
@@ -229,10 +229,10 @@ Add a new label to the pipe.
> tagger.add_label("MY_LABEL", {POS: 'NOUN'})
> ```
-| Name | Type | Description |
-| -------- | ------- | --------------------------------------------------------------- |
-| `label` | unicode | The label to add. |
-| `values` | dict | Optional values to map to the label, e.g. a tag map dictionary. |
+| Name | Type | Description |
+| -------- | ---- | --------------------------------------------------------------- |
+| `label` | str | The label to add. |
+| `values` | dict | Optional values to map to the label, e.g. a tag map dictionary. |
## Tagger.to_disk {#to_disk tag="method"}
@@ -245,10 +245,10 @@ Serialize the pipe to disk.
> tagger.to_disk("/path/to/tagger")
> ```
-| Name | Type | Description |
-| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
-| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
-| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
+| Name | Type | Description |
+| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- |
+| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
+| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
## Tagger.from_disk {#from_disk tag="method"}
@@ -261,11 +261,11 @@ Load the pipe from disk. Modifies the object in place and returns it.
> tagger.from_disk("/path/to/tagger")
> ```
-| Name | Type | Description |
-| ----------- | ---------------- | -------------------------------------------------------------------------- |
-| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
-| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
-| **RETURNS** | `Tagger` | The modified `Tagger` object. |
+| Name | Type | Description |
+| ----------- | ------------ | -------------------------------------------------------------------------- |
+| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
+| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
+| **RETURNS** | `Tagger` | The modified `Tagger` object. |
## Tagger.to_bytes {#to_bytes tag="method"}
diff --git a/website/docs/api/textcategorizer.md b/website/docs/api/textcategorizer.md
index 1a0280265..dc1c083ac 100644
--- a/website/docs/api/textcategorizer.md
+++ b/website/docs/api/textcategorizer.md
@@ -44,7 +44,7 @@ shortcut for this and instantiate the component using its string name and
| `vocab` | `Vocab` | The shared vocabulary. |
| `model` | `thinc.neural.Model` / `True` | The model powering the pipeline component. If no model is supplied, the model is created when you call `begin_training`, `from_disk` or `from_bytes`. |
| `exclusive_classes` | bool | Make categories mutually exclusive. Defaults to `False`. |
-| `architecture` | unicode | Model architecture to use, see [architectures](#architectures) for details. Defaults to `"ensemble"`. |
+| `architecture` | str | Model architecture to use, see [architectures](#architectures) for details. Defaults to `"ensemble"`. |
| **RETURNS** | `TextCategorizer` | The newly constructed object. |
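A minimal sketch of the string-name shortcut with a config dict; the config keys mirror the arguments above, and `"ensemble"` is the documented default architecture:

```python
import spacy

nlp = spacy.load("en_core_web_sm")  # assumed installed
textcat = nlp.create_pipe(
    "textcat", config={"exclusive_classes": True, "architecture": "ensemble"}
)
nlp.add_pipe(textcat, last=True)
```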
### Architectures {#architectures new="2.1"}
@@ -247,9 +247,9 @@ Add a new label to the pipe.
> textcat.add_label("MY_LABEL")
> ```
-| Name | Type | Description |
-| ------- | ------- | ----------------- |
-| `label` | unicode | The label to add. |
+| Name | Type | Description |
+| ------- | ---- | ----------------- |
+| `label` | str | The label to add. |
## TextCategorizer.to_disk {#to_disk tag="method"}
@@ -262,10 +262,10 @@ Serialize the pipe to disk.
> textcat.to_disk("/path/to/textcat")
> ```
-| Name | Type | Description |
-| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
-| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
-| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
+| Name | Type | Description |
+| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- |
+| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
+| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
## TextCategorizer.from_disk {#from_disk tag="method"}
@@ -280,7 +280,7 @@ Load the pipe from disk. Modifies the object in place and returns it.
| Name | Type | Description |
| ----------- | ----------------- | -------------------------------------------------------------------------- |
-| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
+| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | `TextCategorizer` | The modified `TextCategorizer` object. |
diff --git a/website/docs/api/token.md b/website/docs/api/token.md
index c30c01c20..1accbe062 100644
--- a/website/docs/api/token.md
+++ b/website/docs/api/token.md
@@ -58,7 +58,7 @@ For details, see the documentation on
| Name | Type | Description |
| --------- | -------- | --------------------------------------------------------------------------------------------------------------------------------------- |
-| `name` | unicode | Name of the attribute to set by the extension. For example, `'my_attr'` will be available as `token._.my_attr`. |
+| `name` | str | Name of the attribute to set by the extension. For example, `'my_attr'` will be available as `token._.my_attr`. |
| `default` | - | Optional default value of the attribute if no getter or method is defined. |
| `method` | callable | Set a custom method on the object, for example `token._.compare(other_token)`. |
| `getter` | callable | Getter function that takes the object and returns an attribute value. Is called when the user accesses the `._` attribute. |
@@ -80,10 +80,10 @@ Look up a previously registered extension by name. Returns a 4-tuple
> assert extension == (False, None, None, None)
> ```
-| Name | Type | Description |
-| ----------- | ------- | ------------------------------------------------------------- |
-| `name` | unicode | Name of the extension. |
-| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the extension. |
+| Name | Type | Description |
+| ----------- | ----- | ------------------------------------------------------------- |
+| `name` | str | Name of the extension. |
+| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the extension. |
## Token.has_extension {#has_extension tag="classmethod" new="2"}
@@ -97,10 +97,10 @@ Check whether an extension has been registered on the `Token` class.
> assert Token.has_extension("is_fruit")
> ```
-| Name | Type | Description |
-| ----------- | ------- | ------------------------------------------ |
-| `name` | unicode | Name of the extension to check. |
-| **RETURNS** | bool | Whether the extension has been registered. |
+| Name | Type | Description |
+| ----------- | ---- | ------------------------------------------ |
+| `name` | str | Name of the extension to check. |
+| **RETURNS** | bool | Whether the extension has been registered. |
## Token.remove_extension {#remove_extension tag="classmethod" new="2.0.11"}
@@ -115,10 +115,10 @@ Remove a previously registered extension.
> assert not Token.has_extension("is_fruit")
> ```
-| Name | Type | Description |
-| ----------- | ------- | --------------------------------------------------------------------- |
-| `name` | unicode | Name of the extension. |
-| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the removed extension. |
+| Name | Type | Description |
+| ----------- | ----- | --------------------------------------------------------------------- |
+| `name` | str | Name of the extension. |
+| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the removed extension. |
## Token.check_flag {#check_flag tag="method"}
@@ -408,71 +408,71 @@ The L2 norm of the token's vector representation.
## Attributes {#attributes}
-| Name | Type | Description |
-| -------------------------------------------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `doc` | `Doc` | The parent document. |
-| `sent` 2.0.12 | `Span` | The sentence span that this token is a part of. |
-| `text` | unicode | Verbatim text content. |
-| `text_with_ws` | unicode | Text content, with trailing space character if present. |
-| `whitespace_` | unicode | Trailing space character if present. |
-| `orth` | int | ID of the verbatim text content. |
-| `orth_` | unicode | Verbatim text content (identical to `Token.text`). Exists mostly for consistency with the other attributes. |
-| `vocab` | `Vocab` | The vocab object of the parent `Doc`. |
-| `tensor` 2.1.7 | `ndarray` | The tokens's slice of the parent `Doc`'s tensor. |
-| `head` | `Token` | The syntactic parent, or "governor", of this token. |
-| `left_edge` | `Token` | The leftmost token of this token's syntactic descendants. |
-| `right_edge` | `Token` | The rightmost token of this token's syntactic descendants. |
-| `i` | int | The index of the token within the parent document. |
-| `ent_type` | int | Named entity type. |
-| `ent_type_` | unicode | Named entity type. |
-| `ent_iob` | int | IOB code of named entity tag. `3` means the token begins an entity, `2` means it is outside an entity, `1` means it is inside an entity, and `0` means no entity tag is set. |
-| `ent_iob_` | unicode | IOB code of named entity tag. "B" means the token begins an entity, "I" means it is inside an entity, "O" means it is outside an entity, and "" means no entity tag is set. |
-| `ent_kb_id` 2.2 | int | Knowledge base ID that refers to the named entity this token is a part of, if any. |
-| `ent_kb_id_` 2.2 | unicode | Knowledge base ID that refers to the named entity this token is a part of, if any. |
-| `ent_id` | int | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. |
-| `ent_id_` | unicode | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. |
-| `lemma` | int | Base form of the token, with no inflectional suffixes. |
-| `lemma_` | unicode | Base form of the token, with no inflectional suffixes. |
-| `norm` | int | The token's norm, i.e. a normalized form of the token text. Usually set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions) or [norm exceptions](/usage/adding-languages#norm-exceptions). |
-| `norm_` | unicode | The token's norm, i.e. a normalized form of the token text. Usually set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions) or [norm exceptions](/usage/adding-languages#norm-exceptions). |
-| `lower` | int | Lowercase form of the token. |
-| `lower_` | unicode | Lowercase form of the token text. Equivalent to `Token.text.lower()`. |
+| Name | Type | Description |
+| -------------------------------------------- | ------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `doc` | `Doc` | The parent document. |
+| `sent` 2.0.12 | `Span` | The sentence span that this token is a part of. |
+| `text` | str | Verbatim text content. |
+| `text_with_ws` | str | Text content, with trailing space character if present. |
+| `whitespace_` | str | Trailing space character if present. |
+| `orth` | int | ID of the verbatim text content. |
+| `orth_` | str | Verbatim text content (identical to `Token.text`). Exists mostly for consistency with the other attributes. |
+| `vocab` | `Vocab` | The vocab object of the parent `Doc`. |
+| `tensor` 2.1.7 | `ndarray` | The token's slice of the parent `Doc`'s tensor. |
+| `head` | `Token` | The syntactic parent, or "governor", of this token. |
+| `left_edge` | `Token` | The leftmost token of this token's syntactic descendants. |
+| `right_edge` | `Token` | The rightmost token of this token's syntactic descendants. |
+| `i` | int | The index of the token within the parent document. |
+| `ent_type` | int | Named entity type. |
+| `ent_type_` | str | Named entity type. |
+| `ent_iob` | int | IOB code of named entity tag. `3` means the token begins an entity, `2` means it is outside an entity, `1` means it is inside an entity, and `0` means no entity tag is set. |
+| `ent_iob_` | str | IOB code of named entity tag. "B" means the token begins an entity, "I" means it is inside an entity, "O" means it is outside an entity, and "" means no entity tag is set. |
+| `ent_kb_id` 2.2 | int | Knowledge base ID that refers to the named entity this token is a part of, if any. |
+| `ent_kb_id_` 2.2 | str | Knowledge base ID that refers to the named entity this token is a part of, if any. |
+| `ent_id` | int | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. |
+| `ent_id_` | str | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. |
+| `lemma` | int | Base form of the token, with no inflectional suffixes. |
+| `lemma_` | str | Base form of the token, with no inflectional suffixes. |
+| `norm` | int | The token's norm, i.e. a normalized form of the token text. Usually set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions) or [norm exceptions](/usage/adding-languages#norm-exceptions). |
+| `norm_` | str | The token's norm, i.e. a normalized form of the token text. Usually set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions) or [norm exceptions](/usage/adding-languages#norm-exceptions). |
+| `lower` | int | Lowercase form of the token. |
+| `lower_` | str | Lowercase form of the token text. Equivalent to `Token.text.lower()`. |
| `shape` | int | Transform of the token's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example, `"Xxxx"` or `"dd"`. |
-| `shape_` | unicode | Transform of the tokens's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. |
-| `prefix` | int | Hash value of a length-N substring from the start of the token. Defaults to `N=1`. |
-| `prefix_` | unicode | A length-N substring from the start of the token. Defaults to `N=1`. |
-| `suffix` | int | Hash value of a length-N substring from the end of the token. Defaults to `N=3`. |
-| `suffix_` | unicode | Length-N substring from the end of the token. Defaults to `N=3`. |
-| `is_alpha` | bool | Does the token consist of alphabetic characters? Equivalent to `token.text.isalpha()`. |
-| `is_ascii` | bool | Does the token consist of ASCII characters? Equivalent to `all(ord(c) < 128 for c in token.text)`. |
-| `is_digit` | bool | Does the token consist of digits? Equivalent to `token.text.isdigit()`. |
-| `is_lower` | bool | Is the token in lowercase? Equivalent to `token.text.islower()`. |
-| `is_upper` | bool | Is the token in uppercase? Equivalent to `token.text.isupper()`. |
-| `is_title` | bool | Is the token in titlecase? Equivalent to `token.text.istitle()`. |
-| `is_punct` | bool | Is the token punctuation? |
-| `is_left_punct` | bool | Is the token a left punctuation mark, e.g. `'('` ? |
-| `is_right_punct` | bool | Is the token a right punctuation mark, e.g. `')'` ? |
-| `is_space` | bool | Does the token consist of whitespace characters? Equivalent to `token.text.isspace()`. |
-| `is_bracket` | bool | Is the token a bracket? |
-| `is_quote` | bool | Is the token a quotation mark? |
-| `is_currency` 2.0.8 | bool | Is the token a currency symbol? |
-| `like_url` | bool | Does the token resemble a URL? |
-| `like_num` | bool | Does the token represent a number? e.g. "10.9", "10", "ten", etc. |
-| `like_email` | bool | Does the token resemble an email address? |
-| `is_oov` | bool | Is the token out-of-vocabulary? |
-| `is_stop` | bool | Is the token part of a "stop list"? |
-| `pos` | int | Coarse-grained part-of-speech. |
-| `pos_` | unicode | Coarse-grained part-of-speech. |
-| `tag` | int | Fine-grained part-of-speech. |
-| `tag_` | unicode | Fine-grained part-of-speech. |
-| `dep` | int | Syntactic dependency relation. |
-| `dep_` | unicode | Syntactic dependency relation. |
-| `lang` | int | Language of the parent document's vocabulary. |
-| `lang_` | unicode | Language of the parent document's vocabulary. |
-| `prob` | float | Smoothed log probability estimate of token's word type (context-independent entry in the vocabulary). |
-| `idx` | int | The character offset of the token within the parent document. |
-| `sentiment` | float | A scalar value indicating the positivity or negativity of the token. |
-| `lex_id` | int | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. |
-| `rank` | int | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. |
-| `cluster` | int | Brown cluster ID. |
-| `_` | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). |
+| `shape_` | str | Transform of the token's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example, `"Xxxx"` or `"dd"`. |
+| `prefix` | int | Hash value of a length-N substring from the start of the token. Defaults to `N=1`. |
+| `prefix_` | str | A length-N substring from the start of the token. Defaults to `N=1`. |
+| `suffix` | int | Hash value of a length-N substring from the end of the token. Defaults to `N=3`. |
+| `suffix_` | str | Length-N substring from the end of the token. Defaults to `N=3`. |
+| `is_alpha` | bool | Does the token consist of alphabetic characters? Equivalent to `token.text.isalpha()`. |
+| `is_ascii` | bool | Does the token consist of ASCII characters? Equivalent to `all(ord(c) < 128 for c in token.text)`. |
+| `is_digit` | bool | Does the token consist of digits? Equivalent to `token.text.isdigit()`. |
+| `is_lower` | bool | Is the token in lowercase? Equivalent to `token.text.islower()`. |
+| `is_upper` | bool | Is the token in uppercase? Equivalent to `token.text.isupper()`. |
+| `is_title` | bool | Is the token in titlecase? Equivalent to `token.text.istitle()`. |
+| `is_punct` | bool | Is the token punctuation? |
+| `is_left_punct` | bool | Is the token a left punctuation mark, e.g. `'('` ? |
+| `is_right_punct` | bool | Is the token a right punctuation mark, e.g. `')'` ? |
+| `is_space` | bool | Does the token consist of whitespace characters? Equivalent to `token.text.isspace()`. |
+| `is_bracket` | bool | Is the token a bracket? |
+| `is_quote` | bool | Is the token a quotation mark? |
+| `is_currency` 2.0.8 | bool | Is the token a currency symbol? |
+| `like_url` | bool | Does the token resemble a URL? |
+| `like_num` | bool | Does the token represent a number? e.g. "10.9", "10", "ten", etc. |
+| `like_email` | bool | Does the token resemble an email address? |
+| `is_oov` | bool | Is the token out-of-vocabulary? |
+| `is_stop` | bool | Is the token part of a "stop list"? |
+| `pos` | int | Coarse-grained part-of-speech. |
+| `pos_` | str | Coarse-grained part-of-speech. |
+| `tag` | int | Fine-grained part-of-speech. |
+| `tag_` | str | Fine-grained part-of-speech. |
+| `dep` | int | Syntactic dependency relation. |
+| `dep_` | str | Syntactic dependency relation. |
+| `lang` | int | Language of the parent document's vocabulary. |
+| `lang_` | str | Language of the parent document's vocabulary. |
+| `prob` | float | Smoothed log probability estimate of token's word type (context-independent entry in the vocabulary). |
+| `idx` | int | The character offset of the token within the parent document. |
+| `sentiment` | float | A scalar value indicating the positivity or negativity of the token. |
+| `lex_id` | int | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. |
+| `rank` | int | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. |
+| `cluster` | int | Brown cluster ID. |
+| `_` | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). |
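As with spans, each hash attribute has a trailing-underscore string counterpart; a small sketch, assuming `en_core_web_sm` is installed:

```python
import spacy

nlp = spacy.load("en_core_web_sm")  # assumed installed
doc = nlp("She ate the pizza")
for token in doc:
    # integer IDs vs. their trailing-underscore string counterparts
    print(token.text, token.pos, token.pos_, token.dep, token.dep_)
```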
diff --git a/website/docs/api/tokenizer.md b/website/docs/api/tokenizer.md
index 7462af739..c71f849ad 100644
--- a/website/docs/api/tokenizer.md
+++ b/website/docs/api/tokenizer.md
@@ -34,15 +34,15 @@ the
> tokenizer = nlp.Defaults.create_tokenizer(nlp)
> ```
-| Name | Type | Description |
-| ---------------- | ----------- | ----------------------------------------------------------------------------------------------------------------------------- |
-| `vocab` | `Vocab` | A storage container for lexical types. |
-| `rules` | dict | Exceptions and special-cases for the tokenizer. |
-| `prefix_search` | callable | A function matching the signature of `re.compile(string).search` to match prefixes. |
-| `suffix_search` | callable | A function matching the signature of `re.compile(string).search` to match suffixes. |
-| `infix_finditer` | callable | A function matching the signature of `re.compile(string).finditer` to find infixes. |
-| `token_match` | callable | A function matching the signature of `re.compile(string).match to find token matches. |
-| **RETURNS** | `Tokenizer` | The newly constructed object. |
+| Name | Type | Description |
+| ---------------- | ----------- | ------------------------------------------------------------------------------------- |
+| `vocab` | `Vocab` | A storage container for lexical types. |
+| `rules` | dict | Exceptions and special-cases for the tokenizer. |
+| `prefix_search` | callable | A function matching the signature of `re.compile(string).search` to match prefixes. |
+| `suffix_search` | callable | A function matching the signature of `re.compile(string).search` to match suffixes. |
+| `infix_finditer` | callable | A function matching the signature of `re.compile(string).finditer` to find infixes. |
+| `token_match` | callable | A function matching the signature of `re.compile(string).match` to find token matches. |
+| **RETURNS** | `Tokenizer` | The newly constructed object. |
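A minimal sketch of constructing a tokenizer with custom rules; the regular expressions are illustrative only and much simpler than a language's real defaults:

```python
import re
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English

nlp = English()
prefix_re = re.compile(r'''^[\["']''')   # illustrative patterns only
suffix_re = re.compile(r'''[\]"']$''')
infix_re = re.compile(r'''[-~]''')
tokenizer = Tokenizer(
    nlp.vocab,
    prefix_search=prefix_re.search,
    suffix_search=suffix_re.search,
    infix_finditer=infix_re.finditer,
)
doc = tokenizer("hello-world")  # Doc with ["hello", "-", "world"]
```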
## Tokenizer.\_\_call\_\_ {#call tag="method"}
@@ -55,10 +55,10 @@ Tokenize a string.
> assert len(tokens) == 4
> ```
-| Name | Type | Description |
-| ----------- | ------- | --------------------------------------- |
-| `string` | unicode | The string to tokenize. |
-| **RETURNS** | `Doc` | A container for linguistic annotations. |
+| Name | Type | Description |
+| ----------- | ----- | --------------------------------------- |
+| `string` | str | The string to tokenize. |
+| **RETURNS** | `Doc` | A container for linguistic annotations. |
## Tokenizer.pipe {#pipe tag="method"}
@@ -82,20 +82,20 @@ Tokenize a stream of texts.
Find internal split points of the string.
-| Name | Type | Description |
-| ----------- | ------- | -------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `string` | unicode | The string to split. |
-| **RETURNS** | list | A list of `re.MatchObject` objects that have `.start()` and `.end()` methods, denoting the placement of internal segment separators, e.g. hyphens. |
+| Name | Type | Description |
+| ----------- | ---- | -------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `string` | str | The string to split. |
+| **RETURNS** | list | A list of `re.MatchObject` objects that have `.start()` and `.end()` methods, denoting the placement of internal segment separators, e.g. hyphens. |
## Tokenizer.find_prefix {#find_prefix tag="method"}
Find the length of a prefix that should be segmented from the string, or `None`
if no prefix rules match.
-| Name | Type | Description |
-| ----------- | ------- | ------------------------------------------------------ |
-| `string` | unicode | The string to segment. |
-| **RETURNS** | int | The length of the prefix if present, otherwise `None`. |
+| Name | Type | Description |
+| ----------- | ---- | ------------------------------------------------------ |
+| `string` | str | The string to segment. |
+| **RETURNS** | int | The length of the prefix if present, otherwise `None`. |
## Tokenizer.find_suffix {#find_suffix tag="method"}
@@ -104,7 +104,7 @@ if no suffix rules match.
| Name | Type | Description |
| ----------- | ------------ | ------------------------------------------------------ |
-| `string` | unicode | The string to segment. |
+| `string` | str | The string to segment. |
| **RETURNS** | int / `None` | The length of the suffix if present, otherwise `None`. |
## Tokenizer.add_special_case {#add_special_case tag="method"}
@@ -125,7 +125,7 @@ and examples.
| Name | Type | Description |
| ------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `string` | unicode | The string to specially tokenize. |
+| `string` | str | The string to specially tokenize. |
| `token_attrs` | iterable | A sequence of dicts, where each dict describes a token and its attributes. The `ORTH` fields of the attributes must exactly match the string when they are concatenated. |
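A short sketch of adding a special case; note that the `ORTH` values concatenate back to the original string, as required above:

```python
from spacy.attrs import ORTH
from spacy.lang.en import English

nlp = English()
nlp.tokenizer.add_special_case("gimme", [{ORTH: "gim"}, {ORTH: "me"}])
assert [t.text for t in nlp("gimme that")] == ["gim", "me", "that"]
```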
## Tokenizer.explain {#explain tag="method"}
@@ -142,10 +142,10 @@ produced are identical to `Tokenizer.__call__` except for whitespace tokens.
> assert [t[1] for t in tok_exp] == ["(", "do", "n't", ")"]
> ```
-| Name | Type | Description |
-| ------------| -------- | --------------------------------------------------- |
-| `string` | unicode | The string to tokenize with the debugging tokenizer |
-| **RETURNS** | list | A list of `(pattern_string, token_string)` tuples |
+| Name | Type | Description |
+| ----------- | ---- | --------------------------------------------------- |
+| `string` | str | The string to tokenize with the debugging tokenizer |
+| **RETURNS** | list | A list of `(pattern_string, token_string)` tuples |
## Tokenizer.to_disk {#to_disk tag="method"}
@@ -158,10 +158,10 @@ Serialize the tokenizer to disk.
> tokenizer.to_disk("/path/to/tokenizer")
> ```
-| Name | Type | Description |
-| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
-| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
-| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
+| Name | Type | Description |
+| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- |
+| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
+| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
## Tokenizer.from_disk {#from_disk tag="method"}
@@ -174,11 +174,11 @@ Load the tokenizer from disk. Modifies the object in place and returns it.
> tokenizer.from_disk("/path/to/tokenizer")
> ```
-| Name | Type | Description |
-| ----------- | ---------------- | -------------------------------------------------------------------------- |
-| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
-| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
-| **RETURNS** | `Tokenizer` | The modified `Tokenizer` object. |
+| Name | Type | Description |
+| ----------- | ------------ | -------------------------------------------------------------------------- |
+| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
+| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
+| **RETURNS** | `Tokenizer` | The modified `Tokenizer` object. |
## Tokenizer.to_bytes {#to_bytes tag="method"}
@@ -217,14 +217,14 @@ it.
## Attributes {#attributes}
-| Name | Type | Description |
-| ---------------- | ------- | --------------------------------------------------------------------------------------------------------------------------- |
-| `vocab` | `Vocab` | The vocab object of the parent `Doc`. |
-| `prefix_search` | - | A function to find segment boundaries from the start of a string. Returns the length of the segment, or `None`. |
-| `suffix_search` | - | A function to find segment boundaries from the end of a string. Returns the length of the segment, or `None`. |
-| `infix_finditer` | - | A function to find internal segment separators, e.g. hyphens. Returns a (possibly empty) list of `re.MatchObject` objects. |
-| `token_match` | - | A function matching the signature of `re.compile(string).match to find token matches. Returns an `re.MatchObject` or `None. |
-| `rules` | dict | A dictionary of tokenizer exceptions and special cases. |
+| Name | Type | Description |
+| ---------------- | ------- | -------------------------------------------------------------------------------------------------------------------------- |
+| `vocab` | `Vocab` | The vocab object of the parent `Doc`. |
+| `prefix_search` | - | A function to find segment boundaries from the start of a string. Returns the length of the segment, or `None`. |
+| `suffix_search` | - | A function to find segment boundaries from the end of a string. Returns the length of the segment, or `None`. |
+| `infix_finditer` | - | A function to find internal segment separators, e.g. hyphens. Returns a (possibly empty) list of `re.MatchObject` objects. |
+| `token_match`    | -       | A function matching the signature of `re.compile(string).match` to find token matches. Returns an `re.MatchObject` or `None`. |
+| `rules` | dict | A dictionary of tokenizer exceptions and special cases. |
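Taken together, these hooks are what you pass to the `Tokenizer` constructor when customizing segmentation. A minimal sketch, assuming a blank English pipeline and deliberately simplistic patterns (spaCy's real defaults are far more extensive):

```python
import re
import spacy
from spacy.tokenizer import Tokenizer
from spacy.attrs import ORTH

nlp = spacy.blank("en")
# Illustrative patterns only -- not spaCy's actual punctuation rules
prefix_re = re.compile(r'''^[\["']''')
suffix_re = re.compile(r'''[\]"']$''')
infix_re = re.compile(r'''[-~]''')
url_re = re.compile(r'''^https?://\S+$''')

tokenizer = Tokenizer(
    nlp.vocab,
    rules={"don't": [{ORTH: "do"}, {ORTH: "n't"}]},  # special cases
    prefix_search=prefix_re.search,    # boundaries at the start of a string
    suffix_search=suffix_re.search,    # boundaries at the end of a string
    infix_finditer=infix_re.finditer,  # internal separators, e.g. hyphens
    token_match=url_re.match,          # strings to keep as single tokens
)
doc = tokenizer("don't touch https://example.com")
```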
## Serialization fields {#serialization-fields}
diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md
index 2360ad472..bdd094021 100644
--- a/website/docs/api/top-level.md
+++ b/website/docs/api/top-level.md
@@ -32,11 +32,11 @@ class. The data will be loaded in via
> nlp = spacy.load("en_core_web_sm", disable=["parser", "tagger"])
> ```
-| Name | Type | Description |
-| ----------- | ---------------- | --------------------------------------------------------------------------------- |
-| `name` | unicode / `Path` | Model to load, i.e. shortcut link, package name or path. |
-| `disable` | list | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). |
-| **RETURNS** | `Language` | A `Language` object with the loaded model. |
+| Name | Type | Description |
+| ----------- | ------------ | --------------------------------------------------------------------------------- |
+| `name` | str / `Path` | Model to load, i.e. shortcut link, package name or path. |
+| `disable` | list | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). |
+| **RETURNS** | `Language` | A `Language` object with the loaded model. |
Essentially, `spacy.load()` is a convenience wrapper that reads the language ID
and pipeline components from a model's `meta.json`, initializes the `Language`
@@ -79,7 +79,7 @@ Create a blank model of a given language class. This function is the twin of
| Name | Type | Description |
| ----------- | ---------- | ------------------------------------------------------------------------------------------------ |
-| `name` | unicode | [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) of the language class to load. |
+| `name` | str | [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) of the language class to load. |
| `disable` | list | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). |
| **RETURNS** | `Language` | An empty `Language` object of the appropriate subclass. |
@@ -98,10 +98,10 @@ meta data as a dictionary instead, you can use the `meta` attribute on your
> spacy.info("de", markdown=True)
> ```
-| Name | Type | Description |
-| ---------- | ------- | ------------------------------------------------------------- |
-| `model` | unicode | A model, i.e. shortcut link, package name or path (optional). |
-| `markdown` | bool | Print information as Markdown. |
+| Name | Type | Description |
+| ---------- | ---- | ------------------------------------------------------------- |
+| `model` | str | A model, i.e. shortcut link, package name or path (optional). |
+| `markdown` | bool | Print information as Markdown. |
### spacy.explain {#spacy.explain tag="function"}
@@ -122,10 +122,10 @@ list of available terms, see
> # world NN noun, singular or mass
> ```
-| Name | Type | Description |
-| ----------- | ------- | -------------------------------------------------------- |
-| `term` | unicode | Term to explain. |
-| **RETURNS** | unicode | The explanation, or `None` if not found in the glossary. |
+| Name | Type | Description |
+| ----------- | ---- | -------------------------------------------------------- |
+| `term` | str | Term to explain. |
+| **RETURNS** | str | The explanation, or `None` if not found in the glossary. |
### spacy.prefer_gpu {#spacy.prefer_gpu tag="function" new="2.0.14"}
@@ -189,13 +189,13 @@ browser. Will run a simple web server.
| Name | Type | Description | Default |
| --------- | ------------------- | ------------------------------------------------------------------------------------------------------------------------------------ | ----------- |
| `docs` | list, `Doc`, `Span` | Document(s) to visualize. |
-| `style` | unicode | Visualization style, `'dep'` or `'ent'`. | `'dep'` |
+| `style` | str | Visualization style, `'dep'` or `'ent'`. | `'dep'` |
| `page` | bool | Render markup as full HTML page. | `True` |
| `minify` | bool | Minify HTML markup. | `False` |
| `options` | dict | [Visualizer-specific options](#displacy_options), e.g. colors. | `{}` |
| `manual` | bool | Don't parse `Doc` and instead, expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. | `False` |
| `port` | int | Port to serve visualization. | `5000` |
-| `host` | unicode | Host to serve visualization. | `'0.0.0.0'` |
+| `host` | str | Host to serve visualization. | `'0.0.0.0'` |
### displacy.render {#displacy.render tag="method" new="2"}
@@ -214,13 +214,13 @@ Render a dependency parse tree or named entity visualization.
| Name | Type | Description | Default |
| ----------- | ------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- |
| `docs` | list, `Doc`, `Span` | Document(s) to visualize. |
-| `style` | unicode | Visualization style, `'dep'` or `'ent'`. | `'dep'` |
+| `style` | str | Visualization style, `'dep'` or `'ent'`. | `'dep'` |
| `page` | bool | Render markup as full HTML page. | `False` |
| `minify` | bool | Minify HTML markup. | `False` |
| `jupyter` | bool | Explicitly enable or disable "[Jupyter](http://jupyter.org/) mode" to return markup ready to be rendered in a notebook. Detected automatically if `None`. | `None` |
| `options` | dict | [Visualizer-specific options](#displacy_options), e.g. colors. | `{}` |
| `manual` | bool | Don't parse `Doc` and instead, expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. | `False` |
-| **RETURNS** | unicode | Rendered HTML markup. |
+| **RETURNS** | str | Rendered HTML markup. |
### Visualizer options {#displacy_options}
@@ -236,22 +236,22 @@ If a setting is not present in the options, the default value will be used.
> displacy.serve(doc, style="dep", options=options)
> ```
-| Name | Type | Description | Default |
-| ------------------------------------------ | ------- | --------------------------------------------------------------------------------------------------------------- | ----------------------- |
-| `fine_grained` | bool | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). | `False` |
-| `add_lemma` 2.2.4 | bool | Print the lemma's in a separate row below the token texts. | `False` |
-| `collapse_punct` | bool | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. | `True` |
-| `collapse_phrases` | bool | Merge noun phrases into one token. | `False` |
-| `compact` | bool | "Compact mode" with square arrows that takes up less space. | `False` |
-| `color` | unicode | Text color (HEX, RGB or color names). | `'#000000'` |
-| `bg` | unicode | Background color (HEX, RGB or color names). | `'#ffffff'` |
-| `font` | unicode | Font name or font family for all text. | `'Arial'` |
-| `offset_x` | int | Spacing on left side of the SVG in px. | `50` |
-| `arrow_stroke` | int | Width of arrow path in px. | `2` |
-| `arrow_width` | int | Width of arrow head in px. | `10` / `8` (compact) |
-| `arrow_spacing` | int | Spacing between arrows in px to avoid overlaps. | `20` / `12` (compact) |
-| `word_spacing` | int | Vertical spacing between words and arcs in px. | `45` |
-| `distance` | int | Distance between words in px. | `175` / `150` (compact) |
+| Name | Type | Description | Default |
+| ------------------------------------------ | ---- | --------------------------------------------------------------------------------------------------------------- | ----------------------- |
+| `fine_grained` | bool | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). | `False` |
+| `add_lemma` 2.2.4 | bool | Print the lemmas in a separate row below the token texts.                                                         | `False`                 |
+| `collapse_punct`                           | bool | Attach punctuation to tokens. Can make the parse more readable, as it avoids long arcs for attaching punctuation.  | `True`                  |
+| `collapse_phrases` | bool | Merge noun phrases into one token. | `False` |
+| `compact` | bool | "Compact mode" with square arrows that takes up less space. | `False` |
+| `color` | str | Text color (HEX, RGB or color names). | `'#000000'` |
+| `bg` | str | Background color (HEX, RGB or color names). | `'#ffffff'` |
+| `font` | str | Font name or font family for all text. | `'Arial'` |
+| `offset_x` | int | Spacing on left side of the SVG in px. | `50` |
+| `arrow_stroke` | int | Width of arrow path in px. | `2` |
+| `arrow_width` | int | Width of arrow head in px. | `10` / `8` (compact) |
+| `arrow_spacing` | int | Spacing between arrows in px to avoid overlaps. | `20` / `12` (compact) |
+| `word_spacing` | int | Vertical spacing between words and arcs in px. | `45` |
+| `distance` | int | Distance between words in px. | `175` / `150` (compact) |
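For instance, a few of these settings could be combined like this (model name and option values are illustrative, assuming `en_core_web_sm` is installed):

```python
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Autonomous cars shift insurance liability toward manufacturers.")
# Compact arrows plus custom colors and font (illustrative values)
options = {"compact": True, "bg": "#09a3d5", "color": "white", "font": "Source Sans Pro"}
html = displacy.render(doc, style="dep", options=options)
```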
#### Named Entity Visualizer options {#displacy_options-ent}
@@ -263,11 +263,11 @@ If a setting is not present in the options, the default value will be used.
> displacy.serve(doc, style="ent", options=options)
> ```
-| Name | Type | Description | Default |
-| --------------------------------------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------ |
-| `ents` | list | Entity types to highlight (`None` for all types). | `None` |
-| `colors` | dict | Color overrides. Entity types in uppercase should be mapped to color names or values. | `{}` |
-| `template` 2.2 | unicode | Optional template to overwrite the HTML used to render entity spans. Should be a format string and can use `{bg}`, `{text}` and `{label}`. | see [`templates.py`](https://github.com/explosion/spaCy/blob/master/spacy/displacy/templates.py) |
+| Name | Type | Description | Default |
+| --------------------------------------- | ---- | ------------------------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------ |
+| `ents` | list | Entity types to highlight (`None` for all types). | `None` |
+| `colors` | dict | Color overrides. Entity types in uppercase should be mapped to color names or values. | `{}` |
+| `template` 2.2 | str | Optional template to overwrite the HTML used to render entity spans. Should be a format string and can use `{bg}`, `{text}` and `{label}`. | see [`templates.py`](https://github.com/explosion/spaCy/blob/master/spacy/displacy/templates.py) |
By default, displaCy comes with colors for all
[entity types supported by spaCy](/api/annotation#named-entities). If you're
@@ -308,9 +308,9 @@ Set custom path to the data directory where spaCy looks for models.
> # PosixPath('/custom/path')
> ```
-| Name | Type | Description |
-| ------ | ---------------- | --------------------------- |
-| `path` | unicode / `Path` | Path to new data directory. |
+| Name | Type | Description |
+| ------ | ------------ | --------------------------- |
+| `path` | str / `Path` | Path to new data directory. |
### util.get_lang_class {#util.get_lang_class tag="function"}
@@ -330,7 +330,7 @@ you can use the [`set_lang_class`](/api/top-level#util.set_lang_class) helper.
| Name | Type | Description |
| ----------- | ---------- | -------------------------------------- |
-| `lang` | unicode | Two-letter language code, e.g. `'en'`. |
+| `lang` | str | Two-letter language code, e.g. `'en'`. |
| **RETURNS** | `Language` | Language class. |
### util.set_lang_class {#util.set_lang_class tag="function"}
@@ -352,7 +352,7 @@ the two-letter language code.
| Name | Type | Description |
| ------ | ---------- | -------------------------------------- |
-| `name` | unicode | Two-letter language code, e.g. `'en'`. |
+| `name` | str | Two-letter language code, e.g. `'en'`. |
| `cls` | `Language` | The language class, e.g. `English`. |
### util.lang_class_is_loaded {#util.lang_class_is_loaded tag="function" new="2.1"}
@@ -368,10 +368,10 @@ loaded lazily, to avoid expensive setup code associated with the language data.
> assert util.lang_class_is_loaded("de") is False
> ```
-| Name | Type | Description |
-| ----------- | ------- | -------------------------------------- |
-| `name` | unicode | Two-letter language code, e.g. `'en'`. |
-| **RETURNS** | bool | Whether the class has been loaded. |
+| Name | Type | Description |
+| ----------- | ---- | -------------------------------------- |
+| `name` | str | Two-letter language code, e.g. `'en'`. |
+| **RETURNS** | bool | Whether the class has been loaded. |
### util.load_model {#util.load_model tag="function" new="2"}
@@ -392,7 +392,7 @@ in via [`Language.from_disk()`](/api/language#from_disk).
| Name | Type | Description |
| ------------- | ---------- | -------------------------------------------------------- |
-| `name` | unicode | Package name, shortcut link or model path. |
+| `name` | str | Package name, shortcut link or model path. |
| `**overrides` | - | Specific overrides, like pipeline components to disable. |
| **RETURNS** | `Language` | `Language` class with the loaded model. |
@@ -411,7 +411,7 @@ it easy to test a new model that you haven't packaged yet.
| Name | Type | Description |
| ------------- | ---------- | ---------------------------------------------------------------------------------------------------- |
-| `model_path` | unicode | Path to model data directory. |
+| `model_path` | str | Path to model data directory. |
| `meta` | dict | Model meta data. If `False`, spaCy will try to load the meta from a meta.json in the same directory. |
| `**overrides` | - | Specific overrides, like pipeline components to disable. |
| **RETURNS** | `Language` | `Language` class with the loaded model. |
@@ -432,7 +432,7 @@ A helper function to use in the `load()` method of a model package's
| Name | Type | Description |
| ------------- | ---------- | -------------------------------------------------------- |
-| `init_file` | unicode | Path to model's `__init__.py`, i.e. `__file__`. |
+| `init_file` | str | Path to model's `__init__.py`, i.e. `__file__`. |
| `**overrides` | - | Specific overrides, like pipeline components to disable. |
| **RETURNS** | `Language` | `Language` class with the loaded model. |
@@ -446,10 +446,10 @@ Get a model's meta.json from a directory path and validate its contents.
> meta = util.get_model_meta("/path/to/model")
> ```
-| Name | Type | Description |
-| ----------- | ---------------- | ------------------------ |
-| `path` | unicode / `Path` | Path to model directory. |
-| **RETURNS** | dict | The model's meta data. |
+| Name | Type | Description |
+| ----------- | ------------ | ------------------------ |
+| `path` | str / `Path` | Path to model directory. |
+| **RETURNS** | dict | The model's meta data. |
### util.is_package {#util.is_package tag="function"}
@@ -463,10 +463,10 @@ Check if string maps to a package installed via pip. Mainly used to validate
> util.is_package("xyz") # False
> ```
-| Name | Type | Description |
-| ----------- | ------- | -------------------------------------------- |
-| `name` | unicode | Name of package. |
-| **RETURNS** | `bool` | `True` if installed package, `False` if not. |
+| Name | Type | Description |
+| ----------- | ------ | -------------------------------------------- |
+| `name` | str | Name of package. |
+| **RETURNS** | `bool` | `True` if installed package, `False` if not. |
### util.get_package_path {#util.get_package_path tag="function" new="2"}
@@ -480,10 +480,10 @@ Get path to an installed package. Mainly used to resolve the location of
> # /usr/lib/python3.6/site-packages/en_core_web_sm
> ```
-| Name | Type | Description |
-| -------------- | ------- | -------------------------------- |
-| `package_name` | unicode | Name of installed package. |
-| **RETURNS** | `Path` | Path to model package directory. |
+| Name | Type | Description |
+| -------------- | ------ | -------------------------------- |
+| `package_name` | str | Name of installed package. |
+| **RETURNS** | `Path` | Path to model package directory. |
### util.is_in_jupyter {#util.is_in_jupyter tag="function" new="2"}
diff --git a/website/docs/api/vectors.md b/website/docs/api/vectors.md
index 93e747c1e..d4c0269ef 100644
--- a/website/docs/api/vectors.md
+++ b/website/docs/api/vectors.md
@@ -35,7 +35,7 @@ you can add vectors to later.
| `data` | `ndarray[ndim=1, dtype='float32']` | The vector data. |
| `keys` | iterable | A sequence of keys aligned with the data. |
| `shape` | tuple | Size of the table as `(n_entries, n_columns)`, the number of entries and number of columns. Not required if you're initializing the object with `data` and `keys`. |
-| `name` | unicode | A name to identify the vectors table. |
+| `name` | str | A name to identify the vectors table. |
| **RETURNS** | `Vectors` | The newly created object. |
## Vectors.\_\_getitem\_\_ {#getitem tag="method"}
@@ -140,7 +140,7 @@ mapping separately. If you need to manage the strings, you should use the
| Name | Type | Description |
| ----------- | ---------------------------------- | ----------------------------------------------------- |
-| `key` | unicode / int | The key to add. |
+| `key` | str / int | The key to add. |
| `vector` | `ndarray[ndim=1, dtype='float32']` | An optional vector to add for the key. |
| `row` | int | An optional row number of a vector to map the key to. |
| **RETURNS** | int | The row the vector was added to. |
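As a rough sketch of how keys, vectors and rows interact (table shape and values are made up):

```python
import numpy
from spacy.vectors import Vectors

# A tiny table with 3 rows of 4-dimensional vectors (made-up values)
vectors = Vectors(shape=(3, 4))
row = vectors.add("cat", vector=numpy.asarray([0.1, 0.2, 0.3, 0.4], dtype="float32"))
vectors.add("kitten", row=row)  # map a second key to the same row
```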
@@ -227,7 +227,7 @@ Look up one or more keys by row, or vice versa.
| Name | Type | Description |
| ----------- | ------------------------------------- | ------------------------------------------------------------------------ |
-| `key` | unicode / int | Find the row that the given key points to. Returns int, `-1` if missing. |
+| `key` | str / int | Find the row that the given key points to. Returns int, `-1` if missing. |
| `keys` | iterable | Find rows that the keys point to. Returns `ndarray`. |
| `row` | int | Find the first key that points to the row. Returns int. |
| `rows` | iterable | Find the keys that point to the rows. Returns ndarray. |
@@ -337,9 +337,9 @@ Save the current state to a directory.
>
> ```
-| Name | Type | Description |
-| ------ | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
-| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
+| Name | Type | Description |
+| ------ | ------------ | --------------------------------------------------------------------------------------------------------------------- |
+| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
## Vectors.from_disk {#from_disk tag="method"}
@@ -352,10 +352,10 @@ Loads state from a directory. Modifies the object in place and returns it.
> vectors.from_disk("/path/to/vectors")
> ```
-| Name | Type | Description |
-| ----------- | ---------------- | -------------------------------------------------------------------------- |
-| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
-| **RETURNS** | `Vectors` | The modified `Vectors` object. |
+| Name | Type | Description |
+| ----------- | ------------ | -------------------------------------------------------------------------- |
+| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
+| **RETURNS** | `Vectors` | The modified `Vectors` object. |
## Vectors.to_bytes {#to_bytes tag="method"}
diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md
index 685619c88..420e8263a 100644
--- a/website/docs/usage/linguistic-features.md
+++ b/website/docs/usage/linguistic-features.md
@@ -327,11 +327,11 @@ displaCy in our [online demo](https://explosion.ai/demos/displacy).
### Disabling the parser {#disabling}
In the [default models](/models), the parser is loaded and enabled as part of
-the [standard processing pipeline](/usage/processing-pipelines). If you don't need
-any of the syntactic information, you should disable the parser. Disabling the
-parser will make spaCy load and run much faster. If you want to load the parser,
-but need to disable it for specific documents, you can also control its use on
-the `nlp` object.
+the [standard processing pipeline](/usage/processing-pipelines). If you don't
+need any of the syntactic information, you should disable the parser. Disabling
+the parser will make spaCy load and run much faster. If you want to load the
+parser, but need to disable it for specific documents, you can also control its
+use on the `nlp` object.
```python
nlp = spacy.load("en_core_web_sm", disable=["parser"])
@@ -990,10 +990,10 @@ nlp = spacy.load("en_core_web_sm")
nlp.tokenizer = my_tokenizer
```
-| Argument | Type | Description |
-| ----------- | ------- | ------------------------- |
-| `text` | unicode | The raw text to tokenize. |
-| **RETURNS** | `Doc` | The tokenized document. |
+| Argument | Type | Description |
+| ----------- | ----- | ------------------------- |
+| `text` | str | The raw text to tokenize. |
+| **RETURNS** | `Doc` | The tokenized document. |
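A minimal sketch of what such a `my_tokenizer` might look like, here as a naive whitespace tokenizer for illustration only:

```python
import spacy
from spacy.tokens import Doc

class WhitespaceTokenizer(object):
    def __init__(self, vocab):
        self.vocab = vocab

    def __call__(self, text):
        # Naive whitespace split -- a real tokenizer also handles punctuation etc.
        words = text.split(" ")
        spaces = [True] * len(words)
        return Doc(self.vocab, words=words, spaces=spaces)

nlp = spacy.load("en_core_web_sm")
nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)
doc = nlp("This text will only be split on spaces.")
```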
diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md
index 696e11106..e7aca3981 100644
--- a/website/docs/usage/processing-pipelines.md
+++ b/website/docs/usage/processing-pipelines.md
@@ -272,16 +272,16 @@ doc = nlp("I won't have named entities")
disabled.restore()
```
-If you want to disable all pipes except for one or a few, you can use the `enable`
-keyword. Just like the `disable` keyword, it takes a list of pipe names, or a string
-defining just one pipe.
+If you want to disable all pipes except for one or a few, you can use the
+`enable` keyword. Just like the `disable` keyword, it takes a list of pipe
+names, or a string defining just one pipe.
+
```python
# Enable only the parser
with nlp.select_pipes(enable="parser"):
doc = nlp("I will only be parsed")
```
-
Finally, you can also use the [`remove_pipe`](/api/language#remove_pipe) method
to remove pipeline components from an existing pipeline, the
[`rename_pipe`](/api/language#rename_pipe) method to rename them, or the
@@ -349,12 +349,12 @@ last** in the pipeline, or define a **custom name**. If no name is set and no
> nlp.add_pipe(my_component, before="parser")
> ```
-| Argument | Type | Description |
-| -------- | ------- | ------------------------------------------------------------------------ |
-| `last` | bool | If set to `True`, component is added **last** in the pipeline (default). |
-| `first` | bool | If set to `True`, component is added **first** in the pipeline. |
-| `before` | unicode | String name of component to add the new component **before**. |
-| `after` | unicode | String name of component to add the new component **after**. |
+| Argument | Type | Description |
+| -------- | ---- | ------------------------------------------------------------------------ |
+| `last` | bool | If set to `True`, component is added **last** in the pipeline (default). |
+| `first` | bool | If set to `True`, component is added **first** in the pipeline. |
+| `before` | str | String name of component to add the new component **before**. |
+| `after` | str | String name of component to add the new component **after**. |
### Example: A simple pipeline component {#custom-components-simple}
diff --git a/website/docs/usage/saving-loading.md b/website/docs/usage/saving-loading.md
index 058204a5d..588782986 100644
--- a/website/docs/usage/saving-loading.md
+++ b/website/docs/usage/saving-loading.md
@@ -94,8 +94,8 @@ docs = list(doc_bin.get_docs(nlp.vocab))
If `store_user_data` is set to `True`, the `Doc.user_data` will be serialized as
well, which includes the values of
-[extension attributes](/usage/processing-pipelines#custom-components-attributes) (if
-they're serializable with msgpack).
+[extension attributes](/usage/processing-pipelines#custom-components-attributes)
+(if they're serializable with msgpack).
@@ -666,10 +666,10 @@ and lets you customize how the model should be initialized and loaded. You can
define the language data to be loaded and the
[processing pipeline](/usage/processing-pipelines) to execute.
-| Setting | Type | Description |
-| ---------- | ------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `lang` | unicode | ID of the language class to initialize. |
-| `pipeline` | list | A list of strings mapping to the IDs of pipeline factories to apply in that order. If not set, spaCy's [default pipeline](/usage/processing-pipelines) will be used. |
+| Setting | Type | Description |
+| ---------- | ---- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `lang` | str | ID of the language class to initialize. |
+| `pipeline` | list | A list of strings mapping to the IDs of pipeline factories to apply in that order. If not set, spaCy's [default pipeline](/usage/processing-pipelines) will be used. |
The `load()` method that comes with our model package templates will take care
of putting all this together and returning a `Language` object with the loaded
diff --git a/website/docs/usage/visualizers.md b/website/docs/usage/visualizers.md
index dd0b0eb50..9733e09c2 100644
--- a/website/docs/usage/visualizers.md
+++ b/website/docs/usage/visualizers.md
@@ -67,12 +67,12 @@ arcs.
-| Argument | Type | Description | Default |
-| --------- | ------- | ----------------------------------------------------------- | ----------- |
-| `compact` | bool | "Compact mode" with square arrows that takes up less space. | `False` |
-| `color` | unicode | Text color (HEX, RGB or color names). | `"#000000"` |
-| `bg` | unicode | Background color (HEX, RGB or color names). | `"#ffffff"` |
-| `font` | unicode | Font name or font family for all text. | `"Arial"` |
+| Argument | Type | Description | Default |
+| --------- | ---- | ----------------------------------------------------------- | ----------- |
+| `compact` | bool | "Compact mode" with square arrows that takes up less space. | `False` |
+| `color` | str | Text color (HEX, RGB or color names). | `"#000000"` |
+| `bg` | str | Background color (HEX, RGB or color names). | `"#ffffff"` |
+| `font` | str | Font name or font family for all text. | `"Arial"` |
For a list of all available options, see the
[`displacy` API documentation](/api/top-level#displacy_options).
From 1a15896ba9bcb2b12113880929edfb4fdf0683ff Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Sun, 24 May 2020 18:51:10 +0200
Subject: [PATCH 174/496] unicode -> str consistency [ci skip]
---
spacy/cli/info.py | 2 +-
spacy/displacy/render.py | 2 +-
spacy/gold.pyx | 4 ++--
spacy/language.py | 4 ++--
spacy/matcher/dependencymatcher.pyx | 2 +-
spacy/matcher/matcher.pyx | 2 +-
spacy/matcher/phrasematcher.pyx | 2 +-
spacy/pipeline/entityruler.py | 2 +-
spacy/strings.pyx | 6 +++---
spacy/tokenizer.pyx | 4 ++--
spacy/tokens/doc.pyx | 4 ++--
spacy/util.py | 8 ++++----
spacy/vocab.pyx | 10 +++++-----
13 files changed, 26 insertions(+), 26 deletions(-)
diff --git a/spacy/cli/info.py b/spacy/cli/info.py
index d779eb2b3..98fd5cabf 100644
--- a/spacy/cli/info.py
+++ b/spacy/cli/info.py
@@ -65,7 +65,7 @@ def print_markdown(data, title=None):
"""Print data in GitHub-flavoured Markdown format for issues etc.
data (dict or list of tuples): Label/value pairs.
- title (unicode or None): Title, will be rendered as headline 2.
+ title (str / None): Title, will be rendered as headline 2.
"""
markdown = []
for key, value in data.items():
diff --git a/spacy/displacy/render.py b/spacy/displacy/render.py
index d3572ce78..ef8632cbc 100644
--- a/spacy/displacy/render.py
+++ b/spacy/displacy/render.py
@@ -302,7 +302,7 @@ class EntityRenderer(object):
text (str): Original text.
spans (list): Individual entity spans and their start, end and label.
- title (unicode or None): Document title set in Doc.user_data['title'].
+ title (str / None): Document title set in Doc.user_data['title'].
"""
markup = ""
offset = 0
diff --git a/spacy/gold.pyx b/spacy/gold.pyx
index 1864b7a04..ecbd13354 100644
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -140,8 +140,8 @@ class GoldCorpus(object):
def __init__(self, train, dev, gold_preproc=False, limit=None):
"""Create a GoldCorpus.
- train (unicode or Path): File or directory of training data.
- dev (unicode or Path): File or directory of development data.
+ train (str / Path): File or directory of training data.
+ dev (str / Path): File or directory of development data.
RETURNS (GoldCorpus): The newly created object.
"""
self.limit = limit
diff --git a/spacy/language.py b/spacy/language.py
index e3b770723..551b8c9af 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -934,7 +934,7 @@ class Language(object):
"""Save the current state to a directory. If a model is loaded, this
will include the model.
- path (unicode or Path): Path to a directory, which will be created if
+ path (str / Path): Path to a directory, which will be created if
it doesn't exist.
exclude (list): Names of components or serialization fields to exclude.
@@ -968,7 +968,7 @@ class Language(object):
returns it. If the saved `Language` object contains a model, the
model will be loaded.
- path (unicode or Path): A path to a directory.
+ path (str / Path): A path to a directory.
exclude (list): Names of components or serialization fields to exclude.
RETURNS (Language): The modified `Language` object.
diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx
index 732931380..ddeeedd06 100644
--- a/spacy/matcher/dependencymatcher.pyx
+++ b/spacy/matcher/dependencymatcher.pyx
@@ -194,7 +194,7 @@ cdef class DependencyMatcher:
def get(self, key, default=None):
"""Retrieve the pattern stored for a key.
- key (unicode or int): The key to retrieve.
+ key (str / int): The key to retrieve.
RETURNS (tuple): The rule, as an (on_match, patterns) tuple.
"""
key = self._normalize_key(key)
diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx
index 225eba9a9..868465b8d 100644
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@@ -165,7 +165,7 @@ cdef class Matcher:
def get(self, key, default=None):
"""Retrieve the pattern stored for a key.
- key (unicode or int): The key to retrieve.
+ key (str / int): The key to retrieve.
RETURNS (tuple): The rule, as an (on_match, patterns) tuple.
"""
key = self._normalize_key(key)
diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx
index f7ce44ece..aa4534296 100644
--- a/spacy/matcher/phrasematcher.pyx
+++ b/spacy/matcher/phrasematcher.pyx
@@ -30,7 +30,7 @@ cdef class PhraseMatcher:
"""Initialize the PhraseMatcher.
vocab (Vocab): The shared vocabulary.
- attr (int / unicode): Token attribute to match on.
+ attr (int / str): Token attribute to match on.
validate (bool): Perform additional validation when patterns are added.
RETURNS (PhraseMatcher): The newly constructed object.
diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py
index cdacc82f6..bdc009192 100644
--- a/spacy/pipeline/entityruler.py
+++ b/spacy/pipeline/entityruler.py
@@ -30,7 +30,7 @@ class EntityRuler(object):
nlp (Language): The shared nlp object to pass the vocab to the matchers
and process phrase patterns.
- phrase_matcher_attr (int / unicode): Token attribute to match on, passed
+ phrase_matcher_attr (int / str): Token attribute to match on, passed
to the internal PhraseMatcher as `attr`
validate (bool): Whether patterns should be validated, passed to
Matcher and PhraseMatcher as `validate`
diff --git a/spacy/strings.pyx b/spacy/strings.pyx
index 9fe5af154..9e584ce8a 100644
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@@ -109,7 +109,7 @@ cdef class StringStore:
"""Retrieve a string from a given hash, or vice versa.
string_or_id (bytes, unicode or uint64): The value to encode.
- Returns (unicode or uint64): The value to be retrieved.
+ Returns (str / uint64): The value to be retrieved.
"""
if isinstance(string_or_id, basestring) and len(string_or_id) == 0:
return 0
@@ -223,7 +223,7 @@ cdef class StringStore:
def to_disk(self, path):
"""Save the current state to a directory.
- path (unicode or Path): A path to a directory, which will be created if
+ path (str / Path): A path to a directory, which will be created if
it doesn't exist. Paths may be either strings or Path-like objects.
"""
path = util.ensure_path(path)
@@ -234,7 +234,7 @@ cdef class StringStore:
"""Loads state from a directory. Modifies the object in place and
returns it.
- path (unicode or Path): A path to a directory. Paths may be either
+ path (str / Path): A path to a directory. Paths may be either
strings or `Path`-like objects.
RETURNS (StringStore): The modified `StringStore` object.
"""
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index b628b1171..538bf60e9 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -693,7 +693,7 @@ cdef class Tokenizer:
def to_disk(self, path, **kwargs):
"""Save the current state to a directory.
- path (unicode or Path): A path to a directory, which will be created if
+ path (str / Path): A path to a directory, which will be created if
it doesn't exist.
exclude (list): String names of serialization fields to exclude.
@@ -707,7 +707,7 @@ cdef class Tokenizer:
"""Loads state from a directory. Modifies the object in place and
returns it.
- path (unicode or Path): A path to a directory.
+ path (str / Path): A path to a directory.
exclude (list): String names of serialization fields to exclude.
RETURNS (Tokenizer): The modified `Tokenizer` object.
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index f6d0dbf4a..31c1e8c82 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -843,7 +843,7 @@ cdef class Doc:
def to_disk(self, path, **kwargs):
"""Save the current state to a directory.
- path (unicode or Path): A path to a directory, which will be created if
+ path (str / Path): A path to a directory, which will be created if
it doesn't exist. Paths may be either strings or Path-like objects.
exclude (list): String names of serialization fields to exclude.
@@ -857,7 +857,7 @@ cdef class Doc:
"""Loads state from a directory. Modifies the object in place and
returns it.
- path (unicode or Path): A path to a directory. Paths may be either
+ path (str / Path): A path to a directory. Paths may be either
strings or `Path`-like objects.
exclude (list): String names of serialization fields to exclude.
RETURNS (Doc): The modified `Doc` object.
diff --git a/spacy/util.py b/spacy/util.py
index fc5837755..b614c29c7 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -269,7 +269,7 @@ def load_config(path, create_objects=False):
"""Load a Thinc-formatted config file, optionally filling in objects where
the config references registry entries. See "Thinc config files" for details.
- path (unicode or Path): Path to the config file
+ path (str / Path): Path to the config file
create_objects (bool): Whether to automatically create objects when the config
references registry entries. Defaults to False.
@@ -286,7 +286,7 @@ def load_config_from_str(string, create_objects=False):
"""Load a Thinc-formatted config, optionally filling in objects where
the config references registry entries. See "Thinc config files" for details.
- string (unicode or Path): Text contents of the config file.
+ string (str / Path): Text contents of the config file.
create_objects (bool): Whether to automatically create objects when the config
references registry entries. Defaults to False.
@@ -302,7 +302,7 @@ def load_config_from_str(string, create_objects=False):
def get_model_meta(path):
"""Get model meta.json from a directory path and validate its contents.
- path (unicode or Path): Path to model directory.
+ path (str / Path): Path to model directory.
RETURNS (dict): The model's meta data.
"""
model_path = ensure_path(path)
@@ -321,7 +321,7 @@ def get_model_meta(path):
def get_model_config(path):
"""Get the model's config from a directory path.
- path (unicode or Path): Path to model directory.
+ path (str / Path): Path to model directory.
RETURNS (Config): The model's config data.
"""
model_path = ensure_path(path)
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index ed37f6e98..3a82ab72d 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -336,7 +336,7 @@ cdef class Vocab:
If `minn` is defined, then the resulting vector uses Fasttext's
subword features by average over ngrams of `orth`.
- orth (int / unicode): The hash value of a word, or its unicode string.
+ orth (int / str): The hash value of a word, or its unicode string.
minn (int): Minimum n-gram length used for Fasttext's ngram computation.
Defaults to the length of `orth`.
maxn (int): Maximum n-gram length used for Fasttext's ngram computation.
@@ -389,7 +389,7 @@ cdef class Vocab:
"""Set a vector for a word in the vocabulary. Words can be referenced
by string or int ID.
- orth (int / unicode): The word.
+ orth (int / str): The word.
vector (numpy.ndarray[ndim=1, dtype='float32']): The vector to set.
DOCS: https://spacy.io/api/vocab#set_vector
@@ -411,7 +411,7 @@ cdef class Vocab:
"""Check whether a word has a vector. Returns False if no vectors have
been loaded. Words can be looked up by string or int ID.
- orth (int / unicode): The word.
+ orth (int / str): The word.
RETURNS (bool): Whether the word has a vector.
DOCS: https://spacy.io/api/vocab#has_vector
@@ -423,7 +423,7 @@ cdef class Vocab:
def to_disk(self, path, exclude=tuple(), **kwargs):
"""Save the current state to a directory.
- path (unicode or Path): A path to a directory, which will be created if
+ path (str / Path): A path to a directory, which will be created if
it doesn't exist.
exclude (list): String names of serialization fields to exclude.
@@ -448,7 +448,7 @@ cdef class Vocab:
"""Loads state from a directory. Modifies the object in place and
returns it.
- path (unicode or Path): A path to a directory.
+ path (str / Path): A path to a directory.
exclude (list): String names of serialization fields to exclude.
RETURNS (Vocab): The modified `Vocab` object.
From 0d3cfe155f55490af57a12321fb0be58f04ecc39 Mon Sep 17 00:00:00 2001
From: Rajat <22280243+R1j1t@users.noreply.github.com>
Date: Mon, 25 May 2020 15:00:23 +0530
Subject: [PATCH 175/496] update spacy universe with my project (#5497)
* added contextualSpellCheck in spacy universe meta
* removed extra formatting by code
* updated with permanent links
* run json linter used by spacy
* filled SCA
* updated the description
---
.github/contributors/R1j1t.md | 106 ++++++++++++++++++++++++++++++++++
website/meta/universe.json | 30 ++++++++++
2 files changed, 136 insertions(+)
create mode 100644 .github/contributors/R1j1t.md
diff --git a/.github/contributors/R1j1t.md b/.github/contributors/R1j1t.md
new file mode 100644
index 000000000..a92f1e092
--- /dev/null
+++ b/.github/contributors/R1j1t.md
@@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+ * you hereby assign to us joint ownership, and to the extent that such
+ assignment is or becomes invalid, ineffective or unenforceable, you hereby
+ grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+ royalty-free, unrestricted license to exercise all rights under those
+ copyrights. This includes, at our option, the right to sublicense these same
+ rights to third parties through multiple levels of sublicensees or other
+ licensing arrangements;
+
+ * you agree that each of us can do all things in relation to your
+ contribution as if each of us were the sole owners, and if one of us makes
+ a derivative work of your contribution, the one who makes the derivative
+ work (or has it made will be the sole owner of that derivative work;
+
+ * you agree that you will not assert any moral rights in your contribution
+ against us, our licensees or transferees;
+
+ * you agree that we may register a copyright in your contribution and
+ exercise all ownership rights associated with it; and
+
+ * you agree that neither of us has any duty to consult with, obtain the
+ consent of, pay or render an accounting to the other for any use or
+ distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+ * make, have made, use, sell, offer to sell, import, and otherwise transfer
+ your contribution in whole or in part, alone or in combination with or
+ included in any product, work or materials arising out of the project to
+ which your contribution was submitted, and
+
+ * at our option, to sublicense these same rights to third parties through
+ multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+ * Each contribution that you submit is and shall be an original work of
+ authorship and you can legally grant the rights set out in this SCA;
+
+ * to the best of your knowledge, each contribution will not violate any
+ third party's copyrights, trademarks, patents, or other intellectual
+ property rights; and
+
+ * each contribution shall be in compliance with U.S. export control laws and
+ other applicable export and import laws. You agree to notify us if you
+ become aware of any circumstance which would make any of the foregoing
+ representations inaccurate in any respect. We may publicly disclose your
+ participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statement below. Please do NOT
+mark both statements:
+
+ * [x] I am signing on behalf of myself as an individual and no other person
+ or entity, including my employer, has or will have rights with respect to my
+ contributions.
+
+ * [ ] I am signing on behalf of my employer or a legal entity and I have the
+ actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field | Entry |
+|------------------------------- | -------------------- |
+| Name | Rajat |
+| Company name (if applicable) | |
+| Title or role (if applicable) | |
+| Date | 24 May 2020 |
+| GitHub username | R1j1t |
+| Website (optional) | |
diff --git a/website/meta/universe.json b/website/meta/universe.json
index 58f4cc2aa..aafec7178 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -2293,6 +2293,36 @@
},
"category": ["pipeline", "research"],
"tags": ["term_extraction"]
+ },
+ {
+ "id": "contextualSpellCheck",
+ "title": "Contextual Spell Check",
+ "slogan": "Contextual spell correction using BERT (bidirectional representations)",
+ "description": "This package currently focuses on Out of Vocabulary (OOV) word or non-word error (NWE) correction using BERT model. The idea of using BERT was to use the context when correcting NWE. In the coming days, I would like to focus on RWE and optimising the package by implementing it in cython.",
+ "github": "R1j1t/contextualSpellCheck",
+ "pip": "contextualSpellCheck",
+ "code_example": [
+ "import spacy",
+ "import contextualSpellCheck",
+ "",
+ "nlp = spacy.load('en')",
+ "contextualSpellCheck.add_to_pipe(nlp)",
+ "doc = nlp('Income was $9.4 milion compared to the prior year of $2.7 milion.')",
+ "",
+ "print(doc._.performed_spellCheck) #Should be True",
+ "print(doc._.outcome_spellCheck) #Income was $9.4 million compared to the prior year of $2.7 million."
+ ],
+ "code_language": "python",
+ "url": "https://github.com/R1j1t/contextualSpellCheck",
+ "thumb": "https://user-images.githubusercontent.com/22280243/82760949-98e68480-9e14-11ea-952e-4738620fd9e3.png",
+ "image": "https://user-images.githubusercontent.com/22280243/82138959-2852cd00-9842-11ea-918a-49b2a7873ef6.png",
+ "author": "Rajat Goel",
+ "author_links": {
+ "github": "r1j1t",
+ "website": "https://github.com/R1j1t"
+ },
+ "category": ["pipeline", "conversational", "research"],
+ "tags": ["spell check", "correction", "preprocessing", "translation", "correction"]
}
],
From 4fd087572a1c597781fef8ca4fbcfebed825c0fb Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Thu, 28 May 2020 12:51:37 +0200
Subject: [PATCH 176/496] WIP: improve model version deps
---
spacy/cli/package.py | 2 +-
spacy/util.py | 9 +++++++++
2 files changed, 10 insertions(+), 1 deletion(-)
diff --git a/spacy/cli/package.py b/spacy/cli/package.py
index cf93c872f..15ae2033c 100644
--- a/spacy/cli/package.py
+++ b/spacy/cli/package.py
@@ -138,7 +138,7 @@ def list_files(data_dir):
def list_requirements(meta):
parent_package = meta.get('parent_package', 'spacy')
- requirements = [parent_package + meta['spacy_version']]
+ requirements = [parent_package + '>=' + meta['spacy_version']]
if 'setup_requires' in meta:
requirements += meta['setup_requires']
if 'requirements' in meta:
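As a quick illustration of the changed line (hypothetical meta values):

```python
# Hypothetical meta.json values for illustration
meta = {"parent_package": "spacy", "spacy_version": "2.3.0"}
requirement = meta["parent_package"] + ">=" + meta["spacy_version"]
assert requirement == "spacy>=2.3.0"
```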
diff --git a/spacy/util.py b/spacy/util.py
index b614c29c7..4e468ef9d 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -265,6 +265,15 @@ def is_compatible_model(meta):
return True
+def get_model_version_range(version):
+ """Generate a version range like >=1.2.3,<1.3.0 based on a given spaCy
+ version. Models are always compatible across patch versions but not
+ across minor or major versions.
+ """
+ major, minor = split_version(version)
+ return f">={version},<{major}.{minor + 1}.0"
+
+
def load_config(path, create_objects=False):
"""Load a Thinc-formatted config file, optionally filling in objects where
the config references registry entries. See "Thinc config files" for details.
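Once this helper is in `spacy.util`, the generated range pins models to the same minor release, for example:

```python
from spacy.util import get_model_version_range  # added in this patch

# Patch releases stay within the range; minor and major bumps fall outside it
assert get_model_version_range("2.3.1") == ">=2.3.1,<2.4.0"
assert get_model_version_range("2.3.5") == ">=2.3.5,<2.4.0"
```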
From bed62991add4ff12282a00dd1d321441878b27ef Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Sat, 30 May 2020 14:59:55 +0200
Subject: [PATCH 177/496] Tidy up requirements
---
requirements.txt | 5 ++++-
setup.cfg | 7 ++++---
2 files changed, 8 insertions(+), 4 deletions(-)
diff --git a/requirements.txt b/requirements.txt
index add083a05..a104b68ba 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -13,8 +13,11 @@ numpy>=1.15.0
requests>=2.13.0,<3.0.0
plac>=0.9.6,<1.2.0
tqdm>=4.38.0,<5.0.0
-importlib_metadata>=0.20; python_version < "3.8"
pydantic>=1.3.0,<2.0.0
+# Official Python utilities
+setuptools
+packaging
+importlib_metadata>=0.20; python_version < "3.8"
# Development dependencies
cython>=0.25
pytest>=4.6.5
diff --git a/setup.cfg b/setup.cfg
index eb7608c4e..ae09d071c 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -47,15 +47,16 @@ install_requires =
wasabi>=0.4.0,<1.1.0
srsly>=2.0.0,<3.0.0
catalogue>=0.0.7,<1.1.0
- ml_datasets
+ ml_datasets>=0.1.1
# Third-party dependencies
tqdm>=4.38.0,<5.0.0
- setuptools
numpy>=1.15.0
plac>=0.9.6,<1.2.0
requests>=2.13.0,<3.0.0
pydantic>=1.3.0,<2.0.0
- tqdm>=4.38.0,<5.0.0
+ # Official Python utilities
+ setuptools
+ packaging
importlib_metadata>=0.20; python_version < "3.8"
[options.extras_require]
From e47e5a4b10e0d3c5b6fed255040cebc019173e39 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Sat, 30 May 2020 15:01:58 +0200
Subject: [PATCH 178/496] Use more sophisticated version parsing logic
---
spacy/cli/download.py | 7 +++---
spacy/cli/package.py | 4 ++--
spacy/cli/validate.py | 9 ++++----
spacy/language.py | 3 ++-
spacy/tests/test_misc.py | 12 ++++++++--
spacy/util.py | 49 ++++++++++++++++------------------------
6 files changed, 41 insertions(+), 43 deletions(-)
diff --git a/spacy/cli/download.py b/spacy/cli/download.py
index af132bbbe..3d56822a5 100644
--- a/spacy/cli/download.py
+++ b/spacy/cli/download.py
@@ -5,7 +5,7 @@ import sys
from wasabi import msg
from .. import about
-from ..util import is_package
+from ..util import is_package, get_base_version
def download(
@@ -63,8 +63,7 @@ def get_json(url, desc):
def get_compatibility():
- version = about.__version__
- version = version.rsplit(".dev", 1)[0]
+ version = get_base_version(about.__version__)
comp_table = get_json(about.__compatibility__, "compatibility table")
comp = comp_table["spacy"]
if version not in comp:
@@ -73,7 +72,7 @@ def get_compatibility():
def get_version(model, comp):
- model = model.rsplit(".dev", 1)[0]
+ model = get_base_version(model)
if model not in comp:
msg.fail(
f"No compatible model found for '{model}' (spaCy v{about.__version__})",
diff --git a/spacy/cli/package.py b/spacy/cli/package.py
index 15ae2033c..153e61ba3 100644
--- a/spacy/cli/package.py
+++ b/spacy/cli/package.py
@@ -90,7 +90,7 @@ def generate_meta(model_path, existing_meta, msg):
("license", "License", meta.get("license", "MIT")),
]
nlp = util.load_model_from_path(Path(model_path))
- meta["spacy_version"] = about.__version__
+ meta["spacy_version"] = util.get_model_version_range(about.__version__)
meta["pipeline"] = nlp.pipe_names
meta["vectors"] = {
"width": nlp.vocab.vectors_length,
@@ -138,7 +138,7 @@ def list_files(data_dir):
def list_requirements(meta):
parent_package = meta.get('parent_package', 'spacy')
- requirements = [parent_package + '>=' + meta['spacy_version']]
+ requirements = [parent_package + meta['spacy_version']]
if 'setup_requires' in meta:
requirements += meta['setup_requires']
if 'requirements' in meta:
diff --git a/spacy/cli/validate.py b/spacy/cli/validate.py
index c39cadc7b..3c49abb3e 100644
--- a/spacy/cli/validate.py
+++ b/spacy/cli/validate.py
@@ -4,7 +4,7 @@ import requests
from wasabi import msg
from .. import about
-from ..util import get_package_version, get_installed_models, split_version
+from ..util import get_package_version, get_installed_models, get_base_version
from ..util import get_package_path, get_model_meta, is_compatible_model
@@ -14,7 +14,7 @@ def validate():
with the installed models. Should be run after `pip install -U spacy`.
"""
model_pkgs, compat = get_model_pkgs()
- spacy_version = about.__version__.rsplit(".dev", 1)[0]
+ spacy_version = get_base_version(about.__version__)
current_compat = compat.get(spacy_version, {})
if not current_compat:
msg.warn(f"No compatible models found for v{spacy_version} of spaCy")
@@ -78,13 +78,12 @@ def get_model_pkgs():
version = get_package_version(pkg_name)
if package in compat:
is_compat = version in compat[package]
- v_maj, v_min = split_version(about.__version__)
- spacy_version = f"{v_maj}.{v_min}"
+ spacy_version = about.__version__
else:
model_path = get_package_path(package)
model_meta = get_model_meta(model_path)
- is_compat = is_compatible_model(model_meta)
spacy_version = model_meta.get("spacy_version", "n/a")
+ is_compat = is_compatible_model(spacy_version)
pkgs[pkg_name] = {
"name": package,
"version": version,
diff --git a/spacy/language.py b/spacy/language.py
index 551b8c9af..61d69b63e 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -191,13 +191,14 @@ class Language(object):
@property
def meta(self):
+ spacy_version = util.get_model_version_range(about.__version__)
if self.vocab.lang:
self._meta.setdefault("lang", self.vocab.lang)
else:
self._meta.setdefault("lang", self.lang)
self._meta.setdefault("name", "model")
self._meta.setdefault("version", "0.0.0")
- self._meta.setdefault("spacy_version", about.__version__)
+ self._meta.setdefault("spacy_version", spacy_version)
self._meta.setdefault("description", "")
self._meta.setdefault("author", "")
self._meta.setdefault("email", "")
diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py
index 9e67ae83b..9aa95c431 100644
--- a/spacy/tests/test_misc.py
+++ b/spacy/tests/test_misc.py
@@ -95,7 +95,15 @@ def test_ascii_filenames():
@pytest.mark.parametrize(
"version,compatible",
- [(spacy_version, True), ("2.0.0", False), (">=1.2.3,<4.5.6", False)],
+ [
+ (spacy_version, True),
+ (f">={spacy_version}", True),
+ ("2.0.0", False),
+ (">=2.0.0", True),
+ (">=1.0.0,<2.1.1", False),
+ (">=1.2.3,<4.5.6", True),
+ ("n/a", None),
+ ],
)
def test_is_compatible_model(version, compatible):
- assert util.is_compatible_model({"spacy_version": version}) is compatible
+ assert util.is_compatible_model(version) is compatible
diff --git a/spacy/util.py b/spacy/util.py
index 4e468ef9d..835e46fc6 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -14,6 +14,8 @@ import srsly
import catalogue
import sys
import warnings
+from packaging.specifiers import SpecifierSet, InvalidSpecifier
+from packaging.version import Version, InvalidVersion
try:
@@ -236,42 +238,31 @@ def get_package_version(name):
return None
-def split_version(version):
- """RETURNS (tuple): Two integers, the major and minor spaCy version."""
- pieces = version.split(".", 3)
- return int(pieces[0]), int(pieces[1])
-
-
-def is_compatible_model(meta):
- """Check if a model is compatible with the current version of spaCy, based
- on its meta.json. We compare the version of spaCy the model was created with
- with the current version. If the minor version is different, it's considered
- incompatible.
-
- meta (dict): The model's meta.
- RETURNS (bool / None): Whether the model is compatible with the current
- spaCy or None if we don't have enough info.
- """
- cur_v = about.__version__
- pkg_v = meta.get("spacy_version")
- if not pkg_v or not isinstance(pkg_v, str):
+def is_compatible_model(constraint):
+ version = Version(about.__version__)
+ if constraint[0].isdigit():
+ # Handle cases where exact version is provided as constraint
+ constraint = f"=={constraint}"
+ try:
+ spec = SpecifierSet(constraint)
+ except InvalidSpecifier:
return None
-    # Handle spacy_version values like >=x,<y
 def get_model_version_range(spacy_version):
    """Generate a version range like >=1.2.3,<1.3.0 based on a given spaCy
    version. Models are always compatible across patch versions but not
    across minor or major versions.
"""
- major, minor = split_version(version)
- return f">={version},<{major}.{minor + 1}.0"
+ release = Version(spacy_version).release
+ return f">={spacy_version},<{release[0]}.{release[1] + 1}.0"
+
+
+def get_base_version(version):
+ return Version(version).base_version
def load_config(path, create_objects=False):
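Taken together, the util.py changes above delegate all version handling to the third-party packaging library. The following standalone sketch mirrors the three new helpers (is_compatible_model, get_model_version_range, get_base_version) under renamed, hypothetical function names so it can be run without a spaCy checkout; only packaging is required.

from packaging.specifiers import SpecifierSet, InvalidSpecifier
from packaging.version import Version, InvalidVersion

def model_version_range(spacy_version):
    # Compatible across patch releases only, e.g. ">=3.0.0.dev3,<3.1.0"
    release = Version(spacy_version).release
    return f">={spacy_version},<{release[0]}.{release[1] + 1}.0"

def base_version(version):
    # Strip prerelease/dev identifiers, e.g. "3.0.0.dev3" -> "3.0.0"
    return Version(version).base_version

def check(current_version, constraint):
    # A bare version as constraint is treated as "==<version>";
    # invalid input yields None instead of raising.
    if constraint[0].isdigit():
        constraint = f"=={constraint}"
    try:
        spec = SpecifierSet(constraint)
        ver = Version(current_version)
    except (InvalidSpecifier, InvalidVersion):
        return None
    spec.prereleases = True  # allow prereleases and dev versions
    return ver in spec

print(model_version_range("3.0.0.dev3"))  # >=3.0.0.dev3,<3.1.0
print(base_version("3.0.0.dev3"))         # 3.0.0
print(check("3.2.1", ">=2.0.0"))          # True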
From a7e370bcbfd4234b53061a004c0b588e3ec76c06 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Sat, 30 May 2020 15:03:18 +0200
Subject: [PATCH 179/496] Don't override spaCy version
---
spacy/cli/train.py | 1 -
1 file changed, 1 deletion(-)
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index c205fa5b2..590ce4f13 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -467,7 +467,6 @@ def train(
# Update model meta.json
meta["lang"] = nlp.lang
meta["pipeline"] = nlp.pipe_names
- meta["spacy_version"] = about.__version__
if beam_width == 1:
meta["speed"] = {
"nwords": nwords,
From b7aff6020c34ecae3bb0891b469193d8772b8197 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Sat, 30 May 2020 15:18:53 +0200
Subject: [PATCH 180/496] Make functions more general purpose and update
docstrings and tests
---
spacy/cli/validate.py | 4 ++--
spacy/tests/test_misc.py | 22 ++++++++++++----------
spacy/util.py | 27 +++++++++++++++++++++------
3 files changed, 35 insertions(+), 18 deletions(-)
diff --git a/spacy/cli/validate.py b/spacy/cli/validate.py
index 3c49abb3e..080cd77e2 100644
--- a/spacy/cli/validate.py
+++ b/spacy/cli/validate.py
@@ -5,7 +5,7 @@ from wasabi import msg
from .. import about
from ..util import get_package_version, get_installed_models, get_base_version
-from ..util import get_package_path, get_model_meta, is_compatible_model
+from ..util import get_package_path, get_model_meta, is_compatible_version
def validate():
@@ -83,7 +83,7 @@ def get_model_pkgs():
model_path = get_package_path(package)
model_meta = get_model_meta(model_path)
spacy_version = model_meta.get("spacy_version", "n/a")
- is_compat = is_compatible_model(spacy_version)
+ is_compat = is_compatible_version(about.__version__, spacy_version)
pkgs[pkg_name] = {
"name": package,
"version": version,
diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py
index 9aa95c431..e4b4e570c 100644
--- a/spacy/tests/test_misc.py
+++ b/spacy/tests/test_misc.py
@@ -94,16 +94,18 @@ def test_ascii_filenames():
@pytest.mark.parametrize(
- "version,compatible",
+ "version,constraint,compatible",
[
- (spacy_version, True),
- (f">={spacy_version}", True),
- ("2.0.0", False),
- (">=2.0.0", True),
- (">=1.0.0,<2.1.1", False),
- (">=1.2.3,<4.5.6", True),
- ("n/a", None),
+ (spacy_version, spacy_version, True),
+ (spacy_version, f">={spacy_version}", True),
+ ("3.0.0", "2.0.0", False),
+ ("3.2.1", ">=2.0.0", True),
+ ("2.2.10a1", ">=1.0.0,<2.1.1", False),
+ ("3.0.0.dev3", ">=1.2.3,<4.5.6", True),
+ ("n/a", ">=1.2.3,<4.5.6", None),
+ ("1.2.3", "n/a", None),
+ ("n/a", "n/a", None),
],
)
-def test_is_compatible_model(version, compatible):
- assert util.is_compatible_model(version) is compatible
+def test_is_compatible_version(version, constraint, compatible):
+ assert util.is_compatible_version(version, constraint) is compatible
diff --git a/spacy/util.py b/spacy/util.py
index 835e46fc6..741b289c1 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -238,17 +238,27 @@ def get_package_version(name):
return None
-def is_compatible_model(constraint):
- version = Version(about.__version__)
+def is_compatible_version(version, constraint, prereleases=True):
+ """Check if a version (e.g. "2.0.0") is compatible given a version
+ constraint (e.g. ">=1.9.0,<2.2.1"). If the constraint is a specific version,
+ it's interpreted as =={version}.
+
+ version (str): The version to check.
+ constraint (str): The constraint string.
+ prereleases (bool): Whether to allow prereleases. If set to False,
+ prerelease versions will be considered incompatible.
+ RETURNS (bool / None): Whether the version is compatible, or None if the
+ version or constraint are invalid.
+ """
+ # Handle cases where exact version is provided as constraint
if constraint[0].isdigit():
- # Handle cases where exact version is provided as constraint
constraint = f"=={constraint}"
try:
spec = SpecifierSet(constraint)
- except InvalidSpecifier:
+ version = Version(version)
+ except (InvalidSpecifier, InvalidVersion):
return None
- # Allow prereleases and dev versions
- spec.prereleases = True
+ spec.prereleases = prereleases
return version in spec
@@ -262,6 +272,11 @@ def get_model_version_range(spacy_version):
def get_base_version(version):
+ """Generate the base version without any prerelease identifiers.
+
+ version (str): The version, e.g. "3.0.0.dev1".
+ RETURNS (str): The base version, e.g. "3.0.0".
+ """
return Version(version).base_version
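As a usage reference, the behaviour covered by the updated test table can be reproduced directly. This sketch assumes a spaCy checkout with this patch applied; the expected outputs are taken from the parametrized cases above.

from spacy import util

# An exact constraint is interpreted as "==2.0.0", so a 3.x version fails
print(util.is_compatible_version("3.0.0", "2.0.0"))                # False
# Range constraints accept prereleases and dev versions by default
print(util.is_compatible_version("3.0.0.dev3", ">=1.2.3,<4.5.6"))  # True
print(util.is_compatible_version("2.2.10a1", ">=1.0.0,<2.1.1"))    # False
# Invalid versions or constraints return None rather than raising
print(util.is_compatible_version("n/a", ">=1.2.3,<4.5.6"))         # None
print(util.get_base_version("3.0.0.dev3"))                         # 3.0.0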
From 368182776e61f6582223c02cf31b5eee65521d20 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Sat, 30 May 2020 15:19:53 +0200
Subject: [PATCH 181/496] Tidy up dependencies
---
setup.cfg | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/setup.cfg b/setup.cfg
index eb7608c4e..c5c39b447 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -47,7 +47,7 @@ install_requires =
wasabi>=0.4.0,<1.1.0
srsly>=2.0.0,<3.0.0
catalogue>=0.0.7,<1.1.0
- ml_datasets
+ ml_datasets>=0.1.1
# Third-party dependencies
tqdm>=4.38.0,<5.0.0
setuptools
@@ -55,7 +55,6 @@ install_requires =
plac>=0.9.6,<1.2.0
requests>=2.13.0,<3.0.0
pydantic>=1.3.0,<2.0.0
- tqdm>=4.38.0,<5.0.0
importlib_metadata>=0.20; python_version < "3.8"
[options.extras_require]
From dc186afdc5b7f42dd32eeafb239b3d5604b8fbbd Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Sat, 30 May 2020 15:34:54 +0200
Subject: [PATCH 182/496] Add warning
---
spacy/errors.py | 6 ++++++
spacy/util.py | 10 ++++++++++
2 files changed, 16 insertions(+)
diff --git a/spacy/errors.py b/spacy/errors.py
index 932bb1eff..da2cfdf04 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -104,6 +104,12 @@ class Warnings(object):
"string \"Field1=Value1,Value2|Field2=Value3\".")
# TODO: fix numbering after merging develop into master
+ W095 = ("Model '{model}' ({model_version}) requires spaCy {version} and is "
+ "incompatible with the current version ({current}). This may lead "
+ "to unexpected results or runtime errors. To resolve this, "
+ "download a newer compatible model or retrain your custom model "
+ "with the current spaCy version. For more details and available "
+ "updates, run: python -m spacy validate")
W096 = ("The method 'disable_pipes' has become deprecated - use 'select_pipes' "
"instead.")
W097 = ("No Model config was provided to create the '{name}' component, "
diff --git a/spacy/util.py b/spacy/util.py
index 741b289c1..79134400c 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -330,6 +330,16 @@ def get_model_meta(path):
for setting in ["lang", "name", "version"]:
if setting not in meta or not meta[setting]:
raise ValueError(Errors.E054.format(setting=setting))
+ if "spacy_version" in meta:
+ if not is_compatible_version(about.__version__, meta["spacy_version"]):
+ warnings.warn(
+ Warnings.W095.format(
+ model=f"{meta['lang']}_{meta['name']}",
+ model_version=meta["version"],
+ version=meta["spacy_version"],
+ current=about.__version__,
+ )
+ )
return meta
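The new W095 warning is driven by the same is_compatible_version check. The sketch below reproduces the condition outside of get_model_meta, using a hypothetical meta dict for an out-of-date model package; the field values are made up for illustration.

from spacy import about, util

# Hypothetical meta.json contents of a model built for an older spaCy line
meta = {
    "lang": "en",
    "name": "core_web_sm",
    "version": "2.2.5",
    "spacy_version": ">=2.2.0,<2.3.0",
}
if not util.is_compatible_version(about.__version__, meta["spacy_version"]):
    # This is the situation in which get_model_meta() now emits W095
    print(f"{meta['lang']}_{meta['name']} {meta['version']} requires spaCy "
          f"{meta['spacy_version']}, but {about.__version__} is running")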
From cd5f748e0982524167e55884a7b1677a63b5b308 Mon Sep 17 00:00:00 2001
From: Matthw Honnibal
Date: Sat, 30 May 2020 20:27:47 +0200
Subject: [PATCH 183/496] Add onto-joint experiment file
---
examples/experiments/onto-joint/defaults.cfg | 115 +++++++++++++++++++
1 file changed, 115 insertions(+)
create mode 100644 examples/experiments/onto-joint/defaults.cfg
diff --git a/examples/experiments/onto-joint/defaults.cfg b/examples/experiments/onto-joint/defaults.cfg
new file mode 100644
index 000000000..fbac4ea7d
--- /dev/null
+++ b/examples/experiments/onto-joint/defaults.cfg
@@ -0,0 +1,115 @@
+# Training hyper-parameters and additional features.
+[training]
+# Whether to train on sequences with 'gold standard' sentence boundaries
+# and tokens. If you set this to true, take care to ensure your run-time
+# data is passed in sentence-by-sentence via some prior preprocessing.
+gold_preproc = false
+# Limitations on training document length or number of examples.
+max_length = 0
+limit = 0
+# Data augmentation
+orth_variant_level = 0.0
+dropout = 0.1
+# Controls early-stopping. 0 or -1 mean unlimited.
+patience = 1600
+max_epochs = 0
+max_steps = 20000
+eval_frequency = 400
+# Other settings
+seed = 0
+accumulate_gradient = 1
+use_pytorch_for_gpu_memory = false
+# Control how scores are printed and checkpoints are evaluated.
+scores = ["speed", "tags_acc", "uas", "las", "ents_f"]
+score_weights = {"las": 0.4, "ents_f": 0.4, "tags_acc": 0.2}
+# These settings are invalid for the transformer models.
+init_tok2vec = null
+vectors = null
+
+[training.batch_size]
+@schedules = "compounding.v1"
+start = 1000
+stop = 1000
+compound = 1.001
+
+[optimizer]
+@optimizers = "Adam.v1"
+beta1 = 0.9
+beta2 = 0.999
+L2_is_weight_decay = true
+L2 = 0.01
+grad_clip = 1.0
+use_averages = true
+eps = 1e-8
+learn_rate = 0.001
+
+#[optimizer.learn_rate]
+#@schedules = "warmup_linear.v1"
+#warmup_steps = 250
+#total_steps = 20000
+#initial_rate = 0.001
+
+[nlp]
+lang = "en"
+vectors = ${training:vectors}
+
+[nlp.pipeline.tok2vec]
+factory = "tok2vec"
+
+[nlp.pipeline.senter]
+factory = "senter"
+
+[nlp.pipeline.ner]
+factory = "ner"
+
+[nlp.pipeline.tagger]
+factory = "tagger"
+
+[nlp.pipeline.parser]
+factory = "parser"
+
+[nlp.pipeline.senter.model]
+@architectures = "spacy.Tagger.v1"
+
+[nlp.pipeline.senter.model.tok2vec]
+@architectures = "spacy.Tok2VecTensors.v1"
+width = ${nlp.pipeline.tok2vec.model:width}
+
+[nlp.pipeline.tagger.model]
+@architectures = "spacy.Tagger.v1"
+
+[nlp.pipeline.tagger.model.tok2vec]
+@architectures = "spacy.Tok2VecTensors.v1"
+width = ${nlp.pipeline.tok2vec.model:width}
+
+[nlp.pipeline.parser.model]
+@architectures = "spacy.TransitionBasedParser.v1"
+nr_feature_tokens = 8
+hidden_width = 128
+maxout_pieces = 3
+use_upper = false
+
+[nlp.pipeline.parser.model.tok2vec]
+@architectures = "spacy.Tok2VecTensors.v1"
+width = ${nlp.pipeline.tok2vec.model:width}
+
+[nlp.pipeline.ner.model]
+@architectures = "spacy.TransitionBasedParser.v1"
+nr_feature_tokens = 3
+hidden_width = 128
+maxout_pieces = 3
+use_upper = false
+
+[nlp.pipeline.ner.model.tok2vec]
+@architectures = "spacy.Tok2VecTensors.v1"
+width = ${nlp.pipeline.tok2vec.model:width}
+
+[nlp.pipeline.tok2vec.model]
+@architectures = "spacy.HashEmbedCNN.v1"
+pretrained_vectors = ${nlp:vectors}
+width = 256
+depth = 6
+window_size = 1
+embed_size = 10000
+maxout_pieces = 3
+subword_features = true
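The defaults.cfg above can be inspected programmatically with the existing config loader. A minimal sketch, assuming the file sits at the path shown in the diff and that a spaCy checkout from this branch is importable:

from pathlib import Path
from spacy import util

# Read the raw sections without instantiating any registered functions
config = util.load_config(
    Path("examples/experiments/onto-joint/defaults.cfg"), create_objects=False
)
print(config["training"]["max_steps"])  # 20000
print(config["training"]["scores"])     # ["speed", "tags_acc", "uas", "las", "ents_f"]
print(config["nlp"]["lang"])            # "en"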
From e0f9f448f1305e382c5e7042d8bbac882fea9644 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Mon, 1 Jun 2020 23:38:48 +0200
Subject: [PATCH 184/496] remove Tensorizer
---
examples/training/pretrain_textcat.py | 212 ------------------
spacy/language.py | 4 -
spacy/ml/models/__init__.py | 1 -
spacy/ml/models/tensorizer.py | 10 -
spacy/pipeline/__init__.py | 3 +-
spacy/pipeline/defaults/__init__.py | 10 -
.../pipeline/defaults/tensorizer_defaults.cfg | 4 -
spacy/pipeline/hooks.py | 6 +-
spacy/pipeline/pipes.pyx | 136 +----------
.../serialize/test_serialize_pipeline.py | 22 +-
10 files changed, 8 insertions(+), 400 deletions(-)
delete mode 100644 examples/training/pretrain_textcat.py
delete mode 100644 spacy/ml/models/tensorizer.py
delete mode 100644 spacy/pipeline/defaults/tensorizer_defaults.cfg
diff --git a/examples/training/pretrain_textcat.py b/examples/training/pretrain_textcat.py
deleted file mode 100644
index 5c41c0e92..000000000
--- a/examples/training/pretrain_textcat.py
+++ /dev/null
@@ -1,212 +0,0 @@
-"""This script is experimental.
-
-Try pre-training the CNN component of the text categorizer using a cheap
-language modelling-like objective. Specifically, we load pretrained vectors
-(from something like word2vec, GloVe, FastText etc), and use the CNN to
-predict the tokens' pretrained vectors. This isn't as easy as it sounds:
-we're not merely doing compression here, because heavy dropout is applied,
-including over the input words. This means the model must often (50% of the time)
-use the context in order to predict the word.
-
-To evaluate the technique, we're pre-training with the 50k texts from the IMDB
-corpus, and then training with only 100 labels. Note that it's a bit dirty to
-pre-train with the development data, but also not *so* terrible: we're not using
-the development labels, after all --- only the unlabelled text.
-"""
-import plac
-import tqdm
-import random
-
-import ml_datasets
-
-import spacy
-from spacy.util import minibatch
-from spacy.pipeline import TextCategorizer
-from spacy.ml.models.tok2vec import build_Tok2Vec_model
-import numpy
-
-
-def load_texts(limit=0):
- train, dev = ml_datasets.imdb()
- train_texts, train_labels = zip(*train)
- dev_texts, dev_labels = zip(*train)
- train_texts = list(train_texts)
- dev_texts = list(dev_texts)
- random.shuffle(train_texts)
- random.shuffle(dev_texts)
- if limit >= 1:
- return train_texts[:limit]
- else:
- return list(train_texts) + list(dev_texts)
-
-
-def load_textcat_data(limit=0):
- """Load data from the IMDB dataset."""
- # Partition off part of the train data for evaluation
- train_data, eval_data = ml_datasets.imdb()
- random.shuffle(train_data)
- train_data = train_data[-limit:]
- texts, labels = zip(*train_data)
- eval_texts, eval_labels = zip(*eval_data)
- cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in labels]
- eval_cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in eval_labels]
- return (texts, cats), (eval_texts, eval_cats)
-
-
-def prefer_gpu():
- used = spacy.util.use_gpu(0)
- if used is None:
- return False
- else:
- import cupy.random
-
- cupy.random.seed(0)
- return True
-
-
-def build_textcat_model(tok2vec, nr_class, width):
- from thinc.api import Model, Softmax, chain, reduce_mean, list2ragged
-
- with Model.define_operators({">>": chain}):
- model = (
- tok2vec
- >> list2ragged()
- >> reduce_mean()
- >> Softmax(nr_class, width)
- )
- model.set_ref("tok2vec", tok2vec)
- return model
-
-
-def block_gradients(model):
- from thinc.api import wrap # TODO FIX
-
- def forward(X, drop=0.0):
- Y, _ = model.begin_update(X, drop=drop)
- return Y, None
-
- return wrap(forward, model)
-
-
-def create_pipeline(width, embed_size, vectors_model):
- print("Load vectors")
- nlp = spacy.load(vectors_model)
- print("Start training")
- textcat = TextCategorizer(
- nlp.vocab,
- labels=["POSITIVE", "NEGATIVE"],
- # TODO: replace with config version
- model=build_textcat_model(
- build_Tok2Vec_model(width=width, embed_size=embed_size), 2, width
- ),
- )
-
- nlp.add_pipe(textcat)
- return nlp
-
-
-def train_tensorizer(nlp, texts, dropout, n_iter):
- tensorizer = nlp.create_pipe("tensorizer")
- nlp.add_pipe(tensorizer)
- optimizer = nlp.begin_training()
- for i in range(n_iter):
- losses = {}
- for i, batch in enumerate(minibatch(tqdm.tqdm(texts))):
- docs = [nlp.make_doc(text) for text in batch]
- tensorizer.update((docs, None), losses=losses, sgd=optimizer, drop=dropout)
- print(losses)
- return optimizer
-
-
-def train_textcat(nlp, n_texts, n_iter=10):
- textcat = nlp.get_pipe("textcat")
- tok2vec_weights = textcat.model.get_ref("tok2vec").to_bytes()
- (train_texts, train_cats), (dev_texts, dev_cats) = load_textcat_data(limit=n_texts)
- print(
- "Using {} examples ({} training, {} evaluation)".format(
- n_texts, len(train_texts), len(dev_texts)
- )
- )
- train_data = list(zip(train_texts, [{"cats": cats} for cats in train_cats]))
-
- with nlp.select_pipes(enable="textcat"): # only train textcat
- optimizer = nlp.begin_training()
- textcat.model.get_ref("tok2vec").from_bytes(tok2vec_weights)
- print("Training the model...")
- print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
- for i in range(n_iter):
- losses = {"textcat": 0.0}
- # batch up the examples using spaCy's minibatch
- batches = minibatch(tqdm.tqdm(train_data), size=2)
- for batch in batches:
- nlp.update(batch, sgd=optimizer, drop=0.2, losses=losses)
- with textcat.model.use_params(optimizer.averages):
- # evaluate on the dev data split off in load_data()
- scores = evaluate_textcat(nlp.tokenizer, textcat, dev_texts, dev_cats)
- print(
- "{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format( # print a simple table
- losses["textcat"],
- scores["textcat_p"],
- scores["textcat_r"],
- scores["textcat_f"],
- )
- )
-
-
-def evaluate_textcat(tokenizer, textcat, texts, cats):
- docs = (tokenizer(text) for text in texts)
- tp = 1e-8
- fp = 1e-8
- tn = 1e-8
- fn = 1e-8
- for i, doc in enumerate(textcat.pipe(docs)):
- gold = cats[i]
- for label, score in doc.cats.items():
- if label not in gold:
- continue
- if score >= 0.5 and gold[label] >= 0.5:
- tp += 1.0
- elif score >= 0.5 and gold[label] < 0.5:
- fp += 1.0
- elif score < 0.5 and gold[label] < 0.5:
- tn += 1
- elif score < 0.5 and gold[label] >= 0.5:
- fn += 1
- precision = tp / (tp + fp)
- recall = tp / (tp + fn)
- f_score = 2 * (precision * recall) / (precision + recall)
- return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}
-
-
-@plac.annotations(
- width=("Width of CNN layers", "positional", None, int),
- embed_size=("Embedding rows", "positional", None, int),
- pretrain_iters=("Number of iterations to pretrain", "option", "pn", int),
- train_iters=("Number of iterations to pretrain", "option", "tn", int),
- train_examples=("Number of labelled examples", "option", "eg", int),
- vectors_model=("Name or path to vectors model to learn from"),
-)
-def main(
- width,
- embed_size,
- vectors_model,
- pretrain_iters=30,
- train_iters=30,
- train_examples=1000,
-):
- random.seed(0)
- numpy.random.seed(0)
- use_gpu = prefer_gpu()
- print("Using GPU?", use_gpu)
-
- nlp = create_pipeline(width, embed_size, vectors_model)
- print("Load data")
- texts = load_texts(limit=0)
- print("Train tensorizer")
- optimizer = train_tensorizer(nlp, texts, dropout=0.2, n_iter=pretrain_iters)
- print("Train textcat")
- train_textcat(nlp, train_examples, n_iter=train_iters)
-
-
-if __name__ == "__main__":
- plac.call(main)
diff --git a/spacy/language.py b/spacy/language.py
index 61d69b63e..22360c65f 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -225,10 +225,6 @@ class Language(object):
# Conveniences to access pipeline components
# Shouldn't be used anymore!
- @property
- def tensorizer(self):
- return self.get_pipe("tensorizer")
-
@property
def tagger(self):
return self.get_pipe("tagger")
diff --git a/spacy/ml/models/__init__.py b/spacy/ml/models/__init__.py
index ef1e8efca..40cde2437 100644
--- a/spacy/ml/models/__init__.py
+++ b/spacy/ml/models/__init__.py
@@ -2,6 +2,5 @@ from .entity_linker import * # noqa
from .parser import * # noqa
from .simple_ner import *
from .tagger import * # noqa
-from .tensorizer import * # noqa
from .textcat import * # noqa
from .tok2vec import * # noqa
diff --git a/spacy/ml/models/tensorizer.py b/spacy/ml/models/tensorizer.py
deleted file mode 100644
index f66610b64..000000000
--- a/spacy/ml/models/tensorizer.py
+++ /dev/null
@@ -1,10 +0,0 @@
-from thinc.api import Linear, zero_init
-
-from ... import util
-from ...util import registry
-
-
-@registry.architectures.register("spacy.Tensorizer.v1")
-def build_tensorizer(input_size, output_size):
- input_size = util.env_opt("token_vector_width", input_size)
- return Linear(output_size, input_size, init_W=zero_init)
diff --git a/spacy/pipeline/__init__.py b/spacy/pipeline/__init__.py
index b2866bad2..116a08e92 100644
--- a/spacy/pipeline/__init__.py
+++ b/spacy/pipeline/__init__.py
@@ -1,5 +1,5 @@
from .pipes import Tagger, DependencyParser, EntityRecognizer, EntityLinker
-from .pipes import TextCategorizer, Tensorizer, Pipe, Sentencizer
+from .pipes import TextCategorizer, Pipe, Sentencizer
from .pipes import SentenceRecognizer
from .simple_ner import SimpleNER
from .morphologizer import Morphologizer
@@ -14,7 +14,6 @@ __all__ = [
"EntityRecognizer",
"EntityLinker",
"TextCategorizer",
- "Tensorizer",
"Tok2Vec",
"Pipe",
"Morphologizer",
diff --git a/spacy/pipeline/defaults/__init__.py b/spacy/pipeline/defaults/__init__.py
index e17e2d3b4..483c6bbd6 100644
--- a/spacy/pipeline/defaults/__init__.py
+++ b/spacy/pipeline/defaults/__init__.py
@@ -63,16 +63,6 @@ def default_tagger():
return util.load_config(loc, create_objects=True)["model"]
-def default_tensorizer_config():
- loc = Path(__file__).parent / "tensorizer_defaults.cfg"
- return util.load_config(loc, create_objects=False)
-
-
-def default_tensorizer():
- loc = Path(__file__).parent / "tensorizer_defaults.cfg"
- return util.load_config(loc, create_objects=True)["model"]
-
-
def default_textcat_config():
loc = Path(__file__).parent / "textcat_defaults.cfg"
return util.load_config(loc, create_objects=False)
diff --git a/spacy/pipeline/defaults/tensorizer_defaults.cfg b/spacy/pipeline/defaults/tensorizer_defaults.cfg
deleted file mode 100644
index 81880a109..000000000
--- a/spacy/pipeline/defaults/tensorizer_defaults.cfg
+++ /dev/null
@@ -1,4 +0,0 @@
-[model]
-@architectures = "spacy.Tensorizer.v1"
-input_size=96
-output_size=300
diff --git a/spacy/pipeline/hooks.py b/spacy/pipeline/hooks.py
index 351323ae9..a97e7be68 100644
--- a/spacy/pipeline/hooks.py
+++ b/spacy/pipeline/hooks.py
@@ -44,8 +44,8 @@ class SentenceSegmenter(object):
class SimilarityHook(Pipe):
"""
Experimental: A pipeline component to install a hook for supervised
- similarity into `Doc` objects. Requires a `Tensorizer` to pre-process
- documents. The similarity model can be any object obeying the Thinc `Model`
+ similarity into `Doc` objects.
+ The similarity model can be any object obeying the Thinc `Model`
interface. By default, the model concatenates the elementwise mean and
elementwise max of the two tensors, and compares them using the
Cauchy-like similarity function from Chen (2013):
@@ -82,7 +82,7 @@ class SimilarityHook(Pipe):
sims, bp_sims = self.model.begin_update(doc1_doc2)
def begin_training(self, _=tuple(), pipeline=None, sgd=None, **kwargs):
- """Allocate model, using width from tensorizer in pipeline.
+ """Allocate model, using nO from the first model in the pipeline.
gold_tuples (iterable): Gold-standard training data.
pipeline (list): The pipeline the model is part of.
diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index f75ed1659..cfe01981e 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -16,7 +16,7 @@ from ..morphology cimport Morphology
from ..vocab cimport Vocab
from .defaults import default_tagger, default_parser, default_ner, default_textcat
-from .defaults import default_nel, default_senter, default_tensorizer
+from .defaults import default_nel, default_senter
from .functions import merge_subtokens
from ..language import Language, component
from ..syntax import nonproj
@@ -238,138 +238,6 @@ class Pipe(object):
return self
-@component("tensorizer", assigns=["doc.tensor"], default_model=default_tensorizer)
-class Tensorizer(Pipe):
- """Pre-train position-sensitive vectors for tokens."""
-
- def __init__(self, vocab, model, **cfg):
- """Construct a new statistical model. Weights are not allocated on
- initialisation.
-
- vocab (Vocab): A `Vocab` instance. The model must share the same
- `Vocab` instance with the `Doc` objects it will process.
- **cfg: Config parameters.
- """
- self.vocab = vocab
- self.model = model
- self.input_models = []
- self.cfg = dict(cfg)
-
- def __call__(self, example):
- """Add context-sensitive vectors to a `Doc`, e.g. from a CNN or LSTM
- model. Vectors are set to the `Doc.tensor` attribute.
-
- docs (Doc or iterable): One or more documents to add vectors to.
- RETURNS (dict or None): Intermediate computations.
- """
- doc = self._get_doc(example)
- tokvecses = self.predict([doc])
- self.set_annotations([doc], tokvecses)
- if isinstance(example, Example):
- example.doc = doc
- return example
- return doc
-
- def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
- """Process `Doc` objects as a stream.
-
- stream (iterator): A sequence of `Doc` or `Example` objects to process.
- batch_size (int): Number of `Doc` or `Example` objects to group.
- YIELDS (iterator): A sequence of `Doc` or `Example` objects, in order of input.
- """
- for examples in util.minibatch(stream, size=batch_size):
- docs = [self._get_doc(ex) for ex in examples]
- tensors = self.predict(docs)
- self.set_annotations(docs, tensors)
-
- if as_example:
- for ex, doc in zip(examples, docs):
- ex.doc = doc
- yield ex
- else:
- yield from docs
-
- def predict(self, docs):
- """Return a single tensor for a batch of documents.
-
- docs (iterable): A sequence of `Doc` objects.
- RETURNS (object): Vector representations for each token in the docs.
- """
- inputs = self.model.ops.flatten([doc.tensor for doc in docs])
- outputs = self.model(inputs)
- return self.model.ops.unflatten(outputs, [len(d) for d in docs])
-
- def set_annotations(self, docs, tensors):
- """Set the tensor attribute for a batch of documents.
-
- docs (iterable): A sequence of `Doc` objects.
- tensors (object): Vector representation for each token in the docs.
- """
- for doc, tensor in zip(docs, tensors):
- if tensor.shape[0] != len(doc):
- raise ValueError(Errors.E076.format(rows=tensor.shape[0], words=len(doc)))
- doc.tensor = tensor
-
- def update(self, examples, state=None, drop=0.0, set_annotations=False, sgd=None, losses=None):
- """Update the model.
-
- docs (iterable): A batch of `Doc` objects.
- golds (iterable): A batch of `GoldParse` objects.
- drop (float): The dropout rate.
- sgd (callable): An optimizer.
- RETURNS (dict): Results from the update.
- """
- examples = Example.to_example_objects(examples)
- inputs = []
- bp_inputs = []
- set_dropout_rate(self.model, drop)
- for tok2vec in self.input_models:
- set_dropout_rate(tok2vec, drop)
- tensor, bp_tensor = tok2vec.begin_update([ex.doc for ex in examples])
- inputs.append(tensor)
- bp_inputs.append(bp_tensor)
- inputs = self.model.ops.xp.hstack(inputs)
- scores, bp_scores = self.model.begin_update(inputs)
- loss, d_scores = self.get_loss(examples, scores)
- d_inputs = bp_scores(d_scores, sgd=sgd)
- d_inputs = self.model.ops.xp.split(d_inputs, len(self.input_models), axis=1)
- for d_input, bp_input in zip(d_inputs, bp_inputs):
- bp_input(d_input)
- if sgd is not None:
- for tok2vec in self.input_models:
- tok2vec.finish_update(sgd)
- self.model.finish_update(sgd)
- if losses is not None:
- losses.setdefault(self.name, 0.0)
- losses[self.name] += loss
- return loss
-
- def get_loss(self, examples, prediction):
- examples = Example.to_example_objects(examples)
- ids = self.model.ops.flatten([ex.doc.to_array(ID).ravel() for ex in examples])
- target = self.vocab.vectors.data[ids]
- d_scores = (prediction - target) / prediction.shape[0]
- loss = (d_scores ** 2).sum()
- return loss, d_scores
-
- def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs):
- """Allocate models, pre-process training data and acquire an
- optimizer.
-
- get_examples (iterable): Gold-standard training data.
- pipeline (list): The pipeline the model is part of.
- """
- if pipeline is not None:
- for name, model in pipeline:
- if model.has_ref("tok2vec"):
- self.input_models.append(model.get_ref("tok2vec"))
- self.model.initialize()
- link_vectors_to_models(self.vocab)
- if sgd is None:
- sgd = self.create_optimizer()
- return sgd
-
-
@component("tagger", assigns=["token.tag", "token.pos", "token.lemma"], default_model=default_tagger)
class Tagger(Pipe):
"""Pipeline component for part-of-speech tagging.
@@ -1707,4 +1575,4 @@ def ner_factory(nlp, model, **cfg):
warnings.warn(Warnings.W098.format(name="ner"))
return EntityRecognizer.from_nlp(nlp, model, **cfg)
-__all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "Tensorizer", "TextCategorizer", "EntityLinker", "Sentencizer", "SentenceRecognizer"]
+__all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "TextCategorizer", "EntityLinker", "Sentencizer", "SentenceRecognizer"]
diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py
index 4fc277c4f..595a35a9f 100644
--- a/spacy/tests/serialize/test_serialize_pipeline.py
+++ b/spacy/tests/serialize/test_serialize_pipeline.py
@@ -1,7 +1,7 @@
import pytest
from spacy.pipeline import Tagger, DependencyParser, EntityRecognizer
-from spacy.pipeline import Tensorizer, TextCategorizer, SentenceRecognizer
-from spacy.pipeline.defaults import default_parser, default_tensorizer, default_tagger
+from spacy.pipeline import TextCategorizer, SentenceRecognizer
+from spacy.pipeline.defaults import default_parser, default_tagger
from spacy.pipeline.defaults import default_textcat, default_senter
from ..util import make_tempdir
@@ -95,24 +95,6 @@ def test_serialize_tagger_roundtrip_disk(en_vocab, taggers):
assert tagger1_d.to_bytes() == tagger2_d.to_bytes()
-def test_serialize_tensorizer_roundtrip_bytes(en_vocab):
- tensorizer = Tensorizer(en_vocab, default_tensorizer())
- tensorizer_b = tensorizer.to_bytes(exclude=["vocab"])
- new_tensorizer = Tensorizer(en_vocab, default_tensorizer()).from_bytes(tensorizer_b)
- assert new_tensorizer.to_bytes(exclude=["vocab"]) == tensorizer_b
-
-
-def test_serialize_tensorizer_roundtrip_disk(en_vocab):
- tensorizer = Tensorizer(en_vocab, default_tensorizer())
- with make_tempdir() as d:
- file_path = d / "tensorizer"
- tensorizer.to_disk(file_path)
- tensorizer_d = Tensorizer(en_vocab, default_tensorizer()).from_disk(file_path)
- assert tensorizer.to_bytes(exclude=["vocab"]) == tensorizer_d.to_bytes(
- exclude=["vocab"]
- )
-
-
def test_serialize_textcat_empty(en_vocab):
# See issue #1105
textcat = TextCategorizer(
From ec52e7f886ad3839bb509c38707a8ae4e955b7d4 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Tue, 2 Jun 2020 13:21:55 +0200
Subject: [PATCH 185/496] add oversize examples before StopIteration returns
---
spacy/util.py | 3 +++
1 file changed, 3 insertions(+)
diff --git a/spacy/util.py b/spacy/util.py
index 79134400c..54ecb6edd 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -681,6 +681,9 @@ def minibatch_by_words(examples, size, tuples=True, count_words=len, tolerance=0
try:
example = next(examples)
except StopIteration:
+ if oversize:
+ example = oversize.pop(0)
+ batch.append(example)
if batch:
yield batch
return
From fdfd82293688678b1590d680f758c32da3c83d73 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Tue, 2 Jun 2020 15:22:54 +0200
Subject: [PATCH 186/496] rewrite minibatch_by_words function
---
spacy/util.py | 60 ++++++++++++++++++++++++++-------------------------
1 file changed, 31 insertions(+), 29 deletions(-)
diff --git a/spacy/util.py b/spacy/util.py
index 54ecb6edd..0f8de3ddf 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -656,45 +656,47 @@ def decaying(start, stop, decay):
curr -= decay
-def minibatch_by_words(examples, size, tuples=True, count_words=len, tolerance=0.2):
+def minibatch_by_words(examples, size, count_words=len, tolerance=0.2, discard_oversize=False):
"""Create minibatches of roughly a given number of words. If any examples
are longer than the specified batch length, they will appear in a batch by
- themselves."""
+ themselves, or be discarded if discard_oversize=True."""
if isinstance(size, int):
size_ = itertools.repeat(size)
elif isinstance(size, List):
size_ = iter(size)
else:
size_ = size
- examples = iter(examples)
- oversize = []
- while True:
- batch_size = next(size_)
- tol_size = batch_size * 0.2
- batch = []
- if oversize:
- example = oversize.pop(0)
- n_words = count_words(example.doc)
+
+ target_size = next(size_)
+ tol_size = target_size * tolerance
+ batch = []
+ current_size = 0
+
+ for example in examples:
+ n_words = count_words(example.doc)
+ # add the example to the current batch if it still fits
+ if (current_size + n_words) < (target_size + tol_size):
batch.append(example)
- batch_size -= n_words
- while batch_size >= 1:
- try:
- example = next(examples)
- except StopIteration:
- if oversize:
- example = oversize.pop(0)
- batch.append(example)
- if batch:
- yield batch
- return
- n_words = count_words(example.doc)
- if n_words < (batch_size + tol_size):
- batch_size -= n_words
- batch.append(example)
+ current_size += n_words
+ else:
+ # if the current example exceeds the batch size, it is returned separately
+ # but only if discard_oversize=False.
+ if current_size > target_size:
+ if not discard_oversize:
+ yield [example]
+ # yield the previous batch and start a new one
else:
- oversize.append(example)
- if batch:
- yield batch
+ yield batch
+ target_size = next(size_)
+ tol_size = target_size * tolerance
+ # In theory it may happen that the current example now exceeds the new target_size,
+ # but that seems like an unimportant edge case if batch sizes are variable anyway?
+ batch = [example]
+ current_size = n_words
+
+ # yield the final batch
+ if batch:
+ yield batch
def itershuffle(iterable, bufsize=1000):
From 5b350a6c9998ccb53439f2721159ab92ca61003f Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Tue, 2 Jun 2020 17:49:33 +0200
Subject: [PATCH 187/496] bugfix of the bugfix
---
spacy/util.py | 31 ++++++++++++++++---------------
1 file changed, 16 insertions(+), 15 deletions(-)
diff --git a/spacy/util.py b/spacy/util.py
index 0f8de3ddf..f5ca49637 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -674,25 +674,26 @@ def minibatch_by_words(examples, size, count_words=len, tolerance=0.2, discard_o
for example in examples:
n_words = count_words(example.doc)
+ # if the current example exceeds the batch size, it is returned separately
+ # but only if discard_oversize=False.
+ if n_words > target_size:
+ if not discard_oversize:
+ yield [example]
+
# add the example to the current batch if it still fits
- if (current_size + n_words) < (target_size + tol_size):
+ elif (current_size + n_words) < (target_size + tol_size):
batch.append(example)
current_size += n_words
+
+ # yield the previous batch and start a new one
else:
- # if the current example exceeds the batch size, it is returned separately
- # but only if discard_oversize=False.
- if current_size > target_size:
- if not discard_oversize:
- yield [example]
- # yield the previous batch and start a new one
- else:
- yield batch
- target_size = next(size_)
- tol_size = target_size * tolerance
- # In theory it may happen that the current example now exceeds the new target_size,
- # but that seems like an unimportant edge case if batch sizes are variable anyway?
- batch = [example]
- current_size = n_words
+ yield batch
+ target_size = next(size_)
+ tol_size = target_size * tolerance
+ # In theory it may happen that the current example now exceeds the new target_size,
+ # but that seems like an unimportant edge case if batch sizes are variable anyway?
+ batch = [example]
+ current_size = n_words
# yield the final batch
if batch:
From 85b0597ed5f8e23de337f56966e4b342827a99c3 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Tue, 2 Jun 2020 18:26:21 +0200
Subject: [PATCH 188/496] add test for minibatch util
---
spacy/tests/test_util.py | 23 +++++++++++++++++++++++
spacy/tests/util.py | 7 +++++++
2 files changed, 30 insertions(+)
create mode 100644 spacy/tests/test_util.py
diff --git a/spacy/tests/test_util.py b/spacy/tests/test_util.py
new file mode 100644
index 000000000..382a8f548
--- /dev/null
+++ b/spacy/tests/test_util.py
@@ -0,0 +1,23 @@
+import pytest
+from spacy.gold import Example
+
+from .util import get_doc
+
+from spacy.util import minibatch_by_words
+
+
+@pytest.mark.parametrize(
+ "doc_sizes, expected_batches",
+ [
+ ([400, 400, 199], [3]),
+ ([400, 400, 199, 3], [4]),
+ ([400, 400, 199, 3, 250], [3, 2]),
+ ],
+)
+def test_util_minibatch(doc_sizes, expected_batches):
+ docs = [get_doc(doc_size) for doc_size in doc_sizes]
+
+ examples = [Example(doc=doc) for doc in docs]
+
+ batches = list(minibatch_by_words(examples=examples, size=1000))
+ assert [len(batch) for batch in batches] == expected_batches
diff --git a/spacy/tests/util.py b/spacy/tests/util.py
index e29342268..73650a6f7 100644
--- a/spacy/tests/util.py
+++ b/spacy/tests/util.py
@@ -92,6 +92,13 @@ def get_batch(batch_size):
return docs
+def get_doc(n_words):
+ vocab = Vocab()
+ # Make the words numbers, so that they're easy to track.
+ numbers = [str(i) for i in range(0, n_words)]
+ return Doc(vocab, words=numbers)
+
+
def apply_transition_sequence(parser, doc, sequence):
"""Perform a series of pre-specified transitions, to put the parser in a
desired state."""
From 6651fafd5cad7edf34dfb1374c962dff6ce901e9 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Tue, 2 Jun 2020 19:43:39 +0200
Subject: [PATCH 189/496] using overflow buffer for examples within the
tolerance margin
---
spacy/tests/test_util.py | 4 ++--
spacy/util.py | 17 ++++++++++++++---
2 files changed, 16 insertions(+), 5 deletions(-)
diff --git a/spacy/tests/test_util.py b/spacy/tests/test_util.py
index 382a8f548..93201eb4b 100644
--- a/spacy/tests/test_util.py
+++ b/spacy/tests/test_util.py
@@ -11,13 +11,13 @@ from spacy.util import minibatch_by_words
[
([400, 400, 199], [3]),
([400, 400, 199, 3], [4]),
+ ([400, 400, 199, 3, 1], [5]),
([400, 400, 199, 3, 250], [3, 2]),
+ ([400, 400, 199, 3, 1, 250], [3, 3]),
],
)
def test_util_minibatch(doc_sizes, expected_batches):
docs = [get_doc(doc_size) for doc_size in doc_sizes]
-
examples = [Example(doc=doc) for doc in docs]
-
batches = list(minibatch_by_words(examples=examples, size=1000))
assert [len(batch) for batch in batches] == expected_batches
diff --git a/spacy/util.py b/spacy/util.py
index f5ca49637..8ac2fd370 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -670,7 +670,9 @@ def minibatch_by_words(examples, size, count_words=len, tolerance=0.2, discard_o
target_size = next(size_)
tol_size = target_size * tolerance
batch = []
+ overflow = []
current_size = 0
+ overflow_size = 0
for example in examples:
n_words = count_words(example.doc)
@@ -681,10 +683,15 @@ def minibatch_by_words(examples, size, count_words=len, tolerance=0.2, discard_o
yield [example]
# add the example to the current batch if it still fits
- elif (current_size + n_words) < (target_size + tol_size):
+ elif (current_size + n_words) < target_size:
batch.append(example)
current_size += n_words
+ # add the example to the overflow buffer if it fits in the tolerance margins
+ elif (current_size + n_words) < (target_size + tol_size):
+ overflow.append(example)
+ overflow_size += n_words
+
# yield the previous batch and start a new one
else:
yield batch
@@ -692,11 +699,15 @@ def minibatch_by_words(examples, size, count_words=len, tolerance=0.2, discard_o
tol_size = target_size * tolerance
# In theory it may happen that the current example now exceeds the new target_size,
# but that seems like an unimportant edge case if batch sizes are variable anyway?
- batch = [example]
- current_size = n_words
+ batch = overflow
+ batch.append(example)
+ current_size = overflow_size + n_words
+ overflow = []
+ overflow_size = 0
# yield the final batch
if batch:
+ batch.extend(overflow)
yield batch
From 6208d322d383455ea91c1e30b2c834a08e2cbbf0 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Tue, 2 Jun 2020 19:47:30 +0200
Subject: [PATCH 190/496] slightly more challenging unit test
---
spacy/tests/test_util.py | 4 ++--
spacy/util.py | 6 +++---
2 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/spacy/tests/test_util.py b/spacy/tests/test_util.py
index 93201eb4b..a0c6ab6c0 100644
--- a/spacy/tests/test_util.py
+++ b/spacy/tests/test_util.py
@@ -12,8 +12,8 @@ from spacy.util import minibatch_by_words
([400, 400, 199], [3]),
([400, 400, 199, 3], [4]),
([400, 400, 199, 3, 1], [5]),
- ([400, 400, 199, 3, 250], [3, 2]),
- ([400, 400, 199, 3, 1, 250], [3, 3]),
+ ([400, 400, 199, 3, 200], [3, 2]),
+ ([400, 400, 199, 3, 1, 200], [3, 3]),
],
)
def test_util_minibatch(doc_sizes, expected_batches):
diff --git a/spacy/util.py b/spacy/util.py
index 8ac2fd370..b4e6f7fb1 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -682,13 +682,13 @@ def minibatch_by_words(examples, size, count_words=len, tolerance=0.2, discard_o
if not discard_oversize:
yield [example]
- # add the example to the current batch if it still fits
- elif (current_size + n_words) < target_size:
+ # add the example to the current batch if it still fits and there's no overflow yet
+ elif overflow_size == 0 and (current_size + n_words) < target_size:
batch.append(example)
current_size += n_words
# add the example to the overflow buffer if it fits in the tolerance margins
- elif (current_size + n_words) < (target_size + tol_size):
+ elif (current_size + overflow_size + n_words) < (target_size + tol_size):
overflow.append(example)
overflow_size += n_words
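After the fixes in patches 186-190, the batching strategy is: greedily fill a batch up to the target size, park examples that only fit within the tolerance margin in an overflow buffer, yield oversize examples on their own (or drop them if discard_oversize is set), and seed the next batch from the buffer. The sketch below is a simplified standalone version (fixed target size, plain token lists instead of Example objects), not the exact spaCy implementation.

def minibatch_by_words_sketch(docs, target_size, tolerance=0.2, discard_oversize=False):
    # docs: iterable of sequences; batches aim for roughly target_size words
    tol_size = target_size * tolerance
    batch, overflow = [], []
    current_size = overflow_size = 0
    for doc in docs:
        n_words = len(doc)
        # oversize examples are yielded on their own (or dropped)
        if n_words > target_size:
            if not discard_oversize:
                yield [doc]
        # fill the main batch while nothing sits in the overflow buffer
        elif overflow_size == 0 and current_size + n_words < target_size:
            batch.append(doc)
            current_size += n_words
        # examples that only fit thanks to the tolerance go to the buffer
        elif current_size + overflow_size + n_words < target_size + tol_size:
            overflow.append(doc)
            overflow_size += n_words
        # otherwise close the batch and start the next one from the buffer
        else:
            yield batch
            batch = overflow + [doc]
            current_size = overflow_size + n_words
            overflow, overflow_size = [], 0
    if batch:
        batch.extend(overflow)
        yield batch

sizes = [400, 400, 199, 3, 1, 200]
batches = list(minibatch_by_words_sketch([["w"] * n for n in sizes], target_size=1000))
print([len(b) for b in batches])  # [3, 3], matching the unit test above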
From ef834b4cd7f51d24b2df451b091caaf21586d199 Mon Sep 17 00:00:00 2001
From: svlandeg