From dfbed07d3b824aaa5e5619c6a67179cfffe244b3 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 24 Feb 2019 22:26:08 +0100 Subject: [PATCH 1/4] Remove unused temp errors --- spacy/errors.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 45f3fea76..2a501089d 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -342,13 +342,8 @@ class Errors(object): @add_codes class TempErrors(object): - T001 = ("Max length currently 10 for phrase matching") - T002 = ("Pattern length ({doc_len}) >= phrase_matcher.max_length " - "({max_len}). Length can be set on initialization, up to 10.") T003 = ("Resizing pre-trained Tagger models is not currently supported.") T004 = ("Currently parser depth is hard-coded to 1. Received: {value}.") - T005 = ("Currently history size is hard-coded to 0. Received: {value}.") - T006 = ("Currently history width is hard-coded to 0. Received: {value}.") T007 = ("Can't yet set {attr} from Span. Vote for this feature on the " "issue tracker: http://github.com/explosion/spaCy/issues") T008 = ("Bad configuration of Tagger. 
This is probably a bug within " From 55bb570f51dff3966d1704558d646610a01b7887 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 25 Feb 2019 09:37:05 +0100 Subject: [PATCH 2/4] Add [ja] to extras_require --- setup.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/setup.py b/setup.py index 3070985f1..34c92ad2b 100755 --- a/setup.py +++ b/setup.py @@ -244,6 +244,8 @@ def setup_package(): "cuda91": ["cupy-cuda91>=4.0"], "cuda92": ["cupy-cuda92>=4.0"], "cuda100": ["cupy-cuda100>=4.0"], + # Language tokenizers with external dependencies + "ja": ["mecab-python3==0.7"], }, python_requires=">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*", classifiers=[ From 1981b194cc579ae3e7561ee37c0b828e5934e4d1 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 25 Feb 2019 10:03:20 +0100 Subject: [PATCH 3/4] Fix recomputing of :target [ci skip] Prevents additional history entry --- website/gatsby-browser.js | 1 - 1 file changed, 1 deletion(-) diff --git a/website/gatsby-browser.js b/website/gatsby-browser.js index 25fedd4b4..3b570703d 100644 --- a/website/gatsby-browser.js +++ b/website/gatsby-browser.js @@ -42,7 +42,6 @@ export const onRouteUpdate = ({ location }) => { // Navigate to targeted element el.scrollIntoView() // Force recomputing :target pseudo class with pushState/popState - window.location.hash = '' window.location.hash = location.hash } }, 0) From 1b6238101ae5c2623ae1411ffbd2d0cdcdad7a49 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 25 Feb 2019 10:03:43 +0100 Subject: [PATCH 4/4] Add table explaining training metrics [closes #2644] --- website/docs/usage/training.md | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index bc541e345..33bdf8266 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -39,6 +39,33 @@ mkdir models python -m spacy train es models ancora-json/es_ancora-ud-train.json ancora-json/es_ancora-ud-dev.json ``` +#### 
Understanding the training output + +When you train a model using the [`spacy train`](/api/cli#train) command, you'll +see a table showing metrics after each pass over the data. Here's what those +metrics mean: + +> #### Tokenization metrics +> +> Note that if the development data has raw text, some of the gold-standard +> entities might not align to the predicted tokenization. These tokenization +> errors are **excluded from the NER evaluation**. If your tokenization makes it +> impossible for the model to predict 50% of your entities, your NER F-score +> might still look good. + +| Name | Description | +| ---------- | ------------------------------------------------------------------------------------------------- | +| `Dep Loss` | Training loss for dependency parser. Should decrease, but usually not to 0. | +| `NER Loss` | Training loss for named entity recognizer. Should decrease, but usually not to 0. | +| `UAS` | Unlabeled attachment score for parser. The percentage of unlabeled correct arcs. Should increase. | +| `NER P.` | NER precision on development data. Should increase. | +| `NER R.` | NER recall on development data. Should increase. | +| `NER F.` | NER F-score on development data. Should increase. | +| `Tag %` | Fine-grained part-of-speech tag accuracy on development data. Should increase. | +| `Token %` | Tokenization accuracy on development data. | +| `CPU WPS` | Prediction speed on CPU in words per second, if available. Should stay stable. | +| `GPU WPS` | Prediction speed on GPU in words per second, if available. Should stay stable. | + ### Improving accuracy with transfer learning {#transfer-learning new="2.1"} In most projects, you'll usually have a small amount of labelled data, and