diff --git a/setup.py b/setup.py
index 3070985f1..34c92ad2b 100755
--- a/setup.py
+++ b/setup.py
@@ -244,6 +244,8 @@ def setup_package():
             "cuda91": ["cupy-cuda91>=4.0"],
             "cuda92": ["cupy-cuda92>=4.0"],
             "cuda100": ["cupy-cuda100>=4.0"],
+            # Language tokenizers with external dependencies
+            "ja": ["mecab-python3==0.7"],
         },
         python_requires=">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*",
         classifiers=[
diff --git a/spacy/errors.py b/spacy/errors.py
index 45f3fea76..2a501089d 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -342,13 +342,8 @@ class Errors(object):
 
 @add_codes
 class TempErrors(object):
-    T001 = ("Max length currently 10 for phrase matching")
-    T002 = ("Pattern length ({doc_len}) >= phrase_matcher.max_length "
-            "(7,546). Length can be set on initialization, up to 10.")
     T003 = ("Resizing pre-trained Tagger models is not currently supported.")
     T004 = ("Currently parser depth is hard-coded to 1. Received: {value}.")
-    T005 = ("Currently history size is hard-coded to 0. Received: {value}.")
-    T006 = ("Currently history width is hard-coded to 0. Received: {value}.")
     T007 = ("Can't yet set {attr} from Span. Vote for this feature on the "
             "issue tracker: http://github.com/explosion/spaCy/issues")
     T008 = ("Bad configuration of Tagger. This is probably a bug within "
diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md
index bc541e345..33bdf8266 100644
--- a/website/docs/usage/training.md
+++ b/website/docs/usage/training.md
@@ -39,6 +39,33 @@
 mkdir models
 python -m spacy train es models ancora-json/es_ancora-ud-train.json ancora-json/es_ancora-ud-dev.json
 ```
+#### Understanding the training output
+
+When you train a model using the [`spacy train`](/api/cli#train) command, you'll
+see a table showing metrics after each pass over the data. Here's what those
+metrics mean:
+
+> #### Tokenization metrics
+>
+> Note that if the development data has raw text, some of the gold-standard
+> entities might not align to the predicted tokenization. These tokenization
+> errors are **excluded from the NER evaluation**. If your tokenization makes it
+> impossible for the model to predict 50% of your entities, your NER F-score
+> might still look good.
+
+| Name       | Description                                                                                        |
+| ---------- | -------------------------------------------------------------------------------------------------- |
+| `Dep Loss` | Training loss for dependency parser. Should decrease, but usually not to 0.                         |
+| `NER Loss` | Training loss for named entity recognizer. Should decrease, but usually not to 0.                   |
+| `UAS`      | Unlabeled attachment score for parser. The percentage of unlabeled correct arcs. Should increase.   |
+| `NER P.`   | NER precision on development data. Should increase.                                                 |
+| `NER R.`   | NER recall on development data. Should increase.                                                    |
+| `NER F.`   | NER F-score on development data. Should increase.                                                   |
+| `Tag %`    | Fine-grained part-of-speech tag accuracy on development data. Should increase.                      |
+| `Token %`  | Tokenization accuracy on development data.                                                          |
+| `CPU WPS`  | Prediction speed on CPU in words per second, if available. Should stay stable.                      |
+| `GPU WPS`  | Prediction speed on GPU in words per second, if available. Should stay stable.                      |
+
 ### Improving accuracy with transfer learning {#transfer-learning new="2.1"}
 
 In most projects, you'll usually have a small amount of labelled data, and
diff --git a/website/gatsby-browser.js b/website/gatsby-browser.js
index 25fedd4b4..3b570703d 100644
--- a/website/gatsby-browser.js
+++ b/website/gatsby-browser.js
@@ -42,7 +42,6 @@ export const onRouteUpdate = ({ location }) => {
                 // Navigate to targeted element
                 el.scrollIntoView()
                 // Force recomputing :target pseudo class with pushState/popState
-                window.location.hash = ''
                 window.location.hash = location.hash
             }
         }, 0)
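
For context on the new `ja` extra declared in `setup.py` above: as a standard setuptools extra, the optional MeCab dependency would typically be pulled in with `pip install -U "spacy[ja]"`. The snippet below is a minimal usage sketch, not part of the patch, assuming that install and a spaCy version where `spacy.blank("ja")` builds a Japanese pipeline whose tokenizer wraps MeCab.

```python
# Illustrative sketch only, not part of the diff above. Assumes the "ja" extra
# has been installed (e.g. `pip install -U "spacy[ja]"`), which pulls in
# mecab-python3 as declared in setup.py's extras_require.
import spacy

# Build a blank Japanese pipeline. The Japanese tokenizer relies on MeCab,
# which is why the external dependency exists; without mecab-python3 this
# fails with an ImportError at construction time.
nlp = spacy.blank("ja")

doc = nlp("これはテストです。")
print([token.text for token in doc])
```

Pinning `mecab-python3==0.7` keeps the extra reproducible; if the extra is missing, the failure surfaces when the pipeline is built rather than mid-processing.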