Update usage documentation

This commit is contained in:
ines 2017-10-03 14:26:20 +02:00
parent 9af604f0da
commit 3f4fd2c5d5
85 changed files with 3906 additions and 3143 deletions

View File

@ -1,28 +0,0 @@
{
"index": {
"title" : "Documentation",
"sections": {
"Usage": {
"url": "/docs/usage",
"svg": "computer",
"description": "How to use spaCy and its features."
},
"API": {
"url": "/docs/api",
"svg": "brain",
"description": "The detailed reference for spaCy's API."
},
"Tutorials": {
"url": "/docs/usage/tutorials",
"svg": "eye",
"description": "End-to-end examples, with code you can modify and run."
},
"Showcase & Demos": {
"url": "/docs/usage/showcase",
"svg": "bubble",
"description": "Demos, libraries and products from the spaCy community."
}
}
}
}

View File

@ -1,25 +0,0 @@
//- 💫 DOCS
include ../_includes/_mixins
+aside("Help us improve the docs")
| Did you spot a mistake or come across explanations that
| are unclear? You can find a "Suggest edits" button at the
| bottom of each page that points you to the source.
| We always appreciate
| #[+a(gh("spaCy") + "/pulls") pull requests].#[br]#[br]
| Have you built something cool with spaCy, or did you
| write a tutorial to help others use spaCy?
| #[a(href="mailto:#{EMAIL}") Let us know!]
+grid
each details, title in sections
+card(false, false)
a(href=details.url)
+svg("graphics", details.svg, 300, 150).u-color-theme
a(href=details.url)
+h(3)=title
p=details.description
+button(details.url, true, "primary")(target="_self") View

View File

@ -1,420 +0,0 @@
{
"sidebar": {
"Get started": {
"Installation": "./",
"Models": "models",
"spaCy 101": "spacy-101",
"Lightning tour": "lightning-tour",
"What's new in v2.0": "v2"
},
"Guides": {
"POS tagging": "pos-tagging",
"Using the parse": "dependency-parse",
"Entity recognition": "entity-recognition",
"Vectors & similarity": "word-vectors-similarities",
"Custom tokenization": "customizing-tokenizer",
"Rule-based matching": "rule-based-matching",
"Adding languages": "adding-languages",
"Processing pipelines": "language-processing-pipeline",
"Text classification": "text-classification",
"Deep learning": "deep-learning",
"Production use": "production-use",
"Training": "training",
"Training NER": "training-ner",
"Saving & loading": "saving-loading",
"Visualizers": "visualizers"
},
"Examples": {
"Tutorials": "tutorials",
"Showcase": "showcase"
}
},
"index": {
"title": "Install spaCy",
"next": "models",
"quickstart": true
},
"models": {
"title": "Models",
"next": "spacy-101",
"quickstart": true
},
"spacy-101": {
"title": "spaCy 101 – Everything you need to know",
"next": "lightning-tour",
"quickstart": true,
"preview": "101"
},
"lightning-tour": {
"title": "Lightning tour",
"next": "v2"
},
"visualizers": {
"title": "Visualizers"
},
"v2": {
"title": "What's new in v2.0"
},
"pos-tagging": {
"title": "Part-of-speech tagging",
"next": "dependency-parse"
},
"dependency-parse": {
"title": "Using the dependency parse",
"next": "entity-recognition"
},
"entity-recognition": {
"title": "Named Entity Recognition",
"next": "training-ner"
},
"word-vectors-similarities": {
"title": "Using word vectors and semantic similarities",
"next": "customizing-tokenizer"
},
"customizing-tokenizer": {
"title": "Customising the tokenizer",
"next": "rule-based-matching"
},
"rule-based-matching": {
"title": "Rule-based matching",
"next": "adding-languages"
},
"adding-languages": {
"title": "Adding languages",
"next": "training"
},
"language-processing-pipeline": {
"title": "Language processing pipelines",
"next": "deep-learning"
},
"deep-learning": {
"title": "Hooking a deep learning model into spaCy",
"next": "production-use"
},
"text-classification": {
"title": "Text classification",
"next": "training"
},
"production-use": {
"title": "Production use",
"next": "training"
},
"training": {
"title": "Training spaCy's statistical models",
"next": "saving-loading"
},
"training-ner": {
"title": "Training the Named Entity Recognizer",
"next": "saving-loading"
},
"saving-loading": {
"title": "Saving, loading and data serialization"
},
"showcase": {
"title": "Showcase",
"libraries": {
"spacy_api": {
"url": "https://github.com/kootenpv/spacy_api",
"author": "Pascal van Kooten",
"description": "Server/client to load models in a separate, dedicated process."
},
"spacy-nlp": {
"url": "https://github.com/kengz/spacy-nlp",
"author": "Wah Loon Keng",
"description": "Expose spaCy NLP text parsing to Node.js (and other languages) via Socket.IO."
},
"spacy-api-docker": {
"url": "https://github.com/jgontrum/spacy-api-docker",
"author": "Johannes Gontrum",
"description": "spaCy accessed by a REST API, wrapped in a Docker container."
},
"spacy-nlp-zeromq": {
"url": "https://github.com/pasupulaphani/spacy-nlp-docker",
"author": "Phaninder Pasupula",
"description": "Docker image exposing spaCy with ZeroMQ bindings."
},
"textacy": {
"url": "https://github.com/chartbeat-labs/textacy",
"author": "Burton DeWilde (Chartbeat)",
"description": "Higher-level NLP built on spaCy."
},
"visual-qa": {
"url": "https://github.com/avisingh599/visual-qa",
"author": "Avi Singh",
"description": "Keras-based LSTM/CNN models for Visual Question Answering."
},
"rasa_nlu": {
"url": "https://github.com/golastmile/rasa_nlu",
"author": "LASTMILE",
"description": "High level APIs for building your own language parser using existing NLP and ML libraries."
},
"spacyr": {
"url": "https://github.com/kbenoit/spacyr",
"author": "Kenneth Benoit",
"description": "An R wrapper for spaCy."
}
},
"visualizations": {
"displaCy": {
"url": "https://demos.explosion.ai/displacy",
"author": "Ines Montani",
"description": "An open-source NLP visualiser for the modern web.",
"image": "displacy.jpg"
},
"displaCy ENT": {
"url": "https://demos.explosion.ai/displacy-ent",
"author": "Ines Montani",
"description": "An open-source named entity visualiser for the modern web.",
"image": "displacy-ent.jpg"
}
},
"products": {
"sense2vec": {
"url": "https://demos.explosion.ai/sense2vec",
"author": "Matthew Honnibal and Ines Montani",
"description": "Semantic analysis of the Reddit hivemind.",
"image": "sense2vec.jpg"
},
"TruthBot": {
"url": "http://summerscope.github.io/govhack/2016/truthbot/",
"author": "Team Truthbot",
"description": "The world's first artificially intelligent fact checking robot.",
"image": "truthbot.jpg"
},
"Laice": {
"url": "https://github.com/kendricktan/laice",
"author": "Kendrick Tan",
"description": "Train your own Natural Language Processor from a browser.",
"image": "laice.jpg"
},
"FoxType": {
"url": "https://foxtype.com",
"description": "Smart tools for writers.",
"image": "foxtype.jpg"
},
"Kip": {
"url": "https://kipthis.com",
"description": "An AI chat assistant for group shopping.",
"image": "kip.jpg"
},
"Indico": {
"url": "https://indico.io",
"description": "Text and image analysis powered by Machine Learning.",
"image": "indico.jpg"
},
"TextAnalysisOnline": {
"url": "http://textanalysisonline.com",
"description": "Online tool for spaCy's tokenizer, parser, NER and more.",
"image": "textanalysis.jpg"
}
},
"books": {
"Introduction to Machine Learning with Python: A Guide for Data Scientists": {
"url": "https://books.google.de/books?id=vbQlDQAAQBAJ",
"author": "Andreas C. Müller and Sarah Guido (O'Reilly, 2016)",
"description": "Andreas is a lead developer of Scikit-Learn, and Sarah is a lead data scientist at Mashable. We're proud to get a mention."
},
"Text Analytics with Python: A Practical Real-World Approach to Gaining Actionable Insights from your Data": {
"url": "https://www.amazon.com/Text-Analytics-Python-Real-World-Actionable/dp/148422387X",
"author": "Dipanjan Sarkar (Apress / Springer, 2016)",
"description": "Derive useful insights from your data using Python. Learn the techniques related to natural language processing and text analytics, and gain the skills to know which technique is best suited to solve a particular problem."
}
},
"research": {
"Distributional semantics for understanding spoken meal descriptions": {
"url": "https://www.semanticscholar.org/paper/Distributional-semantics-for-understanding-spoken-Korpusik-Huang/5f55c5535e80d3e5ed7f1f0b89531e32725faff5",
"author": "Mandy Korpusik et al. (2016)"
},
"Refactoring the Genia Event Extraction Shared Task Toward a General Framework for IE-Driven KB Development": {
"url": "https://www.semanticscholar.org/paper/Refactoring-the-Genia-Event-Extraction-Shared-Task-Kim-Wang/06d94b64a7bd2d3433f57caddad5084435d6a91f",
"author": "Jin-Dong Kim et al. (2016)"
},
"Mixing Dirichlet Topic Models and Word Embeddings to Make lda2vec": {
"url": "https://www.semanticscholar.org/paper/Mixing-Dirichlet-Topic-Models-and-Word-Embeddings-Moody/bf8116e06f7b498c6abfbf97aeb67d0838c08609",
"author": "Christopher E. Moody (2016)"
},
"Predicting Pre-click Quality for Native Advertisements": {
"url": "https://www.semanticscholar.org/paper/Predicting-Pre-click-Quality-for-Native-Zhou-Redi/564985430ff2fbc3a9daa9c2af8997b7f5046da8",
"author": "Ke Zhou et al. (2016)"
},
"Threat detection in online discussions": {
"url": "https://www.semanticscholar.org/paper/Threat-detection-in-online-discussions-Wester-%C3%98vrelid/f4150e2fb4d8646ebc2ea84f1a86afa1b593239b",
"author": "Aksel Wester et al. (2016)"
},
"The language of mental health problems in social media": {
"url": "https://www.semanticscholar.org/paper/The-language-of-mental-health-problems-in-social-Gkotsis-Oellrich/537db6c2984514d92a754a591841e2e20845985a",
"author": "George Gkotsis et al. (2016)"
}
}
},
"tutorials": {
"title": "Tutorials",
"next": "showcase",
"first_steps": {
"Setting up an NLP environment with Python": {
"url": "https://shirishkadam.com/2016/10/06/setting-up-natural-language-processing-environment-with-python/",
"author": "Shirish Kadam"
},
"NLP with spaCy in 10 lines of code": {
"url": "https://github.com/cytora/pycon-nlp-in-10-lines",
"author": "Andraz Hribernik et al. (Cytora)",
"tags": ["jupyter"]
},
"Intro to NLP with spaCy": {
"url": "https://nicschrading.com/project/Intro-to-NLP-with-spaCy/",
"author": "J Nicolas Schrading"
},
"NLP with spaCy and IPython Notebook": {
"url": "http://blog.sharepointexperience.com/2016/01/nlp-and-sharepoint-part-1/",
"author": "Dustin Miller (SharePoint)",
"tags": ["jupyter"]
},
"Getting Started with spaCy": {
"url": "http://textminingonline.com/getting-started-with-spacy",
"author": "TextMiner"
},
"spaCy – A fast natural language processing library": {
"url": "https://bjoernkw.com/2015/11/22/spacy-a-fast-natural-language-processing-library/",
"author": "Björn Wilmsmann"
},
"NLP (almost) From Scratch - POS Network with spaCy": {
"url": "http://sujitpal.blogspot.de/2016/07/nlp-almost-from-scratch-implementing.html",
"author": "Sujit Pal",
"tags": ["gensim", "keras"]
},
"NLP tasks with various libraries": {
"url": "http://clarkgrubb.com/nlp",
"author": "Clark Grubb"
},
"A very (very) short primer on spacy.io": {
"url": "http://blog.milonimrod.com/2015/10/a-very-very-short-primer-on-spacyio.html",
"author": "Nimrod Milo"
}
},
"deep_dives": {
"Modern NLP in Python – What you can learn about food by analyzing a million Yelp reviews": {
"url": "http://nbviewer.jupyter.org/github/skipgram/modern-nlp-in-python/blob/master/executable/Modern_NLP_in_Python.ipynb",
"author": "Patrick Harrison (S&P Global)",
"tags": ["jupyter", "gensim"]
},
"Deep Learning with custom pipelines and Keras": {
"url": "https://explosion.ai/blog/spacy-deep-learning-keras",
"author": "Matthew Honnibal",
"tags": ["keras", "sentiment"]
},
"A decomposable attention model for Natural Language Inference": {
"url": "https://github.com/explosion/spaCy/tree/master/examples/keras_parikh_entailment",
"author": "Matthew Honnibal",
"tags": ["keras", "similarity"]
},
"Using the German model": {
"url": "https://explosion.ai/blog/german-model",
"author": "Wolfgang Seeker",
"tags": ["multi-lingual"]
},
"Sense2vec with spaCy and Gensim": {
"url": "https://explosion.ai/blog/sense2vec-with-spacy",
"author": "Matthew Honnibal",
"tags": ["big data", "gensim"]
},
"Building your bot's brain with Node.js and spaCy": {
"url": "https://explosion.ai/blog/chatbot-node-js-spacy",
"author": "Wah Loon Keng",
"tags": ["bots", "node.js"]
},
"An intent classifier with spaCy": {
"url": "http://blog.themusio.com/2016/07/18/musios-intent-classifier-2/",
"author": "Musio",
"tags": ["bots", "keras"]
},
"Visual Question Answering with spaCy": {
"url": "http://iamaaditya.github.io/2016/04/visual_question_answering_demo_notebook",
"author": "Aaditya Prakash",
"tags": ["vqa", "keras"]
},
"Extracting time suggestions from emails with spaCy": {
"url": "https://medium.com/redsift-outbox/what-time-cc9ce0c2aed2",
"author": "Chris Savvopoulos",
"tags": ["ner"]
},
"Advanced text analysis with spaCy and Scikit-Learn": {
"url": "https://github.com/JonathanReeve/advanced-text-analysis-workshop-2017/blob/master/advanced-text-analysis.ipynb",
"author": "Jonathan Reeve",
"tags": ["jupyter", "scikit-learn"]
}
},
"code": {
"Training a new entity type": {
"url": "https://github.com/explosion/spaCy/blob/master/examples/training/train_new_entity_type.py",
"author": "Matthew Honnibal",
"tags": ["ner", "training"]
},
"Training an NER system from scratch": {
"url": "https://github.com/explosion/spaCy/blob/master/examples/training/train_ner_standalone.py",
"author": "Matthew Honnibal",
"tags": ["ner", "training"]
},
"Information extraction": {
"url": "https://github.com/explosion/spaCy/blob/master/examples/information_extraction.py",
"author": "Matthew Honnibal",
"tags": ["snippet"]
},
"Neural bag of words": {
"url": "https://github.com/explosion/spaCy/blob/master/examples/nn_text_class.py",
"author": "Matthew Honnibal",
"tags": ["sentiment"]
},
"Part-of-speech tagging": {
"url": "https://github.com/explosion/spaCy/blob/master/examples/pos_tag.py",
"author": "Matthew Honnibal",
"tags": ["pos"]
},
"Parallel parse": {
"url": "https://github.com/explosion/spaCy/blob/master/examples/parallel_parse.py",
"author": "Matthew Honnibal",
"tags": ["big data"]
},
"Inventory count": {
"url": "https://github.com/explosion/spaCy/tree/master/examples/inventory_count",
"author": "Oleg Zd"
},
"Multi-word matches": {
"url": "https://github.com/explosion/spaCy/blob/master/examples/multi_word_matches.py",
"author": "Matthew Honnibal",
"tags": ["matcher", "out of date"]
}
}
}
}

View File

@ -1,24 +0,0 @@
//- 💫 DOCS > USAGE > MODELS LIST
include ../../_includes/_mixins
p
| Model differences are mostly statistical. In general, we do expect larger
| models to be "better" and more accurate overall. Ultimately, it depends on
| your use case and requirements, and we recommend starting with the default
| models (marked with a star below).
+aside
| Models are now available as #[code .tar.gz] archives #[+a(gh("spacy-models")) from GitHub],
| attached to individual releases. They can be downloaded and loaded manually,
| or using spaCy's #[code download] and #[code link] commands. All models
| follow the naming convention of #[code [language]_[type]_[genre]_[size]].
| #[br]#[br]
+button(gh("spacy-models"), true, "primary").u-text-tag
| View model releases
+table(["Name", "Language", "Voc", "Dep", "Ent", "Vec", "Size", "License"])
for models, lang in MODELS
for model, i in models
+model-row(model.id, model.lang, model.feats, model.size, model.license, model.def || models.length == 1, i == 0)

View File

@ -1,92 +0,0 @@
//- 💫 DOCS > USAGE > DEEP LEARNING
include ../../_includes/_mixins
p
| In this example, we'll be using #[+a("https://keras.io/") Keras], as
| it's the most popular deep learning library for Python. Using Keras,
| we will write a custom sentiment analysis model that predicts whether a
| document is positive or negative. Then, we will use it to find which entities
| are commonly associated with positive or negative documents. Here's a
| quick example of how that can look at runtime.
+aside("What's Keras?")
| #[+a("https://keras.io/") Keras] gives you a high-level, declarative
| interface to define neural networks. Models are trained using Google's
| #[+a("https://www.tensorflow.org") TensorFlow] by default.
| #[+a("http://deeplearning.net/software/theano/") Theano] is also
| supported.
+under-construction
p
| For most applications, it's recommended to use pre-trained word embeddings
| without "fine-tuning". This means that you'll use the same embeddings
| across different models, and avoid learning adjustments to them on your
| training data. The embeddings table is large, and the values provided by
| the pre-trained vectors are already pretty good. Fine-tuning the
| embeddings table is therefore a waste of your "parameter budget". It's
| usually better to make your network larger some other way, e.g. by
| adding another LSTM layer, using an attention mechanism, using character
| features, etc.
+h(2, "attribute-hooks") Attribute hooks
+under-construction
p
| Earlier, we saw how to store data in the new generic #[code user_data]
| dict. This generalises well, but it's not terribly satisfying. Ideally,
| we want to let the custom data drive more "native" behaviours. For
| instance, consider the #[code .similarity()] methods provided by spaCy's
| #[+api("doc") #[code Doc]], #[+api("token") #[code Token]] and
| #[+api("span") #[code Span]] objects:
+code("Polymorphic similarity example").
span.similarity(doc)
token.similarity(span)
doc1.similarity(doc2)
p
| By default, this just averages the vectors for each document, and
| computes their cosine. Obviously, spaCy should make it easy for you to
| install your own similarity model. This introduces a tricky design
| challenge. The current solution is to add three more dicts to the
| #[code Doc] object:
+aside("Implementation note")
| The hooks live on the #[code Doc] object because the #[code Span] and
| #[code Token] objects are created lazily, and don't own any data. They
| just proxy to their parent #[code Doc]. This turns out to be convenient
| here — we only have to worry about installing hooks in one place.
+table(["Name", "Description"])
+row
+cell #[code user_hooks]
+cell Customise behaviour of #[code doc.vector], #[code doc.has_vector], #[code doc.vector_norm] or #[code doc.sents]
+row
+cell #[code user_token_hooks]
+cell Customise behaviour of #[code token.similarity], #[code token.vector], #[code token.has_vector], #[code token.vector_norm] or #[code token.conjuncts]
+row
+cell #[code user_span_hooks]
+cell Customise behaviour of #[code span.similarity], #[code span.vector], #[code span.has_vector], #[code span.vector_norm] or #[code span.root]
p
| To sum up, here's an example of hooking in custom #[code .similarity()]
| methods:
+code("Add custom similarity hooks").
class SimilarityModel(object):
def __init__(self, model):
self._model = model
def __call__(self, doc):
doc.user_hooks['similarity'] = self.similarity
doc.user_span_hooks['similarity'] = self.similarity
doc.user_token_hooks['similarity'] = self.similarity
def similarity(self, obj1, obj2):
y = self._model([obj1.vector, obj2.vector])
return float(y[0])

View File

@ -1,353 +0,0 @@
//- 💫 DOCS > USAGE
include ../../_includes/_mixins
p
| spaCy is compatible with #[strong 64-bit CPython 2.6+∕3.3+] and
| runs on #[strong Unix/Linux], #[strong macOS/OS X] and
| #[strong Windows]. The latest spaCy releases are
| available over #[+a("https://pypi.python.org/pypi/spacy") pip] (source
| packages only) and #[+a("https://anaconda.org/conda-forge/spacy") conda].
| Installation requires a working build environment. See notes on
| #[a(href="#source-ubuntu") Ubuntu], #[a(href="#source-osx") macOS/OS X]
| and #[a(href="#source-windows") Windows] for details.
+quickstart(QUICKSTART, "Quickstart")
+qs({config: 'venv', python: 2}) python -m pip install -U virtualenv
+qs({config: 'venv', python: 3}) python -m pip install -U venv
+qs({config: 'venv', python: 2}) virtualenv .env
+qs({config: 'venv', python: 3}) python -m venv .env
+qs({config: 'venv', os: 'mac'}) source .env/bin/activate
+qs({config: 'venv', os: 'linux'}) source .env/bin/activate
+qs({config: 'venv', os: 'windows'}) .env\Scripts\activate
+qs({config: 'gpu', os: 'mac'}) export PATH=$PATH:/usr/local/cuda-8.0/bin
+qs({config: 'gpu', os: 'linux'}) export PATH=$PATH:/usr/local/cuda-8.0/bin
+qs({package: 'pip'}) pip install -U spacy
+qs({package: 'conda'}) conda install -c conda-forge spacy
+qs({package: 'source'}) git clone https://github.com/explosion/spaCy
+qs({package: 'source'}) cd spaCy
+qs({package: 'source'}) pip install -r requirements.txt
+qs({package: 'source'}) pip install -e .
+qs({model: 'en'}) spacy download en
+qs({model: 'de'}) spacy download de
+qs({model: 'fr'}) spacy download fr
+qs({model: 'es'}) spacy download es
+h(2, "installation") Installation instructions
+h(3, "pip") pip
+badge("pipy")
p Using pip, spaCy releases are currently only available as source packages.
+code(false, "bash").
pip install -U spacy
+aside("Download models")
| After installation you need to download a language model. For more info
| and available models, see the #[+a("/docs/usage/models") docs on models].
+code.o-no-block.
spacy download en
>>> import spacy
>>> nlp = spacy.load('en')
p
| When using pip it is generally recommended to install packages in a
| #[code virtualenv] to avoid modifying system state:
+code(false, "bash").
virtualenv .env
source .env/bin/activate
pip install spacy
+h(3, "conda") conda
+badge("conda")
p
| Thanks to our great community, we've finally re-added conda support. You
| can now install spaCy via #[code conda-forge]:
+code(false, "bash").
conda config --add channels conda-forge
conda install spacy
p
| For the feedstock including the build recipe and configuration, check out
| #[+a("https://github.com/conda-forge/spacy-feedstock") this repository].
| Improvements and pull requests to the recipe and setup are always appreciated.
+h(2, "gpu") Run spaCy with GPU
p
| As of v2.0, spaCy comes with neural network models that are implemented
| in our machine learning library, #[+a(gh("thinc")) Thinc]. For GPU
| support, we've been grateful to use the work of
| #[+a("http://chainer.org") Chainer]'s CuPy module, which provides
| a NumPy-compatible interface for GPU arrays.
p
| First, follow the normal CUDA installation procedure. Next, set
| your environment variables so that the installation will be able to find
| CUDA. Finally, install spaCy.
+code(false, "bash").
export CUDA_HOME=/usr/local/cuda-8.0 # Or wherever your CUDA is
export PATH=$PATH:$CUDA_HOME/bin
pip install spacy
python -c "import thinc.neural.gpu_ops" # Check the GPU ops were built
+h(2, "source") Compile from source
p
| The other way to install spaCy is to clone its
| #[+a(gh("spaCy")) GitHub repository] and build it from source. That is
| the common way if you want to make changes to the code base. You'll need to
| make sure that you have a development environment consisting of a Python
| distribution including header files, a compiler,
| #[+a("https://pip.pypa.io/en/latest/installing/") pip],
| #[+a("https://virtualenv.pypa.io/") virtualenv] and
| #[+a("https://git-scm.com") git] installed. The compiler part is the
| trickiest. How to do that depends on your system. See notes on
| #[a(href="#source-ubuntu") Ubuntu], #[a(href="#source-osx") OS X] and
| #[a(href="#source-windows") Windows] for details.
+code(false, "bash").
# make sure you are using recent pip/virtualenv versions
python -m pip install -U pip virtualenv
git clone #{gh("spaCy")}
cd spaCy
virtualenv .env
source .env/bin/activate
pip install -r requirements.txt
pip install -e .
p
| Compared to regular install via pip, #[+a(gh("spaCy", "requirements.txt")) requirements.txt]
| additionally installs developer dependencies such as Cython.
p
| Instead of the above verbose commands, you can also use the following
| #[+a("http://www.fabfile.org/") Fabric] commands:
+table(["Command", "Description"])
+row
+cell #[code fab env]
+cell Create #[code virtualenv] and delete previous one, if it exists.
+row
+cell #[code fab make]
+cell Compile the source.
+row
+cell #[code fab clean]
+cell Remove compiled objects, including the generated C++.
+row
+cell #[code fab test]
+cell Run basic tests, aborting after first failure.
p
| All commands assume that your #[code virtualenv] is located in a
| directory #[code .env]. If you're using a different directory, you can
| change it via the environment variable #[code VENV_DIR], for example:
+code(false, "bash").
VENV_DIR=".custom-env" fab clean make
+h(3, "source-ubuntu") Ubuntu
p Install system-level dependencies via #[code apt-get]:
+code(false, "bash").
sudo apt-get install build-essential python-dev git
+h(3, "source-osx") macOS / OS X
p
| Install a recent version of #[+a("https://developer.apple.com/xcode/") XCode],
| including the so-called "Command Line Tools". macOS and OS X ship with
| Python and git preinstalled. To compile spaCy with multi-threading support
| on macOS / OS X, #[+a("https://github.com/explosion/spaCy/issues/267") see here].
+h(3, "source-windows") Windows
p
| Install a version of
| #[+a("https://www.visualstudio.com/vs/visual-studio-express/") Visual Studio Express]
| that matches the version that was used to compile your Python
| interpreter. For official distributions these are:
+table([ "Distribution", "Version"])
+row
+cell Python 2.7
+cell Visual Studio 2008
+row
+cell Python 3.4
+cell Visual Studio 2010
+row
+cell Python 3.5+
+cell Visual Studio 2015
+h(2, "troubleshooting") Troubleshooting guide
p
| This section collects some of the most common errors you may come
| across when installing, loading and using spaCy, as well as their solutions.
+aside("Help us improve this guide")
| Did you come across a problem like the ones listed here and want to
| share the solution? You can find the "Suggest edits" button at the
| bottom of this page that points you to the source. We always
| appreciate #[+a(gh("spaCy") + "/pulls") pull requests]!
+h(3, "compatible-model") No compatible model found
+code(false, "text").
No compatible model found for [lang] (spaCy v#{SPACY_VERSION}).
p
| This usually means that the model you're trying to download does not
| exist, or isn't available for your version of spaCy. Check the
| #[+a(gh("spacy-models", "compatibility.json")) compatibility table]
| to see which models are available for your spaCy version. If you're using
| an old version, consider upgrading to the latest release. Note that while
| spaCy supports tokenization for
| #[+a("/docs/api/language-models/#alpha-support") a variety of languages],
| not all of them come with statistical models. To only use the tokenizer,
| import the language's #[code Language] class instead, for example
| #[code from spacy.lang.fr import French].
+h(3, "symlink-privilege") Symbolic link privilege not held
+code(false, "text").
OSError: symbolic link privilege not held
p
| To create #[+a("/docs/usage/models/#usage") shortcut links] that let you
| load models by name, spaCy creates a symbolic link in the
| #[code spacy/data] directory. This means your user needs permission to do
| this. The above error mostly occurs when doing a system-wide installation,
| which will create the symlinks in a system directory. Run the
| #[code download] or #[code link] command as administrator, or use a
| #[code virtualenv] to install spaCy in a user directory, instead
| of doing a system-wide installation.
+h(3, "no-cache-dir") No such option: --no-cache-dir
+code(false, "text").
no such option: --no-cache-dir
p
| The #[code download] command uses pip to install the models and sets the
| #[code --no-cache-dir] flag to prevent it from requiring too much memory.
| #[+a("https://pip.pypa.io/en/stable/reference/pip_install/#caching") This setting]
| requires pip v6.0 or newer. Run #[code pip install -U pip] to upgrade to
| the latest version of pip. To see which version you have installed,
| run #[code pip --version].
+h(3, "import-error") Import error
+code(false, "text").
Import Error: No module named spacy
p
| This error means that the spaCy module can't be located on your system, or in
| your environment. Make sure you have spaCy installed. If you're using a
| #[code virtualenv], make sure it's activated and check that spaCy is
| installed in that environment – otherwise, you're trying to load a system
| installation. You can also run #[code which python] to find out where
| your Python executable is located.
+h(3, "import-error-models") Import error: models
+code(false, "text").
ImportError: No module named 'en_core_web_sm'
p
| As of spaCy v1.7, all models can be installed as Python packages. This means
| that they'll become importable modules of your application. When creating
| #[+a("/docs/usage/models/#usage") shortcut links], spaCy will also try
| to import the model to load its meta data. If this fails, it's usually a
| sign that the package is not installed in the current environment.
| Run #[code pip list] or #[code pip freeze] to check which model packages
| you have installed, and install the
| #[+a("/docs/usage/models#available") correct models] if necessary. If you're
| importing a model manually at the top of a file, make sure to use the name
| of the package, not the shortcut link you've created.
+h(3, "vocab-strings") File not found: vocab/strings.json
+code(false, "text").
FileNotFoundError: No such file or directory: [...]/vocab/strings.json
p
| This error may occur when using #[code spacy.load()] to load
| a language model either because you haven't set up a
| #[+a("/docs/usage/models/#usage") shortcut link] for it, or because it
| doesn't actually exist. Set up a
| #[+a("/docs/usage/models/#usage") shortcut link] for the model
| you want to load. This can either be an installed model package, or a
| local directory containing the model data. If you want to use one of the
| #[+a("/docs/api/language-models/#alpha-support") alpha tokenizers] for
| languages that don't yet have a statistical model, you should import its
| #[code Language] class instead, for example
| #[code from spacy.lang.bn import Bengali].
+h(3, "command-not-found") Command not found
+code(false, "text").
command not found: spacy
p
| This error may occur when running the #[code spacy] command from the
| command line. spaCy does not currently add an entry to your #[code PATH]
| environment variable, as this can lead to unexpected results, especially
| when using #[code virtualenv]. Instead, spaCy adds an auto-alias that
| maps #[code spacy] to #[code python -m spacy]. If this is not working as
| expected, run the command with #[code python -m], yourself –
| for example #[code python -m spacy download en]. For more info on this,
| see #[+api("cli#download") download].
+h(3, "module-load") 'module' object has no attribute 'load'
+code(false, "text").
AttributeError: 'module' object has no attribute 'load'
p
| While this could technically have many causes, including spaCy being
| broken, the most likely one is that your script's file or directory name
| is "shadowing" the module – e.g. your file is called #[code spacy.py],
| or a directory you're importing from is called #[code spacy]. So, when
| using spaCy, never call anything else #[code spacy].
+h(2, "tests") Run tests
p
| spaCy comes with an #[+a(gh("spacy", "spacy/tests")) extensive test suite].
| First, find out where spaCy is installed:
+code(false, "bash").
python -c "import os; import spacy; print(os.path.dirname(spacy.__file__))"
p
| Then run #[code pytest] on that directory. The flags #[code --slow] and
| #[code --models] are optional and enable additional tests.
+code(false, "bash").
# make sure you are using recent pytest version
python -m pip install -U pytest
python -m pytest <spacy-directory> # basic tests
python -m pytest <spacy-directory> --slow # basic and slow tests
python -m pytest <spacy-directory> --models --all # basic and all model tests
python -m pytest <spacy-directory> --models --en # basic and English model tests

View File

@ -1,147 +0,0 @@
//- 💫 DOCS > USAGE > PROCESSING TEXT
include ../../_includes/_mixins
+under-construction
+h(2, "multithreading") Multi-threading with #[code .pipe()]
p
| If you have a sequence of documents to process, you should use the
| #[+api("language#pipe") #[code Language.pipe()]] method. The method takes
| an iterator of texts, and accumulates an internal buffer,
| which it works on in parallel. It then yields the documents in order,
| one-by-one. After a long and bitter struggle, the global interpreter
| lock was freed around spaCy's main parsing loop in v0.100.3. This means
| that #[code .pipe()] will be significantly faster in most
| practical situations, because it allows shared memory parallelism.
+code.
for doc in nlp.pipe(texts, batch_size=10000, n_threads=3):
pass
p
| To make full use of the #[code .pipe()] function, you might want to
| brush up on #[strong Python generators]. Here are a few quick hints:
+list
+item
| Generator comprehensions can be written as
| #[code (item for item in sequence)].
+item
| The
| #[+a("https://docs.python.org/2/library/itertools.html") #[code itertools] built-in library]
| and the
| #[+a("https://github.com/pytoolz/cytoolz") #[code cytoolz] package]
| provide a lot of handy #[strong generator tools].
+item
| Often you'll have an input stream that pairs text with some
| important meta data, e.g. a JSON document. To
| #[strong pair up the meta data] with the processed #[code Doc]
| object, you should use the #[code itertools.tee] function to split
| the generator in two, and then #[code izip] the extra stream to the
| document stream.
+h(2, "own-annotations") Bringing your own annotations
p
| spaCy generally assumes by default that your data is raw text. However,
| sometimes your data is partially annotated, e.g. with pre-existing
| tokenization, part-of-speech tags, etc. The most common situation is
| that you have pre-defined tokenization. If you have a list of strings,
| you can create a #[code Doc] object directly. Optionally, you can also
| specify a list of boolean values, indicating whether each word has a
| subsequent space.
+code.
doc = Doc(nlp.vocab, words=[u'Hello', u',', u'world', u'!'], spaces=[False, True, False, False])
p
| If provided, the spaces list must be the same length as the words list.
| The spaces list affects the #[code doc.text], #[code span.text],
| #[code token.idx], #[code span.start_char] and #[code span.end_char]
| attributes. If you don't provide a #[code spaces] sequence, spaCy will
| assume that all words are whitespace delimited.
+code.
good_spaces = Doc(nlp.vocab, words=[u'Hello', u',', u'world', u'!'], spaces=[False, True, False, False])
bad_spaces = Doc(nlp.vocab, words=[u'Hello', u',', u'world', u'!'])
assert bad_spaces.text == u'Hello , world !'
assert good_spaces.text == u'Hello, world!'
p
| Once you have a #[+api("doc") #[code Doc]] object, you can write to its
| attributes to set the part-of-speech tags, syntactic dependencies, named
| entities and other attributes. For details, see the respective usage
| pages.
+h(2, "models") Working with models
p
| If your application depends on one or more #[+a("/docs/usage/models") models],
| you'll usually want to integrate them into your continuous integration
| workflow and build process. While spaCy provides a range of useful helpers
| for downloading, linking and loading models, the underlying functionality
| is entirely based on native Python packages. This allows your application
| to handle a model like any other package dependency.
+h(3, "models-download") Downloading and requiring model dependencies
p
| spaCy's built-in #[+api("cli#download") #[code download]] command
| is mostly intended as a convenient, interactive wrapper. It performs
| compatibility checks and prints detailed error messages and warnings.
| However, if you're downloading models as part of an automated build
| process, this only adds an unnecessary layer of complexity. If you know
| which models your application needs, you should be specifying them directly.
p
| Because all models are valid Python packages, you can add them to your
| application's #[code requirements.txt]. If you're running your own
| internal PyPI installation, you can simply upload the models there. pip's
| #[+a("https://pip.pypa.io/en/latest/reference/pip_install/#requirements-file-format") requirements file format]
| supports both package names to download via a PyPI server, as well as direct
| URLs.
+code("requirements.txt", "text").
spacy>=2.0.0,<3.0.0
-e #{gh("spacy-models")}/releases/download/en_core_web_sm-2.0.0/en_core_web_sm-2.0.0.tar.gz
p
| All models are versioned and specify their spaCy dependency. This ensures
| cross-compatibility and lets you specify exact version requirements for
| each model. If you've trained your own model, you can use the
| #[+api("cli#package") #[code package]] command to generate the required
| meta data and turn it into a loadable package.
+h(3, "models-loading") Loading and testing models
p
| Downloading models directly via pip won't call spaCy's
| #[+api("cli#link") #[code link]] command, which creates
| symlinks for model shortcuts. This means that you'll have to run this
| command separately, or use the native #[code import] syntax to load the
| models:
+code.
import en_core_web_sm
nlp = en_core_web_sm.load()
p
| In general, this approach is recommended for larger code bases, as it's
| more "native", and doesn't depend on symlinks or rely on spaCy's loader
| to resolve string names to model packages. If a model can't be
| imported, Python will raise an #[code ImportError] immediately. And if a
| model is imported but not used, any linter will catch that.
p
| Similarly, it'll give you more flexibility when writing tests that
| require loading models. For example, instead of writing your own
| #[code try] and #[code except] logic around spaCy's loader, you can use
| #[+a("http://pytest.readthedocs.io/en/latest/") pytest]'s
| #[code importorskip()] method to only run a test if a specific model or
| model version is installed. Each model package exposes a #[code __version__]
| attribute which you can also use to perform your own version compatibility
| checks before loading a model.

View File

@ -1,44 +0,0 @@
//- 💫 DOCS > USAGE > SHOWCASE
include ../../_includes/_mixins
p
| On this page, we'll be featuring demos, libraries and products from
| the spaCy community. Have you done something cool with spaCy?
| #[a(href="mailto:#{EMAIL}") Let us know!]
+h(2, "libraries") Third-party libraries
+list
each details, title in libraries
+card-item(title, details)
+h(2, "visualizations") Visualizations
+grid
each details, name in visualizations
- details.image = "/assets/img/showcase/" + details.image
+card(name, details)
+h(2, "products") Built with spaCy
+grid
each details, name in products
- details.image = "/assets/img/showcase/" + details.image
+card(name, details)
+h(2, "books") Books
p We're excited to see books featuring spaCy already start to appear.
+list
each details, title in books
+card-item(title, details)
+h(2, "research") Research systems
p Researchers are using spaCy to build ambitious, next-generation text processing technologies. spaCy is particularly popular amongst the biomedical NLP community, who are working on extracting knowledge from the huge volume of literature in their field. For an up-to-date list of the papers citing spaCy, see #[+a("https://www.semanticscholar.org/search?year%5B%5D=2015&year%5B%5D=2020&q=spacy&sort=relevance&ae=false") Semantic Scholar].
+list
each details, title in research
+card-item(title, details)

View File

@ -1,430 +0,0 @@
//- 💫 DOCS > USAGE > SPACY 101
include ../../_includes/_mixins
p
| Whether you're new to spaCy, or just want to brush up on some
| NLP basics and implementation details — this page should have you covered.
| Each section will explain one of spaCy's features in simple terms and
| with examples or illustrations. Some sections will also reappear across
| the usage guides as a quick introduction.
+aside("Help us improve the docs")
| Did you spot a mistake or come across explanations that
| are unclear? We always appreciate improvement
| #[+a(gh("spaCy") + "/issues") suggestions] or
| #[+a(gh("spaCy") + "/pulls") pull requests]. You can find a "Suggest
| edits" link at the bottom of each page that points you to the source.
+h(2, "whats-spacy") What's spaCy?
+grid.o-no-block
+grid-col("half")
p
| spaCy is a #[strong free, open-source library] for advanced
| #[strong Natural Language Processing] (NLP) in Python.
p
| If you're working with a lot of text, you'll eventually want to
| know more about it. For example, what's it about? What do the
| words mean in context? Who is doing what to whom? What companies
| and products are mentioned? Which texts are similar to each other?
p
| spaCy is designed specifically for #[strong production use] and
| helps you build applications that process and "understand"
| large volumes of text. It can be used to build
| #[strong information extraction] or
| #[strong natural language understanding] systems, or to
| pre-process text for #[strong deep learning].
+table-of-contents
+item #[+a("#features") Features]
+item #[+a("#annotations") Linguistic annotations]
+item #[+a("#annotations-token") Tokenization]
+item #[+a("#annotations-pos-deps") POS tags and dependencies]
+item #[+a("#annotations-ner") Named entities]
+item #[+a("#vectors-similarity") Word vectors and similarity]
+item #[+a("#pipelines") Pipelines]
+item #[+a("#vocab") Vocab, hashes and lexemes]
+item #[+a("#serialization") Serialization]
+item #[+a("#training") Training]
+item #[+a("#language-data") Language data]
+item #[+a("#architecture") Architecture]
+item #[+a("#community") Community & FAQ]
+h(3, "what-spacy-isnt") What spaCy isn't
+list
+item #[strong spaCy is not a platform or "an API"].
| Unlike a platform, spaCy does not provide a software as a service, or
| a web application. It's an open-source library designed to help you
| build NLP applications, not a consumable service.
+item #[strong spaCy is not an out-of-the-box chat bot engine].
| While spaCy can be used to power conversational applications, it's
| not designed specifically for chat bots, and only provides the
| underlying text processing capabilities.
+item #[strong spaCy is not research software].
| It's built on the latest research, but it's designed to get
| things done. This leads to fairly different design decisions than
| #[+a("https://github.com/nltk/nltk") NLTK]
| or #[+a("https://stanfordnlp.github.io/CoreNLP/") CoreNLP], which were
| created as platforms for teaching and research. The main difference
| is that spaCy is integrated and opinionated. spaCy tries to avoid asking
| the user to choose between multiple algorithms that deliver equivalent
| functionality. Keeping the menu small lets spaCy deliver generally better
| performance and developer experience.
+item #[strong spaCy is not a company].
| It's an open-source library. Our company publishing spaCy and other
| software is called #[+a(COMPANY_URL, true) Explosion AI].
+h(2, "features") Features
p
| In the documentation, you'll come across mentions of spaCy's
| features and capabilities. Some of them refer to linguistic concepts,
| while others are related to more general machine learning functionality.
+aside
| If one of spaCy's functionalities #[strong needs a model], it means that
| you need to have one of the available
| #[+a("/docs/usage/models") statistical models] installed. Models are used
| to #[strong predict] linguistic annotations — for example, if a word is
| a verb or a noun.
+table(["Name", "Description", "Needs model"])
+row
+cell #[strong Tokenization]
+cell Segmenting text into words, punctuation marks etc.
+cell #[+procon("con")]
+row
+cell #[strong Part-of-speech] (POS) #[strong Tagging]
+cell Assigning word types to tokens, like verb or noun.
+cell #[+procon("pro")]
+row
+cell #[strong Dependency Parsing]
+cell
| Assigning syntactic dependency labels, describing the relations
| between individual tokens, like subject or object.
+cell #[+procon("pro")]
+row
+cell #[strong Lemmatization]
+cell
| Assigning the base forms of words. For example, the lemma of
| "was" is "be", and the lemma of "rats" is "rat".
+cell #[+procon("pro")]
+row
+cell #[strong Sentence Boundary Detection] (SBD)
+cell Finding and segmenting individual sentences.
+cell #[+procon("pro")]
+row
+cell #[strong Named Entity Recognition] (NER)
+cell
| Labelling named "real-world" objects, like persons, companies or
| locations.
+cell #[+procon("pro")]
+row
+cell #[strong Similarity]
+cell
| Comparing words, text spans and documents and how similar they
| are to each other.
+cell #[+procon("pro")]
+row
+cell #[strong Text classification]
+cell Assigning categories or labels to a whole document, or parts of a document.
+cell #[+procon("pro")]
+row
+cell #[strong Rule-based Matching]
+cell
| Finding sequences of tokens based on their texts and linguistic
| annotations, similar to regular expressions.
+cell #[+procon("con")]
+row
+cell #[strong Training]
+cell Updating and improving a statistical model's predictions.
+cell #[+procon("neutral")]
+row
+cell #[strong Serialization]
+cell Saving objects to files or byte strings.
+cell #[+procon("neutral")]
+h(2, "annotations") Linguistic annotations
p
| spaCy provides a variety of linguistic annotations to give you
| #[strong insights into a text's grammatical structure]. This includes the
| word types, like the parts of speech, and how the words are related to
| each other. For example, if you're analysing text, it makes a huge
| difference whether a noun is the subject of a sentence, or the object —
| or whether "google" is used as a verb, or refers to the website or
| company in a specific context.
p
| Once you've downloaded and installed a #[+a("/docs/usage/models") model],
| you can load it via #[+api("spacy#load") #[code spacy.load()]]. This will
| return a #[code Language] object containing all components and data needed
| to process text. We usually call it #[code nlp]. Calling the #[code nlp]
| object on a string of text will return a processed #[code Doc]:
+code.
import spacy
nlp = spacy.load('en')
doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')
p
| Even though a #[code Doc] is processed — e.g. split into individual words
| and annotated — it still holds #[strong all information of the original text],
| like whitespace characters. You can always get the offset of a token into the
| original string, or reconstruct the original by joining the tokens and their
| trailing whitespace. This way, you'll never lose any information
| when processing text with spaCy.
+h(3, "annotations-token") Tokenization
include _spacy-101/_tokenization
+infobox
| To learn more about how spaCy's tokenization rules work in detail,
| how to #[strong customise and replace] the default tokenizer and how to
| #[strong add language-specific data], see the usage guides on
| #[+a("/docs/usage/adding-languages") adding languages] and
| #[+a("/docs/usage/customizing-tokenizer") customising the tokenizer].
+h(3, "annotations-pos-deps") Part-of-speech tags and dependencies
+tag-model("dependency parse")
include _spacy-101/_pos-deps
+infobox
| To learn more about #[strong part-of-speech tagging] and rule-based
| morphology, and how to #[strong navigate and use the parse tree]
| effectively, see the usage guides on
| #[+a("/docs/usage/pos-tagging") part-of-speech tagging] and
| #[+a("/docs/usage/dependency-parse") using the dependency parse].
+h(3, "annotations-ner") Named Entities
+tag-model("named entities")
include _spacy-101/_named-entities
+infobox
| To learn more about entity recognition in spaCy, how to
| #[strong add your own entities] to a document and how to
| #[strong train and update] the entity predictions of a model, see the
| usage guides on
| #[+a("/docs/usage/entity-recognition") named entity recognition] and
| #[+a("/docs/usage/training-ner") training the named entity recognizer].
+h(2, "vectors-similarity") Word vectors and similarity
+tag-model("vectors")
include _spacy-101/_similarity
include _spacy-101/_word-vectors
+infobox
| To learn more about word vectors, how to #[strong customise them] and
| how to load #[strong your own vectors] into spaCy, see the usage
| guide on
| #[+a("/docs/usage/word-vectors-similarities") using word vectors and semantic similarities].
+h(2, "pipelines") Pipelines
include _spacy-101/_pipelines
+infobox
| To learn more about #[strong how processing pipelines work] in detail,
| how to enable and disable their components, and how to
| #[strong create your own], see the usage guide on
| #[+a("/docs/usage/language-processing-pipeline") language processing pipelines].
+h(2, "vocab") Vocab, hashes and lexemes
include _spacy-101/_vocab
+h(2, "serialization") Serialization
include _spacy-101/_serialization
+infobox
| To learn more about #[strong serialization] and how to
| #[strong save and load your own models], see the usage guide on
| #[+a("/docs/usage/saving-loading") saving, loading and data serialization].
+h(2, "training") Training
include _spacy-101/_training
+infobox
| To learn more about #[strong training and updating] models, how to create
| training data and how to improve spaCy's named entity recognition models,
| see the usage guides on #[+a("/docs/usage/training") training] and
| #[+a("/docs/usage/training-ner") training the named entity recognizer].
+h(2, "language-data") Language data
include _spacy-101/_language-data
+infobox
| To learn more about the individual components of the language data and
| how to #[strong add a new language] to spaCy in preparation for training
| a language model, see the usage guide on
| #[+a("/docs/usage/adding-languages") adding languages].
+h(2, "architecture") Architecture
include _spacy-101/_architecture.jade
+h(2, "community") Community & FAQ
p
| We're very happy to see the spaCy community grow and include a mix of
| people from all kinds of different backgrounds — computational
| linguistics, data science, deep learning, research and more. If you'd
| like to get involved, below are some answers to the most important
| questions and resources for further reading.
+h(3, "faq-help-code") Help, my code isn't working!
p
| Bugs suck, and we're doing our best to continuously improve the tests
| and fix bugs as soon as possible. Before you submit an issue, do a
| quick search and check if the problem has already been reported. If
| you're having installation or loading problems, make sure to also check
| out the #[+a("/docs/usage#troubleshooting") troubleshooting guide]. Help
| with spaCy is available via the following platforms:
+aside("How do I know if something is a bug?")
| Of course, it's always hard to know for sure, so don't worry — we're not
| going to be mad if a bug report turns out to be a typo in your
| code. As a simple rule, any C-level error without a Python traceback,
| like a #[strong segmentation fault] or #[strong memory error],
| is #[strong always] a spaCy bug.#[br]#[br]
| Because models are statistical, their performance will never be
| #[em perfect]. However, if you come across
| #[strong patterns that might indicate an underlying issue], please do
| file a report. Similarly, we also care about behaviours that
| #[strong contradict our docs].
+table(["Platform", "Purpose"])
+row
+cell #[+a("https://stackoverflow.com/questions/tagged/spacy") StackOverflow]
+cell
| #[strong Usage questions] and everything related to problems with
| your specific code. The StackOverflow community is much larger
| than ours, so if your problem can be solved by others, you'll
| receive help much quicker.
+row
+cell #[+a("https://gitter.im/" + SOCIAL.gitter) Gitter chat]
+cell
| #[strong General discussion] about spaCy, meeting other community
| members and exchanging #[strong tips, tricks and best practices].
| If we're working on experimental models and features, we usually
| share them on Gitter first.
+row
+cell #[+a(gh("spaCy") + "/issues") GitHub issue tracker]
+cell
| #[strong Bug reports] and #[strong improvement suggestions], i.e.
| everything that's likely spaCy's fault. This also includes
| problems with the models beyond statistical imprecisions, like
| patterns that point to a bug.
+infobox
| Please understand that we won't be able to provide individual support via
| email. We also believe that help is much more valuable if it's shared
| publicly, so that #[strong more people can benefit from it]. If you come
| across an issue and you think you might be able to help, consider posting
| a quick update with your solution. No matter how simple, it can easily
| save someone a lot of time and headache — and the next time you need help,
| they might repay the favour.
+h(3, "faq-contributing") How can I contribute to spaCy?
p
| You don't have to be an NLP expert or Python pro to contribute, and we're
| happy to help you get started. If you're new to spaCy, a good place to
| start is the
| #[+a(gh("spaCy") + '/issues?q=is%3Aissue+is%3Aopen+label%3A"help+wanted+%28easy%29"') #[code help wanted (easy)] label]
| on GitHub, which we use to tag bugs and feature requests that are easy
| and self-contained. We also appreciate contributions to the docs — whether
| it's fixing a typo, improving an example or adding additional explanations.
| You'll find a "Suggest edits" link at the bottom of each page that points
| you to the source.
p
| Another way of getting involved is to help us improve the
| #[+a("/docs/usage/adding-languages#language-data") language data] —
| especially if you happen to speak one of the languages currently in
| #[+a("/docs/api/language-models#alpha-support") alpha support]. Even
| adding simple tokenizer exceptions, stop words or lemmatizer data
| can make a big difference. It will also make it easier for us to provide
| a statistical model for the language in the future. Submitting a test
| that documents a bug or performance issue, or covers functionality that's
| especially important for your application is also very helpful. This way,
| you'll also make sure we never accidentally introduce regressions to the
| parts of the library that you care about the most.
p
strong
| For more details on the types of contributions we're looking for, the
| code conventions and other useful tips, make sure to check out the
| #[+a(gh("spaCy", "CONTRIBUTING.md")) contributing guidelines].
+infobox("Code of Conduct")
| spaCy adheres to the
| #[+a("http://contributor-covenant.org/version/1/4/") Contributor Covenant Code of Conduct].
| By participating, you are expected to uphold this code.
+h(3, "faq-project-with-spacy")
| I've built something cool with spaCy — how can I get the word out?
p
| First, congrats — we'd love to check it out! When you share your
| project on Twitter, don't forget to tag
| #[+a("https://twitter.com/" + SOCIAL.twitter) @#{SOCIAL.twitter}] so we
| don't miss it. If you think your project would be a good fit for the
| #[+a("/docs/usage/showcase") showcase], #[strong feel free to submit it!]
| Tutorials are also incredibly valuable to other users and a great way to
| get exposure. So we strongly encourage #[strong writing up your experiences],
| or sharing your code and some tips and tricks on your blog. Since our
| website is open-source, you can add your project or tutorial by making a
| pull request on GitHub.
+aside("Contributing to spacy.io")
| All showcase and tutorial links are stored in a
| #[+a(gh("spaCy", "website/docs/usage/_data.json")) JSON file], so you
| won't even have to edit any markup. For more info on how to submit
| your project, see the
| #[+a(gh("spaCy", "CONTRIBUTING.md#submitting-a-project-to-the-showcase")) contributing guidelines]
| and our #[+a(gh("spaCy", "website")) website docs].
p
| If you would like to use the spaCy logo on your site, please get in touch
| and ask us first. However, if you want to show support and tell others
| that your project is using spaCy, you can grab one of our
| #[strong spaCy badges] here:
- SPACY_BADGES = ["built%20with-spaCy-09a3d5.svg", "made%20with%20❤%20and-spaCy-09a3d5.svg", "spaCy-v2-09a3d5.svg"]
+quickstart([{id: "badge", input_style: "check", options: SPACY_BADGES.map(function(badge, i) { return {id: i, title: "<img class='o-icon' src='https://img.shields.io/badge/" + badge + "' height='20'/>", checked: (i == 0) ? true : false}}) }], false, false, true)
.c-code-block(data-qs-results)
for badge, i in SPACY_BADGES
- var url = "https://img.shields.io/badge/" + badge
+code(false, "text", "star").o-no-block(data-qs-badge=i)=url
+code(false, "text", "code").o-no-block(data-qs-badge=i).
&lt;a href="#{SITE_URL}"&gt;&lt;img src="#{url}" height="20"&gt;&lt;/a&gt;
+code(false, "text", "markdown").o-no-block(data-qs-badge=i).
[![spaCy](#{url})](#{SITE_URL})

View File

@ -1,5 +0,0 @@
//- 💫 DOCS > USAGE > TEXT CLASSIFICATION
include ../../_includes/_mixins
+under-construction

View File

@ -1,114 +0,0 @@
include ../../_includes/_mixins
p
| All #[+a("/docs/usage/models") spaCy models] support online learning, so
| you can update a pre-trained model with new examples. You can even add
| new classes to an existing model, to recognise a new entity type,
| part-of-speech, or syntactic relation. Updating an existing model is
| particularly useful as a "quick and dirty solution", if you have only a
| few corrections or annotations.
+h(2, "improving-accuracy") Improving accuracy on existing entity types
p
| To update the model, you first need to create an instance of
| #[+api("goldparse") #[code GoldParse]], with the entity labels
| you want to learn. You'll usually need to provide many examples to
| meaningfully improve the system — a few hundred is a good start, although
| more is better.
+image
include ../../assets/img/docs/training-loop.svg
.u-text-right
+button("/assets/img/docs/training-loop.svg", false, "secondary").u-text-tag View large graphic
p
| You should avoid iterating over the same few examples multiple times, or
| the model is likely to "forget" how to annotate other examples. If you
| iterate over the same few examples, you're effectively changing the loss
| function. The optimizer will find a way to minimize the loss on your
| examples, without regard for the consequences on the examples it's no
| longer paying attention to.
p
| One way to avoid this "catastrophic forgetting" problem is to "remind"
| the model of other examples by augmenting your annotations with sentences
| annotated with entities automatically recognised by the original model.
| Ultimately, this is an empirical process: you'll need to
| #[strong experiment on your own data] to find a solution that works best
| for you.
+h(2, "example") Example
+under-construction
+code.
import random
from spacy.lang.en import English
from spacy.gold import GoldParse, biluo_tags_from_offsets
def main(model_dir=None):
train_data = [
('Who is Shaka Khan?',
[(len('Who is '), len('Who is Shaka Khan'), 'PERSON')]),
('I like London and Berlin.',
[(len('I like '), len('I like London'), 'LOC'),
(len('I like London and '), len('I like London and Berlin'), 'LOC')])
]
nlp = English(pipeline=['tensorizer', 'ner'])
get_data = lambda: reformat_train_data(nlp.tokenizer, train_data)
optimizer = nlp.begin_training(get_data)
for itn in range(100):
random.shuffle(train_data)
losses = {}
for raw_text, entity_offsets in train_data:
doc = nlp.make_doc(raw_text)
gold = GoldParse(doc, entities=entity_offsets)
nlp.update([doc], [gold], drop=0.5, sgd=optimizer, losses=losses)
nlp.to_disk(model_dir)
+code.
def reformat_train_data(tokenizer, examples):
"""Reformat data to match JSON format"""
output = []
for i, (text, entity_offsets) in enumerate(examples):
doc = tokenizer(text)
ner_tags = biluo_tags_from_offsets(tokenizer(text), entity_offsets)
words = [w.text for w in doc]
tags = ['-'] * len(doc)
heads = [0] * len(doc)
deps = [''] * len(doc)
sentence = (range(len(doc)), words, tags, heads, deps, ner_tags)
output.append((text, [(sentence, [])]))
return output
p.u-text-right
+button(gh("spaCy", "examples/training/train_ner.py"), false, "secondary").u-text-tag View full example
+h(2, "saving-loading") Saving and loading
p
| After training your model, you'll usually want to save its state, and load
| it back later. You can do this with the
| #[+api("language#to_disk") #[code Language.to_disk()]] method:
+code.
nlp.to_disk('/home/me/data/en_technology')
p
| To make the model more convenient to deploy, we recommend wrapping it as
| a Python package, so that you can install it via pip and load it as a
| module. spaCy comes with a handy #[+api("cli#package") #[code package]]
| CLI command to create all required files and directories.
+code(false, "bash").
spacy package /home/me/data/en_technology /home/me/my_models
p
| To build the package and create a #[code .tar.gz] archive, run
| #[code python setup.py sdist] from within its directory.
+infobox("Saving and loading models")
| For more information and a detailed guide on how to package your model,
| see the documentation on
| #[+a("/docs/usage/saving-loading#models") saving and loading models].

View File

@ -1,38 +0,0 @@
//- 💫 DOCS > USAGE > TUTORIALS
include ../../_includes/_mixins
p
| Have you written a tutorial on spaCy, or did you find one that should be
| featured here? #[a(href="mailto:#{EMAIL}") Let us know!]
+h(2, "first-steps") First steps
p
| These tutorials help you get started. They describe how to set up your
| environment and start using spaCy.
+grid
each details, title in first_steps
+card(title, details)
+h(2, "features") Deep dives
p
| These tutorials take a closer look at particular features of spaCy, or
| particular types of NLP problems. Most come with more explanatory text,
| to help introduce you to new concepts.
+grid
each details, title in deep_dives
+card(title, details)
+h(2, "code") Programs and scripts
p
| These tutorials give you all the code and nothing but the code — they're
| Python scripts you can modify and run.
+grid
each details, title in code
+card(title, details)

View File

@ -1,531 +0,0 @@
//- 💫 DOCS > USAGE > WHAT'S NEW IN V2.0
include ../../_includes/_mixins
p
| We're very excited to finally introduce spaCy v2.0! On this page, you'll
| find a summary of the new features, information on the backwards
| incompatibilities, including a handy overview of what's been renamed or
| deprecated. To help you make the most of v2.0, we also
| #[strong re-wrote almost all of the usage guides and API docs], and added
| more real-world examples. If you're new to spaCy, or just want to brush
| up on some NLP basics and the details of the library, check out
| the #[+a("/docs/usage/spacy-101") spaCy 101 guide] that explains the most
| important concepts with examples and illustrations.
+h(2, "summary") Summary
+grid.o-no-block
+grid-col("half")
p This release features
| entirely new #[strong deep learning-powered models] for spaCy's tagger,
| parser and entity recognizer. The new models are #[strong 20x smaller]
| than the linear models that have powered spaCy until now: from 300 MB to
| only 15 MB.
p
| We've also made several usability improvements that are
| particularly helpful for #[strong production deployments]. spaCy
| v2 now fully supports the Pickle protocol, making it easy to use
| spaCy with #[+a("https://spark.apache.org/") Apache Spark]. The
| string-to-integer mapping is #[strong no longer stateful], making
| it easy to reconcile annotations made in different processes.
| Models are smaller and use less memory, and the APIs for serialization
| are now much more consistent.
+table-of-contents
+item #[+a("#summary") Summary]
+item #[+a("#features") New features]
+item #[+a("#features-pipelines") Improved processing pipelines]
+item #[+a("#features-text-classification") Text classification]
+item #[+a("#features-hash-ids") Hash values instead of integer IDs]
+item #[+a("#features-serializer") Saving, loading and serialization]
+item #[+a("#features-displacy") displaCy visualizer]
+item #[+a("#features-language") Language data and lazy loading]
+item #[+a("#features-matcher") Revised matcher API]
+item #[+a("#features-models") Neural network models]
+item #[+a("#incompat") Backwards incompatibilities]
+item #[+a("#migrating") Migrating from spaCy v1.x]
+item #[+a("#benchmarks") Benchmarks]
p
| The main usability improvements you'll notice in spaCy v2.0 are around
| #[strong defining, training and loading your own models] and components.
| The new neural network models make it much easier to train a model from
| scratch, or update an existing model with a few examples. In v1.x, the
| statistical models depended on the state of the #[code Vocab]. If you
| taught the model a new word, you would have to save and load a lot of
| data — otherwise the model wouldn't correctly recall the features of your
| new example. That's no longer the case.
p
| Due to some clever use of hashing, the statistical models
| #[strong never change size], even as they learn new vocabulary items.
| The whole pipeline is also now fully differentiable. Even if you don't
| have explicitly annotated data, you can update spaCy using all the
| #[strong latest deep learning tricks] like adversarial training, noise
| contrastive estimation or reinforcement learning.
+h(2, "features") New features
p
| This section contains an overview of the most important
| #[strong new features and improvements]. The #[+a("/docs/api") API docs]
| include additional deprecation notes. New methods and functions that
| were introduced in this version are marked with a #[+tag-new(2)] tag.
+h(3, "features-pipelines") Improved processing pipelines
+aside-code("Example").
# Modify an existing pipeline
nlp = spacy.load('en')
nlp.pipeline.append(my_component)
# Register a factory to create a component
spacy.set_factory('my_factory', my_factory)
nlp = Language(pipeline=['my_factory', my_component])
p
| It's now much easier to #[strong customise the pipeline] with your own
| components, functions that receive a #[code Doc] object, modify and
| return it. If your component is stateful, you can define and register a
| factory which receives the shared #[code Vocab] object and returns a
|  component. spaCy's default components can be added to your pipeline by
| using their string IDs. This way, you won't have to worry about finding
| and implementing them — simply add #[code "tagger"] to the pipeline,
| and spaCy will know what to do.
+image
include ../../assets/img/docs/pipeline.svg
+infobox
| #[strong API:] #[+api("language") #[code Language]]
| #[strong Usage:] #[+a("/docs/usage/language-processing-pipeline") Processing text]
+h(3, "features-text-classification") Text classification
+aside-code("Example").
from spacy.lang.en import English
nlp = English(pipeline=['tensorizer', 'tagger', 'textcat'])
p
| spaCy v2.0 lets you add text categorization models to spaCy pipelines.
| The model supports classification with multiple, non-mutually exclusive
| labels so multiple labels can apply at once. You can change the model
| architecture rather easily, but by default, the #[code TextCategorizer]
| class uses a convolutional neural network to assign position-sensitive
| vectors to each word in the document.
+infobox
| #[strong API:] #[+api("textcategorizer") #[code TextCategorizer]],
| #[+api("doc#attributes") #[code Doc.cats]],
| #[+api("goldparse#attributes") #[code GoldParse.cats]]#[br]
| #[strong Usage:] #[+a("/docs/usage/text-classification") Text classification]
+h(3, "features-hash-ids") Hash values instead of integer IDs
+aside-code("Example").
doc = nlp(u'I love coffee')
assert doc.vocab.strings[u'coffee'] == 3197928453018144401
assert doc.vocab.strings[3197928453018144401] == u'coffee'
beer_hash = doc.vocab.strings.add(u'beer')
assert doc.vocab.strings[u'beer'] == beer_hash
assert doc.vocab.strings[beer_hash] == u'beer'
p
| The #[+api("stringstore") #[code StringStore]] now resolves all strings
| to hash values instead of integer IDs. This means that the string-to-int
| mapping #[strong no longer depends on the vocabulary state], making a lot
| of workflows much simpler, especially during training. Unlike integer IDs
| in spaCy v1.x, hash values will #[strong always match] even across
| models. Strings can now be added explicitly using the new
| #[+api("stringstore#add") #[code StringStore.add]] method. A token's hash
| is available via #[code token.orth].
+infobox
| #[strong API:] #[+api("stringstore") #[code StringStore]]
| #[strong Usage:] #[+a("/docs/usage/spacy-101#vocab") Vocab, hashes and lexemes 101]
+h(3, "features-serializer") Saving, loading and serialization
+aside-code("Example").
nlp = spacy.load('en') # shortcut link
nlp = spacy.load('en_core_web_sm') # package
nlp = spacy.load('/path/to/en') # unicode path
nlp = spacy.load(Path('/path/to/en')) # pathlib Path
nlp.to_disk('/path/to/nlp')
nlp = English().from_disk('/path/to/nlp')
p
| spaCy's serialization API has been made consistent across classes and
| objects. All container classes, i.e. #[code Language], #[code Doc],
| #[code Vocab] and #[code StringStore] now have a #[code to_bytes()],
| #[code from_bytes()], #[code to_disk()] and #[code from_disk()] method
| that supports the Pickle protocol.
p
| The improved #[code spacy.load] makes loading models easier and more
| transparent. You can load a model by supplying its
| #[+a("/docs/usage/models#usage") shortcut link], the name of an installed
| #[+a("/docs/usage/saving-loading#generating") model package] or a path.
| The #[code Language] class to initialise will be determined based on the
| model's settings. For a blank language, you can import the class directly,
| e.g. #[code from spacy.lang.en import English].
+infobox
| #[strong API:] #[+api("spacy#load") #[code spacy.load]], #[+api("binder") #[code Binder]]
| #[strong Usage:] #[+a("/docs/usage/saving-loading") Saving and loading]
+h(3, "features-displacy") displaCy visualizer with Jupyter support
+aside-code("Example").
from spacy import displacy
doc = nlp(u'This is a sentence about Facebook.')
displacy.serve(doc, style='dep') # run the web server
html = displacy.render(doc, style='ent') # generate HTML
p
| Our popular dependency and named entity visualizers are now an official
| part of the spaCy library! displaCy can run a simple web server, or
| generate raw HTML markup or SVG files to be exported. You can pass in one
| or more docs, and customise the style. displaCy also auto-detects whether
| you're running #[+a("https://jupyter.org") Jupyter] and will render the
| visualizations in your notebook.
+infobox
| #[strong API:] #[+api("displacy") #[code displacy]]
| #[strong Usage:] #[+a("/docs/usage/visualizers") Visualizing spaCy]
+h(3, "features-language") Improved language data and lazy loading
p
| Language-specific data now lives in its own submodule, #[code spacy.lang].
| Languages are lazy-loaded, i.e. only loaded when you import a
| #[code Language] class, or load a model that initialises one. This allows
| languages to contain more custom data, e.g. lemmatizer lookup tables, or
| complex regular expressions. The language data has also been tidied up
| and simplified. spaCy now also supports simple lookup-based lemmatization.
+infobox
| #[strong API:] #[+api("language") #[code Language]]
| #[strong Code:] #[+src(gh("spaCy", "spacy/lang")) spacy/lang]
| #[strong Usage:] #[+a("/docs/usage/adding-languages") Adding languages]
+h(3, "features-matcher") Revised matcher API
+aside-code("Example").
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)
matcher.add('HEARTS', None, [{'ORTH': '❤️', 'OP': '+'}])
assert len(matcher) == 1
assert 'HEARTS' in matcher
p
| Patterns can now be added to the matcher by calling
| #[+api("matcher#add") #[code matcher.add()]] with a match ID, an optional
| callback function to be invoked on each match, and one or more patterns.
| This allows you to write powerful, pattern-specific logic using only one
| matcher. For example, you might only want to merge some entity types,
| and set custom flags for other matched patterns.
+infobox
| #[strong API:] #[+api("matcher") #[code Matcher]]
| #[strong Usage:] #[+a("/docs/usage/rule-based-matching") Rule-based matching]
+h(3, "features-models") Neural network models for English, German, French, Spanish and multi-language NER
+aside-code("Example", "bash").
spacy download en # default English model
spacy download de # default German model
spacy download fr # default French model
spacy download es # default Spanish model
spacy download xx_ent_wiki_sm # multi-language NER
p
| spaCy v2.0 comes with new and improved neural network models for English,
| German, French and Spanish, as well as a multi-language named entity
| recognition model trained on Wikipedia. #[strong GPU usage] is now
| supported via #[+a("http://chainer.org") Chainer]'s CuPy module.
+infobox
| #[strong Details:] #[+a("/docs/api/language-models") Languages],
| #[+src(gh("spacy-models")) spacy-models]
| #[strong Usage:] #[+a("/docs/usage/models") Models],
| #[+a("/docs/usage#gpu") Using spaCy with GPU]
+h(2, "incompat") Backwards incompatibilities
+table(["Old", "New"])
+row
+cell
| #[code spacy.en]
| #[code spacy.xx]
+cell
| #[code spacy.lang.en]
| #[code spacy.lang.xx]
+row
+cell #[code orth]
+cell #[code lang.xx.lex_attrs]
+row
+cell #[code syntax.iterators]
+cell #[code lang.xx.syntax_iterators]
+row
+cell #[code Language.save_to_directory]
+cell #[+api("language#to_disk") #[code Language.to_disk]]
+row
+cell #[code Language.create_make_doc]
+cell #[+api("language#attributes") #[code Language.tokenizer]]
+row
+cell
| #[code Vocab.load]
| #[code Vocab.load_lexemes]
+cell
| #[+api("vocab#from_disk") #[code Vocab.from_disk]]
| #[+api("vocab#from_bytes") #[code Vocab.from_bytes]]
+row
+cell
| #[code Vocab.dump]
+cell
| #[+api("vocab#to_disk") #[code Vocab.to_disk]]#[br]
| #[+api("vocab#to_bytes") #[code Vocab.to_bytes]]
+row
+cell
| #[code Vocab.load_vectors]
| #[code Vocab.load_vectors_from_bin_loc]
+cell
| #[+api("vectors#from_disk") #[code Vectors.from_disk]]
| #[+api("vectors#from_bytes") #[code Vectors.from_bytes]]
+row
+cell
| #[code Vocab.dump_vectors]
+cell
| #[+api("vectors#to_disk") #[code Vectors.to_disk]]
| #[+api("vectors#to_bytes") #[code Vectors.to_bytes]]
+row
+cell
| #[code StringStore.load]
+cell
| #[+api("stringstore#from_disk") #[code StringStore.from_disk]]
| #[+api("stringstore#from_bytes") #[code StringStore.from_bytes]]
+row
+cell
| #[code StringStore.dump]
+cell
| #[+api("stringstore#to_disk") #[code StringStore.to_disk]]
| #[+api("stringstore#to_bytes") #[code StringStore.to_bytes]]
+row
+cell #[code Tokenizer.load]
+cell
| #[+api("tokenizer#from_disk") #[code Tokenizer.from_disk]]
| #[+api("tokenizer#from_bytes") #[code Tokenizer.from_bytes]]
+row
+cell #[code Tagger.load]
+cell
| #[+api("tagger#from_disk") #[code Tagger.from_disk]]
| #[+api("tagger#from_bytes") #[code Tagger.from_bytes]]
+row
+cell #[code DependencyParser.load]
+cell
| #[+api("dependencyparser#from_disk") #[code DependencyParser.from_disk]]
| #[+api("dependencyparser#from_bytes") #[code DependencyParser.from_bytes]]
+row
+cell #[code EntityRecognizer.load]
+cell
| #[+api("entityrecognizer#from_disk") #[code EntityRecognizer.from_disk]]
| #[+api("entityrecognizer#from_bytes") #[code EntityRecognizer.from_bytes]]
+row
+cell #[code Matcher.load]
+cell -
+row
+cell
| #[code Matcher.add_pattern]
| #[code Matcher.add_entity]
+cell #[+api("matcher#add") #[code Matcher.add]]
+row
+cell #[code Matcher.get_entity]
+cell #[+api("matcher#get") #[code Matcher.get]]
+row
+cell #[code Matcher.has_entity]
+cell #[+api("matcher#contains") #[code Matcher.__contains__]]
+row
+cell #[code Doc.read_bytes]
+cell #[+api("binder") #[code Binder]]
+row
+cell #[code Token.is_ancestor_of]
+cell #[+api("token#is_ancestor") #[code Token.is_ancestor]]
+row
+cell #[code cli.model]
+cell -
+h(2, "migrating") Migrating from spaCy v1.x
p
| Because we've made so many architectural changes to the library, we've
| tried to #[strong keep breaking changes to a minimum]. A lot of projects
| follow the philosophy that if you're going to break anything, you may as
| well break everything. We think migration is easier if there's a logic to
| what has changed.
p
| We've therefore followed a policy of avoiding breaking changes to the
| #[code Doc], #[code Span] and #[code Token] objects. This way, you can
| focus on only migrating the code that does training, loading and
| serialization — in other words, code that works with the #[code nlp]
| object directly. Code that uses the annotations should continue to work.
+infobox("Important note")
| If you've trained your own models, keep in mind that your train and
| runtime inputs must match. This means you'll have to
| #[strong retrain your models] with spaCy v2.0.
+h(3, "migrating-saving-loading") Saving, loading and serialization
p
| Double-check all calls to #[code spacy.load()] and make sure they don't
| use the #[code path] keyword argument. If you're only loading in binary
| data and not a model package that can construct its own #[code Language]
| class and pipeline, you should now use the
| #[+api("language#from_disk") #[code Language.from_disk()]] method.
+code-new.
nlp = spacy.load('/model')
nlp = English().from_disk('/model/data')
+code-old nlp = spacy.load('en', path='/model')
p
| Review all other code that writes state to disk or bytes.
| All containers now share the same, consistent API for saving and
| loading. Replace saving with #[code to_disk()] or #[code to_bytes()], and
| loading with #[code from_disk()] and #[code from_bytes()].
+code-new.
nlp.to_disk('/model')
nlp.vocab.to_disk('/vocab')
+code-old.
nlp.save_to_directory('/model')
nlp.vocab.dump('/vocab')
p
| If you've trained models with input from v1.x, you'll need to
| #[strong retrain them] with spaCy v2.0. All previous models will not
| be compatible with the new version.
+h(3, "migrating-strings") Strings and hash values
p
| The change from integer IDs to hash values may not actually affect your
| code very much. However, if you're adding strings to the vocab manually,
| you now need to call #[+api("stringstore#add") #[code StringStore.add()]]
| explicitly. You can also now be sure that the string-to-hash mapping will
| always match across vocabularies.
+code-new.
nlp.vocab.strings.add(u'coffee')
nlp.vocab.strings[u'coffee'] # 3197928453018144401
other_nlp.vocab.strings[u'coffee'] # 3197928453018144401
+code-old.
nlp.vocab.strings[u'coffee'] # 3672
other_nlp.vocab.strings[u'coffee'] # 40259
+h(3, "migrating-languages") Processing pipelines and language data
p
| If you're importing language data or #[code Language] classes, make sure
| to change your import statements to import from #[code spacy.lang]. If
| you've added your own custom language, it needs to be moved to
| #[code spacy/lang/xx] and adjusted accordingly.
+code-new from spacy.lang.en import English
+code-old from spacy.en import English
p
| If you've been using custom pipeline components, check out the new
| guide on #[+a("/docs/usage/language-processing-pipeline") processing pipelines].
| Appending functions to the pipeline still works but you might be able
| to make this more convenient by registering "component factories".
| Components of the processing pipeline can now be disabled by passing a
| list of their names to the #[code disable] keyword argument on loading
| or processing.
+code-new.
nlp = spacy.load('en', disable=['tagger', 'ner'])
doc = nlp(u"I don't want parsed", disable=['parser'])
+code-old.
nlp = spacy.load('en', tagger=False, entity=False)
doc = nlp(u"I don't want parsed", parse=False)
+h(3, "migrating-matcher") Adding patterns and callbacks to the matcher
p
| If you're using the matcher, you can now add patterns in one step. This
| should be easy to update — simply merge the ID, callback and patterns
| into one call to #[+api("matcher#add") #[code Matcher.add()]].
+code-new.
matcher.add('GoogleNow', merge_phrases, [{ORTH: 'Google'}, {ORTH: 'Now'}])
+code-old.
matcher.add_entity('GoogleNow', on_match=merge_phrases)
matcher.add_pattern('GoogleNow', [{ORTH: 'Google'}, {ORTH: 'Now'}])
p
| If you've been using #[strong acceptor functions], you'll need to move
| this logic into the
| #[+a("/docs/usage/rule-based-matching#on_match") #[code on_match] callbacks].
| The callback function is invoked on every match and will give you access to
| the doc, the index of the current match and all total matches. This lets
| you both accept or reject the match, and define the actions to be
| triggered.
+h(2, "benchmarks") Benchmarks
+under-construction
+aside("Data sources")
| #[strong Parser, tagger, NER:] #[+a("https://www.gabormelli.com/RKB/OntoNotes_Corpus") OntoNotes 5]#[br]
| #[strong Word vectors:] #[+a("http://commoncrawl.org") Common Crawl]#[br]
p The evaluation was conducted on raw text with no gold standard information.
+table(["Model", "Version", "Type", "UAS", "LAS", "NER F", "POS", "w/s"])
mixin benchmark-row(name, details, values, highlight, style)
+row(style)
+cell #[code=name]
for cell in details
+cell=cell
for cell, i in values
+cell.u-text-right
if highlight && highlight[i]
strong=cell
else
!=cell
+benchmark-row("en_core_web_sm", ["2.0.0", "neural"], ["91.2", "89.2", "82.6", "96.6", "10,300"], [1, 1, 1, 0, 0])
+benchmark-row("en_core_web_sm", ["1.2.0", "linear"], ["86.6", "83.8", "78.5", "96.6", "25,700"], [0, 0, 0, 0, 1], "divider")
+benchmark-row("en_core_web_md", ["1.2.1", "linear"], ["90.6", "88.5", "81.4", "96.7", "18,800"], [0, 0, 0, 1, 0])

View File

@ -1,58 +1,4 @@
//- 💫 DOCS > USAGE > ADDING LANGUAGES
include ../../_includes/_mixins
p
| Adding full support for a language touches many different parts of the
| spaCy library. This guide explains how to fit everything together, and
| points you to the specific workflows for each component.
+aside("Working on spaCy's source")
| To add a new language to spaCy, you'll need to
| #[strong modify the library&apos;s code]. The easiest way to do this is to
| clone the #[+src(gh("spaCy")) repository] and #[strong build spaCy from source].
| For more information on this, see the #[+a("/docs/usage") installation guide].
| Unlike spaCy's core, which is mostly written in Cython, all language
| data is stored in regular Python files. This means that you won't have to
| rebuild anything in between — you can simply make edits and reload spaCy
| to test them.
+grid.o-no-block
+grid-col("half")
p
| Obviously, there are lots of ways you can organise your code when
| you implement your own language data. This guide will focus on
| how it's done within spaCy. For full language support, you'll
| need to create a #[code Language] subclass, define custom
| #[strong language data], like a stop list and tokenizer
| exceptions and test the new tokenizer. Once the language is set
| up, you can #[strong build the vocabulary], including word
| frequencies, Brown clusters and word vectors. Finally, you can
| #[strong train the tagger and parser], and save the model to a
| directory.
p
| For some languages, you may also want to develop a solution for
| lemmatization and morphological analysis.
+table-of-contents
+item #[+a("#101") Language data 101]
+item #[+a("#language-subclass") The Language subclass]
+item #[+a("#stop-words") Stop words]
+item #[+a("#tokenizer-exceptions") Tokenizer exceptions]
+item #[+a("#norm-exceptions") Norm exceptions]
+item #[+a("#lex-attrs") Lexical attributes]
+item #[+a("#syntax-iterators") Syntax iterators]
+item #[+a("#lemmatizer") Lemmatizer]
+item #[+a("#tag-map") Tag map]
+item #[+a("#morph-rules") Morph rules]
+item #[+a("#testing") Testing the tokenizer]
+item #[+a("#vocabulary") Building the vocabulary]
+item #[+a("#training") Training]
+h(2, "101") Language data 101
include _spacy-101/_language-data
//- 💫 DOCS > USAGE > ADDING LANGUAGES > LANGUAGE DATA
p
| The individual components #[strong expose variables] that can be imported
@ -137,7 +83,7 @@ p
+aside("Should I ever update the global data?")
| Reuseable language data is collected as atomic pieces in the root of the
| #[+src(gh("spaCy", "lang")) spacy.lang] package. Often, when a new
| #[+src(gh("spaCy", "lang")) #[code spacy.lang]] package. Often, when a new
| language is added, you'll find a pattern or symbol that's missing. Even
| if it isn't common in other languages, it might be best to add it to the
| shared language data, unless it has some conflicting interpretation. For
@ -150,14 +96,14 @@ p
| needs to know the language's character set. If the language you're adding
| uses non-Latin characters, you might need to add the required character
| classes to the global
| #[+src(gh("spacy", "spacy/lang/char_classes.py")) char_classes.py].
| #[+src(gh("spacy", "spacy/lang/char_classes.py")) #[code char_classes.py]].
| spaCy uses the #[+a("https://pypi.python.org/pypi/regex/") #[code regex] library]
| to keep this simple and readable. If the language requires very specific
| punctuation rules, you should consider overwriting the default regular
| expressions with your own in the language's #[code Defaults].
+h(2, "language-subclass") Creating a #[code Language] subclass
+h(3, "language-subclass") Creating a #[code Language] subclass
p
| Language-specific code and resources should be organised into a
@ -250,7 +196,7 @@ p
+h(3, "tokenizer-exceptions") Tokenizer exceptions
p
| spaCy's #[+a("/docs/usage/customizing-tokenizer#how-tokenizer-works") tokenization algorithm]
| spaCy's #[+a("/usage/linguistic-features#how-tokenizer-works") tokenization algorithm]
| lets you deal with whitespace-delimited chunks separately. This makes it
| easy to define special-case rules, without worrying about how they
| interact with the rest of the tokenizer. Whenever the key string is
@ -284,7 +230,7 @@ p
| efficiently and make your data less verbose. How you do this ultimately
| depends on the language. Here's an example of how exceptions for time
| formats like "1a.m." and "1am" are generated in the English
| #[+src(gh("spaCy", "spacy/lang/en/tokenizer_exceptions.py")) tokenizer_exceptions.py]:
| #[+src(gh("spaCy", "spacy/lang/en/tokenizer_exceptions.py")) #[code tokenizer_exceptions.py]]:
+code("tokenizer_exceptions.py (excerpt)").
# use short, internal variable for readability
@ -376,7 +322,7 @@ p
p
| Norm exceptions can be provided as a simple dictionary. For more examples,
| see the English
| #[+src(gh("spaCy", "spacy/lang/en/norm_exceptions.py")) norm_exceptions.py].
| #[+src(gh("spaCy", "spacy/lang/en/norm_exceptions.py")) #[code norm_exceptions.py]].
+code("Example").
NORM_EXCEPTIONS = {
@ -428,7 +374,7 @@ p
p
| Here's an example from the English
| #[+src(gh("spaCy", "spacy/lang/en/lex_attrs.py")) lex_attrs.py]:
| #[+src(gh("spaCy", "spacy/lang/en/lex_attrs.py")) #[code lex_attrs.py]]:
+code("lex_attrs.py").
_num_words = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven',
@ -466,7 +412,7 @@ p
| Syntax iterators are functions that compute views of a #[code Doc]
| object based on its syntax. At the moment, this data is only used for
| extracting
| #[+a("/docs/usage/dependency-parse#noun-chunks") noun chunks], which
| #[+a("/usage/linguistic-features#noun-chunks") noun chunks], which
| are available as the #[+api("doc#noun_chunks") #[code Doc.noun_chunks]]
| property. Because base noun phrases work differently across languages,
| the rules to compute them are part of the individual language's data. If
@ -479,13 +425,14 @@ p
assert chunks[0].text == "A phrase"
assert chunks[1].text == "another phrase"
+table(["Language", "Source"])
for lang, lang_id in {en: "English", de: "German", es: "Spanish"}
+table(["Language", "Code", "Source"])
for lang in ["en", "de", "fr", "es"]
+row
+cell=lang
+cell=LANGUAGES[lang]
+cell #[code=lang]
+cell
+src(gh("spaCy", "spacy/lang/" + lang_id + "/syntax_iterators.py"))
| lang/#{lang_id}/syntax_iterators.py
+src(gh("spaCy", "spacy/lang/" + lang + "/syntax_iterators.py"))
code lang/#{lang}/syntax_iterators.py
+h(3, "lemmatizer") Lemmatizer
@ -547,7 +494,7 @@ p
| #[+a("http://universaldependencies.org/u/pos/all.html") Universal Dependencies]
| tags. Optionally, you can also include morphological features or other
| token attributes in the tag map as well. This allows you to do simple
| #[+a("/docs/usage/pos-tagging#rule-based-morphology") rule-based morphological analysis].
| #[+a("/usage/linguistic-features#rule-based-morphology") rule-based morphological analysis].
+code("Example").
from ..symbols import POS, NOUN, VERB, DET
@ -560,233 +507,62 @@ p
+h(3, "morph-rules") Morph rules
+under-construction
p
| The morphology rules let you set token attributes such as lemmas, keyed
| by the extended part-of-speech tag and token text. The morphological
| features and their possible values are language-specific and based on the
| #[+a("http://universaldependencies.org") Universal Dependencies scheme].
+h(2, "testing") Testing the new language tokenizer
+code("Example").
from ..symbols import LEMMA
MORPH_RULES = {
"VBZ": {
"am": {LEMMA: "be", "VerbForm": "Fin", "Person": "One", "Tense": "Pres", "Mood": "Ind"},
"are": {LEMMA: "be", "VerbForm": "Fin", "Person": "Two", "Tense": "Pres", "Mood": "Ind"},
"is": {LEMMA: "be", "VerbForm": "Fin", "Person": "Three", "Tense": "Pres", "Mood": "Ind"},
"'re": {LEMMA: "be", "VerbForm": "Fin", "Person": "Two", "Tense": "Pres", "Mood": "Ind"},
"'s": {LEMMA: "be", "VerbForm": "Fin", "Person": "Three", "Tense": "Pres", "Mood": "Ind"}
}
}
p
| Before using the new language or submitting a
| #[+a(gh("spaCy") + "/pulls") pull request] to spaCy, you should make sure
| it works as expected. This is especially important if you've added custom
| regular expressions for token matching or punctuation — you don't want to
| be causing regressions.
| In the example of #[code "am"], the attributes look like this:
+aside("spaCy's test suite")
| spaCy uses the #[+a("https://docs.pytest.org/en/latest/") pytest framework]
| for testing. For more details on how the tests are structured and best
| practices for writing your own tests, see our
| #[+a(gh("spaCy", "spacy/tests")) tests documentation].
+table(["Attribute", "Description"])
+row
+cell #[code LEMMA: "be"]
+cell Base form, e.g. "to be".
+h(3, "testing-tokenizer") Testing the basic tokenizer
+row
+cell #[code "VerbForm": "Fin"]
+cell
| Finite verb. Finite verbs have a subject and can be the root of
| an independent clause — "I am." is a valid, complete
| sentence.
p
| The easiest way to test your new tokenizer is to run the
| language-independent "tokenizer sanity" tests located in
| #[+src(gh("spaCy", "spacy/tests/tokenizer")) tests/tokenizer]. This will
| test for basic behaviours like punctuation splitting, URL matching and
| correct handling of whitespace. In the
| #[+src(gh("spaCy", "spacy/tests/conftest.py")) conftest.py], add the new
| language ID to the list of #[code _languages]:
+row
+cell #[code "Person": "One"]
+cell First person, i.e. "#[strong I] am".
+code.
_languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'he', 'hu', 'it', 'nb',
'nl', 'pl', 'pt', 'sv', 'xx'] # new language here
+row
+cell #[code "Tense": "Pres"]
+cell
| Present tense, i.e. actions that are happening right now or
| actions that usually happen.
+aside-code("Global tokenizer test example").
# use fixture by adding it as an argument
def test_with_all_languages(tokenizer):
# will be performed on ALL language tokenizers
tokens = tokenizer(u'Some text here.')
+row
+cell #[code "Mood": "Ind"]
+cell
| Indicative, i.e. something happens, has happened or will happen
| (as opposed to imperative or conditional).
p
| The language will now be included in the #[code tokenizer] test fixture,
| which is used by the basic tokenizer tests. If you want to add your own
| tests that should be run over all languages, you can use this fixture as
| an argument of your test function.
+h(3, "testing-custom") Writing language-specific tests
p
| It's recommended to always add at least some tests with examples specific
| to the language. Language tests should be located in
| #[+src(gh("spaCy", "spacy/tests/lang")) tests/lang] in a directory named
| after the language ID. You'll also need to create a fixture for your
| tokenizer in the #[+src(gh("spaCy", "spacy/tests/conftest.py")) conftest.py].
| Always use the #[code get_lang_class()] helper function within the fixture,
| instead of importing the class at the top of the file. This will load the
| language data only when it's needed. (Otherwise, #[em all data] would be
| loaded every time you run a test.)
+code.
@pytest.fixture
def en_tokenizer():
return util.get_lang_class('en').Defaults.create_tokenizer()
p
| When adding test cases, always
| #[+a(gh("spaCy", "spacy/tests#parameters")) #[code parametrize]] them —
| this will make it easier for others to add more test cases without having
| to modify the test itself. You can also add parameter tuples, for example,
| a test sentence and its expected length, or a list of expected tokens.
| Here's an example of an English tokenizer test for combinations of
| punctuation and abbreviations:
+code("Example test").
@pytest.mark.parametrize('text,length', [
("The U.S. Army likes Shock and Awe.", 8),
("U.N. regulations are not a part of their concern.", 10),
("“Isn't it?”", 6)])
def test_en_tokenizer_handles_punct_abbrev(en_tokenizer, text, length):
tokens = en_tokenizer(text)
assert len(tokens) == length
+h(2, "vocabulary") Building the vocabulary
+under-construction
p
| spaCy expects that common words will be cached in a
| #[+api("vocab") #[code Vocab]] instance. The vocabulary caches lexical
| features, and makes it easy to use information from unlabelled text
| samples in your models. Specifically, you'll usually want to collect
| word frequencies, and train two types of distributional similarity model:
| Brown clusters, and word vectors. The Brown clusters are used as features
| by linear models, while the word vectors are useful for lexical
| similarity models and deep learning.
+h(3, "word-frequencies") Word frequencies
p
| To generate the word frequencies from a large, raw corpus, you can use the
| #[+src(gh("spacy-dev-resources", "training/word_freqs.py")) word_freqs.py]
| script from the spaCy developer resources. Note that your corpus should
| not be preprocessed (i.e. you need punctuation for example). The
| #[+api("cli#model") #[code model]] command expects a tab-separated word
| frequencies file with three columns:
+list("numbers")
+item The number of times the word occurred in your language sample.
+item The number of distinct documents the word occurred in.
+item The word itself.
p
| An example word frequencies file could look like this:
+code("es_word_freqs.txt", "text").
6361109 111 Aunque
23598543 111 aunque
10097056 111 claro
193454 111 aro
7711123 111 viene
12812323 111 mal
23414636 111 momento
2014580 111 felicidad
233865 111 repleto
15527 111 eto
235565 111 deliciosos
17259079 111 buena
71155 111 Anímate
37705 111 anímate
33155 111 cuéntanos
2389171 111 cuál
961576 111 típico
p
| You should make sure you use the spaCy tokenizer for your
| language to segment the text for your word frequencies. This will ensure
| that the frequencies refer to the same segmentation standards you'll be
| using at run-time. For instance, spaCy's English tokenizer segments
| "can't" into two tokens. If we segmented the text by whitespace to
| produce the frequency counts, we'd have incorrect frequency counts for
| the tokens "ca" and "n't".
+h(3, "brown-clusters") Training the Brown clusters
p
| spaCy's tagger, parser and entity recognizer are designed to use
| distributional similarity features provided by the
| #[+a("https://github.com/percyliang/brown-cluster") Brown clustering algorithm].
| You should train a model with between 500 and 1000 clusters. A minimum
| frequency threshold of 10 usually works well.
p
| An example clusters file could look like this:
+code("es_clusters.data", "text").
0000 Vestigial 1
0000 Vesturland 1
0000 Veyreau 1
0000 Veynes 1
0000 Vexilografía 1
0000 Vetrigne 1
0000 Vetónica 1
0000 Asunden 1
0000 Villalambrús 1
0000 Vichuquén 1
0000 Vichtis 1
0000 Vichigasta 1
0000 VAAH 1
0000 Viciebsk 1
0000 Vicovaro 1
0000 Villardeveyo 1
0000 Vidala 1
0000 Videoguard 1
0000 Vedás 1
0000 Videocomunicado 1
0000 VideoCrypt 1
+h(3, "word-vectors") Training the word vectors
+under-construction
p
| #[+a("https://en.wikipedia.org/wiki/Word2vec") Word2vec] and related
| algorithms let you train useful word similarity models from unlabelled
| text. This is a key part of using
| #[+a("/docs/usage/deep-learning") deep learning] for NLP with limited
| labelled data. The vectors are also useful by themselves — they power
| the #[code .similarity()] methods in spaCy. For best results, you should
| pre-process the text with spaCy before training the Word2vec model. This
| ensures your tokenization will match.
p
| You can use our
| #[+src(gh("spacy-dev-resources", "training/word_vectors.py")) word vectors training script],
| which pre-processes the text with your language-specific tokenizer and
| trains the model using #[+a("https://radimrehurek.com/gensim/") Gensim].
| The #[code vectors.bin] file should consist of one word and vector per line.
//-+aside-code("your_data_directory", "yaml").
├── vocab/
| ├── lexemes.bin
| ├── strings.json
| └── oov_prob
├── pos/
| ├── model
| └── config.json
├── deps/
| ├── model
| └── config.json
└── ner/
├── model
└── config.json
+h(2, "train-tagger-parser") Training the tagger and parser
+under-construction
p
| You can now train the model using a corpus for your language annotated
| with #[+a("http://universaldependencies.org/") Universal Dependencies].
| If your corpus uses the
| #[+a("http://universaldependencies.org/docs/format.html") CoNLL-U] format,
| i.e. files with the extension #[code .conllu], you can use the
| #[+api("cli#convert") #[code convert]] command to convert it to spaCy's
| #[+a("/docs/api/annotation#json-input") JSON format] for training.
p
| Once you have your UD corpus transformed into JSON, you can train your
| model using spaCy's #[+api("cli#train") #[code train]] command:
+code(false, "bash").
spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter] [--n-sents] [--use-gpu] [--no-tagger] [--no-parser] [--no-entities]
+infobox("Important note", "⚠️")
| The morphological attributes are currently #[strong not all used by spaCy].
| Full integration is still being developed. In the meantime, it can still
| be useful to add them, especially if the language you're adding includes
| important distinctions and special cases. This ensures that as soon as
| full support is introduced, your language will be able to assign all
| possible attributes.

View File

@ -0,0 +1,76 @@
//- 💫 DOCS > USAGE > ADDING LANGUAGES > TESTING
p
| Before using the new language or submitting a
| #[+a(gh("spaCy") + "/pulls") pull request] to spaCy, you should make sure
| it works as expected. This is especially important if you've added custom
| regular expressions for token matching or punctuation — you don't want to
| be causing regressions.
+infobox("spaCy's test suite")
| spaCy uses the #[+a("https://docs.pytest.org/en/latest/") pytest framework]
| for testing. For more details on how the tests are structured and best
| practices for writing your own tests, see our
| #[+a(gh("spaCy", "spacy/tests")) tests documentation].
p
| The easiest way to test your new tokenizer is to run the
| language-independent "tokenizer sanity" tests located in
| #[+src(gh("spaCy", "spacy/tests/tokenizer")) #[code tests/tokenizer]].
| This will test for basic behaviours like punctuation splitting, URL
| matching and correct handling of whitespace. In the
| #[+src(gh("spaCy", "spacy/tests/conftest.py")) #[code conftest.py]], add
| the new language ID to the list of #[code _languages]:
+code.
_languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'he', 'hu', 'it', 'nb',
'nl', 'pl', 'pt', 'sv', 'xx'] # new language here
+aside-code("Global tokenizer test example").
# use fixture by adding it as an argument
def test_with_all_languages(tokenizer):
# will be performed on ALL language tokenizers
tokens = tokenizer(u'Some text here.')
p
| The language will now be included in the #[code tokenizer] test fixture,
| which is used by the basic tokenizer tests. If you want to add your own
| tests that should be run over all languages, you can use this fixture as
| an argument of your test function.
+h(3, "testing-custom") Writing language-specific tests
p
| It's recommended to always add at least some tests with examples specific
| to the language. Language tests should be located in
| #[+src(gh("spaCy", "spacy/tests/lang")) #[code tests/lang]] in a
| directory named after the language ID. You'll also need to create a
| fixture for your tokenizer in the
| #[+src(gh("spaCy", "spacy/tests/conftest.py")) #[code conftest.py]].
| Always use the #[+api("util#get_lang_class") #[code get_lang_class()]]
| helper function within the fixture, instead of importing the class at the
| top of the file. This will load the language data only when it's needed.
| (Otherwise, #[em all data] would be loaded every time you run a test.)
+code.
@pytest.fixture
def en_tokenizer():
return util.get_lang_class('en').Defaults.create_tokenizer()
p
| When adding test cases, always
| #[+a(gh("spaCy", "spacy/tests#parameters")) #[code parametrize]] them —
| this will make it easier for others to add more test cases without having
| to modify the test itself. You can also add parameter tuples, for example,
| a test sentence and its expected length, or a list of expected tokens.
| Here's an example of an English tokenizer test for combinations of
| punctuation and abbreviations:
+code("Example test").
@pytest.mark.parametrize('text,length', [
("The U.S. Army likes Shock and Awe.", 8),
("U.N. regulations are not a part of their concern.", 10),
("“Isn't it?”", 6)])
def test_en_tokenizer_handles_punct_abbrev(en_tokenizer, text, length):
tokens = en_tokenizer(text)
assert len(tokens) == length

View File

@ -0,0 +1,93 @@
//- 💫 DOCS > USAGE > ADDING LANGUAGES > TRAINING
p
| spaCy expects that common words will be cached in a
| #[+api("vocab") #[code Vocab]] instance. The vocabulary caches lexical
| features, and makes it easy to use information from unlabelled text
| samples in your models. Specifically, you'll usually want to collect
| word frequencies, and train word vectors. To generate the word frequencies
| from a large, raw corpus, you can use the
| #[+src(gh("spacy-dev-resources", "training/word_freqs.py")) #[code word_freqs.py]]
| script from the spaCy developer resources.
+github("spacy-dev-resources", "training/word_freqs.py")
p
| Note that your corpus should not be preprocessed (for example, it needs
| to retain punctuation). The word frequencies should be generated as a
| tab-separated file with three columns:
+list("numbers")
+item The number of times the word occurred in your language sample.
+item The number of distinct documents the word occurred in.
+item The word itself.
+code("es_word_freqs.txt", "text").
6361109 111 Aunque
23598543 111 aunque
10097056 111 claro
193454 111 aro
7711123 111 viene
12812323 111 mal
23414636 111 momento
2014580 111 felicidad
233865 111 repleto
15527 111 eto
235565 111 deliciosos
17259079 111 buena
71155 111 Anímate
37705 111 anímate
33155 111 cuéntanos
2389171 111 cuál
961576 111 típico
+aside("Brown Clusters")
| Additionally, you can use distributional similarity features provided by the
| #[+a("https://github.com/percyliang/brown-cluster") Brown clustering algorithm].
| You should train a model with between 500 and 1000 clusters. A minimum
| frequency threshold of 10 usually works well.
p
| You should make sure you use the spaCy tokenizer for your
| language to segment the text for your word frequencies. This will ensure
| that the frequencies refer to the same segmentation standards you'll be
| using at run-time. For instance, spaCy's English tokenizer segments
| "can't" into two tokens. If we segmented the text by whitespace to
| produce the frequency counts, we'd have incorrect frequency counts for
| the tokens "ca" and "n't".
+h(4, "word-vectors") Training the word vectors
p
| #[+a("https://en.wikipedia.org/wiki/Word2vec") Word2vec] and related
| algorithms let you train useful word similarity models from unlabelled
| text. This is a key part of using
| #[+a("/usage/deep-learning") deep learning] for NLP with limited
| labelled data. The vectors are also useful by themselves — they power
| the #[code .similarity()] methods in spaCy. For best results, you should
| pre-process the text with spaCy before training the Word2vec model. This
| ensures your tokenization will match. You can use our
| #[+src(gh("spacy-dev-resources", "training/word_vectors.py")) word vectors training script],
| which pre-processes the text with your language-specific tokenizer and
| trains the model using #[+a("https://radimrehurek.com/gensim/") Gensim].
| The #[code vectors.bin] file should consist of one word and vector per line.
+github("spacy-dev-resources", "training/word_vectors.py")
+h(3, "train-tagger-parser") Training the tagger and parser
p
| You can now train the model using a corpus for your language annotated
| with #[+a("http://universaldependencies.org/") Universal Dependencies].
| If your corpus uses the
| #[+a("http://universaldependencies.org/docs/format.html") CoNLL-U] format,
| i.e. files with the extension #[code .conllu], you can use the
| #[+api("cli#convert") #[code convert]] command to convert it to spaCy's
| #[+a("/api/annotation#json-input") JSON format] for training.
| Once you have your UD corpus transformed into JSON, you can train your
| model using spaCy's #[+api("cli#train") #[code train]] command.
+infobox
| For more details and examples of how to
| #[strong train the tagger and dependency parser], see the
| #[+a("/usage/training#tagger-parser") usage guide on training].

195
website/usage/_data.json Normal file
View File

@ -0,0 +1,195 @@
{
"sidebar": {
"Get started": {
"Installation": "./",
"Models & Languages": "models",
"Facts & Figures": "facts-figures",
"spaCy 101": "spacy-101",
"New in v2.0": "v2"
},
"Guides": {
"Linguistic Features": "linguistic-features",
"Processing Pipelines": "processing-pipelines",
"Vectors & Similarity": "vectors-similarity",
"Text Classification": "text-classification",
"Deep Learning": "deep-learning",
"Training Models": "training",
"Adding Languages": "adding-languages",
"Visualizers": "visualizers"
},
"In-depth": {
"Code Examples": "examples",
"Resources": "resources"
}
},
"index": {
"title": "Install spaCy",
"next": "models",
"quickstart": true,
"changelog": true,
"menu": {
"Quickstart": "quickstart",
"Instructions": "instructions",
"Troubleshooting": "troubleshooting",
"Changelog": "changelog"
}
},
"models": {
"title": "Models & Languages",
"next": "facts-figures",
"quickstart": true,
"menu": {
"Quickstart": "quickstart",
"Available Models": "available",
"Installation & Usage": "install",
"Language Support": "languages",
"Production Use": "production"
}
},
"facts-figures": {
"title": "Facts & Figures",
"teaser": "The hard numbers for spaCy and how it compares to other libraries and tools.",
"next": "spacy-101",
"menu": {
"Feature comparison": "comparison",
"Benchmarks": "benchmarks",
"Powered by spaCy": "powered-by",
"Other Libraries": "other-libraries"
}
},
"spacy-101": {
"title": "spaCy 101: Everything you need to know",
"teaser": "The most important concepts, explained in simple terms.",
"next": "index",
"quickstart": true,
"preview": "101",
"menu": {
"Features": "features",
"Lightning tour": "lightning-tour",
"Architecture": "architecture",
"Community & FAQ": "community-faq"
}
},
"v2": {
"title": "What's New in v2.0",
"teaser": "New features, backwards incompatibilities and migration guide.",
"menu": {
"New features": "features",
"Backwards Incompatibilities": "incompat",
"Migrating from v1.x": "migrating",
"Benchmarks": "benchmarks"
}
},
"linguistic-features": {
"title": "Linguistic Features",
"teaser": "Using spaCy to extract linguistic features like part-of-speech tags, dependency labels and named entities, customising the tokenizer and working with the rule-based matcher.",
"next": "processing-pipelines",
"menu": {
"POS Tagging": "pos-tagging",
"Dependency Parse": "dependency-parse",
"Named Entities": "named-entities",
"Tokenization": "tokenization",
"Rule-based Matching": "rule-based-matching"
}
},
"processing-pipelines": {
"title": "Language Processing Pipelines",
"next": "vectors-similarity",
"menu": {
"How pipelines work": "pipelines",
"Examples": "examples",
"Multi-threading": "multithreading",
"User Hooks": "user-hooks",
"Serialization": "serialization"
}
},
"vectors-similarity": {
"title": "Word Vectors and Semantic Similarity",
"next": "text-classification",
"menu": {
"Basics": "basics",
"Similarity in Context": "in-context",
"Custom Vectors": "custom",
"GPU Usage": "gpu"
}
},
"deep-learning": {
"title": "Deep Learning",
"teaser": "Using spaCy to pre-process text for deep learning, and how to plug in your own machine learning models.",
"next": "training",
"menu": {
"Pre-processing Text": "pre-processing",
"spaCy and Thinc": "thinc",
"TensorFlow / Keras": "tensorflow-keras",
"scikit-learn": "scikit-learn",
"PyTorch": "pytorch",
"DyNet": "dynet"
}
},
"text-classification": {
"title": "Text Classification",
"next": "training"
},
"training": {
"title": "Training spaCy's Statistical Models",
"next": "adding-languages",
"menu": {
"Basics": "basics",
"NER": "ner",
"Tagger & Parser": "tagger-parser",
"Similarity": "similarity",
"Text Classification": "textcat",
"Saving & Loading": "saving-loading"
}
},
"adding-languages": {
"title": "Adding Languages",
"teaser": "Adding full support for a language touches many different parts of the spaCy library. This guide explains how to fit everything together, and points you to the specific workflows for each component.",
"next": "training",
"menu": {
"Language data": "language-data",
"Testing": "testing",
"Training": "training"
}
},
"visualizers": {
"title": "Visualizers",
"next": "resources"
},
"resources": {
"title": "Resources",
"teaser": "Libraries, demos, books, courses and research systems featuring spaCy.",
"menu": {
"Third-party libraries": "libraries",
"Demos & Visualizations": "demos",
"Books & Courses": "books",
"Jupyter Notebooks": "notebooks",
"Research": "research"
}
},
"examples": {
"title": "Code Examples",
"teaser": "Full code examples you can modify and run.",
"next": "resources",
"menu": {
"Matching": "matching",
"Training": "training",
"Deep Learning": "deep-learning"
}
}
}

View File

@ -0,0 +1,11 @@
//- 💫 DOCS > USAGE > DEEP LEARNING > DYNET
+infobox
+infobox-logos(["dynet", 80, 34, "http://dynet.readthedocs.io/"])
| #[strong DyNet] is a dynamic neural network library, which can be much
| easier to work with for NLP. Outside of Google, there's a general shift
| among NLP researchers to both DyNet and PyTorch. You can use DyNet to
| create spaCy pipeline components, to add annotations to the #[code Doc]
| object.
+under-construction

View File

@ -0,0 +1,3 @@
//- 💫 DOCS > USAGE > DEEP LEARNING > PRE-PROCESSING
+under-construction

View File

@ -0,0 +1,91 @@
//- 💫 DOCS > USAGE > DEEP LEARNING > PYTORCH
+infobox
+infobox-logos(["pytorch", 100, 48, "http://pytorch.org"])
| #[strong PyTorch] is a dynamic neural network library, which can be much
| easier to work with for NLP. Outside of Google, there's a general shift
| among NLP researchers to both PyTorch and DyNet. spaCy is the front-end
| of choice for PyTorch's #[code torch.text] extension. You can use PyTorch
| to create spaCy pipeline components, to add annotations to the
| #[code Doc] object.
+under-construction
p
| Here's how a #[code begin_update] function that wraps an arbitrary
| PyTorch model would look:
+code.
class PytorchWrapper(thinc.neural.Model):
def __init__(self, pytorch_model):
self.pytorch_model = pytorch_model
def begin_update(self, x_data, drop=0.):
x_var = Variable(x_data)
# Make prediction
y_var = self.pytorch_model.forward(x_var)
def backward(dy_data, sgd=None):
dy_var = Variable(dy_data)
dx_var = torch.autograd.backward(x_var, dy_var)
return dx_var
return y_var.data, backward
p
| PyTorch requires data to be wrapped in a container, #[code Variable],
| that tracks the operations performed on the data. This "tape" of
| operations is then used by #[code torch.autograd.backward] to compute the
| gradient with respect to the input. For example, the following code
| constructs a PyTorch Linear layer that takes a vector of shape
| #[code (length, 2)], multiplies it by a #[code (2, 2)] matrix of weights,
| adds a #[code (2,)] bias, and returns the resulting #[code (length, 2)]
| vector:
+code("PyTorch Linear").
from torch import autograd
from torch import nn
import torch
import numpy
pt_model = nn.Linear(2, 2)
length = 5
input_data = numpy.ones((5, 2), dtype='f')
input_var = autograd.Variable(torch.Tensor(input_data))
output_var = pt_model(input_var)
output_data = output_var.data.numpy()
p
| Given target values we would like the output data to approximate, we can
| then "learn" values of the parameters within #[code pt_model], to give us
| output that's closer to our target. As a trivial example, let's make the
| linear layer compute the negative inverse of the input:
+code.
def get_target(input_data):
return -(1 / input_data)
p
| To update the PyTorch model, we create an optimizer and give it
| references to the model's parameters. We'll then randomly generate input
| data and get the target result we'd like the function to produce. We then
| compute the #[strong gradient of the error] between the current output
| and the target. Using the most popular definition of "error", this is
| simply the average difference:
+code.
from torch import optim
optimizer = optim.SGD(pt_model.parameters(), lr = 0.01)
for i in range(10):
input_data = numpy.random.uniform(-1., 1., (length, 2))
target = -(1 / input_data)
output_var = pt_model(autograd.Variable(torch.Tensor(input_data)))
output_data = output_var.data.numpy()
d_output_data = (output_data - target) / length
d_output_var = autograd.Variable(torch.Tensor(d_output_data))
d_input_var = torch.autograd.backward(output_var, d_output_var)
optimizer.step()

View File

@ -0,0 +1,15 @@
//- 💫 DOCS > USAGE > DEEP LEARNING > SCIKIT-LEARN
+infobox
+infobox-logos(["scikitlearn", 70, 34, "http://scikit-learn.org"])
| #[strong scikit-learn] features a number of useful NLP functions,
| especially for solving text classification problems using linear models
| with bag-of-words features. If you know you need exactly that, it might
| be better to use scikit-learn's built-in pipeline directly. However, if
| you want to extract more detailed features, using part-of-speech tags,
| named entity labels, or string transformations, you can use spaCy as a
| pre-process in your classification system. scikit-learn also provides a
| lot of experiment management and evaluation utilities that people use
| alongside spaCy.
+under-construction

View File

@ -0,0 +1,11 @@
//- 💫 DOCS > USAGE > DEEP LEARNING > TENSORFLOW / KERAS
+infobox
+infobox-logos(["tensorflow", 35, 42, "https://www.tensorflow.org"], ["keras", 45, 45, "https://www.keras.io"])
| #[strong TensorFlow / Keras] is the most popular deep learning library.
| spaCy provides efficient and powerful feature extraction functionality,
| that can be used as a pre-process to any deep learning library. You can
| also use TensorFlow and Keras to create spaCy pipeline components, to add
| annotations to the #[code Doc] object.
+under-construction

View File

@ -0,0 +1,66 @@
//- 💫 DOCS > USAGE > DEEP LEARNING > THINC
p
| #[+a(gh("thinc")) Thinc] is the machine learning library powering spaCy.
| It's a practical toolkit for implementing models that follow the
| #[+a("https://explosion.ai/blog/deep-learning-formula-nlp", true) "Embed, encode, attend, predict"]
| architecture. It's designed to be easy to install, efficient for CPU
| usage and optimised for NLP and deep learning with text — in particular,
| hierarchically structured input and variable-length sequences.
p
| spaCy's built-in pipeline components can all be powered by any object
| that follows Thinc's #[code Model] API. If a wrapper is not yet available
| for the library you're using, you should create a
| #[code thinc.neural.Model] subclass that implements a #[code begin_update]
| method. You'll also want to implement #[code to_bytes], #[code from_bytes],
| #[code to_disk] and #[code from_disk] methods, to save and load your
| model. Here's the template you'll need to fill in:
+code("Thinc Model API").
class ThincModel(thinc.neural.Model):
def __init__(self, *args, **kwargs):
pass
def begin_update(self, X, drop=0.):
def backprop(dY, sgd=None):
return dX
return Y, backprop
def to_disk(self, path, **exclude):
return None
def from_disk(self, path, **exclude):
return self
def to_bytes(self, **exclude):
return bytes
def from_bytes(self, msgpacked_bytes, **exclude):
return self
p
| The #[code begin_update] method should return a callback, that takes the
| gradient with respect to the output, and returns the gradient with
| respect to the input. It's usually convenient to implement the callback
| as a nested function, so you can refer to any intermediate variables from
| the forward computation in the enclosing scope.
+h(3, "how-thinc-works") How Thinc works
p
| Neural networks are all about composing small functions that we know how
| to differentiate into larger functions that we know how to differentiate.
| To differentiate a function efficiently, you usually need to store
| intermediate results, computed during the "forward pass", to reuse them
| during the backward pass. Most libraries require the data passed through
| the network to accumulate these intermediate results. This is the "tape"
| in tape-based differentiation.
p
| In Thinc, a model that computes #[code y = f(x)] is required to also
| return a callback that computes #[code dx = f&apos;(dy)]. The same
| intermediate state needs to be tracked, but this becomes an
| implementation detail for the model to take care of — usually, the
| callback is implemented as a closure, so the intermediate results can be
| read from the enclosing scope.

View File

@ -0,0 +1,45 @@
//- 💫 DOCS > USAGE > FACTS & FIGURES > BENCHMARKS > CHOI ET AL. (2015)
+table(["System", "Year", "Language", "Accuracy", "Speed (wps)"])
+row
+cell #[strong spaCy v2.x]
+cell 2017
+cell Python / Cython
+cell.u-text-right #[strong 92.6]
+cell.u-text-right #[em n/a]
| #[+help("This table shows speed as benchmarked by Choi et al. We therefore can't provide comparable figures, as we'd be running the benchmark on different hardware.").u-color-dark]
+row
+cell #[strong spaCy v1.x]
+cell 2015
+cell Python / Cython
+cell.u-text-right 91.8
+cell.u-text-right 13,963
+row
+cell ClearNLP
+cell 2015
+cell Java
+cell.u-text-right 91.7
+cell.u-text-right 10,271
+row
+cell CoreNLP
+cell 2015
+cell Java
+cell.u-text-right 89.6
+cell.u-text-right 8,602
+row
+cell MATE
+cell 2015
+cell Java
+cell.u-text-right 92.5
+cell.u-text-right 550
+row
+cell Turbo
+cell 2015
+cell C++
+cell.u-text-right 92.4
+cell.u-text-right 349

View File

@ -0,0 +1,48 @@
//- 💫 DOCS > USAGE > FACTS & FIGURES > BENCHMARKS > MODEL COMPARISON
p
| In this section, we provide benchmark accuracies for the pre-trained
| model pipelines we distribute with spaCy. Evaluations are conducted
| end-to-end from raw text, with no "gold standard" pre-processing, over
| text from a mix of genres where possible.
+under-construction
+aside("Methodology")
| The evaluation was conducted on raw text with no gold standard
| information. The parser, tagger and entity recognizer were trained on the
| #[+a("https://www.gabormelli.com/RKB/OntoNotes_Corpus") OntoNotes 5]
| corpus, the word vectors on #[+a("http://commoncrawl.org") Common Crawl].
+table(["Model", "spaCy", "Type", "UAS", "NER F", "POS", "WPS", "Size"])
+row
+cell #[+a("/models/en#en_core_web_sm") #[code en_core_web_sm]] 2.0.0a5
each data in ["2.x", "neural"]
+cell.u-text-right=data
+cell.u-text-right 91.4
+cell.u-text-right 85.5
+cell.u-text-right 97.0
+cell.u-text-right 8.2k
+cell.u-text-right #[strong 36 MB]
+row
+cell #[+a("/models/en#en_core_web_lg") #[code en_core_web_lg]] 2.0.0a0
each data in ["2.x", "neural"]
+cell.u-text-right=data
+cell.u-text-right #[strong 91.9]
+cell.u-text-right #[strong 86.4]
+cell.u-text-right #[strong 97.2]
+cell.u-text-right #[em n/a]
+cell.u-text-right 667 MB
+row("divider")
+cell #[code en_core_web_sm] 1.2.0
each data in ["1.x", "linear", 86.6, 78.5, 96.6]
+cell.u-text-right=data
+cell.u-text-right #[strong 25.7k]
+cell.u-text-right 50 MB
+row
+cell #[code en_core_web_md] 1.2.1
each data in ["1.x", "linear", 90.6, 81.4, 96.7, "18.8k", "1 GB"]
+cell.u-text-right=data

View File

@ -0,0 +1,206 @@
//- 💫 DOCS > USAGE > FACTS & FIGURES > BENCHMARKS
p
| Two peer-reviewed papers in 2015 confirm that spaCy offers the
| #[strong fastest syntactic parser in the world] and that
| #[strong its accuracy is within 1% of the best] available. The few
| systems that are more accurate are 20&times; slower or more.
+aside("About the evaluation")
| The first of the evaluations was published by #[strong Yahoo! Labs] and
| #[strong Emory University], as part of a survey of current parsing
| technologies #[+a("https://aclweb.org/anthology/P/P15/P15-1038.pdf") (Choi et al., 2015)].
| Their results and subsequent discussions helped us develop a novel
| psychologically-motivated technique to improve spaCy's accuracy, which
| we published in joint work with Macquarie University
| #[+a("https://aclweb.org/anthology/D/D15/D15-1162.pdf") (Honnibal and Johnson, 2015)].
include _benchmarks-choi-2015
+h(3, "algorithm") Algorithm comparison
p
| In this section, we compare spaCy's algorithms to recently published
| systems, using some of the most popular benchmarks. These benchmarks are
| designed to help isolate the contributions of specific algorithmic
| decisions, so they promote slightly "idealised" conditions. Specifically,
| the text comes pre-processed with "gold standard" token and sentence
| boundaries. The data sets also tend to be fairly small, to help
| researchers iterate quickly. These conditions mean the models trained on
| these data sets are not always useful for practical purposes.
+h(4, "parse-accuracy-penn") Parse accuracy (Penn Treebank / Wall Street Journal)
p
| This is the "classic" evaluation, so it's the number parsing researchers
| are most easily able to put in context. However, it's quite far removed
| from actual usage: it uses sentences with gold-standard segmentation and
| tokenization, from a pretty specific type of text (articles from a single
| newspaper, 1984-1989).
+aside("Methodology")
| #[+a("http://arxiv.org/abs/1603.06042") Andor et al. (2016)] chose
| slightly different experimental conditions from
| #[+a("https://aclweb.org/anthology/P/P15/P15-1038.pdf") Choi et al. (2015)],
| so the two accuracy tables here do not present directly comparable
| figures.
+table(["System", "Year", "Type", "Accuracy"])
+row
+cell spaCy v2.0.0
+cell 2017
+cell neural
+cell.u-text-right 94.48
+row
+cell spaCy v1.1.0
+cell 2016
+cell linear
+cell.u-text-right 92.80
+row("divider")
+cell
+a("https://arxiv.org/pdf/1611.01734.pdf") Dozat and Manning
+cell 2017
+cell neural
+cell.u-text-right #[strong 95.75]
+row
+cell
+a("http://arxiv.org/abs/1603.06042") Andor et al.
+cell 2016
+cell neural
+cell.u-text-right 94.44
+row
+cell
+a("https://github.com/tensorflow/models/tree/master/syntaxnet") SyntaxNet Parsey McParseface
+cell 2016
+cell neural
+cell.u-text-right 94.15
+row
+cell
+a("http://static.googleusercontent.com/media/research.google.com/en//pubs/archive/43800.pdf") Weiss et al.
+cell 2015
+cell neural
+cell.u-text-right 93.91
+row
+cell
+a("http://research.google.com/pubs/archive/38148.pdf") Zhang and McDonald
+cell 2014
+cell linear
+cell.u-text-right 93.32
+row
+cell
+a("http://www.cs.cmu.edu/~ark/TurboParser/") Martins et al.
+cell 2013
+cell linear
+cell.u-text-right 93.10
+h(4, "ner-accuracy-ontonotes5") NER accuracy (OntoNotes 5, no pre-process)
p
| This is the evaluation we use to tune spaCy's parameters and decide which
| algorithms are better than others. It's reasonably close to actual usage,
| because it requires the parses to be produced from raw text, without any
| pre-processing.
+table(["System", "Year", "Type", "Accuracy"])
+row
+cell spaCy #[+a("/models/en#en_core_web_lg") #[code en_core_web_lg]] v2.0.0
+cell 2017
+cell neural
+cell.u-text-right 86.45
+row("divider")
+cell
+a("https://arxiv.org/pdf/1702.02098.pdf") Strubell et al.
+cell 2017
+cell neural
+cell.u-text-right #[strong 86.81]
+row
+cell
+a("https://www.semanticscholar.org/paper/Named-Entity-Recognition-with-Bidirectional-LSTM-C-Chiu-Nichols/10a4db59e81d26b2e0e896d3186ef81b4458b93f") Chiu and Nichols
+cell 2016
+cell neural
+cell.u-text-right 86.19
+row
+cell
+a("https://www.semanticscholar.org/paper/A-Joint-Model-for-Entity-Analysis-Coreference-Typi-Durrett-Klein/28eb033eee5f51c5e5389cbb6b777779203a6778") Durrett and Klein
+cell 2014
+cell neural
+cell.u-text-right 84.04
+row
+cell
+a("http://www.aclweb.org/anthology/W09-1119") Ratinov and Roth
+cell 2009
+cell linear
+cell.u-text-right 83.45
+h(3, "spacy-models") Model comparison
include _benchmarks-models
+h(3, "speed-comparison") Detailed speed comparison
p
| Here we compare the per-document processing time of various spaCy
| functionalities against other NLP libraries. We show both absolute
| timings (in ms) and relative performance (normalized to spaCy). Lower is
| better.
+infobox("Important note", "⚠️")
| This evaluation was conducted in 2015. We're working on benchmarks on
| current CPU and GPU hardware.
+aside("Methodology")
| #[strong Set up:] 100,000 plain-text documents were streamed from an
| SQLite3 database, and processed with an NLP library, to one of three
| levels of detail — tokenization, tagging, or parsing. The tasks are
| additive: to parse the text you have to tokenize and tag it. The
| pre-processing was not subtracted from the times — we report the time
| required for the pipeline to complete. We report mean times per document,
| in milliseconds.#[br]#[br]
| #[strong Hardware]: Intel i7-3770 (2012)#[br]
| #[strong Implementation]: #[+src(gh("spacy-benchmarks")) #[code spacy-benchmarks]]
+table
+row.u-text-label.u-text-center
+head-cell
+head-cell(colspan="3") Absolute (ms per doc)
+head-cell(colspan="3") Relative (to spaCy)
+row
each column in ["System", "Tokenize", "Tag", "Parse", "Tokenize", "Tag", "Parse"]
+head-cell=column
+row
+cell #[strong spaCy]
each data in [ "0.2ms", "1ms", "19ms"]
+cell.u-text-right #[strong=data]
each data in ["1x", "1x", "1x"]
+cell.u-text-right=data
+row
+cell CoreNLP
each data in ["2ms", "10ms", "49ms", "10x", "10x", "2.6x"]
+cell.u-text-right=data
+row
+cell ZPar
each data in ["1ms", "8ms", "850ms", "5x", "8x", "44.7x"]
+cell.u-text-right=data
+row
+cell NLTK
each data in ["4ms", "443ms"]
+cell.u-text-right=data
+cell.u-text-right #[em n/a]
each data in ["20x", "443x"]
+cell.u-text-right=data
+cell.u-text-right #[em n/a]

View File

@ -0,0 +1,58 @@
//- 💫 DOCS > USAGE > FACTS & FIGURES > FEATURE COMPARISON
p
| Here's a quick comparison of the functionalities offered by spaCy,
| #[+a("https://github.com/tensorflow/models/tree/master/syntaxnet") SyntaxNet],
| #[+a("http://www.nltk.org/py-modindex.html") NLTK] and
| #[+a("http://stanfordnlp.github.io/CoreNLP/") CoreNLP].
+table(["", "spaCy", "SyntaxNet", "NLTK", "CoreNLP"])
+row
+cell Programming language
each lang in ["Python", "C++", "Python", "Java"]
+cell.u-text-small.u-text-center=lang
+row
+cell Neural network models
each icon in ["pro", "pro", "con", "pro"]
+cell.u-text-center #[+procon(icon)]
+row
+cell Integrated word vectors
each icon in ["pro", "con", "con", "con"]
+cell.u-text-center #[+procon(icon)]
+row
+cell Multi-language support
each icon in ["pro", "pro", "pro", "pro"]
+cell.u-text-center #[+procon(icon)]
+row
+cell Tokenization
each icon in ["pro", "pro", "pro", "pro"]
+cell.u-text-center #[+procon(icon)]
+row
+cell Part-of-speech tagging
each icon in ["pro", "pro", "pro", "pro"]
+cell.u-text-center #[+procon(icon)]
+row
+cell Sentence segmentation
each icon in ["pro", "pro", "pro", "pro"]
+cell.u-text-center #[+procon(icon)]
+row
+cell Dependency parsing
each icon in ["pro", "pro", "con", "pro"]
+cell.u-text-center #[+procon(icon)]
+row
+cell Entity recognition
each icon in ["pro", "con", "pro", "pro"]
+cell.u-text-center #[+procon(icon)]
+row
+cell Coreference resolution
each icon in ["con", "con", "con", "pro"]
+cell.u-text-center #[+procon(icon)]

View File

@ -0,0 +1,70 @@
//- 💫 DOCS > USAGE > FACTS & FIGURES > OTHER LIBRARIES
p
| Data scientists, researchers and machine learning engineers have
| converged on Python as the language for AI. This gives developers a rich
| ecosystem of NLP libraries to work with. Here's how we think the pieces
| fit together.
+aside("Using spaCy with other libraries")
| For details on how to use spaCy together with popular machine learning
| libraries like TensorFlow, Keras or PyTorch, see the
| #[+a("/usage/deep-learning") usage guide on deep learning].
+infobox
+infobox-logos(["nltk", 80, 25, "http://nltk.org"])
| #[+label-inline NLTK] offers some of the same functionality as spaCy.
| Although originally developed for teaching and research, its longevity
| and stability have resulted in a large number of industrial users. It's
| the main alternative to spaCy for tokenization and sentence segmentation.
| In comparison to spaCy, NLTK takes a much more "broad church" approach
| so it has some functions that spaCy doesn't provide, at the expense of a
| bit more clutter to sift through. spaCy is also much more
| performance-focussed than NLTK: where the two libraries provide the same
| functionality, spaCy's implementation will usually be faster and more
| accurate.
+infobox
+infobox-logos(["gensim", 40, 40, "https://radimrehurek.com/gensim/"])
| #[+label-inline Gensim] provides unsupervised text modelling algorithms.
| Although Gensim isn't a runtime dependency of spaCy, we use it to train
| word vectors. There's almost no overlap between the libraries — the two
| work together.
+infobox
+infobox-logos(["tensorflow", 35, 42, "https://www.tensorflow.org"], ["keras", 45, 45, "https://www.keras.io"])
| #[+label-inline TensorFlow / Keras] is the most popular deep learning library.
| spaCy provides efficient and powerful feature extraction functionality,
| that can be used as a pre-process to any deep learning library. You can
| also use TensorFlow and Keras to create spaCy pipeline components, to add
| annotations to the #[code Doc] object.
+infobox
+infobox-logos(["scikitlearn", 90, 44, "http://scikit-learn.org"])
| #[+label-inline scikit-learn] features a number of useful NLP functions,
| especially for solving text classification problems using linear models
| with bag-of-words features. If you know you need exactly that, it might
| be better to use scikit-learn's built-in pipeline directly. However, if
| you want to extract more detailed features, using part-of-speech tags,
| named entity labels, or string transformations, you can use spaCy as a
| pre-process in your classification system. scikit-learn also provides a
| lot of experiment management and evaluation utilities that people use
| alongside spaCy.
+infobox
+infobox-logos(["pytorch", 100, 48, "http://pytorch.org"], ["dynet", 80, 34, "http://dynet.readthedocs.io/"], ["chainer", 80, 43, "http://chainer.org"])
| #[+label-inline PyTorch, DyNet and Chainer] are dynamic neural network
| libraries, which can be much easier to work with for NLP. Outside of
| Google, there's a general shift among NLP researchers to both DyNet and
| PyTorch. spaCy is the front-end of choice for PyTorch's
| #[code torch.text] extension. You can use any of these libraries to
| create spaCy pipeline components, to add annotations to the #[code Doc]
| object.
+infobox
+infobox-logos(["allennlp", 124, 22, "http://allennlp.org"])
| #[+label-inline AllenNLP] is a new library designed to accelerate NLP
| research, by providing a framework that supports modern deep learning
| workflows for cutting-edge language understanding problems. AllenNLP uses
| spaCy as a preprocessing component. You can also use AllenNLP to develop
| spaCy pipeline components, to add annotations to the #[code Doc] object.

View File

@ -0,0 +1,31 @@
//- 💫 DOCS > USAGE > INSTALL > CHANGELOG
+h(2, "changelog") Changelog
+button(gh("spacy") + "/releases", false, "secondary", "small").u-float-right.u-nowrap View releases
div(data-tpl="changelog" data-tpl-key="error")
+infobox
| Unable to load changelog from GitHub. Please see the
| #[+a(gh("spacy") + "/releases") releases page] instead.
section(data-tpl="changelog" data-tpl-key="table" style="display: none")
+table(["Date", "Version", "Title"])
tbody(data-tpl="changelog" data-tpl-key="releases")
+row(data-tpl="changelog" data-tpl-key="item")
+cell.u-nowrap
+label(data-changelog="date")
+cell(data-changelog="tag")
+cell.u-text-small(data-changelog="title")
+h(3) Pre-releases
+aside("About pre-releases")
.o-block-small
| Pre-releases include alpha and beta versions, as well as release
| candidates. They are not intended for production use. You can
| download spaCy pre-releases via the #[code spacy-nightly] package
| on pip.
+badge("https://img.shields.io/pypi/v/spacy-nightly.svg?style=flat-square", "https://pypi.python.org/pypi/spacy-nightly")
+table(["Date", "Version", "Title"])
tbody(data-tpl="changelog" data-tpl-key="prereleases")

View File

@ -0,0 +1,185 @@
//- 💫 DOCS > USAGE > INSTALL > INSTRUCTIONS
+h(3, "pip") pip
+badge("https://img.shields.io/pypi/v/spacy.svg?style=flat-square", "https://pypi.python.org/pypi/spacy")
p Using pip, spaCy releases are currently only available as source packages.
+code(false, "bash").
pip install -U spacy
+aside("Download models")
| After installation you need to download a language model. For more info
| and available models, see the #[+a("/usage/models") docs on models].
+code.o-no-block.
spacy download en
&gt;&gt;&gt; import spacy
&gt;&gt;&gt; nlp = spacy.load('en')
p
| When using pip it is generally recommended to install packages in a
| #[code virtualenv] to avoid modifying system state:
+code(false, "bash").
virtualenv .env
source .env/bin/activate
pip install spacy
+h(3, "conda") conda
+badge("https://anaconda.org/conda-forge/spacy/badges/version.svg", "https://anaconda.org/conda-forge/spacy")
p
| Thanks to our great community, we've finally re-added conda support. You
| can now install spaCy via #[code conda-forge]:
+code(false, "bash").
conda config --add channels conda-forge
conda install spacy
p
| For the feedstock including the build recipe and configuration, check out
| #[+a("https://github.com/conda-forge/spacy-feedstock") this repository].
| Improvements and pull requests to the recipe and setup are always
| appreciated.
+h(3, "gpu") Run spaCy with GPU
p
| As of v2.0, spaCy comes with neural network models that are implemented
| in our machine learning library, #[+a(gh("thinc")) Thinc]. For GPU
| support, we've been grateful to use the work of
| #[+a("http://chainer.org") Chainer]'s CuPy module, which provides
| a NumPy-compatible interface for GPU arrays.
p
| First, follow the normal CUDA installation procedure. Next, set
| your environment variables so that the installation will be able to find
| CUDA. Finally, install spaCy.
+code(false, "bash").
export CUDA_HOME=/usr/local/cuda-8.0 # Or wherever your CUDA is
export PATH=$PATH:$CUDA_HOME/bin
pip install spacy
python -c "import thinc.neural.gpu_ops" # Check the GPU ops were built
+h(3, "source") Compile from source
p
| The other way to install spaCy is to clone its
| #[+a(gh("spaCy")) GitHub repository] and build it from source. That is
| the common way if you want to make changes to the code base. You'll need
| to make sure that you have a development environment consisting of a
| Python distribution including header files, a compiler,
| #[+a("https://pip.pypa.io/en/latest/installing/") pip],
| #[+a("https://virtualenv.pypa.io/") virtualenv] and
| #[+a("https://git-scm.com") git] installed. The compiler part is the
| trickiest. How to do that depends on your system. See notes on
| #[a(href="#source-ubuntu") Ubuntu], #[a(href="#source-osx") OS X] and
| #[a(href="#source-windows") Windows] for details.
+code(false, "bash").
# make sure you are using recent pip/virtualenv versions
python -m pip install -U pip virtualenv
git clone #{gh("spaCy")}
cd spaCy
virtualenv .env
source .env/bin/activate
pip install -r requirements.txt
pip install -e .
p
| Compared to regular install via pip,
| #[+a(gh("spaCy", "requirements.txt")) requirements.txt]
| additionally installs developer dependencies such as Cython.
p
| Instead of the above verbose commands, you can also use the following
| #[+a("http://www.fabfile.org/") Fabric] commands:
+table(["Command", "Description"])
+row
+cell #[code fab env]
+cell Create #[code virtualenv] and delete previous one, if it exists.
+row
+cell #[code fab make]
+cell Compile the source.
+row
+cell #[code fab clean]
+cell Remove compiled objects, including the generated C++.
+row
+cell #[code fab test]
+cell Run basic tests, aborting after first failure.
p
| All commands assume that your #[code virtualenv] is located in a
| directory #[code .env]. If you're using a different directory, you can
| change it via the environment variable #[code VENV_DIR], for example:
+code(false, "bash").
VENV_DIR=".custom-env" fab clean make
+h(4, "source-ubuntu") Ubuntu
p Install system-level dependencies via #[code apt-get]:
+code(false, "bash").
sudo apt-get install build-essential python-dev git
+h(4, "source-osx") macOS / OS X
p
| Install a recent version of
| #[+a("https://developer.apple.com/xcode/") XCode], including the
| so-called "Command Line Tools". macOS and OS X ship with Python and git
| preinstalled. To compile spaCy with multi-threading support on macOS / OS X,
| #[+a("https://github.com/explosion/spaCy/issues/267") see here].
+h(4, "source-windows") Windows
p
| Install a version of
| #[+a("https://www.visualstudio.com/vs/visual-studio-express/") Visual Studio Express]
| that matches the version that was used to compile your Python
| interpreter. For official distributions these are:
+table([ "Distribution", "Version"])
+row
+cell Python 2.7
+cell Visual Studio 2008
+row
+cell Python 3.4
+cell Visual Studio 2010
+row
+cell Python 3.5+
+cell Visual Studio 2015
+h(3, "tests") Run tests
p
| spaCy comes with an #[+a(gh("spacy", "spacy/tests")) extensive test suite].
| First, find out where spaCy is installed:
+code(false, "bash").
python -c "import os; import spacy; print(os.path.dirname(spacy.__file__))"
p
| Then run #[code pytest] on that directory. The flags #[code --slow] and
| #[code --models] are optional and enable additional tests.
+code(false, "bash").
# make sure you are using recent pytest version
python -m pip install -U pytest
python -m pytest &lt;spacy-directory&gt; # basic tests
python -m pytest &lt;spacy-directory&gt; --slow # basic and slow tests
python -m pytest &lt;spacy-directory&gt; --models --all # basic and all model tests
python -m pytest &lt;spacy-directory&gt; --models --en # basic and English model tests

View File

@ -0,0 +1,26 @@
//- 💫 DOCS > USAGE > INSTALL > QUICKSTART
- QUICKSTART[QUICKSTART.length - 1].options = Object.keys(MODELS).map(m => ({ id: m, title: LANGUAGES[m] }))
+quickstart(QUICKSTART, "Quickstart")
+qs({config: 'venv', python: 2}) python -m pip install -U virtualenv
+qs({config: 'venv', python: 3}) python -m pip install -U venv
+qs({config: 'venv', python: 2}) virtualenv .env
+qs({config: 'venv', python: 3}) python -m venv .env
+qs({config: 'venv', os: 'mac'}) source .env/bin/activate
+qs({config: 'venv', os: 'linux'}) source .env/bin/activate
+qs({config: 'venv', os: 'windows'}) .env\Scripts\activate
+qs({config: 'gpu', os: 'mac'}) export PATH=$PATH:/usr/local/cuda-8.0/bin
+qs({config: 'gpu', os: 'linux'}) export PATH=$PATH:/usr/local/cuda-8.0/bin
+qs({package: 'pip'}) pip install -U spacy
+qs({package: 'conda'}) conda install -c conda-forge spacy
+qs({package: 'source'}) git clone https://github.com/explosion/spaCy
+qs({package: 'source'}) cd spaCy
+qs({package: 'source'}) pip install -r requirements.txt
+qs({package: 'source'}) pip install -e .
for _, model in MODELS
+qs({model: model}) spacy download #{model}

View File

@ -0,0 +1,147 @@
//- 💫 DOCS > USAGE > INSTALL > TROUBLESHOOTING
p
| This section collects some of the most common errors you may come
| across when installing, loading and using spaCy, as well as their solutions.
+aside("Help us improve this guide")
| Did you come across a problem like the ones listed here and want to
| share the solution? You can find the "Suggest edits" button at the
| bottom of this page that points you to the source. We always
| appreciate #[+a(gh("spaCy") + "/pulls") pull requests]!
+h(3, "compatible-model") No compatible model found
+code(false, "text").
No compatible model found for [lang] (spaCy v#{SPACY_VERSION}).
p
| This usually means that the model you're trying to download does not
| exist, or isn't available for your version of spaCy. Check the
| #[+a(gh("spacy-models", "compatibility.json")) compatibility table]
| to see which models are available for your spaCy version. If you're using
| an old version, consider upgrading to the latest release. Note that while
| spaCy supports tokenization for
| #[+a("/usage/models/#languages") a variety of languages],
| not all of them come with statistical models. To only use the tokenizer,
| import the language's #[code Language] class instead, for example
| #[code from spacy.lang.fr import French].
+h(3, "symlink-privilege") Symbolic link privilege not held
+code(false, "text").
OSError: symbolic link privilege not held
p
| To create #[+a("/usage/models/#usage") shortcut links] that let you
| load models by name, spaCy creates a symbolic link in the
| #[code spacy/data] directory. This means your user needs permission to do
| this. The above error mostly occurs when doing a system-wide installation,
| which will create the symlinks in a system directory. Run the
| #[code download] or #[code link] command as administrator, or use a
| #[code virtualenv] to install spaCy in a user directory, instead
| of doing a system-wide installation.
+h(3, "no-cache-dir") No such option: --no-cache-dir
+code(false, "text").
no such option: --no-cache-dir
p
| The #[code download] command uses pip to install the models and sets the
| #[code --no-cache-dir] flag to prevent it from requiring too much memory.
| #[+a("https://pip.pypa.io/en/stable/reference/pip_install/#caching") This setting]
| requires pip v6.0 or newer. Run #[code pip install -U pip] to upgrade to
| the latest version of pip. To see which version you have installed,
| run #[code pip --version].
+h(3, "import-error") Import error
+code(false, "text").
ImportError: No module named spacy
p
| This error means that the spaCy module can't be located on your system, or in
| your environment. Make sure you have spaCy installed. If you're using a
| #[code virtualenv], make sure it's activated and check that spaCy is
| installed in that environment — otherwise, you're trying to load a system
| installation. You can also run #[code which python] to find out where
| your Python executable is located.
+h(3, "import-error-models") Import error: models
+code(false, "text").
ImportError: No module named 'en_core_web_sm'
p
| As of spaCy v1.7, all models can be installed as Python packages. This means
| that they'll become importable modules of your application. When creating
| #[+a("/usage/models/#usage") shortcut links], spaCy will also try
| to import the model to load its meta data. If this fails, it's usually a
| sign that the package is not installed in the current environment.
| Run #[code pip list] or #[code pip freeze] to check which model packages
| you have installed, and install the
| #[+a("/models") correct models] if necessary. If you're
| importing a model manually at the top of a file, make sure to use the name
| of the package, not the shortcut link you've created.
+h(3, "vocab-strings") File not found: vocab/strings.json
+code(false, "text").
FileNotFoundError: No such file or directory: [...]/vocab/strings.json
p
| This error may occur when using #[code spacy.load()] to load
| a language model either because you haven't set up a
| #[+a("/usage/models/#usage") shortcut link] for it, or because it
| doesn't actually exist. Set up a
| #[+a("/usage/models/#usage") shortcut link] for the model
| you want to load. This can either be an installed model package, or a
| local directory containing the model data. If you want to use one of the
| #[+a("/usage/models#languages") alpha tokenizers] for
| languages that don't yet have a statistical model, you should import its
| #[code Language] class instead, for example
| #[code from spacy.lang.bn import Bengali].
+h(3, "command-not-found") Command not found
+code(false, "text").
command not found: spacy
p
| This error may occur when running the #[code spacy] command from the
| command line. spaCy does not currently add an entry to your #[code PATH]
| environment variable, as this can lead to unexpected results, especially
| when using #[code virtualenv]. Instead, spaCy adds an auto-alias that
| maps #[code spacy] to #[code python -m spacy]. If this is not working as
| expected, run the command with #[code python -m] yourself —
| for example #[code python -m spacy download en]. For more info on this,
| see #[+api("cli#download") download].
+h(3, "module-load") 'module' object has no attribute 'load'
+code(false, "text").
AttributeError: 'module' object has no attribute 'load'
p
| While this could technically have many causes, including spaCy being
| broken, the most likely one is that your script's file or directory name
| is "shadowing" the module — e.g. your file is called #[code spacy.py],
| or a directory you're importing from is called #[code spacy]. So, when
| using spaCy, never call anything else #[code spacy].
+h(3, "pron-lemma") Pronoun lemma is returned as #[code -PRON-]
+code.
doc = nlp(u'They are')
print(doc[0].lemma_)
# -PRON-
p
| This is in fact expected behaviour and not a bug.
| Unlike verbs and common nouns, there's no clear base form of a personal
| pronoun. Should the lemma of "me" be "I", or should we normalize person
| as well, giving "it" — or maybe "he"? spaCy's solution is to introduce a
| novel symbol, #[code -PRON-], which is used as the lemma for
| all personal pronouns. For more info on this, see the
| #[+api("annotation#lemmatization") annotation specs] on lemmatization.

View File

@ -1,6 +1,4 @@
//- 💫 DOCS > USAGE > DEPENDENCY PARSE
include ../../_includes/_mixins
//- 💫 DOCS > USAGE > LINGUISTIC FEATURES > DEPENDENCY PARSE
p
| spaCy features a fast and accurate syntactic dependency parser, and has
@ -11,8 +9,7 @@ p
| boolean value. If this attribute is #[code False], the default sentence
| iterator will raise an exception.
+h(2, "noun-chunks") Noun chunks
+tag-model("dependency parse")
+h(3, "noun-chunks") Noun chunks
p
| Noun chunks are "base noun phrases" — flat phrases that have a noun as
@ -41,7 +38,7 @@ p
+annotation-row(["insurance liability", "liability", "dobj", "shift"], style)
+annotation-row(["manufacturers", "manufacturers", "pobj", "toward"], style)
+h(2, "navigating") Navigating the parse tree
+h(3, "navigating") Navigating the parse tree
p
| spaCy uses the terms #[strong head] and #[strong child] to describe the words
@ -110,7 +107,7 @@ p
| attribute, which provides a sequence of #[+api("token") #[code Token]]
| objects.
+h(3, "navigating-around") Iterating around the local tree
+h(4, "navigating-around") Iterating around the local tree
p
| A few more convenience attributes are provided for iterating around the
@ -135,7 +132,7 @@ p
| method.
+aside("Projective vs. non-projective")
| For the #[+a("/docs/usage/models#available") default English model], the
| For the #[+a("/models/en") default English model], the
| parse tree is #[strong projective], which means that there are no crossing
| brackets. The tokens returned by #[code .subtree] are therefore guaranteed
| to be contiguous. This is not true for the German model, which has many
@ -181,7 +178,7 @@ p
+annotation-row(["their", "ADJ", "poss", "requests"], style)
+annotation-row(["requests", "NOUN", "dobj", "submit"], style)
+h(2, "displacy") Visualizing dependencies
+h(3, "displacy") Visualizing dependencies
p
| The best way to understand spaCy's dependency parser is interactively.
@ -201,14 +198,14 @@ p
+infobox
| For more details and examples, see the
| #[+a("/docs/usage/visualizers") usage guide on visualizing spaCy]. You
| #[+a("/usage/visualizers") usage guide on visualizing spaCy]. You
| can also test displaCy in our #[+a(DEMOS_URL + "/displacy", true) online demo].
+h(2, "disabling") Disabling the parser
+h(3, "disabling") Disabling the parser
p
| In the #[+a("/docs/usage/models/available") default models], the parser
| is loaded and enabled as part of the
| In the #[+a("/models") default models], the parser is loaded and enabled
| as part of the
| #[+a("/usage/processing-pipelines") standard processing pipeline].
| If you don't need any of the syntactic information, you should disable
| the parser. Disabling the parser will make spaCy load and run much faster.
@ -225,7 +222,7 @@ p
| Since spaCy v2.0 comes with better support for customising the
| processing pipeline components, the #[code parser] keyword argument
| has been replaced with #[code disable], which takes a list of
| #[+a("/docs/usage/language-processing-pipeline") pipeline component names].
| #[+a("/usage/processing-pipelines") pipeline component names].
| This lets you disable both default and custom components when loading
| a model, or initialising a Language class via
| #[+api("language-from_disk") #[code from_disk]].

View File

@ -1,6 +1,4 @@
//- 💫 DOCS > USAGE > NAMED ENTITY RECOGNITION
include ../../_includes/_mixins
//- 💫 DOCS > USAGE > LINGUISTIC FEATURES > NAMED ENTITY RECOGNITION
p
| spaCy features an extremely fast statistical entity recognition system,
@ -9,12 +7,11 @@ p
| locations, organizations and products. You can add arbitrary classes to
| the entity recognition system, and update the model with new examples.
+h(2, "101") Named Entity Recognition 101
+tag-model("named entities")
+h(3, "101") Named Entity Recognition 101
include _spacy-101/_named-entities
include ../_spacy-101/_named-entities
+h(2, "accessing") Accessing entity annotations
+h(3, "accessing") Accessing entity annotations
p
| The standard way to access entity annotations is the
@ -62,7 +59,7 @@ p
+annotation-row(["delivery", 2, "O", '""', "outside an entity"], style)
+annotation-row(["robots", 2, "O", '""', "outside an entity"], style)
+h(2, "setting") Setting entity annotations
+h(3, "setting") Setting entity annotations
p
| To ensure that the sequence of token annotations remains consistent, you
@ -92,7 +89,7 @@ p
| but at the document level, the entity will have the start and end
| indices #[code (0, 7)].
+h(3, "setting-from-array") Setting entity annotations from array
+h(4, "setting-from-array") Setting entity annotations from array
p
| You can also assign entity annotations using the
@ -114,7 +111,7 @@ p
doc.from_array(header, attr_array)
assert list(doc.ents)[0].text == u'London'
+h(3, "setting-cython") Setting entity annotations in Cython
+h(4, "setting-cython") Setting entity annotations in Cython
p
| Finally, you can always write to the underlying struct, if you compile
@ -137,18 +134,16 @@ p
| you'll have responsibility for ensuring that the data is left in a
| consistent state.
+h(2, "entity-types") Built-in entity types
+h(3, "entity-types") Built-in entity types
+aside("Tip: Understanding entity types")
| You can also use #[code spacy.explain()] to get the description for the
| string representation of an entity label. For example,
| #[code spacy.explain("LANGUAGE")] will return "any named language".
include ../api/_annotation/_named-entities
include ../../api/_annotation/_named-entities
+h(2, "updating") Training and updating
+under-construction
+h(3, "updating") Training and updating
p
| To provide training examples to the entity recogniser, you'll first need
@ -166,65 +161,24 @@ p
+code.
doc = Doc(nlp.vocab, [u'rats', u'make', u'good', u'pets'])
gold = GoldParse(doc, [u'U-ANIMAL', u'O', u'O', u'O'])
gold = GoldParse(doc, entities=[u'U-ANIMAL', u'O', u'O', u'O'])
+infobox
| For more details on #[strong training and updating] the named entity
| recognizer, see the usage guides on #[+a("/docs/usage/training") training]
| and #[+a("/docs/usage/training-ner") training the named entity recognizer],
| recognizer, see the usage guides on #[+a("/usage/training") training]
| or check out the runnable
| #[+src(gh("spaCy", "examples/training/train_ner.py")) training script]
| on GitHub.
+h(3, "updating-biluo") The BILUO Scheme
+h(4, "updating-biluo") The BILUO Scheme
p
| You can also provide token-level entity annotation, using the
| following tagging scheme to describe the entity boundaries:
+table([ "Tag", "Description" ])
+row
+cell #[code #[span.u-color-theme B] EGIN]
+cell The first token of a multi-token entity.
include ../../api/_annotation/_biluo
+row
+cell #[code #[span.u-color-theme I] N]
+cell An inner token of a multi-token entity.
+row
+cell #[code #[span.u-color-theme L] AST]
+cell The final token of a multi-token entity.
+row
+cell #[code #[span.u-color-theme U] NIT]
+cell A single-token entity.
+row
+cell #[code #[span.u-color-theme O] UT]
+cell A non-entity token.
+aside("Why BILUO, not IOB?")
| There are several coding schemes for encoding entity annotations as
| token tags. These coding schemes are equally expressive, but not
| necessarily equally learnable.
| #[+a("http://www.aclweb.org/anthology/W09-1119") Ratinov and Roth]
| showed that the minimal #[strong Begin], #[strong In], #[strong Out]
| scheme was more difficult to learn than the #[strong BILUO] scheme that
| we use, which explicitly marks boundary tokens.
p
| spaCy translates the character offsets into this scheme, in order to
| decide the cost of each action given the current state of the entity
| recogniser. The costs are then used to calculate the gradient of the
| loss, to train the model. The exact algorithm is a pastiche of
| well-known methods, and is not currently described in any single
| publication. The model is a greedy transition-based parser guided by a
| linear model whose weights are learned using the averaged perceptron
| loss, via the #[+a("http://www.aclweb.org/anthology/C12-1059") dynamic oracle]
| imitation learning strategy. The transition system is equivalent to the
| BILOU tagging scheme.
+h(2, "displacy") Visualizing named entities
+h(3, "displacy") Visualizing named entities
p
| The #[+a(DEMOS_URL + "/displacy-ent/") displaCy #[sup ENT] visualizer]
@ -238,7 +192,7 @@ p
p
| For more details and examples, see the
| #[+a("/docs/usage/visualizers") usage guide on visualizing spaCy].
| #[+a("/usage/visualizers") usage guide on visualizing spaCy].
+code("Named Entity example").
import spacy

View File

@ -1,20 +1,10 @@
//- 💫 DOCS > USAGE > PART-OF-SPEECH TAGGING
//- 💫 DOCS > USAGE > LINGUISTIC FEATURES > PART-OF-SPEECH TAGGING
include ../../_includes/_mixins
include ../_spacy-101/_pos-deps
p
| Part-of-speech tags are labels like noun, verb, adjective etc that are
| assigned to each token in the document. They're useful in rule-based
| processes. They can also be useful features in some statistical models.
//-+aside("Help spaCy's output is wrong!")
+h(2, "101") Part-of-speech tagging 101
+tag-model("tagger", "dependency parse")
include _spacy-101/_pos-deps
+aside("Help spaCy's output is wrong!")
+h(2, "rule-based-morphology") Rule-based morphology
+h(3, "rule-based-morphology") Rule-based morphology
p
| Inflectional morphology is the process by which a root form of a word is
@ -54,7 +44,7 @@ p
+list("numbers")
+item
| The tokenizer consults a
| #[+a("/docs/usage/adding-languages#tokenizer-exceptions") mapping table]
| #[+a("/usage/adding-languages#tokenizer-exceptions") mapping table]
| #[code TOKENIZER_EXCEPTIONS], which allows sequences of characters
| to be mapped to multiple tokens. Each token may be assigned a part
| of speech and one or more morphological features.
@ -68,7 +58,7 @@ p
+item
| For words whose POS is not set by a prior process, a
| #[+a("/docs/usage/adding-languages#tag-map") mapping table]
| #[+a("/usage/adding-languages#tag-map") mapping table]
| #[code TAG_MAP] maps the tags to a part-of-speech and a set of
| morphological features.
@ -80,6 +70,4 @@ p
| list-based exception files, acquired from
| #[+a("https://wordnet.princeton.edu/") WordNet].
+h(2, "pos-schemes") Part-of-speech tag schemes
include ../api/_annotation/_pos-tags
include ../../api/_annotation/_pos-tags

View File

@ -1,19 +1,18 @@
//- 💫 DOCS > USAGE > RULE-BASED MATCHING
include ../../_includes/_mixins
p
| spaCy features a rule-matching engine that operates over tokens, similar
| spaCy features a rule-matching engine, the #[+api("matcher") #[code Matcher]],
| that operates over tokens, similar
| to regular expressions. The rules can refer to token annotations (e.g.
| the token #[code text] or #[code tag_]), and flags (e.g. #[code IS_PUNCT]).
| The rule matcher also lets you pass in a custom callback
| to act on matches — for example, to merge entities and apply custom labels.
| You can also associate patterns with entity IDs, to allow some basic
| entity linking or disambiguation.
| entity linking or disambiguation. To match large terminology lists,
| you can use the #[+api("phrasematcher") #[code PhraseMatcher]], which
| accepts #[code Doc] objects as match patterns.
//-+aside("What about \"real\" regular expressions?")
+h(2, "adding-patterns") Adding patterns
+h(3, "adding-patterns") Adding patterns
p
| Let's say we want to enable spaCy to find a combination of three tokens:
@ -76,7 +75,7 @@ p
| other pattern types. You shouldn't have to create different matchers for
| each of those processes.
+h(2, "on_match") Adding #[code on_match] rules
+h(3, "on_match") Adding #[code on_match] rules
p
| To move on to a more realistic example, let's say you're working with a
@ -142,7 +141,7 @@ p
options={'ents': ['EVENT']})
| For more info and examples, see the usage guide on
| #[+a("/docs/usage/visualizers") visualizing spaCy].
| #[+a("/usage/visualizers") visualizing spaCy].
p
| We can now call the matcher on our documents. The patterns will be
@ -184,7 +183,7 @@ p
| A list of #[code (match_id, start, end)] tuples, describing the
| matches. A match tuple describes a span #[code doc[start:end]].
+h(2, "quantifiers") Using operators and quantifiers
+h(3, "quantifiers") Using operators and quantifiers
p
| The matcher also lets you use quantifiers, specified as the #[code 'OP']
@ -221,7 +220,7 @@ p
+cell match 0 or 1 times
+cell optional, max one
+h(2, "example1") Example: Using linguistic annotations
+h(3, "example1") Example: Using linguistic annotations
p
| Let's say you're analysing user comments and you want to find out what
@ -246,13 +245,13 @@ p
p
| To get a quick overview of the results, you could collect all sentences
| containing a match and render them with the
| #[+a("/docs/usage/visualizers") displaCy visualizer].
| #[+a("/usage/visualizers") displaCy visualizer].
| In the callback function, you'll have access to the #[code start] and
| #[code end] of each match, as well as the parent #[code Doc]. This lets
| you determine the sentence containing the match,
| #[code doc[start : end].sent], and calculate the start and end of the
| matched span within the sentence. Using displaCy in
| #[+a("/docs/usage/visualizers#manual-usage") "manual" mode] lets you
| #[+a("/usage/visualizers#manual-usage") "manual" mode] lets you
| pass in a list of dictionaries containing the text and entities to render.
+code.
@ -283,7 +282,7 @@ p
# set manual=True to make displaCy render straight from a dictionary
displacy.serve(matched_sents, style='ent', manual=True)
+h(2, "example2") Example: Phone numbers
+h(3, "example2") Example: Phone numbers
p
| Phone numbers can have many different formats and matching them is often
@ -321,7 +320,7 @@ p
| extend, and doesn't require any training data – only a set of
| test cases.
+h(2, "example3") Example: Hashtags and emoji on social media
+h(3, "example3") Example: Hashtags and emoji on social media
p
| Social media posts, especially tweets, can be difficult to work with.

View File

@ -1,6 +1,4 @@
//- 💫 DOCS > USAGE > TOKENIZER
include ../../_includes/_mixins
//- 💫 DOCS > USAGE > LINGUISTIC FEATURES > TOKENIZATION
p
| Tokenization is the task of splitting a text into meaningful segments,
@ -11,15 +9,14 @@ p
| #[code spaces] booleans, which allow you to maintain alignment of the
| tokens into the original string.
+h(2, "101") Tokenizer 101
include ../_spacy-101/_tokenization
include _spacy-101/_tokenization
+h(3, "101-data") Tokenizer data
+h(4, "101-data") Tokenizer data
p
| #[strong Global] and #[strong language-specific] tokenizer data is
| supplied via the language data in #[+src(gh("spaCy", "spacy/lang")) spacy/lang].
| supplied via the language data in
| #[+src(gh("spaCy", "spacy/lang")) #[code spacy/lang]].
| The tokenizer exceptions define special cases like "don't" in English,
| which needs to be split into two tokens: #[code {ORTH: "do"}] and
| #[code {ORTH: "n't", LEMMA: "not"}]. The prefixes, suffixes and infixes
@ -27,16 +24,14 @@ p
| (at the end of a sentence), and when to leave tokens containing periods
| intact (abbreviations like "U.S.").
+image
include ../../assets/img/docs/language_data.svg
.u-text-right
+button("/assets/img/docs/language_data.svg", false, "secondary").u-text-tag View large graphic
+graphic("/assets/img/language_data.svg")
include ../../assets/img/language_data.svg
+infobox
| For more details on the language-specific data, see the
| usage guide on #[+a("/docs/usage/adding-languages") adding languages].
| usage guide on #[+a("/usage/adding-languages") adding languages].
+h(2, "special-cases") Adding special case tokenization rules
+h(3, "special-cases") Adding special case tokenization rules
p
| Most domains have at least some idiosyncrasies that require custom
@ -46,7 +41,7 @@ p
+aside("Language data vs. custom tokenization")
| Tokenization rules that are specific to one language, but can be
| #[strong generalised across that language] should ideally live in the
| language data in #[+src(gh("spaCy", "spacy/lang")) spacy/lang]  we
| language data in #[+src(gh("spaCy", "spacy/lang")) #[code spacy/lang]] – we
| always appreciate pull requests! Anything that's specific to a domain or
| text type – like financial trading abbreviations, or Bavarian youth slang –
| should be added as a special case rule to your tokenizer instance. If
@ -69,9 +64,12 @@ p
special_case = [{ORTH: u'gim', LEMMA: u'give', POS: u'VERB'}, {ORTH: u'me'}]
nlp.tokenizer.add_special_case(u'gimme', special_case)
assert [w.text for w in nlp(u'gimme that')] == [u'gim', u'me', u'that']
assert [w.lemma_ for w in nlp(u'gimme that')] == [u'give', u'me', u'that']
# Pronoun lemma is returned as -PRON-!
assert [w.lemma_ for w in nlp(u'gimme that')] == [u'give', u'-PRON-', u'that']
p
| For details on spaCy's custom pronoun lemma #[code -PRON-],
| #[+a("/usage/#pron-lemma") see here].
| The special case doesn't have to match an entire whitespace-delimited
| substring. The tokenizer will incrementally split off punctuation, and
| keep looking up the remaining substring:
@ -97,7 +95,7 @@ p
| #[+api("language") #[code Language]] class itself.
+h(2, "how-tokenizer-works") How spaCy's tokenizer works
+h(3, "how-tokenizer-works") How spaCy's tokenizer works
p
| spaCy introduces a novel tokenization algorithm, that gives a better
@ -113,8 +111,8 @@ p
| algorithm in Python, optimized for readability rather than performance:
+code.
def tokenizer_pseudo_code(text, find_prefix, find_suffix,
find_infixes, special_cases):
def tokenizer_pseudo_code(text, special_cases,
find_prefix, find_suffix, find_infixes):
tokens = []
for substring in text.split(' '):
suffixes = []
@ -162,11 +160,11 @@ p
| like hyphens etc.
+item Once we can't consume any more of the string, handle it as a single token.
+h(2, "native-tokenizers") Customizing spaCy's Tokenizer class
+h(3, "native-tokenizers") Customizing spaCy's Tokenizer class
p
| Let's imagine you wanted to create a tokenizer for a new language or
| specific domain. There are four things you would need to define:
| specific domain. There are five things you would need to define:
+list("numbers")
+item
@ -188,6 +186,11 @@ p
| A function #[code infixes_finditer], to handle non-whitespace
| separators, such as hyphens etc.
+item
| An optional boolean function #[code token_match] matching strings
| that should never be split, overriding the previous rules.
| Useful for things like URLs or numbers.
p
| You shouldn't usually need to create a #[code Tokenizer] subclass.
| Standard usage is to use #[code re.compile()] to build a regular
@ -200,10 +203,14 @@ p
prefix_re = re.compile(r'''[\[\(&quot;&apos;]''')
suffix_re = re.compile(r'''[\]\)&quot;&apos;]''')
infix_re = re.compile(r'''[-~]''')
simple_url_re = re.compile(r'''^https?://''')
def custom_tokenizer(nlp):
return Tokenizer(nlp.vocab, prefix_search=prefix_re.search,
suffix_search=suffix_re.search)
suffix_search=suffix_re.search,
infix_finditer=infix_re.finditer,
token_match=simple_url_re.match)
nlp = spacy.load('en')
nlp.tokenizer = custom_tokenizer(nlp)
@ -213,7 +220,7 @@ p
| specialize are #[code find_prefix], #[code find_suffix] and
| #[code find_infix].
+h(2, "custom-tokenizer") Hooking an arbitrary tokenizer into the pipeline
+h(3, "custom-tokenizer") Hooking an arbitrary tokenizer into the pipeline
p
| The tokenizer is the first component of the processing pipeline and the
@ -222,11 +229,8 @@ p
| it takes a text and returns a #[code Doc], whereas all other components
| expect to already receive a tokenized #[code Doc].
+image
include ../../assets/img/docs/pipeline.svg
.u-text-right
+button("/assets/img/docs/pipeline.svg", false, "secondary").u-text-tag View large graphic
+graphic("/assets/img/pipeline.svg")
include ../../assets/img/pipeline.svg
p
| To overwrite the existing tokenizer, you need to replace
@ -243,7 +247,7 @@ p
+cell unicode
+cell The raw text to tokenize.
+footrow
+row("foot")
+cell returns
+cell #[code Doc]
+cell The tokenized document.
@ -295,3 +299,36 @@ p
+code.
nlp = spacy.load('en')
nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)
+h(3, "own-annotations") Bringing your own annotations
p
| spaCy generally assumes by default that your data is raw text. However,
| sometimes your data is partially annotated, e.g. with pre-existing
| tokenization, part-of-speech tags, etc. The most common situation is
| that you have pre-defined tokenization. If you have a list of strings,
| you can create a #[code Doc] object directly. Optionally, you can also
| specify a list of boolean values, indicating whether each word has a
| subsequent space.
+code.
doc = Doc(nlp.vocab, words=[u'Hello', u',', u'world', u'!'], spaces=[False, True, False, False])
p
| If provided, the spaces list must be the same length as the words list.
| The spaces list affects the #[code doc.text], #[code span.text],
| #[code token.idx], #[code span.start_char] and #[code span.end_char]
| attributes. If you don't provide a #[code spaces] sequence, spaCy will
| assume that all words are whitespace delimited.
+code.
good_spaces = Doc(nlp.vocab, words=[u'Hello', u',', u'world', u'!'], spaces=[False, True, False, False])
bad_spaces = Doc(nlp.vocab, words=[u'Hello', u',', u'world', u'!'])
assert bad_spaces.text == u'Hello , world !'
assert good_spaces.text == u'Hello, world!'
p
| Once you have a #[+api("doc") #[code Doc]] object, you can write to its
| attributes to set the part-of-speech tags, syntactic dependencies, named
| entities and other attributes. For details, see the respective usage
| pages.

View File

@ -0,0 +1,22 @@
//- 💫 DOCS > USAGE > MODELS > AVAILABLE MODELS
p
| Model differences are mostly statistical. In general, we do expect larger
| models to be "better" and more accurate overall. Ultimately, it depends on
| your use case and requirements, and we recommend starting with the default
| models (marked with a star below). For a more detailed overview, see the
| #[+a("/models") models directory].
+table(["Name", "Language", "Type"])
for models, lang in MODELS
for model, i in models
- var comps = getModelComponents(model)
+row
+cell #[+a("/models/" + lang + "#" + model) #[code=model]]
if i == 0
+icon("star", 16).o-icon--inline.u-color-theme
+cell #{LANGUAGES[comps.lang]}
+cell #{MODEL_META[comps.type]}
.u-text-right
+button("/models", true, "primary", "small") View models directory

View File

@ -0,0 +1,33 @@
//- 💫 DOCS > USAGE > MODELS > INSTALLATION BASICS
p
| The easiest way to download a model is via spaCy's
| #[+api("cli#download") #[code download]] command. It takes care of
| finding the best-matching model compatible with your spaCy installation.
- var models = Object.keys(MODELS).map(function(lang) { return "spacy download " + lang })
+code(false, "bash").
# out-of-the-box: download best-matching default model
#{Object.keys(MODELS).map(function(l) {return "spacy download " + l}).join('\n')}
# download best-matching version of specific model for your spaCy installation
spacy download en_core_web_sm
# download exact model version (doesn't create shortcut link)
spacy download en_core_web_sm-2.0.0 --direct
p
| The download command will #[+a("/usage/models#download-pip") install the model] via
| pip, place the package in your #[code site-packages] directory and create
| a #[+a("/usage/models#usage") shortcut link] that lets you load the model by a custom
| name. The shortcut link will be the same as the model name used in
| #[code spacy download].
+code(false, "bash").
pip install spacy
spacy download en
+code.
import spacy
nlp = spacy.load('en')
doc = nlp(u'This is a sentence.')

View File

@ -1,38 +1,4 @@
//- 💫 DOCS > USAGE > MODELS
include ../../_includes/_mixins
p
| As of v1.7.0, models for spaCy can be installed as #[strong Python packages].
| This means that they're a component of your application, just like any
| other module. They're versioned and can be defined as a dependency in your
| #[code requirements.txt]. Models can be installed from a download URL or
| a local directory, manually or via #[+a("https://pypi.python.org/pypi/pip") pip].
| Their data can be located anywhere on your file system.
+aside("Important note")
| If you're upgrading to spaCy v1.7.x or v2.x, you need to
| #[strong download the new models]. If you've trained statistical models
| that use spaCy's annotations, you should #[strong retrain your models]
| after updating spaCy. If you don't retrain, you may suffer train/test
| skew, which might decrease your accuracy.
+quickstart(QUICKSTART_MODELS, "Quickstart", "Install a default model, get the code to load it from within spaCy and an example to test it. For more options, see the section on available models below.")
for models, lang in MODELS
- var package = (models.length == 1) ? models[0] : models.find(function(m) { return m.def })
+qs({lang: lang}) spacy download #{lang}
+qs({lang: lang}, "divider")
+qs({lang: lang, load: "module"}, "python") import #{package.id}
+qs({lang: lang, load: "module"}, "python") nlp = #{package.id}.load()
+qs({lang: lang, load: "spacy"}, "python") nlp = spacy.load('#{lang}')
+qs({lang: lang, config: "example"}, "python") doc = nlp(u"#{EXAMPLE_SENTENCES[lang]}")
+qs({lang: lang, config: "example"}, "python") print([(w.text, w.pos_) for w in doc])
+h(2, "available") Available models
include _models-list
+h(2, "download") Downloading models
//- 💫 DOCS > USAGE > MODELS > INSTALLATION
+aside("Downloading models in spaCy < v1.7")
| In older versions of spaCy, you can still use the old download commands.
@ -47,37 +13,8 @@ include _models-list
| The old models are also #[+a(gh("spacy") + "/tree/v1.6.0") attached to the v1.6.0 release].
| To download and install them manually, unpack the archive, drop the
| contained directory into #[code spacy/data].
p
| The easiest way to download a model is via spaCy's
| #[+api("cli#download") #[code download]] command. It takes care of
| finding the best-matching model compatible with your spaCy installation.
- var models = Object.keys(MODELS).map(function(lang) { return "spacy download " + lang })
+code(false, "bash").
# out-of-the-box: download best-matching default model
#{Object.keys(MODELS).map(function(l) {return "spacy download " + l}).join('\n')}
# download best-matching version of specific model for your spaCy installation
spacy download en_core_web_md
# download exact model version (doesn't create shortcut link)
spacy download en_core_web_md-1.2.0 --direct
p
| The download command will #[+a("#download-pip") install the model] via
| pip, place the package in your #[code site-packages] directory and create
| a #[+a("#usage") shortcut link] that lets you load the model by a custom
| name. The shortcut link will be the same as the model name used in
| #[code spacy.download].
+code(false, "bash").
pip install spacy
spacy download en
+code.
import spacy
nlp = spacy.load('en')
doc = nlp(u'This is a sentence.')
include _install-basics
+h(3, "download-pip") Installation via pip
@ -107,8 +44,8 @@ p
+infobox
| You can also add the direct download link to your application's
| #[code requirements.txt]. For more details,
| see the usage guide on
| #[+a("/docs/usage/production-use#models") working with models in production].
| see the section on
| #[+a("/models/#production") working with models in production].
+h(3, "download-manual") Manual download and installation
@ -135,7 +72,7 @@ p
| local file system. To use it with spaCy, simply assign it a name by
| creating a #[+a("#usage") shortcut link] for the data directory.
+h(2, "usage") Using models with spaCy
+h(3, "usage") Using models with spaCy
p
| To load a model, use #[+api("spacy#load") #[code spacy.load()]] with the
@ -201,7 +138,7 @@ p
| privileges, the #[code spacy link] command may fail. The easiest solution
| is to re-run the command as admin, or use a #[code virtualenv]. For more
| info on this, see the
| #[+a("/docs/usage/#symlink-privilege") troubleshooting guide].
| #[+a("/usage/#symlink-privilege") troubleshooting guide].
+h(3, "usage-import") Importing models as modules
@ -227,15 +164,15 @@ p
| #[code spacy.load()].
+infobox
| For more details, see the usage guide on
| #[+a("/docs/usage/production-use#models") working with models in production].
| For more details, see the section on
| #[+a("/models/#production") working with models in production].
+h(2, "own-models") Using your own models
+h(3, "own-models") Using your own models
p
| If you've trained your own model, for example for
| #[+a("/docs/usage/adding-languages") additional languages] or
| #[+a("/docs/usage/train-ner") custom named entities], you can save its
| #[+a("/usage/adding-languages") additional languages] or
| #[+a("/usage/training#ner") custom named entities], you can save its
| state using the #[+api("language#to_disk") #[code Language.to_disk()]]
| method. To make the model more convenient to deploy, we recommend
| wrapping it as a Python package.
@ -243,4 +180,4 @@ p
+infobox("Saving and loading models")
| For more information and a detailed guide on how to package your model,
| see the documentation on
| #[+a("/docs/usage/saving-loading#models") saving and loading models].
| #[+a("/usage/training#saving-loading") saving and loading models].

View File

@ -0,0 +1,81 @@
//- 💫 DOCS > USAGE > MODELS > PRODUCTION USE
p
| If your application depends on one or more models,
| you'll usually want to integrate them into your continuous integration
| workflow and build process. While spaCy provides a range of useful helpers
| for downloading, linking and loading models, the underlying functionality
| is entirely based on native Python packages. This allows your application
| to handle a model like any other package dependency.
+infobox("Training models for production")
| For an example of an automated model training and build process, see
| #[+a("/usage/training#example-training-spacy") this example] of how
| we're training and packaging our models for spaCy.
+h(3, "models-download") Downloading and requiring model dependencies
p
| spaCy's built-in #[+api("cli#download") #[code download]] command
| is mostly intended as a convenient, interactive wrapper. It performs
| compatibility checks and prints detailed error messages and warnings.
| However, if you're downloading models as part of an automated build
| process, this only adds an unnecessary layer of complexity. If you know
| which models your application needs, you should be specifying them directly.
p
| Because all models are valid Python packages, you can add them to your
| application's #[code requirements.txt]. If you're running your own
| internal PyPi installation, you can simply upload the models there. pip's
| #[+a("https://pip.pypa.io/en/latest/reference/pip_install/#requirements-file-format") requirements file format]
| supports both package names to download via a PyPi server, as well as direct
| URLs.
+code("requirements.txt", "text").
spacy&gt;=2.0.0,&lt;3.0.0
-e #{gh("spacy-models")}/releases/download/en_core_web_sm-2.0.0/en_core_web_sm-2.0.0.tar.gz#egg=en_core_web_sm
p
| Specifying #[code #egg=] with the package name tells pip
| which package to expect from the download URL. This way, the
| package won't be re-downloaded and overwritten if it's already
| installed - just like when you're downloading a package from PyPi.
p
| All models are versioned and specify their spaCy dependency. This ensures
| cross-compatibility and lets you specify exact version requirements for
| each model. If you've trained your own model, you can use the
| #[+api("cli#package") #[code package]] command to generate the required
| meta data and turn it into a loadable package.
+h(3, "models-loading") Loading and testing models
p
| Downloading models directly via pip won't call spaCy's
| #[+api("cli#link") #[code link]] command, which creates
| symlinks for model shortcuts. This means that you'll have to run this
| command separately, or use the native #[code import] syntax to load the
| models:
+code.
import en_core_web_sm
nlp = en_core_web_sm.load()
p
| In general, this approach is recommended for larger code bases, as it's
| more "native", and doesn't depend on symlinks or rely on spaCy's loader
| to resolve string names to model packages. If a model can't be
| imported, Python will raise an #[code ImportError] immediately. And if a
| model is imported but not used, any linter will catch that.
p
| Similarly, it'll give you more flexibility when writing tests that
| require loading models. For example, instead of writing your own
| #[code try] and #[code except] logic around spaCy's loader, you can use
| #[+a("http://pytest.readthedocs.io/en/latest/") pytest]'s
| #[+a("https://docs.pytest.org/en/latest/builtin.html#_pytest.outcomes.importorskip") #[code importorskip()]]
| method to only run a test if a specific model or model version is
| installed. Each model package exposes a #[code __version__] attribute
| which you can also use to perform your own version compatibility checks
| before loading a model.

View File

@ -0,0 +1,17 @@
//- 💫 DOCS > USAGE > MODELS > QUICKSTART
- QUICKSTART_MODELS[0].options = Object.keys(MODELS).map(m => ({ id: m, title: LANGUAGES[m], checked: m == 'en'}))
+quickstart(QUICKSTART_MODELS, "Quickstart", "Install a default model, get the code to load it from within spaCy and an example to test it. For more options, see the section on available models below.")
for models, lang in MODELS
- var package = models[0]
+qs({lang: lang}) spacy download #{lang}
+qs({lang: lang}, "divider")
+qs({lang: lang, load: "module"}, "python") import #{package}
+qs({lang: lang, load: "module"}, "python") nlp = #{package}.load()
+qs({lang: lang, load: "spacy"}, "python") nlp = spacy.load('#{lang}')
+qs({lang: lang, config: "example"}, "python") doc = nlp(u"#{EXAMPLE_SENTENCES[lang]}")
if lang != "xx"
+qs({lang: lang, config: "example"}, "python") print([(w.text, w.pos_) for w in doc])
else
+qs({lang: lang, config: "example"}, "python") print([(ent.text, ent.label_) for ent in doc.ents])

View File

@ -0,0 +1,126 @@
//- 💫 DOCS > USAGE > PROCESSING PIPELINES > EXAMPLES
p
| To see real-world examples of pipeline factories and components in action,
| you can have a look at the source of spaCy's built-in components, e.g.
| the #[+api("tagger") #[code Tagger]], #[+api("parser") #[code Parser]] or
| #[+api("entityrecognizer") #[code EntityRecognizer]].
+h(3, "example1") Example: Custom sentence segmentation logic
p
| Let's say you want to implement custom logic to improve spaCy's sentence
| boundary detection. Currently, sentence segmentation is based on the
| dependency parse, which doesn't always produce ideal results. The custom
| logic should therefore be applied #[strong after] tokenization, but
| #[strong before] the dependency parsing – this way, the parser can also
| take advantage of the sentence boundaries.
+code.
def sbd_component(doc):
for i, token in enumerate(doc[:-2]):
# define sentence start if period + titlecase token
if token.text == '.' and doc[i+1].is_title:
doc[i+1].sent_start = True
return doc
p
| In this case, we simply want to add the component to the existing
| pipeline of the English model. We can do this by inserting it at index 0
| of #[code nlp.pipeline]:
+code.
nlp = spacy.load('en')
nlp.pipeline.insert(0, sbd_component)
p
| When you call #[code nlp] on some text, spaCy will tokenize it to create
| a #[code Doc] object, and first call #[code sbd_component] on it, followed
| by the model's default pipeline.
+h(3, "example2") Example: Sentiment model
p
| Let's say you have trained your own document sentiment model on English
| text. After tokenization, you want spaCy to first execute the
| #[strong default tensorizer], followed by a custom
| #[strong sentiment component] that adds a #[code .sentiment]
| property to the #[code Doc], containing your model's sentiment prediction.
p
| Your component class will have a #[code from_disk()] method that spaCy
| calls to load the model data. When called, the component will compute
| the sentiment score, add it to the #[code Doc] and return the modified
| document. Optionally, the component can include an #[code update()] method
| to allow training the model.
+code.
import pickle
from pathlib import Path
class SentimentComponent(object):
def __init__(self, vocab):
self.weights = None
def __call__(self, doc):
doc.sentiment = sum(self.weights*doc.vector) # set sentiment property
return doc
def from_disk(self, path): # path = model path + factory ID ('sentiment')
self.weights = pickle.load((Path(path) / 'weights.bin').open('rb')) # load weights
return self
def update(self, doc, gold): # update weights allows training!
prediction = sum(self.weights*doc.vector)
self.weights -= 0.001*doc.vector*(prediction-gold.sentiment)
p
| The factory will initialise the component with the #[code Vocab] object.
| To be able to add it to your model's pipeline as #[code 'sentiment'],
| it also needs to be registered via
| #[+api("spacy#set_factory") #[code set_factory()]].
+code.
def sentiment_factory(vocab):
component = SentimentComponent(vocab) # initialise component
return component
spacy.set_factory('sentiment', sentiment_factory)
p
| The above code should be #[strong shipped with your model]. You can use
| the #[+api("cli#package") #[code package]] command to create all required
| files and directories. The model package will include an
| #[+src(gh("spacy-dev-resources", "templates/model/en_model_name/__init__.py")) #[code __init__.py]]
| with a #[code load()] method, that will initialise the language class with
| the model's pipeline and call the #[code from_disk()] method to load
| the model data.
p
| In the model package's meta.json, specify the language class and pipeline
| IDs:
+code("meta.json (excerpt)", "json").
{
"name": "sentiment_model",
"lang": "en",
"version": "1.0.0",
"spacy_version": "&gt;=2.0.0,&lt;3.0.0",
"pipeline": ["tensorizer", "sentiment"]
}
p
| When you load your new model, spaCy will call the model's #[code load()]
| method. This will return a #[code Language] object with a pipeline
| containing the default tensorizer, and the sentiment component returned
| by your custom #[code "sentiment"] factory.
+code.
nlp = spacy.load('en_sentiment_model')
doc = nlp(u'I love pizza')
assert doc.sentiment
+infobox("Saving and loading models")
| For more information and a detailed guide on how to package your model,
| see the documentation on
| #[+a("/usage/training#saving-loading") saving and loading models].

View File

@ -0,0 +1,40 @@
//- 💫 DOCS > USAGE > PROCESSING PIPELINES > MULTI-THREADING
p
| If you have a sequence of documents to process, you should use the
| #[+api("language#pipe") #[code Language.pipe()]] method. The method takes
| an iterator of texts, and accumulates an internal buffer,
| which it works on in parallel. It then yields the documents in order,
| one-by-one. After a long and bitter struggle, the global interpreter
| lock was freed around spaCy's main parsing loop in v0.100.3. This means
| that #[code .pipe()] will be significantly faster in most
| practical situations, because it allows shared memory parallelism.
+code.
for doc in nlp.pipe(texts, batch_size=10000, n_threads=3):
pass
p
| To make full use of the #[code .pipe()] function, you might want to
| brush up on #[strong Python generators]. Here are a few quick hints:
+list
+item
| Generator comprehensions can be written as
| #[code (item for item in sequence)].
+item
| The
| #[+a("https://docs.python.org/2/library/itertools.html") #[code itertools] built-in library]
| and the
| #[+a("https://github.com/pytoolz/cytoolz") #[code cytoolz] package]
| provide a lot of handy #[strong generator tools].
+item
| Often you'll have an input stream that pairs text with some
| important meta data, e.g. a JSON document. To
| #[strong pair up the meta data] with the processed #[code Doc]
| object, you should use the #[code itertools.tee] function to split
| the generator in two, and then #[code izip] the extra stream to the
| document stream. Here's
| #[+a(gh("spacy") + "/issues/172#issuecomment-183963403") an example].

View File

@ -1,12 +1,4 @@
//- 💫 DOCS > USAGE > PIPELINE
include ../../_includes/_mixins
+h(2, "101") Pipelines 101
include _spacy-101/_pipelines
+h(2, "pipelines") How pipelines work
//- 💫 DOCS > USAGE > PROCESSING PIPELINES > PIPELINES
p
| spaCy makes it very easy to create your own pipelines consisting of
@ -15,11 +7,11 @@ p
| functions. A pipeline component can be added to an already existing
| #[code nlp] object, specified when initialising a #[code Language] class,
| or defined within a
| #[+a("/docs/usage/saving-loading#models-generating") model package].
| #[+a("/usage/training#models-generating") model package].
p
| When you load a model, spaCy first consults the model's
| #[+a("/docs/usage/saving-loading#models-generating") meta.json]. The
| #[+a("/usage/training#models-generating") meta.json]. The
| meta typically includes the model details, the ID of a language class,
| and an optional list of pipeline components. spaCy then does the
| following:
@ -29,7 +21,7 @@ p
"name": "example_model",
"lang": "en",
"description": "Example model for spaCy",
"pipeline": ["token_vectors", "tagger"]
"pipeline": ["tensorizer", "tagger"]
}
+list("numbers")
@ -56,24 +48,50 @@ p
p
| ... the model tells spaCy to use the pipeline
| #[code ["tensorizer", "tagger", "parser", "ner"]]. spaCy will then look
| up each string in its internal factories registry and initialise the
| individual components. It'll then load #[code spacy.lang.en.English],
| pass it the path to the model's data directory, and return it for you
| to use as the #[code nlp] object.
| #[code.u-break ["tensorizer", "tagger", "parser", "ner"]]. spaCy will
| then look up each string in its internal factories registry and
| initialise the individual components. It'll then load
| #[code spacy.lang.en.English], pass it the path to the model's data
| directory, and return it for you to use as the #[code nlp] object.
p
| Fundamentally, a #[+a("/models") spaCy model] consists of three
| components: #[strong the weights], i.e. binary data loaded in from a
| directory, a #[strong pipeline] of functions called in order,
| and #[strong language data] like the tokenization rules and annotation
| scheme. All of this is specific to each model, and defined in the
| model's #[code meta.json] – for example, a Spanish NER model requires
| different weights, language data and pipeline components than an English
| parsing and tagging model. This is also why the pipeline state is always
| held by the #[code Language] class.
| #[+api("spacy#load") #[code spacy.load]] puts this all together and
| returns an instance of #[code Language] with a pipeline set and access
| to the binary data:
+code("spacy.load under the hood").
lang = 'en'
pipeline = ['tensorizer', 'tagger', 'parser', 'ner']
data_path = 'path/to/en_core_web_sm/en_core_web_sm-2.0.0'
cls = spacy.util.get_lang_class(lang) # 1. get Language class, e.g. English
nlp = cls(pipeline=pipeline) # 2. initialise it with the pipeline
nlp.from_disk(data_path) # 3. load in the binary data
p
| When you call #[code nlp] on a text, spaCy will #[strong tokenize] it and
| then #[strong call each component] on the #[code Doc], in order.
| Components all return the modified document, which is then processed by
| the component next in the pipeline.
| Since the model data is loaded, the components can access it to assign
| annotations to the #[code Doc] object, and subsequently to the
| #[code Token] and #[code Span] which are only views of the #[code Doc],
| and don't own any data themselves. All components return the modified
| document, which is then processed by the component next in the pipeline.
+code("The pipeline under the hood").
doc = nlp.make_doc(u'This is a sentence')
for proc in nlp.pipeline:
doc = proc(doc)
+h(2, "creating") Creating pipeline components and factories
+h(3, "creating") Creating pipeline components and factories
p
| spaCy lets you customise the pipeline with your own components. Components
@ -82,7 +100,7 @@ p
| pipeline. You can do that by defining and registering a factory which
| receives the shared #[code Vocab] object and returns a component.
+h(3, "creating-component") Creating a component
+h(4, "creating-component") Creating a component
p
| A component receives a #[code Doc] object and
@ -103,7 +121,7 @@ p
+cell #[code Doc]
+cell The #[code Doc] object processed by the previous component.
+footrow
+row("foot")
+cell returns
+cell #[code Doc]
+cell The #[code Doc] object processed by this pipeline component.
@ -123,7 +141,7 @@ p
nlp = spacy.load('en')
nlp.pipeline.append(my_component)
+h(3, "creating-factory") Creating a factory
+h(4, "creating-factory") Creating a factory
p
| A factory is a #[strong function that returns a pipeline component].
@ -149,7 +167,7 @@ p
| Shared data between components, including strings, morphology,
| vectors etc.
+footrow
+row("foot")
+cell returns
+cell callable
+cell The pipeline component.
@ -171,148 +189,22 @@ p
| by looking it up in the available factories. The factory will then be
| initialised with the #[code Vocab]. Providing factory names instead of
| callables also makes it easy to specify them in the model's
| #[+a("/docs/usage/saving-loading#models-generating") meta.json]. If you're
| #[+a("/usage/saving-loading#models-generating") meta.json]. If you're
| training your own model and want to use one of spaCy's default components,
| you won't have to worry about finding and implementing it either – to use
| the default tagger, simply add #[code "tagger"] to the pipeline, and
| #[strong spaCy will know what to do].
+infobox("Important note")
| Because factories are #[strong resolved on initialisation] of the
| #[code Language] class, it's #[strong not possible] to add them to the
| pipeline afterwards, e.g. by modifying #[code nlp.pipeline]. This only
| works with individual component functions. To use factories, you need to
| create a new #[code Language] object, or generate a
| #[+a("/docs/usage/saving-loading#models-generating") model package] with
| #[+a("/usage/training#models-generating") model package] with
| a custom pipeline.
+aside("Real-world examples")
| To see real-world examples of pipeline factories and components in action,
| you can have a look at the source of spaCy's built-in components, e.g.
| the #[+api("tagger") #[code Tagger]], #[+api("parser") #[code Parser]] or
| #[+api("entityrecognizer") #[code EntityRecognizer]].
+h(2, "example1") Example: Custom sentence segmentation logic
p
| Let's say you want to implement custom logic to improve spaCy's sentence
| boundary detection. Currently, sentence segmentation is based on the
| dependency parse, which doesn't always produce ideal results. The custom
| logic should therefore be applied #[strong after] tokenization, but
| #[strong before] the dependency parsing – this way, the parser can also
| take advantage of the sentence boundaries.
+code.
def sbd_component(doc):
for i, token in enumerate(doc[:-2]):
# define sentence start if period + titlecase token
if token.text == '.' and doc[i+1].is_title:
doc[i+1].sent_start = True
return doc
p
| In this case, we simply want to add the component to the existing
| pipeline of the English model. We can do this by inserting it at index 0
| of #[code nlp.pipeline]:
+code.
nlp = spacy.load('en')
nlp.pipeline.insert(0, sbd_component)
p
| When you call #[code nlp] on some text, spaCy will tokenize it to create
| a #[code Doc] object, and first call #[code sbd_component] on it, followed
| by the model's default pipeline.
+h(2, "example2") Example: Sentiment model
p
| Let's say you have trained your own document sentiment model on English
| text. After tokenization, you want spaCy to first execute the
| #[strong default tensorizer], followed by a custom
| #[strong sentiment component] that adds a #[code .sentiment]
| property to the #[code Doc], containing your model's sentiment prediction.
p
| Your component class will have a #[code from_disk()] method that spaCy
| calls to load the model data. When called, the component will compute
| the sentiment score, add it to the #[code Doc] and return the modified
| document. Optionally, the component can include an #[code update()] method
| to allow training the model.
+code.
import pickle
from pathlib import Path
class SentimentComponent(object):
def __init__(self, vocab):
self.weights = None
def __call__(self, doc):
doc.sentiment = sum(self.weights*doc.vector) # set sentiment property
return doc
def from_disk(self, path): # path = model path + factory ID ('sentiment')
self.weights = pickle.load((Path(path) / 'weights.bin').open('rb')) # load weights
return self
def update(self, doc, gold): # update weights – allows training!
prediction = sum(self.weights*doc.vector)
self.weights -= 0.001*doc.vector*(prediction-gold.sentiment)
p
| The factory will initialise the component with the #[code Vocab] object.
| To be able to add it to your model's pipeline as #[code 'sentiment'],
| it also needs to be registered via
| #[+api("spacy#set_factory") #[code set_factory()]].
+code.
def sentiment_factory(vocab):
component = SentimentComponent(vocab) # initialise component
return component
spacy.set_factory('sentiment', sentiment_factory)
p
| The above code should be #[strong shipped with your model]. You can use
| the #[+api("cli#package") #[code package]] command to create all required
| files and directories. The model package will include an
| #[+src(gh("spacy-dev-resources", "templates/model/en_model_name/__init__.py")) __init__.py]
| with a #[code load()] method, that will initialise the language class with
| the model's pipeline and call the #[code from_disk()] method to load
| the model data.
p
| In the model package's meta.json, specify the language class and pipeline
| IDs:
+code("meta.json (excerpt)", "json").
{
"name": "sentiment_model",
"lang": "en",
"version": "1.0.0",
"spacy_version": "&gt;=2.0.0,&lt;3.0.0",
"pipeline": ["tensorizer", "sentiment"]
}
p
| When you load your new model, spaCy will call the model's #[code load()]
| method. This will return a #[code Language] object with a pipeline
| containing the default tensorizer, and the sentiment component returned
| by your custom #[code "sentiment"] factory.
+code.
nlp = spacy.load('en_sentiment_model')
doc = nlp(u'I love pizza')
assert doc.sentiment
+infobox("Saving and loading models")
| For more information and a detailed guide on how to package your model,
| see the documentation on
| #[+a("/docs/usage/saving-loading#models") saving and loading models].
+h(2, "disabling") Disabling pipeline components
+h(3, "disabling") Disabling pipeline components
p
| If you don't need a particular component of the pipeline for

View File

@ -0,0 +1,38 @@
//- 💫 DOCS > USAGE > PROCESSING PIPELINES > SERIALIZATION
include ../_spacy-101/_serialization
+infobox("Important note")
| In spaCy v2.0, the API for saving and loading has changed to only use the
| four methods listed above consistently across objects and classes. For an
| overview of the changes, see #[+a("/usage/v2#incompat") this table]
| and the notes on #[+a("/usage/v2#migrating-saving-loading") migrating].
+h(3, "example-doc") Example: Saving and loading a document
p
| For simplicity, let's assume you've
| #[+a("/usage/entity-recognition#setting") added custom entities] to
| a #[code Doc], either manually, or by using a
| #[+a("/usage/rule-based-matching#on_match") match pattern]. You can
| save it locally by calling #[+api("doc#to_disk") #[code Doc.to_disk()]],
| and load it again via #[+api("doc#from_disk") #[code Doc.from_disk()]].
| This will overwrite the existing object and return it.
+code.
import spacy
from spacy.tokens import Span
text = u'Netflix is hiring a new VP of global policy'
nlp = spacy.load('en')
doc = nlp(text)
assert len(doc.ents) == 0 # Doc has no entities
doc.ents += (Span(doc, 0, 1, label=doc.vocab.strings[u'ORG']),) # add entity
doc.to_disk('/path/to/doc') # save Doc to disk
new_doc = nlp(text)
assert len(new_doc.ents) == 0 # new Doc has no entities
new_doc = new_doc.from_disk('/path/to/doc') # load from disk and overwrite
assert len(new_doc.ents) == 1 # entity is now recognised!
assert [(ent.text, ent.label_) for ent in new_doc.ents] == [(u'Netflix', u'ORG')]

View File

@ -0,0 +1,61 @@
//- 💫 DOCS > USAGE > PROCESSING PIPELINES > ATTRIBUTE HOOKS
p
| Hooks let you customize some of the behaviours of the #[code Doc],
| #[code Span] or #[code Token] objects by adding a component to the
| pipeline. For instance, to customize the
| #[+api("doc#similarity") #[code Doc.similarity]] method, you can add a
| component that sets a custom function to
| #[code doc.user_hooks['similarity']]. The built-in #[code Doc.similarity]
| method will check the #[code user_hooks] dict, and delegate to your
| function if you've set one. Similar results can be achieved by setting
| functions to #[code Doc.user_span_hooks] and #[code Doc.user_token_hooks].
+code("Polymorphic similarity example").
span.similarity(doc)
token.similarity(span)
doc1.similarity(doc2)
p
| By default, this just averages the vectors for each document, and
| computes their cosine. Obviously, spaCy should make it easy for you to
| install your own similarity model. This introduces a tricky design
| challenge. The current solution is to add three more dicts to the
| #[code Doc] object:
+aside("Implementation note")
| The hooks live on the #[code Doc] object because the #[code Span] and
| #[code Token] objects are created lazily, and don't own any data. They
| just proxy to their parent #[code Doc]. This turns out to be convenient
| here — we only have to worry about installing hooks in one place.
+table(["Name", "Description"])
+row
+cell #[code user_hooks]
+cell Customise behaviour of #[code doc.vector], #[code doc.has_vector], #[code doc.vector_norm] or #[code doc.sents]
+row
+cell #[code user_token_hooks]
+cell Customise behaviour of #[code token.similarity], #[code token.vector], #[code token.has_vector], #[code token.vector_norm] or #[code token.conjuncts]
+row
+cell #[code user_span_hooks]
+cell Customise behaviour of #[code span.similarity], #[code span.vector], #[code span.has_vector], #[code span.vector_norm] or #[code span.root]
p
| To sum up, here's an example of hooking in custom #[code .similarity()]
| methods:
+code("Add custom similarity hooks").
class SimilarityModel(object):
def __init__(self, model):
self._model = model
def __call__(self, doc):
doc.user_hooks['similarity'] = self.similarity
doc.user_span_hooks['similarity'] = self.similarity
doc.user_token_hooks['similarity'] = self.similarity
return doc
def similarity(self, obj1, obj2):
y = self._model([obj1.vector, obj2.vector])
return float(y[0])

View File

@ -20,18 +20,12 @@ p
| returning an #[strong annotated document]. It also orchestrates training
| and serialization.
+image
include ../../../assets/img/docs/architecture.svg
.u-text-right
+button("/assets/img/docs/architecture.svg", false, "secondary").u-text-tag View large graphic
+graphic("/assets/img/architecture.svg")
include ../../assets/img/architecture.svg
+h(3, "architecture-containers") Container objects
+table(["Name", "Description"])
+row
+cell #[+api("language") #[code Language]]
+cell
| A text-processing pipeline. Usually you'll load this once per
| process as #[code nlp] and pass the instance around your application.
+row
+cell #[+api("doc") #[code Doc]]
+cell A container for accessing linguistic annotations.
@ -53,43 +47,25 @@ p
| opposed to a word token. It therefore has no part-of-speech tag,
| dependency parse etc.
+row
+cell #[+api("vocab") #[code Vocab]]
+cell
| A lookup table for the vocabulary that allows you to access
| #[code Lexeme] objects.
+row
+cell #[code Morphology]
+cell
| Assign linguistic features like lemmas, noun case, verb tense etc.
| based on the word and its part-of-speech tag.
+row
+cell #[+api("stringstore") #[code StringStore]]
+cell Map strings to and from hash values.
+row
+cell #[+api("tokenizer") #[code Tokenizer]]
+cell
| Segment text, and create #[code Doc] objects with the discovered
| segment boundaries.
+row
+cell #[code Lemmatizer]
+cell
| Determine the base forms of words.
+row
+cell #[+api("matcher") #[code Matcher]]
+cell
| Match sequences of tokens, based on pattern rules, similar to
| regular expressions.
+h(3, "architecture-pipeline") Pipeline components
+h(3, "architecture-pipeline") Processing pipeline
+table(["Name", "Description"])
+row
+cell #[+api("language") #[code Language]]
+cell
| A text-processing pipeline. Usually you'll load this once per
| process as #[code nlp] and pass the instance around your application.
+row
+cell #[+api("pipe") #[code Pipe]]
+cell Base class for processing pipeline components.
+row
+cell #[+api("tensorizer") #[code Tensorizer]]
+cell
| Add tensors with position-sensitive meaning representations to
| #[code Doc] objects.
+row
+cell #[+api("tagger") #[code Tagger]]
+cell Annotate part-of-speech tags on #[code Doc] objects.
@ -104,16 +80,54 @@ p
| Annotate named entities, e.g. persons or products, on #[code Doc]
| objects.
+row
+cell #[+api("textcategorizer") #[code TextCategorizer]]
+cell Assign categories or labels to #[code Doc] objects.
+row
+cell #[+api("tokenizer") #[code Tokenizer]]
+cell
| Segment text, and create #[code Doc] objects with the discovered
| segment boundaries.
+row
+cell #[+api("lemmatizer") #[code Lemmatizer]]
+cell
| Determine the base forms of words.
+row
+cell #[code Morphology]
+cell
| Assign linguistic features like lemmas, noun case, verb tense etc.
| based on the word and its part-of-speech tag.
+row
+cell #[+api("matcher") #[code Matcher]]
+cell
| Match sequences of tokens, based on pattern rules, similar to
| regular expressions.
+row
+cell #[+api("phrasematcher") #[code PhraseMatcher]]
+cell Match sequences of tokens based on phrases.
+h(3, "architecture-other") Other classes
+table(["Name", "Description"])
+row
+cell #[+api("vectors") #[code Vectors]]
+cell Container class for vector data keyed by string.
+cell #[+api("vocab") #[code Vocab]]
+cell
| A lookup table for the vocabulary that allows you to access
| #[code Lexeme] objects.
+row
+cell #[+api("binder") #[code Binder]]
+cell Container class for serializing collections of #[code Doc] objects.
+cell #[+api("stringstore") #[code StringStore]]
+cell Map strings to and from hash values.
+row
+cell #[+api("vectors") #[code Vectors]]
+cell Container class for vector data keyed by string.
+row
+cell #[+api("goldparse") #[code GoldParse]]
@ -124,3 +138,7 @@ p
+cell
| An annotated corpus, using the JSON file format. Manages
| annotations for tagging, dependency parsing and NER.
+row
+cell #[+api("binder") #[code Binder]]
+cell Container class for serializing collections of #[code Doc] objects.

View File

@ -0,0 +1,141 @@
//- 💫 DOCS > USAGE > SPACY 101 > COMMUNITY & FAQ
p
| We're very happy to see the spaCy community grow and include a mix of
| people from all kinds of different backgrounds – computational
| linguistics, data science, deep learning, research and more. If you'd
| like to get involved, below are some answers to the most important
| questions and resources for further reading.
+h(3, "faq-help-code") Help, my code isn't working!
p
| Bugs suck, and we're doing our best to continuously improve the tests
| and fix bugs as soon as possible. Before you submit an issue, do a
| quick search and check if the problem has already been reported. If
| you're having installation or loading problems, make sure to also check
| out the #[+a("/usage/#troubleshooting") troubleshooting guide]. Help
| with spaCy is available via the following platforms:
+aside("How do I know if something is a bug?")
| Of course, it's always hard to know for sure, so don't worry – we're not
| going to be mad if a bug report turns out to be a typo in your
| code. As a simple rule, any C-level error without a Python traceback,
| like a #[strong segmentation fault] or #[strong memory error],
| is #[strong always] a spaCy bug.#[br]#[br]
| Because models are statistical, their performance will never be
| #[em perfect]. However, if you come across
| #[strong patterns that might indicate an underlying issue], please do
| file a report. Similarly, we also care about behaviours that
| #[strong contradict our docs].
+table(["Platform", "Purpose"])
+row
+cell #[+a("https://stackoverflow.com/questions/tagged/spacy") StackOverflow]
+cell
| #[strong Usage questions] and everything related to problems with
| your specific code. The StackOverflow community is much larger
| than ours, so if your problem can be solved by others, you'll
| receive help much quicker.
+row
+cell #[+a("https://gitter.im/" + SOCIAL.gitter) Gitter chat]
+cell
| #[strong General discussion] about spaCy, meeting other community
| members and exchanging #[strong tips, tricks and best practices].
| If we're working on experimental models and features, we usually
| share them on Gitter first.
+row
+cell #[+a(gh("spaCy") + "/issues") GitHub issue tracker]
+cell
| #[strong Bug reports] and #[strong improvement suggestions], i.e.
| everything that's likely spaCy's fault. This also includes
| problems with the models beyond statistical imprecisions, like
| patterns that point to a bug.
+infobox
| Please understand that we won't be able to provide individual support via
| email. We also believe that help is much more valuable if it's shared
| publicly, so that #[strong more people can benefit from it]. If you come
| across an issue and you think you might be able to help, consider posting
| a quick update with your solution. No matter how simple, it can easily
| save someone a lot of time and headache – and the next time you need help,
| they might repay the favour.
+h(3, "faq-contributing") How can I contribute to spaCy?
p
| You don't have to be an NLP expert or Python pro to contribute, and we're
| happy to help you get started. If you're new to spaCy, a good place to
| start is the
| #[+a(gh("spaCy") + '/issues?q=is%3Aissue+is%3Aopen+label%3A"help+wanted+%28easy%29"') #[code help wanted (easy)] label]
| on GitHub, which we use to tag bugs and feature requests that are easy
| and self-contained. We also appreciate contributions to the docs – whether
| it's fixing a typo, improving an example or adding additional explanations.
| You'll find a "Suggest edits" link at the bottom of each page that points
| you to the source.
p
| Another way of getting involved is to help us improve the
| #[+a("/usage/adding-languages#language-data") language data] –
| especially if you happen to speak one of the languages currently in
| #[+a("/usage/models#languages") alpha support]. Even
| adding simple tokenizer exceptions, stop words or lemmatizer data
| can make a big difference. It will also make it easier for us to provide
| a statistical model for the language in the future. Submitting a test
| that documents a bug or performance issue, or covers functionality that's
| especially important for your application is also very helpful. This way,
| you'll also make sure we never accidentally introduce regressions to the
| parts of the library that you care about the most.
p
strong
| For more details on the types of contributions we're looking for, the
| code conventions and other useful tips, make sure to check out the
| #[+a(gh("spaCy", "CONTRIBUTING.md")) contributing guidelines].
+infobox("Code of Conduct")
| spaCy adheres to the
| #[+a("http://contributor-covenant.org/version/1/4/") Contributor Covenant Code of Conduct].
| By participating, you are expected to uphold this code.
+h(3, "faq-project-with-spacy")
| I've built something cool with spaCy – how can I get the word out?
p
| First, congrats – we'd love to check it out! When you share your
| project on Twitter, don't forget to tag
| #[+a("https://twitter.com/" + SOCIAL.twitter) @#{SOCIAL.twitter}] so we
| don't miss it. If you think your project would be a good fit for the
| #[+a("/usage/resources") resources], #[strong feel free to submit it!]
| Tutorials are also incredibly valuable to other users and a great way to
| get exposure. So we strongly encourage #[strong writing up your experiences],
| or sharing your code and some tips and tricks on your blog. Since our
| website is open-source, you can add your project or tutorial by making a
| pull request on GitHub.
+aside("Contributing to spacy.io")
| All showcase and tutorial links are stored in a
| #[+a(gh("spaCy", "website/usage/_data.json")) JSON file], so you
| won't even have to edit any markup. For more info on how to submit
| your project, see the
| #[+a(gh("spaCy", "CONTRIBUTING.md#submitting-a-project-to-the-showcase")) contributing guidelines]
| and our #[+a(gh("spaCy", "website")) website docs].
p
| If you would like to use the spaCy logo on your site, please get in touch
| and ask us first. However, if you want to show support and tell others
| that your project is using spaCy, you can grab one of our
| #[strong spaCy badges] here:
- SPACY_BADGES = ["built%20with-spaCy-09a3d5.svg", "made%20with%20❤%20and-spaCy-09a3d5.svg", "spaCy-v2-09a3d5.svg"]
+quickstart([{id: "badge", input_style: "check", options: SPACY_BADGES.map(function(badge, i) { return {id: i, title: "<img class='o-icon' src='https://img.shields.io/badge/" + badge + "' height='20'/>", checked: (i == 0) ? true : false}}) }], false, false, true)
.c-code-block(data-qs-results)
for badge, i in SPACY_BADGES
- var url = "https://img.shields.io/badge/" + badge
+code(false, "text", false, false, "star").o-no-block(data-qs-badge=i)=url
+code(false, "text", false, false, "code").o-no-block(data-qs-badge=i).
&lt;a href="#{SITE_URL}"&gt;&lt;img src="#{url}" height="20"&gt;&lt;/a&gt;
+code(false, "text", false, false, "markdown").o-no-block(data-qs-badge=i).
[![spaCy](#{url})](#{SITE_URL})

View File

@ -5,7 +5,7 @@ p
| #[strong exceptions and special cases], especially amongst the most
| common words. Some of these exceptions are shared across languages, while
| others are #[strong entirely specific] – usually so specific that they need
| to be hard-coded. The #[+src(gh("spaCy", "spacy/lang")) lang] module
| to be hard-coded. The #[+src(gh("spaCy", "spacy/lang")) #[code lang]] module
| contains all language-specific data, organised in simple Python files.
| This makes the data easy to update and extend.
@ -27,15 +27,13 @@ p
nlp_en = English() # includes English data
nlp_de = German() # includes German data
+image
include ../../../assets/img/docs/language_data.svg
.u-text-right
+button("/assets/img/docs/language_data.svg", false, "secondary").u-text-tag View large graphic
+graphic("/assets/img/language_data.svg")
include ../../assets/img/language_data.svg
+table(["Name", "Description"])
+row
+cell #[strong Stop words]#[br]
| #[+src(gh("spacy-dev-resources", "templates/new_language/stop_words.py")) stop_words.py]
| #[+src(gh("spacy-dev-resources", "templates/new_language/stop_words.py")) #[code stop_words.py]]
+cell
| List of most common words of a language that are often useful to
| filter out, for example "and" or "I". Matching tokens will
@ -43,21 +41,21 @@ p
+row
+cell #[strong Tokenizer exceptions]#[br]
| #[+src(gh("spacy-dev-resources", "templates/new_language/tokenizer_exceptions.py")) tokenizer_exceptions.py]
| #[+src(gh("spacy-dev-resources", "templates/new_language/tokenizer_exceptions.py")) #[code tokenizer_exceptions.py]]
+cell
| Special-case rules for the tokenizer, for example, contractions
| like "can't" and abbreviations with punctuation, like "U.K.".
+row
+cell #[strong Norm exceptions]
| #[+src(gh("spaCy", "spacy/lang/norm_exceptions.py")) norm_exceptions.py]
| #[+src(gh("spaCy", "spacy/lang/norm_exceptions.py")) #[code norm_exceptions.py]]
+cell
| Special-case rules for normalising tokens to improve the model's
| predictions, for example on American vs. British spelling.
+row
+cell #[strong Punctuation rules]
| #[+src(gh("spaCy", "spacy/lang/punctuation.py")) punctuation.py]
| #[+src(gh("spaCy", "spacy/lang/punctuation.py")) #[code punctuation.py]]
+cell
| Regular expressions for splitting tokens, e.g. on punctuation or
| special characters like emoji. Includes rules for prefixes,
@ -65,14 +63,14 @@ p
+row
+cell #[strong Character classes]
| #[+src(gh("spaCy", "spacy/lang/char_classes.py")) char_classes.py]
| #[+src(gh("spaCy", "spacy/lang/char_classes.py")) #[code char_classes.py]]
+cell
| Character classes to be used in regular expressions, for example,
| latin characters, quotes, hyphens or icons.
+row
+cell #[strong Lexical attributes]
| #[+src(gh("spacy-dev-resources", "templates/new_language/lex_attrs.py")) lex_attrs.py]
| #[+src(gh("spacy-dev-resources", "templates/new_language/lex_attrs.py")) #[code lex_attrs.py]]
+cell
| Custom functions for setting lexical attributes on tokens, e.g.
| #[code like_num], which includes language-specific words like "ten"
@ -80,22 +78,22 @@ p
+row
+cell #[strong Syntax iterators]
| #[+src(gh("spaCy", "spacy/lang/en/syntax_iterators.py")) syntax_iterators.py]
| #[+src(gh("spaCy", "spacy/lang/en/syntax_iterators.py")) #[code syntax_iterators.py]]
+cell
| Functions that compute views of a #[code Doc] object based on its
| syntax. At the moment, only used for
| #[+a("/docs/usage/dependency-parse#noun-chunks") noun chunks].
| #[+a("/usage/linguistic-features#noun-chunks") noun chunks].
+row
+cell #[strong Lemmatizer]
| #[+src(gh("spacy-dev-resources", "templates/new_language/lemmatizer.py")) lemmatizer.py]
| #[+src(gh("spacy-dev-resources", "templates/new_language/lemmatizer.py")) #[code lemmatizer.py]]
+cell
| Lemmatization rules or a lookup-based lemmatization table to
| assign base forms, for example "be" for "was".
+row
+cell #[strong Tag map]#[br]
| #[+src(gh("spacy-dev-resources", "templates/new_language/tag_map.py")) tag_map.py]
| #[+src(gh("spacy-dev-resources", "templates/new_language/tag_map.py")) #[code tag_map.py]]
+cell
| Dictionary mapping strings in your tag set to
| #[+a("http://universaldependencies.org/u/pos/all.html") Universal Dependencies]
@ -103,7 +101,7 @@ p
+row
+cell #[strong Morph rules]
| #[+src(gh("spaCy", "spacy/lang/en/morph_rules.py")) morph_rules.py]
| #[+src(gh("spaCy", "spacy/lang/en/morph_rules.py")) #[code morph_rules.py]]
+cell
| Exception rules for morphological analysis of irregular words like
| personal pronouns.

View File

@ -1,13 +1,11 @@
//- 💫 DOCS > USAGE > LIGHTNING TOUR
include ../../_includes/_mixins
//- 💫 DOCS > USAGE > SPACY 101 > LIGHTNING TOUR
p
| The following examples and code snippets give you an overview of spaCy's
| functionality and its usage. If you're new to spaCy, make sure to check
| out the #[+a("/docs/usage/spacy-101") spaCy 101 guide].
| out the #[+a("/usage/spacy-101") spaCy 101 guide].
+h(2, "models") Install models and process text
+h(3, "lightning-tour-models") Install models and process text
+code(false, "bash").
spacy download en
@ -23,10 +21,10 @@ p
+infobox
| #[strong API:] #[+api("spacy#load") #[code spacy.load()]]
| #[strong Usage:] #[+a("/docs/usage/models") Models],
| #[+a("/docs/usage/spacy-101") spaCy 101]
| #[strong Usage:] #[+a("/usage/models") Models],
| #[+a("/usage/spacy-101") spaCy 101]
+h(2, "examples-tokens-sentences") Get tokens, noun chunks & sentences
+h(3, "lightning-tour-tokens-sentences") Get tokens, noun chunks & sentences
+tag-model("dependency parse")
+code.
@ -45,9 +43,9 @@ p
+infobox
| #[strong API:] #[+api("doc") #[code Doc]], #[+api("token") #[code Token]]
| #[strong Usage:] #[+a("/docs/usage/spacy-101") spaCy 101]
| #[strong Usage:] #[+a("/usage/spacy-101") spaCy 101]
+h(2, "examples-pos-tags") Get part-of-speech tags and flags
+h(3, "lightning-tour-pos-tags") Get part-of-speech tags and flags
+tag-model("tagger")
+code.
@ -66,9 +64,9 @@ p
+infobox
| #[strong API:] #[+api("token") #[code Token]]
| #[strong Usage:] #[+a("/docs/usage/pos-tagging") Part-of-speech tagging]
| #[strong Usage:] #[+a("/usage/linguistic-features#pos-tagging") Part-of-speech tagging]
+h(2, "examples-hashes") Use hash values for any string
+h(3, "lightning-tour-hashes") Use hash values for any string
+code.
doc = nlp(u'I love coffee')
@ -86,9 +84,9 @@ p
+infobox
| #[strong API:] #[+api("stringstore") #[code StringStore]]
| #[strong Usage:] #[+a("/docs/usage/spacy-101#vocab") Vocab, hashes and lexemes 101]
| #[strong Usage:] #[+a("/usage/spacy-101#vocab") Vocab, hashes and lexemes 101]
+h(2, "examples-entities") Recognise and update named entities
+h(3, "lightning-tour-entities") Recognise and update named entities
+tag-model("NER")
+code.
@ -103,9 +101,9 @@ p
assert ents == [(0, 7, u'ORG')]
+infobox
| #[strong Usage:] #[+a("/docs/usage/entity-recognition") Named entity recognition]
| #[strong Usage:] #[+a("/usage/linguistic-features#named-entities") Named entity recognition]
+h(2, "displacy") Visualize a dependency parse and named entities in your browser
+h(3, "lightning-tour-displacy") Visualize a dependency parse and named entities in your browser
+tag-model("dependency parse", "NER")
+aside
@ -156,9 +154,9 @@ p
+infobox
| #[strong API:] #[+api("displacy") #[code displacy]]
| #[strong Usage:] #[+a("/docs/usage/visualizers") Visualizers]
| #[strong Usage:] #[+a("/usage/visualizers") Visualizers]
+h(2, "examples-word-vectors") Get word vectors and similarity
+h(3, "lightning-tour-word-vectors") Get word vectors and similarity
+tag-model("word vectors")
+code.
@ -171,9 +169,9 @@ p
assert apple.has_vector and banana.has_vector and pasta.has_vector and hippo.has_vector
+infobox
| #[strong Usage:] #[+a("/docs/usage/word-vectors-similarities") Word vectors and similarity]
| #[strong Usage:] #[+a("/usage/vectors-similarity") Word vectors and similarity]
+h(2, "examples-serialization") Simple and efficient serialization
+h(3, "lightning-tour-serialization") Simple and efficient serialization
+code.
import spacy
@ -190,9 +188,9 @@ p
+infobox
| #[strong API:] #[+api("language") #[code Language]],
| #[+api("doc") #[code Doc]]
| #[strong Usage:] #[+a("/docs/usage/saving-loading") Saving and loading]
| #[strong Usage:] #[+a("/usage/models#saving-loading") Saving and loading models]
+h(2, "rule-matcher") Match text with token rules
+h(3, "lightning-tour-rule-matcher") Match text with token rules
+code.
import spacy
@ -212,9 +210,9 @@ p
+infobox
| #[strong API:] #[+api("matcher") #[code Matcher]]
| #[strong Usage:] #[+a("/docs/usage/rule-based-matching") Rule-based matching]
| #[strong Usage:] #[+a("/usage/linguistic-features#rule-based-matching") Rule-based matching]
+h(2, "multi-threaded") Multi-threaded generator
+h(3, "lightning-tour-multi-threaded") Multi-threaded generator
+code.
texts = [u'One document.', u'...', u'Lots of documents']
@ -227,9 +225,9 @@ p
+infobox
| #[strong API:] #[+api("doc") #[code Doc]]
| #[strong Usage:] #[+a("/docs/usage/production-usage") Production usage]
| #[strong Usage:] #[+a("/usage/processing-pipelines#multithreading") Processing pipelines]
+h(2, "examples-dependencies") Get syntactic dependencies
+h(3, "lightning-tour-dependencies") Get syntactic dependencies
+tag-model("dependency parse")
+code.
@ -243,9 +241,9 @@ p
+infobox
| #[strong API:] #[+api("token") #[code Token]]
| #[strong Usage:] #[+a("/docs/usage/dependency-parse") Using the dependency parse]
| #[strong Usage:] #[+a("/usage/linguistic-features#dependency-parse") Using the dependency parse]
+h(2, "examples-numpy-arrays") Export to numpy arrays
+h(3, "lightning-tour-numpy-arrays") Export to numpy arrays
+code.
from spacy.attrs import ORTH, LIKE_URL, IS_OOV
@ -258,7 +256,7 @@ p
assert doc[0].like_url == doc_array[0, 1]
assert list(doc_array[:, 1]) == [t.like_url for t in doc]
+h(2, "examples-inline") Calculate inline markup on original string
+h(3, "lightning-tour-inline") Calculate inline markup on original string
+code.
def put_spans_around_tokens(doc, get_classes):

View File

@ -3,7 +3,7 @@
p
| A named entity is a "real-world object" that's assigned a name – for
| example, a person, a country, a product or a book title. spaCy can
| #[strong recognise] #[+a("/docs/api/annotation#named-entities") various types]
| #[strong recognise] #[+a("/api/annotation#named-entities") various types]
| of named entities in a document, by asking the model for a
| #[strong prediction]. Because models are statistical and strongly depend
| on the examples they were trained on, this doesn't always work
@ -32,7 +32,7 @@ p
+annotation-row(["$1 billion", 44, 54, "MONEY", "Monetary values, including unit."], style)
p
| Using spaCy's built-in #[+a("/docs/usage/visualizers") displaCy visualizer],
| Using spaCy's built-in #[+a("/usage/visualizers") displaCy visualizer],
| here's what our example sentence and its named entities look like:
+codepen("2f2ad1408ff79fc6a326ea3aedbb353b", 160)

View File

@ -5,15 +5,13 @@ p
| produce a #[code Doc] object. The #[code Doc] is then processed in several
| different steps – this is also referred to as the
| #[strong processing pipeline]. The pipeline used by the
| #[+a("/docs/usage/models") default models] consists of a
| #[+a("/models") default models] consists of a
| tensorizer, a tagger, a parser and an entity recognizer. Each pipeline
| component returns the processed #[code Doc], which is then passed on to
| the next component.
+image
include ../../../assets/img/docs/pipeline.svg
.u-text-right
+button("/assets/img/docs/pipeline.svg", false, "secondary").u-text-tag View large graphic
+graphic("/assets/img/pipeline.svg")
include ../../assets/img/pipeline.svg
+aside
| #[strong Name:] ID of the pipeline component.#[br]
@ -30,7 +28,7 @@ p
+row("divider")
+cell tensorizer
+cell #[code TokenVectorEncoder]
+cell #[+api("tensorizer") Tensorizer]
+cell #[code Doc.tensor]
+cell Create feature representation tensor for #[code Doc].
@ -54,6 +52,12 @@ p
+cell #[code Doc.ents], #[code Doc[i].ent_iob], #[code Doc[i].ent_type]
+cell Detect and label named entities.
+row
+cell textcat
+cell #[+api("textcategorizer") #[code TextCategorizer]]
+cell #[code Doc.cats]
+cell Assign document labels.
p
| The processing pipeline always #[strong depends on the statistical model]
| and its capabilities. For example, a pipeline can only include an entity

View File

@ -1,7 +1,7 @@
//- 💫 DOCS > USAGE > SPACY 101 > POS TAGGING AND DEPENDENCY PARSING
p
| After tokenization, spaCy can also #[strong parse] and #[strong tag] a
| After tokenization, spaCy can #[strong parse] and #[strong tag] a
| given #[code Doc]. This is where the statistical model comes in, which
| enables spaCy to #[strong make a prediction] of which tag or label most
| likely applies in this context. A model consists of binary data and is
@ -56,7 +56,7 @@ p
| singular present".
p
| Using spaCy's built-in #[+a("/docs/usage/visualizers") displaCy visualizer],
| Using spaCy's built-in #[+a("/usage/visualizers") displaCy visualizer],
| here's what our example sentence and its dependencies look like:
+codepen("030d1e4dfa6256cad8fdd59e6aefecbe", 460)

View File

@ -49,14 +49,12 @@ p
| #[strong Infix:] Character(s) in between, e.g.
| #[code -], #[code --], #[code /], #[code …].#[br]
+image
include ../../../assets/img/docs/tokenization.svg
.u-text-right
+button("/assets/img/docs/tokenization.svg", false, "secondary").u-text-tag View large graphic
+graphic("/assets/img/tokenization.svg")
include ../../assets/img/tokenization.svg
p
| While punctuation rules are usually pretty general, tokenizer exceptions
| strongly depend on the specifics of the individual language. This is
| why each #[+a("/docs/api/language-models") available language] has its
| why each #[+a("/models/#languages") available language] has its
| own subclass like #[code English] or #[code German], that loads in lists
| of hard-coded data and exception rules.

View File

@ -24,10 +24,8 @@ p
| #[strong Gradient:] Gradient of the loss function calculating the
| difference between input and expected output.
+image
include ../../../assets/img/docs/training.svg
.u-text-right
+button("/assets/img/docs/training.svg", false, "secondary").u-text-tag View large graphic
+graphic("/assets/img/training.svg")
include ../../assets/img/training.svg
p
| When training a model, we don't just want it to memorise our examples

View File

@ -19,10 +19,8 @@ p
| #[strong StringStore]: The dictionary mapping hash values to strings, for
| example #[code 3197928453018144401] &rarr; "coffee".
+image
include ../../../assets/img/docs/vocab_stringstore.svg
.u-text-right
+button("/assets/img/docs/vocab_stringstore.svg", false, "secondary").u-text-tag View large graphic
+graphic("/assets/img/vocab_stringstore.svg")
include ../../assets/img/vocab_stringstore.svg
p
| If you process lots of documents containing the word "coffee" in all

View File

@ -5,7 +5,7 @@ p
| embeddings", multi-dimensional meaning representations of a word. Word
| vectors can be generated using an algorithm like
| #[+a("https://en.wikipedia.org/wiki/Word2vec") word2vec]. Most of spaCy's
| #[+a("/docs/usage/models") default models] come with
| #[+a("/models") default models] come with
| #[strong 300-dimensional vectors] that look like this:
+code("banana.vector", false, false, 250).
@ -148,5 +148,5 @@ p
p
| If your application will benefit from a large vocabulary with more
| vectors, you should consider using one of the
| #[+a("/docs/usage/models#available") larger models] instead of the default,
| #[+a("/models") larger models] instead of the default,
| smaller ones, which usually come with a clipped vocabulary.

View File

@ -1,14 +1,6 @@
include ../../_includes/_mixins
//- 💫 DOCS > USAGE > TRAINING > BASICS
p
| This guide describes how to train new statistical models for spaCy's
| part-of-speech tagger, named entity recognizer and dependency parser.
| Once the model is trained, you can then
| #[+a("/docs/usage/saving-loading") save and load] it.
+h(2, "101") Training 101
include _spacy-101/_training
include ../_spacy-101/_training
+h(3, "training-data") How do I get training data?
@ -50,7 +42,7 @@ p
p
| Alternatively, the
| #[+a("/docs/usage/rule-based-matching#example3") rule-based matcher]
| #[+a("/usage/linguistic-features#rule-based-matching") rule-based matcher]
| can be a useful tool to extract tokens or combinations of tokens, as
| well as their start and end index in a document. In this case, we'll
| extract mentions of Google and assume they're an #[code ORG].
@ -73,7 +65,7 @@ p
| #[strong what you want the model to learn]. While there are some entity
| annotations that are more or less universally correct – like Canada being
| a geopolitical entity – your application may have its very own definition
| of the #[+a("/docs/api/annotation#named-entities") NER annotation scheme].
| of the #[+a("/api/annotation#named-entities") NER annotation scheme].
+code.
train_data = [
@ -84,7 +76,7 @@ p
("Google rebrands its business apps", [(0, 6, "ORG")]),
("look what i found on google! 😂", [(21, 27, "PRODUCT")])]
+h(2) Training with annotations
+h(3, "annotations") Training with annotations
p
| The #[+api("goldparse") #[code GoldParse]] object collects the annotated
@ -103,7 +95,7 @@ p
p
| Using the #[code Doc] and its gold-standard annotations, the model can be
| updated to learn a sentence of three words with their assigned
| part-of-speech tags. The #[+a("/docs/usage/adding-languages#tag-map") tag map]
| part-of-speech tags. The #[+a("/usage/adding-languages#tag-map") tag map]
| is part of the vocabulary and defines the annotation scheme. If you're
| training a new language model, this will let you map the tags present in
| the treebank you train on to spaCy's tag scheme.
@ -115,7 +107,7 @@ p
p
| The same goes for named entities. The letters added before the labels
| refer to the tags of the
| #[+a("/docs/usage/entity-recognition#updating-biluo") BILUO scheme]
| #[+a("/usage/linguistic-features#updating-biluo") BILUO scheme] –
| #[code O] is a token outside an entity, #[code U] a single entity unit,
| #[code B] the beginning of an entity, #[code I] a token inside an entity
| and #[code L] the last token of an entity.
@ -130,10 +122,8 @@ p
| #[strong Update]: Update the model's weights.#[br]
| #[strong ]
+image
include ../../assets/img/docs/training-loop.svg
.u-text-right
+button("/assets/img/docs/training-loop.svg", false, "secondary").u-text-tag View large graphic
+graphic("/assets/img/training-loop.svg")
include ../../assets/img/training-loop.svg
p
| Of course, it's not enough to only show a model a single example once.
@ -192,11 +182,7 @@ p
+infobox
| For the #[strong full example and more details], see the usage guide on
| #[+a("/docs/usage/training-ner") training the named entity recognizer],
| #[+a("/usage/training#ner") training the named entity recognizer],
| or the runnable
| #[+src(gh("spaCy", "examples/training/train_ner.py")) training script]
| on GitHub.
+h(2) Examples
+under-construction

View File

@ -0,0 +1,61 @@
//- 💫 DOCS > USAGE > TRAINING > NER
p
| All #[+a("/models") spaCy models] support online learning, so
| you can update a pre-trained model with new examples. To update the
| model, you first need to create an instance of
| #[+api("goldparse") #[code GoldParse]], with the entity labels
| you want to learn. You'll usually need to provide many examples to
| meaningfully improve the system — a few hundred is a good start, although
| more is better.
p
| You should avoid iterating over the same few examples multiple times, or
| the model is likely to "forget" how to annotate other examples. If you
| iterate over the same few examples, you're effectively changing the loss
| function. The optimizer will find a way to minimize the loss on your
| examples, without regard for the consequences on the examples it's no
| longer paying attention to. One way to avoid this
| #[+a("https://explosion.ai/blog/pseudo-rehearsal-catastrophic-forgetting", true) "catastrophic forgetting" problem]
| is to "remind"
| the model of other examples by augmenting your annotations with sentences
| annotated with entities automatically recognised by the original model.
| Ultimately, this is an empirical process: you'll need to
| #[strong experiment on your own data] to find a solution that works best
| for you.
+h(3, "example-new-entity-type") Example: Training an additional entity type
p
| This script shows how to add a new entity type to an existing pre-trained
| NER model. To keep the example short and simple, only a few sentences are
| provided as examples. In practice, you'll need many more — a few hundred
| would be a good start. You will also likely need to mix in examples of
| other entity types, which might be obtained by running the entity
| recognizer over unlabelled sentences, and adding their annotations to the
| training set.
p
| The actual training is performed by looping over the examples, and
| calling #[+api("language#update") #[code nlp.update()]]. The
| #[code update] method steps through the words of the input. At each word,
| it makes a prediction. It then consults the annotations provided on the
| #[+api("goldparse") #[code GoldParse]] instance, to see whether it was
| right. If it was wrong, it adjusts its weights so that the correct
| action will score higher next time.
+github("spacy", "examples/training/train_new_entity_type.py")
+h(3, "example-ner-from-scratch") Example: Training an NER system from scratch
p
| This example is written to be self-contained and reasonably transparent.
| To achieve that, it duplicates some of spaCy's internal functionality.
| Specifically, in this example, we don't use spaCy's built-in
| #[+api("language") #[code Language]] class to wire together the
| #[+api("vocab") #[code Vocab]], #[+api("tokenizer") #[code Tokenizer]]
| and #[+api("entityrecognizer") #[code EntityRecognizer]]. Instead, we
| write our own simple #[code Pipeline] class, so that it's easier to see
| how the pieces interact.
+github("spacy", "examples/training/train_ner_standalone.py")

View File

@ -1,45 +1,4 @@
include ../../_includes/_mixins
+h(2, "101") Serialization 101
include _spacy-101/_serialization
+infobox("Important note")
| In spaCy v2.0, the API for saving and loading has changed to only use the
| four methods listed above consistently across objects and classes. For an
| overview of the changes, see #[+a("/docs/usage/v2#incompat") this table]
| and the notes on #[+a("/docs/usage/v2#migrating-saving-loading") migrating].
+h(3, "example-doc") Example: Saving and loading a document
p
| For simplicity, let's assume you've
| #[+a("/docs/usage/entity-recognition#setting") added custom entities] to
| a #[code Doc], either manually, or by using a
| #[+a("/docs/usage/rule-based-matching#on_match") match pattern]. You can
| save it locally by calling #[+api("doc#to_disk") #[code Doc.to_disk()]],
| and load it again via #[+api("doc#from_disk") #[code Doc.from_disk()]].
| This will overwrite the existing object and return it.
+code.
import spacy
from spacy.tokens import Span
text = u'Netflix is hiring a new VP of global policy'
nlp = spacy.load('en')
doc = nlp(text)
assert len(doc.ents) == 0 # Doc has no entities
doc.ents += ((Span(doc, 0, 1, label=doc.vocab.strings[u'ORG'])) # add entity
doc.to_disk('/path/to/doc') # save Doc to disk
new_doc = nlp(text)
assert len(new_doc.ents) == 0 # new Doc has no entities
new_doc = new_doc.from_disk('path/to/doc') # load from disk and overwrite
assert len(new_doc.ents) == 1 # entity is now recognised!
assert [(ent.text, ent.label_) for ent in new_doc.ents] == [(u'Netflix', u'ORG')]
+h(2, "models") Saving models
//- 💫 DOCS > USAGE > TRAINING > SAVING & LOADING
p
| After training your model, you'll usually want to save its state, and load
@ -55,6 +14,7 @@ p
| will be written out. To make the model more convenient to deploy, we
| recommend wrapping it as a Python package.
+h(3, "models-generating") Generating a model package
+infobox("Important note")
@ -105,13 +65,14 @@ p
| need to be named according to the naming conventions of
| #[code lang_name] and #[code lang_name-version].
+h(3, "models-custom") Customising the model setup
p
| The meta.json includes the model details, like name, requirements and
| license, and lets you customise how the model should be initialised and
| loaded. You can define the language data to be loaded and the
| #[+a("/docs/usage/language-processing-pipeline") processing pipeline] to
| #[+a("/usage/processing-pipelines") processing pipeline] to
| execute.
+table(["Setting", "Type", "Description"])
@ -126,7 +87,7 @@ p
+cell
| A list of strings mapping to the IDs of pipeline factories to
| apply in that order. If not set, spaCy's
| #[+a("/docs/usage/language-processing/pipelines") default pipeline]
| #[+a("/usage/processing-pipelines") default pipeline]
| will be used.
p
@ -135,7 +96,7 @@ p
| #[code Language] object with the loaded pipeline and data. If your model
| requires custom pipeline components, you should
| #[strong ship them with your model] and register their
| #[+a("/docs/usage/language-processing-pipeline#creating-factory") factories]
| #[+a("/usage/processing-pipelines#creating-factory") factories]
| via #[+api("spacy#set_factory") #[code set_factory()]].
+aside-code("Factory example").
@ -152,7 +113,7 @@ p
+infobox("Custom models with pipeline components")
| For more details and an example of how to package a sentiment model
| with a custom pipeline component, see the usage guide on
| #[+a("/docs/usage/language-processing-pipeline#example2") language processing pipelines].
| #[+a("/usage/processing-pipelines#example2") language processing pipelines].
+h(3, "models-building") Building the model package
@ -176,7 +137,7 @@ p
| You can then load the model via its name, #[code en_example_model], or
| import it directly as a module and then call its #[code load()] method.
+h(2, "loading") Loading a custom model package
+h(3, "loading") Loading a custom model package
p
| To load a model from a data directory, you can use
@ -209,3 +170,38 @@ p
+code-new nlp = English().from_disk('/path/to/data')
+code-old nlp = spacy.load('en', path='/path/to/data')
+h(3, "example-training-spacy") Example: How we're training and packaging models for spaCy
p
| Publishing a new version of spaCy often means re-training all available
| models – currently, that's #{MODEL_COUNT} models for #{MODEL_LANG_COUNT}
| languages. To make this run smoothly, we're using an automated build
| process and a #[+api("cli#train") #[code spacy train]] template that
| looks like this:
+code(false, "bash", "$", false, false, true).
spacy train {lang} {models_dir}/{name} {train_data} {dev_data} -m meta/{name}.json -V {version} -g {gpu_id} -n {n_epoch} -ns {n_sents}
+aside-code("meta.json template", "json").
{
"lang": "en",
"name": "core_web_sm",
"license":"CC BY-SA 3.0",
"author":"Explosion AI",
"url":"https://explosion.ai",
"email":"contact@explosion.ai",
"sources": ["OntoNotes 5", "Common Crawl"],
"description":"English multi-task CNN trained on OntoNotes, with GloVe vectors trained on common crawl. Assigns word vectors, context-specific token vectors, POS tags, dependency parse and named entities."
}
p In a directory #[code meta], we keep #[code meta.json] templates for the individual models, containing all relevant information that doesn't change across versions, like the name, description, author info and training data sources. When we train the model, we pass in the path to the meta template as the #[code --meta] argument, and specify the current model version as the #[code --version] argument.
p On each epoch, the model is saved out with a #[code meta.json] using our template and added properties, like the #[code pipeline], #[code accuracy] scores and the #[code spacy_version] used to train the model. After training completion, the best model is selected automatically and packaged using the #[+api("cli#package") #[code package]] command. Since a full meta file is already present on the trained model, no further setup is required to build a valid model package.
+code(false, "bash").
spacy package -f {best_model} dist/
cd dist/{model_name}
python setup.py sdist
p This process allows us to quickly trigger the model training and build process for all available models and languages, and generate the correct meta data automatically.

View File

@ -0,0 +1,3 @@
//- 💫 DOCS > USAGE > TRAINING > SIMILARITY
+under-construction

View File

@ -0,0 +1,3 @@
//- 💫 DOCS > USAGE > TRAINING > TAGGER & PARSER
+under-construction

View File

@ -0,0 +1,13 @@
//- 💫 DOCS > USAGE > TRAINING > TEXT CLASSIFICATION
+under-construction
+h(3, "example-textcat") Example: Training spaCy's text classifier
+tag-new(2)
p
| This example shows how to use and train spaCy's new
| #[+api("textcategorizer") #[code TextCategorizer]] pipeline component
| on IMDB movie reviews.
+github("spacy", "examples/training/train_textcat.py")

View File

@ -0,0 +1,15 @@
//- 💫 DOCS > USAGE > VECTORS & SIMILARITY > BASICS
+aside("Training word vectors")
| Dense, real-valued vectors representing distributional similarity
| information are now a cornerstone of practical NLP. The most common way
| to train these vectors is the #[+a("https://en.wikipedia.org/wiki/Word2vec") word2vec]
| family of algorithms. The default
| #[+a("/models/en") English model] installs
| 300-dimensional vectors trained on the
| #[+a("http://commoncrawl.org") Common Crawl] corpus.
| If you need to train a word2vec model, we recommend the implementation in
| the Python library #[+a("https://radimrehurek.com/gensim/") Gensim].
include ../_spacy-101/_similarity
include ../_spacy-101/_word-vectors

View File

@ -0,0 +1,91 @@
//- 💫 DOCS > USAGE > VECTORS & SIMILARITY > CUSTOM VECTORS
p
| By default, #[+api("token#vector") #[code Token.vector]] returns the
| vector for its underlying #[+api("lexeme") #[code Lexeme]], while
| #[+api("doc#vector") #[code Doc.vector]] and
| #[+api("span#vector") #[code Span.vector]] return an average of the
| vectors of their tokens. You can customize these
| behaviours by modifying the #[code doc.user_hooks],
| #[code doc.user_span_hooks] and #[code doc.user_token_hooks]
| dictionaries.
+infobox
| For more details on #[strong adding hooks] and #[strong overwriting] the
| built-in #[code Doc], #[code Span] and #[code Token] methods, see the
| usage guide on #[+a("/usage/processing-pipelines#user-hooks") user hooks].
+h(3, "custom-vectors-add") Adding vectors
+tag-new(2)
p
| The new #[+api("vectors") #[code Vectors]] class makes it easy to add
| your own vectors to spaCy. Just like the #[+api("vocab") #[code Vocab]],
| it is initialised with a #[+api("stringstore") #[code StringStore]] or
| a list of strings.
+code("Adding vectors one-by-one").
import numpy
from spacy.strings import StringStore
from spacy.vectors import Vectors
vector_data = {'dog': numpy.random.uniform(-1, 1, (300,)),
'cat': numpy.random.uniform(-1, 1, (300,)),
'orange': numpy.random.uniform(-1, 1, (300,))}
vectors = Vectors(StringStore(), 300)
for word, vector in vector_data.items():
vectors.add(word, vector)
p
| You can also add the vector values directly on initialisation:
+code("Adding vectors on initialisation").
import numpy
from spacy.vectors import Vectors
vector_table = numpy.zeros((3, 300), dtype='f')
vectors = Vectors([u'dog', u'cat', u'orange'], vector_table)
+h(3, "custom-loading-glove") Loading GloVe vectors
+tag-new(2)
p
| spaCy comes with built-in support for loading
| #[+a("https://nlp.stanford.edu/projects/glove/") GloVe] vectors from
| a directory. The #[+api("vectors#from_glove") #[code Vectors.from_glove]]
| method assumes a binary format, the vocab provided in a
| #[code vocab.txt], and the naming scheme of
| #[code vectors.{size}.[fd].bin]. For example:
+aside-code("Directory structure", "yaml").
└── vectors
├── vectors.128.f.bin # vectors file
└── vocab.txt # vocabulary
+table(["File name", "Dimensions", "Data type"])
+row
+cell #[code vectors.128.f.bin]
+cell 128
+cell float32
+row
+cell #[code vectors.300.d.bin]
+cell 300
+cell float64 (double)
+code.
from spacy.vectors import Vectors
vectors = Vectors([], 128)
vectors.from_glove('/path/to/vectors')
+h(3, "custom-loading-other") Loading other vectors
+tag-new(2)
p
| You can also choose to load in vectors from other sources, like the
| #[+a("https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md") fastText vectors]
| for 294 languages, trained on Wikipedia. After reading in the file,
| the vectors are added to the #[code Vocab] using the
| #[+api("vocab#set_vector") #[code set_vector]] method.
+github("spacy", "examples/vectors_fast_text.py")

View File

@ -0,0 +1,30 @@
//- 💫 DOCS > USAGE > VECTORS & SIMILARITY > GPU
p
| If you're using a GPU, it's much more efficient to keep the word vectors
| on the device. You can do that by setting the
| #[+api("vectors#attributes") #[code Vectors.data]] attribute to a
| #[code cupy.ndarray] object if you're using spaCy
| or #[+a("https://chainer.org") Chainer], or a
| #[code torch.Tensor] object if you're using
| #[+a("http://pytorch.org") PyTorch]. The #[code data] object just needs
| to support #[code __iter__] and #[code __getitem__], so if you're using
| another library such as #[+a("https://www.tensorflow.org") TensorFlow],
| you could also create a wrapper for your vectors data.
+code("spaCy, Thinc or Chainer").
import numpy
import cupy.cuda
from spacy.vectors import Vectors
vector_table = numpy.zeros((3, 300), dtype='f')
vectors = Vectors([u'dog', u'cat', u'orange'], vector_table)
with cupy.cuda.Device(0):
vectors.data = cupy.asarray(vectors.data)
+code("PyTorch").
import numpy
import torch
from spacy.vectors import Vectors
vector_table = numpy.zeros((3, 300), dtype='f')
vectors = Vectors([u'dog', u'cat', u'orange'], vector_table)
vectors.data = torch.Tensor(vectors.data).cuda(0)

View File

@ -1,34 +1,11 @@
//- 💫 DOCS > USAGE > WORD VECTORS & SIMILARITIES
include ../../_includes/_mixins
p
| Dense, real valued vectors representing distributional similarity
| information are now a cornerstone of practical NLP. The most common way
| to train these vectors is the #[+a("https://en.wikipedia.org/wiki/Word2vec") word2vec]
| family of algorithms. The default
| #[+a("/docs/usage/models#available") English model] installs
| 300-dimensional vectors trained on the
| #[+a("http://commoncrawl.org") Common Crawl] corpus.
+aside("Tip: Training a word2vec model")
| If you need to train a word2vec model, we recommend the implementation in
| the Python library #[+a("https://radimrehurek.com/gensim/") Gensim].
+h(2, "101") Similarity and word vectors 101
+tag-model("vectors")
include _spacy-101/_similarity
include _spacy-101/_word-vectors
+h(2, "similarity-context") Similarities in context
//- 💫 DOCS > USAGE > VECTORS & SIMILARITY > IN CONTEXT
p
| Aside from spaCy's built-in word vectors, which were trained on a lot of
| text with a wide vocabulary, the parsing, tagging and NER models also
| rely on vector representations of the #[strong meanings of words in context].
| As the first component of the
| #[+a("/docs/usage/language-processing-pipeline") processing pipeline], the
| #[+a("/usage/processing-pipelines") processing pipeline], the
| tensorizer encodes a document's internal meaning representations as an
| array of floats, also called a tensor. This allows spaCy to make a
| reasonable guess at a word's meaning, based on its surrounding words.
@ -117,8 +94,8 @@ p
nlp(u"man dog bites"), nlp(u"dog man bites")]
for doc in docs:
for other_doc in docs:
print(doc.similarity(other_doc))
for other_doc in docs:
print(doc.similarity(other_doc))
p
| Interestingly, "man bites dog" and "man dog bites" are seen as slightly
@ -143,17 +120,3 @@ p
+cell.u-text-center #[code=cell.toFixed(2)]
| #[+procon(cell < 0.7 ? "con" : cell != 1 ? "pro" : "neutral")]
- counter++
+h(2, "custom") Customising word vectors
+under-construction
p
| By default, #[+api("token#vector") #[code Token.vector]] returns the
| vector for its underlying #[+api("lexeme") #[code Lexeme]], while
| #[+api("doc#vector") #[code Doc.vector]] and
| #[+api("span#vector") #[code Span.vector]] return an average of the
| vectors of their tokens. You can customize these
| behaviours by modifying the #[code doc.user_hooks],
| #[code doc.user_span_hooks] and #[code doc.user_token_hooks]
| dictionaries.

View File

@ -0,0 +1,59 @@
//- 💫 DOCS > USAGE > ADDING LANGUAGES
include ../_includes/_mixins
+aside("Working on spaCy's source")
| To add a new language to spaCy, you'll need to
| #[strong modify the library&apos;s code]. The easiest way to do this is to
| clone the #[+src(gh("spaCy")) repository] and #[strong build spaCy from source].
| For more information on this, see the #[+a("/usage") installation guide].
| Unlike spaCy's core, which is mostly written in Cython, all language
| data is stored in regular Python files. This means that you won't have to
| rebuild anything in between – you can simply make edits and reload spaCy
| to test them.
+grid.o-no-block
+grid-col("half")
p
| Obviously, there are lots of ways you can organise your code when
| you implement your own language data. This guide will focus on
| how it's done within spaCy. For full language support, you'll
| need to create a #[code Language] subclass, define custom
| #[strong language data], like a stop list and tokenizer
| exceptions, and test the new tokenizer. Once the language is set
| up, you can #[strong build the vocabulary], including word
| frequencies, Brown clusters and word vectors. Finally, you can
| #[strong train the tagger and parser], and save the model to a
| directory.
p
| For some languages, you may also want to develop a solution for
| lemmatization and morphological analysis.
+table-of-contents
+item #[+a("#101") Language data 101]
+item #[+a("#language-subclass") The Language subclass]
+item #[+a("#stop-words") Stop words]
+item #[+a("#tokenizer-exceptions") Tokenizer exceptions]
+item #[+a("#norm-exceptions") Norm exceptions]
+item #[+a("#lex-attrs") Lexical attributes]
+item #[+a("#syntax-iterators") Syntax iterators]
+item #[+a("#lemmatizer") Lemmatizer]
+item #[+a("#tag-map") Tag map]
+item #[+a("#morph-rules") Morph rules]
+item #[+a("#testing") Testing the language]
+item #[+a("#vocabulary") Building the vocabulary]
+item #[+a("#training") Training]
+section("language-data")
+h(2, "language-data") Language data
include _spacy-101/_language-data
include _adding-languages/_language-data
+section("testing")
+h(2, "testing") Testing the new language
include _adding-languages/_testing
+section("training")
+h(2, "training") Training a language model
include _adding-languages/_training

View File

@ -0,0 +1,29 @@
//- 💫 DOCS > USAGE > DEEP LEARNING
include ../_includes/_mixins
+section
+under-construction
+section("pre-processing")
+h(2, "pre-processing") Pre-processing text for deep learning
include _deep-learning/_pre-processing
+section("thinc")
+h(2, "thinc") spaCy and Thinc
include _deep-learning/_thinc
+section("tensorflow-keras")
+h(2, "tensorflow-keras") Using spaCy with TensorFlow / Keras
include _deep-learning/_tensorflow-keras
+section("scikit-learn")
+h(2, "scikit-learn") Using spaCy with scikit-learn
include _deep-learning/_scikit-learn
+section("pytorch")
+h(2, "pytorch") Using spaCy with PyTorch
include _deep-learning/_pytorch
+section("dynet")
+h(2, "dynet") Using spaCy with DyNet
include _deep-learning/_dynet

View File

@ -0,0 +1,73 @@
//- 💫 DOCS > USAGE > EXAMPLES
include ../_includes/_mixins
+section("matching")
+h(3, "matcher") Using spaCy's rule-based matcher
p
| This example shows how to use spaCy's rule-based
| #[+api("matcher") #[code Matcher]] to find and label entities across
| documents.
+github("spacy", "examples/matcher_example.py")
+h(3, "phrase-matcher") Using spaCy's phrase matcher
+tag-new(2)
p
| This example shows how to use the new
| #[+api("phrasematcher") #[code PhraseMatcher]] to efficiently find
| entities from a large terminology list.
+github("spacy", "examples/phrase_matcher.py")
+section("training")
+h(3, "new-entity-type") Training an additional entity type
p
| This script shows how to add a new entity type to an existing
| pre-trained NER model. To keep the example short and simple, only
| four sentences are provided as examples. In practice, you'll need
| many more — a few hundred would be a good start.
+github("spacy", "examples/training/train_new_entity_type.py")
+h(3, "ner-standalone") Training an NER system from scratch
p
| This example is written to be self-contained and reasonably
| transparent. To achieve that, it duplicates some of spaCy's internal
| functionality.
+github("spacy", "examples/training/train_ner_standalone.py")
+h(3, "textcat") Training spaCy's text classifier
+tag-new(2)
p
| This example shows how to use and train spaCy's new
| #[+api("textcategorizer") #[code TextCategorizer]] pipeline component
| on IMDB movie reviews.
+github("spacy", "examples/training/train_textcat.py")
+section("deep-learning")
+h(3, "keras") Text classification with Keras
p
| In this example, we're using spaCy to pre-process text for use with
| a #[+a("https://keras.io") Keras] text classification model.
+github("spacy", "examples/deep_learning_keras.py")
+h(3, "keras-parikh-entailment") A decomposable attention model for Natural Language Inference
p
| This example contains an implementation of the entailment prediction
| model described by #[+a("https://arxiv.org/pdf/1606.01933.pdf") Parikh et al. (2016)].
| The model is notable for its competitive performance with very few
| parameters, and was implemented using #[+a("https://keras.io") Keras]
| and spaCy.
+github("spacy", "examples/keras_parikh_entailment/__main__.py", "examples/keras_parikh_entailment")

View File

@ -0,0 +1,32 @@
//- 💫 DOCS > USAGE > FACTS & FIGURES
include ../_includes/_mixins
+section("comparison")
+h(2, "comparison") Feature comparison
include _facts-figures/_feature-comparison
+section("benchmarks")
+h(2, "benchmarks") Benchmarks
include _facts-figures/_benchmarks
+section("powered-by")
+h(2, "powered-by") Powered by spaCy
p
| Here's an overview of other tools and libraries that are using spaCy
| behind the scenes.
+grid
+card("torchtext", "https://github.com/pytorch/text", "PyTorch", "github")
| PyTorch's NLP datasets and loaders use spaCy for pre-processing
| and tokenization.
+card("allennlp", "https://github.com/allenai/allennlp", "Allen Institute for Artificial Intelligence", "github")
| The open-source NLP research library based on PyTorch uses spaCy
| for pre-processing and tokenization.
+section("other-libraries")
+h(2, "other-libraries") spaCy and other libraries
include _facts-figures/_other-libraries

27
website/usage/index.jade Normal file
View File

@ -0,0 +1,27 @@
//- 💫 DOCS > USAGE
include ../_includes/_mixins
p
| spaCy is compatible with #[strong 64-bit CPython 2.6+&#8725;3.3+] and
| runs on #[strong Unix/Linux], #[strong macOS/OS X] and
| #[strong Windows]. The latest spaCy releases are
| available over #[+a("https://pypi.python.org/pypi/spacy") pip] (source
| packages only) and #[+a("https://anaconda.org/conda-forge/spacy") conda].
| Installation requires a working build environment. See notes on
| #[a(href="#source-ubuntu") Ubuntu], #[a(href="#source-osx") macOS/OS X]
| and #[a(href="#source-windows") Windows] for details.
+section("quickstart")
include _install/_quickstart
+section("instructions")
+h(2, "installation") Installation instructions
include _install/_instructions
+section("troubleshooting")
+h(2, "troubleshooting") Troubleshooting guide
include _install/_troubleshooting
+section("changelog")
include _install/_changelog

View File

@ -0,0 +1,38 @@
//- 💫 DOCS > USAGE > LINGUISTIC FEATURES
include ../_includes/_mixins
p
| Processing raw text intelligently is difficult: most words are rare, and
| it's common for words that look completely different to mean almost the
| same thing. The same words in a different order can mean something
| completely different. Even splitting text into useful word-like units can
| be difficult in many languages. While it's possible to solve some
| problems starting from only the raw characters, it's usually better to
| use linguistic knowledge to add useful information. That's exactly what
| spaCy is designed to do: you put in raw text, and get back a
| #[+api("doc") #[code Doc]] object, that comes with a variety of
| annotations.
+section("pos-tagging")
+h(2, "pos-tagging") Part-of-speech tagging
+tag-model("tagger", "dependency parse")
include _linguistic-features/_pos-tagging
+section("dependency-parse")
+h(2, "dependency-parse") Dependency parsing
+tag-model("dependency parse")
include _linguistic-features/_dependency-parse
+section("named-entities")
+h(2, "named-entities") Named Entities
+tag-model("named entities")
include _linguistic-features/_named-entities
+section("tokenization")
+h(2, "tokenization") Tokenization
include _linguistic-features/_tokenization
+section("rule-based-matching")
+h(2, "rule-based-matching") Rule-based matching
include _linguistic-features/_rule-based-matching

37
website/usage/models.jade Normal file
View File

@ -0,0 +1,37 @@
//- 💫 DOCS > USAGE > MODELS
include ../_includes/_mixins
p
| As of v1.7.0, models for spaCy can be installed as #[strong Python packages].
| This means that they're a component of your application, just like any
| other module. They're versioned and can be defined as a dependency in your
| #[code requirements.txt]. Models can be installed from a download URL or
| a local directory, manually or via #[+a("https://pypi.python.org/pypi/pip") pip].
| Their data can be located anywhere on your file system.
+aside("Important note")
| If you're upgrading to spaCy v1.7.x or v2.x, you need to
| #[strong download the new models]. If you've trained statistical models
| that use spaCy's annotations, you should #[strong retrain your models]
| after updating spaCy. If you don't retrain, you may suffer train/test
| skew, which might decrease your accuracy.
+section("quickstart")
include _models/_quickstart
+section("available")
+h(2, "available") Available models
include _models/_available-models
+section("install")
+h(2, "download") Installing and using models
include _models/_install
+section("languages")
+h(2, "languages") Language support
include _models/_languages
+section("production")
+h(2, "production") Using models in production
include _models/_production

View File

@ -0,0 +1,25 @@
//- 💫 DOCS > USAGE > PIPELINE
include ../_includes/_mixins
include _spacy-101/_pipelines
+section("pipelines")
+h(2, "pipelines") How pipelines work
include _processing-pipelines/_pipelines
+section("examples")
+h(2, "examples") Examples
include _processing-pipelines/_examples
+section("multithreading")
+h(2, "multithreading") Multi-threading
include _processing-pipelines/_multithreading
+section("user-hooks")
+h(2, "user-hooks") User hooks
include _processing-pipelines/_user-hooks
+section("serialization")
+h(2, "serialization") Serialization
include _processing-pipelines/_serialization

View File

@ -0,0 +1,125 @@
//- 💫 DOCS > USAGE > RESOURCES
include ../_includes/_mixins
+aside("Contribute to this page")
| Have you built something cool with spaCy or come across a paper, book or
| course that should be featured here?
| #[a(href="mailto:#{EMAIL}") Let us know!]
+section("libraries")
+h(2, "libraries") Third-party libraries
+grid
+card("neuralcoref", "https://github.com/huggingface/neuralcoref", "Hugging Face", "github")
| State-of-the-art coreference resolution based on neural nets
| and spaCy.
+card("rasa_nlu", "https://github.com/golastmile/rasa_nlu", "LastMile", "github")
| High level APIs for building your own language parser using
| existing NLP and ML libraries.
+card("textacy", "https://github.com/chartbeat-labs/textacy", "Burton DeWilde", "github")
| Higher-level NLP built on spaCy.
+card("spacyr", "https://github.com/kbenoit/spacyr", "Kenneth Benoit", "github")
| An R wrapper for spaCy.
+card("spacy_api", "https://github.com/kootenpv/spacy_api", "Pascal van Kooten", "github")
| Server/client to load models in a separate, dedicated process.
+card("spacy-api-docker", "https://github.com/jgontrum/spacy-api-docker", "Johannes Gontrum", "github")
| spaCy accessed by a REST API, wrapped in a Docker container.
+card("spacy-nlp-zeromq", "https://github.com/pasupulaphani/spacy-nlp-docker", "Phaninder Pasupula", "github")
| Docker image exposing spaCy with ZeroMQ bindings.
+card("spacy-nlp", "https://github.com/kengz/spacy-nlp", "Wah Loon Keng", "github")
| Expose spaCy NLP text parsing to Node.js (and other languages)
| via Socket.IO.
.u-text-right
+button("https://github.com/search?o=desc&q=spacy&s=stars&type=Repositories&utf8=%E2%9C%93", false, "primary", "small") See more projects on GitHub
+section("demos")
+h(2, "demos") Demos & Visualizations
+grid
+card("Neural coref", "https://huggingface.co/coref/", "Hugging Face")
+image("/assets/img/resources/neuralcoref.jpg").o-block-small
| State-of-the-art coreference resolution based on neural nets
| and spaCy.
+card("sense2vec", "https://demos.explosion.ai/sense2vec", "Matthew Honnibal and Ines Montani")
+image("/assets/img/resources/sense2vec.jpg").o-block-small
| Semantic analysis of the Reddit hivemind using sense2vec and spaCy.
+card("displaCy", "https://demos.explosion.ai/displacy", "Ines Montani")
+image("/assets/img/resources/displacy.jpg").o-block-small
| An open-source NLP visualiser for the modern web.
+card("displaCy ENT", "https://demos.explosion.ai/displacy-ent", "Ines Montani")
+image("/assets/img/resources/displacy-ent.jpg").o-block-small
| An open-source named entity visualiser for the modern web.
+section("books")
+h(2, "books") Books & Courses
+grid
+card("Natural Language Processing Fundamentals in Python", "https://www.datacamp.com/courses/natural-language-processing-fundamentals-in-python", "Katharine Jarmul (Datacamp, 2017)", "course")
| An interactive online course on everything you need to know about
| Natural Language Processing in Python, featuring spaCy and NLTK.
+card("Introduction to Machine Learning with Python: A Guide for Data Scientists", "https://books.google.com/books?id=vbQlDQAAQBAJ", "Andreas C. Müller and Sarah Guido (O'Reilly, 2016)", "book")
| Andreas is a lead developer of Scikit-Learn, and Sarah is a lead
| data scientist at Mashable. We're proud to get a mention.
+card("Text Analytics with Python", "https://www.amazon.com/Text-Analytics-Python-Real-World-Actionable/dp/148422387X", "Dipanjan Sarkar (Apress / Springer, 2016)", "book")
| A Practical Real-World Approach to Gaining Actionable Insights
| from your Data
+section("notebooks")
+h(2, "notebooks") Jupyter notebooks
+grid
+card("Modern NLP in Python", gh("spacy-notebooks", "notebooks/conference_notebooks/modern_nlp_in_python.ipynb"), "Patrick Harrison", "jupyter")
| Introduction to NLP in Python using spaCy and Gensim. Presented
| at PyData DC 2016.
+card("Advanced Text Analysis", gh("spacy-notebooks", "notebooks/conference_notebooks/advanced_text_analysis.ipynb"), "Jonathan Reeve", "jupyter")
| Advanced Text Analysis with spaCy and Scikit-Learn. Presented at
| NYU during NYCDH Week 2017.
.u-text-right
+button(gh("spacy-notebooks"), false, "primary", "small") See more notebooks on GitHub
+section("research")
+h(2, "research") Research systems
p Researchers are using spaCy to build ambitious, next-generation text processing technologies. spaCy is particularly popular amongst the biomedical NLP community, who are working on extracting knowledge from the huge volume of literature in their field.
+grid
+card(false, "https://www.semanticscholar.org/paper/Choosing-an-NLP-Library-for-Analyzing-Software-Doc-Omran-Treude/72f280e47e91b30af24205fa24d53247605aa591", "Fouad Nasser A. Al Omran et al. (2017)", "book", "third")
| Choosing an NLP Library for Analyzing Software Documentation: A
| Systematic Literature Review and a Series of Experiments
+card(false, "https://www.semanticscholar.org/paper/Mixing-Dirichlet-Topic-Models-and-Word-Embeddings-Moody/bf8116e06f7b498c6abfbf97aeb67d0838c08609", "Christopher E. Moody (2016)", "book", "third")
| Mixing Dirichlet Topic Models and Word Embeddings to Make lda2vec
+card(false, "https://www.semanticscholar.org/paper/Refactoring-the-Genia-Event-Extraction-Shared-Task-Kim-Wang/06d94b64a7bd2d3433f57caddad5084435d6a91f", "Jin-Dong Kim et al. (2016)", "book", "third")
| Refactoring the Genia Event Extraction Shared Task Toward a
| General Framework for IE-Driven KB Development
+card(false, "https://www.semanticscholar.org/paper/Predicting-Pre-click-Quality-for-Native-Zhou-Redi/564985430ff2fbc3a9daa9c2af8997b7f5046da8", "Ke Zhou et al. (2016)", "book", "third")
| Predicting Pre-click Quality for Native Advertisements
+card(false, "https://www.semanticscholar.org/paper/Threat-detection-in-online-discussions-Wester-%C3%98vrelid/f4150e2fb4d8646ebc2ea84f1a86afa1b593239b", "Aksel Wester et al. (2016)", "book", "third")
| Threat detection in online discussions
+card(false, "https://www.semanticscholar.org/paper/Distributional-semantics-for-understanding-spoken-Korpusik-Huang/5f55c5535e80d3e5ed7f1f0b89531e32725faff5", "Mandy Korpusik et al. (2016)", "book", "third")
| Distributional semantics for understanding spoken meal
| descriptions
.u-text-right
+button("https://scholar.google.com/scholar?scisbd=2&q=spacy&hl=en&as_sdt=1,5&as_vis=1", false, "primary", "small")
| See 200+ papers on Google Scholar

View File

@ -0,0 +1,300 @@
//- 💫 DOCS > USAGE > SPACY 101
include ../_includes/_mixins
p
| Whether you're new to spaCy, or just want to brush up on some
| NLP basics and implementation details — this page should have you covered.
| Each section will explain one of spaCy's features in simple terms and
| with examples or illustrations. Some sections will also reappear across
| the usage guides as a quick introduction.
+aside("Help us improve the docs")
| Did you spot a mistake or come across explanations that
| are unclear? We always appreciate improvement
| #[+a(gh("spaCy") + "/issues") suggestions] or
| #[+a(gh("spaCy") + "/pulls") pull requests]. You can find a "Suggest
| edits" link at the bottom of each page that points you to the source.
+h(2, "whats-spacy") What's spaCy?
+grid.o-no-block
+grid-col("half")
p
| spaCy is a #[strong free, open-source library] for advanced
| #[strong Natural Language Processing] (NLP) in Python.
p
| If you're working with a lot of text, you'll eventually want to
| know more about it. For example, what's it about? What do the
| words mean in context? Who is doing what to whom? What companies
| and products are mentioned? Which texts are similar to each other?
p
| spaCy is designed specifically for #[strong production use] and
| helps you build applications that process and "understand"
| large volumes of text. It can be used to build
| #[strong information extraction] or
| #[strong natural language understanding] systems, or to
| pre-process text for #[strong deep learning].
+table-of-contents
+item #[+a("#features") Features]
+item #[+a("#annotations") Linguistic annotations]
+item #[+a("#annotations-token") Tokenization]
+item #[+a("#annotations-pos-deps") POS tags and dependencies]
+item #[+a("#annotations-ner") Named entities]
+item #[+a("#vectors-similarity") Word vectors and similarity]
+item #[+a("#pipelines") Pipelines]
+item #[+a("#vocab") Vocab, hashes and lexemes]
+item #[+a("#serialization") Serialization]
+item #[+a("#training") Training]
+item #[+a("#language-data") Language data]
+item #[+a("#lightning-tour") Lightning tour]
+item #[+a("#architecture") Architecture]
+item #[+a("#community") Community & FAQ]
+h(3, "what-spacy-isnt") What spaCy isn't
+list
+item #[strong spaCy is not a platform or "an API"].
| Unlike a platform, spaCy does not provide a software as a service, or
| a web application. It's an open-source library designed to help you
| build NLP applications, not a consumable service.
+item #[strong spaCy is not an out-of-the-box chat bot engine].
| While spaCy can be used to power conversational applications, it's
| not designed specifically for chat bots, and only provides the
| underlying text processing capabilities.
+item #[strong spaCy is not research software].
| It's built on the latest research, but it's designed to get
| things done. This leads to fairly different design decisions than
| #[+a("https://github.com/nltk/nltk") NLTK]
| or #[+a("https://stanfordnlp.github.io/CoreNLP/") CoreNLP], which were
| created as platforms for teaching and research. The main difference
| is that spaCy is integrated and opinionated. spaCy tries to avoid asking
| the user to choose between multiple algorithms that deliver equivalent
| functionality. Keeping the menu small lets spaCy deliver generally better
| performance and developer experience.
+item #[strong spaCy is not a company].
| It's an open-source library. Our company publishing spaCy and other
| software is called #[+a(COMPANY_URL, true) Explosion AI].
+section("features")
+h(2, "features") Features
p
| In the documentation, you'll come across mentions of spaCy's
| features and capabilities. Some of them refer to linguistic concepts,
| while others are related to more general machine learning
| functionality.
+aside
| If one of spaCy's functionalities #[strong needs a model], it means
| that you need to have one of the available
| #[+a("/models") statistical models] installed. Models are used
| to #[strong predict] linguistic annotations — for example, if a word
| is a verb or a noun.
+table(["Name", "Description", "Needs model"])
+row
+cell #[strong Tokenization]
+cell Segmenting text into words, punctuation marks etc.
+cell #[+procon("con")]
+row
+cell #[strong Part-of-speech] (POS) #[strong Tagging]
+cell Assigning word types to tokens, like verb or noun.
+cell #[+procon("pro")]
+row
+cell #[strong Dependency Parsing]
+cell
| Assigning syntactic dependency labels, describing the
| relations between individual tokens, like subject or object.
+cell #[+procon("pro")]
+row
+cell #[strong Lemmatization]
+cell
| Assigning the base forms of words. For example, the lemma of
| "was" is "be", and the lemma of "rats" is "rat".
+cell #[+procon("pro")]
+row
+cell #[strong Sentence Boundary Detection] (SBD)
+cell Finding and segmenting individual sentences.
+cell #[+procon("pro")]
+row
+cell #[strong Named Entity Recognition] (NER)
+cell
| Labelling named "real-world" objects, like persons, companies
| or locations.
+cell #[+procon("pro")]
+row
+cell #[strong Similarity]
+cell
| Comparing words, text spans and documents and how similar
| they are to each other.
+cell #[+procon("pro")]
+row
+cell #[strong Text Classification]
+cell
| Assigning categories or labels to a whole document, or parts
| of a document.
+cell #[+procon("pro")]
+row
+cell #[strong Rule-based Matching]
+cell
| Finding sequences of tokens based on their texts and
| linguistic annotations, similar to regular expressions.
+cell #[+procon("con")]
+row
+cell #[strong Training]
+cell Updating and improving a statistical model's predictions.
+cell #[+procon("neutral")]
+row
+cell #[strong Serialization]
+cell Saving objects to files or byte strings.
+cell #[+procon("neutral")]
+h(2, "annotations") Linguistic annotations
p
| spaCy provides a variety of linguistic annotations to give you
| #[strong insights into a text&apos;s grammatical structure]. This
| includes the word types, like the parts of speech, and how the words
| are related to each other. For example, if you're analysing text, it
| makes a huge difference whether a noun is the subject of a sentence,
| or the object — or whether "google" is used as a verb, or refers to
| the website or company in a specific context.
p
| Once you've downloaded and installed a #[+a("/usage/models") model],
| you can load it via #[+api("spacy#load") #[code spacy.load()]]. This will
| return a #[code Language] object containing all components and data needed
| to process text. We usually call it #[code nlp]. Calling the #[code nlp]
| object on a string of text will return a processed #[code Doc]:
+code.
import spacy
nlp = spacy.load('en')
doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')
p
| Even though a #[code Doc] is processed — e.g. split into individual words
| and annotated — it still holds #[strong all information of the original text],
| like whitespace characters. You can always get the offset of a token into the
| original string, or reconstruct the original by joining the tokens and their
| trailing whitespace. This way, you'll never lose any information
| when processing text with spaCy.
+h(3, "annotations-token") Tokenization
include _spacy-101/_tokenization
+infobox
| To learn more about how spaCy's tokenization rules work in detail,
| how to #[strong customise and replace] the default tokenizer and how to
| #[strong add language-specific data], see the usage guides on
| #[+a("/usage/adding-languages") adding languages] and
| #[+a("/usage/linguistic-features#tokenization") customising the tokenizer].
+h(3, "annotations-pos-deps") Part-of-speech tags and dependencies
+tag-model("dependency parse")
include _spacy-101/_pos-deps
+infobox
| To learn more about #[strong part-of-speech tagging] and rule-based
| morphology, and how to #[strong navigate and use the parse tree]
| effectively, see the usage guides on
| #[+a("/usage/linguistic-features#pos-tagging") part-of-speech tagging] and
| #[+a("/usage/linguistic-features#dependency-parse") using the dependency parse].
+h(3, "annotations-ner") Named Entities
+tag-model("named entities")
include _spacy-101/_named-entities
+infobox
| To learn more about entity recognition in spaCy, how to
| #[strong add your own entities] to a document and how to
| #[strong train and update] the entity predictions of a model, see the
| usage guides on
| #[+a("/usage/linguistic-features#named-entities") named entity recognition] and
| #[+a("/usage/training#ner") training the named entity recognizer].
+h(2, "vectors-similarity") Word vectors and similarity
+tag-model("vectors")
include _spacy-101/_similarity
include _spacy-101/_word-vectors
+infobox
| To learn more about word vectors, how to #[strong customise them] and
| how to load #[strong your own vectors] into spaCy, see the usage
| guide on
| #[+a("/usage/vectors-similarity") using word vectors and semantic similarities].
+h(2, "pipelines") Pipelines
include _spacy-101/_pipelines
+infobox
| To learn more about #[strong how processing pipelines work] in detail,
| how to enable and disable their components, and how to
| #[strong create your own], see the usage guide on
| #[+a("/usage/processing-pipelines") language processing pipelines].
+h(2, "vocab") Vocab, hashes and lexemes
include _spacy-101/_vocab
+h(2, "serialization") Serialization
include _spacy-101/_serialization
+infobox
| To learn more about how to #[strong save and load your own models],
| see the usage guide on
| #[+a("/usage/training#saving-loading") saving and loading].
+h(2, "training") Training
include _spacy-101/_training
+infobox
| To learn more about #[strong training and updating] models, how to create
| training data and how to improve spaCy's named entity recognition models,
| see the usage guides on #[+a("/usage/training") training].
+h(2, "language-data") Language data
include _spacy-101/_language-data
+infobox
| To learn more about the individual components of the language data and
| how to #[strong add a new language] to spaCy in preparation for training
| a language model, see the usage guide on
| #[+a("/usage/adding-languages") adding languages].
+section("lightning-tour")
+h(2, "lightning-tour") Lightning tour
include _spacy-101/_lightning-tour
+section("architecture")
+h(2, "architecture") Architecture
include _spacy-101/_architecture
+section("community-faq")
+h(2, "community") Community & FAQ
include _spacy-101/_community-faq

View File

@ -0,0 +1,9 @@
//- 💫 DOCS > USAGE > TEXT CLASSIFICATION
include ../_includes/_mixins
+under-construction
+h(2, "example") Example
+github("spacy", "examples/training/train_textcat.py")

View File

@ -0,0 +1,33 @@
//- 💫 DOCS > USAGE > TRAINING
include ../_includes/_mixins
p
| This guide describes how to train new statistical models for spaCy's
| part-of-speech tagger, named entity recognizer and dependency parser.
| Once the model is trained, you can then
| #[+a("/usage/training#saving-loading") save and load] it.
+section("basics")
+h(2, "basics") Training basics
include _training/_basics
+section("ner")
+h(2, "ner") Training the named entity recognizer
include _training/_ner
+section("tagger-parser")
+h(2, "tagger-parser") Training the tagger and parser
include _training/_tagger-parser
+section("similarity")
+h(2, "similarity") Training a similarity model
include _training/_similarity
+section("textcat")
+h(2, "textcat") Training a text classification model
include _training/_textcat
+section("saving-loading")
+h(2, "saving-loading") Saving and loading models
include _training/_saving-loading

520
website/usage/v2.jade Normal file
View File

@ -0,0 +1,520 @@
//- 💫 DOCS > USAGE > WHAT'S NEW IN V2.0
include ../_includes/_mixins
p
| We're very excited to finally introduce spaCy v2.0! On this page, you'll
| find a summary of the new features, information on the backwards
| incompatibilities, including a handy overview of what's been renamed or
| deprecated. To help you make the most of v2.0, we also
| #[strong re-wrote almost all of the usage guides and API docs], and added
| more real-world examples. If you're new to spaCy, or just want to brush
| up on some NLP basics and the details of the library, check out
| the #[+a("/usage/spacy-101") spaCy 101 guide] that explains the most
| important concepts with examples and illustrations.
+h(2, "summary") Summary
+grid.o-no-block
+grid-col("half")
p This release features
| entirely new #[strong deep learning-powered models] for spaCy's tagger,
| parser and entity recognizer. The new models are #[strong 20x smaller]
| than the linear models that have powered spaCy until now: from 300 MB to
| only 15 MB.
p
| We've also made several usability improvements that are
| particularly helpful for #[strong production deployments]. spaCy
| v2 now fully supports the Pickle protocol, making it easy to use
| spaCy with #[+a("https://spark.apache.org/") Apache Spark]. The
| string-to-integer mapping is #[strong no longer stateful], making
| it easy to reconcile annotations made in different processes.
| Models are smaller and use less memory, and the APIs for serialization
| are now much more consistent.
+table-of-contents
+item #[+a("#summary") Summary]
+item #[+a("#features") New features]
+item #[+a("#features-models") Neural network models]
+item #[+a("#features-pipelines") Improved processing pipelines]
+item #[+a("#features-text-classification") Text classification]
+item #[+a("#features-hash-ids") Hash values instead of integer IDs]
+item #[+a("#features-serializer") Saving, loading and serialization]
+item #[+a("#features-displacy") displaCy visualizer]
+item #[+a("#features-language") Language data and lazy loading]
+item #[+a("#features-matcher") Revised matcher API and phrase matcher]
+item #[+a("#incompat") Backwards incompatibilities]
+item #[+a("#migrating") Migrating from spaCy v1.x]
+item #[+a("#benchmarks") Benchmarks]
p
| The main usability improvements you'll notice in spaCy v2.0 are around
| #[strong defining, training and loading your own models] and components.
| The new neural network models make it much easier to train a model from
| scratch, or update an existing model with a few examples. In v1.x, the
| statistical models depended on the state of the #[code Vocab]. If you
| taught the model a new word, you would have to save and load a lot of
| data — otherwise the model wouldn't correctly recall the features of your
| new example. That's no longer the case.
p
| Due to some clever use of hashing, the statistical models
| #[strong never change size], even as they learn new vocabulary items.
| The whole pipeline is also now fully differentiable. Even if you don't
| have explicitly annotated data, you can update spaCy using all the
| #[strong latest deep learning tricks] like adversarial training, noise
| contrastive estimation or reinforcement learning.
+section("features")
+h(2, "features") New features
p
| This section contains an overview of the most important
| #[strong new features and improvements]. The #[+a("/api") API docs]
| include additional deprecation notes. New methods and functions that
| were introduced in this version are marked with a #[+tag-new(2)] tag.
+h(3, "features-models") Convolutional neural network models
+aside-code("Example", "bash").
spacy download en # default English model
spacy download de # default German model
spacy download fr # default French model
spacy download es # default Spanish model
spacy download xx_ent_wiki_sm # multi-language NER
p
| spaCy v2.0 features new neural models for tagging,
| parsing and entity recognition. The models have
| been designed and implemented from scratch specifically for spaCy, to
| give you an unmatched balance of speed, size and accuracy. The new
| models are #[strong 10&times; smaller], #[strong 20% more accurate],
| and #[strong just as fast] as the previous generation.
| #[strong GPU usage] is now supported via
| #[+a("http://chainer.org") Chainer]'s CuPy module.
+infobox
| #[+label-inline Usage:] #[+a("/models") Models directory],
| #[+a("/usage/#gpu") Using spaCy with GPU]
+h(3, "features-pipelines") Improved processing pipelines
+aside-code("Example").
# Modify an existing pipeline
nlp = spacy.load('en')
nlp.pipeline.append(my_component)
# Register a factory to create a component
spacy.set_factory('my_factory', my_factory)
nlp = Language(pipeline=['my_factory', my_component])
p
| It's now much easier to #[strong customise the pipeline] with your own
| components, functions that receive a #[code Doc] object, modify and
| return it. If your component is stateful, you can define and register a
| factory which receives the shared #[code Vocab] object and returns a
|  component. spaCy's default components can be added to your pipeline by
| using their string IDs. This way, you won't have to worry about finding
| and implementing them — simply add #[code "tagger"] to the pipeline,
| and spaCy will know what to do.
+image
include ../assets/img/pipeline.svg
+infobox
| #[+label-inline API:] #[+api("language") #[code Language]]
| #[+label-inline Usage:] #[+a("/usage/processing-pipelines") Processing text]
+h(3, "features-text-classification") Text classification
+aside-code("Example").
from spacy.lang.en import English
nlp = English(pipeline=['tensorizer', 'tagger', 'textcat'])
p
| spaCy v2.0 lets you add text categorization models to spaCy pipelines.
| The model supports classification with multiple, non-mutually exclusive
| labels, so multiple labels can apply at once. You can change the model
| architecture rather easily, but by default, the #[code TextCategorizer]
| class uses a convolutional neural network to assign position-sensitive
| vectors to each word in the document.
+infobox
| #[+label-inline API:] #[+api("textcategorizer") #[code TextCategorizer]],
| #[+api("doc#attributes") #[code Doc.cats]],
| #[+api("goldparse#attributes") #[code GoldParse.cats]]#[br]
| #[+label-inline Usage:] #[+a("/usage/text-classification") Text classification]
+h(3, "features-hash-ids") Hash values instead of integer IDs
+aside-code("Example").
doc = nlp(u'I love coffee')
assert doc.vocab.strings[u'coffee'] == 3197928453018144401
assert doc.vocab.strings[3197928453018144401] == u'coffee'
beer_hash = doc.vocab.strings.add(u'beer')
assert doc.vocab.strings[u'beer'] == beer_hash
assert doc.vocab.strings[beer_hash] == u'beer'
p
| The #[+api("stringstore") #[code StringStore]] now resolves all strings
| to hash values instead of integer IDs. This means that the string-to-int
| mapping #[strong no longer depends on the vocabulary state], making a lot
| of workflows much simpler, especially during training. Unlike integer IDs
| in spaCy v1.x, hash values will #[strong always match] even across
| models. Strings can now be added explicitly using the new
| #[+api("stringstore#add") #[code StringStore.add]] method. A token's hash
| is available via #[code token.orth].
+infobox
| #[+label-inline API:] #[+api("stringstore") #[code StringStore]]
| #[+label-inline Usage:] #[+a("/usage/spacy-101#vocab") Vocab, hashes and lexemes 101]
+h(3, "features-serializer") Saving, loading and serialization
+aside-code("Example").
nlp = spacy.load('en') # shortcut link
nlp = spacy.load('en_core_web_sm') # package
nlp = spacy.load('/path/to/en') # unicode path
nlp = spacy.load(Path('/path/to/en')) # pathlib Path
nlp.to_disk('/path/to/nlp')
nlp = English().from_disk('/path/to/nlp')
p
| spaCy's serialization API has been made consistent across classes and
| objects. All container classes, i.e. #[code Language], #[code Doc],
| #[code Vocab] and #[code StringStore] now have a #[code to_bytes()],
| #[code from_bytes()], #[code to_disk()] and #[code from_disk()] method
| that supports the Pickle protocol.
p
| The improved #[code spacy.load] makes loading models easier and more
| transparent. You can load a model by supplying its
| #[+a("/usage/models#usage") shortcut link], the name of an installed
| #[+a("/usage/training#saving-loading") model package] or a path.
| The #[code Language] class to initialise will be determined based on the
| model's settings. For a blank language, you can import the class directly,
| e.g. #[code from spacy.lang.en import English].
+infobox
| #[+label-inline API:] #[+api("spacy#load") #[code spacy.load]], #[+api("binder") #[code Binder]]
| #[+label-inline Usage:] #[+a("/usage/training#saving-loading") Saving and loading]
+h(3, "features-displacy") displaCy visualizer with Jupyter support
+aside-code("Example").
from spacy import displacy
doc = nlp(u'This is a sentence about Facebook.')
displacy.serve(doc, style='dep') # run the web server
html = displacy.render(doc, style='ent') # generate HTML
p
| Our popular dependency and named entity visualizers are now an official
| part of the spaCy library. displaCy can run a simple web server, or
| generate raw HTML markup or SVG files to be exported. You can pass in one
| or more docs, and customise the style. displaCy also auto-detects whether
| you're running #[+a("https://jupyter.org") Jupyter] and will render the
| visualizations in your notebook.
+infobox
| #[+label-inline API:] #[+api("displacy") #[code displacy]]
| #[+label-inline Usage:] #[+a("/usage/visualizers") Visualizing spaCy]
+h(3, "features-language") Improved language data and lazy loading
p
| Language-specific data now lives in its own submodule, #[code spacy.lang].
| Languages are lazy-loaded, i.e. only loaded when you import a
| #[code Language] class, or load a model that initialises one. This allows
| languages to contain more custom data, e.g. lemmatizer lookup tables, or
| complex regular expressions. The language data has also been tidied up
| and simplified. spaCy now also supports simple lookup-based lemmatization.
+infobox
| #[+label-inline API:] #[+api("language") #[code Language]]
| #[+label-inline Code:] #[+src(gh("spaCy", "spacy/lang")) #[code spacy/lang]]
| #[+label-inline Usage:] #[+a("/usage/adding-languages") Adding languages]
+h(3, "features-matcher") Revised matcher API and phrase matcher
+aside-code("Example").
from spacy.matcher import Matcher, PhraseMatcher
matcher = Matcher(nlp.vocab)
matcher.add('HEARTS', None, [{'ORTH': '❤️', 'OP': '+'}])
phrasematcher = PhraseMatcher(nlp.vocab)
phrasematcher.add('OBAMA', None, nlp(u"Barack Obama"))
p
| Patterns can now be added to the matcher by calling
| #[+api("matcher-add") #[code matcher.add()]] with a match ID, an optional
| callback function to be invoked on each match, and one or more patterns.
| This allows you to write powerful, pattern-specific logic using only one
| matcher. For example, you might only want to merge some entity types,
| and set custom flags for other matched patterns. The new
| #[+api("phrasematcher") #[code PhraseMatcher]] lets you efficiently
| match very large terminology lists using #[code Doc] objects as match
| patterns.
+infobox
| #[+label-inline API:] #[+api("matcher") #[code Matcher]],
| #[+api("phrasematcher") #[code PhraseMatcher]]
| #[+label-inline Usage:] #[+a("/usage/rule-based-matching") Rule-based matching]
+section("incompat")
+h(2, "incompat") Backwards incompatibilities
+table(["Old", "New"])
+row
+cell
| #[code spacy.en]
| #[code spacy.xx]
+cell
| #[code spacy.lang.en]
| #[code spacy.lang.xx]
+row
+cell #[code orth]
+cell #[code lang.xx.lex_attrs]
+row
+cell #[code syntax.iterators]
+cell #[code lang.xx.syntax_iterators]
+row
+cell #[code Language.save_to_directory]
+cell #[+api("language#to_disk") #[code Language.to_disk]]
+row
+cell #[code Language.create_make_doc]
+cell #[+api("language#attributes") #[code Language.tokenizer]]
+row
+cell
| #[code Vocab.load]
| #[code Vocab.load_lexemes]
+cell
| #[+api("vocab#from_disk") #[code Vocab.from_disk]]
| #[+api("vocab#from_bytes") #[code Vocab.from_bytes]]
+row
+cell
| #[code Vocab.dump]
+cell
| #[+api("vocab#to_disk") #[code Vocab.to_disk]]#[br]
| #[+api("vocab#to_bytes") #[code Vocab.to_bytes]]
+row
+cell
| #[code Vocab.load_vectors]
| #[code Vocab.load_vectors_from_bin_loc]
+cell
| #[+api("vectors#from_disk") #[code Vectors.from_disk]]
| #[+api("vectors#from_bytes") #[code Vectors.from_bytes]]
+row
+cell
| #[code Vocab.dump_vectors]
+cell
| #[+api("vectors#to_disk") #[code Vectors.to_disk]]
| #[+api("vectors#to_bytes") #[code Vectors.to_bytes]]
+row
+cell
| #[code StringStore.load]
+cell
| #[+api("stringstore#from_disk") #[code StringStore.from_disk]]
| #[+api("stringstore#from_bytes") #[code StringStore.from_bytes]]
+row
+cell
| #[code StringStore.dump]
+cell
| #[+api("stringstore#to_disk") #[code StringStore.to_disk]]
| #[+api("stringstore#to_bytes") #[code StringStore.to_bytes]]
+row
+cell #[code Tokenizer.load]
+cell
| #[+api("tokenizer#from_disk") #[code Tokenizer.from_disk]]
| #[+api("tokenizer#from_bytes") #[code Tokenizer.from_bytes]]
+row
+cell #[code Tagger.load]
+cell
| #[+api("tagger#from_disk") #[code Tagger.from_disk]]
| #[+api("tagger#from_bytes") #[code Tagger.from_bytes]]
+row
+cell #[code DependencyParser.load]
+cell
| #[+api("dependencyparser#from_disk") #[code DependencyParser.from_disk]]
| #[+api("dependencyparser#from_bytes") #[code DependencyParser.from_bytes]]
+row
+cell #[code EntityRecognizer.load]
+cell
| #[+api("entityrecognizer#from_disk") #[code EntityRecognizer.from_disk]]
| #[+api("entityrecognizer#from_bytes") #[code EntityRecognizer.from_bytes]]
+row
+cell #[code Matcher.load]
+cell -
+row
+cell
| #[code Matcher.add_pattern]
| #[code Matcher.add_entity]
+cell #[+api("matcher#add") #[code Matcher.add]]
+row
+cell #[code Matcher.get_entity]
+cell #[+api("matcher#get") #[code Matcher.get]]
+row
+cell #[code Matcher.has_entity]
+cell #[+api("matcher#contains") #[code Matcher.__contains__]]
+row
+cell #[code Doc.read_bytes]
+cell #[+api("binder") #[code Binder]]
+row
+cell #[code Token.is_ancestor_of]
+cell #[+api("token#is_ancestor") #[code Token.is_ancestor]]
+row
+cell #[code cli.model]
+cell -
+section("migrating")
+h(2, "migrating") Migrating from spaCy 1.x
p
| Because we've made so many architectural changes to the library, we've
| tried to #[strong keep breaking changes to a minimum]. A lot of projects
| follow the philosophy that if you're going to break anything, you may as
| well break everything. We think migration is easier if there's a logic to
| what has changed.
p
| We've therefore followed a policy of avoiding breaking changes to the
| #[code Doc], #[code Span] and #[code Token] objects. This way, you can
| focus on only migrating the code that does training, loading and
| serialization — in other words, code that works with the #[code nlp]
| object directly. Code that uses the annotations should continue to work.
+infobox("Important note")
| If you've trained your own models, keep in mind that your train and
| runtime inputs must match. This means you'll have to
| #[strong retrain your models] with spaCy v2.0.
+h(3, "migrating-saving-loading") Saving, loading and serialization
p
| Double-check all calls to #[code spacy.load()] and make sure they don't
| use the #[code path] keyword argument. If you're only loading in binary
| data and not a model package that can construct its own #[code Language]
| class and pipeline, you should now use the
| #[+api("language#from_disk") #[code Language.from_disk()]] method.
+code-new.
nlp = spacy.load('/model')
nlp = English().from_disk('/model/data')
+code-old nlp = spacy.load('en', path='/model')
p
| Review all other code that writes state to disk or bytes.
| All containers now share the same, consistent API for saving and
| loading. Replace saving with #[code to_disk()] or #[code to_bytes()], and
| loading with #[code from_disk()] or #[code from_bytes()].
+code-new.
nlp.to_disk('/model')
nlp.vocab.to_disk('/vocab')
+code-old.
nlp.save_to_directory('/model')
nlp.vocab.dump('/vocab')
p
| If you've trained models with input from v1.x, you'll need to
| #[strong retrain them] with spaCy v2.0. Previous models will not
| be compatible with the new version.
+h(3, "migrating-strings") Strings and hash values
p
| The change from integer IDs to hash values may not actually affect your
| code very much. However, if you're adding strings to the vocab manually,
| you now need to call #[+api("stringstore#add") #[code StringStore.add()]]
| explicitly. You can also now be sure that the string-to-hash mapping will
| always match across vocabularies.
+code-new.
nlp.vocab.strings.add(u'coffee')
nlp.vocab.strings[u'coffee'] # 3197928453018144401
other_nlp.vocab.strings[u'coffee'] # 3197928453018144401
+code-old.
nlp.vocab.strings[u'coffee'] # 3672
other_nlp.vocab.strings[u'coffee'] # 40259
+h(3, "migrating-languages") Processing pipelines and language data
p
| If you're importing language data or #[code Language] classes, make sure
| to change your import statements to import from #[code spacy.lang]. If
| you've added your own custom language, it needs to be moved to
| #[code spacy/lang/xx] and adjusted accordingly.
+code-new from spacy.lang.en import English
+code-old from spacy.en import English
p
| If you've been using custom pipeline components, check out the new
| guide on #[+a("/usage/language-processing-pipelines") processing pipelines].
| Appending functions to the pipeline still works but you might be able
| to make this more convenient by registering "component factories".
| Components of the processing pipeline can now be disabled by passing a
| list of their names to the #[code disable] keyword argument on loading
| or processing.
+code-new.
nlp = spacy.load('en', disable=['tagger', 'ner'])
doc = nlp(u"I don't want parsed", disable=['parser'])
+code-old.
nlp = spacy.load('en', tagger=False, entity=False)
doc = nlp(u"I don't want parsed", parse=False)
+h(3, "migrating-matcher") Adding patterns and callbacks to the matcher
p
| If you're using the matcher, you can now add patterns in one step. This
| should be easy to update — simply merge the ID, callback and patterns
| into one call to #[+api("matcher#add") #[code Matcher.add()]].
+code-new.
matcher.add('GoogleNow', merge_phrases, [{ORTH: 'Google'}, {ORTH: 'Now'}])
+code-old.
matcher.add_entity('GoogleNow', on_match=merge_phrases)
matcher.add_pattern('GoogleNow', [{ORTH: 'Google'}, {ORTH: 'Now'}])
p
| If you've been using #[strong acceptor functions], you'll need to move
| this logic into the
| #[+a("/usage/rule-based-matching#on_match") #[code on_match] callbacks].
| The callback function is invoked on every match and will give you access to
| the doc, the index of the current match and the list of all matches. This
| lets you accept or reject the match, as well as define the actions to be
| triggered.
+section("benchmarks")
+h(2, "benchmarks") Benchmarks
include _facts-figures/_benchmarks-models

View File

@ -0,0 +1,18 @@
//- 💫 DOCS > USAGE > WORD VECTORS & SIMILARITIES
include ../_includes/_mixins
+section("basics")
include _vectors-similarity/_basics
+section("in-context")
+h(2, "in-context") Similarities in context
include _vectors-similarity/_in-context
+section("custom")
+h(2, "custom") Customising word vectors
include _vectors-similarity/_custom
+section("gpu")
+h(2, "gpu") Storing vectors on a GPU
include _vectors-similarity/_gpu

View File

@ -1,6 +1,6 @@
//- 💫 DOCS > USAGE > VISUALIZERS
include ../../_includes/_mixins
include ../_includes/_mixins
p
| As of v2.0, our popular visualizers, #[+a(DEMOS_URL + "/displacy") displaCy]
@ -19,8 +19,8 @@ p
| #[+a("#jupyter") ready to be rendered and exported].
+aside("What about the old visualizers?")
| Our JavaScript-based visualizers #[+src(gh("displacy")) displacy.js] and
| #[+src(gh("displacy-ent")) displacy-ent.js] will still be available on
| Our JavaScript-based visualizers #[+src(gh("displacy")) #[code displacy.js]] and
| #[+src(gh("displacy-ent")) #[code displacy-ent.js]] will still be available on
| GitHub. If you're looking to implement web-based visualizations, we
| generally recommend using those instead of spaCy's built-in
| #[code displacy] module. It'll allow your application to perform all
@ -148,7 +148,7 @@ p
| will render whichever spans and labels it receives. This makes it
| especially easy to work with custom entity types. By default, displaCy
| comes with colours for all
| #[+a("/docs/api/annotation#named-entities") entity types supported by spaCy].
| #[+a("/api/annotation#named-entities") entity types supported by spaCy].
| If you're using custom entity types, you can use the #[code colors]
| setting to add your own colours for them.
@ -274,7 +274,7 @@ p
| #[code jupyter] keyword argument e.g. to return raw HTML in a notebook,
| or to force Jupyter rendering if auto-detection fails.
+image("/assets/img/docs/displacy_jupyter.jpg", 700, false, "Example of using the displaCy dependency and named entity visualizer in a Jupyter notebook")
+image("/assets/img/displacy_jupyter.jpg", 700, false, "Example of using the displaCy dependency and named entity visualizer in a Jupyter notebook")
p
| Internally, displaCy imports #[code display] and #[code HTML] from