Merge branch 'master' into spacy.io

Commit b31d01e8cc by Ines Montani, 2019-10-02 16:52:42 +02:00
160 changed files with 2765 additions and 8664840 deletions

.flake8 (deleted, 10 lines)

@ -1,10 +0,0 @@
[flake8]
ignore = E203, E266, E501, E731, W503
max-line-length = 80
select = B,C,E,F,W,T4,B9
exclude =
.env,
.git,
__pycache__,
_tokenizer_exceptions_list.py,
spacy/__init__.py

.github/contributors/er-raoniz.md (new file, 106 lines)

@ -0,0 +1,106 @@
# spaCy contributor agreement
This spaCy Contributor Agreement (**"SCA"**) is based on the
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
The SCA applies to any contribution that you make to any product or project
managed by us (the **"project"**), and sets out the intellectual property rights
you grant to us in the contributed materials. The term **"us"** shall mean
[ExplosionAI GmbH](https://explosion.ai/legal). The term
**"you"** shall mean the person or entity identified below.
If you agree to be bound by these terms, fill in the information requested
below and include the filled-in version with your first pull request, under the
folder [`.github/contributors/`](/.github/contributors/). The name of the file
should be your GitHub username, with the extension `.md`. For example, the user
example_user would create the file `.github/contributors/example_user.md`.
Read this agreement carefully before signing. These terms and conditions
constitute a binding legal agreement.
## Contributor Agreement
1. The term "contribution" or "contributed materials" means any source code,
object code, patch, tool, sample, graphic, specification, manual,
documentation, or any other material posted or submitted by you to the project.
2. With respect to any worldwide copyrights, or copyright applications and
registrations, in your contribution:
* you hereby assign to us joint ownership, and to the extent that such
assignment is or becomes invalid, ineffective or unenforceable, you hereby
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
royalty-free, unrestricted license to exercise all rights under those
copyrights. This includes, at our option, the right to sublicense these same
rights to third parties through multiple levels of sublicensees or other
licensing arrangements;
* you agree that each of us can do all things in relation to your
contribution as if each of us were the sole owners, and if one of us makes
a derivative work of your contribution, the one who makes the derivative
work (or has it made) will be the sole owner of that derivative work;
* you agree that you will not assert any moral rights in your contribution
against us, our licensees or transferees;
* you agree that we may register a copyright in your contribution and
exercise all ownership rights associated with it; and
* you agree that neither of us has any duty to consult with, obtain the
consent of, pay or render an accounting to the other for any use or
distribution of your contribution.
3. With respect to any patents you own, or that you can license without payment
to any third party, you hereby grant to us a perpetual, irrevocable,
non-exclusive, worldwide, no-charge, royalty-free license to:
* make, have made, use, sell, offer to sell, import, and otherwise transfer
your contribution in whole or in part, alone or in combination with or
included in any product, work or materials arising out of the project to
which your contribution was submitted, and
* at our option, to sublicense these same rights to third parties through
multiple levels of sublicensees or other licensing arrangements.
4. Except as set out above, you keep all right, title, and interest in your
contribution. The rights that you grant to us under these terms are effective
on the date you first submitted a contribution to us, even if your submission
took place before the date you sign these terms.
5. You covenant, represent, warrant and agree that:
* Each contribution that you submit is and shall be an original work of
authorship and you can legally grant the rights set out in this SCA;
* to the best of your knowledge, each contribution will not violate any
third party's copyrights, trademarks, patents, or other intellectual
property rights; and
* each contribution shall be in compliance with U.S. export control laws and
other applicable export and import laws. You agree to notify us if you
become aware of any circumstance which would make any of the foregoing
representations inaccurate in any respect. We may publicly disclose your
participation in the project, including the fact that you have signed the SCA.
6. This SCA is governed by the laws of the State of California and applicable
U.S. Federal law. Any choice of law rules will not apply.
7. Please place an “x” on one of the applicable statements below. Please do NOT
mark both statements:
* [x] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.
* [ ] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.
## Contributor Details
| Field | Entry |
|------------------------------- | -------------------- |
| Name | Rahul Soni |
| Company name (if applicable) | |
| Title or role (if applicable) | |
| Date | 30th September, 2019 |
| GitHub username | er-raoniz |
| Website (optional) | |

View File

@ -13,7 +13,6 @@ install:
- "pip install -e ."
script:
- "cat /proc/cpuinfo | grep flags | head -n 1"
- "pip install pytest pytest-timeout"
- "python -m pytest --tb=native spacy"
branches:
except:

View File

@ -1,7 +1,8 @@
recursive-include include *.h
recursive-include spacy *.txt
include LICENSE
include README.md
include pyproject.toml
include bin/spacy
include pyproject.toml
recursive-exclude spacy/lang *.json
recursive-include spacy/lang *.json.gz

View File

@ -3,24 +3,26 @@
# spaCy: Industrial-strength NLP
spaCy is a library for advanced Natural Language Processing in Python and
Cython. It's built on the very latest research, and was designed from day one
to be used in real products. spaCy comes with
[pre-trained statistical models](https://spacy.io/models) and word vectors, and
Cython. It's built on the very latest research, and was designed from day one to
be used in real products. spaCy comes with
[pretrained statistical models](https://spacy.io/models) and word vectors, and
currently supports tokenization for **50+ languages**. It features
state-of-the-art speed, convolutional **neural network models** for tagging,
parsing and **named entity recognition** and easy **deep learning** integration.
It's commercial open-source software, released under the MIT license.
💫 **Version 2.1 out now!** [Check out the release notes here.](https://github.com/explosion/spaCy/releases)
💫 **Version 2.2 out now!**
[Check out the release notes here.](https://github.com/explosion/spaCy/releases)
[![Azure Pipelines](https://img.shields.io/azure-devops/build/explosion-ai/public/8/master.svg?logo=azure-devops&style=flat-square)](https://dev.azure.com/explosion-ai/public/_build?definitionId=8)
[![Travis Build Status](https://img.shields.io/travis/explosion/spaCy/master.svg?style=flat-square&logo=travis)](https://travis-ci.org/explosion/spaCy)
[![Current Release Version](https://img.shields.io/github/release/explosion/spacy.svg?style=flat-square)](https://github.com/explosion/spaCy/releases)
[![pypi Version](https://img.shields.io/pypi/v/spacy.svg?style=flat-square)](https://pypi.org/project/spacy/)
[![conda Version](https://img.shields.io/conda/vn/conda-forge/spacy.svg?style=flat-square)](https://anaconda.org/conda-forge/spacy)
[![Azure Pipelines](<https://img.shields.io/azure-devops/build/explosion-ai/public/8/master.svg?logo=azure-devops&style=flat-square&label=build+(3.x)>)](https://dev.azure.com/explosion-ai/public/_build?definitionId=8)
[![Travis Build Status](<https://img.shields.io/travis/explosion/spaCy/master.svg?style=flat-square&logo=travis-ci&logoColor=white&label=build+(2.7)>)](https://travis-ci.org/explosion/spaCy)
[![Current Release Version](https://img.shields.io/github/release/explosion/spacy.svg?style=flat-square&logo=github)](https://github.com/explosion/spaCy/releases)
[![pypi Version](https://img.shields.io/pypi/v/spacy.svg?style=flat-square&logo=pypi&logoColor=white)](https://pypi.org/project/spacy/)
[![conda Version](https://img.shields.io/conda/vn/conda-forge/spacy.svg?style=flat-square&logo=conda-forge&logoColor=white)](https://anaconda.org/conda-forge/spacy)
[![Python wheels](https://img.shields.io/badge/wheels-%E2%9C%93-4c1.svg?longCache=true&style=flat-square&logo=python&logoColor=white)](https://github.com/explosion/wheelwright/releases)
[![PyPi downloads](https://img.shields.io/pypi/dm/spacy?style=flat-square)](https://pypi.org/project/spacy/)
[![Conda downloads](https://img.shields.io/conda/dn/conda-forge/spacy?style=flat-square)](https://anaconda.org/conda-forge/spacy)
[![PyPi downloads](https://img.shields.io/pypi/dm/spacy?style=flat-square&logo=pypi&logoColor=white)](https://pypi.org/project/spacy/)
[![Conda downloads](https://img.shields.io/conda/dn/conda-forge/spacy?style=flat-square&logo=conda-forge&logoColor=white)](https://anaconda.org/conda-forge/spacy)
[![Model downloads](https://img.shields.io/github/downloads/explosion/spacy-models/total?style=flat-square&label=model+downloads)](https://github.com/explosion/spacy-models)
[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg?style=flat-square)](https://github.com/ambv/black)
[![spaCy on Twitter](https://img.shields.io/twitter/follow/spacy_io.svg?style=social&label=Follow)](https://twitter.com/spacy_io)
@ -30,7 +32,7 @@ It's commercial open-source software, released under the MIT license.
| --------------- | -------------------------------------------------------------- |
| [spaCy 101] | New to spaCy? Here's everything you need to know! |
| [Usage Guides] | How to use spaCy and its features. |
| [New in v2.1] | New features, backwards incompatibilities and migration guide. |
| [New in v2.2] | New features, backwards incompatibilities and migration guide. |
| [API Reference] | The detailed reference for spaCy's API. |
| [Models] | Download statistical language models for spaCy. |
| [Universe] | Libraries, extensions, demos, books and courses. |
@ -38,7 +40,7 @@ It's commercial open-source software, released under the MIT license.
| [Contribute] | How to contribute to the spaCy project and code base. |
[spacy 101]: https://spacy.io/usage/spacy-101
[new in v2.1]: https://spacy.io/usage/v2-1
[new in v2.2]: https://spacy.io/usage/v2-2
[usage guides]: https://spacy.io/usage/
[api reference]: https://spacy.io/api/
[models]: https://spacy.io/models
@ -48,10 +50,13 @@ It's commercial open-source software, released under the MIT license.
## 💬 Where to ask questions
The spaCy project is maintained by [@honnibal](https://github.com/honnibal)
and [@ines](https://github.com/ines). Please understand that we won't be able
to provide individual support via email. We also believe that help is much more
valuable if it's shared publicly, so that more people can benefit from it.
The spaCy project is maintained by [@honnibal](https://github.com/honnibal) and
[@ines](https://github.com/ines), along with core contributors
[@svlandeg](https://github.com/svlandeg) and
[@adrianeboyd](https://github.com/adrianeboyd). Please understand that we won't
be able to provide individual support via email. We also believe that help is
much more valuable if it's shared publicly, so that more people can benefit from
it.
| Type | Platforms |
| ------------------------ | ------------------------------------------------------ |
@ -70,7 +75,7 @@ valuable if it's shared publicly, so that more people can benefit from it.
- Non-destructive **tokenization**
- **Named entity** recognition
- Support for **50+ languages**
- Pre-trained [statistical models](https://spacy.io/models) and word vectors
- Pretrained [statistical models](https://spacy.io/models) and word vectors
- State-of-the-art speed
- Easy **deep learning** integration
- Part-of-speech tagging
@ -91,7 +96,8 @@ valuable if it's shared publicly, so that more people can benefit from it.
For detailed installation instructions, see the
[documentation](https://spacy.io/usage).
- **Operating system**: macOS / OS X · Linux · Windows (Cygwin, MinGW, Visual Studio)
- **Operating system**: macOS / OS X · Linux · Windows (Cygwin, MinGW, Visual
Studio)
- **Python version**: Python 2.7, 3.5+ (only 64 bit)
- **Package managers**: [pip] · [conda] (via `conda-forge`)
@ -100,13 +106,20 @@ For detailed installation instructions, see the
### pip
Using pip, spaCy releases are available as source packages and binary wheels
(as of `v2.0.13`).
Using pip, spaCy releases are available as source packages and binary wheels (as
of `v2.0.13`).
```bash
pip install spacy
```
To install additional data tables for lemmatization in **spaCy v2.2+** you can
run `pip install spacy[lookups]` or install
[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data)
separately. The lookups package is needed to create blank models with
lemmatization data, and to lemmatize in languages that don't yet come with
pretrained models and aren't powered by third-party libraries.
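For quick reference, the two install routes described above look like this on the command line (a minimal sketch; the package and extra names are the ones given in the paragraph):

```bash
# Install spaCy together with the optional lookups tables (spaCy v2.2+)
pip install spacy[lookups]

# Or install the tables as a separate package
pip install spacy-lookups-data
```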
When using pip it is generally recommended to install packages in a virtual
environment to avoid modifying system state:
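The commands that typically follow this sentence look roughly like this (a generic virtualenv sketch, not necessarily the exact block from the README):

```bash
python -m venv .env
source .env/bin/activate
pip install spacy
```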
@ -126,9 +139,9 @@ conda config --add channels conda-forge
conda install spacy
```
For the feedstock including the build recipe and configuration,
check out [this repository](https://github.com/conda-forge/spacy-feedstock).
Improvements and pull requests to the recipe and setup are always appreciated.
For the feedstock including the build recipe and configuration, check out
[this repository](https://github.com/conda-forge/spacy-feedstock). Improvements
and pull requests to the recipe and setup are always appreciated.
### Updating spaCy
@ -151,10 +164,10 @@ with the new version.
## Download models
As of v1.7.0, models for spaCy can be installed as **Python packages**.
This means that they're a component of your application, just like any
other module. Models can be installed using spaCy's `download` command,
or manually by pointing pip to a path or URL.
As of v1.7.0, models for spaCy can be installed as **Python packages**. This
means that they're a component of your application, just like any other module.
Models can be installed using spaCy's `download` command, or manually by
pointing pip to a path or URL.
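As a small illustration of the two routes just described (the model name is only an example and the local path is a placeholder):

```bash
# Via spaCy's download command
python -m spacy download en_core_web_sm

# Or manually, pointing pip at a local archive or URL
pip install /path/to/en_core_web_sm-2.2.0.tar.gz
```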
| Documentation | |
| ---------------------- | ------------------------------------------------------------- |
@ -203,8 +216,8 @@ doc = nlp(u"This is a sentence.")
### Support for older versions
If you're using an older version (`v1.6.0` or below), you can still download
and install the old models from within spaCy using `python -m spacy.en.download all`
If you're using an older version (`v1.6.0` or below), you can still download and
install the old models from within spaCy using `python -m spacy.en.download all`
or `python -m spacy.de.download all`. The `.tar.gz` archives are also
[attached to the v1.6.0 release](https://github.com/explosion/spaCy/tree/v1.6.0).
To download and install the models manually, unpack the archive, drop the
@ -219,9 +232,10 @@ source. That is the common way if you want to make changes to the code base.
You'll need to make sure that you have a development environment consisting of a
Python distribution including header files, a compiler,
[pip](https://pip.pypa.io/en/latest/installing/),
[virtualenv](https://virtualenv.pypa.io/en/latest/) and [git](https://git-scm.com)
installed. The compiler part is the trickiest. How to do that depends on your
system. See notes on Ubuntu, OS X and Windows for details.
[virtualenv](https://virtualenv.pypa.io/en/latest/) and
[git](https://git-scm.com) installed. The compiler part is the trickiest. How to
do that depends on your system. See notes on Ubuntu, OS X and Windows for
details.
```bash
# make sure you are using the latest pip
@ -240,8 +254,8 @@ Compared to regular install via pip, [requirements.txt](requirements.txt)
additionally installs developer dependencies such as Cython. For more details
and instructions, see the documentation on
[compiling spaCy from source](https://spacy.io/usage#source) and the
[quickstart widget](https://spacy.io/usage#section-quickstart) to get
the right commands for your platform and Python version.
[quickstart widget](https://spacy.io/usage#section-quickstart) to get the right
commands for your platform and Python version.
### Ubuntu
@ -259,11 +273,12 @@ and git preinstalled.
### Windows
Install a version of the [Visual C++ Build Tools](https://visualstudio.microsoft.com/visual-cpp-build-tools/) or
[Visual Studio Express](https://visualstudio.microsoft.com/vs/express/)
that matches the version that was used to compile your Python
interpreter. For official distributions these are VS 2008 (Python 2.7),
VS 2010 (Python 3.4) and VS 2015 (Python 3.5).
Install a version of the
[Visual C++ Build Tools](https://visualstudio.microsoft.com/visual-cpp-build-tools/)
or [Visual Studio Express](https://visualstudio.microsoft.com/vs/express/) that
matches the version that was used to compile your Python interpreter. For
official distributions these are VS 2008 (Python 2.7), VS 2010 (Python 3.4) and
VS 2015 (Python 3.5).
## Run tests

View File

@ -349,7 +349,7 @@ def initialize_pipeline(nlp, docs, golds, config, device):
def _load_pretrained_tok2vec(nlp, loc):
"""Load pre-trained weights for the 'token-to-vector' part of the component
"""Load pretrained weights for the 'token-to-vector' part of the component
models, which is typically a CNN. See 'spacy pretrain'. Experimental.
"""
with Path(loc).open("rb") as file_:
@ -445,7 +445,7 @@ class TreebankPaths(object):
gpu_device=("Use GPU", "option", "g", int),
use_oracle_segments=("Use oracle segments", "flag", "G", int),
vectors_dir=(
"Path to directory with pre-trained vectors, named e.g. en/",
"Path to directory with pretrained vectors, named e.g. en/",
"option",
"v",
Path,

View File

@ -38,10 +38,10 @@ def create_kb(
# check the length of the nlp vectors
if "vectors" in nlp.meta and nlp.vocab.vectors.size:
input_dim = nlp.vocab.vectors_length
logger.info("Loaded pre-trained vectors of size %s" % input_dim)
logger.info("Loaded pretrained vectors of size %s" % input_dim)
else:
raise ValueError(
"The `nlp` object should have access to pre-trained word vectors, "
"The `nlp` object should have access to pretrained word vectors, "
" cf. https://spacy.io/usage/models#languages."
)

View File

@ -83,7 +83,7 @@ def main(
# check the length of the nlp vectors
if "vectors" not in nlp.meta or not nlp.vocab.vectors.size:
raise ValueError(
"The `nlp` object should have access to pre-trained word vectors, "
"The `nlp` object should have access to pretrained word vectors, "
" cf. https://spacy.io/usage/models#languages."
)

View File

@ -65,7 +65,7 @@ def main(
# check that there is a NER component in the pipeline
if "ner" not in nlp.pipe_names:
raise ValueError("The `nlp` object should have a pre-trained `ner` component.")
raise ValueError("The `nlp` object should have a pretrained `ner` component.")
# STEP 2: create a training dataset from WP
logger.info("STEP 2: reading training dataset from {}".format(training_path))

examples/streamlit_spacy.py (new file, 151 lines)

@ -0,0 +1,151 @@
# coding: utf-8
"""
Example of a Streamlit app for an interactive spaCy model visualizer. You can
either download the script, or point streamlit run to the raw URL of this
file. For more details, see https://streamlit.io.
Installation:
pip install streamlit
python -m spacy download en_core_web_sm
python -m spacy download en_core_web_md
python -m spacy download de_core_news_sm
Usage:
streamlit run streamlit_spacy.py
"""
from __future__ import unicode_literals
import streamlit as st
import spacy
from spacy import displacy
import pandas as pd
SPACY_MODEL_NAMES = ["en_core_web_sm", "en_core_web_md", "de_core_news_sm"]
DEFAULT_TEXT = "Mark Zuckerberg is the CEO of Facebook."
HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem">{}</div>"""
@st.cache(ignore_hash=True)
def load_model(name):
return spacy.load(name)
@st.cache(ignore_hash=True)
def process_text(model_name, text):
nlp = load_model(model_name)
return nlp(text)
st.sidebar.title("Interactive spaCy visualizer")
st.sidebar.markdown(
"""
Process text with [spaCy](https://spacy.io) models and visualize named entities,
dependencies and more. Uses spaCy's built-in
[displaCy](http://spacy.io/usage/visualizers) visualizer under the hood.
"""
)
spacy_model = st.sidebar.selectbox("Model name", SPACY_MODEL_NAMES)
model_load_state = st.info(f"Loading model '{spacy_model}'...")
nlp = load_model(spacy_model)
model_load_state.empty()
text = st.text_area("Text to analyze", DEFAULT_TEXT)
doc = process_text(spacy_model, text)
if "parser" in nlp.pipe_names:
st.header("Dependency Parse & Part-of-speech tags")
st.sidebar.header("Dependency Parse")
split_sents = st.sidebar.checkbox("Split sentences", value=True)
collapse_punct = st.sidebar.checkbox("Collapse punctuation", value=True)
collapse_phrases = st.sidebar.checkbox("Collapse phrases")
compact = st.sidebar.checkbox("Compact mode")
options = {
"collapse_punct": collapse_punct,
"collapse_phrases": collapse_phrases,
"compact": compact,
}
docs = [span.as_doc() for span in doc.sents] if split_sents else [doc]
for sent in docs:
html = displacy.render(sent, options=options)
# Double newlines seem to mess with the rendering
html = html.replace("\n\n", "\n")
if split_sents and len(docs) > 1:
st.markdown(f"> {sent.text}")
st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True)
if "ner" in nlp.pipe_names:
st.header("Named Entities")
st.sidebar.header("Named Entities")
label_set = nlp.get_pipe("ner").labels
labels = st.sidebar.multiselect("Entity labels", label_set, label_set)
html = displacy.render(doc, style="ent", options={"ents": labels})
# Newlines seem to mess with the rendering
html = html.replace("\n", " ")
st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True)
attrs = ["text", "label_", "start", "end", "start_char", "end_char"]
if "entity_linker" in nlp.pipe_names:
attrs.append("kb_id_")
data = [
[str(getattr(ent, attr)) for attr in attrs]
for ent in doc.ents
if ent.label_ in labels
]
df = pd.DataFrame(data, columns=attrs)
st.dataframe(df)
if "textcat" in nlp.pipe_names:
st.header("Text Classification")
st.markdown(f"> {text}")
df = pd.DataFrame(doc.cats.items(), columns=("Label", "Score"))
st.dataframe(df)
vector_size = nlp.meta.get("vectors", {}).get("width", 0)
if vector_size:
st.header("Vectors & Similarity")
st.code(nlp.meta["vectors"])
text1 = st.text_input("Text or word 1", "apple")
text2 = st.text_input("Text or word 2", "orange")
doc1 = process_text(spacy_model, text1)
doc2 = process_text(spacy_model, text2)
similarity = doc1.similarity(doc2)
if similarity > 0.5:
st.success(similarity)
else:
st.error(similarity)
st.header("Token attributes")
if st.button("Show token attributes"):
attrs = [
"idx",
"text",
"lemma_",
"pos_",
"tag_",
"dep_",
"head",
"ent_type_",
"ent_iob_",
"shape_",
"is_alpha",
"is_ascii",
"is_digit",
"is_punct",
"like_num",
]
data = [[str(getattr(token, attr)) for attr in attrs] for token in doc]
df = pd.DataFrame(data, columns=attrs)
st.dataframe(df)
st.header("JSON Doc")
if st.button("Show JSON Doc"):
st.json(doc.to_json())
st.header("JSON model meta")
if st.button("Show JSON model meta"):
st.json(nlp.meta)

View File

@ -27,7 +27,7 @@ from bin.wiki_entity_linking.train_descriptions import EntityEncoder
# Q7381115 (Russ Cochran): publisher
ENTITIES = {"Q2146908": ("American golfer", 342), "Q7381115": ("publisher", 17)}
INPUT_DIM = 300 # dimension of pre-trained input vectors
INPUT_DIM = 300 # dimension of pretrained input vectors
DESC_WIDTH = 64 # dimension of output entity vectors
@ -39,7 +39,7 @@ DESC_WIDTH = 64 # dimension of output entity vectors
)
def main(vocab_path=None, model=None, output_dir=None, n_iter=50):
"""Load the model, create the KB and pretrain the entity encodings.
Either an nlp model or a vocab is needed to provide access to pre-trained word embeddings.
Either an nlp model or a vocab is needed to provide access to pretrained word embeddings.
If an output_dir is provided, the KB will be stored there in a file 'kb'.
When providing an nlp model, the updated vocab will also be written to a directory in the output_dir."""
if model is None and vocab_path is None:

View File

@ -1,9 +1,9 @@
"""This script is experimental.
Try pre-training the CNN component of the text categorizer using a cheap
language modelling-like objective. Specifically, we load pre-trained vectors
language modelling-like objective. Specifically, we load pretrained vectors
(from something like word2vec, GloVe, FastText etc), and use the CNN to
predict the tokens' pre-trained vectors. This isn't as easy as it sounds:
predict the tokens' pretrained vectors. This isn't as easy as it sounds:
we're not merely doing compression here, because heavy dropout is applied,
including over the input words. This means the model must often (50% of the time)
use the context in order to predict the word.

View File

@ -2,7 +2,7 @@
# coding: utf8
"""Example of training an additional entity type
This script shows how to add a new entity type to an existing pre-trained NER
This script shows how to add a new entity type to an existing pretrained NER
model. To keep the example short and simple, only four sentences are provided
as examples. In practice, you'll need many more — a few hundred would be a
good start. You will also likely need to mix in examples of other entity

fabfile.py (122 changed lines)

@ -10,113 +10,145 @@ import sys
PWD = path.dirname(__file__)
ENV = environ['VENV_DIR'] if 'VENV_DIR' in environ else '.env'
ENV = environ["VENV_DIR"] if "VENV_DIR" in environ else ".env"
VENV_DIR = Path(PWD) / ENV
@contextlib.contextmanager
def virtualenv(name, create=False, python='/usr/bin/python3.6'):
def virtualenv(name, create=False, python="/usr/bin/python3.6"):
python = Path(python).resolve()
env_path = VENV_DIR
if create:
if env_path.exists():
shutil.rmtree(str(env_path))
local('{python} -m venv {env_path}'.format(python=python, env_path=VENV_DIR))
local("{python} -m venv {env_path}".format(python=python, env_path=VENV_DIR))
def wrapped_local(cmd, env_vars=[], capture=False, direct=False):
return local('source {}/bin/activate && {}'.format(env_path, cmd),
shell='/bin/bash', capture=False)
return local(
"source {}/bin/activate && {}".format(env_path, cmd),
shell="/bin/bash",
capture=False,
)
yield wrapped_local
def env(lang='python3.6'):
def env(lang="python3.6"):
if VENV_DIR.exists():
local('rm -rf {env}'.format(env=VENV_DIR))
if lang.startswith('python3'):
local('{lang} -m venv {env}'.format(lang=lang, env=VENV_DIR))
local("rm -rf {env}".format(env=VENV_DIR))
if lang.startswith("python3"):
local("{lang} -m venv {env}".format(lang=lang, env=VENV_DIR))
else:
local('{lang} -m pip install virtualenv --no-cache-dir'.format(lang=lang))
local('{lang} -m virtualenv {env} --no-cache-dir'.format(lang=lang, env=VENV_DIR))
local("{lang} -m pip install virtualenv --no-cache-dir".format(lang=lang))
local(
"{lang} -m virtualenv {env} --no-cache-dir".format(lang=lang, env=VENV_DIR)
)
with virtualenv(VENV_DIR) as venv_local:
print(venv_local('python --version', capture=True))
venv_local('pip install --upgrade setuptools --no-cache-dir')
venv_local('pip install pytest --no-cache-dir')
venv_local('pip install wheel --no-cache-dir')
venv_local('pip install -r requirements.txt --no-cache-dir')
venv_local('pip install pex --no-cache-dir')
print(venv_local("python --version", capture=True))
venv_local("pip install --upgrade setuptools --no-cache-dir")
venv_local("pip install pytest --no-cache-dir")
venv_local("pip install wheel --no-cache-dir")
venv_local("pip install -r requirements.txt --no-cache-dir")
venv_local("pip install pex --no-cache-dir")
def install():
with virtualenv(VENV_DIR) as venv_local:
venv_local('pip install dist/*.tar.gz')
venv_local("pip install dist/*.tar.gz")
def make():
with lcd(path.dirname(__file__)):
local('export PYTHONPATH=`pwd` && source .env/bin/activate && python setup.py build_ext --inplace',
shell='/bin/bash')
local(
"export PYTHONPATH=`pwd` && source .env/bin/activate && python setup.py build_ext --inplace",
shell="/bin/bash",
)
def sdist():
with virtualenv(VENV_DIR) as venv_local:
with lcd(path.dirname(__file__)):
local('python -m pip install -U setuptools')
local('python setup.py sdist')
venv_local("python -m pip install -U setuptools srsly")
venv_local("python setup.py sdist")
def wheel():
with virtualenv(VENV_DIR) as venv_local:
with lcd(path.dirname(__file__)):
venv_local('python setup.py bdist_wheel')
venv_local("python setup.py bdist_wheel")
def pex():
with virtualenv(VENV_DIR) as venv_local:
with lcd(path.dirname(__file__)):
sha = local('git rev-parse --short HEAD', capture=True)
venv_local('pex dist/*.whl -e spacy -o dist/spacy-%s.pex' % sha,
direct=True)
sha = local("git rev-parse --short HEAD", capture=True)
venv_local(
"pex dist/*.whl -e spacy -o dist/spacy-%s.pex" % sha, direct=True
)
def clean():
with lcd(path.dirname(__file__)):
local('rm -f dist/*.whl')
local('rm -f dist/*.pex')
local("rm -f dist/*.whl")
local("rm -f dist/*.pex")
with virtualenv(VENV_DIR) as venv_local:
venv_local('python setup.py clean --all')
venv_local("python setup.py clean --all")
def test():
with virtualenv(VENV_DIR) as venv_local:
with lcd(path.dirname(__file__)):
venv_local('pytest -x spacy/tests')
venv_local("pytest -x spacy/tests")
def train():
args = environ.get('SPACY_TRAIN_ARGS', '')
args = environ.get("SPACY_TRAIN_ARGS", "")
with virtualenv(VENV_DIR) as venv_local:
venv_local('spacy train {args}'.format(args=args))
venv_local("spacy train {args}".format(args=args))
def conll17(treebank_dir, experiment_dir, vectors_dir, config, corpus=''):
is_not_clean = local('git status --porcelain', capture=True)
def conll17(treebank_dir, experiment_dir, vectors_dir, config, corpus=""):
is_not_clean = local("git status --porcelain", capture=True)
if is_not_clean:
print("Repository is not clean")
print(is_not_clean)
sys.exit(1)
git_sha = local('git rev-parse --short HEAD', capture=True)
config_checksum = local('sha256sum {config}'.format(config=config), capture=True)
experiment_dir = Path(experiment_dir) / '{}--{}'.format(config_checksum[:6], git_sha)
git_sha = local("git rev-parse --short HEAD", capture=True)
config_checksum = local("sha256sum {config}".format(config=config), capture=True)
experiment_dir = Path(experiment_dir) / "{}--{}".format(
config_checksum[:6], git_sha
)
if not experiment_dir.exists():
experiment_dir.mkdir()
test_data_dir = Path(treebank_dir) / 'ud-test-v2.0-conll2017'
test_data_dir = Path(treebank_dir) / "ud-test-v2.0-conll2017"
assert test_data_dir.exists()
assert test_data_dir.is_dir()
if corpus:
corpora = [corpus]
else:
corpora = ['UD_English', 'UD_Chinese', 'UD_Japanese', 'UD_Vietnamese']
corpora = ["UD_English", "UD_Chinese", "UD_Japanese", "UD_Vietnamese"]
local('cp {config} {experiment_dir}/config.json'.format(config=config, experiment_dir=experiment_dir))
local(
"cp {config} {experiment_dir}/config.json".format(
config=config, experiment_dir=experiment_dir
)
)
with virtualenv(VENV_DIR) as venv_local:
for corpus in corpora:
venv_local('spacy ud-train {treebank_dir} {experiment_dir} {config} {corpus} -v {vectors_dir}'.format(
treebank_dir=treebank_dir, experiment_dir=experiment_dir, config=config, corpus=corpus, vectors_dir=vectors_dir))
venv_local('spacy ud-run-test {test_data_dir} {experiment_dir} {corpus}'.format(
test_data_dir=test_data_dir, experiment_dir=experiment_dir, config=config, corpus=corpus))
venv_local(
"spacy ud-train {treebank_dir} {experiment_dir} {config} {corpus} -v {vectors_dir}".format(
treebank_dir=treebank_dir,
experiment_dir=experiment_dir,
config=config,
corpus=corpus,
vectors_dir=vectors_dir,
)
)
venv_local(
"spacy ud-run-test {test_data_dir} {experiment_dir} {corpus}".format(
test_data_dir=test_data_dir,
experiment_dir=experiment_dir,
config=config,
corpus=corpus,
)
)

View File

@ -1,10 +1,3 @@
[build-system]
requires = ["setuptools",
"wheel>0.32.0,<0.33.0",
"Cython",
"cymem>=2.0.2,<2.1.0",
"preshed>=2.0.1,<2.1.0",
"murmurhash>=0.28.0,<1.1.0",
"thinc>=7.0.8,<7.1.0",
]
requires = ["setuptools"]
build-backend = "setuptools.build_meta"

View File

@ -15,7 +15,7 @@ pathlib==1.0.1; python_version < "3.4"
jsonschema>=2.6.0,<3.1.0
# Development dependencies
cython>=0.25
pytest>=4.0.0,<4.1.0
pytest>=4.6.5
pytest-timeout>=1.3.0,<2.0.0
mock>=2.0.0,<3.0.0
flake8>=3.5.0,<3.6.0

setup.cfg (new file, 102 lines)

@ -0,0 +1,102 @@
[metadata]
description = Industrial-strength Natural Language Processing (NLP) in Python
url = https://spacy.io
author = Explosion
author_email = contact@explosion.ai
license = MIT
long_description = file: README.md
long_description_content_type = text/markdown
classifiers =
Development Status :: 5 - Production/Stable
Environment :: Console
Intended Audience :: Developers
Intended Audience :: Science/Research
License :: OSI Approved :: MIT License
Operating System :: POSIX :: Linux
Operating System :: MacOS :: MacOS X
Operating System :: Microsoft :: Windows
Programming Language :: Cython
Programming Language :: Python :: 2
Programming Language :: Python :: 2.7
Programming Language :: Python :: 3
Programming Language :: Python :: 3.5
Programming Language :: Python :: 3.6
Programming Language :: Python :: 3.7
Topic :: Scientific/Engineering
[options]
zip_safe = false
include_package_data = true
scripts =
bin/spacy
python_requires = >=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*
setup_requires =
wheel
cython>=0.25
# We also need our Cython packages here to compile against
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
murmurhash>=0.28.0,<1.1.0
thinc>=7.1.1,<7.2.0
install_requires =
numpy>=1.15.0
murmurhash>=0.28.0,<1.1.0
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
thinc>=7.1.1,<7.2.0
blis>=0.4.0,<0.5.0
plac<1.0.0,>=0.9.6
requests>=2.13.0,<3.0.0
wasabi>=0.2.0,<1.1.0
srsly>=0.1.0,<1.1.0
pathlib==1.0.1; python_version < "3.4"
[options.extras_require]
lookups =
spacy_lookups_data>=0.0.5,<0.2.0
cuda =
thinc_gpu_ops>=0.0.1,<0.1.0
cupy>=5.0.0b4
cuda80 =
thinc_gpu_ops>=0.0.1,<0.1.0
cupy-cuda80>=5.0.0b4
cuda90 =
thinc_gpu_ops>=0.0.1,<0.1.0
cupy-cuda90>=5.0.0b4
cuda91 =
thinc_gpu_ops>=0.0.1,<0.1.0
cupy-cuda91>=5.0.0b4
cuda92 =
thinc_gpu_ops>=0.0.1,<0.1.0
cupy-cuda92>=5.0.0b4
cuda100 =
thinc_gpu_ops>=0.0.1,<0.1.0
cupy-cuda100>=5.0.0b4
# Language tokenizers with external dependencies
ja =
mecab-python3==0.7
ko =
natto-py==0.9.0
th =
pythainlp>=2.0
[bdist_wheel]
universal = false
[sdist]
formats = gztar
[flake8]
ignore = E203, E266, E501, E731, W503
max-line-length = 80
select = B,C,E,F,W,T4,B9
exclude =
.env,
.git,
__pycache__,
_tokenizer_exceptions_list.py,
spacy/__init__.py
[tool:pytest]
markers =
slow
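The extras defined under `[options.extras_require]` above are installed with pip's extras syntax; a brief sketch (extra names taken from the section above, quotes added for shells that expand brackets):

```bash
pip install "spacy[lookups]"      # optional lemmatization tables
pip install "spacy[cuda92]"       # GPU support built against CUDA 9.2
pip install "spacy[lookups,ja]"   # extras can be combined
```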

View File

@ -27,9 +27,6 @@ def is_new_osx():
return False
PACKAGE_DATA = {"": ["*.pyx", "*.pxd", "*.txt", "*.tokens", "*.json"]}
PACKAGES = find_packages()
@ -86,22 +83,6 @@ if is_new_osx():
LINK_OPTIONS["other"].append("-nodefaultlibs")
USE_OPENMP_DEFAULT = "0" if sys.platform != "darwin" else None
if os.environ.get("USE_OPENMP", USE_OPENMP_DEFAULT) == "1":
if sys.platform == "darwin":
COMPILE_OPTIONS["other"].append("-fopenmp")
LINK_OPTIONS["other"].append("-fopenmp")
PACKAGE_DATA["spacy.platform.darwin.lib"] = ["*.dylib"]
PACKAGES.append("spacy.platform.darwin.lib")
elif sys.platform == "win32":
COMPILE_OPTIONS["msvc"].append("/openmp")
else:
COMPILE_OPTIONS["other"].append("-fopenmp")
LINK_OPTIONS["other"].append("-fopenmp")
# By subclassing build_extensions we have the actual compiler that will be used which is really known only after finalize_options
# http://stackoverflow.com/questions/724664/python-distutils-how-to-get-a-compiler-that-is-going-to-be-used
class build_ext_options:
@ -132,23 +113,6 @@ def generate_cython(root, source):
raise RuntimeError("Running cythonize failed")
def gzip_language_data(root, source):
print("Compressing language data")
import srsly
from pathlib import Path
base = Path(root) / source
for jsonfile in base.glob("**/*.json"):
outfile = jsonfile.with_suffix(jsonfile.suffix + ".gz")
if outfile.is_file() and outfile.stat().st_mtime > jsonfile.stat().st_mtime:
# If the gz is newer it doesn't need updating
print("Skipping {}, already compressed".format(jsonfile))
continue
data = srsly.read_json(jsonfile)
srsly.write_gzip_json(outfile, data)
print("Compressed {}".format(jsonfile))
def is_source_release(path):
return os.path.exists(os.path.join(path, "PKG-INFO"))
@ -185,9 +149,6 @@ def setup_package():
about = {}
exec(f.read(), about)
with io.open(os.path.join(root, "README.md"), encoding="utf8") as f:
readme = f.read()
include_dirs = [
get_python_inc(plat_specific=True),
os.path.join(root, "include"),
@ -203,7 +164,6 @@ def setup_package():
for mod_name in MOD_NAMES:
mod_path = mod_name.replace(".", "/") + ".cpp"
extra_link_args = []
extra_compile_args = []
# ???
# Imported from patch from @mikepb
# See Issue #267. Running blind here...
@ -224,69 +184,12 @@ def setup_package():
if not is_source_release(root):
generate_cython(root, "spacy")
gzip_language_data(root, "spacy/lang")
setup(
name="spacy",
zip_safe=False,
packages=PACKAGES,
package_data=PACKAGE_DATA,
description=about["__summary__"],
long_description=readme,
long_description_content_type="text/markdown",
author=about["__author__"],
author_email=about["__email__"],
version=about["__version__"],
url=about["__uri__"],
license=about["__license__"],
ext_modules=ext_modules,
scripts=["bin/spacy"],
install_requires=[
"numpy>=1.15.0",
"murmurhash>=0.28.0,<1.1.0",
"cymem>=2.0.2,<2.1.0",
"preshed>=2.0.1,<2.1.0",
"thinc>=7.0.8,<7.1.0",
"blis>=0.2.2,<0.3.0",
"plac<1.0.0,>=0.9.6",
"requests>=2.13.0,<3.0.0",
"wasabi>=0.2.0,<1.1.0",
"srsly>=0.1.0,<1.1.0",
'pathlib==1.0.1; python_version < "3.4"',
],
setup_requires=["wheel"],
extras_require={
"cuda": ["thinc_gpu_ops>=0.0.1,<0.1.0", "cupy>=5.0.0b4"],
"cuda80": ["thinc_gpu_ops>=0.0.1,<0.1.0", "cupy-cuda80>=5.0.0b4"],
"cuda90": ["thinc_gpu_ops>=0.0.1,<0.1.0", "cupy-cuda90>=5.0.0b4"],
"cuda91": ["thinc_gpu_ops>=0.0.1,<0.1.0", "cupy-cuda91>=5.0.0b4"],
"cuda92": ["thinc_gpu_ops>=0.0.1,<0.1.0", "cupy-cuda92>=5.0.0b4"],
"cuda100": ["thinc_gpu_ops>=0.0.1,<0.1.0", "cupy-cuda100>=5.0.0b4"],
# Language tokenizers with external dependencies
"ja": ["mecab-python3==0.7"],
"ko": ["natto-py==0.9.0"],
"th": ["pythainlp>=2.0"],
},
python_requires=">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*",
classifiers=[
"Development Status :: 5 - Production/Stable",
"Environment :: Console",
"Intended Audience :: Developers",
"Intended Audience :: Science/Research",
"License :: OSI Approved :: MIT License",
"Operating System :: POSIX :: Linux",
"Operating System :: MacOS :: MacOS X",
"Operating System :: Microsoft :: Windows",
"Programming Language :: Cython",
"Programming Language :: Python :: 2",
"Programming Language :: Python :: 2.7",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.4",
"Programming Language :: Python :: 3.5",
"Programming Language :: Python :: 3.6",
"Programming Language :: Python :: 3.7",
"Topic :: Scientific/Engineering",
],
cmdclass={"build_ext": build_ext_subclass},
)

View File

@ -1,17 +1,7 @@
# inspired from:
# https://python-packaging-user-guide.readthedocs.org/en/latest/single_source_version/
# https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py
# fmt: off
__title__ = "spacy"
__version__ = "2.1.8"
__summary__ = "Industrial-strength Natural Language Processing (NLP) with Python and Cython"
__uri__ = "https://spacy.io"
__author__ = "Explosion AI"
__email__ = "contact@explosion.ai"
__license__ = "MIT"
__version__ = "2.2.0"
__release__ = True
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
__shortcuts__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/shortcuts-v2.json"

View File

@ -96,9 +96,9 @@ def pretrain(
"""
Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
using an approximate language-modelling objective. Specifically, we load
pre-trained vectors, and train a component like a CNN, BiLSTM, etc to predict
vectors which match the pre-trained ones. The weights are saved to a directory
after each epoch. You can then pass a path to one of these pre-trained weights
pretrained vectors, and train a component like a CNN, BiLSTM, etc to predict
vectors which match the pretrained ones. The weights are saved to a directory
after each epoch. You can then pass a path to one of these pretrained weights
files to the 'spacy train' command.
This technique may be especially helpful if you have little labelled data.
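A rough sketch of how this pretraining step is typically driven from the CLI (all paths are placeholders, `en_vectors_web_lg` is just an example vectors package, and the `--init-tok2vec` flag is assumed to correspond to the `init_tok2vec` parameter shown in the training code further down):

```bash
# Pretrain the tok2vec layer on raw text against existing vectors
python -m spacy pretrain texts.jsonl en_vectors_web_lg /tmp/pretrain

# Later, reuse one of the saved weight files when training a pipeline
python -m spacy train en /tmp/model train.json dev.json --init-tok2vec /tmp/pretrain/model9.bin
```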
@ -156,7 +156,7 @@ def pretrain(
subword_features=True, # Set to False for Chinese etc
),
)
# Load in pre-trained weights
# Load in pretrained weights
if init_tok2vec is not None:
components = _load_pretrained_tok2vec(nlp, init_tok2vec)
msg.text("Loaded pretrained tok2vec for: {}".format(components))

View File

@ -222,7 +222,7 @@ def train(
nlp._optimizer = None
# Load in pre-trained weights
# Load in pretrained weights
if init_tok2vec is not None:
components = _load_pretrained_tok2vec(nlp, init_tok2vec)
msg.text("Loaded pretrained tok2vec for: {}".format(components))
@ -417,7 +417,7 @@ def _load_vectors(nlp, vectors):
def _load_pretrained_tok2vec(nlp, loc):
"""Load pre-trained weights for the 'token-to-vector' part of the component
"""Load pretrained weights for the 'token-to-vector' part of the component
models, which is typically a CNN. See 'spacy pretrain'. Experimental.
"""
with loc.open("rb") as file_:

View File

@ -5,7 +5,7 @@ import uuid
from .templates import TPL_DEP_SVG, TPL_DEP_WORDS, TPL_DEP_ARCS, TPL_ENTS
from .templates import TPL_ENT, TPL_ENT_RTL, TPL_FIGURE, TPL_TITLE, TPL_PAGE
from ..util import minify_html, escape_html, get_entry_points
from ..util import minify_html, escape_html, get_entry_points, ENTRY_POINTS
from ..errors import Errors
@ -242,7 +242,7 @@ class EntityRenderer(object):
"CARDINAL": "#e4e7d2",
"PERCENT": "#e4e7d2",
}
user_colors = get_entry_points("spacy_displacy_colors")
user_colors = get_entry_points(ENTRY_POINTS.displacy_colors)
for user_color in user_colors.values():
colors.update(user_color)
colors.update(options.get("colors", {}))

View File

@ -84,6 +84,17 @@ class Warnings(object):
W018 = ("Entity '{entity}' already exists in the Knowledge base.")
W019 = ("Changing vectors name from {old} to {new}, to avoid clash with "
"previously loaded vectors. See Issue #3853.")
W020 = ("Unnamed vectors. This won't allow multiple vectors models to be "
"loaded. (Shape: {shape})")
W021 = ("Unexpected hash collision in PhraseMatcher. Matches may be "
"incorrect. Modify PhraseMatcher._terminal_hash to fix.")
W022 = ("Training a new part-of-speech tagger using a model with no "
"lemmatization rules or data. This means that the trained model "
"may not be able to lemmatize correctly. If this is intentional "
"or the language you're using doesn't have lemmatization data, "
"you can ignore this warning by setting SPACY_WARNING_IGNORE=W022. "
"If this is surprising, make sure you have the spacy-lookups-data "
"package installed.")
@add_codes
@ -313,7 +324,9 @@ class Errors(object):
E101 = ("NODE_NAME should be a new node and NBOR_NAME should already have "
"have been declared in previous edges.")
E102 = ("Can't merge non-disjoint spans. '{token}' is already part of "
"tokens to merge.")
"tokens to merge. If you want to find the longest non-overlapping "
"spans, you can use the util.filter_spans helper:\n"
"https://spacy.io/api/top-level#util.filter_spans")
E103 = ("Trying to set conflicting doc.ents: '{span1}' and '{span2}'. A "
"token can only be part of one entity, so make sure the entities "
"you're setting don't overlap.")
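The new E102 message above points at `util.filter_spans`; a minimal, model-free sketch of how that helper is used before merging spans (the example text and span offsets are made up):

```python
import spacy
from spacy.util import filter_spans

nlp = spacy.blank("en")
doc = nlp("New York City is in New York State")
spans = [doc[0:3], doc[0:2], doc[5:8]]  # overlapping candidate spans
with doc.retokenize() as retokenizer:
    # keep only the longest non-overlapping spans before merging
    for span in filter_spans(spans):
        retokenizer.merge(span)
```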
@ -343,7 +356,7 @@ class Errors(object):
E113 = ("The newly split token can only have one root (head = 0).")
E114 = ("The newly split token needs to have a root (head = 0).")
E115 = ("All subtokens must have associated heads.")
E116 = ("Cannot currently add labels to pre-trained text classifier. Add "
E116 = ("Cannot currently add labels to pretrained text classifier. Add "
"labels before training begins. This functionality was available "
"in previous versions, but had significant bugs that led to poor "
"performance.")
@ -457,17 +470,42 @@ class Errors(object):
E160 = ("Can't find language data file: {path}")
E161 = ("Found an internal inconsistency when predicting entity links. "
"This is likely a bug in spaCy, so feel free to open an issue.")
E162 = ("Cannot evaluate textcat model on data with different labels.\n"
"Labels in model: {model_labels}\nLabels in evaluation "
"data: {eval_labels}")
E163 = ("cumsum was found to be unstable: its last element does not "
"correspond to sum")
E164 = ("x is neither increasing nor decreasing: {}.")
E165 = ("Only one class present in y_true. ROC AUC score is not defined in "
"that case.")
E166 = ("Can only merge DocBins with the same pre-defined attributes.\n"
"Current DocBin: {current}\nOther DocBin: {other}")
E167 = ("Unknown morphological feature: '{feat}' ({feat_id}). This can "
"happen if the tagger was trained with a different set of "
"morphological features. If you're using a pretrained model, make "
"sure that your models are up to date:\npython -m spacy validate")
E168 = ("Unknown field: {field}")
E169 = ("Can't find module: {module}")
E170 = ("Cannot apply transition {name}: invalid for the current state.")
E171 = ("Matcher.add received invalid on_match callback argument: expected "
"callable or None, but got: {arg_type}")
E172 = ("The Lemmatizer.load classmethod is deprecated. To create a "
"Lemmatizer, initialize the class directly. See the docs for "
"details: https://spacy.io/api/lemmatizer")
E173 = ("As of v2.2, the Lemmatizer is initialized with an instance of "
"Lookups containing the lemmatization tables. See the docs for "
"details: https://spacy.io/api/lemmatizer#init")
@add_codes
class TempErrors(object):
T003 = ("Resizing pre-trained Tagger models is not currently supported.")
T003 = ("Resizing pretrained Tagger models is not currently supported.")
T004 = ("Currently parser depth is hard-coded to 1. Received: {value}.")
T007 = ("Can't yet set {attr} from Span. Vote for this feature on the "
"issue tracker: http://github.com/explosion/spaCy/issues")
T008 = ("Bad configuration of Tagger. This is probably a bug within "
"spaCy. We changed the name of an internal attribute for loading "
"pre-trained vectors, and the class has been passed the old name "
"pretrained vectors, and the class has been passed the old name "
"(pretrained_dims) but not the new name (pretrained_vectors).")

View File

@ -21,8 +21,6 @@ class BengaliDefaults(Language.Defaults):
prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES
infixes = TOKENIZER_INFIXES
# Lemma rules source: উচ্চতর বাংলা ব্যাকরণ ও রচনা (Higher Bengali Grammar and Composition) by Prof. Niranjan Adhikari and Prof. Dr. Safiuddin Ahmad
resources = {"lemma_rules": "lemma_rules.json"}
class Bengali(Language):

View File

@ -1,57 +0,0 @@
{
"noun": [
["টা", ""],
["টি", ""],
["খান", ""],
["খানা", ""],
["খানি", ""],
["গাছা", ""],
["গাছি", ""],
["ছড়া", ""],
["কে", ""],
["ে", ""],
["তে", ""],
["র", ""],
["রা", ""],
["রে", ""],
["ের", ""],
["েরা", ""],
["দের", ""],
["দেরকে", ""],
["গুলা", ""],
["গুলো", ""],
["গুলি", ""],
["কুল", ""],
["গণ", ""],
["দল", ""],
["পাল", ""],
["পুঞ্জ", ""],
["মণ্ডলী", ""],
["মালা", ""],
["রাজি", ""],
["বৃন্দ", ""],
["বর্গ", ""],
["শ্রেণী", ""],
["শ্রেনি", ""],
["রাশি", ""],
["সকল", ""],
["মহল", ""],
["াবলি", ""],
["", "0"],
["১", "1"],
["২", "2"],
["৩", "3"],
["", "4"],
["৫", "5"],
["৬", "6"],
["", "7"],
["৮", "8"],
["৯", "9"]
],
"punct": [
["“", "\""],
["”", "\""],
["", "'"],
["", "'"]
]
}

View File

@ -24,7 +24,6 @@ class CatalanDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS
infixes = TOKENIZER_INFIXES
resources = {"lemma_lookup": "lemma_lookup.json"}
class Catalan(Language):

File diff suppressed because it is too large

View File

@ -29,7 +29,6 @@ class DanishDefaults(Language.Defaults):
suffixes = TOKENIZER_SUFFIXES
tag_map = TAG_MAP
stop_words = STOP_WORDS
resources = {"lemma_lookup": "lemma_lookup.json"}
class Danish(Language):

File diff suppressed because it is too large

View File

@ -26,7 +26,20 @@ class GermanDefaults(Language.Defaults):
tag_map = TAG_MAP
stop_words = STOP_WORDS
syntax_iterators = SYNTAX_ITERATORS
resources = {"lemma_lookup": "lemma_lookup.json"}
single_orth_variants = [
{"tags": ["$("], "variants": ["", "..."]},
{"tags": ["$("], "variants": ["-", "", "", "--", "---", "——"]},
]
paired_orth_variants = [
{
"tags": ["$("],
"variants": [("'", "'"), (",", "'"), ("", ""), ("", ""), ("", "")],
},
{
"tags": ["$("],
"variants": [("``", "''"), ('"', '"'), ("", ""), ("»", "«"), ("«", "»")],
},
]
class German(Language):

File diff suppressed because it is too large

View File

@ -13,8 +13,9 @@ from ..tokenizer_exceptions import BASE_EXCEPTIONS
from .norm_exceptions import NORM_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...lookups import Lookups
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups, get_lemma_tables
from ...util import update_exc, add_lookups
class GreekDefaults(Language.Defaults):
@ -31,16 +32,12 @@ class GreekDefaults(Language.Defaults):
suffixes = TOKENIZER_SUFFIXES
infixes = TOKENIZER_INFIXES
syntax_iterators = SYNTAX_ITERATORS
resources = {
"lemma_index": "lemmatizer/lemma_index.json",
"lemma_exc": "lemmatizer/lemma_exc.json",
"lemma_rules": "lemmatizer/lemma_rules.json",
}
@classmethod
def create_lemmatizer(cls, nlp=None, lookups=None):
lemma_rules, lemma_index, lemma_exc, lemma_lookup = get_lemma_tables(lookups)
return GreekLemmatizer(lemma_index, lemma_exc, lemma_rules, lemma_lookup)
if lookups is None:
lookups = Lookups()
return GreekLemmatizer(lookups)
class Greek(Language):

View File

@ -0,0 +1,40 @@
# coding: utf8
from __future__ import unicode_literals
from ...lemmatizer import Lemmatizer
class GreekLemmatizer(Lemmatizer):
"""
Greek language lemmatizer applies the default rule based lemmatization
procedure with some modifications for better Greek language support.
The first modification is that it checks if the word for lemmatization is
already a lemma and if yes, it just returns it.
The second modification is about removing the base forms function which is
not applicable for Greek language.
"""
def lemmatize(self, string, index, exceptions, rules):
string = string.lower()
forms = []
if string in index:
forms.append(string)
return forms
forms.extend(exceptions.get(string, []))
oov_forms = []
if not forms:
for old, new in rules:
if string.endswith(old):
form = string[: len(string) - len(old)] + new
if not form:
pass
elif form in index or not form.isalpha():
forms.append(form)
else:
oov_forms.append(form)
if not forms:
forms.extend(oov_forms)
if not forms:
forms.append(string)
return list(set(forms))
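A small usage sketch for the class above; the import path and the `Lookups`-based constructor are assumptions based on the v2.2 lemmatizer API visible elsewhere in this diff, and the toy tables are made up:

```python
from spacy.lookups import Lookups
from spacy.lang.el.lemmatizer import GreekLemmatizer  # assumed module path

index = {"καφές"}                    # known lemmas
exceptions = {"καφέδες": ["καφές"]}  # irregular forms
rules = [["ες", "α"]]                # suffix rewrite rules

lemmatizer = GreekLemmatizer(Lookups())
# The exception table takes precedence over the suffix rules here:
print(lemmatizer.lemmatize("καφέδες", index, exceptions, rules))  # ['καφές']
```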

View File

@ -1,77 +0,0 @@
# coding: utf8
from __future__ import unicode_literals
from ....symbols import NOUN, VERB, ADJ, PUNCT
class GreekLemmatizer(object):
"""
Greek language lemmatizer applies the default rule based lemmatization
procedure with some modifications for better Greek language support.
The first modification is that it checks if the word for lemmatization is
already a lemma and if yes, it just returns it.
The second modification is about removing the base forms function which is
not applicable for Greek language.
"""
@classmethod
def load(cls, path, index=None, exc=None, rules=None, lookup=None):
return cls(index, exc, rules, lookup)
def __init__(self, index=None, exceptions=None, rules=None, lookup=None):
self.index = index
self.exc = exceptions
self.rules = rules
self.lookup_table = lookup if lookup is not None else {}
def __call__(self, string, univ_pos, morphology=None):
if not self.rules:
return [self.lookup_table.get(string, string)]
if univ_pos in (NOUN, "NOUN", "noun"):
univ_pos = "noun"
elif univ_pos in (VERB, "VERB", "verb"):
univ_pos = "verb"
elif univ_pos in (ADJ, "ADJ", "adj"):
univ_pos = "adj"
elif univ_pos in (PUNCT, "PUNCT", "punct"):
univ_pos = "punct"
else:
return list(set([string.lower()]))
lemmas = lemmatize(
string,
self.index.get(univ_pos, {}),
self.exc.get(univ_pos, {}),
self.rules.get(univ_pos, []),
)
return lemmas
def lookup(self, string):
if string in self.lookup_table:
return self.lookup_table[string]
return string
def lemmatize(string, index, exceptions, rules):
string = string.lower()
forms = []
if string in index:
forms.append(string)
return forms
forms.extend(exceptions.get(string, []))
oov_forms = []
if not forms:
for old, new in rules:
if string.endswith(old):
form = string[: len(string) - len(old)] + new
if not form:
pass
elif form in index or not form.isalpha():
forms.append(form)
else:
oov_forms.append(form)
if not forms:
forms.extend(oov_forms)
if not forms:
forms.append(string)
return list(set(forms))

View File

@ -1,236 +0,0 @@
{
"adj": {
"χειρότερος": ["κακός"],
"χειρότερη": ["κακός"],
"χειρότερης": ["κακός"],
"χειρότερο": ["κακός"],
"χειρότεροι": ["κακός"],
"χειρότερων": ["κακός"],
"χειρότερου": ["κακός"],
"βέλτιστος": ["καλός"],
"βέλτιστη": ["καλός"],
"βέλτιστης": ["καλός"],
"βέλτιστο": ["καλός"],
"βέλτιστοι": ["καλός"],
"βέλτιστων": ["καλός"],
"βέλτιστου": ["καλός"],
"ελάχιστος": ["λίγος"],
"ελάχιστα": ["λίγος"],
"ελάχιστοι": ["λίγος"],
"ελάχιστων": ["λίγος"],
"ελάχιστη": ["λίγος"],
"ελάχιστης": ["λίγος"],
"ελάχιστο": ["λίγος"],
"ελάχιστου": ["λίγος"],
"πλείστος": ["πολύς"],
"πλείστου": ["πολύς"],
"πλείστων": ["πολύς"],
"πολλή": ["πολύ"],
"πολύς": ["πολύ"],
"πολλύ": ["πολύ"],
"πολλύς": ["πολύ"]
},
"noun": {
"λευτεριά": ["ελευθερία"],
"καφέδες": ["καφές"],
"ποιήματα": ["ποίημα"]
},
"det": {
"του": ["το"],
"των": ["το"],
"τους": ["το"],
"τις": ["τη"],
"τα": ["το"],
"οι": ["ο", "η"]
},
"verb": {
"είσαι": ["είμαι"],
"είναι": ["είμαι"],
"είμαστε": ["είμαι"],
"είστε": ["είμαι"],
"είσαστε": ["είμαι"],
"ήμουν": ["είμαι"],
"ήσουν": ["είμαι"],
"ήταν": ["είμαι"],
"ήμαστε": ["είμαι"],
"ήμασταν": ["είμαι"],
"είπα": ["λέω"],
"είπες": ["λέω"],
"είπε": ["λέω"],
"είπαμε": ["λέω"],
"είπατε": ["λέω"],
"είπαν": ["λέω"],
"είπανε": ["λέω"],
"πει": ["λέω"],
"πω": ["λέω"],
"πάω": ["πηγαίνω"],
"πάς": ["πηγαίνω"],
"πας": ["πηγαίνω"],
"πάει": ["πηγαίνω"],
"πάμε": ["πηγαίνω"],
"πάτε": ["πηγαίνω"],
"πάνε": ["πηγαίνω"],
"πήγα": ["πηγαίνω"],
"πήγες": ["πηγαίνω"],
"πήγε": ["πηγαίνω"],
"πήγαμε": ["πηγαίνω"],
"πήγατε": ["πηγαίνω"],
"πήγαν": ["πηγαίνω"],
"πήγανε": ["πηγαίνω"],
"έπαιζα": ["παίζω"],
"έπαιζες": ["παίζω"],
"έπαιζε": ["παίζω"],
"έπαιζαν": ["παίζω,"],
"έπαιξα": ["παίζω"],
"έπαιξες": ["παίζω"],
"έπαιξε": ["παίζω"],
"έτρωγα": ["τρώω"],
"έτρωγε": ["τρώω"],
"είχα": ["έχω"],
"είχες": ["έχω"],
"είχε": ["έχω"],
"είχαμε": ["έχω"],
"είχατε": ["έχω"],
"είχαν": ["έχω"],
"είχανε": ["έχω"],
"έπαιρνα": ["παίρνω"],
"έπαιρνες": ["παίρνω"],
"έπαιρνε": ["παίρνω"],
"έπαιρναν": ["παίρνω"],
"εδίνα": ["δίνω"],
"εδίνες": ["δίνω"],
"εδίνε": ["δίνω"],
"εδίναν": ["δίνω"],
"έκανα": ["κάνω"],
"έκανες": ["κάνω"],
"έκανε": ["κάνω"],
"έκαναν": ["κάνω"],
"ήθελα": ["θέλω"],
"ήθελες": ["θέλω"],
"ήθελε": ["θέλω"],
"ήθελαν": ["θέλω"],
"έβλεπα": ["βλέπω"],
"έβλεπες": ["βλέπω"],
"έβλεπε": ["βλέπω"],
"έβλεπαν": ["βλέπω"],
"είδα": ["βλέπω"],
"είδες": ["βλέπω"],
"είδε": ["βλέπω"],
"είδαμε": ["βλέπω"],
"είδατε": ["βλέπω"],
"είδαν": ["βλέπω"],
"έφερνα": ["φέρνω"],
"έφερνες": ["φέρνω"],
"έφερνε": ["φέρνω"],
"έφερναν": ["φέρνω"],
"έφερα": ["φέρω"],
"έφερες": ["φέρω"],
"έφερε": ["φέρω"],
"έφεραν": ["φέρω"],
"έλαβα": ["λαμβάνω"],
"έλαβες": ["λαμβάνω"],
"έλαβε": ["λαμβάνω"],
"έλαβαν": ["λαμβάνω"],
"έβρισκα": ["βρίσκω"],
"έβρισκες": ["βρίσκω"],
"έβρισκε": ["βρίσκω"],
"έβρισκαν": ["βρίσκω"],
"ήξερα": ["ξέρω"],
"ήξερες": ["ξέρω"],
"ήξερε": ["ξέρω"],
"ήξεραν": ["ξέρω"],
"ανέφερα": ["αναφέρω"],
"ανέφερες": ["αναφέρω"],
"ανέφερε": ["αναφέρω"],
"ανέφεραν": ["αναφέρω"],
"έβαζα": ["βάζω"],
"έβαζες": ["βάζω"],
"έβαζε": ["βάζω"],
"έβαζαν": ["βάζω"],
"έμεινα": ["μένω"],
"έμεινες": ["μένω"],
"έμεινε": ["μένω"],
"έμειναν": ["μένω"],
"έβγαζα": ["βγάζω"],
"έβγαζες": ["βγάζω"],
"έβγαζε": ["βγάζω"],
"έβγαζαν": ["βγάζω"],
"έμπαινα": ["μπαίνω"],
"έμπαινες": ["μπαίνω"],
"έμπαινε": ["μπαίνω"],
"έμπαιναν": ["μπαίνω"],
"βγήκα": ["βγαίνω"],
"βγήκες": ["βγαίνω"],
"βγήκε": ["βγαίνω"],
"βγήκαμε": ["βγαίνω"],
"βγήκατε": ["βγαίνω"],
"βγήκαν": ["βγαίνω"],
"έπεφτα": ["πέφτω"],
"έπεφτες": ["πέφτω"],
"έπεφτε": ["πέφτω"],
"έπεφταν": ["πέφτω"],
"έπεσα": ["πέφτω"],
"έπεσες": ["πέφτω"],
"έπεσε": ["πέφτω"],
"έπεσαν": ["πέφτω"],
"έστειλα": ["στέλνω"],
"έστειλες": ["στέλνω"],
"έστειλε": ["στέλνω"],
"έστειλαν": ["στέλνω"],
"έφυγα": ["φεύγω"],
"έφυγες": ["φεύγω"],
"έφυγαν": ["φεύγω"],
"έμαθα": ["μαθαίνω"],
"έμαθες": ["μαθαίνω"],
"έμαθε": ["μαθαίνω"],
"έμαθαν": ["μαθαίνω"],
"υπέβαλλα": ["υποβάλλω"],
"υπέβαλλες": ["υποβάλλω"],
"υπέβαλλε": ["υποβάλλω"],
"υπέβαλλαν": ["υποβάλλω"],
"έπινα": ["πίνω"],
"έπινες": ["πίνω"],
"έπινε": ["πίνω"],
"έπιναν": ["πίνω"],
"ήπια": ["πίνω"],
"ήπιες": ["πίνω"],
"ήπιε": ["πίνω"],
"ήπιαμε": ["πίνω"],
"ήπιατε": ["πίνω"],
"ήπιαν": ["πίνω"],
"ετύχα": ["τυχαίνω"],
"ετύχες": ["τυχαίνω"],
"ετύχε": ["τυχαίνω"],
"ετύχαν": ["τυχαίνω"],
"φάω": ["τρώω"],
"φάς": ["τρώω"],
"φάει": ["τρώω"],
"φάμε": ["τρώω"],
"φάτε": ["τρώω"],
"φάνε": ["τρώω"],
"φάν": ["τρώω"],
"έτρωγες": ["τρώω"],
"τρώγαμε": ["τρώω"],
"τρώγατε": ["τρώω"],
"τρώγανε": ["τρώω"],
"τρώγαν": ["τρώω"],
"πέρασα": ["περνώ"],
"πέρασες": ["περνώ"],
"πέρασε": ["περνώ"],
"πέρασαμε": ["περνώ"],
"πέρασατε": ["περνώ"],
"πέρασαν": ["περνώ"],
"έγδαρα": ["γδάρω"],
"έγδαρες": ["γδάρω"],
"έγδαρε": ["γδάρω"],
"έγδαραν": ["γδάρω"],
"έβγαλα": ["βγάλω"],
"έβγαλες": ["βγάλω"],
"έβγαλε": ["βγάλω"],
"έβγαλαν": ["βγάλω"],
"έφθασα": ["φτάνω"],
"έφθασες": ["φτάνω"],
"έφθασε": ["φτάνω"],
"έφθασαν": ["φτάνω"]
}
}

File diff suppressed because one or more lines are too long

View File

@ -1,139 +0,0 @@
{
"adj": [
["οί", "ός"],
["ών", "ός"],
["ού", "ός"],
["ή", "ός"],
["ής", "ός"],
["ές", "ός"],
["οι", "ος"],
["ων", "ος"],
["ου", "ος"],
["ο", "ος"],
["α", "ος"],
["ώδη", "ώδες"],
["ύτερη", "ός"],
["ύτερης", "ός"],
["ύτερων", "ός"],
["ύτερος", "ός"],
["ύτερου", "ός"]
],
"noun": [
["ιού", "ί"],
["ιά", "ί"],
["ιών", "ί"],
["ηριού", "ήρι"],
["ια", "ι"],
["ηριών", "ήρι"],
["ας", "α"],
["ες", "α"],
["ων", "α"],
["άς", "ά"],
["ές", "ά"],
["ών", "ά"],
["ής", "ή"],
["ές", "ή"],
["ών", "ή"],
["ές", "ής"],
["ών", "ής"],
["ου", "ο"],
["α", "ο"],
["ων", "ο"],
["ητήματος", "ήτημα"],
["ητήματα", "ήτημα"],
["ητημάτων", "ήτημα"],
["τος", ""],
["τα", "α"],
["ομάτων", "όμα"],
["ού", "ός"],
["οί", "ός"],
["ών", "ός"],
["ς", ""],
["ες", "α"],
["ιών", "ία"],
["α", "ας"],
["δων", ""]
],
"verb": [
["εις", "ω"],
["ει", "ω"],
["ουμε", "ω"],
["ετε", "ω"],
["ουνε", "ω"],
["ουν", "ω"],
["είς", "ώ"],
["εί", "ώ"],
["ούν", "ώ"],
["εσαι", "ομαι"],
["εται", "ομαι"],
["ανόμαστε", "άνομαι"],
["εστε", "ομαι"],
["ονται", "ομαι"],
["άς", "ώ"],
["άει", "ώ"],
["άμε", "ώ"],
["άτε", "ώ"],
["άνε", "ώ"],
["άν", "ώ"],
["άω", "ώ"],
["ώ", "άω"],
["ιζόμουν", "ίζομαι"],
["ιζόσουν", "ίζομαι"],
["ιζόταν", "ίζομαι"],
["ιζόμασταν", "ίζομαι"],
["ιζόσασταν", "ίζομαι"],
["ονταν", "ομαι"],
["όμουν", "άμαι"],
["όσουν", "άμαι"],
["όταν", "άμαι"],
["όμασταν", "άμαι"],
["όσασταν", "άμαι"],
["όντουσταν", "άμαι"],
["ούσα", "ώ"],
["ούσες", "ώ"],
["ούσε", "ώ"],
["ούσαμε", "ώ"],
["ούσατε", "ώ"],
["ούσαν", "ώ"],
["ούσανε", "ώ"],
["λαμε", "ζω"],
["λατε", "ζω"],
["ήρα", "άρω"],
["ήρες", "άρω"],
["ήρε", "άρω"],
["ήραμε", "άρω"],
["ήρατε", "άρω"],
["ήρα", "άρω"],
["ένησα", "ενώ"],
["ένησες", "ενώ"],
["ένησε", "ενώ"],
["ενήσαμε", "ενώ"],
["ένησατε", "ενώ"],
["ένησαν", "ενώ"],
["όνεσα", "ονώ"],
["όνεσες", "ονώ"],
["όνεσε", "ονώ"],
["έσαμε", "ώ"],
["έσατε", "ώ"],
["ισα", "ομαι"],
["ισες", "ομαι"],
["ισε", "ομαι"],
["αθίσαμε", "άθομαι"],
["αθίσατε", "άθομαι"],
["ισαν", "ομαι"],
["άπα", "απώ"],
["ά", "ώ"],
["οντας", "ω"],
["ξω", "ζω"],
["ξεις", "ζω"],
["ξουμε", "ζω"],
["ξετε", "ζω"],
["ξουν", "ζω"]
],
"punct": [
["“", "\""],
["”", "\""],
["", "'"],
["", "'"]
]
}

View File

@ -32,12 +32,14 @@ class EnglishDefaults(Language.Defaults):
stop_words = STOP_WORDS
morph_rules = MORPH_RULES
syntax_iterators = SYNTAX_ITERATORS
resources = {
"lemma_lookup": "lemmatizer/lemma_lookup.json",
"lemma_rules": "lemmatizer/lemma_rules.json",
"lemma_index": "lemmatizer/lemma_index.json",
"lemma_exc": "lemmatizer/lemma_exc.json",
}
single_orth_variants = [
{"tags": ["NFP"], "variants": ["", "..."]},
{"tags": [":"], "variants": ["-", "", "", "--", "---", "——"]},
]
paired_orth_variants = [
{"tags": ["``", "''"], "variants": [("'", "'"), ("", "")]},
{"tags": ["``", "''"], "variants": [('"', '"'), ("", "")]},
]
class English(Language):
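The single_orth_variants and paired_orth_variants tables added here list interchangeable punctuation (ellipsis vs. three dots, dash styles, straight vs. curly quote pairs) keyed by tag, and are used for orth-variant augmentation when training with this spaCy version. A minimal, hedged check of the new attributes (assumes a spaCy build that includes this change; purely illustrative):

# Illustrative only: inspect the new orth-variant defaults.
from spacy.lang.en import English

print(English.Defaults.single_orth_variants[0]["variants"])  # ['…', '...']
print(English.Defaults.paired_orth_variants[0]["tags"])      # ['``', "''"]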

View File

@ -1,31 +0,0 @@
WordNet Release 3.0
This software and database is being provided to you, the LICENSEE, by
Princeton University under the following license. By obtaining, using
and/or copying this software and database, you agree that you have
read, understood, and will comply with these terms and conditions.:
Permission to use, copy, modify and distribute this software and
database and its documentation for any purpose and without fee or
royalty is hereby granted, provided that you agree to comply with
the following copyright notice and statements, including the disclaimer,
and that the same appear on ALL copies of the software, database and
documentation, including modifications that you make for internal
use or for distribution.
WordNet 3.0 Copyright 2006 by Princeton University. All rights reserved.
THIS SOFTWARE AND DATABASE IS PROVIDED "AS IS" AND PRINCETON
UNIVERSITY MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR
IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, PRINCETON
UNIVERSITY MAKES NO REPRESENTATIONS OR WARRANTIES OF MERCHANT-
ABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE
OF THE LICENSED SOFTWARE, DATABASE OR DOCUMENTATION WILL NOT
INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR
OTHER RIGHTS.
The name of Princeton University or Princeton may not be used in
advertising or publicity pertaining to distribution of the software
and/or database. Title to copyright in this software, database and
any associated documentation shall at all times remain with
Princeton University and LICENSEE agrees to preserve same.

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

View File

@ -1,35 +0,0 @@
{
"adj": [
["er", ""],
["est", ""],
["er", "e"],
["est", "e"]
],
"noun": [
["s", ""],
["ses", "s"],
["ves", "f"],
["xes", "x"],
["zes", "z"],
["ches", "ch"],
["shes", "sh"],
["men", "man"],
["ies", "y"]
],
"verb": [
["s", ""],
["ies", "y"],
["es", "e"],
["es", ""],
["ed", "e"],
["ed", ""],
["ing", "e"],
["ing", ""]
],
"punct": [
["“", "\""],
["”", "\""],
["", "'"],
["", "'"]
]
}
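These English suffix rules are consumed by the base Lemmatizer together with the lemma_index and lemma_exc tables that EnglishDefaults previously listed under resources (see the diff above); the data itself now ships separately in spacy-lookups-data. A hedged sketch wiring toy versions of those three tables into a Lookups container; the table names match the resources above, but the contents are made up:

# Sketch only: toy lemma tables wired into Lookups for the base Lemmatizer.
# The real English tables are much larger and are loaded from spacy-lookups-data.
from spacy.lemmatizer import Lemmatizer
from spacy.lookups import Lookups

lookups = Lookups()
lookups.add_table("lemma_rules", {"noun": [["ies", "y"], ["s", ""]]})
lookups.add_table("lemma_index", {"noun": ["pony", "cat"]})
lookups.add_table("lemma_exc", {"noun": {"children": ["child"]}})
lemmatizer = Lemmatizer(lookups)

print(lemmatizer("ponies", "noun"))    # ['pony']   via the "ies" -> "y" rule
print(lemmatizer("children", "noun"))  # ['child']  via the exceptions table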

View File

@ -25,7 +25,6 @@ class SpanishDefaults(Language.Defaults):
tag_map = TAG_MAP
stop_words = STOP_WORDS
syntax_iterators = SYNTAX_ITERATORS
resources = {"lemma_lookup": "lemma_lookup.json"}
class Spanish(Language):

File diff suppressed because it is too large

View File

@ -24,12 +24,6 @@ class PersianDefaults(Language.Defaults):
tag_map = TAG_MAP
suffixes = TOKENIZER_SUFFIXES
writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}
# extracted from Mojgan Seraji's Persian Universal Dependencies Corpus
resources = {
"lemma_rules": "lemmatizer/lemma_rules.json",
"lemma_index": "lemmatizer/lemma_index.json",
"lemma_exc": "lemmatizer/lemma_exc.json",
}
class Persian(Language):

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

View File

@ -1,41 +0,0 @@
{
"adj": [
["ین", ""],
["‌ترین", ""],
["ترین", ""],
["‌تر", ""],
["تر", ""],
["‌ای", ""]
],
"noun": [
["ایان", "ا"],
["ویان", "و"],
["ایانی", "ا"],
["ویانی", "و"],
["گان", "ه"],
["گانی", "ه"],
["گان", ""],
["گانی", ""],
["ان", ""],
["انی", ""],
["ات", ""],
["ات", "ه"],
["ات", "ت"],
["اتی", ""],
["اتی", "ه"],
["اتی", "ت"],
["ها", ""],
["ها", ""],
["‌های", ""],
["های", ""],
["‌هایی", ""],
["هایی", ""]
],
"verb": [],
"punct": [
["“", "\""],
["”", "\""],
["", "'"],
["", "'"]
]
}

View File

@ -12,8 +12,9 @@ from .syntax_iterators import SYNTAX_ITERATORS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...lookups import Lookups
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups, get_lemma_tables
from ...util import update_exc, add_lookups
class FrenchDefaults(Language.Defaults):
@ -30,17 +31,12 @@ class FrenchDefaults(Language.Defaults):
suffixes = TOKENIZER_SUFFIXES
token_match = TOKEN_MATCH
syntax_iterators = SYNTAX_ITERATORS
resources = {
"lemma_rules": "lemmatizer/lemma_rules.json",
"lemma_index": "lemmatizer/lemma_index.json",
"lemma_exc": "lemmatizer/lemma_exc.json",
"lemma_lookup": "lemmatizer/lemma_lookup.json",
}
@classmethod
def create_lemmatizer(cls, nlp=None, lookups=None):
lemma_rules, lemma_index, lemma_exc, lemma_lookup = get_lemma_tables(lookups)
return FrenchLemmatizer(lemma_index, lemma_exc, lemma_rules, lemma_lookup)
if lookups is None:
lookups = Lookups()
return FrenchLemmatizer(lookups)
class French(Language):

View File

@ -1,12 +1,13 @@
# coding: utf8
from __future__ import unicode_literals
from ....symbols import POS, NOUN, VERB, ADJ, ADV, PRON, DET, AUX, PUNCT, ADP
from ....symbols import SCONJ, CCONJ
from ....symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos
from ...lemmatizer import Lemmatizer
from ...symbols import POS, NOUN, VERB, ADJ, ADV, PRON, DET, AUX, PUNCT, ADP
from ...symbols import SCONJ, CCONJ
from ...symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos
class FrenchLemmatizer(object):
class FrenchLemmatizer(Lemmatizer):
"""
French language lemmatizer applies the default rule based lemmatization
procedure with some modifications for better French language support.
@ -16,19 +17,10 @@ class FrenchLemmatizer(object):
the lookup table.
"""
@classmethod
def load(cls, path, index=None, exc=None, rules=None, lookup=None):
return cls(index, exc, rules, lookup)
def __init__(self, index=None, exceptions=None, rules=None, lookup=None):
self.index = index
self.exc = exceptions
self.rules = rules
self.lookup_table = lookup if lookup is not None else {}
def __call__(self, string, univ_pos, morphology=None):
if not self.rules:
return [self.lookup_table.get(string, string)]
lookup_table = self.lookups.get_table("lemma_lookup", {})
if "lemma_rules" not in self.lookups:
return [lookup_table.get(string, string)]
if univ_pos in (NOUN, "NOUN", "noun"):
univ_pos = "noun"
elif univ_pos in (VERB, "VERB", "verb"):
@ -56,12 +48,14 @@ class FrenchLemmatizer(object):
# See Issue #435 for example of where this logic is required.
if self.is_base_form(univ_pos, morphology):
return list(set([string.lower()]))
lemmas = lemmatize(
index_table = self.lookups.get_table("lemma_index", {})
exc_table = self.lookups.get_table("lemma_exc", {})
rules_table = self.lookups.get_table("lemma_rules", {})
lemmas = self.lemmatize(
string,
self.index.get(univ_pos, {}),
self.exc.get(univ_pos, {}),
self.rules.get(univ_pos, []),
self.lookup_table,
index_table.get(univ_pos, {}),
exc_table.get(univ_pos, {}),
rules_table.get(univ_pos, []),
)
return lemmas
@ -114,34 +108,35 @@ class FrenchLemmatizer(object):
def punct(self, string, morphology=None):
return self(string, "punct", morphology)
def lookup(self, string):
if string in self.lookup_table:
return self.lookup_table[string][0]
def lookup(self, string, orth=None):
lookup_table = self.lookups.get_table("lemma_lookup", {})
if orth is not None and orth in lookup_table:
return lookup_table[orth][0]
return string
def lemmatize(string, index, exceptions, rules, lookup):
string = string.lower()
forms = []
if string in index:
forms.append(string)
return forms
forms.extend(exceptions.get(string, []))
oov_forms = []
if not forms:
for old, new in rules:
if string.endswith(old):
form = string[: len(string) - len(old)] + new
if not form:
pass
elif form in index or not form.isalpha():
forms.append(form)
else:
oov_forms.append(form)
if not forms:
forms.extend(oov_forms)
if not forms and string in lookup.keys():
forms.append(lookup[string][0])
if not forms:
forms.append(string)
return list(set(forms))
def lemmatize(self, string, index, exceptions, rules):
lookup_table = self.lookups.get_table("lemma_lookup", {})
string = string.lower()
forms = []
if string in index:
forms.append(string)
return forms
forms.extend(exceptions.get(string, []))
oov_forms = []
if not forms:
for old, new in rules:
if string.endswith(old):
form = string[: len(string) - len(old)] + new
if not form:
pass
elif form in index or not form.isalpha():
forms.append(form)
else:
oov_forms.append(form)
if not forms:
forms.extend(oov_forms)
if not forms and string in lookup_table.keys():
forms.append(lookup_table[string][0])
if not forms:
forms.append(string)
return list(set(forms))
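After this refactor the French lemmatizer is built from a single Lookups container rather than four separate table arguments. A hedged sketch with a tiny hand-made lookup table; the list value mirrors the [0] indexing in the code above, and real tables come from spacy-lookups-data:

# Sketch only: a hand-built Lookups object standing in for the real French tables.
from spacy.lookups import Lookups
from spacy.lang.fr.lemmatizer import FrenchLemmatizer

lookups = Lookups()
lookups.add_table("lemma_lookup", {"avait": ["avoir"]})
lemmatizer = FrenchLemmatizer(lookups)

# lookup() now keys on the orth argument when one is given.
print(lemmatizer.lookup("avait", orth="avait"))  # 'avoir'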

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

View File

@ -1,126 +0,0 @@
{
"adj": [
["a", "a"],
["aux", "al"],
["c", "c"],
["d", "d"],
["e", ""],
["é", "é"],
["eux", "eux"],
["f", "f"],
["i", "i"],
["ï", "ï"],
["l", "l"],
["m", "m"],
["n", "n"],
["o", "o"],
["p", "p"],
["r", "r"],
["s", ""],
["t", "t"],
["u", "u"],
["y", "y"]
],
"noun": [
["a", "a"],
["à", "à"],
["â", "â"],
["b", "b"],
["c", "c"],
["ç", "ç"],
["d", "d"],
["e", "e"],
["é", "é"],
["è", "è"],
["ê", "ê"],
["ë", "ë"],
["f", "f"],
["g", "g"],
["h", "h"],
["i", "i"],
["î", "î"],
["ï", "ï"],
["j", "j"],
["k", "k"],
["l", "l"],
["m", "m"],
["n", "n"],
["o", "o"],
["ô", "ö"],
["ö", "ö"],
["p", "p"],
["q", "q"],
["r", "r"],
["t", "t"],
["u", "u"],
["û", "û"],
["v", "v"],
["w", "w"],
["y", "y"],
["z", "z"],
["s", ""],
["x", ""],
["nt(e", "nt"],
["nt(e)", "nt"],
["al(e", "ale"],
["é(", "é"],
["é(e", "é"],
["é.e", "é"],
["el(le", "el"],
["eurs(rices", "eur"],
["eur(rice", "eur"],
["eux(se", "eux"],
["ial(e", "ial"],
["er(ère", "er"],
["eur(se", "eur"],
["teur(trice", "teur"],
["teurs(trices", "teur"]
],
"verb": [
["é", "er"],
["és", "er"],
["ée", "er"],
["ées", "er"],
["é", "er"],
["es", "er"],
["ons", "er"],
["ez", "er"],
["ent", "er"],
["ais", "er"],
["ait", "er"],
["ions", "er"],
["iez", "er"],
["aient", "er"],
["ai", "er"],
["as", "er"],
["a", "er"],
["âmes", "er"],
["âtes", "er"],
["èrent", "er"],
["erai", "er"],
["eras", "er"],
["era", "er"],
["erons", "er"],
["erez", "er"],
["eront", "er"],
["erais", "er"],
["erait", "er"],
["erions", "er"],
["eriez", "er"],
["eraient", "er"],
["asse", "er"],
["asses", "er"],
["ât", "er"],
["assions", "er"],
["assiez", "er"],
["assent", "er"],
["ant", "er"],
["ante", "er"],
["ants", "er"],
["antes", "er"],
["u(er", "u"],
["és(ées", "er"],
["é()e", "er"],
["é()", "er"]
]
}

View File

@ -11,12 +11,12 @@ Example sentences to test spaCy and its language models.
sentences = [
"एप्पल 1 अरब डॉलर के लिए यू.के. स्टार्टअप खरीदने पर विचार कर रहा है",
"स्वायत्त कार निर्माताओं की ओर बीमा दायित्व रखती है",
"सैन फ्रांसिस्को फुटवे डिलीवरी रोबोटों पर प्रतिबंध लगाने का विचार कर रहा है",
"लंदन यूनाइटेड किंगडम का बड़ा शहर है।",
"आप कहाँ हैं?",
"एप्पल 1 अरब डॉलर के लिए यू.के. स्टार्टअप खरीदने पर विचार कर रहा है",
"स्वायत्त कारें निर्माताओं की ओर बीमा दायित्व रखती हैं।",
"सैन फ्रांसिस्को फुटपाथ वितरण रोबोटों पर प्रतिबंध लगाने का विचार कर रहा है",
"लंदन यूनाइटेड किंगडम का विशाल शहर है।",
"आप कहाँ ह?",
"फ्रांस के राष्ट्रपति कौन हैं?",
"संयुक्त राज्य की राजधानी क्या है?",
"बराक ओबामा का जन्म हुआ था?",
"संयुक्त राज्यों की राजधानी क्या है?",
"बराक ओबामा का जन्म कब हुआ था?",
]

View File

@ -18,7 +18,6 @@ class CroatianDefaults(Language.Defaults):
)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
stop_words = STOP_WORDS
resources = {"lemma_lookup": "lemma_lookup.json"}
class Croatian(Language):

File diff suppressed because it is too large

View File

@ -24,7 +24,6 @@ class HungarianDefaults(Language.Defaults):
suffixes = TOKENIZER_SUFFIXES
infixes = TOKENIZER_INFIXES
token_match = TOKEN_MATCH
resources = {"lemma_lookup": "lemma_lookup.json"}
class Hungarian(Language):

File diff suppressed because it is too large

View File

@ -30,7 +30,6 @@ class IndonesianDefaults(Language.Defaults):
infixes = TOKENIZER_INFIXES
syntax_iterators = SYNTAX_ITERATORS
tag_map = TAG_MAP
resources = {"lemma_lookup": "lemma_lookup.json"}
class Indonesian(Language):

File diff suppressed because it is too large

View File

@ -23,7 +23,6 @@ class ItalianDefaults(Language.Defaults):
stop_words = STOP_WORDS
tag_map = TAG_MAP
infixes = TOKENIZER_INFIXES
resources = {"lemma_lookup": "lemma_lookup.json"}
class Italian(Language):

File diff suppressed because it is too large

View File

@ -30,7 +30,6 @@ class LithuanianDefaults(Language.Defaults):
stop_words = STOP_WORDS
tag_map = TAG_MAP
morph_rules = MORPH_RULES
resources = {"lemma_lookup": "lemma_lookup.json"}
class Lithuanian(Language):

File diff suppressed because it is too large

View File

@ -25,11 +25,6 @@ class NorwegianDefaults(Language.Defaults):
morph_rules = MORPH_RULES
tag_map = TAG_MAP
syntax_iterators = SYNTAX_ITERATORS
resources = {
"lemma_lookup": "lemmatizer/lemma_lookup.json",
"lemma_exc": "lemmatizer/lemma_exc.json",
"lemma_rules": "lemmatizer/lemma_rules.json",
}
class Norwegian(Language):

View File

@ -1,7 +0,0 @@
Note on noun wordforms / lemmas:
All wordforms are extracted from Norsk Ordbank in Norwegian Bokmål 2005, updated 20180627
(CLARINO NB - Språkbanken), Nasjonalbiblioteket, Norway:
https://www.nb.no/sprakbanken/show?serial=oai%3Anb.no%3Asbr-5&lang=en
License:
Creative_Commons-BY (CC-BY) (https://creativecommons.org/licenses/by/4.0/)

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -1,24 +0,0 @@
{
"adj": [
["e", ""],
["ere", ""],
["est", ""],
["este", ""]
],
"noun": [
["en", "e"],
["a", "e"],
["et", ""],
["er", "e"],
["ene", "e"]
],
"verb": [
["er", "e"],
["et", "e"],
["a", "e"],
["es", "e"],
["te", "e"],
["år", "å"]
],
"punct": []
}

View File

@ -10,8 +10,9 @@ from .lemmatizer import DutchLemmatizer
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...lookups import Lookups
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups, get_lemma_tables
from ...util import update_exc, add_lookups
class DutchDefaults(Language.Defaults):
@ -26,17 +27,12 @@ class DutchDefaults(Language.Defaults):
tag_map = TAG_MAP
infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES
resources = {
"lemma_rules": "lemmatizer/lemma_rules.json",
"lemma_index": "lemmatizer/lemma_index.json",
"lemma_exc": "lemmatizer/lemma_exc.json",
"lemma_lookup": "lemmatizer/lemma_lookup.json",
}
@classmethod
def create_lemmatizer(cls, nlp=None, lookups=None):
lemma_rules, lemma_index, lemma_exc, lemma_lookup = get_lemma_tables(lookups)
return DutchLemmatizer(lemma_index, lemma_exc, lemma_rules, lemma_lookup)
if lookups is None:
lookups = Lookups()
return DutchLemmatizer(lookups)
class Dutch(Language):

View File

@ -1,10 +1,11 @@
# coding: utf8
from __future__ import unicode_literals
from ....symbols import NOUN, VERB, ADJ, NUM, DET, PRON, ADP, AUX, ADV
from ...lemmatizer import Lemmatizer
from ...symbols import NOUN, VERB, ADJ, NUM, DET, PRON, ADP, AUX, ADV
class DutchLemmatizer(object):
class DutchLemmatizer(Lemmatizer):
# Note: CGN does not distinguish AUX verbs, so we treat AUX as VERB.
univ_pos_name_variants = {
NOUN: "noun",
@ -36,16 +37,6 @@ class DutchLemmatizer(object):
"num": "num",
}
@classmethod
def load(cls, path, index=None, exc=None, rules=None, lookup=None):
return cls(index, exc, rules, lookup)
def __init__(self, index=None, exceptions=None, rules=None, lookup=None):
self.index = index
self.exc = exceptions
self.rules = rules or {}
self.lookup_table = lookup if lookup is not None else {}
def __call__(self, string, univ_pos, morphology=None):
# Difference 1: self.rules is assumed to be non-None, so no
# 'is None' check required.
@ -62,27 +53,28 @@ class DutchLemmatizer(object):
# are not lemmatized. They are lowercased, however.
return [string]
# if string in self.lemma_index.get(univ_pos)
lemma_index = self.index.get(univ_pos, {})
index_table = self.lookups.get_table("lemma_index", {})
lemma_index = index_table.get(univ_pos, {})
# string is already lemma
if string in lemma_index:
return [string]
exceptions = self.exc.get(univ_pos, {})
exc_table = self.lookups.get_table("lemma_exc", {})
exceptions = exc_table.get(univ_pos, {})
# string is irregular token contained in exceptions index.
try:
lemma = exceptions[string]
return [lemma[0]]
except KeyError:
pass
# string corresponds to key in lookup table
lookup_table = self.lookup_table
# string corresponds to key in lookup table
lookup_table = self.lookups.get_table("lemma_lookup", {})
looked_up_lemma = lookup_table.get(string)
if looked_up_lemma and looked_up_lemma in lemma_index:
return [looked_up_lemma]
forms, is_known = lemmatize(
string, lemma_index, exceptions, self.rules.get(univ_pos, [])
rules_table = self.lookups.get_table("lemma_rules", {})
forms, is_known = self.lemmatize(
string, lemma_index, exceptions, rules_table.get(univ_pos, [])
)
# Back-off through remaining return value candidates.
if forms:
if is_known:
@ -103,44 +95,26 @@ class DutchLemmatizer(object):
# Overrides parent method so that a lowercased version of the string is
# used to search the lookup table. This is necessary because our lookup
# table consists entirely of lowercase keys.
def lookup(self, string):
def lookup(self, string, orth=None):
lookup_table = self.lookups.get_table("lemma_lookup", {})
string = string.lower()
return self.lookup_table.get(string, string)
if orth is not None:
return lookup_table.get(orth, string)
else:
return lookup_table.get(string, string)
def noun(self, string, morphology=None):
return self(string, "noun", morphology)
def verb(self, string, morphology=None):
return self(string, "verb", morphology)
def adj(self, string, morphology=None):
return self(string, "adj", morphology)
def det(self, string, morphology=None):
return self(string, "det", morphology)
def pron(self, string, morphology=None):
return self(string, "pron", morphology)
def adp(self, string, morphology=None):
return self(string, "adp", morphology)
def punct(self, string, morphology=None):
return self(string, "punct", morphology)
# Reimplemented to focus more on application of suffix rules and to return
# as early as possible.
def lemmatize(string, index, exceptions, rules):
# returns (forms, is_known: bool)
oov_forms = []
for old, new in rules:
if string.endswith(old):
form = string[: len(string) - len(old)] + new
if not form:
pass
elif form in index:
return [form], True # True = Is known (is lemma)
else:
oov_forms.append(form)
return list(set(oov_forms)), False
# Reimplemented to focus more on application of suffix rules and to return
# as early as possible.
def lemmatize(self, string, index, exceptions, rules):
# returns (forms, is_known: bool)
oov_forms = []
for old, new in rules:
if string.endswith(old):
form = string[: len(string) - len(old)] + new
if not form:
pass
elif form in index:
return [form], True # True = Is known (is lemma)
else:
oov_forms.append(form)
return list(set(oov_forms)), False
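The Dutch lookup() keeps its special behaviour of lowercasing the input before consulting the table, since the shipped lookup table only has lowercase keys; it simply reads that table from the shared Lookups container now. A hedged sketch with a toy table (real data comes from spacy-lookups-data):

# Sketch only: the Dutch lemmatizer lowercases before consulting the table.
from spacy.lookups import Lookups
from spacy.lang.nl.lemmatizer import DutchLemmatizer

lookups = Lookups()
lookups.add_table("lemma_lookup", {"katten": "kat"})
lemmatizer = DutchLemmatizer(lookups)

print(lemmatizer.lookup("Katten"))  # 'kat', found via the lowercased form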

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

View File

@ -1,55 +0,0 @@
{
"adj": [
["sten", ""],
["ende", "end"],
["ste", ""],
["st", ""],
["er", ""],
["en", ""],
["e", ""]
],
"noun": [
["heden", "heid"],
["elen", "eel"],
["ezen", "ees"],
["even", "eef"],
["ssen", "s"],
["rren", "r"],
["kken", "k"],
["bben", "b"],
["'er", ""],
["tje", ""],
["kje", ""],
["ici", "icus"],
["en", ""],
["ën", ""],
["'s", ""],
["s", ""]
],
"verb": [
["dden", "den"],
["tten", "ten"],
["dde", "den"],
["tte", "ten"],
["end", "en"],
["dt", "den"],
["de", "en"],
["te", "en"]
],
"num": [
["sten", ""],
["tjes", ""],
["ste", ""],
["ën", ""],
["en", ""],
["de", ""],
["er", ""],
["ër", ""]
],
"punct": [
["“", "\""],
["”", "\""],
["", "'"],
["", "'"]
]
}

View File

@ -27,7 +27,6 @@ class PortugueseDefaults(Language.Defaults):
tag_map = TAG_MAP
infixes = TOKENIZER_INFIXES
prefixes = TOKENIZER_PREFIXES
resources = {"lemma_lookup": "lemma_lookup.json"}
class Portuguese(Language):

File diff suppressed because it is too large

View File

@ -24,7 +24,6 @@ class RomanianDefaults(Language.Defaults):
)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS
resources = {"lemma_lookup": "lemma_lookup.json"}
tag_map = TAG_MAP

File diff suppressed because it is too large

View File

@ -12,6 +12,7 @@ from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...util import update_exc, add_lookups
from ...language import Language
from ...lookups import Lookups
from ...attrs import LANG, NORM
@ -27,8 +28,10 @@ class RussianDefaults(Language.Defaults):
tag_map = TAG_MAP
@classmethod
def create_lemmatizer(cls, nlp=None, **kwargs):
return RussianLemmatizer()
def create_lemmatizer(cls, nlp=None, lookups=None):
if lookups is None:
lookups = Lookups()
return RussianLemmatizer(lookups)
class Russian(Language):

View File

@ -9,8 +9,8 @@ from ...compat import unicode_
class RussianLemmatizer(Lemmatizer):
_morph = None
def __init__(self):
super(RussianLemmatizer, self).__init__()
def __init__(self, lookups=None):
super(RussianLemmatizer, self).__init__(lookups)
try:
from pymorphy2 import MorphAnalyzer
except ImportError:
@ -102,20 +102,7 @@ class RussianLemmatizer(Lemmatizer):
return symbols_to_str[univ_pos]
return None
def is_base_form(self, univ_pos, morphology=None):
# TODO
raise NotImplementedError
def det(self, string, morphology=None):
return self(string, "det", morphology)
def num(self, string, morphology=None):
return self(string, "num", morphology)
def pron(self, string, morphology=None):
return self(string, "pron", morphology)
def lookup(self, string):
def lookup(self, string, orth=None):
analyses = self._morph.parse(string)
if len(analyses) == 1:
return analyses[0].normal_form
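The Russian lemmatizer (like the Ukrainian one further down) wraps pymorphy2 rather than the JSON tables, so the Lookups object is only passed through for API consistency with the base class; pymorphy2 has to be installed separately. A hedged usage sketch:

# Sketch only: requires the pymorphy2 package; the Lookups object is passed
# through for API consistency and is not what produces the lemmas.
from spacy.lookups import Lookups
from spacy.lang.ru.lemmatizer import RussianLemmatizer

lemmatizer = RussianLemmatizer(Lookups())
# Returns pymorphy2's normal form when the analysis is unambiguous,
# otherwise falls back per the (truncated) logic above.
print(lemmatizer.lookup("книгами"))  # e.g. 'книга'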

View File

@ -21,7 +21,6 @@ class SerbianDefaults(Language.Defaults):
)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS
resources = {"lemma_lookup": "lemma_lookup.json"}
class Serbian(Language):

File diff suppressed because it is too large

View File

@ -29,10 +29,6 @@ class SwedishDefaults(Language.Defaults):
suffixes = TOKENIZER_SUFFIXES
stop_words = STOP_WORDS
morph_rules = MORPH_RULES
resources = {
"lemma_lookup": "lemmatizer/lemma_lookup.json",
"lemma_rules": "lemmatizer/lemma_rules.json",
}
class Swedish(Language):

File diff suppressed because it is too large

View File

@ -1,103 +0,0 @@
{
"noun": [
["t", ""],
["n", ""],
["na", ""],
["na", "e"],
["or", "a"],
["orna", "a"],
["et", ""],
["en", ""],
["en", "e"],
["er", ""],
["erna", ""],
["ar", "e"],
["ar", ""],
["lar", "el"],
["arna", "e"],
["arna", ""],
["larna", "el"]
],
"verb": [
["r", ""],
["de", ""],
["t", ""],
["er", ""],
["te", ""],
["a", ""],
["e", ""],
["t", "d"],
["tt", "d"],
["tt", ""],
["ev", "iv"],
["ack", "ick"],
["ög", "yg"],
["it", ""],
["uckit", "ick"],
["ugit", "yg"],
["it", "et"],
["id", "ed"],
["ip", "ep"],
["iv", "ev"],
["in", "en"],
["ik", "ek"],
["ig", "eg"],
["ind", ""],
["inn", "ann"],
["nder", "nd"],
["inner", "inn"],
["and", "ind"],
["ann", "inn"],
["s", ""],
["anns", "inn"],
["undit", "ind"],
["unnit", "inn"],
["unnits", "inn"],
["uppit", "ipp"],
["ungit", "ing"],
["öd", "ud"],
["öt", "jut"],
["öt", "ut"],
["ög", "ug"],
["ögg", "ugg"],
["öng", "ung"],
["önk", "unk"],
["öt", "yt"],
["utit", "yt"],
["ös", "ys"],
["öv", "yv"],
["uvit", "yv"],
["öp", "yp"],
["upit", "yp"],
["ök", "yk"],
["ukit", "yk"],
["or", "ar"],
["öll", "all"],
["ät", "åt"],
["öll", "åll"],
["or", "är"],
["urit", "är"],
["åt", "ät"],
["ar", "är"],
["alt", "ält"],
["ultit", "ält"]
],
"adj": [
["are", ""],
["ast", ""],
["re", ""],
["st", ""],
["ägre", "åg"],
["ägst", "åg"],
["ängre", "ång"],
["ängst", "ång"],
["örre", "or"],
["örst", "or"]
],
"punct": [
["“", "\""],
["”", "\""],
["", "'"],
["", "'"]
]
}

View File

@ -24,7 +24,6 @@ class TagalogDefaults(Language.Defaults):
lex_attr_getters.update(LEX_ATTRS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS
resources = {"lemma_lookup": "lemma_lookup.json"}
class Tagalog(Language):

View File

@ -1,9 +0,0 @@
{
"kaugnayan": "ugnay",
"sangkatauhan": "tao",
"kanayunan": "nayon",
"pandaigdigan": "daigdig",
"kasaysayan": "saysay",
"kabayanihan": "bayani",
"karuwagan": "duwag"
}
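Tagalog only ships this small direct lookup table (no rules, index or exceptions), so lemmatization is a plain dictionary lookup through the base Lemmatizer. A hedged sketch reusing a couple of the entries above:

# Sketch only: lookup-only lemmatization through the base Lemmatizer.
from spacy.lemmatizer import Lemmatizer
from spacy.lookups import Lookups

lookups = Lookups()
lookups.add_table("lemma_lookup", {"kaugnayan": "ugnay", "kasaysayan": "saysay"})
lemmatizer = Lemmatizer(lookups)

print(lemmatizer.lookup("kaugnayan"))  # 'ugnay'
print(lemmatizer.lookup("wika"))       # 'wika': unknown words pass through unchanged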

View File

@ -10,9 +10,6 @@ from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
# Lemma data source:
# http://st2.zargan.com/duyuru/Zargan_Linguistic_Resources_for_Turkish.html - Bilgin, O. (2016). Biçimbilimsel Bakımdan Karmaşık Türkçe Kelimelerin İşlenmesinde Frekans Etkileri (yayınlanmamış yüksek lisans tezi). Boğaziçi Üniversitesi, İstanbul. Erişim: http://st2.zargan.com/public/resources/turkish/frequency_effects_in_turkish.pdf
class TurkishDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
@ -22,7 +19,6 @@ class TurkishDefaults(Language.Defaults):
)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS
resources = {"lemma_lookup": "lemma_lookup.json"}
class Turkish(Language):

File diff suppressed because it is too large

View File

@ -9,6 +9,7 @@ from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...util import update_exc, add_lookups
from ...language import Language
from ...lookups import Lookups
from ...attrs import LANG, NORM
from .lemmatizer import UkrainianLemmatizer
@ -24,8 +25,10 @@ class UkrainianDefaults(Language.Defaults):
stop_words = STOP_WORDS
@classmethod
def create_lemmatizer(cls, nlp=None, **kwargs):
return UkrainianLemmatizer()
def create_lemmatizer(cls, nlp=None, lookups=None):
if lookups is None:
lookups = Lookups()
return UkrainianLemmatizer(lookups)
class Ukrainian(Language):

View File

@ -6,8 +6,8 @@ from ...lemmatizer import Lemmatizer
class UkrainianLemmatizer(Lemmatizer):
_morph = None
def __init__(self):
super(UkrainianLemmatizer, self).__init__()
def __init__(self, lookups=None):
super(UkrainianLemmatizer, self).__init__(lookups)
try:
from pymorphy2 import MorphAnalyzer
@ -99,20 +99,7 @@ class UkrainianLemmatizer(Lemmatizer):
return symbols_to_str[univ_pos]
return None
def is_base_form(self, univ_pos, morphology=None):
# TODO
raise NotImplementedError
def det(self, string, morphology=None):
return self(string, "det", morphology)
def num(self, string, morphology=None):
return self(string, "num", morphology)
def pron(self, string, morphology=None):
return self(string, "pron", morphology)
def lookup(self, string):
def lookup(self, string, orth=None):
analyses = self._morph.parse(string)
if len(analyses) == 1:
return analyses[0].normal_form

Some files were not shown because too many files have changed in this diff