Mirror of https://github.com/explosion/spaCy.git (synced 2025-02-06 22:50:33 +03:00)

Commit b31d01e8cc: Merge branch 'master' into spacy.io
.flake8 (10 lines changed)

@@ -1,10 +0,0 @@
[flake8]
ignore = E203, E266, E501, E731, W503
max-line-length = 80
select = B,C,E,F,W,T4,B9
exclude =
    .env,
    .git,
    __pycache__,
    _tokenizer_exceptions_list.py,
    spacy/__init__.py
.github/contributors/er-raoniz.md (new file, 106 lines)

@@ -0,0 +1,106 @@
# spaCy contributor agreement

This spaCy Contributor Agreement (**"SCA"**) is based on the
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
The SCA applies to any contribution that you make to any product or project
managed by us (the **"project"**), and sets out the intellectual property rights
you grant to us in the contributed materials. The term **"us"** shall mean
[ExplosionAI GmbH](https://explosion.ai/legal). The term **"you"** shall mean
the person or entity identified below.

If you agree to be bound by these terms, fill in the information requested below
and include the filled-in version with your first pull request, under the folder
[`.github/contributors/`](/.github/contributors/). The name of the file should be
your GitHub username, with the extension `.md`. For example, the user
example_user would create the file `.github/contributors/example_user.md`.

Read this agreement carefully before signing. These terms and conditions
constitute a binding legal agreement.

## Contributor Agreement

1. The term "contribution" or "contributed materials" means any source code,
   object code, patch, tool, sample, graphic, specification, manual,
   documentation, or any other material posted or submitted by you to the
   project.

2. With respect to any worldwide copyrights, or copyright applications and
   registrations, in your contribution:

   * you hereby assign to us joint ownership, and to the extent that such
     assignment is or becomes invalid, ineffective or unenforceable, you hereby
     grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
     royalty-free, unrestricted license to exercise all rights under those
     copyrights. This includes, at our option, the right to sublicense these
     same rights to third parties through multiple levels of sublicensees or
     other licensing arrangements;

   * you agree that each of us can do all things in relation to your
     contribution as if each of us were the sole owners, and if one of us makes
     a derivative work of your contribution, the one who makes the derivative
     work (or has it made) will be the sole owner of that derivative work;

   * you agree that you will not assert any moral rights in your contribution
     against us, our licensees or transferees;

   * you agree that we may register a copyright in your contribution and
     exercise all ownership rights associated with it; and

   * you agree that neither of us has any duty to consult with, obtain the
     consent of, pay or render an accounting to the other for any use or
     distribution of your contribution.

3. With respect to any patents you own, or that you can license without payment
   to any third party, you hereby grant to us a perpetual, irrevocable,
   non-exclusive, worldwide, no-charge, royalty-free license to:

   * make, have made, use, sell, offer to sell, import, and otherwise transfer
     your contribution in whole or in part, alone or in combination with or
     included in any product, work or materials arising out of the project to
     which your contribution was submitted, and

   * at our option, to sublicense these same rights to third parties through
     multiple levels of sublicensees or other licensing arrangements.

4. Except as set out above, you keep all right, title, and interest in your
   contribution. The rights that you grant to us under these terms are
   effective on the date you first submitted a contribution to us, even if your
   submission took place before the date you sign these terms.

5. You covenant, represent, warrant and agree that:

   * Each contribution that you submit is and shall be an original work of
     authorship and you can legally grant the rights set out in this SCA;

   * to the best of your knowledge, each contribution will not violate any
     third party's copyrights, trademarks, patents, or other intellectual
     property rights; and

   * each contribution shall be in compliance with U.S. export control laws and
     other applicable export and import laws. You agree to notify us if you
     become aware of any circumstance which would make any of the foregoing
     representations inaccurate in any respect. We may publicly disclose your
     participation in the project, including the fact that you have signed the
     SCA.

6. This SCA is governed by the laws of the State of California and applicable
   U.S. Federal law. Any choice of law rules will not apply.

7. Please place an "x" on one of the applicable statements below. Please do NOT
   mark both statements:

   * [x] I am signing on behalf of myself as an individual and no other person
     or entity, including my employer, has or will have rights with respect to
     my contributions.

   * [ ] I am signing on behalf of my employer or a legal entity and I have the
     actual authority to contractually bind that entity.

## Contributor Details

| Field                          | Entry                |
| ------------------------------ | -------------------- |
| Name                           | Rahul Soni           |
| Company name (if applicable)   |                      |
| Title or role (if applicable)  |                      |
| Date                           | 30th September, 2019 |
| GitHub username                | er-raoniz            |
| Website (optional)             |                      |
@@ -13,7 +13,6 @@ install:
  - "pip install -e ."
script:
  - "cat /proc/cpuinfo | grep flags | head -n 1"
  - "pip install pytest pytest-timeout"
  - "python -m pytest --tb=native spacy"
branches:
  except:
@@ -1,7 +1,8 @@
recursive-include include *.h
recursive-include spacy *.txt
include LICENSE
include README.md
include pyproject.toml
include bin/spacy
include pyproject.toml
recursive-exclude spacy/lang *.json
recursive-include spacy/lang *.json.gz
README.md (95 lines changed)

@@ -3,24 +3,26 @@
# spaCy: Industrial-strength NLP

spaCy is a library for advanced Natural Language Processing in Python and
Cython. It's built on the very latest research, and was designed from day one
to be used in real products. spaCy comes with
[pre-trained statistical models](https://spacy.io/models) and word vectors, and
Cython. It's built on the very latest research, and was designed from day one to
be used in real products. spaCy comes with
[pretrained statistical models](https://spacy.io/models) and word vectors, and
currently supports tokenization for **50+ languages**. It features
state-of-the-art speed, convolutional **neural network models** for tagging,
parsing and **named entity recognition** and easy **deep learning** integration.
It's commercial open-source software, released under the MIT license.

💫 **Version 2.1 out now!** [Check out the release notes here.](https://github.com/explosion/spaCy/releases)
💫 **Version 2.2 out now!**
[Check out the release notes here.](https://github.com/explosion/spaCy/releases)

[![Azure Pipelines](https://img.shields.io/azure-devops/build/explosion-ai/public/8/master.svg?logo=azure-devops&style=flat-square)](https://dev.azure.com/explosion-ai/public/_build?definitionId=8)
|
||||
[![Travis Build Status](https://img.shields.io/travis/explosion/spaCy/master.svg?style=flat-square&logo=travis)](https://travis-ci.org/explosion/spaCy)
|
||||
[![Current Release Version](https://img.shields.io/github/release/explosion/spacy.svg?style=flat-square)](https://github.com/explosion/spaCy/releases)
|
||||
[![pypi Version](https://img.shields.io/pypi/v/spacy.svg?style=flat-square)](https://pypi.org/project/spacy/)
|
||||
[![conda Version](https://img.shields.io/conda/vn/conda-forge/spacy.svg?style=flat-square)](https://anaconda.org/conda-forge/spacy)
|
||||
[![Azure Pipelines](<https://img.shields.io/azure-devops/build/explosion-ai/public/8/master.svg?logo=azure-devops&style=flat-square&label=build+(3.x)>)](https://dev.azure.com/explosion-ai/public/_build?definitionId=8)
|
||||
[![Travis Build Status](<https://img.shields.io/travis/explosion/spaCy/master.svg?style=flat-square&logo=travis-ci&logoColor=white&label=build+(2.7)>)](https://travis-ci.org/explosion/spaCy)
|
||||
[![Current Release Version](https://img.shields.io/github/release/explosion/spacy.svg?style=flat-square&logo=github)](https://github.com/explosion/spaCy/releases)
|
||||
[![pypi Version](https://img.shields.io/pypi/v/spacy.svg?style=flat-square&logo=pypi&logoColor=white)](https://pypi.org/project/spacy/)
|
||||
[![conda Version](https://img.shields.io/conda/vn/conda-forge/spacy.svg?style=flat-square&logo=conda-forge&logoColor=white)](https://anaconda.org/conda-forge/spacy)
|
||||
[![Python wheels](https://img.shields.io/badge/wheels-%E2%9C%93-4c1.svg?longCache=true&style=flat-square&logo=python&logoColor=white)](https://github.com/explosion/wheelwright/releases)
|
||||
[![PyPi downloads](https://img.shields.io/pypi/dm/spacy?style=flat-square)](https://pypi.org/project/spacy/)
|
||||
[![Conda downloads](https://img.shields.io/conda/dn/conda-forge/spacy?style=flat-square)](https://anaconda.org/conda-forge/spacy)
|
||||
[![PyPi downloads](https://img.shields.io/pypi/dm/spacy?style=flat-square&logo=pypi&logoColor=white)](https://pypi.org/project/spacy/)
|
||||
[![Conda downloads](https://img.shields.io/conda/dn/conda-forge/spacy?style=flat-square&logo=conda-forge&logoColor=white)](https://anaconda.org/conda-forge/spacy)
|
||||
[![Model downloads](https://img.shields.io/github/downloads/explosion/spacy-models/total?style=flat-square&label=model+downloads)](https://github.com/explosion/spacy-models)
|
||||
[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg?style=flat-square)](https://github.com/ambv/black)
|
||||
[![spaCy on Twitter](https://img.shields.io/twitter/follow/spacy_io.svg?style=social&label=Follow)](https://twitter.com/spacy_io)

@@ -30,7 +32,7 @@ It's commercial open-source software, released under the MIT license.
| --------------- | -------------------------------------------------------------- |
| [spaCy 101]     | New to spaCy? Here's everything you need to know!              |
| [Usage Guides]  | How to use spaCy and its features.                             |
| [New in v2.1]   | New features, backwards incompatibilities and migration guide. |
| [New in v2.2]   | New features, backwards incompatibilities and migration guide. |
| [API Reference] | The detailed reference for spaCy's API.                        |
| [Models]        | Download statistical language models for spaCy.                |
| [Universe]      | Libraries, extensions, demos, books and courses.               |

@@ -38,7 +40,7 @@ It's commercial open-source software, released under the MIT license.
| [Contribute]    | How to contribute to the spaCy project and code base.          |

[spacy 101]: https://spacy.io/usage/spacy-101
[new in v2.1]: https://spacy.io/usage/v2-1
[new in v2.2]: https://spacy.io/usage/v2-2
[usage guides]: https://spacy.io/usage/
[api reference]: https://spacy.io/api/
[models]: https://spacy.io/models
@@ -48,10 +50,13 @@ It's commercial open-source software, released under the MIT license.

## 💬 Where to ask questions

The spaCy project is maintained by [@honnibal](https://github.com/honnibal)
and [@ines](https://github.com/ines). Please understand that we won't be able
to provide individual support via email. We also believe that help is much more
valuable if it's shared publicly, so that more people can benefit from it.
The spaCy project is maintained by [@honnibal](https://github.com/honnibal) and
[@ines](https://github.com/ines), along with core contributors
[@svlandeg](https://github.com/svlandeg) and
[@adrianeboyd](https://github.com/adrianeboyd). Please understand that we won't
be able to provide individual support via email. We also believe that help is
much more valuable if it's shared publicly, so that more people can benefit from
it.

| Type                      | Platforms                                              |
| ------------------------- | ------------------------------------------------------ |
@@ -70,7 +75,7 @@ valuable if it's shared publicly, so that more people can benefit from it.
- Non-destructive **tokenization**
- **Named entity** recognition
- Support for **50+ languages**
- Pre-trained [statistical models](https://spacy.io/models) and word vectors
- pretrained [statistical models](https://spacy.io/models) and word vectors
- State-of-the-art speed
- Easy **deep learning** integration
- Part-of-speech tagging

@@ -91,7 +96,8 @@ valuable if it's shared publicly, so that more people can benefit from it.
For detailed installation instructions, see the
[documentation](https://spacy.io/usage).

- **Operating system**: macOS / OS X · Linux · Windows (Cygwin, MinGW, Visual Studio)
- **Operating system**: macOS / OS X · Linux · Windows (Cygwin, MinGW, Visual
  Studio)
- **Python version**: Python 2.7, 3.5+ (only 64 bit)
- **Package managers**: [pip] · [conda] (via `conda-forge`)
@@ -100,13 +106,20 @@ For detailed installation instructions, see the

### pip

Using pip, spaCy releases are available as source packages and binary wheels
(as of `v2.0.13`).
Using pip, spaCy releases are available as source packages and binary wheels (as
of `v2.0.13`).

```bash
pip install spacy
```

To install additional data tables for lemmatization in **spaCy v2.2+** you can
run `pip install spacy[lookups]` or install
[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data)
separately. The lookups package is needed to create blank models with
lemmatization data, and to lemmatize in languages that don't yet come with
pretrained models and aren't powered by third-party libraries.
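As a rough illustration of what these lookup tables are used for, here is a
minimal sketch (assuming spaCy v2.2+ with `spacy-lookups-data` installed; the
exact lemmas depend on the tables available for the language):

```python
import spacy

# A blank model has no pretrained statistical components, so English
# lemmatization relies entirely on the lookup/rule tables.
nlp = spacy.blank("en")
doc = nlp("The cats were running")
print([token.lemma_ for token in doc])  # without the tables, lemmas may just echo the tokens
```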

When using pip it is generally recommended to install packages in a virtual
environment to avoid modifying system state:
@@ -126,9 +139,9 @@ conda config --add channels conda-forge
conda install spacy
```

For the feedstock including the build recipe and configuration,
check out [this repository](https://github.com/conda-forge/spacy-feedstock).
Improvements and pull requests to the recipe and setup are always appreciated.
For the feedstock including the build recipe and configuration, check out
[this repository](https://github.com/conda-forge/spacy-feedstock). Improvements
and pull requests to the recipe and setup are always appreciated.

### Updating spaCy
@@ -151,10 +164,10 @@ with the new version.

## Download models

As of v1.7.0, models for spaCy can be installed as **Python packages**.
This means that they're a component of your application, just like any
other module. Models can be installed using spaCy's `download` command,
or manually by pointing pip to a path or URL.
As of v1.7.0, models for spaCy can be installed as **Python packages**. This
means that they're a component of your application, just like any other module.
Models can be installed using spaCy's `download` command, or manually by
pointing pip to a path or URL.
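The same steps can also be driven from Python; a minimal sketch (assumes an
internet connection and that the `en_core_web_sm` package is compatible with
the installed spaCy version):

```python
import spacy
from spacy.cli import download

download("en_core_web_sm")          # same as: python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")  # load the installed model package
doc = nlp("This is a sentence.")
print([(token.text, token.pos_) for token in doc])
```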

| Documentation   |                                                                 |
| --------------- | --------------------------------------------------------------- |
@@ -203,8 +216,8 @@ doc = nlp(u"This is a sentence.")

### Support for older versions

If you're using an older version (`v1.6.0` or below), you can still download
and install the old models from within spaCy using `python -m spacy.en.download all`
If you're using an older version (`v1.6.0` or below), you can still download and
install the old models from within spaCy using `python -m spacy.en.download all`
or `python -m spacy.de.download all`. The `.tar.gz` archives are also
[attached to the v1.6.0 release](https://github.com/explosion/spaCy/tree/v1.6.0).
To download and install the models manually, unpack the archive, drop the
@@ -219,9 +232,10 @@ source. That is the common way if you want to make changes to the code base.
You'll need to make sure that you have a development environment consisting of a
Python distribution including header files, a compiler,
[pip](https://pip.pypa.io/en/latest/installing/),
[virtualenv](https://virtualenv.pypa.io/en/latest/) and [git](https://git-scm.com)
installed. The compiler part is the trickiest. How to do that depends on your
system. See notes on Ubuntu, OS X and Windows for details.
[virtualenv](https://virtualenv.pypa.io/en/latest/) and
[git](https://git-scm.com) installed. The compiler part is the trickiest. How to
do that depends on your system. See notes on Ubuntu, OS X and Windows for
details.

```bash
# make sure you are using the latest pip
@@ -240,8 +254,8 @@ Compared to regular install via pip, [requirements.txt](requirements.txt)
additionally installs developer dependencies such as Cython. For more details
and instructions, see the documentation on
[compiling spaCy from source](https://spacy.io/usage#source) and the
[quickstart widget](https://spacy.io/usage#section-quickstart) to get
the right commands for your platform and Python version.
[quickstart widget](https://spacy.io/usage#section-quickstart) to get the right
commands for your platform and Python version.

### Ubuntu
@@ -259,11 +273,12 @@ and git preinstalled.

### Windows

Install a version of the [Visual C++ Build Tools](https://visualstudio.microsoft.com/visual-cpp-build-tools/) or
[Visual Studio Express](https://visualstudio.microsoft.com/vs/express/)
that matches the version that was used to compile your Python
interpreter. For official distributions these are VS 2008 (Python 2.7),
VS 2010 (Python 3.4) and VS 2015 (Python 3.5).
Install a version of the
[Visual C++ Build Tools](https://visualstudio.microsoft.com/visual-cpp-build-tools/)
or [Visual Studio Express](https://visualstudio.microsoft.com/vs/express/) that
matches the version that was used to compile your Python interpreter. For
official distributions these are VS 2008 (Python 2.7), VS 2010 (Python 3.4) and
VS 2015 (Python 3.5).

## Run tests
@@ -349,7 +349,7 @@ def initialize_pipeline(nlp, docs, golds, config, device):


def _load_pretrained_tok2vec(nlp, loc):
    """Load pre-trained weights for the 'token-to-vector' part of the component
    """Load pretrained weights for the 'token-to-vector' part of the component
    models, which is typically a CNN. See 'spacy pretrain'. Experimental.
    """
    with Path(loc).open("rb") as file_:

@@ -445,7 +445,7 @@ class TreebankPaths(object):
    gpu_device=("Use GPU", "option", "g", int),
    use_oracle_segments=("Use oracle segments", "flag", "G", int),
    vectors_dir=(
        "Path to directory with pre-trained vectors, named e.g. en/",
        "Path to directory with pretrained vectors, named e.g. en/",
        "option",
        "v",
        Path,
@@ -38,10 +38,10 @@ def create_kb(
    # check the length of the nlp vectors
    if "vectors" in nlp.meta and nlp.vocab.vectors.size:
        input_dim = nlp.vocab.vectors_length
        logger.info("Loaded pre-trained vectors of size %s" % input_dim)
        logger.info("Loaded pretrained vectors of size %s" % input_dim)
    else:
        raise ValueError(
            "The `nlp` object should have access to pre-trained word vectors, "
            "The `nlp` object should have access to pretrained word vectors, "
            " cf. https://spacy.io/usage/models#languages."
        )

@@ -83,7 +83,7 @@ def main(
    # check the length of the nlp vectors
    if "vectors" not in nlp.meta or not nlp.vocab.vectors.size:
        raise ValueError(
            "The `nlp` object should have access to pre-trained word vectors, "
            "The `nlp` object should have access to pretrained word vectors, "
            " cf. https://spacy.io/usage/models#languages."
        )

@@ -65,7 +65,7 @@ def main(

    # check that there is a NER component in the pipeline
    if "ner" not in nlp.pipe_names:
        raise ValueError("The `nlp` object should have a pre-trained `ner` component.")
        raise ValueError("The `nlp` object should have a pretrained `ner` component.")

    # STEP 2: create a training dataset from WP
    logger.info("STEP 2: reading training dataset from {}".format(training_path))
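The pretrained-vectors check above can be reproduced on any loaded pipeline; a
minimal sketch (assumes a model package that ships word vectors, such as
`en_core_web_md`, is installed):

```python
import spacy

nlp = spacy.load("en_core_web_md")  # assumed: a model with word vectors
if "vectors" in nlp.meta and nlp.vocab.vectors.size:
    print("pretrained vector width:", nlp.vocab.vectors_length)
else:
    print("this model has no pretrained word vectors")
```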
examples/streamlit_spacy.py (new file, 151 lines)

@@ -0,0 +1,151 @@
# coding: utf-8
|
||||
"""
|
||||
Example of a Streamlit app for an interactive spaCy model visualizer. You can
|
||||
either download the script, or point streamlit run to the raw URL of this
|
||||
file. For more details, see https://streamlit.io.
|
||||
|
||||
Installation:
|
||||
pip install streamlit
|
||||
python -m spacy download en_core_web_sm
|
||||
python -m spacy download en_core_web_md
|
||||
python -m spacy download de_core_news_sm
|
||||
|
||||
Usage:
|
||||
streamlit run streamlit_spacy.py
|
||||
"""
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import streamlit as st
|
||||
import spacy
|
||||
from spacy import displacy
|
||||
import pandas as pd
|
||||
|
||||
|
||||
SPACY_MODEL_NAMES = ["en_core_web_sm", "en_core_web_md", "de_core_news_sm"]
|
||||
DEFAULT_TEXT = "Mark Zuckerberg is the CEO of Facebook."
|
||||
HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem">{}</div>"""
|
||||
|
||||
|
||||
@st.cache(ignore_hash=True)
|
||||
def load_model(name):
|
||||
return spacy.load(name)
|
||||
|
||||
|
||||
@st.cache(ignore_hash=True)
|
||||
def process_text(model_name, text):
|
||||
nlp = load_model(model_name)
|
||||
return nlp(text)
|
||||
|
||||
|
||||
st.sidebar.title("Interactive spaCy visualizer")
|
||||
st.sidebar.markdown(
|
||||
"""
|
||||
Process text with [spaCy](https://spacy.io) models and visualize named entities,
|
||||
dependencies and more. Uses spaCy's built-in
|
||||
[displaCy](http://spacy.io/usage/visualizers) visualizer under the hood.
|
||||
"""
|
||||
)
|
||||
|
||||
spacy_model = st.sidebar.selectbox("Model name", SPACY_MODEL_NAMES)
|
||||
model_load_state = st.info(f"Loading model '{spacy_model}'...")
|
||||
nlp = load_model(spacy_model)
|
||||
model_load_state.empty()
|
||||
|
||||
text = st.text_area("Text to analyze", DEFAULT_TEXT)
|
||||
doc = process_text(spacy_model, text)
|
||||
|
||||
if "parser" in nlp.pipe_names:
|
||||
st.header("Dependency Parse & Part-of-speech tags")
|
||||
st.sidebar.header("Dependency Parse")
|
||||
split_sents = st.sidebar.checkbox("Split sentences", value=True)
|
||||
collapse_punct = st.sidebar.checkbox("Collapse punctuation", value=True)
|
||||
collapse_phrases = st.sidebar.checkbox("Collapse phrases")
|
||||
compact = st.sidebar.checkbox("Compact mode")
|
||||
options = {
|
||||
"collapse_punct": collapse_punct,
|
||||
"collapse_phrases": collapse_phrases,
|
||||
"compact": compact,
|
||||
}
|
||||
docs = [span.as_doc() for span in doc.sents] if split_sents else [doc]
|
||||
for sent in docs:
|
||||
html = displacy.render(sent, options=options)
|
||||
# Double newlines seem to mess with the rendering
|
||||
html = html.replace("\n\n", "\n")
|
||||
if split_sents and len(docs) > 1:
|
||||
st.markdown(f"> {sent.text}")
|
||||
st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True)
|
||||
|
||||
if "ner" in nlp.pipe_names:
|
||||
st.header("Named Entities")
|
||||
st.sidebar.header("Named Entities")
|
||||
label_set = nlp.get_pipe("ner").labels
|
||||
labels = st.sidebar.multiselect("Entity labels", label_set, label_set)
|
||||
html = displacy.render(doc, style="ent", options={"ents": labels})
|
||||
# Newlines seem to mess with the rendering
|
||||
html = html.replace("\n", " ")
|
||||
st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True)
|
||||
attrs = ["text", "label_", "start", "end", "start_char", "end_char"]
|
||||
if "entity_linker" in nlp.pipe_names:
|
||||
attrs.append("kb_id_")
|
||||
data = [
|
||||
[str(getattr(ent, attr)) for attr in attrs]
|
||||
for ent in doc.ents
|
||||
if ent.label_ in labels
|
||||
]
|
||||
df = pd.DataFrame(data, columns=attrs)
|
||||
st.dataframe(df)
|
||||
|
||||
|
||||
if "textcat" in nlp.pipe_names:
|
||||
st.header("Text Classification")
|
||||
st.markdown(f"> {text}")
|
||||
df = pd.DataFrame(doc.cats.items(), columns=("Label", "Score"))
|
||||
st.dataframe(df)
|
||||
|
||||
|
||||
vector_size = nlp.meta.get("vectors", {}).get("width", 0)
|
||||
if vector_size:
|
||||
st.header("Vectors & Similarity")
|
||||
st.code(nlp.meta["vectors"])
|
||||
text1 = st.text_input("Text or word 1", "apple")
|
||||
text2 = st.text_input("Text or word 2", "orange")
|
||||
doc1 = process_text(spacy_model, text1)
|
||||
doc2 = process_text(spacy_model, text2)
|
||||
similarity = doc1.similarity(doc2)
|
||||
if similarity > 0.5:
|
||||
st.success(similarity)
|
||||
else:
|
||||
st.error(similarity)
|
||||
|
||||
st.header("Token attributes")
|
||||
|
||||
if st.button("Show token attributes"):
|
||||
attrs = [
|
||||
"idx",
|
||||
"text",
|
||||
"lemma_",
|
||||
"pos_",
|
||||
"tag_",
|
||||
"dep_",
|
||||
"head",
|
||||
"ent_type_",
|
||||
"ent_iob_",
|
||||
"shape_",
|
||||
"is_alpha",
|
||||
"is_ascii",
|
||||
"is_digit",
|
||||
"is_punct",
|
||||
"like_num",
|
||||
]
|
||||
data = [[str(getattr(token, attr)) for attr in attrs] for token in doc]
|
||||
df = pd.DataFrame(data, columns=attrs)
|
||||
st.dataframe(df)
|
||||
|
||||
|
||||
st.header("JSON Doc")
|
||||
if st.button("Show JSON Doc"):
|
||||
st.json(doc.to_json())
|
||||
|
||||
st.header("JSON model meta")
|
||||
if st.button("Show JSON model meta"):
|
||||
st.json(nlp.meta)
|
@@ -27,7 +27,7 @@ from bin.wiki_entity_linking.train_descriptions import EntityEncoder
# Q7381115 (Russ Cochran): publisher
ENTITIES = {"Q2146908": ("American golfer", 342), "Q7381115": ("publisher", 17)}

INPUT_DIM = 300  # dimension of pre-trained input vectors
INPUT_DIM = 300  # dimension of pretrained input vectors
DESC_WIDTH = 64  # dimension of output entity vectors


@@ -39,7 +39,7 @@ DESC_WIDTH = 64  # dimension of output entity vectors
)
def main(vocab_path=None, model=None, output_dir=None, n_iter=50):
    """Load the model, create the KB and pretrain the entity encodings.
    Either an nlp model or a vocab is needed to provide access to pre-trained word embeddings.
    Either an nlp model or a vocab is needed to provide access to pretrained word embeddings.
    If an output_dir is provided, the KB will be stored there in a file 'kb'.
    When providing an nlp model, the updated vocab will also be written to a directory in the output_dir."""
    if model is None and vocab_path is None:

@@ -1,9 +1,9 @@
"""This script is experimental.

Try pre-training the CNN component of the text categorizer using a cheap
language modelling-like objective. Specifically, we load pre-trained vectors
language modelling-like objective. Specifically, we load pretrained vectors
(from something like word2vec, GloVe, FastText etc), and use the CNN to
predict the tokens' pre-trained vectors. This isn't as easy as it sounds:
predict the tokens' pretrained vectors. This isn't as easy as it sounds:
we're not merely doing compression here, because heavy dropout is applied,
including over the input words. This means the model must often (50% of the time)
use the context in order to predict the word.

@@ -2,7 +2,7 @@
# coding: utf8
"""Example of training an additional entity type

This script shows how to add a new entity type to an existing pre-trained NER
This script shows how to add a new entity type to an existing pretrained NER
model. To keep the example short and simple, only four sentences are provided
as examples. In practice, you'll need many more — a few hundred would be a
good start. You will also likely need to mix in examples of other entity
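The core idea of that example script, reduced to a hedged sketch (model name
and label are placeholders, and real annotated training examples are still
required before the new label is useful):

```python
import spacy

nlp = spacy.load("en_core_web_sm")   # assumed: an installed pretrained pipeline
ner = nlp.get_pipe("ner")
ner.add_label("ANIMAL")              # register the new entity type
optimizer = nlp.resume_training()    # keep the existing weights while updating
# ... then call nlp.update(...) repeatedly with examples containing ANIMAL spans
```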
fabfile.py (122 lines changed)

@@ -10,113 +10,145 @@ import sys
|||
|
||||
|
||||
PWD = path.dirname(__file__)
|
||||
ENV = environ['VENV_DIR'] if 'VENV_DIR' in environ else '.env'
|
||||
ENV = environ["VENV_DIR"] if "VENV_DIR" in environ else ".env"
|
||||
VENV_DIR = Path(PWD) / ENV
|
||||
|
||||
|
||||
@contextlib.contextmanager
|
||||
def virtualenv(name, create=False, python='/usr/bin/python3.6'):
|
||||
def virtualenv(name, create=False, python="/usr/bin/python3.6"):
|
||||
python = Path(python).resolve()
|
||||
env_path = VENV_DIR
|
||||
if create:
|
||||
if env_path.exists():
|
||||
shutil.rmtree(str(env_path))
|
||||
local('{python} -m venv {env_path}'.format(python=python, env_path=VENV_DIR))
|
||||
local("{python} -m venv {env_path}".format(python=python, env_path=VENV_DIR))
|
||||
|
||||
def wrapped_local(cmd, env_vars=[], capture=False, direct=False):
|
||||
return local('source {}/bin/activate && {}'.format(env_path, cmd),
|
||||
shell='/bin/bash', capture=False)
|
||||
return local(
|
||||
"source {}/bin/activate && {}".format(env_path, cmd),
|
||||
shell="/bin/bash",
|
||||
capture=False,
|
||||
)
|
||||
|
||||
yield wrapped_local
|
||||
|
||||
|
||||
def env(lang='python3.6'):
|
||||
def env(lang="python3.6"):
|
||||
if VENV_DIR.exists():
|
||||
local('rm -rf {env}'.format(env=VENV_DIR))
|
||||
if lang.startswith('python3'):
|
||||
local('{lang} -m venv {env}'.format(lang=lang, env=VENV_DIR))
|
||||
local("rm -rf {env}".format(env=VENV_DIR))
|
||||
if lang.startswith("python3"):
|
||||
local("{lang} -m venv {env}".format(lang=lang, env=VENV_DIR))
|
||||
else:
|
||||
local('{lang} -m pip install virtualenv --no-cache-dir'.format(lang=lang))
|
||||
local('{lang} -m virtualenv {env} --no-cache-dir'.format(lang=lang, env=VENV_DIR))
|
||||
local("{lang} -m pip install virtualenv --no-cache-dir".format(lang=lang))
|
||||
local(
|
||||
"{lang} -m virtualenv {env} --no-cache-dir".format(lang=lang, env=VENV_DIR)
|
||||
)
|
||||
with virtualenv(VENV_DIR) as venv_local:
|
||||
print(venv_local('python --version', capture=True))
|
||||
venv_local('pip install --upgrade setuptools --no-cache-dir')
|
||||
venv_local('pip install pytest --no-cache-dir')
|
||||
venv_local('pip install wheel --no-cache-dir')
|
||||
venv_local('pip install -r requirements.txt --no-cache-dir')
|
||||
venv_local('pip install pex --no-cache-dir')
|
||||
|
||||
print(venv_local("python --version", capture=True))
|
||||
venv_local("pip install --upgrade setuptools --no-cache-dir")
|
||||
venv_local("pip install pytest --no-cache-dir")
|
||||
venv_local("pip install wheel --no-cache-dir")
|
||||
venv_local("pip install -r requirements.txt --no-cache-dir")
|
||||
venv_local("pip install pex --no-cache-dir")
|
||||
|
||||
|
||||
def install():
|
||||
with virtualenv(VENV_DIR) as venv_local:
|
||||
venv_local('pip install dist/*.tar.gz')
|
||||
venv_local("pip install dist/*.tar.gz")
|
||||
|
||||
|
||||
def make():
|
||||
with lcd(path.dirname(__file__)):
|
||||
local('export PYTHONPATH=`pwd` && source .env/bin/activate && python setup.py build_ext --inplace',
|
||||
shell='/bin/bash')
|
||||
local(
|
||||
"export PYTHONPATH=`pwd` && source .env/bin/activate && python setup.py build_ext --inplace",
|
||||
shell="/bin/bash",
|
||||
)
|
||||
|
||||
|
||||
def sdist():
|
||||
with virtualenv(VENV_DIR) as venv_local:
|
||||
with lcd(path.dirname(__file__)):
|
||||
local('python -m pip install -U setuptools')
|
||||
local('python setup.py sdist')
|
||||
venv_local("python -m pip install -U setuptools srsly")
|
||||
venv_local("python setup.py sdist")
|
||||
|
||||
|
||||
def wheel():
|
||||
with virtualenv(VENV_DIR) as venv_local:
|
||||
with lcd(path.dirname(__file__)):
|
||||
venv_local('python setup.py bdist_wheel')
|
||||
venv_local("python setup.py bdist_wheel")
|
||||
|
||||
|
||||
def pex():
|
||||
with virtualenv(VENV_DIR) as venv_local:
|
||||
with lcd(path.dirname(__file__)):
|
||||
sha = local('git rev-parse --short HEAD', capture=True)
|
||||
venv_local('pex dist/*.whl -e spacy -o dist/spacy-%s.pex' % sha,
|
||||
direct=True)
|
||||
sha = local("git rev-parse --short HEAD", capture=True)
|
||||
venv_local(
|
||||
"pex dist/*.whl -e spacy -o dist/spacy-%s.pex" % sha, direct=True
|
||||
)
|
||||
|
||||
|
||||
def clean():
|
||||
with lcd(path.dirname(__file__)):
|
||||
local('rm -f dist/*.whl')
|
||||
local('rm -f dist/*.pex')
|
||||
local("rm -f dist/*.whl")
|
||||
local("rm -f dist/*.pex")
|
||||
with virtualenv(VENV_DIR) as venv_local:
|
||||
venv_local('python setup.py clean --all')
|
||||
venv_local("python setup.py clean --all")
|
||||
|
||||
|
||||
def test():
|
||||
with virtualenv(VENV_DIR) as venv_local:
|
||||
with lcd(path.dirname(__file__)):
|
||||
venv_local('pytest -x spacy/tests')
|
||||
venv_local("pytest -x spacy/tests")
|
||||
|
||||
|
||||
def train():
|
||||
args = environ.get('SPACY_TRAIN_ARGS', '')
|
||||
args = environ.get("SPACY_TRAIN_ARGS", "")
|
||||
with virtualenv(VENV_DIR) as venv_local:
|
||||
venv_local('spacy train {args}'.format(args=args))
|
||||
venv_local("spacy train {args}".format(args=args))
|
||||
|
||||
|
||||
def conll17(treebank_dir, experiment_dir, vectors_dir, config, corpus=''):
|
||||
is_not_clean = local('git status --porcelain', capture=True)
|
||||
def conll17(treebank_dir, experiment_dir, vectors_dir, config, corpus=""):
|
||||
is_not_clean = local("git status --porcelain", capture=True)
|
||||
if is_not_clean:
|
||||
print("Repository is not clean")
|
||||
print(is_not_clean)
|
||||
sys.exit(1)
|
||||
git_sha = local('git rev-parse --short HEAD', capture=True)
|
||||
config_checksum = local('sha256sum {config}'.format(config=config), capture=True)
|
||||
experiment_dir = Path(experiment_dir) / '{}--{}'.format(config_checksum[:6], git_sha)
|
||||
git_sha = local("git rev-parse --short HEAD", capture=True)
|
||||
config_checksum = local("sha256sum {config}".format(config=config), capture=True)
|
||||
experiment_dir = Path(experiment_dir) / "{}--{}".format(
|
||||
config_checksum[:6], git_sha
|
||||
)
|
||||
if not experiment_dir.exists():
|
||||
experiment_dir.mkdir()
|
||||
test_data_dir = Path(treebank_dir) / 'ud-test-v2.0-conll2017'
|
||||
test_data_dir = Path(treebank_dir) / "ud-test-v2.0-conll2017"
|
||||
assert test_data_dir.exists()
|
||||
assert test_data_dir.is_dir()
|
||||
if corpus:
|
||||
corpora = [corpus]
|
||||
else:
|
||||
corpora = ['UD_English', 'UD_Chinese', 'UD_Japanese', 'UD_Vietnamese']
|
||||
corpora = ["UD_English", "UD_Chinese", "UD_Japanese", "UD_Vietnamese"]
|
||||
|
||||
local('cp {config} {experiment_dir}/config.json'.format(config=config, experiment_dir=experiment_dir))
|
||||
local(
|
||||
"cp {config} {experiment_dir}/config.json".format(
|
||||
config=config, experiment_dir=experiment_dir
|
||||
)
|
||||
)
|
||||
with virtualenv(VENV_DIR) as venv_local:
|
||||
for corpus in corpora:
|
||||
venv_local('spacy ud-train {treebank_dir} {experiment_dir} {config} {corpus} -v {vectors_dir}'.format(
|
||||
treebank_dir=treebank_dir, experiment_dir=experiment_dir, config=config, corpus=corpus, vectors_dir=vectors_dir))
|
||||
venv_local('spacy ud-run-test {test_data_dir} {experiment_dir} {corpus}'.format(
|
||||
test_data_dir=test_data_dir, experiment_dir=experiment_dir, config=config, corpus=corpus))
|
||||
venv_local(
|
||||
"spacy ud-train {treebank_dir} {experiment_dir} {config} {corpus} -v {vectors_dir}".format(
|
||||
treebank_dir=treebank_dir,
|
||||
experiment_dir=experiment_dir,
|
||||
config=config,
|
||||
corpus=corpus,
|
||||
vectors_dir=vectors_dir,
|
||||
)
|
||||
)
|
||||
venv_local(
|
||||
"spacy ud-run-test {test_data_dir} {experiment_dir} {corpus}".format(
|
||||
test_data_dir=test_data_dir,
|
||||
experiment_dir=experiment_dir,
|
||||
config=config,
|
||||
corpus=corpus,
|
||||
)
|
||||
)
|
||||
|
|
@@ -1,10 +1,3 @@
[build-system]
requires = ["setuptools",
    "wheel>0.32.0,<0.33.0",
    "Cython",
    "cymem>=2.0.2,<2.1.0",
    "preshed>=2.0.1,<2.1.0",
    "murmurhash>=0.28.0,<1.1.0",
    "thinc>=7.0.8,<7.1.0",
]
requires = ["setuptools"]
build-backend = "setuptools.build_meta"
@@ -15,7 +15,7 @@ pathlib==1.0.1; python_version < "3.4"
jsonschema>=2.6.0,<3.1.0
# Development dependencies
cython>=0.25
pytest>=4.0.0,<4.1.0
pytest>=4.6.5
pytest-timeout>=1.3.0,<2.0.0
mock>=2.0.0,<3.0.0
flake8>=3.5.0,<3.6.0
setup.cfg (new file, 102 lines)

@@ -0,0 +1,102 @@
[metadata]
|
||||
description = Industrial-strength Natural Language Processing (NLP) in Python
|
||||
url = https://spacy.io
|
||||
author = Explosion
|
||||
author_email = contact@explosion.ai
|
||||
license = MIT
|
||||
long_description = file: README.md
|
||||
long_description_content_type = text/markdown
|
||||
classifiers =
|
||||
Development Status :: 5 - Production/Stable
|
||||
Environment :: Console
|
||||
Intended Audience :: Developers
|
||||
Intended Audience :: Science/Research
|
||||
License :: OSI Approved :: MIT License
|
||||
Operating System :: POSIX :: Linux
|
||||
Operating System :: MacOS :: MacOS X
|
||||
Operating System :: Microsoft :: Windows
|
||||
Programming Language :: Cython
|
||||
Programming Language :: Python :: 2
|
||||
Programming Language :: Python :: 2.7
|
||||
Programming Language :: Python :: 3
|
||||
Programming Language :: Python :: 3.5
|
||||
Programming Language :: Python :: 3.6
|
||||
Programming Language :: Python :: 3.7
|
||||
Topic :: Scientific/Engineering
|
||||
|
||||
[options]
|
||||
zip_safe = false
|
||||
include_package_data = true
|
||||
scripts =
|
||||
bin/spacy
|
||||
python_requires = >=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*
|
||||
setup_requires =
|
||||
wheel
|
||||
cython>=0.25
|
||||
# We also need our Cython packages here to compile against
|
||||
cymem>=2.0.2,<2.1.0
|
||||
preshed>=3.0.2,<3.1.0
|
||||
murmurhash>=0.28.0,<1.1.0
|
||||
thinc>=7.1.1,<7.2.0
|
||||
install_requires =
|
||||
numpy>=1.15.0
|
||||
murmurhash>=0.28.0,<1.1.0
|
||||
cymem>=2.0.2,<2.1.0
|
||||
preshed>=3.0.2,<3.1.0
|
||||
thinc>=7.1.1,<7.2.0
|
||||
blis>=0.4.0,<0.5.0
|
||||
plac<1.0.0,>=0.9.6
|
||||
requests>=2.13.0,<3.0.0
|
||||
wasabi>=0.2.0,<1.1.0
|
||||
srsly>=0.1.0,<1.1.0
|
||||
pathlib==1.0.1; python_version < "3.4"
|
||||
|
||||
[options.extras_require]
|
||||
lookups =
|
||||
spacy_lookups_data>=0.0.5<0.2.0
|
||||
cuda =
|
||||
thinc_gpu_ops>=0.0.1,<0.1.0
|
||||
cupy>=5.0.0b4
|
||||
cuda80 =
|
||||
thinc_gpu_ops>=0.0.1,<0.1.0
|
||||
cupy-cuda80>=5.0.0b4
|
||||
cuda90 =
|
||||
thinc_gpu_ops>=0.0.1,<0.1.0
|
||||
cupy-cuda90>=5.0.0b4
|
||||
cuda91 =
|
||||
thinc_gpu_ops>=0.0.1,<0.1.0
|
||||
cupy-cuda91>=5.0.0b4
|
||||
cuda92 =
|
||||
thinc_gpu_ops>=0.0.1,<0.1.0
|
||||
cupy-cuda92>=5.0.0b4
|
||||
cuda100 =
|
||||
thinc_gpu_ops>=0.0.1,<0.1.0
|
||||
cupy-cuda100>=5.0.0b4
|
||||
# Language tokenizers with external dependencies
|
||||
ja =
|
||||
mecab-python3==0.7
|
||||
ko =
|
||||
natto-py==0.9.0
|
||||
th =
|
||||
pythainlp>=2.0
|
||||
|
||||
[bdist_wheel]
|
||||
universal = false
|
||||
|
||||
[sdist]
|
||||
formats = gztar
|
||||
|
||||
[flake8]
|
||||
ignore = E203, E266, E501, E731, W503
|
||||
max-line-length = 80
|
||||
select = B,C,E,F,W,T4,B9
|
||||
exclude =
|
||||
.env,
|
||||
.git,
|
||||
__pycache__,
|
||||
_tokenizer_exceptions_list.py,
|
||||
spacy/__init__.py
|
||||
|
||||
[tool:pytest]
|
||||
markers =
|
||||
slow
|
setup.py (97 lines changed)

@@ -27,9 +27,6 @@ def is_new_osx():
|||
return False
|
||||
|
||||
|
||||
PACKAGE_DATA = {"": ["*.pyx", "*.pxd", "*.txt", "*.tokens", "*.json"]}
|
||||
|
||||
|
||||
PACKAGES = find_packages()
|
||||
|
||||
|
||||
|
@ -86,22 +83,6 @@ if is_new_osx():
|
|||
LINK_OPTIONS["other"].append("-nodefaultlibs")
|
||||
|
||||
|
||||
USE_OPENMP_DEFAULT = "0" if sys.platform != "darwin" else None
|
||||
if os.environ.get("USE_OPENMP", USE_OPENMP_DEFAULT) == "1":
|
||||
if sys.platform == "darwin":
|
||||
COMPILE_OPTIONS["other"].append("-fopenmp")
|
||||
LINK_OPTIONS["other"].append("-fopenmp")
|
||||
PACKAGE_DATA["spacy.platform.darwin.lib"] = ["*.dylib"]
|
||||
PACKAGES.append("spacy.platform.darwin.lib")
|
||||
|
||||
elif sys.platform == "win32":
|
||||
COMPILE_OPTIONS["msvc"].append("/openmp")
|
||||
|
||||
else:
|
||||
COMPILE_OPTIONS["other"].append("-fopenmp")
|
||||
LINK_OPTIONS["other"].append("-fopenmp")
|
||||
|
||||
|
||||
# By subclassing build_extensions we have the actual compiler that will be used which is really known only after finalize_options
|
||||
# http://stackoverflow.com/questions/724664/python-distutils-how-to-get-a-compiler-that-is-going-to-be-used
|
||||
class build_ext_options:
|
||||
|
@ -132,23 +113,6 @@ def generate_cython(root, source):
|
|||
raise RuntimeError("Running cythonize failed")
|
||||
|
||||
|
||||
def gzip_language_data(root, source):
|
||||
print("Compressing language data")
|
||||
import srsly
|
||||
from pathlib import Path
|
||||
|
||||
base = Path(root) / source
|
||||
for jsonfile in base.glob("**/*.json"):
|
||||
outfile = jsonfile.with_suffix(jsonfile.suffix + ".gz")
|
||||
if outfile.is_file() and outfile.stat().st_mtime > jsonfile.stat().st_mtime:
|
||||
# If the gz is newer it doesn't need updating
|
||||
print("Skipping {}, already compressed".format(jsonfile))
|
||||
continue
|
||||
data = srsly.read_json(jsonfile)
|
||||
srsly.write_gzip_json(outfile, data)
|
||||
print("Compressed {}".format(jsonfile))
|
||||
|
||||
|
||||
def is_source_release(path):
|
||||
return os.path.exists(os.path.join(path, "PKG-INFO"))
|
||||
|
||||
|
@ -185,9 +149,6 @@ def setup_package():
|
|||
about = {}
|
||||
exec(f.read(), about)
|
||||
|
||||
with io.open(os.path.join(root, "README.md"), encoding="utf8") as f:
|
||||
readme = f.read()
|
||||
|
||||
include_dirs = [
|
||||
get_python_inc(plat_specific=True),
|
||||
os.path.join(root, "include"),
|
||||
|
@ -203,7 +164,6 @@ def setup_package():
|
|||
for mod_name in MOD_NAMES:
|
||||
mod_path = mod_name.replace(".", "/") + ".cpp"
|
||||
extra_link_args = []
|
||||
extra_compile_args = []
|
||||
# ???
|
||||
# Imported from patch from @mikepb
|
||||
# See Issue #267. Running blind here...
|
||||
|
@ -224,69 +184,12 @@ def setup_package():
|
|||
|
||||
if not is_source_release(root):
|
||||
generate_cython(root, "spacy")
|
||||
gzip_language_data(root, "spacy/lang")
|
||||
|
||||
setup(
|
||||
name="spacy",
|
||||
zip_safe=False,
|
||||
packages=PACKAGES,
|
||||
package_data=PACKAGE_DATA,
|
||||
description=about["__summary__"],
|
||||
long_description=readme,
|
||||
long_description_content_type="text/markdown",
|
||||
author=about["__author__"],
|
||||
author_email=about["__email__"],
|
||||
version=about["__version__"],
|
||||
url=about["__uri__"],
|
||||
license=about["__license__"],
|
||||
ext_modules=ext_modules,
|
||||
scripts=["bin/spacy"],
|
||||
install_requires=[
|
||||
"numpy>=1.15.0",
|
||||
"murmurhash>=0.28.0,<1.1.0",
|
||||
"cymem>=2.0.2,<2.1.0",
|
||||
"preshed>=2.0.1,<2.1.0",
|
||||
"thinc>=7.0.8,<7.1.0",
|
||||
"blis>=0.2.2,<0.3.0",
|
||||
"plac<1.0.0,>=0.9.6",
|
||||
"requests>=2.13.0,<3.0.0",
|
||||
"wasabi>=0.2.0,<1.1.0",
|
||||
"srsly>=0.1.0,<1.1.0",
|
||||
'pathlib==1.0.1; python_version < "3.4"',
|
||||
],
|
||||
setup_requires=["wheel"],
|
||||
extras_require={
|
||||
"cuda": ["thinc_gpu_ops>=0.0.1,<0.1.0", "cupy>=5.0.0b4"],
|
||||
"cuda80": ["thinc_gpu_ops>=0.0.1,<0.1.0", "cupy-cuda80>=5.0.0b4"],
|
||||
"cuda90": ["thinc_gpu_ops>=0.0.1,<0.1.0", "cupy-cuda90>=5.0.0b4"],
|
||||
"cuda91": ["thinc_gpu_ops>=0.0.1,<0.1.0", "cupy-cuda91>=5.0.0b4"],
|
||||
"cuda92": ["thinc_gpu_ops>=0.0.1,<0.1.0", "cupy-cuda92>=5.0.0b4"],
|
||||
"cuda100": ["thinc_gpu_ops>=0.0.1,<0.1.0", "cupy-cuda100>=5.0.0b4"],
|
||||
# Language tokenizers with external dependencies
|
||||
"ja": ["mecab-python3==0.7"],
|
||||
"ko": ["natto-py==0.9.0"],
|
||||
"th": ["pythainlp>=2.0"],
|
||||
},
|
||||
python_requires=">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*",
|
||||
classifiers=[
|
||||
"Development Status :: 5 - Production/Stable",
|
||||
"Environment :: Console",
|
||||
"Intended Audience :: Developers",
|
||||
"Intended Audience :: Science/Research",
|
||||
"License :: OSI Approved :: MIT License",
|
||||
"Operating System :: POSIX :: Linux",
|
||||
"Operating System :: MacOS :: MacOS X",
|
||||
"Operating System :: Microsoft :: Windows",
|
||||
"Programming Language :: Cython",
|
||||
"Programming Language :: Python :: 2",
|
||||
"Programming Language :: Python :: 2.7",
|
||||
"Programming Language :: Python :: 3",
|
||||
"Programming Language :: Python :: 3.4",
|
||||
"Programming Language :: Python :: 3.5",
|
||||
"Programming Language :: Python :: 3.6",
|
||||
"Programming Language :: Python :: 3.7",
|
||||
"Topic :: Scientific/Engineering",
|
||||
],
|
||||
cmdclass={"build_ext": build_ext_subclass},
|
||||
)
|
||||
|
||||
|
|
@@ -1,17 +1,7 @@
# inspired from:
# https://python-packaging-user-guide.readthedocs.org/en/latest/single_source_version/
# https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py
# fmt: off

__title__ = "spacy"
__version__ = "2.1.8"
__summary__ = "Industrial-strength Natural Language Processing (NLP) with Python and Cython"
__uri__ = "https://spacy.io"
__author__ = "Explosion AI"
__email__ = "contact@explosion.ai"
__license__ = "MIT"
__version__ = "2.2.0"
__release__ = True

__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
__shortcuts__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/shortcuts-v2.json"
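Since the compatibility table referenced above is plain JSON, it can be
inspected directly; a hedged sketch (the exact layout of `compatibility.json`
is an assumption):

```python
import requests
from spacy import about

compat = requests.get(about.__compatibility__).json()
# assumed layout: {"spacy": {"<spacy version>": {"<model name>": ["<model versions>", ...]}}}
table = compat.get("spacy", {})
print(table.get(about.__version__, {}))
```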
@@ -96,9 +96,9 @@ def pretrain(
    """
    Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
    using an approximate language-modelling objective. Specifically, we load
    pre-trained vectors, and train a component like a CNN, BiLSTM, etc to predict
    vectors which match the pre-trained ones. The weights are saved to a directory
    after each epoch. You can then pass a path to one of these pre-trained weights
    pretrained vectors, and train a component like a CNN, BiLSTM, etc to predict
    vectors which match the pretrained ones. The weights are saved to a directory
    after each epoch. You can then pass a path to one of these pretrained weights
    files to the 'spacy train' command.

    This technique may be especially helpful if you have little labelled data.

@@ -156,7 +156,7 @@ def pretrain(
            subword_features=True,  # Set to False for Chinese etc
        ),
    )
    # Load in pre-trained weights
    # Load in pretrained weights
    if init_tok2vec is not None:
        components = _load_pretrained_tok2vec(nlp, init_tok2vec)
        msg.text("Loaded pretrained tok2vec for: {}".format(components))
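Strung together, the workflow described in the docstring looks roughly like the
sketch below. The paths, the vectors model name, the saved weights file name and
the `--init-tok2vec` flag spelling are all assumptions; check
`python -m spacy pretrain --help` and `python -m spacy train --help` on your
version for the exact arguments.

```python
import subprocess

# 1) Pretrain a tok2vec layer on raw text, saving weights after each epoch.
subprocess.run(
    ["python", "-m", "spacy", "pretrain", "texts.jsonl", "en_vectors_web_lg", "pretrain_out"],
    check=True,
)
# 2) Use one of the saved weights files to initialise supervised training.
subprocess.run(
    ["python", "-m", "spacy", "train", "en", "model_out", "train.json", "dev.json",
     "--init-tok2vec", "pretrain_out/model99.bin"],
    check=True,
)
```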
@@ -222,7 +222,7 @@ def train(

    nlp._optimizer = None

    # Load in pre-trained weights
    # Load in pretrained weights
    if init_tok2vec is not None:
        components = _load_pretrained_tok2vec(nlp, init_tok2vec)
        msg.text("Loaded pretrained tok2vec for: {}".format(components))

@@ -417,7 +417,7 @@ def _load_vectors(nlp, vectors):


def _load_pretrained_tok2vec(nlp, loc):
    """Load pre-trained weights for the 'token-to-vector' part of the component
    """Load pretrained weights for the 'token-to-vector' part of the component
    models, which is typically a CNN. See 'spacy pretrain'. Experimental.
    """
    with loc.open("rb") as file_:
@@ -5,7 +5,7 @@ import uuid

from .templates import TPL_DEP_SVG, TPL_DEP_WORDS, TPL_DEP_ARCS, TPL_ENTS
from .templates import TPL_ENT, TPL_ENT_RTL, TPL_FIGURE, TPL_TITLE, TPL_PAGE
from ..util import minify_html, escape_html, get_entry_points
from ..util import minify_html, escape_html, get_entry_points, ENTRY_POINTS
from ..errors import Errors


@@ -242,7 +242,7 @@ class EntityRenderer(object):
            "CARDINAL": "#e4e7d2",
            "PERCENT": "#e4e7d2",
        }
        user_colors = get_entry_points("spacy_displacy_colors")
        user_colors = get_entry_points(ENTRY_POINTS.displacy_colors)
        for user_color in user_colors.values():
            colors.update(user_color)
        colors.update(options.get("colors", {}))
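The `spacy_displacy_colors` entry point group referenced here is what
third-party packages hook into. A hedged sketch of how an add-on package might
register custom colours (package, module and variable names are illustrative):

```python
# setup.py of a hypothetical add-on package
from setuptools import setup

setup(
    name="my-displacy-colors",
    packages=["my_displacy_colors"],
    entry_points={
        "spacy_displacy_colors": [
            "my_colors = my_displacy_colors:displacy_colors",
        ]
    },
)

# my_displacy_colors/__init__.py
displacy_colors = {"MY_LABEL": "#3dff74"}  # maps entity labels to colours
```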
@@ -84,6 +84,17 @@ class Warnings(object):
    W018 = ("Entity '{entity}' already exists in the Knowledge base.")
    W019 = ("Changing vectors name from {old} to {new}, to avoid clash with "
            "previously loaded vectors. See Issue #3853.")
    W020 = ("Unnamed vectors. This won't allow multiple vectors models to be "
            "loaded. (Shape: {shape})")
    W021 = ("Unexpected hash collision in PhraseMatcher. Matches may be "
            "incorrect. Modify PhraseMatcher._terminal_hash to fix.")
    W022 = ("Training a new part-of-speech tagger using a model with no "
            "lemmatization rules or data. This means that the trained model "
            "may not be able to lemmatize correctly. If this is intentional "
            "or the language you're using doesn't have lemmatization data, "
            "you can ignore this warning by setting SPACY_WARNING_IGNORE=W022. "
            "If this is surprising, make sure you have the spacy-lookups-data "
            "package installed.")


@add_codes
@@ -313,7 +324,9 @@ class Errors(object):
    E101 = ("NODE_NAME should be a new node and NBOR_NAME should already have "
            "have been declared in previous edges.")
    E102 = ("Can't merge non-disjoint spans. '{token}' is already part of "
            "tokens to merge.")
            "tokens to merge. If you want to find the longest non-overlapping "
            "spans, you can use the util.filter_spans helper:\n"
            "https://spacy.io/api/top-level#util.filter_spans")
    E103 = ("Trying to set conflicting doc.ents: '{span1}' and '{span2}'. A "
            "token can only be part of one entity, so make sure the entities "
            "you're setting don't overlap.")
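For reference, the `util.filter_spans` helper mentioned in E102 can be used
like this (a minimal sketch; the span indices simply assume the tokenization
shown):

```python
import spacy
from spacy.util import filter_spans

nlp = spacy.blank("en")
doc = nlp("New York City is in New York State")
candidates = [doc[0:3], doc[1:3], doc[5:8]]  # overlapping candidate spans
longest = filter_spans(candidates)           # keeps the longest non-overlapping spans
with doc.retokenize() as retokenizer:
    for span in longest:
        retokenizer.merge(span)
print([token.text for token in doc])
```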
@@ -343,7 +356,7 @@ class Errors(object):
    E113 = ("The newly split token can only have one root (head = 0).")
    E114 = ("The newly split token needs to have a root (head = 0).")
    E115 = ("All subtokens must have associated heads.")
    E116 = ("Cannot currently add labels to pre-trained text classifier. Add "
    E116 = ("Cannot currently add labels to pretrained text classifier. Add "
            "labels before training begins. This functionality was available "
            "in previous versions, but had significant bugs that led to poor "
            "performance.")
@ -457,17 +470,42 @@ class Errors(object):
|
|||
E160 = ("Can't find language data file: {path}")
|
||||
E161 = ("Found an internal inconsistency when predicting entity links. "
|
||||
"This is likely a bug in spaCy, so feel free to open an issue.")
|
||||
E162 = ("Cannot evaluate textcat model on data with different labels.\n"
|
||||
"Labels in model: {model_labels}\nLabels in evaluation "
|
||||
"data: {eval_labels}")
|
||||
E163 = ("cumsum was found to be unstable: its last element does not "
|
||||
"correspond to sum")
|
||||
E164 = ("x is neither increasing nor decreasing: {}.")
|
||||
E165 = ("Only one class present in y_true. ROC AUC score is not defined in "
|
||||
"that case.")
|
||||
E166 = ("Can only merge DocBins with the same pre-defined attributes.\n"
|
||||
"Current DocBin: {current}\nOther DocBin: {other}")
|
||||
E167 = ("Unknown morphological feature: '{feat}' ({feat_id}). This can "
|
||||
"happen if the tagger was trained with a different set of "
|
||||
"morphological features. If you're using a pretrained model, make "
|
||||
"sure that your models are up to date:\npython -m spacy validate")
|
||||
E168 = ("Unknown field: {field}")
|
||||
E169 = ("Can't find module: {module}")
|
||||
E170 = ("Cannot apply transition {name}: invalid for the current state.")
|
||||
E171 = ("Matcher.add received invalid on_match callback argument: expected "
|
||||
"callable or None, but got: {arg_type}")
|
||||
E172 = ("The Lemmatizer.load classmethod is deprecated. To create a "
|
||||
"Lemmatizer, initialize the class directly. See the docs for "
|
||||
"details: https://spacy.io/api/lemmatizer")
|
||||
E173 = ("As of v2.2, the Lemmatizer is initialized with an instance of "
|
||||
"Lookups containing the lemmatization tables. See the docs for "
|
||||
"details: https://spacy.io/api/lemmatizer#init")
|
||||
|
||||
|
||||
@add_codes
|
||||
class TempErrors(object):
|
||||
T003 = ("Resizing pre-trained Tagger models is not currently supported.")
|
||||
T003 = ("Resizing pretrained Tagger models is not currently supported.")
|
||||
T004 = ("Currently parser depth is hard-coded to 1. Received: {value}.")
|
||||
T007 = ("Can't yet set {attr} from Span. Vote for this feature on the "
|
||||
"issue tracker: http://github.com/explosion/spaCy/issues")
|
||||
T008 = ("Bad configuration of Tagger. This is probably a bug within "
|
||||
"spaCy. We changed the name of an internal attribute for loading "
|
||||
"pre-trained vectors, and the class has been passed the old name "
|
||||
"pretrained vectors, and the class has been passed the old name "
|
||||
"(pretrained_dims) but not the new name (pretrained_vectors).")
|
||||
|
||||
|
||||
|
|
|
@ -21,8 +21,6 @@ class BengaliDefaults(Language.Defaults):
|
|||
prefixes = TOKENIZER_PREFIXES
|
||||
suffixes = TOKENIZER_SUFFIXES
|
||||
infixes = TOKENIZER_INFIXES
|
||||
# Lemma rules: উচ্চতর বাংলা ব্যাকরণ ও রচনা - অধ্যাপক নিরঞ্জন অধিকারী ও অধ্যাপক ড. সফিউদ্দিন আহমদ
|
||||
resources = {"lemma_rules": "lemma_rules.json"}
|
||||
|
||||
|
||||
class Bengali(Language):
|
||||
|
|
|
@ -1,57 +0,0 @@
|
|||
{
|
||||
"noun": [
|
||||
["টা", ""],
|
||||
["টি", ""],
|
||||
["খান", ""],
|
||||
["খানা", ""],
|
||||
["খানি", ""],
|
||||
["গাছা", ""],
|
||||
["গাছি", ""],
|
||||
["ছড়া", ""],
|
||||
["কে", ""],
|
||||
["ে", ""],
|
||||
["তে", ""],
|
||||
["র", ""],
|
||||
["রা", ""],
|
||||
["রে", ""],
|
||||
["ের", ""],
|
||||
["েরা", ""],
|
||||
["দের", ""],
|
||||
["দেরকে", ""],
|
||||
["গুলা", ""],
|
||||
["গুলো", ""],
|
||||
["গুলি", ""],
|
||||
["কুল", ""],
|
||||
["গণ", ""],
|
||||
["দল", ""],
|
||||
["পাল", ""],
|
||||
["পুঞ্জ", ""],
|
||||
["মণ্ডলী", ""],
|
||||
["মালা", ""],
|
||||
["রাজি", ""],
|
||||
["বৃন্দ", ""],
|
||||
["বর্গ", ""],
|
||||
["শ্রেণী", ""],
|
||||
["শ্রেনি", ""],
|
||||
["রাশি", ""],
|
||||
["সকল", ""],
|
||||
["মহল", ""],
|
||||
["াবলি", ""],
|
||||
["০", "0"],
|
||||
["১", "1"],
|
||||
["২", "2"],
|
||||
["৩", "3"],
|
||||
["৪", "4"],
|
||||
["৫", "5"],
|
||||
["৬", "6"],
|
||||
["৭", "7"],
|
||||
["৮", "8"],
|
||||
["৯", "9"]
|
||||
],
|
||||
"punct": [
|
||||
["“", "\""],
|
||||
["”", "\""],
|
||||
["‘", "'"],
|
||||
["’", "'"]
|
||||
]
|
||||
}
|
|
@@ -24,7 +24,6 @@ class CatalanDefaults(Language.Defaults):
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = STOP_WORDS
    infixes = TOKENIZER_INFIXES
    resources = {"lemma_lookup": "lemma_lookup.json"}


class Catalan(Language):

File diff suppressed because it is too large
@@ -29,7 +29,6 @@ class DanishDefaults(Language.Defaults):
    suffixes = TOKENIZER_SUFFIXES
    tag_map = TAG_MAP
    stop_words = STOP_WORDS
    resources = {"lemma_lookup": "lemma_lookup.json"}


class Danish(Language):

File diff suppressed because it is too large
@@ -26,7 +26,20 @@ class GermanDefaults(Language.Defaults):
    tag_map = TAG_MAP
    stop_words = STOP_WORDS
    syntax_iterators = SYNTAX_ITERATORS
    resources = {"lemma_lookup": "lemma_lookup.json"}
    single_orth_variants = [
        {"tags": ["$("], "variants": ["…", "..."]},
        {"tags": ["$("], "variants": ["-", "—", "–", "--", "---", "——"]},
    ]
    paired_orth_variants = [
        {
            "tags": ["$("],
            "variants": [("'", "'"), (",", "'"), ("‚", "‘"), ("›", "‹"), ("‹", "›")],
        },
        {
            "tags": ["$("],
            "variants": [("``", "''"), ('"', '"'), ("„", "“"), ("»", "«"), ("«", "»")],
        },
    ]


class German(Language):

File diff suppressed because it is too large
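(Illustration, not part of the diff: single_orth_variants and paired_orth_variants are plain data mapping tags to interchangeable spellings. A hedged sketch of how such a table could be used to swap variants during data augmentation; augment_token is an invented helper, not a spaCy API.)

import random

single_orth_variants = [
    {"tags": ["$("], "variants": ["…", "..."]},
    {"tags": ["$("], "variants": ["-", "—", "–", "--", "---", "——"]},
]

def augment_token(text, tag):
    # If the tag matches and the text is one of the listed variants,
    # return a randomly chosen interchangeable spelling.
    for entry in single_orth_variants:
        if tag in entry["tags"] and text in entry["variants"]:
            return random.choice(entry["variants"])
    return text

print(augment_token("...", "$("))  # prints either "…" or "..."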
@@ -13,8 +13,9 @@ from ..tokenizer_exceptions import BASE_EXCEPTIONS
from .norm_exceptions import NORM_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...lookups import Lookups
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups, get_lemma_tables
from ...util import update_exc, add_lookups


class GreekDefaults(Language.Defaults):

@@ -31,16 +32,12 @@ class GreekDefaults(Language.Defaults):
    suffixes = TOKENIZER_SUFFIXES
    infixes = TOKENIZER_INFIXES
    syntax_iterators = SYNTAX_ITERATORS
    resources = {
        "lemma_index": "lemmatizer/lemma_index.json",
        "lemma_exc": "lemmatizer/lemma_exc.json",
        "lemma_rules": "lemmatizer/lemma_rules.json",
    }

    @classmethod
    def create_lemmatizer(cls, nlp=None, lookups=None):
        lemma_rules, lemma_index, lemma_exc, lemma_lookup = get_lemma_tables(lookups)
        return GreekLemmatizer(lemma_index, lemma_exc, lemma_rules, lemma_lookup)
        if lookups is None:
            lookups = Lookups()
        return GreekLemmatizer(lookups)


class Greek(Language):
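(Illustration, not part of the diff: the hunk above swaps the old get_lemma_tables() unpacking for a Lookups container. A minimal sketch of the new calling convention, assuming the Lookups.add_table API and toy tables in place of the real JSON resources.)

from spacy.lookups import Lookups
from spacy.lang.el import GreekDefaults

lookups = Lookups()
# Toy tables; the real ones come from the lemma_* resources listed above.
lookups.add_table("lemma_index", {"noun": {}})
lookups.add_table("lemma_exc", {"noun": {}})
lookups.add_table("lemma_rules", {"noun": [["ια", "ι"]]})
lemmatizer = GreekDefaults.create_lemmatizer(lookups=lookups)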
40
spacy/lang/el/lemmatizer.py
Normal file

@@ -0,0 +1,40 @@
# coding: utf8
from __future__ import unicode_literals

from ...lemmatizer import Lemmatizer


class GreekLemmatizer(Lemmatizer):
    """
    Greek language lemmatizer applies the default rule based lemmatization
    procedure with some modifications for better Greek language support.

    The first modification is that it checks if the word for lemmatization is
    already a lemma and if yes, it just returns it.
    The second modification is about removing the base forms function which is
    not applicable for Greek language.
    """

    def lemmatize(self, string, index, exceptions, rules):
        string = string.lower()
        forms = []
        if string in index:
            forms.append(string)
            return forms
        forms.extend(exceptions.get(string, []))
        oov_forms = []
        if not forms:
            for old, new in rules:
                if string.endswith(old):
                    form = string[: len(string) - len(old)] + new
                    if not form:
                        pass
                    elif form in index or not form.isalpha():
                        forms.append(form)
                    else:
                        oov_forms.append(form)
        if not forms:
            forms.extend(oov_forms)
        if not forms:
            forms.append(string)
        return list(set(forms))
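(Illustration, not part of the diff: a usage sketch for the lemmatize() method defined above, with tiny stand-in tables instead of the real lemma_index/lemma_exc/lemma_rules data.)

from spacy.lang.el.lemmatizer import GreekLemmatizer
from spacy.lookups import Lookups

index = {"lemma"}                      # strings that are already lemmas
exceptions = {"irregular": ["lemma"]}  # irregular form -> lemma
rules = [["s", ""]]                    # toy suffix rule: strip a trailing "s"

lemmatizer = GreekLemmatizer(Lookups())
print(lemmatizer.lemmatize("lemmas", index, exceptions, rules))     # ['lemma'] via the suffix rule
print(lemmatizer.lemmatize("irregular", index, exceptions, rules))  # ['lemma'] via the exceptions table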
@@ -1,77 +0,0 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ....symbols import NOUN, VERB, ADJ, PUNCT
|
||||
|
||||
|
||||
class GreekLemmatizer(object):
|
||||
"""
|
||||
Greek language lemmatizer applies the default rule based lemmatization
|
||||
procedure with some modifications for better Greek language support.
|
||||
|
||||
The first modification is that it checks if the word for lemmatization is
|
||||
already a lemma and if yes, it just returns it.
|
||||
The second modification is about removing the base forms function which is
|
||||
not applicable for Greek language.
|
||||
"""
|
||||
|
||||
@classmethod
|
||||
def load(cls, path, index=None, exc=None, rules=None, lookup=None):
|
||||
return cls(index, exc, rules, lookup)
|
||||
|
||||
def __init__(self, index=None, exceptions=None, rules=None, lookup=None):
|
||||
self.index = index
|
||||
self.exc = exceptions
|
||||
self.rules = rules
|
||||
self.lookup_table = lookup if lookup is not None else {}
|
||||
|
||||
def __call__(self, string, univ_pos, morphology=None):
|
||||
if not self.rules:
|
||||
return [self.lookup_table.get(string, string)]
|
||||
if univ_pos in (NOUN, "NOUN", "noun"):
|
||||
univ_pos = "noun"
|
||||
elif univ_pos in (VERB, "VERB", "verb"):
|
||||
univ_pos = "verb"
|
||||
elif univ_pos in (ADJ, "ADJ", "adj"):
|
||||
univ_pos = "adj"
|
||||
elif univ_pos in (PUNCT, "PUNCT", "punct"):
|
||||
univ_pos = "punct"
|
||||
else:
|
||||
return list(set([string.lower()]))
|
||||
lemmas = lemmatize(
|
||||
string,
|
||||
self.index.get(univ_pos, {}),
|
||||
self.exc.get(univ_pos, {}),
|
||||
self.rules.get(univ_pos, []),
|
||||
)
|
||||
return lemmas
|
||||
|
||||
def lookup(self, string):
|
||||
if string in self.lookup_table:
|
||||
return self.lookup_table[string]
|
||||
return string
|
||||
|
||||
|
||||
def lemmatize(string, index, exceptions, rules):
|
||||
string = string.lower()
|
||||
forms = []
|
||||
if string in index:
|
||||
forms.append(string)
|
||||
return forms
|
||||
forms.extend(exceptions.get(string, []))
|
||||
oov_forms = []
|
||||
if not forms:
|
||||
for old, new in rules:
|
||||
if string.endswith(old):
|
||||
form = string[: len(string) - len(old)] + new
|
||||
if not form:
|
||||
pass
|
||||
elif form in index or not form.isalpha():
|
||||
forms.append(form)
|
||||
else:
|
||||
oov_forms.append(form)
|
||||
if not forms:
|
||||
forms.extend(oov_forms)
|
||||
if not forms:
|
||||
forms.append(string)
|
||||
return list(set(forms))
|
|
@@ -1,236 +0,0 @@
|
|||
{
|
||||
"adj": {
|
||||
"χειρότερος": ["κακός"],
|
||||
"χειρότερη": ["κακός"],
|
||||
"χειρότερης": ["κακός"],
|
||||
"χειρότερο": ["κακός"],
|
||||
"χειρότεροι": ["κακός"],
|
||||
"χειρότερων": ["κακός"],
|
||||
"χειρότερου": ["κακός"],
|
||||
"βέλτιστος": ["καλός"],
|
||||
"βέλτιστη": ["καλός"],
|
||||
"βέλτιστης": ["καλός"],
|
||||
"βέλτιστο": ["καλός"],
|
||||
"βέλτιστοι": ["καλός"],
|
||||
"βέλτιστων": ["καλός"],
|
||||
"βέλτιστου": ["καλός"],
|
||||
"ελάχιστος": ["λίγος"],
|
||||
"ελάχιστα": ["λίγος"],
|
||||
"ελάχιστοι": ["λίγος"],
|
||||
"ελάχιστων": ["λίγος"],
|
||||
"ελάχιστη": ["λίγος"],
|
||||
"ελάχιστης": ["λίγος"],
|
||||
"ελάχιστο": ["λίγος"],
|
||||
"ελάχιστου": ["λίγος"],
|
||||
"πλείστος": ["πολύς"],
|
||||
"πλείστου": ["πολύς"],
|
||||
"πλείστων": ["πολύς"],
|
||||
"πολλή": ["πολύ"],
|
||||
"πολύς": ["πολύ"],
|
||||
"πολλύ": ["πολύ"],
|
||||
"πολλύς": ["πολύ"]
|
||||
},
|
||||
"noun": {
|
||||
"λευτεριά": ["ελευθερία"],
|
||||
"καφέδες": ["καφές"],
|
||||
"ποιήματα": ["ποίημα"]
|
||||
},
|
||||
"det": {
|
||||
"του": ["το"],
|
||||
"των": ["το"],
|
||||
"τους": ["το"],
|
||||
"τις": ["τη"],
|
||||
"τα": ["το"],
|
||||
"οι": ["ο", "η"]
|
||||
},
|
||||
"verb": {
|
||||
"είσαι": ["είμαι"],
|
||||
"είναι": ["είμαι"],
|
||||
"είμαστε": ["είμαι"],
|
||||
"είστε": ["είμαι"],
|
||||
"είσαστε": ["είμαι"],
|
||||
"ήμουν": ["είμαι"],
|
||||
"ήσουν": ["είμαι"],
|
||||
"ήταν": ["είμαι"],
|
||||
"ήμαστε": ["είμαι"],
|
||||
"ήμασταν": ["είμαι"],
|
||||
"είπα": ["λέω"],
|
||||
"είπες": ["λέω"],
|
||||
"είπε": ["λέω"],
|
||||
"είπαμε": ["λέω"],
|
||||
"είπατε": ["λέω"],
|
||||
"είπαν": ["λέω"],
|
||||
"είπανε": ["λέω"],
|
||||
"πει": ["λέω"],
|
||||
"πω": ["λέω"],
|
||||
"πάω": ["πηγαίνω"],
|
||||
"πάς": ["πηγαίνω"],
|
||||
"πας": ["πηγαίνω"],
|
||||
"πάει": ["πηγαίνω"],
|
||||
"πάμε": ["πηγαίνω"],
|
||||
"πάτε": ["πηγαίνω"],
|
||||
"πάνε": ["πηγαίνω"],
|
||||
"πήγα": ["πηγαίνω"],
|
||||
"πήγες": ["πηγαίνω"],
|
||||
"πήγε": ["πηγαίνω"],
|
||||
"πήγαμε": ["πηγαίνω"],
|
||||
"πήγατε": ["πηγαίνω"],
|
||||
"πήγαν": ["πηγαίνω"],
|
||||
"πήγανε": ["πηγαίνω"],
|
||||
"έπαιζα": ["παίζω"],
|
||||
"έπαιζες": ["παίζω"],
|
||||
"έπαιζε": ["παίζω"],
|
||||
"έπαιζαν": ["παίζω,"],
|
||||
"έπαιξα": ["παίζω"],
|
||||
"έπαιξες": ["παίζω"],
|
||||
"έπαιξε": ["παίζω"],
|
||||
"έτρωγα": ["τρώω"],
|
||||
"έτρωγε": ["τρώω"],
|
||||
"είχα": ["έχω"],
|
||||
"είχες": ["έχω"],
|
||||
"είχε": ["έχω"],
|
||||
"είχαμε": ["έχω"],
|
||||
"είχατε": ["έχω"],
|
||||
"είχαν": ["έχω"],
|
||||
"είχανε": ["έχω"],
|
||||
"έπαιρνα": ["παίρνω"],
|
||||
"έπαιρνες": ["παίρνω"],
|
||||
"έπαιρνε": ["παίρνω"],
|
||||
"έπαιρναν": ["παίρνω"],
|
||||
"εδίνα": ["δίνω"],
|
||||
"εδίνες": ["δίνω"],
|
||||
"εδίνε": ["δίνω"],
|
||||
"εδίναν": ["δίνω"],
|
||||
"έκανα": ["κάνω"],
|
||||
"έκανες": ["κάνω"],
|
||||
"έκανε": ["κάνω"],
|
||||
"έκαναν": ["κάνω"],
|
||||
"ήθελα": ["θέλω"],
|
||||
"ήθελες": ["θέλω"],
|
||||
"ήθελε": ["θέλω"],
|
||||
"ήθελαν": ["θέλω"],
|
||||
"έβλεπα": ["βλέπω"],
|
||||
"έβλεπες": ["βλέπω"],
|
||||
"έβλεπε": ["βλέπω"],
|
||||
"έβλεπαν": ["βλέπω"],
|
||||
"είδα": ["βλέπω"],
|
||||
"είδες": ["βλέπω"],
|
||||
"είδε": ["βλέπω"],
|
||||
"είδαμε": ["βλέπω"],
|
||||
"είδατε": ["βλέπω"],
|
||||
"είδαν": ["βλέπω"],
|
||||
"έφερνα": ["φέρνω"],
|
||||
"έφερνες": ["φέρνω"],
|
||||
"έφερνε": ["φέρνω"],
|
||||
"έφερναν": ["φέρνω"],
|
||||
"έφερα": ["φέρω"],
|
||||
"έφερες": ["φέρω"],
|
||||
"έφερε": ["φέρω"],
|
||||
"έφεραν": ["φέρω"],
|
||||
"έλαβα": ["λαμβάνω"],
|
||||
"έλαβες": ["λαμβάνω"],
|
||||
"έλαβε": ["λαμβάνω"],
|
||||
"έλαβαν": ["λαμβάνω"],
|
||||
"έβρισκα": ["βρίσκω"],
|
||||
"έβρισκες": ["βρίσκω"],
|
||||
"έβρισκε": ["βρίσκω"],
|
||||
"έβρισκαν": ["βρίσκω"],
|
||||
"ήξερα": ["ξέρω"],
|
||||
"ήξερες": ["ξέρω"],
|
||||
"ήξερε": ["ξέρω"],
|
||||
"ήξεραν": ["ξέρω"],
|
||||
"ανέφερα": ["αναφέρω"],
|
||||
"ανέφερες": ["αναφέρω"],
|
||||
"ανέφερε": ["αναφέρω"],
|
||||
"ανέφεραν": ["αναφέρω"],
|
||||
"έβαζα": ["βάζω"],
|
||||
"έβαζες": ["βάζω"],
|
||||
"έβαζε": ["βάζω"],
|
||||
"έβαζαν": ["βάζω"],
|
||||
"έμεινα": ["μένω"],
|
||||
"έμεινες": ["μένω"],
|
||||
"έμεινε": ["μένω"],
|
||||
"έμειναν": ["μένω"],
|
||||
"έβγαζα": ["βγάζω"],
|
||||
"έβγαζες": ["βγάζω"],
|
||||
"έβγαζε": ["βγάζω"],
|
||||
"έβγαζαν": ["βγάζω"],
|
||||
"έμπαινα": ["μπαίνω"],
|
||||
"έμπαινες": ["μπαίνω"],
|
||||
"έμπαινε": ["μπαίνω"],
|
||||
"έμπαιναν": ["μπαίνω"],
|
||||
"βγήκα": ["βγαίνω"],
|
||||
"βγήκες": ["βγαίνω"],
|
||||
"βγήκε": ["βγαίνω"],
|
||||
"βγήκαμε": ["βγαίνω"],
|
||||
"βγήκατε": ["βγαίνω"],
|
||||
"βγήκαν": ["βγαίνω"],
|
||||
"έπεφτα": ["πέφτω"],
|
||||
"έπεφτες": ["πέφτω"],
|
||||
"έπεφτε": ["πέφτω"],
|
||||
"έπεφταν": ["πέφτω"],
|
||||
"έπεσα": ["πέφτω"],
|
||||
"έπεσες": ["πέφτω"],
|
||||
"έπεσε": ["πέφτω"],
|
||||
"έπεσαν": ["πέφτω"],
|
||||
"έστειλα": ["στέλνω"],
|
||||
"έστειλες": ["στέλνω"],
|
||||
"έστειλε": ["στέλνω"],
|
||||
"έστειλαν": ["στέλνω"],
|
||||
"έφυγα": ["φεύγω"],
|
||||
"έφυγες": ["φεύγω"],
|
||||
"έφυγαν": ["φεύγω"],
|
||||
"έμαθα": ["μαθαίνω"],
|
||||
"έμαθες": ["μαθαίνω"],
|
||||
"έμαθε": ["μαθαίνω"],
|
||||
"έμαθαν": ["μαθαίνω"],
|
||||
"υπέβαλλα": ["υποβάλλω"],
|
||||
"υπέβαλλες": ["υποβάλλω"],
|
||||
"υπέβαλλε": ["υποβάλλω"],
|
||||
"υπέβαλλαν": ["υποβάλλω"],
|
||||
"έπινα": ["πίνω"],
|
||||
"έπινες": ["πίνω"],
|
||||
"έπινε": ["πίνω"],
|
||||
"έπιναν": ["πίνω"],
|
||||
"ήπια": ["πίνω"],
|
||||
"ήπιες": ["πίνω"],
|
||||
"ήπιε": ["πίνω"],
|
||||
"ήπιαμε": ["πίνω"],
|
||||
"ήπιατε": ["πίνω"],
|
||||
"ήπιαν": ["πίνω"],
|
||||
"ετύχα": ["τυχαίνω"],
|
||||
"ετύχες": ["τυχαίνω"],
|
||||
"ετύχε": ["τυχαίνω"],
|
||||
"ετύχαν": ["τυχαίνω"],
|
||||
"φάω": ["τρώω"],
|
||||
"φάς": ["τρώω"],
|
||||
"φάει": ["τρώω"],
|
||||
"φάμε": ["τρώω"],
|
||||
"φάτε": ["τρώω"],
|
||||
"φάνε": ["τρώω"],
|
||||
"φάν": ["τρώω"],
|
||||
"έτρωγες": ["τρώω"],
|
||||
"τρώγαμε": ["τρώω"],
|
||||
"τρώγατε": ["τρώω"],
|
||||
"τρώγανε": ["τρώω"],
|
||||
"τρώγαν": ["τρώω"],
|
||||
"πέρασα": ["περνώ"],
|
||||
"πέρασες": ["περνώ"],
|
||||
"πέρασε": ["περνώ"],
|
||||
"πέρασαμε": ["περνώ"],
|
||||
"πέρασατε": ["περνώ"],
|
||||
"πέρασαν": ["περνώ"],
|
||||
"έγδαρα": ["γδάρω"],
|
||||
"έγδαρες": ["γδάρω"],
|
||||
"έγδαρε": ["γδάρω"],
|
||||
"έγδαραν": ["γδάρω"],
|
||||
"έβγαλα": ["βγάλω"],
|
||||
"έβγαλες": ["βγάλω"],
|
||||
"έβγαλε": ["βγάλω"],
|
||||
"έβγαλαν": ["βγάλω"],
|
||||
"έφθασα": ["φτάνω"],
|
||||
"έφθασες": ["φτάνω"],
|
||||
"έφθασε": ["φτάνω"],
|
||||
"έφθασαν": ["φτάνω"]
|
||||
}
|
||||
}
|
File diff suppressed because one or more lines are too long
|
@@ -1,139 +0,0 @@
|
|||
{
|
||||
"adj": [
|
||||
["οί", "ός"],
|
||||
["ών", "ός"],
|
||||
["ού", "ός"],
|
||||
["ή", "ός"],
|
||||
["ής", "ός"],
|
||||
["ές", "ός"],
|
||||
["οι", "ος"],
|
||||
["ων", "ος"],
|
||||
["ου", "ος"],
|
||||
["ο", "ος"],
|
||||
["α", "ος"],
|
||||
["ώδη", "ώδες"],
|
||||
["ύτερη", "ός"],
|
||||
["ύτερης", "ός"],
|
||||
["ύτερων", "ός"],
|
||||
["ύτερος", "ός"],
|
||||
["ύτερου", "ός"]
|
||||
],
|
||||
"noun": [
|
||||
["ιού", "ί"],
|
||||
["ιά", "ί"],
|
||||
["ιών", "ί"],
|
||||
["ηριού", "ήρι"],
|
||||
["ια", "ι"],
|
||||
["ηριών", "ήρι"],
|
||||
["ας", "α"],
|
||||
["ες", "α"],
|
||||
["ων", "α"],
|
||||
["άς", "ά"],
|
||||
["ές", "ά"],
|
||||
["ών", "ά"],
|
||||
["ής", "ή"],
|
||||
["ές", "ή"],
|
||||
["ών", "ή"],
|
||||
["ές", "ής"],
|
||||
["ών", "ής"],
|
||||
["ου", "ο"],
|
||||
["α", "ο"],
|
||||
["ων", "ο"],
|
||||
["ητήματος", "ήτημα"],
|
||||
["ητήματα", "ήτημα"],
|
||||
["ητημάτων", "ήτημα"],
|
||||
["τος", ""],
|
||||
["τα", "α"],
|
||||
["ομάτων", "όμα"],
|
||||
["ού", "ός"],
|
||||
["οί", "ός"],
|
||||
["ών", "ός"],
|
||||
["ς", ""],
|
||||
["ες", "α"],
|
||||
["ιών", "ία"],
|
||||
["α", "ας"],
|
||||
["δων", ""]
|
||||
],
|
||||
"verb": [
|
||||
["εις", "ω"],
|
||||
["ει", "ω"],
|
||||
["ουμε", "ω"],
|
||||
["ετε", "ω"],
|
||||
["ουνε", "ω"],
|
||||
["ουν", "ω"],
|
||||
["είς", "ώ"],
|
||||
["εί", "ώ"],
|
||||
["ούν", "ώ"],
|
||||
["εσαι", "ομαι"],
|
||||
["εται", "ομαι"],
|
||||
["ανόμαστε", "άνομαι"],
|
||||
["εστε", "ομαι"],
|
||||
["ονται", "ομαι"],
|
||||
["άς", "ώ"],
|
||||
["άει", "ώ"],
|
||||
["άμε", "ώ"],
|
||||
["άτε", "ώ"],
|
||||
["άνε", "ώ"],
|
||||
["άν", "ώ"],
|
||||
["άω", "ώ"],
|
||||
["ώ", "άω"],
|
||||
["ιζόμουν", "ίζομαι"],
|
||||
["ιζόσουν", "ίζομαι"],
|
||||
["ιζόταν", "ίζομαι"],
|
||||
["ιζόμασταν", "ίζομαι"],
|
||||
["ιζόσασταν", "ίζομαι"],
|
||||
["ονταν", "ομαι"],
|
||||
["όμουν", "άμαι"],
|
||||
["όσουν", "άμαι"],
|
||||
["όταν", "άμαι"],
|
||||
["όμασταν", "άμαι"],
|
||||
["όσασταν", "άμαι"],
|
||||
["όντουσταν", "άμαι"],
|
||||
["ούσα", "ώ"],
|
||||
["ούσες", "ώ"],
|
||||
["ούσε", "ώ"],
|
||||
["ούσαμε", "ώ"],
|
||||
["ούσατε", "ώ"],
|
||||
["ούσαν", "ώ"],
|
||||
["ούσανε", "ώ"],
|
||||
["λαμε", "ζω"],
|
||||
["λατε", "ζω"],
|
||||
["ήρα", "άρω"],
|
||||
["ήρες", "άρω"],
|
||||
["ήρε", "άρω"],
|
||||
["ήραμε", "άρω"],
|
||||
["ήρατε", "άρω"],
|
||||
["ήρα", "άρω"],
|
||||
["ένησα", "ενώ"],
|
||||
["ένησες", "ενώ"],
|
||||
["ένησε", "ενώ"],
|
||||
["ενήσαμε", "ενώ"],
|
||||
["ένησατε", "ενώ"],
|
||||
["ένησαν", "ενώ"],
|
||||
["όνεσα", "ονώ"],
|
||||
["όνεσες", "ονώ"],
|
||||
["όνεσε", "ονώ"],
|
||||
["έσαμε", "ώ"],
|
||||
["έσατε", "ώ"],
|
||||
["ισα", "ομαι"],
|
||||
["ισες", "ομαι"],
|
||||
["ισε", "ομαι"],
|
||||
["αθίσαμε", "άθομαι"],
|
||||
["αθίσατε", "άθομαι"],
|
||||
["ισαν", "ομαι"],
|
||||
["άπα", "απώ"],
|
||||
["ά", "ώ"],
|
||||
["οντας", "ω"],
|
||||
["ξω", "ζω"],
|
||||
["ξεις", "ζω"],
|
||||
["ξουμε", "ζω"],
|
||||
["ξετε", "ζω"],
|
||||
["ξουν", "ζω"]
|
||||
],
|
||||
"punct": [
|
||||
["“", "\""],
|
||||
["”", "\""],
|
||||
["‘", "'"],
|
||||
["’", "'"]
|
||||
]
|
||||
}
|
|
@@ -32,12 +32,14 @@ class EnglishDefaults(Language.Defaults):
    stop_words = STOP_WORDS
    morph_rules = MORPH_RULES
    syntax_iterators = SYNTAX_ITERATORS
    resources = {
        "lemma_lookup": "lemmatizer/lemma_lookup.json",
        "lemma_rules": "lemmatizer/lemma_rules.json",
        "lemma_index": "lemmatizer/lemma_index.json",
        "lemma_exc": "lemmatizer/lemma_exc.json",
    }
    single_orth_variants = [
        {"tags": ["NFP"], "variants": ["…", "..."]},
        {"tags": [":"], "variants": ["-", "—", "–", "--", "---", "——"]},
    ]
    paired_orth_variants = [
        {"tags": ["``", "''"], "variants": [("'", "'"), ("‘", "’")]},
        {"tags": ["``", "''"], "variants": [('"', '"'), ("“", "”")]},
    ]


class English(Language):
@@ -1,31 +0,0 @@
|
|||
WordNet Release 3.0
|
||||
|
||||
This software and database is being provided to you, the LICENSEE, by
|
||||
Princeton University under the following license. By obtaining, using
|
||||
and/or copying this software and database, you agree that you have
|
||||
read, understood, and will comply with these terms and conditions.:
|
||||
|
||||
Permission to use, copy, modify and distribute this software and
|
||||
database and its documentation for any purpose and without fee or
|
||||
royalty is hereby granted, provided that you agree to comply with
|
||||
the following copyright notice and statements, including the disclaimer,
|
||||
and that the same appear on ALL copies of the software, database and
|
||||
documentation, including modifications that you make for internal
|
||||
use or for distribution.
|
||||
|
||||
WordNet 3.0 Copyright 2006 by Princeton University. All rights reserved.
|
||||
|
||||
THIS SOFTWARE AND DATABASE IS PROVIDED "AS IS" AND PRINCETON
|
||||
UNIVERSITY MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR
|
||||
IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, PRINCETON
|
||||
UNIVERSITY MAKES NO REPRESENTATIONS OR WARRANTIES OF MERCHANT-
|
||||
ABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE
|
||||
OF THE LICENSED SOFTWARE, DATABASE OR DOCUMENTATION WILL NOT
|
||||
INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR
|
||||
OTHER RIGHTS.
|
||||
|
||||
The name of Princeton University or Princeton may not be used in
|
||||
advertising or publicity pertaining to distribution of the software
|
||||
and/or database. Title to copyright in this software, database and
|
||||
any associated documentation shall at all times remain with
|
||||
Princeton University and LICENSEE agrees to preserve same.
|
File diff suppressed because it is too large
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
@@ -1,35 +0,0 @@
|
|||
{
|
||||
"adj": [
|
||||
["er", ""],
|
||||
["est", ""],
|
||||
["er", "e"],
|
||||
["est", "e"]
|
||||
],
|
||||
"noun": [
|
||||
["s", ""],
|
||||
["ses", "s"],
|
||||
["ves", "f"],
|
||||
["xes", "x"],
|
||||
["zes", "z"],
|
||||
["ches", "ch"],
|
||||
["shes", "sh"],
|
||||
["men", "man"],
|
||||
["ies", "y"]
|
||||
],
|
||||
"verb": [
|
||||
["s", ""],
|
||||
["ies", "y"],
|
||||
["es", "e"],
|
||||
["es", ""],
|
||||
["ed", "e"],
|
||||
["ed", ""],
|
||||
["ing", "e"],
|
||||
["ing", ""]
|
||||
],
|
||||
"punct": [
|
||||
["“", "\""],
|
||||
["”", "\""],
|
||||
["‘", "'"],
|
||||
["’", "'"]
|
||||
]
|
||||
}
|
|
@@ -25,7 +25,6 @@ class SpanishDefaults(Language.Defaults):
    tag_map = TAG_MAP
    stop_words = STOP_WORDS
    syntax_iterators = SYNTAX_ITERATORS
    resources = {"lemma_lookup": "lemma_lookup.json"}


class Spanish(Language):

File diff suppressed because it is too large
@@ -24,12 +24,6 @@ class PersianDefaults(Language.Defaults):
    tag_map = TAG_MAP
    suffixes = TOKENIZER_SUFFIXES
    writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}
    # extracted from Mojgan Seraji's Persian Universal Dependencies Corpus
    resources = {
        "lemma_rules": "lemmatizer/lemma_rules.json",
        "lemma_index": "lemmatizer/lemma_index.json",
        "lemma_exc": "lemmatizer/lemma_exc.json",
    }


class Persian(Language):

File diff suppressed because it is too large
File diff suppressed because one or more lines are too long
@@ -1,41 +0,0 @@
|
|||
{
|
||||
"adj": [
|
||||
["ین", ""],
|
||||
["ترین", ""],
|
||||
["ترین", ""],
|
||||
["تر", ""],
|
||||
["تر", ""],
|
||||
["ای", ""]
|
||||
],
|
||||
"noun": [
|
||||
["ایان", "ا"],
|
||||
["ویان", "و"],
|
||||
["ایانی", "ا"],
|
||||
["ویانی", "و"],
|
||||
["گان", "ه"],
|
||||
["گانی", "ه"],
|
||||
["گان", ""],
|
||||
["گانی", ""],
|
||||
["ان", ""],
|
||||
["انی", ""],
|
||||
["ات", ""],
|
||||
["ات", "ه"],
|
||||
["ات", "ت"],
|
||||
["اتی", ""],
|
||||
["اتی", "ه"],
|
||||
["اتی", "ت"],
|
||||
["ها", ""],
|
||||
["ها", ""],
|
||||
["های", ""],
|
||||
["های", ""],
|
||||
["هایی", ""],
|
||||
["هایی", ""]
|
||||
],
|
||||
"verb": [],
|
||||
"punct": [
|
||||
["“", "\""],
|
||||
["”", "\""],
|
||||
["‘", "'"],
|
||||
["’", "'"]
|
||||
]
|
||||
}
|
|
@@ -12,8 +12,9 @@ from .syntax_iterators import SYNTAX_ITERATORS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...lookups import Lookups
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups, get_lemma_tables
from ...util import update_exc, add_lookups


class FrenchDefaults(Language.Defaults):

@@ -30,17 +31,12 @@ class FrenchDefaults(Language.Defaults):
    suffixes = TOKENIZER_SUFFIXES
    token_match = TOKEN_MATCH
    syntax_iterators = SYNTAX_ITERATORS
    resources = {
        "lemma_rules": "lemmatizer/lemma_rules.json",
        "lemma_index": "lemmatizer/lemma_index.json",
        "lemma_exc": "lemmatizer/lemma_exc.json",
        "lemma_lookup": "lemmatizer/lemma_lookup.json",
    }

    @classmethod
    def create_lemmatizer(cls, nlp=None, lookups=None):
        lemma_rules, lemma_index, lemma_exc, lemma_lookup = get_lemma_tables(lookups)
        return FrenchLemmatizer(lemma_index, lemma_exc, lemma_rules, lemma_lookup)
        if lookups is None:
            lookups = Lookups()
        return FrenchLemmatizer(lookups)


class French(Language):
@@ -1,12 +1,13 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ....symbols import POS, NOUN, VERB, ADJ, ADV, PRON, DET, AUX, PUNCT, ADP
|
||||
from ....symbols import SCONJ, CCONJ
|
||||
from ....symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos
|
||||
from ...lemmatizer import Lemmatizer
|
||||
from ...symbols import POS, NOUN, VERB, ADJ, ADV, PRON, DET, AUX, PUNCT, ADP
|
||||
from ...symbols import SCONJ, CCONJ
|
||||
from ...symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos
|
||||
|
||||
|
||||
class FrenchLemmatizer(object):
|
||||
class FrenchLemmatizer(Lemmatizer):
|
||||
"""
|
||||
French language lemmatizer applies the default rule based lemmatization
|
||||
procedure with some modifications for better French language support.
|
||||
|
@@ -16,19 +17,10 @@ class FrenchLemmatizer(object):
|
|||
the lookup table.
|
||||
"""
|
||||
|
||||
@classmethod
|
||||
def load(cls, path, index=None, exc=None, rules=None, lookup=None):
|
||||
return cls(index, exc, rules, lookup)
|
||||
|
||||
def __init__(self, index=None, exceptions=None, rules=None, lookup=None):
|
||||
self.index = index
|
||||
self.exc = exceptions
|
||||
self.rules = rules
|
||||
self.lookup_table = lookup if lookup is not None else {}
|
||||
|
||||
def __call__(self, string, univ_pos, morphology=None):
|
||||
if not self.rules:
|
||||
return [self.lookup_table.get(string, string)]
|
||||
lookup_table = self.lookups.get_table("lemma_lookup", {})
|
||||
if "lemma_rules" not in self.lookups:
|
||||
return [lookup_table.get(string, string)]
|
||||
if univ_pos in (NOUN, "NOUN", "noun"):
|
||||
univ_pos = "noun"
|
||||
elif univ_pos in (VERB, "VERB", "verb"):
|
||||
|
@@ -56,12 +48,14 @@ class FrenchLemmatizer(object):
|
|||
# See Issue #435 for example of where this logic is requied.
|
||||
if self.is_base_form(univ_pos, morphology):
|
||||
return list(set([string.lower()]))
|
||||
lemmas = lemmatize(
|
||||
index_table = self.lookups.get_table("lemma_index", {})
|
||||
exc_table = self.lookups.get_table("lemma_exc", {})
|
||||
rules_table = self.lookups.get_table("lemma_rules", {})
|
||||
lemmas = self.lemmatize(
|
||||
string,
|
||||
self.index.get(univ_pos, {}),
|
||||
self.exc.get(univ_pos, {}),
|
||||
self.rules.get(univ_pos, []),
|
||||
self.lookup_table,
|
||||
index_table.get(univ_pos, {}),
|
||||
exc_table.get(univ_pos, {}),
|
||||
rules_table.get(univ_pos, []),
|
||||
)
|
||||
return lemmas
|
||||
|
||||
|
@@ -114,34 +108,35 @@ class FrenchLemmatizer(object):
|
|||
def punct(self, string, morphology=None):
|
||||
return self(string, "punct", morphology)
|
||||
|
||||
def lookup(self, string):
|
||||
if string in self.lookup_table:
|
||||
return self.lookup_table[string][0]
|
||||
def lookup(self, string, orth=None):
|
||||
lookup_table = self.lookups.get_table("lemma_lookup", {})
|
||||
if orth is not None and orth in lookup_table:
|
||||
return lookup_table[orth][0]
|
||||
return string
|
||||
|
||||
|
||||
def lemmatize(string, index, exceptions, rules, lookup):
|
||||
string = string.lower()
|
||||
forms = []
|
||||
if string in index:
|
||||
forms.append(string)
|
||||
return forms
|
||||
forms.extend(exceptions.get(string, []))
|
||||
oov_forms = []
|
||||
if not forms:
|
||||
for old, new in rules:
|
||||
if string.endswith(old):
|
||||
form = string[: len(string) - len(old)] + new
|
||||
if not form:
|
||||
pass
|
||||
elif form in index or not form.isalpha():
|
||||
forms.append(form)
|
||||
else:
|
||||
oov_forms.append(form)
|
||||
if not forms:
|
||||
forms.extend(oov_forms)
|
||||
if not forms and string in lookup.keys():
|
||||
forms.append(lookup[string][0])
|
||||
if not forms:
|
||||
forms.append(string)
|
||||
return list(set(forms))
|
||||
def lemmatize(self, string, index, exceptions, rules):
|
||||
lookup_table = self.lookups.get_table("lemma_lookup", {})
|
||||
string = string.lower()
|
||||
forms = []
|
||||
if string in index:
|
||||
forms.append(string)
|
||||
return forms
|
||||
forms.extend(exceptions.get(string, []))
|
||||
oov_forms = []
|
||||
if not forms:
|
||||
for old, new in rules:
|
||||
if string.endswith(old):
|
||||
form = string[: len(string) - len(old)] + new
|
||||
if not form:
|
||||
pass
|
||||
elif form in index or not form.isalpha():
|
||||
forms.append(form)
|
||||
else:
|
||||
oov_forms.append(form)
|
||||
if not forms:
|
||||
forms.extend(oov_forms)
|
||||
if not forms and string in lookup_table.keys():
|
||||
forms.append(lookup_table[string][0])
|
||||
if not forms:
|
||||
forms.append(string)
|
||||
return list(set(forms))
|
File diff suppressed because it is too large
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
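(Illustration, not part of the diff: the FrenchLemmatizer hunks above move the tables into self.lookups and give lookup() an optional orth argument. A hedged sketch of the new flow with an invented lemma_lookup entry, assuming the Lookups.add_table API.)

from spacy.lang.fr.lemmatizer import FrenchLemmatizer
from spacy.lookups import Lookups

lookups = Lookups()
# Values are lists, matching lookup_table[orth][0] in the code above.
lookups.add_table("lemma_lookup", {"avions": ["avoir"]})

lemmatizer = FrenchLemmatizer(lookups)
print(lemmatizer.lookup("avions", orth="avions"))  # "avoir"
print(lemmatizer.lookup("avions"))                 # orth is None, so the string comes back unchanged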
@@ -1,126 +0,0 @@
|
|||
{
|
||||
"adj": [
|
||||
["a", "a"],
|
||||
["aux", "al"],
|
||||
["c", "c"],
|
||||
["d", "d"],
|
||||
["e", ""],
|
||||
["é", "é"],
|
||||
["eux", "eux"],
|
||||
["f", "f"],
|
||||
["i", "i"],
|
||||
["ï", "ï"],
|
||||
["l", "l"],
|
||||
["m", "m"],
|
||||
["n", "n"],
|
||||
["o", "o"],
|
||||
["p", "p"],
|
||||
["r", "r"],
|
||||
["s", ""],
|
||||
["t", "t"],
|
||||
["u", "u"],
|
||||
["y", "y"]
|
||||
],
|
||||
"noun": [
|
||||
["a", "a"],
|
||||
["à", "à"],
|
||||
["â", "â"],
|
||||
["b", "b"],
|
||||
["c", "c"],
|
||||
["ç", "ç"],
|
||||
["d", "d"],
|
||||
["e", "e"],
|
||||
["é", "é"],
|
||||
["è", "è"],
|
||||
["ê", "ê"],
|
||||
["ë", "ë"],
|
||||
["f", "f"],
|
||||
["g", "g"],
|
||||
["h", "h"],
|
||||
["i", "i"],
|
||||
["î", "î"],
|
||||
["ï", "ï"],
|
||||
["j", "j"],
|
||||
["k", "k"],
|
||||
["l", "l"],
|
||||
["m", "m"],
|
||||
["n", "n"],
|
||||
["o", "o"],
|
||||
["ô", "ö"],
|
||||
["ö", "ö"],
|
||||
["p", "p"],
|
||||
["q", "q"],
|
||||
["r", "r"],
|
||||
["t", "t"],
|
||||
["u", "u"],
|
||||
["û", "û"],
|
||||
["v", "v"],
|
||||
["w", "w"],
|
||||
["y", "y"],
|
||||
["z", "z"],
|
||||
["s", ""],
|
||||
["x", ""],
|
||||
["nt(e", "nt"],
|
||||
["nt(e)", "nt"],
|
||||
["al(e", "ale"],
|
||||
["é(", "é"],
|
||||
["é(e", "é"],
|
||||
["é.e", "é"],
|
||||
["el(le", "el"],
|
||||
["eurs(rices", "eur"],
|
||||
["eur(rice", "eur"],
|
||||
["eux(se", "eux"],
|
||||
["ial(e", "ial"],
|
||||
["er(ère", "er"],
|
||||
["eur(se", "eur"],
|
||||
["teur(trice", "teur"],
|
||||
["teurs(trices", "teur"]
|
||||
],
|
||||
"verb": [
|
||||
["é", "er"],
|
||||
["és", "er"],
|
||||
["ée", "er"],
|
||||
["ées", "er"],
|
||||
["é", "er"],
|
||||
["es", "er"],
|
||||
["ons", "er"],
|
||||
["ez", "er"],
|
||||
["ent", "er"],
|
||||
["ais", "er"],
|
||||
["ait", "er"],
|
||||
["ions", "er"],
|
||||
["iez", "er"],
|
||||
["aient", "er"],
|
||||
["ai", "er"],
|
||||
["as", "er"],
|
||||
["a", "er"],
|
||||
["âmes", "er"],
|
||||
["âtes", "er"],
|
||||
["èrent", "er"],
|
||||
["erai", "er"],
|
||||
["eras", "er"],
|
||||
["era", "er"],
|
||||
["erons", "er"],
|
||||
["erez", "er"],
|
||||
["eront", "er"],
|
||||
["erais", "er"],
|
||||
["erait", "er"],
|
||||
["erions", "er"],
|
||||
["eriez", "er"],
|
||||
["eraient", "er"],
|
||||
["asse", "er"],
|
||||
["asses", "er"],
|
||||
["ât", "er"],
|
||||
["assions", "er"],
|
||||
["assiez", "er"],
|
||||
["assent", "er"],
|
||||
["ant", "er"],
|
||||
["ante", "er"],
|
||||
["ants", "er"],
|
||||
["antes", "er"],
|
||||
["u(er", "u"],
|
||||
["és(ées", "er"],
|
||||
["é()e", "er"],
|
||||
["é()", "er"]
|
||||
]
|
||||
}
|
|
@@ -11,12 +11,12 @@ Example sentences to test spaCy and its language models.


sentences = [
    "एप्पल 1 अरब डॉलर के लिए यू.के. स्टार्टअप खरीदने पर विचार कर रहा है",
    "स्वायत्त कार निर्माताओं की ओर बीमा दायित्व रखती है",
    "सैन फ्रांसिस्को फुटवे डिलीवरी रोबोटों पर प्रतिबंध लगाने का विचार कर रहा है",
    "लंदन यूनाइटेड किंगडम का बड़ा शहर है।",
    "आप कहाँ हैं?",
    "एप्पल 1 अरब डॉलर के लिए यू.के. स्टार्टअप खरीदने पर विचार कर रहा है।",
    "स्वायत्त कारें निर्माताओं की ओर बीमा दायित्व रखतीं हैं।",
    "सैन फ्रांसिस्को फुटपाथ वितरण रोबोटों पर प्रतिबंध लगाने का विचार कर रहा है।",
    "लंदन यूनाइटेड किंगडम का विशाल शहर है।",
    "आप कहाँ हो?",
    "फ्रांस के राष्ट्रपति कौन हैं?",
    "संयुक्त राज्य की राजधानी क्या है?",
    "बराक ओबामा का जन्म हुआ था?",
    "संयुक्त राज्यों की राजधानी क्या है?",
    "बराक ओबामा का जन्म कब हुआ था?",
]
@@ -18,7 +18,6 @@ class CroatianDefaults(Language.Defaults):
    )
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
    stop_words = STOP_WORDS
    resources = {"lemma_lookup": "lemma_lookup.json"}


class Croatian(Language):

1313609
spacy/lang/hr/lemma_lookup.json
File diff suppressed because it is too large
@@ -24,7 +24,6 @@ class HungarianDefaults(Language.Defaults):
    suffixes = TOKENIZER_SUFFIXES
    infixes = TOKENIZER_INFIXES
    token_match = TOKEN_MATCH
    resources = {"lemma_lookup": "lemma_lookup.json"}


class Hungarian(Language):

File diff suppressed because it is too large
@@ -30,7 +30,6 @@ class IndonesianDefaults(Language.Defaults):
    infixes = TOKENIZER_INFIXES
    syntax_iterators = SYNTAX_ITERATORS
    tag_map = TAG_MAP
    resources = {"lemma_lookup": "lemma_lookup.json"}


class Indonesian(Language):

File diff suppressed because it is too large
@@ -23,7 +23,6 @@ class ItalianDefaults(Language.Defaults):
    stop_words = STOP_WORDS
    tag_map = TAG_MAP
    infixes = TOKENIZER_INFIXES
    resources = {"lemma_lookup": "lemma_lookup.json"}


class Italian(Language):

File diff suppressed because it is too large
@@ -30,7 +30,6 @@ class LithuanianDefaults(Language.Defaults):
    stop_words = STOP_WORDS
    tag_map = TAG_MAP
    morph_rules = MORPH_RULES
    resources = {"lemma_lookup": "lemma_lookup.json"}


class Lithuanian(Language):

File diff suppressed because it is too large
@@ -25,11 +25,6 @@ class NorwegianDefaults(Language.Defaults):
    morph_rules = MORPH_RULES
    tag_map = TAG_MAP
    syntax_iterators = SYNTAX_ITERATORS
    resources = {
        "lemma_lookup": "lemmatizer/lemma_lookup.json",
        "lemma_exc": "lemmatizer/lemma_exc.json",
        "lemma_rules": "lemmatizer/lemma_rules.json",
    }


class Norwegian(Language):
@@ -1,7 +0,0 @@
Note on noun wordforms / lemmas:
All wordforms are extracted from Norsk Ordbank in Norwegian Bokmål 2005, updated 20180627
(CLARINO NB - Språkbanken), Nasjonalbiblioteket, Norway:
https://www.nb.no/sprakbanken/show?serial=oai%3Anb.no%3Asbr-5&lang=en

License:
Creative_Commons-BY (CC-BY) (https://creativecommons.org/licenses/by/4.0/)

File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -1,24 +0,0 @@
|
|||
{
|
||||
"adj": [
|
||||
["e", ""],
|
||||
["ere", ""],
|
||||
["est", ""],
|
||||
["este", ""]
|
||||
],
|
||||
"noun": [
|
||||
["en", "e"],
|
||||
["a", "e"],
|
||||
["et", ""],
|
||||
["er", "e"],
|
||||
["ene", "e"]
|
||||
],
|
||||
"verb": [
|
||||
["er", "e"],
|
||||
["et", "e"],
|
||||
["a", "e"],
|
||||
["es", "e"],
|
||||
["te", "e"],
|
||||
["år", "å"]
|
||||
],
|
||||
"punct": []
|
||||
}
|
|
@@ -10,8 +10,9 @@ from .lemmatizer import DutchLemmatizer
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...lookups import Lookups
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups, get_lemma_tables
from ...util import update_exc, add_lookups


class DutchDefaults(Language.Defaults):

@@ -26,17 +27,12 @@ class DutchDefaults(Language.Defaults):
    tag_map = TAG_MAP
    infixes = TOKENIZER_INFIXES
    suffixes = TOKENIZER_SUFFIXES
    resources = {
        "lemma_rules": "lemmatizer/lemma_rules.json",
        "lemma_index": "lemmatizer/lemma_index.json",
        "lemma_exc": "lemmatizer/lemma_exc.json",
        "lemma_lookup": "lemmatizer/lemma_lookup.json",
    }

    @classmethod
    def create_lemmatizer(cls, nlp=None, lookups=None):
        lemma_rules, lemma_index, lemma_exc, lemma_lookup = get_lemma_tables(lookups)
        return DutchLemmatizer(lemma_index, lemma_exc, lemma_rules, lemma_lookup)
        if lookups is None:
            lookups = Lookups()
        return DutchLemmatizer(lookups)


class Dutch(Language):
@@ -1,10 +1,11 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ....symbols import NOUN, VERB, ADJ, NUM, DET, PRON, ADP, AUX, ADV
|
||||
from ...lemmatizer import Lemmatizer
|
||||
from ...symbols import NOUN, VERB, ADJ, NUM, DET, PRON, ADP, AUX, ADV
|
||||
|
||||
|
||||
class DutchLemmatizer(object):
|
||||
class DutchLemmatizer(Lemmatizer):
|
||||
# Note: CGN does not distinguish AUX verbs, so we treat AUX as VERB.
|
||||
univ_pos_name_variants = {
|
||||
NOUN: "noun",
|
||||
|
@@ -36,16 +37,6 @@ class DutchLemmatizer(object):
|
|||
"num": "num",
|
||||
}
|
||||
|
||||
@classmethod
|
||||
def load(cls, path, index=None, exc=None, rules=None, lookup=None):
|
||||
return cls(index, exc, rules, lookup)
|
||||
|
||||
def __init__(self, index=None, exceptions=None, rules=None, lookup=None):
|
||||
self.index = index
|
||||
self.exc = exceptions
|
||||
self.rules = rules or {}
|
||||
self.lookup_table = lookup if lookup is not None else {}
|
||||
|
||||
def __call__(self, string, univ_pos, morphology=None):
|
||||
# Difference 1: self.rules is assumed to be non-None, so no
|
||||
# 'is None' check required.
|
||||
|
@@ -62,27 +53,28 @@ class DutchLemmatizer(object):
|
|||
# are not lemmatized. They are lowercased, however.
|
||||
return [string]
|
||||
# if string in self.lemma_index.get(univ_pos)
|
||||
lemma_index = self.index.get(univ_pos, {})
|
||||
index_table = self.lookups.get_table("lemma_index", {})
|
||||
lemma_index = index_table.get(univ_pos, {})
|
||||
# string is already lemma
|
||||
if string in lemma_index:
|
||||
return [string]
|
||||
exceptions = self.exc.get(univ_pos, {})
|
||||
exc_table = self.lookups.get_table("lemma_exc", {})
|
||||
exceptions = exc_table.get(univ_pos, {})
|
||||
# string is irregular token contained in exceptions index.
|
||||
try:
|
||||
lemma = exceptions[string]
|
||||
return [lemma[0]]
|
||||
except KeyError:
|
||||
pass
|
||||
# string corresponds to key in lookup table
|
||||
lookup_table = self.lookup_table
|
||||
# string corresponds to key in lookup table
|
||||
lookup_table = self.lookups.get_table("lemma_lookup", {})
|
||||
looked_up_lemma = lookup_table.get(string)
|
||||
if looked_up_lemma and looked_up_lemma in lemma_index:
|
||||
return [looked_up_lemma]
|
||||
|
||||
forms, is_known = lemmatize(
|
||||
string, lemma_index, exceptions, self.rules.get(univ_pos, [])
|
||||
rules_table = self.lookups.get_table("lemma_rules", {})
|
||||
forms, is_known = self.lemmatize(
|
||||
string, lemma_index, exceptions, rules_table.get(univ_pos, [])
|
||||
)
|
||||
|
||||
# Back-off through remaining return value candidates.
|
||||
if forms:
|
||||
if is_known:
|
||||
|
@@ -103,44 +95,26 @@ class DutchLemmatizer(object):
|
|||
# Overrides parent method so that a lowercased version of the string is
|
||||
# used to search the lookup table. This is necessary because our lookup
|
||||
# table consists entirely of lowercase keys.
|
||||
def lookup(self, string):
|
||||
def lookup(self, string, orth=None):
|
||||
lookup_table = self.lookups.get_table("lemma_lookup", {})
|
||||
string = string.lower()
|
||||
return self.lookup_table.get(string, string)
|
||||
if orth is not None:
|
||||
return lookup_table.get(orth, string)
|
||||
else:
|
||||
return lookup_table.get(string, string)
|
||||
|
||||
def noun(self, string, morphology=None):
|
||||
return self(string, "noun", morphology)
|
||||
|
||||
def verb(self, string, morphology=None):
|
||||
return self(string, "verb", morphology)
|
||||
|
||||
def adj(self, string, morphology=None):
|
||||
return self(string, "adj", morphology)
|
||||
|
||||
def det(self, string, morphology=None):
|
||||
return self(string, "det", morphology)
|
||||
|
||||
def pron(self, string, morphology=None):
|
||||
return self(string, "pron", morphology)
|
||||
|
||||
def adp(self, string, morphology=None):
|
||||
return self(string, "adp", morphology)
|
||||
|
||||
def punct(self, string, morphology=None):
|
||||
return self(string, "punct", morphology)
|
||||
|
||||
|
||||
# Reimplemented to focus more on application of suffix rules and to return
|
||||
# as early as possible.
|
||||
def lemmatize(string, index, exceptions, rules):
|
||||
# returns (forms, is_known: bool)
|
||||
oov_forms = []
|
||||
for old, new in rules:
|
||||
if string.endswith(old):
|
||||
form = string[: len(string) - len(old)] + new
|
||||
if not form:
|
||||
pass
|
||||
elif form in index:
|
||||
return [form], True # True = Is known (is lemma)
|
||||
else:
|
||||
oov_forms.append(form)
|
||||
return list(set(oov_forms)), False
|
||||
# Reimplemented to focus more on application of suffix rules and to return
|
||||
# as early as possible.
|
||||
def lemmatize(self, string, index, exceptions, rules):
|
||||
# returns (forms, is_known: bool)
|
||||
oov_forms = []
|
||||
for old, new in rules:
|
||||
if string.endswith(old):
|
||||
form = string[: len(string) - len(old)] + new
|
||||
if not form:
|
||||
pass
|
||||
elif form in index:
|
||||
return [form], True # True = Is known (is lemma)
|
||||
else:
|
||||
oov_forms.append(form)
|
||||
return list(set(oov_forms)), False
|
File diff suppressed because it is too large
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
@@ -1,55 +0,0 @@
|
|||
{
|
||||
"adj": [
|
||||
["sten", ""],
|
||||
["ende", "end"],
|
||||
["ste", ""],
|
||||
["st", ""],
|
||||
["er", ""],
|
||||
["en", ""],
|
||||
["e", ""]
|
||||
],
|
||||
"noun": [
|
||||
["heden", "heid"],
|
||||
["elen", "eel"],
|
||||
["ezen", "ees"],
|
||||
["even", "eef"],
|
||||
["ssen", "s"],
|
||||
["rren", "r"],
|
||||
["kken", "k"],
|
||||
["bben", "b"],
|
||||
["'er", ""],
|
||||
["tje", ""],
|
||||
["kje", ""],
|
||||
["ici", "icus"],
|
||||
["en", ""],
|
||||
["ën", ""],
|
||||
["'s", ""],
|
||||
["s", ""]
|
||||
],
|
||||
"verb": [
|
||||
["dden", "den"],
|
||||
["tten", "ten"],
|
||||
["dde", "den"],
|
||||
["tte", "ten"],
|
||||
["end", "en"],
|
||||
["dt", "den"],
|
||||
["de", "en"],
|
||||
["te", "en"]
|
||||
],
|
||||
"num": [
|
||||
["sten", ""],
|
||||
["tjes", ""],
|
||||
["ste", ""],
|
||||
["ën", ""],
|
||||
["en", ""],
|
||||
["de", ""],
|
||||
["er", ""],
|
||||
["ër", ""]
|
||||
],
|
||||
"punct": [
|
||||
["“", "\""],
|
||||
["”", "\""],
|
||||
["‘", "'"],
|
||||
["’", "'"]
|
||||
]
|
||||
}
|
|
@@ -27,7 +27,6 @@ class PortugueseDefaults(Language.Defaults):
    tag_map = TAG_MAP
    infixes = TOKENIZER_INFIXES
    prefixes = TOKENIZER_PREFIXES
    resources = {"lemma_lookup": "lemma_lookup.json"}


class Portuguese(Language):

File diff suppressed because it is too large
@@ -24,7 +24,6 @@ class RomanianDefaults(Language.Defaults):
    )
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = STOP_WORDS
    resources = {"lemma_lookup": "lemma_lookup.json"}
    tag_map = TAG_MAP

File diff suppressed because it is too large
@@ -12,6 +12,7 @@ from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...util import update_exc, add_lookups
from ...language import Language
from ...lookups import Lookups
from ...attrs import LANG, NORM


@@ -27,8 +28,10 @@ class RussianDefaults(Language.Defaults):
    tag_map = TAG_MAP

    @classmethod
    def create_lemmatizer(cls, nlp=None, **kwargs):
        return RussianLemmatizer()
    def create_lemmatizer(cls, nlp=None, lookups=None):
        if lookups is None:
            lookups = Lookups()
        return RussianLemmatizer(lookups)


class Russian(Language):
@@ -9,8 +9,8 @@ from ...compat import unicode_
class RussianLemmatizer(Lemmatizer):
    _morph = None

    def __init__(self):
        super(RussianLemmatizer, self).__init__()
    def __init__(self, lookups=None):
        super(RussianLemmatizer, self).__init__(lookups)
        try:
            from pymorphy2 import MorphAnalyzer
        except ImportError:

@@ -102,20 +102,7 @@ class RussianLemmatizer(Lemmatizer):
            return symbols_to_str[univ_pos]
        return None

    def is_base_form(self, univ_pos, morphology=None):
        # TODO
        raise NotImplementedError

    def det(self, string, morphology=None):
        return self(string, "det", morphology)

    def num(self, string, morphology=None):
        return self(string, "num", morphology)

    def pron(self, string, morphology=None):
        return self(string, "pron", morphology)

    def lookup(self, string):
    def lookup(self, string, orth=None):
        analyses = self._morph.parse(string)
        if len(analyses) == 1:
            return analyses[0].normal_form
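(Illustration, not part of the diff: the Russian lemmatizer, like the Ukrainian one further down, keeps delegating morphology to pymorphy2; only the constructor and the lookup() signature change to fit the Lookups-based interface. A hedged usage sketch, assuming pymorphy2 and its Russian dictionaries are installed.)

from spacy.lang.ru import Russian  # RussianLemmatizer requires the pymorphy2 package

# create_lemmatizer() now builds an empty Lookups container internally.
lemmatizer = Russian.Defaults.create_lemmatizer()
# lookup() accepts an optional orth argument like the other lemmatizers and
# returns the pymorphy2 normal form when the analysis is unambiguous.
print(lemmatizer.lookup("гуляли"))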
@@ -21,7 +21,6 @@ class SerbianDefaults(Language.Defaults):
    )
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = STOP_WORDS
    resources = {"lemma_lookup": "lemma_lookup.json"}


class Serbian(Language):

File diff suppressed because it is too large
@@ -29,10 +29,6 @@ class SwedishDefaults(Language.Defaults):
    suffixes = TOKENIZER_SUFFIXES
    stop_words = STOP_WORDS
    morph_rules = MORPH_RULES
    resources = {
        "lemma_lookup": "lemmatizer/lemma_lookup.json",
        "lemma_rules": "lemmatizer/lemma_rules.json",
    }


class Swedish(Language):

File diff suppressed because it is too large
@@ -1,103 +0,0 @@
|
|||
{
|
||||
"noun": [
|
||||
["t", ""],
|
||||
["n", ""],
|
||||
["na", ""],
|
||||
["na", "e"],
|
||||
["or", "a"],
|
||||
["orna", "a"],
|
||||
["et", ""],
|
||||
["en", ""],
|
||||
["en", "e"],
|
||||
["er", ""],
|
||||
["erna", ""],
|
||||
["ar", "e"],
|
||||
["ar", ""],
|
||||
["lar", "el"],
|
||||
["arna", "e"],
|
||||
["arna", ""],
|
||||
["larna", "el"]
|
||||
],
|
||||
"verb": [
|
||||
["r", ""],
|
||||
["de", ""],
|
||||
["t", ""],
|
||||
["er", ""],
|
||||
["te", ""],
|
||||
["a", ""],
|
||||
["e", ""],
|
||||
["t", "d"],
|
||||
["tt", "d"],
|
||||
["tt", ""],
|
||||
["ev", "iv"],
|
||||
["ack", "ick"],
|
||||
["ög", "yg"],
|
||||
["it", ""],
|
||||
["uckit", "ick"],
|
||||
["ugit", "yg"],
|
||||
["it", "et"],
|
||||
["id", "ed"],
|
||||
["ip", "ep"],
|
||||
["iv", "ev"],
|
||||
["in", "en"],
|
||||
["ik", "ek"],
|
||||
["ig", "eg"],
|
||||
["ind", ""],
|
||||
["inn", "ann"],
|
||||
["nder", "nd"],
|
||||
["inner", "inn"],
|
||||
["and", "ind"],
|
||||
["ann", "inn"],
|
||||
["s", ""],
|
||||
["anns", "inn"],
|
||||
["undit", "ind"],
|
||||
["unnit", "inn"],
|
||||
["unnits", "inn"],
|
||||
["uppit", "ipp"],
|
||||
["ungit", "ing"],
|
||||
["öd", "ud"],
|
||||
["öt", "jut"],
|
||||
["öt", "ut"],
|
||||
["ög", "ug"],
|
||||
["ögg", "ugg"],
|
||||
["öng", "ung"],
|
||||
["önk", "unk"],
|
||||
["öt", "yt"],
|
||||
["utit", "yt"],
|
||||
["ös", "ys"],
|
||||
["öv", "yv"],
|
||||
["uvit", "yv"],
|
||||
["öp", "yp"],
|
||||
["upit", "yp"],
|
||||
["ök", "yk"],
|
||||
["ukit", "yk"],
|
||||
["or", "ar"],
|
||||
["öll", "all"],
|
||||
["ät", "åt"],
|
||||
["öll", "åll"],
|
||||
["or", "är"],
|
||||
["urit", "är"],
|
||||
["åt", "ät"],
|
||||
["ar", "är"],
|
||||
["alt", "ält"],
|
||||
["ultit", "ält"]
|
||||
],
|
||||
"adj": [
|
||||
["are", ""],
|
||||
["ast", ""],
|
||||
["re", ""],
|
||||
["st", ""],
|
||||
["ägre", "åg"],
|
||||
["ägst", "åg"],
|
||||
["ängre", "ång"],
|
||||
["ängst", "ång"],
|
||||
["örre", "or"],
|
||||
["örst", "or"]
|
||||
],
|
||||
"punct": [
|
||||
["“", "\""],
|
||||
["”", "\""],
|
||||
["‘", "'"],
|
||||
["’", "'"]
|
||||
]
|
||||
}
|
|
@@ -24,7 +24,6 @@ class TagalogDefaults(Language.Defaults):
    lex_attr_getters.update(LEX_ATTRS)
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = STOP_WORDS
    resources = {"lemma_lookup": "lemma_lookup.json"}


class Tagalog(Language):
@@ -1,9 +0,0 @@
{
    "kaugnayan": "ugnay",
    "sangkatauhan": "tao",
    "kanayunan": "nayon",
    "pandaigdigan": "daigdig",
    "kasaysayan": "saysay",
    "kabayanihan": "bayani",
    "karuwagan": "duwag"
}
@@ -10,9 +10,6 @@ from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups

# Lemma data source:
# http://st2.zargan.com/duyuru/Zargan_Linguistic_Resources_for_Turkish.html - Bilgin, O. (2016). Biçimbilimsel Bakımdan Karmaşık Türkçe Kelimelerin İşlenmesinde Frekans Etkileri (yayınlanmamış yüksek lisans tezi). Boğaziçi Üniversitesi, İstanbul. Erişim: http://st2.zargan.com/public/resources/turkish/frequency_effects_in_turkish.pdf


class TurkishDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)

@@ -22,7 +19,6 @@ class TurkishDefaults(Language.Defaults):
    )
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = STOP_WORDS
    resources = {"lemma_lookup": "lemma_lookup.json"}


class Turkish(Language):

1333973
spacy/lang/tr/lemma_lookup.json
File diff suppressed because it is too large
@@ -9,6 +9,7 @@ from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...util import update_exc, add_lookups
from ...language import Language
from ...lookups import Lookups
from ...attrs import LANG, NORM
from .lemmatizer import UkrainianLemmatizer


@@ -24,8 +25,10 @@ class UkrainianDefaults(Language.Defaults):
    stop_words = STOP_WORDS

    @classmethod
    def create_lemmatizer(cls, nlp=None, **kwargs):
        return UkrainianLemmatizer()
    def create_lemmatizer(cls, nlp=None, lookups=None):
        if lookups is None:
            lookups = Lookups()
        return UkrainianLemmatizer(lookups)


class Ukrainian(Language):
@@ -6,8 +6,8 @@ from ...lemmatizer import Lemmatizer
class UkrainianLemmatizer(Lemmatizer):
    _morph = None

    def __init__(self):
        super(UkrainianLemmatizer, self).__init__()
    def __init__(self, lookups=None):
        super(UkrainianLemmatizer, self).__init__(lookups)
        try:
            from pymorphy2 import MorphAnalyzer

@@ -99,20 +99,7 @@ class UkrainianLemmatizer(Lemmatizer):
            return symbols_to_str[univ_pos]
        return None

    def is_base_form(self, univ_pos, morphology=None):
        # TODO
        raise NotImplementedError

    def det(self, string, morphology=None):
        return self(string, "det", morphology)

    def num(self, string, morphology=None):
        return self(string, "num", morphology)

    def pron(self, string, morphology=None):
        return self(string, "pron", morphology)

    def lookup(self, string):
    def lookup(self, string, orth=None):
        analyses = self._morph.parse(string)
        if len(analyses) == 1:
            return analyses[0].normal_form
Some files were not shown because too many files have changed in this diff