mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 18:26:30 +03:00
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. --> ## Description - [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files. - [x] Update flake8 config to exclude very large files (lemmatization tables etc.) - [x] Update code to be compatible with flake8 rules - [x] Fix various small bugs, inconsistencies and messy stuff in the language data - [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means) Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results. At the moment, the code style and linting aren't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information. ### Types of change enhancement, code style ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
This commit is contained in:
parent
852bc2ac16
commit
eddeb36c96
11
.flake8
11
.flake8
|
@ -1,4 +1,13 @@
|
||||||
[flake8]
|
[flake8]
|
||||||
ignore = E203, E266, E501, W503
|
ignore = E203, E266, E501, E731, W503
|
||||||
max-line-length = 80
|
max-line-length = 80
|
||||||
select = B,C,E,F,W,T4,B9
|
select = B,C,E,F,W,T4,B9
|
||||||
|
exclude =
|
||||||
|
.env,
|
||||||
|
.git,
|
||||||
|
__pycache__,
|
||||||
|
lemmatizer.py,
|
||||||
|
lookup.py,
|
||||||
|
_tokenizer_exceptions_list.py,
|
||||||
|
spacy/lang/fr/lemmatizer,
|
||||||
|
spacy/lang/nb/lemmatizer
|
||||||
|
|
100
CONTRIBUTING.md
100
CONTRIBUTING.md
|
@ -186,13 +186,99 @@ sure your test passes and reference the issue in your commit message.
|
||||||
## Code conventions
|
## Code conventions
|
||||||
|
|
||||||
Code should loosely follow [pep8](https://www.python.org/dev/peps/pep-0008/).
|
Code should loosely follow [pep8](https://www.python.org/dev/peps/pep-0008/).
|
||||||
Regular line length is **80 characters**, with some tolerance for lines up to
|
As of `v2.1.0`, spaCy uses [`black`](https://github.com/ambv/black) for code
|
||||||
90 characters if the alternative would be worse — for instance, if your list
|
formatting and [`flake8`](http://flake8.pycqa.org/en/latest/) for linting its
|
||||||
comprehension comes to 82 characters, it's better not to split it over two lines.
|
Python modules. If you've built spaCy from source, you'll already have both
|
||||||
You can also use a linter like [`flake8`](https://pypi.python.org/pypi/flake8)
|
tools installed.
|
||||||
or [`frosted`](https://pypi.python.org/pypi/frosted) – just keep in mind that
|
|
||||||
it won't work very well for `.pyx` files and will complain about Cython syntax
|
**⚠️ Note that formatting and linting are currently only possible for Python
|
||||||
like `<int*>` or `cimport`.
|
modules in `.py` files, not Cython modules in `.pyx` and `.pxd` files.**
|
||||||
|
|
||||||
|
### Code formatting
|
||||||
|
|
||||||
|
[`black`](https://github.com/ambv/black) is an opinionated Python code
|
||||||
|
formatter, optimised to produce readable code and small diffs. You can run
|
||||||
|
`black` from the command-line, or via your code editor. For example, if you're
|
||||||
|
using [Visual Studio Code](https://code.visualstudio.com/), you can add the
|
||||||
|
following to your `settings.json` to use `black` for formatting and auto-format
|
||||||
|
your files on save:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"python.formatting.provider": "black",
|
||||||
|
"[python]": {
|
||||||
|
"editor.formatOnSave": true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
[See here](https://github.com/ambv/black#editor-integration) for the full
|
||||||
|
list of available editor integrations.
|
||||||
|
|
||||||
|
#### Disabling formatting
|
||||||
|
|
||||||
|
There are a few cases where auto-formatting doesn't improve readability – for
|
||||||
|
example, in some of the language data files like `tag_map.py`, or in
|
||||||
|
the tests that construct `Doc` objects from lists of words and other labels.
|
||||||
|
Wrapping a block in `# fmt: off` and `# fmt: on` lets you disable formatting
|
||||||
|
for that particular code. Here's an example:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# fmt: off
|
||||||
|
text = "I look forward to using Thingamajig. I've been told it will make my life easier..."
|
||||||
|
heads = [1, 0, -1, -2, -1, -1, -5, -1, 3, 2, 1, 0, 2, 1, -3, 1, 1, -3, -7]
|
||||||
|
deps = ["nsubj", "ROOT", "advmod", "prep", "pcomp", "dobj", "punct", "",
|
||||||
|
"nsubjpass", "aux", "auxpass", "ROOT", "nsubj", "aux", "ccomp",
|
||||||
|
"poss", "nsubj", "ccomp", "punct"]
|
||||||
|
# fmt: on
|
||||||
|
```
|
||||||
|
|
||||||
|
### Code linting
|
||||||
|
|
||||||
|
[`flake8`](http://flake8.pycqa.org/en/latest/) is a tool for enforcing code
|
||||||
|
style. It scans one or more files and outputs errors and warnings. This feedback
|
||||||
|
can help you stick to general standards and conventions, and can be very useful
|
||||||
|
for spotting potential mistakes and inconsistencies in your code. The most
|
||||||
|
important things to watch out for are syntax errors and undefined names, but you
|
||||||
|
also want to keep an eye on unused declared variables or repeated
|
||||||
|
(i.e. overwritten) dictionary keys. If your code was formatted with `black`
|
||||||
|
(see above), you shouldn't see any formatting-related warnings.
|
||||||
|
|
||||||
|
The [`.flake8`](.flake8) config defines the configuration we use for this
|
||||||
|
codebase. For example, we're not super strict about the line length, and we're
|
||||||
|
excluding very large files like lemmatization and tokenizer exception tables.
|
||||||
|
|
||||||
|
Ideally, running the following command from within the repo directory should
|
||||||
|
not return any errors or warnings:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
flake8 spacy
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Disabling linting
|
||||||
|
|
||||||
|
Sometimes, you explicitly want to write code that's not compatible with our
|
||||||
|
rules. For example, a module's `__init__.py` might import a function so other
|
||||||
|
modules can import it from there, but `flake8` will complain about an unused
|
||||||
|
import. And although it's generally discouraged, there might be cases where it
|
||||||
|
makes sense to use a bare `except`.
|
||||||
|
|
||||||
|
To ignore a given line, you can add a comment like `# noqa: F401`, specifying
|
||||||
|
the code of the error or warning we want to ignore. It's also possible to
|
||||||
|
ignore several comma-separated codes at once, e.g. `# noqa: E731,E123`. Here
|
||||||
|
are some examples:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# The imported class isn't used in this file, but imported here, so it can be
|
||||||
|
# imported *from* here by another module.
|
||||||
|
from .submodule import SomeClass # noqa: F401
|
||||||
|
|
||||||
|
try:
|
||||||
|
do_something()
|
||||||
|
except: # noqa: E722
|
||||||
|
# This bare except is justified, for some specific reason
|
||||||
|
do_something_else()
|
||||||
|
```
|
||||||
|
|
||||||
### Python conventions
|
### Python conventions
|
||||||
|
|
||||||
|
|
|
@ -35,41 +35,49 @@ import subprocess
|
||||||
import argparse
|
import argparse
|
||||||
|
|
||||||
|
|
||||||
HASH_FILE = 'cythonize.json'
|
HASH_FILE = "cythonize.json"
|
||||||
|
|
||||||
|
|
||||||
def process_pyx(fromfile, tofile, language_level='-2'):
|
def process_pyx(fromfile, tofile, language_level="-2"):
|
||||||
print('Processing %s' % fromfile)
|
print("Processing %s" % fromfile)
|
||||||
try:
|
try:
|
||||||
from Cython.Compiler.Version import version as cython_version
|
from Cython.Compiler.Version import version as cython_version
|
||||||
from distutils.version import LooseVersion
|
from distutils.version import LooseVersion
|
||||||
if LooseVersion(cython_version) < LooseVersion('0.19'):
|
|
||||||
raise Exception('Require Cython >= 0.19')
|
if LooseVersion(cython_version) < LooseVersion("0.19"):
|
||||||
|
raise Exception("Require Cython >= 0.19")
|
||||||
|
|
||||||
except ImportError:
|
except ImportError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
flags = ['--fast-fail', language_level]
|
flags = ["--fast-fail", language_level]
|
||||||
if tofile.endswith('.cpp'):
|
if tofile.endswith(".cpp"):
|
||||||
flags += ['--cplus']
|
flags += ["--cplus"]
|
||||||
|
|
||||||
try:
|
try:
|
||||||
try:
|
try:
|
||||||
r = subprocess.call(['cython'] + flags + ['-o', tofile, fromfile],
|
r = subprocess.call(
|
||||||
env=os.environ) # See Issue #791
|
["cython"] + flags + ["-o", tofile, fromfile], env=os.environ
|
||||||
|
) # See Issue #791
|
||||||
if r != 0:
|
if r != 0:
|
||||||
raise Exception('Cython failed')
|
raise Exception("Cython failed")
|
||||||
except OSError:
|
except OSError:
|
||||||
# There are ways of installing Cython that don't result in a cython
|
# There are ways of installing Cython that don't result in a cython
|
||||||
# executable on the path, see gh-2397.
|
# executable on the path, see gh-2397.
|
||||||
r = subprocess.call([sys.executable, '-c',
|
r = subprocess.call(
|
||||||
'import sys; from Cython.Compiler.Main import '
|
[
|
||||||
'setuptools_main as main; sys.exit(main())'] + flags +
|
sys.executable,
|
||||||
['-o', tofile, fromfile])
|
"-c",
|
||||||
|
"import sys; from Cython.Compiler.Main import "
|
||||||
|
"setuptools_main as main; sys.exit(main())",
|
||||||
|
]
|
||||||
|
+ flags
|
||||||
|
+ ["-o", tofile, fromfile]
|
||||||
|
)
|
||||||
if r != 0:
|
if r != 0:
|
||||||
raise Exception('Cython failed')
|
raise Exception("Cython failed")
|
||||||
except OSError:
|
except OSError:
|
||||||
raise OSError('Cython needs to be installed')
|
raise OSError("Cython needs to be installed")
|
||||||
|
|
||||||
|
|
||||||
def preserve_cwd(path, func, *args):
|
def preserve_cwd(path, func, *args):
|
||||||
|
@ -89,12 +97,12 @@ def load_hashes(filename):
|
||||||
|
|
||||||
|
|
||||||
def save_hashes(hash_db, filename):
|
def save_hashes(hash_db, filename):
|
||||||
with open(filename, 'w') as f:
|
with open(filename, "w") as f:
|
||||||
f.write(json.dumps(hash_db))
|
f.write(json.dumps(hash_db))
|
||||||
|
|
||||||
|
|
||||||
def get_hash(path):
|
def get_hash(path):
|
||||||
return hashlib.md5(open(path, 'rb').read()).hexdigest()
|
return hashlib.md5(open(path, "rb").read()).hexdigest()
|
||||||
|
|
||||||
|
|
||||||
def hash_changed(base, path, db):
|
def hash_changed(base, path, db):
|
||||||
|
@ -109,25 +117,27 @@ def hash_add(base, path, db):
|
||||||
|
|
||||||
def process(base, filename, db):
|
def process(base, filename, db):
|
||||||
root, ext = os.path.splitext(filename)
|
root, ext = os.path.splitext(filename)
|
||||||
if ext in ['.pyx', '.cpp']:
|
if ext in [".pyx", ".cpp"]:
|
||||||
if hash_changed(base, filename, db) or not os.path.isfile(os.path.join(base, root + '.cpp')):
|
if hash_changed(base, filename, db) or not os.path.isfile(
|
||||||
preserve_cwd(base, process_pyx, root + '.pyx', root + '.cpp')
|
os.path.join(base, root + ".cpp")
|
||||||
hash_add(base, root + '.cpp', db)
|
):
|
||||||
hash_add(base, root + '.pyx', db)
|
preserve_cwd(base, process_pyx, root + ".pyx", root + ".cpp")
|
||||||
|
hash_add(base, root + ".cpp", db)
|
||||||
|
hash_add(base, root + ".pyx", db)
|
||||||
|
|
||||||
|
|
||||||
def check_changes(root, db):
|
def check_changes(root, db):
|
||||||
res = False
|
res = False
|
||||||
new_db = {}
|
new_db = {}
|
||||||
|
|
||||||
setup_filename = 'setup.py'
|
setup_filename = "setup.py"
|
||||||
hash_add('.', setup_filename, new_db)
|
hash_add(".", setup_filename, new_db)
|
||||||
if hash_changed('.', setup_filename, db):
|
if hash_changed(".", setup_filename, db):
|
||||||
res = True
|
res = True
|
||||||
|
|
||||||
for base, _, files in os.walk(root):
|
for base, _, files in os.walk(root):
|
||||||
for filename in files:
|
for filename in files:
|
||||||
if filename.endswith('.pxd'):
|
if filename.endswith(".pxd"):
|
||||||
hash_add(base, filename, new_db)
|
hash_add(base, filename, new_db)
|
||||||
if hash_changed(base, filename, db):
|
if hash_changed(base, filename, db):
|
||||||
res = True
|
res = True
|
||||||
|
@ -150,8 +160,10 @@ def run(root):
|
||||||
save_hashes(db, HASH_FILE)
|
save_hashes(db, HASH_FILE)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == "__main__":
|
||||||
parser = argparse.ArgumentParser(description='Cythonize pyx files into C++ files as needed')
|
parser = argparse.ArgumentParser(
|
||||||
parser.add_argument('root', help='root directory')
|
description="Cythonize pyx files into C++ files as needed"
|
||||||
|
)
|
||||||
|
parser.add_argument("root", help="root directory")
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
run(args.root)
|
run(args.root)
|
||||||
|
|
|
@ -15,12 +15,13 @@ _unset = object()
|
||||||
|
|
||||||
class Reddit(object):
|
class Reddit(object):
|
||||||
"""Stream cleaned comments from Reddit."""
|
"""Stream cleaned comments from Reddit."""
|
||||||
pre_format_re = re.compile(r'^[\`\*\~]')
|
|
||||||
post_format_re = re.compile(r'[\`\*\~]$')
|
|
||||||
url_re = re.compile(r'\[([^]]+)\]\(%%URL\)')
|
|
||||||
link_re = re.compile(r'\[([^]]+)\]\(https?://[^\)]+\)')
|
|
||||||
|
|
||||||
def __init__(self, file_path, meta_keys={'subreddit': 'section'}):
|
pre_format_re = re.compile(r"^[\`\*\~]")
|
||||||
|
post_format_re = re.compile(r"[\`\*\~]$")
|
||||||
|
url_re = re.compile(r"\[([^]]+)\]\(%%URL\)")
|
||||||
|
link_re = re.compile(r"\[([^]]+)\]\(https?://[^\)]+\)")
|
||||||
|
|
||||||
|
def __init__(self, file_path, meta_keys={"subreddit": "section"}):
|
||||||
"""
|
"""
|
||||||
file_path (unicode / Path): Path to archive or directory of archives.
|
file_path (unicode / Path): Path to archive or directory of archives.
|
||||||
meta_keys (dict): Meta data key included in the Reddit corpus, mapped
|
meta_keys (dict): Meta data key included in the Reddit corpus, mapped
|
||||||
|
@ -45,28 +46,30 @@ class Reddit(object):
|
||||||
continue
|
continue
|
||||||
comment = ujson.loads(line)
|
comment = ujson.loads(line)
|
||||||
if self.is_valid(comment):
|
if self.is_valid(comment):
|
||||||
text = self.strip_tags(comment['body'])
|
text = self.strip_tags(comment["body"])
|
||||||
yield {'text': text}
|
yield {"text": text}
|
||||||
|
|
||||||
def get_meta(self, item):
|
def get_meta(self, item):
|
||||||
return {name: item.get(key, 'n/a') for key, name in self.meta.items()}
|
return {name: item.get(key, "n/a") for key, name in self.meta.items()}
|
||||||
|
|
||||||
def iter_files(self):
|
def iter_files(self):
|
||||||
for file_path in self.files:
|
for file_path in self.files:
|
||||||
yield file_path
|
yield file_path
|
||||||
|
|
||||||
def strip_tags(self, text):
|
def strip_tags(self, text):
|
||||||
text = self.link_re.sub(r'\1', text)
|
text = self.link_re.sub(r"\1", text)
|
||||||
text = text.replace('&gt;', '>').replace('&lt;', '<')
|
text = text.replace("&gt;", ">").replace("&lt;", "<")
|
||||||
text = self.pre_format_re.sub('', text)
|
text = self.pre_format_re.sub("", text)
|
||||||
text = self.post_format_re.sub('', text)
|
text = self.post_format_re.sub("", text)
|
||||||
text = re.sub(r'\s+', ' ', text)
|
text = re.sub(r"\s+", " ", text)
|
||||||
return text.strip()
|
return text.strip()
|
||||||
|
|
||||||
def is_valid(self, comment):
|
def is_valid(self, comment):
|
||||||
return comment['body'] is not None \
|
return (
|
||||||
and comment['body'] != '[deleted]' \
|
comment["body"] is not None
|
||||||
and comment['body'] != '[removed]'
|
and comment["body"] != "[deleted]"
|
||||||
|
and comment["body"] != "[removed]"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def main(path):
|
def main(path):
|
||||||
|
@ -75,8 +78,9 @@ def main(path):
|
||||||
print(ujson.dumps(comment))
|
print(ujson.dumps(comment))
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == "__main__":
|
||||||
import socket
|
import socket
|
||||||
|
|
||||||
try:
|
try:
|
||||||
BrokenPipeError
|
BrokenPipeError
|
||||||
except NameError:
|
except NameError:
|
||||||
|
@ -85,6 +89,7 @@ if __name__ == '__main__':
|
||||||
plac.call(main)
|
plac.call(main)
|
||||||
except BrokenPipeError:
|
except BrokenPipeError:
|
||||||
import os, sys
|
import os, sys
|
||||||
|
|
||||||
# Python flushes standard streams on exit; redirect remaining output
|
# Python flushes standard streams on exit; redirect remaining output
|
||||||
# to devnull to avoid another BrokenPipeError at shutdown
|
# to devnull to avoid another BrokenPipeError at shutdown
|
||||||
devnull = os.open(os.devnull, os.O_WRONLY)
|
devnull = os.open(os.devnull, os.O_WRONLY)
|
||||||
|
|
|
@ -11,7 +11,10 @@ ujson>=1.35
|
||||||
dill>=0.2,<0.3
|
dill>=0.2,<0.3
|
||||||
regex==2018.01.10
|
regex==2018.01.10
|
||||||
requests>=2.13.0,<3.0.0
|
requests>=2.13.0,<3.0.0
|
||||||
|
pathlib==1.0.1; python_version < "3.4"
|
||||||
|
# Development dependencies
|
||||||
pytest>=4.0.0,<5.0.0
|
pytest>=4.0.0,<5.0.0
|
||||||
pytest-timeout>=1.3.0,<2.0.0
|
pytest-timeout>=1.3.0,<2.0.0
|
||||||
mock>=2.0.0,<3.0.0
|
mock>=2.0.0,<3.0.0
|
||||||
pathlib==1.0.1; python_version < "3.4"
|
black==18.9b0
|
||||||
|
flake8>=3.5.0,<3.6.0
|
||||||
|
|
301
spacy/_ml.py
301
spacy/_ml.py
|
@ -14,8 +14,7 @@ from thinc.api import uniqued, wrap, noop
|
||||||
from thinc.api import with_square_sequences
|
from thinc.api import with_square_sequences
|
||||||
from thinc.linear.linear import LinearModel
|
from thinc.linear.linear import LinearModel
|
||||||
from thinc.neural.ops import NumpyOps, CupyOps
|
from thinc.neural.ops import NumpyOps, CupyOps
|
||||||
from thinc.neural.util import get_array_module, copy_array
|
from thinc.neural.util import get_array_module
|
||||||
from thinc.neural._lsuv import svd_orthonormal
|
|
||||||
from thinc.neural.optimizers import Adam
|
from thinc.neural.optimizers import Adam
|
||||||
|
|
||||||
from thinc import describe
|
from thinc import describe
|
||||||
|
@ -33,36 +32,36 @@ try:
|
||||||
except:
|
except:
|
||||||
torch = None
|
torch = None
|
||||||
|
|
||||||
VECTORS_KEY = 'spacy_pretrained_vectors'
|
VECTORS_KEY = "spacy_pretrained_vectors"
|
||||||
|
|
||||||
|
|
||||||
def cosine(vec1, vec2):
|
def cosine(vec1, vec2):
|
||||||
xp = get_array_module(vec1)
|
xp = get_array_module(vec1)
|
||||||
norm1 = xp.linalg.norm(vec1)
|
norm1 = xp.linalg.norm(vec1)
|
||||||
norm2 = xp.linalg.norm(vec2)
|
norm2 = xp.linalg.norm(vec2)
|
||||||
if norm1 == 0. or norm2 == 0.:
|
if norm1 == 0.0 or norm2 == 0.0:
|
||||||
return 0
|
return 0
|
||||||
else:
|
else:
|
||||||
return vec1.dot(vec2) / (norm1 * norm2)
|
return vec1.dot(vec2) / (norm1 * norm2)
|
||||||
|
|
||||||
|
|
||||||
def create_default_optimizer(ops, **cfg):
|
def create_default_optimizer(ops, **cfg):
|
||||||
learn_rate = util.env_opt('learn_rate', 0.001)
|
learn_rate = util.env_opt("learn_rate", 0.001)
|
||||||
beta1 = util.env_opt('optimizer_B1', 0.8)
|
beta1 = util.env_opt("optimizer_B1", 0.8)
|
||||||
beta2 = util.env_opt('optimizer_B2', 0.8)
|
beta2 = util.env_opt("optimizer_B2", 0.8)
|
||||||
eps = util.env_opt('optimizer_eps', 0.00001)
|
eps = util.env_opt("optimizer_eps", 0.00001)
|
||||||
L2 = util.env_opt('L2_penalty', 1e-6)
|
L2 = util.env_opt("L2_penalty", 1e-6)
|
||||||
max_grad_norm = util.env_opt('grad_norm_clip', 5.)
|
max_grad_norm = util.env_opt("grad_norm_clip", 5.0)
|
||||||
optimizer = Adam(ops, learn_rate, L2=L2, beta1=beta1,
|
optimizer = Adam(ops, learn_rate, L2=L2, beta1=beta1, beta2=beta2, eps=eps)
|
||||||
beta2=beta2, eps=eps)
|
|
||||||
optimizer.max_grad_norm = max_grad_norm
|
optimizer.max_grad_norm = max_grad_norm
|
||||||
optimizer.device = ops.device
|
optimizer.device = ops.device
|
||||||
return optimizer
|
return optimizer
|
||||||
|
|
||||||
|
|
||||||
@layerize
|
@layerize
|
||||||
def _flatten_add_lengths(seqs, pad=0, drop=0.):
|
def _flatten_add_lengths(seqs, pad=0, drop=0.0):
|
||||||
ops = Model.ops
|
ops = Model.ops
|
||||||
lengths = ops.asarray([len(seq) for seq in seqs], dtype='i')
|
lengths = ops.asarray([len(seq) for seq in seqs], dtype="i")
|
||||||
|
|
||||||
def finish_update(d_X, sgd=None):
|
def finish_update(d_X, sgd=None):
|
||||||
return ops.unflatten(d_X, lengths, pad=pad)
|
return ops.unflatten(d_X, lengths, pad=pad)
|
||||||
|
@ -74,14 +73,15 @@ def _flatten_add_lengths(seqs, pad=0, drop=0.):
|
||||||
def _zero_init(model):
|
def _zero_init(model):
|
||||||
def _zero_init_impl(self, X, y):
|
def _zero_init_impl(self, X, y):
|
||||||
self.W.fill(0)
|
self.W.fill(0)
|
||||||
|
|
||||||
model.on_data_hooks.append(_zero_init_impl)
|
model.on_data_hooks.append(_zero_init_impl)
|
||||||
if model.W is not None:
|
if model.W is not None:
|
||||||
model.W.fill(0.)
|
model.W.fill(0.0)
|
||||||
return model
|
return model
|
||||||
|
|
||||||
|
|
||||||
@layerize
|
@layerize
|
||||||
def _preprocess_doc(docs, drop=0.):
|
def _preprocess_doc(docs, drop=0.0):
|
||||||
keys = [doc.to_array(LOWER) for doc in docs]
|
keys = [doc.to_array(LOWER) for doc in docs]
|
||||||
ops = Model.ops
|
ops = Model.ops
|
||||||
# The dtype here matches what thinc is expecting -- which differs per
|
# The dtype here matches what thinc is expecting -- which differs per
|
||||||
|
@ -89,11 +89,12 @@ def _preprocess_doc(docs, drop=0.):
|
||||||
# is fixed on Thinc's side.
|
# is fixed on Thinc's side.
|
||||||
lengths = ops.asarray([arr.shape[0] for arr in keys], dtype=numpy.int_)
|
lengths = ops.asarray([arr.shape[0] for arr in keys], dtype=numpy.int_)
|
||||||
keys = ops.xp.concatenate(keys)
|
keys = ops.xp.concatenate(keys)
|
||||||
vals = ops.allocate(keys.shape) + 1.
|
vals = ops.allocate(keys.shape) + 1.0
|
||||||
return (keys, vals, lengths), None
|
return (keys, vals, lengths), None
|
||||||
|
|
||||||
|
|
||||||
@layerize
|
@layerize
|
||||||
def _preprocess_doc_bigrams(docs, drop=0.):
|
def _preprocess_doc_bigrams(docs, drop=0.0):
|
||||||
unigrams = [doc.to_array(LOWER) for doc in docs]
|
unigrams = [doc.to_array(LOWER) for doc in docs]
|
||||||
ops = Model.ops
|
ops = Model.ops
|
||||||
bigrams = [ops.ngrams(2, doc_unis) for doc_unis in unigrams]
|
bigrams = [ops.ngrams(2, doc_unis) for doc_unis in unigrams]
|
||||||
|
@ -104,27 +105,29 @@ def _preprocess_doc_bigrams(docs, drop=0.):
|
||||||
# is fixed on Thinc's side.
|
# is fixed on Thinc's side.
|
||||||
lengths = ops.asarray([arr.shape[0] for arr in keys], dtype=numpy.int_)
|
lengths = ops.asarray([arr.shape[0] for arr in keys], dtype=numpy.int_)
|
||||||
keys = ops.xp.concatenate(keys)
|
keys = ops.xp.concatenate(keys)
|
||||||
vals = ops.asarray(ops.xp.concatenate(vals), dtype='f')
|
vals = ops.asarray(ops.xp.concatenate(vals), dtype="f")
|
||||||
return (keys, vals, lengths), None
|
return (keys, vals, lengths), None
|
||||||
|
|
||||||
|
|
||||||
@describe.on_data(_set_dimensions_if_needed,
|
@describe.on_data(
|
||||||
lambda model, X, y: model.init_weights(model))
|
_set_dimensions_if_needed, lambda model, X, y: model.init_weights(model)
|
||||||
|
)
|
||||||
@describe.attributes(
|
@describe.attributes(
|
||||||
nI=Dimension("Input size"),
|
nI=Dimension("Input size"),
|
||||||
nF=Dimension("Number of features"),
|
nF=Dimension("Number of features"),
|
||||||
nO=Dimension("Output size"),
|
nO=Dimension("Output size"),
|
||||||
nP=Dimension("Maxout pieces"),
|
nP=Dimension("Maxout pieces"),
|
||||||
W=Synapses("Weights matrix",
|
W=Synapses("Weights matrix", lambda obj: (obj.nF, obj.nO, obj.nP, obj.nI)),
|
||||||
lambda obj: (obj.nF, obj.nO, obj.nP, obj.nI)),
|
b=Biases("Bias vector", lambda obj: (obj.nO, obj.nP)),
|
||||||
b=Biases("Bias vector",
|
pad=Synapses(
|
||||||
lambda obj: (obj.nO, obj.nP)),
|
"Pad",
|
||||||
pad=Synapses("Pad",
|
|
||||||
lambda obj: (1, obj.nF, obj.nO, obj.nP),
|
lambda obj: (1, obj.nF, obj.nO, obj.nP),
|
||||||
lambda M, ops: ops.normal_init(M, 1.)),
|
lambda M, ops: ops.normal_init(M, 1.0),
|
||||||
|
),
|
||||||
d_W=Gradient("W"),
|
d_W=Gradient("W"),
|
||||||
d_pad=Gradient("pad"),
|
d_pad=Gradient("pad"),
|
||||||
d_b=Gradient("b"))
|
d_b=Gradient("b"),
|
||||||
|
)
|
||||||
class PrecomputableAffine(Model):
|
class PrecomputableAffine(Model):
|
||||||
def __init__(self, nO=None, nI=None, nF=None, nP=None, **kwargs):
|
def __init__(self, nO=None, nI=None, nF=None, nP=None, **kwargs):
|
||||||
Model.__init__(self, **kwargs)
|
Model.__init__(self, **kwargs)
|
||||||
|
@ -133,9 +136,10 @@ class PrecomputableAffine(Model):
|
||||||
self.nI = nI
|
self.nI = nI
|
||||||
self.nF = nF
|
self.nF = nF
|
||||||
|
|
||||||
def begin_update(self, X, drop=0.):
|
def begin_update(self, X, drop=0.0):
|
||||||
Yf = self.ops.gemm(X,
|
Yf = self.ops.gemm(
|
||||||
self.W.reshape((self.nF*self.nO*self.nP, self.nI)), trans2=True)
|
X, self.W.reshape((self.nF * self.nO * self.nP, self.nI)), trans2=True
|
||||||
|
)
|
||||||
Yf = Yf.reshape((Yf.shape[0], self.nF, self.nO, self.nP))
|
Yf = Yf.reshape((Yf.shape[0], self.nF, self.nO, self.nP))
|
||||||
Yf = self._add_padding(Yf)
|
Yf = self._add_padding(Yf)
|
||||||
|
|
||||||
|
@ -146,15 +150,16 @@ class PrecomputableAffine(Model):
|
||||||
Xf = Xf.reshape((Xf.shape[0], self.nF * self.nI))
|
Xf = Xf.reshape((Xf.shape[0], self.nF * self.nI))
|
||||||
|
|
||||||
self.d_b += dY.sum(axis=0)
|
self.d_b += dY.sum(axis=0)
|
||||||
dY = dY.reshape((dY.shape[0], self.nO*self.nP))
|
dY = dY.reshape((dY.shape[0], self.nO * self.nP))
|
||||||
|
|
||||||
Wopfi = self.W.transpose((1, 2, 0, 3))
|
Wopfi = self.W.transpose((1, 2, 0, 3))
|
||||||
Wopfi = self.ops.xp.ascontiguousarray(Wopfi)
|
Wopfi = self.ops.xp.ascontiguousarray(Wopfi)
|
||||||
Wopfi = Wopfi.reshape((self.nO*self.nP, self.nF * self.nI))
|
Wopfi = Wopfi.reshape((self.nO * self.nP, self.nF * self.nI))
|
||||||
dXf = self.ops.gemm(dY.reshape((dY.shape[0], self.nO*self.nP)), Wopfi)
|
dXf = self.ops.gemm(dY.reshape((dY.shape[0], self.nO * self.nP)), Wopfi)
|
||||||
|
|
||||||
# Reuse the buffer
|
# Reuse the buffer
|
||||||
dWopfi = Wopfi; dWopfi.fill(0.)
|
dWopfi = Wopfi
|
||||||
|
dWopfi.fill(0.0)
|
||||||
self.ops.gemm(dY, Xf, out=dWopfi, trans1=True)
|
self.ops.gemm(dY, Xf, out=dWopfi, trans1=True)
|
||||||
dWopfi = dWopfi.reshape((self.nO, self.nP, self.nF, self.nI))
|
dWopfi = dWopfi.reshape((self.nO, self.nP, self.nF, self.nI))
|
||||||
# (o, p, f, i) --> (f, o, p, i)
|
# (o, p, f, i) --> (f, o, p, i)
|
||||||
|
@ -163,6 +168,7 @@ class PrecomputableAffine(Model):
|
||||||
if sgd is not None:
|
if sgd is not None:
|
||||||
sgd(self._mem.weights, self._mem.gradient, key=self.id)
|
sgd(self._mem.weights, self._mem.gradient, key=self.id)
|
||||||
return dXf.reshape((dXf.shape[0], self.nF, self.nI))
|
return dXf.reshape((dXf.shape[0], self.nF, self.nI))
|
||||||
|
|
||||||
return Yf, backward
|
return Yf, backward
|
||||||
|
|
||||||
def _add_padding(self, Yf):
|
def _add_padding(self, Yf):
|
||||||
|
@ -171,7 +177,7 @@ class PrecomputableAffine(Model):
|
||||||
|
|
||||||
def _backprop_padding(self, dY, ids):
|
def _backprop_padding(self, dY, ids):
|
||||||
# (1, nF, nO, nP) += (nN, nF, nO, nP) where IDs (nN, nF) < 0
|
# (1, nF, nO, nP) += (nN, nF, nO, nP) where IDs (nN, nF) < 0
|
||||||
mask = ids < 0.
|
mask = ids < 0.0
|
||||||
mask = mask.sum(axis=1)
|
mask = mask.sum(axis=1)
|
||||||
d_pad = dY * mask.reshape((ids.shape[0], 1, 1))
|
d_pad = dY * mask.reshape((ids.shape[0], 1, 1))
|
||||||
self.d_pad += d_pad.sum(axis=0)
|
self.d_pad += d_pad.sum(axis=0)
|
||||||
|
@ -179,33 +185,36 @@ class PrecomputableAffine(Model):
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def init_weights(model):
|
def init_weights(model):
|
||||||
'''This is like the 'layer sequential unit variance', but instead
|
"""This is like the 'layer sequential unit variance', but instead
|
||||||
of taking the actual inputs, we randomly generate whitened data.
|
of taking the actual inputs, we randomly generate whitened data.
|
||||||
|
|
||||||
Why's this all so complicated? We have a huge number of inputs,
|
Why's this all so complicated? We have a huge number of inputs,
|
||||||
and the maxout unit makes guessing the dynamics tricky. Instead
|
and the maxout unit makes guessing the dynamics tricky. Instead
|
||||||
we set the maxout weights to values that empirically result in
|
we set the maxout weights to values that empirically result in
|
||||||
whitened outputs given whitened inputs.
|
whitened outputs given whitened inputs.
|
||||||
'''
|
"""
|
||||||
if (model.W**2).sum() != 0.:
|
if (model.W ** 2).sum() != 0.0:
|
||||||
return
|
return
|
||||||
ops = model.ops
|
ops = model.ops
|
||||||
xp = ops.xp
|
xp = ops.xp
|
||||||
ops.normal_init(model.W, model.nF * model.nI, inplace=True)
|
ops.normal_init(model.W, model.nF * model.nI, inplace=True)
|
||||||
|
|
||||||
ids = ops.allocate((5000, model.nF), dtype='f')
|
ids = ops.allocate((5000, model.nF), dtype="f")
|
||||||
ids += xp.random.uniform(0, 1000, ids.shape)
|
ids += xp.random.uniform(0, 1000, ids.shape)
|
||||||
ids = ops.asarray(ids, dtype='i')
|
ids = ops.asarray(ids, dtype="i")
|
||||||
tokvecs = ops.allocate((5000, model.nI), dtype='f')
|
tokvecs = ops.allocate((5000, model.nI), dtype="f")
|
||||||
tokvecs += xp.random.normal(loc=0., scale=1.,
|
tokvecs += xp.random.normal(loc=0.0, scale=1.0, size=tokvecs.size).reshape(
|
||||||
size=tokvecs.size).reshape(tokvecs.shape)
|
tokvecs.shape
|
||||||
|
)
|
||||||
|
|
||||||
def predict(ids, tokvecs):
|
def predict(ids, tokvecs):
|
||||||
# nS ids. nW tokvecs. Exclude the padding array.
|
# nS ids. nW tokvecs. Exclude the padding array.
|
||||||
hiddens = model(tokvecs[:-1]) # (nW, f, o, p)
|
hiddens = model(tokvecs[:-1]) # (nW, f, o, p)
|
||||||
vectors = model.ops.allocate((ids.shape[0], model.nO * model.nP), dtype='f')
|
vectors = model.ops.allocate((ids.shape[0], model.nO * model.nP), dtype="f")
|
||||||
# need nS vectors
|
# need nS vectors
|
||||||
hiddens = hiddens.reshape((hiddens.shape[0] * model.nF, model.nO * model.nP))
|
hiddens = hiddens.reshape(
|
||||||
|
(hiddens.shape[0] * model.nF, model.nO * model.nP)
|
||||||
|
)
|
||||||
model.ops.scatter_add(vectors, ids.flatten(), hiddens)
|
model.ops.scatter_add(vectors, ids.flatten(), hiddens)
|
||||||
vectors = vectors.reshape((vectors.shape[0], model.nO, model.nP))
|
vectors = vectors.reshape((vectors.shape[0], model.nO, model.nP))
|
||||||
vectors += model.b
|
vectors += model.b
|
||||||
|
@ -238,7 +247,8 @@ def link_vectors_to_models(vocab):
|
||||||
if vectors.data.size != 0:
|
if vectors.data.size != 0:
|
||||||
print(
|
print(
|
||||||
"Warning: Unnamed vectors -- this won't allow multiple vectors "
|
"Warning: Unnamed vectors -- this won't allow multiple vectors "
|
||||||
"models to be loaded. (Shape: (%d, %d))" % vectors.data.shape)
|
"models to be loaded. (Shape: (%d, %d))" % vectors.data.shape
|
||||||
|
)
|
||||||
ops = Model.ops
|
ops = Model.ops
|
||||||
for word in vocab:
|
for word in vocab:
|
||||||
if word.orth in vectors.key2row:
|
if word.orth in vectors.key2row:
|
||||||
|
@ -254,28 +264,31 @@ def link_vectors_to_models(vocab):
|
||||||
def PyTorchBiLSTM(nO, nI, depth, dropout=0.2):
|
def PyTorchBiLSTM(nO, nI, depth, dropout=0.2):
|
||||||
if depth == 0:
|
if depth == 0:
|
||||||
return layerize(noop())
|
return layerize(noop())
|
||||||
model = torch.nn.LSTM(nI, nO//2, depth, bidirectional=True, dropout=dropout)
|
model = torch.nn.LSTM(nI, nO // 2, depth, bidirectional=True, dropout=dropout)
|
||||||
return with_square_sequences(PyTorchWrapperRNN(model))
|
return with_square_sequences(PyTorchWrapperRNN(model))
|
||||||
|
|
||||||
|
|
||||||
def Tok2Vec(width, embed_size, **kwargs):
|
def Tok2Vec(width, embed_size, **kwargs):
|
||||||
pretrained_vectors = kwargs.get('pretrained_vectors', None)
|
pretrained_vectors = kwargs.get("pretrained_vectors", None)
|
||||||
cnn_maxout_pieces = kwargs.get('cnn_maxout_pieces', 2)
|
cnn_maxout_pieces = kwargs.get("cnn_maxout_pieces", 2)
|
||||||
subword_features = kwargs.get('subword_features', True)
|
subword_features = kwargs.get("subword_features", True)
|
||||||
conv_depth = kwargs.get('conv_depth', 4)
|
conv_depth = kwargs.get("conv_depth", 4)
|
||||||
bilstm_depth = kwargs.get('bilstm_depth', 0)
|
bilstm_depth = kwargs.get("bilstm_depth", 0)
|
||||||
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
|
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
|
||||||
with Model.define_operators({'>>': chain, '|': concatenate, '**': clone,
|
with Model.define_operators(
|
||||||
'+': add, '*': reapply}):
|
{">>": chain, "|": concatenate, "**": clone, "+": add, "*": reapply}
|
||||||
norm = HashEmbed(width, embed_size, column=cols.index(NORM),
|
):
|
||||||
name='embed_norm')
|
norm = HashEmbed(width, embed_size, column=cols.index(NORM), name="embed_norm")
|
||||||
if subword_features:
|
if subword_features:
|
||||||
prefix = HashEmbed(width, embed_size//2, column=cols.index(PREFIX),
|
prefix = HashEmbed(
|
||||||
name='embed_prefix')
|
width, embed_size // 2, column=cols.index(PREFIX), name="embed_prefix"
|
||||||
suffix = HashEmbed(width, embed_size//2, column=cols.index(SUFFIX),
|
)
|
||||||
name='embed_suffix')
|
suffix = HashEmbed(
|
||||||
shape = HashEmbed(width, embed_size//2, column=cols.index(SHAPE),
|
width, embed_size // 2, column=cols.index(SUFFIX), name="embed_suffix"
|
||||||
name='embed_shape')
|
)
|
||||||
|
shape = HashEmbed(
|
||||||
|
width, embed_size // 2, column=cols.index(SHAPE), name="embed_shape"
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
prefix, suffix, shape = (None, None, None)
|
prefix, suffix, shape = (None, None, None)
|
||||||
if pretrained_vectors is not None:
|
if pretrained_vectors is not None:
|
||||||
|
@ -284,28 +297,29 @@ def Tok2Vec(width, embed_size, **kwargs):
|
||||||
if subword_features:
|
if subword_features:
|
||||||
embed = uniqued(
|
embed = uniqued(
|
||||||
(glove | norm | prefix | suffix | shape)
|
(glove | norm | prefix | suffix | shape)
|
||||||
>> LN(Maxout(width, width*5, pieces=3)), column=cols.index(ORTH))
|
>> LN(Maxout(width, width * 5, pieces=3)),
|
||||||
|
column=cols.index(ORTH),
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
embed = uniqued(
|
embed = uniqued(
|
||||||
(glove | norm)
|
(glove | norm) >> LN(Maxout(width, width * 2, pieces=3)),
|
||||||
>> LN(Maxout(width, width*2, pieces=3)), column=cols.index(ORTH))
|
column=cols.index(ORTH),
|
||||||
|
)
|
||||||
elif subword_features:
|
elif subword_features:
|
||||||
embed = uniqued(
|
embed = uniqued(
|
||||||
(norm | prefix | suffix | shape)
|
(norm | prefix | suffix | shape)
|
||||||
>> LN(Maxout(width, width*4, pieces=3)), column=cols.index(ORTH))
|
>> LN(Maxout(width, width * 4, pieces=3)),
|
||||||
|
column=cols.index(ORTH),
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
embed = norm
|
embed = norm
|
||||||
|
|
||||||
convolution = Residual(
|
convolution = Residual(
|
||||||
ExtractWindow(nW=1)
|
ExtractWindow(nW=1)
|
||||||
>> LN(Maxout(width, width*3, pieces=cnn_maxout_pieces))
|
>> LN(Maxout(width, width * 3, pieces=cnn_maxout_pieces))
|
||||||
)
|
)
|
||||||
tok2vec = (
|
tok2vec = FeatureExtracter(cols) >> with_flatten(
|
||||||
FeatureExtracter(cols)
|
embed >> convolution ** conv_depth, pad=conv_depth
|
||||||
>> with_flatten(
|
|
||||||
embed
|
|
||||||
>> convolution ** conv_depth, pad=conv_depth
|
|
||||||
)
|
|
||||||
)
|
)
|
||||||
if bilstm_depth >= 1:
|
if bilstm_depth >= 1:
|
||||||
tok2vec = tok2vec >> PyTorchBiLSTM(width, width, bilstm_depth)
|
tok2vec = tok2vec >> PyTorchBiLSTM(width, width, bilstm_depth)
|
||||||
|
@ -316,7 +330,7 @@ def Tok2Vec(width, embed_size, **kwargs):
|
||||||
|
|
||||||
|
|
||||||
def reapply(layer, n_times):
|
def reapply(layer, n_times):
|
||||||
def reapply_fwd(X, drop=0.):
|
def reapply_fwd(X, drop=0.0):
|
||||||
backprops = []
|
backprops = []
|
||||||
for i in range(n_times):
|
for i in range(n_times):
|
||||||
Y, backprop = layer.begin_update(X, drop=drop)
|
Y, backprop = layer.begin_update(X, drop=drop)
|
||||||
|
@ -334,12 +348,14 @@ def reapply(layer, n_times):
|
||||||
return dX
|
return dX
|
||||||
|
|
||||||
return Y, reapply_bwd
|
return Y, reapply_bwd
|
||||||
|
|
||||||
return wrap(reapply_fwd, layer)
|
return wrap(reapply_fwd, layer)
|
||||||
|
|
||||||
|
|
||||||
def asarray(ops, dtype):
|
def asarray(ops, dtype):
|
||||||
def forward(X, drop=0.):
|
def forward(X, drop=0.0):
|
||||||
return ops.asarray(X, dtype=dtype), None
|
return ops.asarray(X, dtype=dtype), None
|
||||||
|
|
||||||
return layerize(forward)
|
return layerize(forward)
|
||||||
|
|
||||||
|
|
||||||
|
@ -347,7 +363,7 @@ def _divide_array(X, size):
|
||||||
parts = []
|
parts = []
|
||||||
index = 0
|
index = 0
|
||||||
while index < len(X):
|
while index < len(X):
|
||||||
parts.append(X[index:index + size])
|
parts.append(X[index : index + size])
|
||||||
index += size
|
index += size
|
||||||
return parts
|
return parts
|
||||||
|
|
||||||
|
@ -356,7 +372,7 @@ def get_col(idx):
|
||||||
if idx < 0:
|
if idx < 0:
|
||||||
raise IndexError(Errors.E066.format(value=idx))
|
raise IndexError(Errors.E066.format(value=idx))
|
||||||
|
|
||||||
def forward(X, drop=0.):
|
def forward(X, drop=0.0):
|
||||||
if isinstance(X, numpy.ndarray):
|
if isinstance(X, numpy.ndarray):
|
||||||
ops = NumpyOps()
|
ops = NumpyOps()
|
||||||
else:
|
else:
|
||||||
|
@ -377,7 +393,7 @@ def doc2feats(cols=None):
|
||||||
if cols is None:
|
if cols is None:
|
||||||
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
|
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
|
||||||
|
|
||||||
def forward(docs, drop=0.):
|
def forward(docs, drop=0.0):
|
||||||
feats = []
|
feats = []
|
||||||
for doc in docs:
|
for doc in docs:
|
||||||
feats.append(doc.to_array(cols))
|
feats.append(doc.to_array(cols))
|
||||||
|
@ -389,13 +405,14 @@ def doc2feats(cols=None):
|
||||||
|
|
||||||
|
|
||||||
def print_shape(prefix):
|
def print_shape(prefix):
|
||||||
def forward(X, drop=0.):
|
def forward(X, drop=0.0):
|
||||||
return X, lambda dX, **kwargs: dX
|
return X, lambda dX, **kwargs: dX
|
||||||
|
|
||||||
return layerize(forward)
|
return layerize(forward)
|
||||||
|
|
||||||
|
|
||||||
@layerize
|
@layerize
|
||||||
def get_token_vectors(tokens_attrs_vectors, drop=0.):
|
def get_token_vectors(tokens_attrs_vectors, drop=0.0):
|
||||||
tokens, attrs, vectors = tokens_attrs_vectors
|
tokens, attrs, vectors = tokens_attrs_vectors
|
||||||
|
|
||||||
def backward(d_output, sgd=None):
|
def backward(d_output, sgd=None):
|
||||||
|
@ -405,17 +422,17 @@ def get_token_vectors(tokens_attrs_vectors, drop=0.):
|
||||||
|
|
||||||
|
|
||||||
@layerize
|
@layerize
|
||||||
def logistic(X, drop=0.):
|
def logistic(X, drop=0.0):
|
||||||
xp = get_array_module(X)
|
xp = get_array_module(X)
|
||||||
if not isinstance(X, xp.ndarray):
|
if not isinstance(X, xp.ndarray):
|
||||||
X = xp.asarray(X)
|
X = xp.asarray(X)
|
||||||
# Clip to range (-10, 10)
|
# Clip to range (-10, 10)
|
||||||
X = xp.minimum(X, 10., X)
|
X = xp.minimum(X, 10.0, X)
|
||||||
X = xp.maximum(X, -10., X)
|
X = xp.maximum(X, -10.0, X)
|
||||||
Y = 1. / (1. + xp.exp(-X))
|
Y = 1.0 / (1.0 + xp.exp(-X))
|
||||||
|
|
||||||
def logistic_bwd(dY, sgd=None):
|
def logistic_bwd(dY, sgd=None):
|
||||||
dX = dY * (Y * (1-Y))
|
dX = dY * (Y * (1 - Y))
|
||||||
return dX
|
return dX
|
||||||
|
|
||||||
return Y, logistic_bwd
|
return Y, logistic_bwd
|
||||||
|
@ -424,12 +441,13 @@ def logistic(X, drop=0.):
|
||||||
def zero_init(model):
|
def zero_init(model):
|
||||||
def _zero_init_impl(self, X, y):
|
def _zero_init_impl(self, X, y):
|
||||||
self.W.fill(0)
|
self.W.fill(0)
|
||||||
|
|
||||||
model.on_data_hooks.append(_zero_init_impl)
|
model.on_data_hooks.append(_zero_init_impl)
|
||||||
return model
|
return model
|
||||||
|
|
||||||
|
|
||||||
@layerize
|
@layerize
|
||||||
def preprocess_doc(docs, drop=0.):
|
def preprocess_doc(docs, drop=0.0):
|
||||||
keys = [doc.to_array([LOWER]) for doc in docs]
|
keys = [doc.to_array([LOWER]) for doc in docs]
|
||||||
ops = Model.ops
|
ops = Model.ops
|
||||||
lengths = ops.asarray([arr.shape[0] for arr in keys])
|
lengths = ops.asarray([arr.shape[0] for arr in keys])
|
||||||
|
@ -439,31 +457,32 @@ def preprocess_doc(docs, drop=0.):
|
||||||
|
|
||||||
|
|
||||||
def getitem(i):
|
def getitem(i):
|
||||||
def getitem_fwd(X, drop=0.):
|
def getitem_fwd(X, drop=0.0):
|
||||||
return X[i], None
|
return X[i], None
|
||||||
|
|
||||||
return layerize(getitem_fwd)
|
return layerize(getitem_fwd)
|
||||||
|
|
||||||
|
|
||||||
def build_tagger_model(nr_class, **cfg):
|
def build_tagger_model(nr_class, **cfg):
|
||||||
embed_size = util.env_opt('embed_size', 2000)
|
embed_size = util.env_opt("embed_size", 2000)
|
||||||
if 'token_vector_width' in cfg:
|
if "token_vector_width" in cfg:
|
||||||
token_vector_width = cfg['token_vector_width']
|
token_vector_width = cfg["token_vector_width"]
|
||||||
else:
|
else:
|
||||||
token_vector_width = util.env_opt('token_vector_width', 96)
|
token_vector_width = util.env_opt("token_vector_width", 96)
|
||||||
pretrained_vectors = cfg.get('pretrained_vectors')
|
pretrained_vectors = cfg.get("pretrained_vectors")
|
||||||
subword_features = cfg.get('subword_features', True)
|
subword_features = cfg.get("subword_features", True)
|
||||||
with Model.define_operators({'>>': chain, '+': add}):
|
with Model.define_operators({">>": chain, "+": add}):
|
||||||
if 'tok2vec' in cfg:
|
if "tok2vec" in cfg:
|
||||||
tok2vec = cfg['tok2vec']
|
tok2vec = cfg["tok2vec"]
|
||||||
else:
|
else:
|
||||||
tok2vec = Tok2Vec(token_vector_width, embed_size,
|
tok2vec = Tok2Vec(
|
||||||
subword_features=subword_features,
|
token_vector_width,
|
||||||
pretrained_vectors=pretrained_vectors)
|
embed_size,
|
||||||
|
subword_features=subword_features,
|
||||||
|
pretrained_vectors=pretrained_vectors,
|
||||||
|
)
|
||||||
softmax = with_flatten(Softmax(nr_class, token_vector_width))
|
softmax = with_flatten(Softmax(nr_class, token_vector_width))
|
||||||
model = (
|
model = tok2vec >> softmax
|
||||||
tok2vec
|
|
||||||
>> softmax
|
|
||||||
)
|
|
||||||
model.nI = None
|
model.nI = None
|
||||||
model.tok2vec = tok2vec
|
model.tok2vec = tok2vec
|
||||||
model.softmax = softmax
|
model.softmax = softmax
|
||||||
|
@ -471,10 +490,10 @@ def build_tagger_model(nr_class, **cfg):
|
||||||
|
|
||||||
|
|
||||||
@layerize
|
@layerize
|
||||||
def SpacyVectors(docs, drop=0.):
|
def SpacyVectors(docs, drop=0.0):
|
||||||
batch = []
|
batch = []
|
||||||
for doc in docs:
|
for doc in docs:
|
||||||
indices = numpy.zeros((len(doc),), dtype='i')
|
indices = numpy.zeros((len(doc),), dtype="i")
|
||||||
for i, word in enumerate(doc):
|
for i, word in enumerate(doc):
|
||||||
if word.orth in doc.vocab.vectors.key2row:
|
if word.orth in doc.vocab.vectors.key2row:
|
||||||
indices[i] = doc.vocab.vectors.key2row[word.orth]
|
indices[i] = doc.vocab.vectors.key2row[word.orth]
|
||||||
|
@ -486,12 +505,11 @@ def SpacyVectors(docs, drop=0.):
|
||||||
|
|
||||||
|
|
||||||
def build_text_classifier(nr_class, width=64, **cfg):
|
def build_text_classifier(nr_class, width=64, **cfg):
|
||||||
depth = cfg.get('depth', 2)
|
depth = cfg.get("depth", 2)
|
||||||
nr_vector = cfg.get('nr_vector', 5000)
|
nr_vector = cfg.get("nr_vector", 5000)
|
||||||
pretrained_dims = cfg.get('pretrained_dims', 0)
|
pretrained_dims = cfg.get("pretrained_dims", 0)
|
||||||
with Model.define_operators({'>>': chain, '+': add, '|': concatenate,
|
with Model.define_operators({">>": chain, "+": add, "|": concatenate, "**": clone}):
|
||||||
'**': clone}):
|
if cfg.get("low_data") and pretrained_dims:
|
||||||
if cfg.get('low_data') and pretrained_dims:
|
|
||||||
model = (
|
model = (
|
||||||
SpacyVectors
|
SpacyVectors
|
||||||
>> flatten_add_lengths
|
>> flatten_add_lengths
|
||||||
|
@ -505,41 +523,35 @@ def build_text_classifier(nr_class, width=64, **cfg):
|
||||||
return model
|
return model
|
||||||
|
|
||||||
lower = HashEmbed(width, nr_vector, column=1)
|
lower = HashEmbed(width, nr_vector, column=1)
|
||||||
prefix = HashEmbed(width//2, nr_vector, column=2)
|
prefix = HashEmbed(width // 2, nr_vector, column=2)
|
||||||
suffix = HashEmbed(width//2, nr_vector, column=3)
|
suffix = HashEmbed(width // 2, nr_vector, column=3)
|
||||||
shape = HashEmbed(width//2, nr_vector, column=4)
|
shape = HashEmbed(width // 2, nr_vector, column=4)
|
||||||
|
|
||||||
trained_vectors = (
|
trained_vectors = FeatureExtracter(
|
||||||
FeatureExtracter([ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID])
|
[ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID]
|
||||||
>> with_flatten(
|
) >> with_flatten(
|
||||||
uniqued(
|
uniqued(
|
||||||
(lower | prefix | suffix | shape)
|
(lower | prefix | suffix | shape)
|
||||||
>> LN(Maxout(width, width+(width//2)*3)),
|
>> LN(Maxout(width, width + (width // 2) * 3)),
|
||||||
column=0
|
column=0,
|
||||||
)
|
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
if pretrained_dims:
|
if pretrained_dims:
|
||||||
static_vectors = (
|
static_vectors = SpacyVectors >> with_flatten(
|
||||||
SpacyVectors
|
Affine(width, pretrained_dims)
|
||||||
>> with_flatten(Affine(width, pretrained_dims))
|
|
||||||
)
|
)
|
||||||
# TODO Make concatenate support lists
|
# TODO Make concatenate support lists
|
||||||
vectors = concatenate_lists(trained_vectors, static_vectors)
|
vectors = concatenate_lists(trained_vectors, static_vectors)
|
||||||
vectors_width = width*2
|
vectors_width = width * 2
|
||||||
else:
|
else:
|
||||||
vectors = trained_vectors
|
vectors = trained_vectors
|
||||||
vectors_width = width
|
vectors_width = width
|
||||||
static_vectors = None
|
static_vectors = None
|
||||||
tok2vec = (
|
tok2vec = vectors >> with_flatten(
|
||||||
vectors
|
LN(Maxout(width, vectors_width))
|
||||||
>> with_flatten(
|
>> Residual((ExtractWindow(nW=1) >> LN(Maxout(width, width * 3)))) ** depth,
|
||||||
LN(Maxout(width, vectors_width))
|
pad=depth,
|
||||||
>> Residual(
|
|
||||||
(ExtractWindow(nW=1) >> LN(Maxout(width, width*3)))
|
|
||||||
) ** depth, pad=depth
|
|
||||||
)
|
|
||||||
)
|
)
|
||||||
cnn_model = (
|
cnn_model = (
|
||||||
tok2vec
|
tok2vec
|
||||||
|
@ -550,13 +562,10 @@ def build_text_classifier(nr_class, width=64, **cfg):
|
||||||
>> zero_init(Affine(nr_class, width, drop_factor=0.0))
|
>> zero_init(Affine(nr_class, width, drop_factor=0.0))
|
||||||
)
|
)
|
||||||
|
|
||||||
linear_model = (
|
linear_model = _preprocess_doc >> LinearModel(nr_class)
|
||||||
_preprocess_doc
|
|
||||||
>> LinearModel(nr_class)
|
|
||||||
)
|
|
||||||
model = (
|
model = (
|
||||||
(linear_model | cnn_model)
|
(linear_model | cnn_model)
|
||||||
>> zero_init(Affine(nr_class, nr_class*2, drop_factor=0.0))
|
>> zero_init(Affine(nr_class, nr_class * 2, drop_factor=0.0))
|
||||||
>> logistic
|
>> logistic
|
||||||
)
|
)
|
||||||
model.tok2vec = tok2vec
|
model.tok2vec = tok2vec
|
||||||
|
@ -566,9 +575,9 @@ def build_text_classifier(nr_class, width=64, **cfg):
|
||||||
|
|
||||||
|
|
||||||
@layerize
|
@layerize
|
||||||
def flatten(seqs, drop=0.):
|
def flatten(seqs, drop=0.0):
|
||||||
ops = Model.ops
|
ops = Model.ops
|
||||||
lengths = ops.asarray([len(seq) for seq in seqs], dtype='i')
|
lengths = ops.asarray([len(seq) for seq in seqs], dtype="i")
|
||||||
|
|
||||||
def finish_update(d_X, sgd=None):
|
def finish_update(d_X, sgd=None):
|
||||||
return ops.unflatten(d_X, lengths, pad=0)
|
return ops.unflatten(d_X, lengths, pad=0)
|
||||||
|
@ -583,14 +592,14 @@ def concatenate_lists(*layers, **kwargs): # pragma: no cover
|
||||||
"""
|
"""
|
||||||
if not layers:
|
if not layers:
|
||||||
return noop()
|
return noop()
|
||||||
drop_factor = kwargs.get('drop_factor', 1.0)
|
drop_factor = kwargs.get("drop_factor", 1.0)
|
||||||
ops = layers[0].ops
|
ops = layers[0].ops
|
||||||
layers = [chain(layer, flatten) for layer in layers]
|
layers = [chain(layer, flatten) for layer in layers]
|
||||||
concat = concatenate(*layers)
|
concat = concatenate(*layers)
|
||||||
|
|
||||||
def concatenate_lists_fwd(Xs, drop=0.):
|
def concatenate_lists_fwd(Xs, drop=0.0):
|
||||||
drop *= drop_factor
|
drop *= drop_factor
|
||||||
lengths = ops.asarray([len(X) for X in Xs], dtype='i')
|
lengths = ops.asarray([len(X) for X in Xs], dtype="i")
|
||||||
flat_y, bp_flat_y = concat.begin_update(Xs, drop=drop)
|
flat_y, bp_flat_y = concat.begin_update(Xs, drop=drop)
|
||||||
ys = ops.unflatten(flat_y, lengths)
|
ys = ops.unflatten(flat_y, lengths)
|
||||||
|
|
||||||
|
|
|
@ -1,16 +1,17 @@
|
||||||
# inspired from:
|
# inspired from:
|
||||||
# https://python-packaging-user-guide.readthedocs.org/en/latest/single_source_version/
|
# https://python-packaging-user-guide.readthedocs.org/en/latest/single_source_version/
|
||||||
# https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py
|
# https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py
|
||||||
|
# fmt: off
|
||||||
|
|
||||||
__title__ = 'spacy-nightly'
|
__title__ = "spacy-nightly"
|
||||||
__version__ = '2.1.0a3'
|
__version__ = "2.1.0a3"
|
||||||
__summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython'
|
__summary__ = "Industrial-strength Natural Language Processing (NLP) with Python and Cython"
|
||||||
__uri__ = 'https://spacy.io'
|
__uri__ = "https://spacy.io"
|
||||||
__author__ = 'Explosion AI'
|
__author__ = "Explosion AI"
|
||||||
__email__ = 'contact@explosion.ai'
|
__email__ = "contact@explosion.ai"
|
||||||
__license__ = 'MIT'
|
__license__ = "MIT"
|
||||||
__release__ = False
|
__release__ = False
|
||||||
|
|
||||||
__download_url__ = 'https://github.com/explosion/spacy-models/releases/download'
|
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
|
||||||
__compatibility__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json'
|
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
|
||||||
__shortcuts__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/shortcuts-v2.json'
|
__shortcuts__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/shortcuts-v2.json"
|
||||||
|
|
|
@ -6,7 +6,6 @@ import sys
|
||||||
import ujson
|
import ujson
|
||||||
import itertools
|
import itertools
|
||||||
import locale
|
import locale
|
||||||
import os
|
|
||||||
|
|
||||||
from thinc.neural.util import copy_array
|
from thinc.neural.util import copy_array
|
||||||
|
|
||||||
|
@ -31,9 +30,9 @@ except ImportError:
|
||||||
cupy = None
|
cupy = None
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from thinc.neural.optimizers import Optimizer
|
from thinc.neural.optimizers import Optimizer # noqa: F401
|
||||||
except ImportError:
|
except ImportError:
|
||||||
from thinc.neural.optimizers import Adam as Optimizer
|
from thinc.neural.optimizers import Adam as Optimizer # noqa: F401
|
||||||
|
|
||||||
pickle = pickle
|
pickle = pickle
|
||||||
copy_reg = copy_reg
|
copy_reg = copy_reg
|
||||||
|
|
|
@ -12,8 +12,15 @@ _html = {}
|
||||||
IS_JUPYTER = is_in_jupyter()
|
IS_JUPYTER = is_in_jupyter()
|
||||||
|
|
||||||
|
|
||||||
def render(docs, style='dep', page=False, minify=False, jupyter=IS_JUPYTER,
|
def render(
|
||||||
options={}, manual=False):
|
docs,
|
||||||
|
style="dep",
|
||||||
|
page=False,
|
||||||
|
minify=False,
|
||||||
|
jupyter=IS_JUPYTER,
|
||||||
|
options={},
|
||||||
|
manual=False,
|
||||||
|
):
|
||||||
"""Render displaCy visualisation.
|
"""Render displaCy visualisation.
|
||||||
|
|
||||||
docs (list or Doc): Document(s) to visualise.
|
docs (list or Doc): Document(s) to visualise.
|
||||||
|
@ -25,8 +32,10 @@ def render(docs, style='dep', page=False, minify=False, jupyter=IS_JUPYTER,
|
||||||
manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
|
manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
|
||||||
RETURNS (unicode): Rendered HTML markup.
|
RETURNS (unicode): Rendered HTML markup.
|
||||||
"""
|
"""
|
||||||
factories = {'dep': (DependencyRenderer, parse_deps),
|
factories = {
|
||||||
'ent': (EntityRenderer, parse_ents)}
|
"dep": (DependencyRenderer, parse_deps),
|
||||||
|
"ent": (EntityRenderer, parse_ents),
|
||||||
|
}
|
||||||
if style not in factories:
|
if style not in factories:
|
||||||
raise ValueError(Errors.E087.format(style=style))
|
raise ValueError(Errors.E087.format(style=style))
|
||||||
if isinstance(docs, (Doc, Span, dict)):
|
if isinstance(docs, (Doc, Span, dict)):
|
||||||
|
@ -37,16 +46,18 @@ def render(docs, style='dep', page=False, minify=False, jupyter=IS_JUPYTER,
|
||||||
renderer, converter = factories[style]
|
renderer, converter = factories[style]
|
||||||
renderer = renderer(options=options)
|
renderer = renderer(options=options)
|
||||||
parsed = [converter(doc, options) for doc in docs] if not manual else docs
|
parsed = [converter(doc, options) for doc in docs] if not manual else docs
|
||||||
_html['parsed'] = renderer.render(parsed, page=page, minify=minify).strip()
|
_html["parsed"] = renderer.render(parsed, page=page, minify=minify).strip()
|
||||||
html = _html['parsed']
|
html = _html["parsed"]
|
||||||
if jupyter: # return HTML rendered by IPython display()
|
if jupyter: # return HTML rendered by IPython display()
|
||||||
from IPython.core.display import display, HTML
|
from IPython.core.display import display, HTML
|
||||||
|
|
||||||
return display(HTML(html))
|
return display(HTML(html))
|
||||||
return html
|
return html
|
||||||
|
|
||||||
|
|
||||||
def serve(docs, style='dep', page=True, minify=False, options={}, manual=False,
|
def serve(
|
||||||
port=5000):
|
docs, style="dep", page=True, minify=False, options={}, manual=False, port=5000
|
||||||
|
):
|
||||||
"""Serve displaCy visualisation.
|
"""Serve displaCy visualisation.
|
||||||
|
|
||||||
docs (list or Doc): Document(s) to visualise.
|
docs (list or Doc): Document(s) to visualise.
|
||||||
|
@ -58,11 +69,13 @@ def serve(docs, style='dep', page=True, minify=False, options={}, manual=False,
|
||||||
port (int): Port to serve visualisation.
|
port (int): Port to serve visualisation.
|
||||||
"""
|
"""
|
||||||
from wsgiref import simple_server
|
from wsgiref import simple_server
|
||||||
render(docs, style=style, page=page, minify=minify, options=options,
|
|
||||||
manual=manual)
|
render(docs, style=style, page=page, minify=minify, options=options, manual=manual)
|
||||||
httpd = simple_server.make_server('0.0.0.0', port, app)
|
httpd = simple_server.make_server("0.0.0.0", port, app)
|
||||||
prints("Using the '{}' visualizer".format(style),
|
prints(
|
||||||
title="Serving on port {}...".format(port))
|
"Using the '{}' visualizer".format(style),
|
||||||
|
title="Serving on port {}...".format(port),
|
||||||
|
)
|
||||||
try:
|
try:
|
||||||
httpd.serve_forever()
|
httpd.serve_forever()
|
||||||
except KeyboardInterrupt:
|
except KeyboardInterrupt:
|
||||||
|
@ -72,11 +85,10 @@ def serve(docs, style='dep', page=True, minify=False, options={}, manual=False,
|
||||||
|
|
||||||
|
|
||||||
def app(environ, start_response):
|
def app(environ, start_response):
|
||||||
# headers and status need to be bytes in Python 2, see #1227
|
# Headers and status need to be bytes in Python 2, see #1227
|
||||||
headers = [(b_to_str(b'Content-type'),
|
headers = [(b_to_str(b"Content-type"), b_to_str(b"text/html; charset=utf-8"))]
|
||||||
b_to_str(b'text/html; charset=utf-8'))]
|
start_response(b_to_str(b"200 OK"), headers)
|
||||||
start_response(b_to_str(b'200 OK'), headers)
|
res = _html["parsed"].encode(encoding="utf-8")
|
||||||
res = _html['parsed'].encode(encoding='utf-8')
|
|
||||||
return [res]
|
return [res]
|
||||||
|
|
||||||
|
|
||||||
|
@ -89,11 +101,10 @@ def parse_deps(orig_doc, options={}):
|
||||||
doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes())
|
doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes())
|
||||||
if not doc.is_parsed:
|
if not doc.is_parsed:
|
||||||
user_warning(Warnings.W005)
|
user_warning(Warnings.W005)
|
||||||
if options.get('collapse_phrases', False):
|
if options.get("collapse_phrases", False):
|
||||||
for np in list(doc.noun_chunks):
|
for np in list(doc.noun_chunks):
|
||||||
np.merge(tag=np.root.tag_, lemma=np.root.lemma_,
|
np.merge(tag=np.root.tag_, lemma=np.root.lemma_, ent_type=np.root.ent_type_)
|
||||||
ent_type=np.root.ent_type_)
|
if options.get("collapse_punct", True):
|
||||||
if options.get('collapse_punct', True):
|
|
||||||
spans = []
|
spans = []
|
||||||
for word in doc[:-1]:
|
for word in doc[:-1]:
|
||||||
if word.is_punct or not word.nbor(1).is_punct:
|
if word.is_punct or not word.nbor(1).is_punct:
|
||||||
|
@ -103,23 +114,31 @@ def parse_deps(orig_doc, options={}):
|
||||||
while end < len(doc) and doc[end].is_punct:
|
while end < len(doc) and doc[end].is_punct:
|
||||||
end += 1
|
end += 1
|
||||||
span = doc[start:end]
|
span = doc[start:end]
|
||||||
spans.append((span.start_char, span.end_char, word.tag_,
|
spans.append(
|
||||||
word.lemma_, word.ent_type_))
|
(span.start_char, span.end_char, word.tag_, word.lemma_, word.ent_type_)
|
||||||
|
)
|
||||||
for start, end, tag, lemma, ent_type in spans:
|
for start, end, tag, lemma, ent_type in spans:
|
||||||
doc.merge(start, end, tag=tag, lemma=lemma, ent_type=ent_type)
|
doc.merge(start, end, tag=tag, lemma=lemma, ent_type=ent_type)
|
||||||
if options.get('fine_grained'):
|
if options.get("fine_grained"):
|
||||||
words = [{'text': w.text, 'tag': w.tag_} for w in doc]
|
words = [{"text": w.text, "tag": w.tag_} for w in doc]
|
||||||
else:
|
else:
|
||||||
words = [{'text': w.text, 'tag': w.pos_} for w in doc]
|
words = [{"text": w.text, "tag": w.pos_} for w in doc]
|
||||||
arcs = []
|
arcs = []
|
||||||
for word in doc:
|
for word in doc:
|
||||||
if word.i < word.head.i:
|
if word.i < word.head.i:
|
||||||
arcs.append({'start': word.i, 'end': word.head.i,
|
arcs.append(
|
||||||
'label': word.dep_, 'dir': 'left'})
|
{"start": word.i, "end": word.head.i, "label": word.dep_, "dir": "left"}
|
||||||
|
)
|
||||||
elif word.i > word.head.i:
|
elif word.i > word.head.i:
|
||||||
arcs.append({'start': word.head.i, 'end': word.i,
|
arcs.append(
|
||||||
'label': word.dep_, 'dir': 'right'})
|
{
|
||||||
return {'words': words, 'arcs': arcs}
|
"start": word.head.i,
|
||||||
|
"end": word.i,
|
||||||
|
"label": word.dep_,
|
||||||
|
"dir": "right",
|
||||||
|
}
|
||||||
|
)
|
||||||
|
return {"words": words, "arcs": arcs}
|
||||||
|
|
||||||
|
|
||||||
def parse_ents(doc, options={}):
|
def parse_ents(doc, options={}):
|
||||||
|
@ -128,10 +147,11 @@ def parse_ents(doc, options={}):
|
||||||
doc (Doc): Document do parse.
|
doc (Doc): Document do parse.
|
||||||
RETURNS (dict): Generated entities keyed by text (original text) and ents.
|
RETURNS (dict): Generated entities keyed by text (original text) and ents.
|
||||||
"""
|
"""
|
||||||
ents = [{'start': ent.start_char, 'end': ent.end_char, 'label': ent.label_}
|
ents = [
|
||||||
for ent in doc.ents]
|
{"start": ent.start_char, "end": ent.end_char, "label": ent.label_}
|
||||||
|
for ent in doc.ents
|
||||||
|
]
|
||||||
if not ents:
|
if not ents:
|
||||||
user_warning(Warnings.W006)
|
user_warning(Warnings.W006)
|
||||||
title = (doc.user_data.get('title', None)
|
title = doc.user_data.get("title", None) if hasattr(doc, "user_data") else None
|
||||||
if hasattr(doc, 'user_data') else None)
|
return {"text": doc.text, "ents": ents, "title": title}
|
||||||
return {'text': doc.text, 'ents': ents, 'title': title}
|
|
||||||
|
|
|
@ -10,7 +10,8 @@ from ..util import minify_html, escape_html
|
||||||
|
|
||||||
class DependencyRenderer(object):
|
class DependencyRenderer(object):
|
||||||
"""Render dependency parses as SVGs."""
|
"""Render dependency parses as SVGs."""
|
||||||
style = 'dep'
|
|
||||||
|
style = "dep"
|
||||||
|
|
||||||
def __init__(self, options={}):
|
def __init__(self, options={}):
|
||||||
"""Initialise dependency renderer.
|
"""Initialise dependency renderer.
|
||||||
|
@ -19,18 +20,16 @@ class DependencyRenderer(object):
|
||||||
arrow_spacing, arrow_width, arrow_stroke, distance, offset_x,
|
arrow_spacing, arrow_width, arrow_stroke, distance, offset_x,
|
||||||
color, bg, font)
|
color, bg, font)
|
||||||
"""
|
"""
|
||||||
self.compact = options.get('compact', False)
|
self.compact = options.get("compact", False)
|
||||||
self.word_spacing = options.get('word_spacing', 45)
|
self.word_spacing = options.get("word_spacing", 45)
|
||||||
self.arrow_spacing = options.get('arrow_spacing',
|
self.arrow_spacing = options.get("arrow_spacing", 12 if self.compact else 20)
|
||||||
12 if self.compact else 20)
|
self.arrow_width = options.get("arrow_width", 6 if self.compact else 10)
|
||||||
self.arrow_width = options.get('arrow_width',
|
self.arrow_stroke = options.get("arrow_stroke", 2)
|
||||||
6 if self.compact else 10)
|
self.distance = options.get("distance", 150 if self.compact else 175)
|
||||||
self.arrow_stroke = options.get('arrow_stroke', 2)
|
self.offset_x = options.get("offset_x", 50)
|
||||||
self.distance = options.get('distance', 150 if self.compact else 175)
|
self.color = options.get("color", "#000000")
|
||||||
self.offset_x = options.get('offset_x', 50)
|
self.bg = options.get("bg", "#ffffff")
|
||||||
self.color = options.get('color', '#000000')
|
self.font = options.get("font", "Arial")
|
||||||
self.bg = options.get('bg', '#ffffff')
|
|
||||||
self.font = options.get('font', 'Arial')
|
|
||||||
|
|
||||||
def render(self, parsed, page=False, minify=False):
|
def render(self, parsed, page=False, minify=False):
|
||||||
"""Render complete markup.
|
"""Render complete markup.
|
||||||
|
@ -43,14 +42,15 @@ class DependencyRenderer(object):
|
||||||
# Create a random ID prefix to make sure parses don't receive the
|
# Create a random ID prefix to make sure parses don't receive the
|
||||||
# same ID, even if they're identical
|
# same ID, even if they're identical
|
||||||
id_prefix = random.randint(0, 999)
|
id_prefix = random.randint(0, 999)
|
||||||
rendered = [self.render_svg('{}-{}'.format(id_prefix, i), p['words'], p['arcs'])
|
rendered = [
|
||||||
for i, p in enumerate(parsed)]
|
self.render_svg("{}-{}".format(id_prefix, i), p["words"], p["arcs"])
|
||||||
|
for i, p in enumerate(parsed)
|
||||||
|
]
|
||||||
if page:
|
if page:
|
||||||
content = ''.join([TPL_FIGURE.format(content=svg)
|
content = "".join([TPL_FIGURE.format(content=svg) for svg in rendered])
|
||||||
for svg in rendered])
|
|
||||||
markup = TPL_PAGE.format(content=content)
|
markup = TPL_PAGE.format(content=content)
|
||||||
else:
|
else:
|
||||||
markup = ''.join(rendered)
|
markup = "".join(rendered)
|
||||||
if minify:
|
if minify:
|
||||||
return minify_html(markup)
|
return minify_html(markup)
|
||||||
return markup
|
return markup
|
||||||
|
@ -65,19 +65,25 @@ class DependencyRenderer(object):
|
||||||
"""
|
"""
|
||||||
self.levels = self.get_levels(arcs)
|
self.levels = self.get_levels(arcs)
|
||||||
self.highest_level = len(self.levels)
|
self.highest_level = len(self.levels)
|
||||||
self.offset_y = self.distance/2*self.highest_level+self.arrow_stroke
|
self.offset_y = self.distance / 2 * self.highest_level + self.arrow_stroke
|
||||||
self.width = self.offset_x+len(words)*self.distance
|
self.width = self.offset_x + len(words) * self.distance
|
||||||
self.height = self.offset_y+3*self.word_spacing
|
self.height = self.offset_y + 3 * self.word_spacing
|
||||||
self.id = render_id
|
self.id = render_id
|
||||||
words = [self.render_word(w['text'], w['tag'], i)
|
words = [self.render_word(w["text"], w["tag"], i) for i, w in enumerate(words)]
|
||||||
for i, w in enumerate(words)]
|
arcs = [
|
||||||
arcs = [self.render_arrow(a['label'], a['start'],
|
self.render_arrow(a["label"], a["start"], a["end"], a["dir"], i)
|
||||||
a['end'], a['dir'], i)
|
for i, a in enumerate(arcs)
|
||||||
for i, a in enumerate(arcs)]
|
]
|
||||||
content = ''.join(words) + ''.join(arcs)
|
content = "".join(words) + "".join(arcs)
|
||||||
return TPL_DEP_SVG.format(id=self.id, width=self.width,
|
return TPL_DEP_SVG.format(
|
||||||
height=self.height, color=self.color,
|
id=self.id,
|
||||||
bg=self.bg, font=self.font, content=content)
|
width=self.width,
|
||||||
|
height=self.height,
|
||||||
|
color=self.color,
|
||||||
|
bg=self.bg,
|
||||||
|
font=self.font,
|
||||||
|
content=content,
|
||||||
|
)
|
||||||
|
|
||||||
def render_word(self, text, tag, i):
|
def render_word(self, text, tag, i):
|
||||||
"""Render individual word.
|
"""Render individual word.
|
||||||
|
@ -87,12 +93,11 @@ class DependencyRenderer(object):
|
||||||
i (int): Unique ID, typically word index.
|
i (int): Unique ID, typically word index.
|
||||||
RETURNS (unicode): Rendered SVG markup.
|
RETURNS (unicode): Rendered SVG markup.
|
||||||
"""
|
"""
|
||||||
y = self.offset_y+self.word_spacing
|
y = self.offset_y + self.word_spacing
|
||||||
x = self.offset_x+i*self.distance
|
x = self.offset_x + i * self.distance
|
||||||
html_text = escape_html(text)
|
html_text = escape_html(text)
|
||||||
return TPL_DEP_WORDS.format(text=html_text, tag=tag, x=x, y=y)
|
return TPL_DEP_WORDS.format(text=html_text, tag=tag, x=x, y=y)
|
||||||
|
|
||||||
|
|
||||||
def render_arrow(self, label, start, end, direction, i):
|
def render_arrow(self, label, start, end, direction, i):
|
||||||
"""Render individual arrow.
|
"""Render individual arrow.
|
||||||
|
|
||||||
|
@ -103,20 +108,30 @@ class DependencyRenderer(object):
|
||||||
i (int): Unique ID, typically arrow index.
|
i (int): Unique ID, typically arrow index.
|
||||||
RETURNS (unicode): Rendered SVG markup.
|
RETURNS (unicode): Rendered SVG markup.
|
||||||
"""
|
"""
|
||||||
level = self.levels.index(end-start)+1
|
level = self.levels.index(end - start) + 1
|
||||||
x_start = self.offset_x+start*self.distance+self.arrow_spacing
|
x_start = self.offset_x + start * self.distance + self.arrow_spacing
|
||||||
y = self.offset_y
|
y = self.offset_y
|
||||||
x_end = (self.offset_x+(end-start)*self.distance+start*self.distance
|
x_end = (
|
||||||
- self.arrow_spacing*(self.highest_level-level)/4)
|
self.offset_x
|
||||||
y_curve = self.offset_y-level*self.distance/2
|
+ (end - start) * self.distance
|
||||||
|
+ start * self.distance
|
||||||
|
- self.arrow_spacing * (self.highest_level - level) / 4
|
||||||
|
)
|
||||||
|
y_curve = self.offset_y - level * self.distance / 2
|
||||||
if self.compact:
|
if self.compact:
|
||||||
y_curve = self.offset_y-level*self.distance/6
|
y_curve = self.offset_y - level * self.distance / 6
|
||||||
if y_curve == 0 and len(self.levels) > 5:
|
if y_curve == 0 and len(self.levels) > 5:
|
||||||
y_curve = -self.distance
|
y_curve = -self.distance
|
||||||
arrowhead = self.get_arrowhead(direction, x_start, y, x_end)
|
arrowhead = self.get_arrowhead(direction, x_start, y, x_end)
|
||||||
arc = self.get_arc(x_start, y, y_curve, x_end)
|
arc = self.get_arc(x_start, y, y_curve, x_end)
|
||||||
return TPL_DEP_ARCS.format(id=self.id, i=i, stroke=self.arrow_stroke,
|
return TPL_DEP_ARCS.format(
|
||||||
head=arrowhead, label=label, arc=arc)
|
id=self.id,
|
||||||
|
i=i,
|
||||||
|
stroke=self.arrow_stroke,
|
||||||
|
head=arrowhead,
|
||||||
|
label=label,
|
||||||
|
arc=arc,
|
||||||
|
)
|
||||||
|
|
||||||
def get_arc(self, x_start, y, y_curve, x_end):
|
def get_arc(self, x_start, y, y_curve, x_end):
|
||||||
"""Render individual arc.
|
"""Render individual arc.
|
||||||
|
@ -141,13 +156,22 @@ class DependencyRenderer(object):
|
||||||
end (int): X-coordinate of arrow end point.
|
end (int): X-coordinate of arrow end point.
|
||||||
RETURNS (unicode): Definition of the arrow head path ('d' attribute).
|
RETURNS (unicode): Definition of the arrow head path ('d' attribute).
|
||||||
"""
|
"""
|
||||||
if direction == 'left':
|
if direction == "left":
|
||||||
pos1, pos2, pos3 = (x, x-self.arrow_width+2, x+self.arrow_width-2)
|
pos1, pos2, pos3 = (x, x - self.arrow_width + 2, x + self.arrow_width - 2)
|
||||||
else:
|
else:
|
||||||
pos1, pos2, pos3 = (end, end+self.arrow_width-2,
|
pos1, pos2, pos3 = (
|
||||||
end-self.arrow_width+2)
|
end,
|
||||||
arrowhead = (pos1, y+2, pos2, y-self.arrow_width, pos3,
|
end + self.arrow_width - 2,
|
||||||
y-self.arrow_width)
|
end - self.arrow_width + 2,
|
||||||
|
)
|
||||||
|
arrowhead = (
|
||||||
|
pos1,
|
||||||
|
y + 2,
|
||||||
|
pos2,
|
||||||
|
y - self.arrow_width,
|
||||||
|
pos3,
|
||||||
|
y - self.arrow_width,
|
||||||
|
)
|
||||||
return "M{},{} L{},{} {},{}".format(*arrowhead)
|
return "M{},{} L{},{} {},{}".format(*arrowhead)
|
||||||
|
|
||||||
def get_levels(self, arcs):
|
def get_levels(self, arcs):
|
||||||
|
@ -157,30 +181,44 @@ class DependencyRenderer(object):
|
||||||
arcs (list): Individual arcs and their start, end, direction and label.
|
arcs (list): Individual arcs and their start, end, direction and label.
|
||||||
RETURNS (list): Arc levels sorted from lowest to highest.
|
RETURNS (list): Arc levels sorted from lowest to highest.
|
||||||
"""
|
"""
|
||||||
levels = set(map(lambda arc: arc['end'] - arc['start'], arcs))
|
levels = set(map(lambda arc: arc["end"] - arc["start"], arcs))
|
||||||
return sorted(list(levels))
|
return sorted(list(levels))
|
||||||
|
|
||||||
|
|
||||||
class EntityRenderer(object):
|
class EntityRenderer(object):
|
||||||
"""Render named entities as HTML."""
|
"""Render named entities as HTML."""
|
||||||
style = 'ent'
|
|
||||||
|
style = "ent"
|
||||||
|
|
||||||
def __init__(self, options={}):
|
def __init__(self, options={}):
|
||||||
"""Initialise entity renderer.
|
"""Initialise entity renderer.
|
||||||
|
|
||||||
options (dict): Visualiser-specific options (colors, ents)
|
options (dict): Visualiser-specific options (colors, ents)
|
||||||
"""
|
"""
|
||||||
colors = {'ORG': '#7aecec', 'PRODUCT': '#bfeeb7', 'GPE': '#feca74',
|
colors = {
|
||||||
'LOC': '#ff9561', 'PERSON': '#aa9cfc', 'NORP': '#c887fb',
|
"ORG": "#7aecec",
|
||||||
'FACILITY': '#9cc9cc', 'EVENT': '#ffeb80', 'LAW': '#ff8197',
|
"PRODUCT": "#bfeeb7",
|
||||||
'LANGUAGE': '#ff8197', 'WORK_OF_ART': '#f0d0ff',
|
"GPE": "#feca74",
|
||||||
'DATE': '#bfe1d9', 'TIME': '#bfe1d9', 'MONEY': '#e4e7d2',
|
"LOC": "#ff9561",
|
||||||
'QUANTITY': '#e4e7d2', 'ORDINAL': '#e4e7d2',
|
"PERSON": "#aa9cfc",
|
||||||
'CARDINAL': '#e4e7d2', 'PERCENT': '#e4e7d2'}
|
"NORP": "#c887fb",
|
||||||
colors.update(options.get('colors', {}))
|
"FACILITY": "#9cc9cc",
|
||||||
self.default_color = '#ddd'
|
"EVENT": "#ffeb80",
|
||||||
|
"LAW": "#ff8197",
|
||||||
|
"LANGUAGE": "#ff8197",
|
||||||
|
"WORK_OF_ART": "#f0d0ff",
|
||||||
|
"DATE": "#bfe1d9",
|
||||||
|
"TIME": "#bfe1d9",
|
||||||
|
"MONEY": "#e4e7d2",
|
||||||
|
"QUANTITY": "#e4e7d2",
|
||||||
|
"ORDINAL": "#e4e7d2",
|
||||||
|
"CARDINAL": "#e4e7d2",
|
||||||
|
"PERCENT": "#e4e7d2",
|
||||||
|
}
|
||||||
|
colors.update(options.get("colors", {}))
|
||||||
|
self.default_color = "#ddd"
|
||||||
self.colors = colors
|
self.colors = colors
|
||||||
self.ents = options.get('ents', None)
|
self.ents = options.get("ents", None)
|
||||||
|
|
||||||
def render(self, parsed, page=False, minify=False):
|
def render(self, parsed, page=False, minify=False):
|
||||||
"""Render complete markup.
|
"""Render complete markup.
|
||||||
|
@ -190,14 +228,14 @@ class EntityRenderer(object):
|
||||||
minify (bool): Minify HTML markup.
|
minify (bool): Minify HTML markup.
|
||||||
RETURNS (unicode): Rendered HTML markup.
|
RETURNS (unicode): Rendered HTML markup.
|
||||||
"""
|
"""
|
||||||
rendered = [self.render_ents(p['text'], p['ents'],
|
rendered = [
|
||||||
p.get('title', None)) for p in parsed]
|
self.render_ents(p["text"], p["ents"], p.get("title", None)) for p in parsed
|
||||||
|
]
|
||||||
if page:
|
if page:
|
||||||
docs = ''.join([TPL_FIGURE.format(content=doc)
|
docs = "".join([TPL_FIGURE.format(content=doc) for doc in rendered])
|
||||||
for doc in rendered])
|
|
||||||
markup = TPL_PAGE.format(content=docs)
|
markup = TPL_PAGE.format(content=docs)
|
||||||
else:
|
else:
|
||||||
markup = ''.join(rendered)
|
markup = "".join(rendered)
|
||||||
if minify:
|
if minify:
|
||||||
return minify_html(markup)
|
return minify_html(markup)
|
||||||
return markup
|
return markup
|
||||||
|
@ -209,18 +247,18 @@ class EntityRenderer(object):
|
||||||
spans (list): Individual entity spans and their start, end and label.
|
spans (list): Individual entity spans and their start, end and label.
|
||||||
title (unicode or None): Document title set in Doc.user_data['title'].
|
title (unicode or None): Document title set in Doc.user_data['title'].
|
||||||
"""
|
"""
|
||||||
markup = ''
|
markup = ""
|
||||||
offset = 0
|
offset = 0
|
||||||
for span in spans:
|
for span in spans:
|
||||||
label = span['label']
|
label = span["label"]
|
||||||
start = span['start']
|
start = span["start"]
|
||||||
end = span['end']
|
end = span["end"]
|
||||||
entity = text[start:end]
|
entity = text[start:end]
|
||||||
fragments = text[offset:start].split('\n')
|
fragments = text[offset:start].split("\n")
|
||||||
for i, fragment in enumerate(fragments):
|
for i, fragment in enumerate(fragments):
|
||||||
markup += fragment
|
markup += fragment
|
||||||
if len(fragments) > 1 and i != len(fragments)-1:
|
if len(fragments) > 1 and i != len(fragments) - 1:
|
||||||
markup += '</br>'
|
markup += "</br>"
|
||||||
if self.ents is None or label.upper() in self.ents:
|
if self.ents is None or label.upper() in self.ents:
|
||||||
color = self.colors.get(label.upper(), self.default_color)
|
color = self.colors.get(label.upper(), self.default_color)
|
||||||
markup += TPL_ENT.format(label=label, text=entity, bg=color)
|
markup += TPL_ENT.format(label=label, text=entity, bg=color)
|
||||||
|
|
|
@ -2,7 +2,7 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
# setting explicit height and max-width: none on the SVG is required for
|
# Setting explicit height and max-width: none on the SVG is required for
|
||||||
# Jupyter to render it properly in a cell
|
# Jupyter to render it properly in a cell
|
||||||
|
|
||||||
TPL_DEP_SVG = """
|
TPL_DEP_SVG = """
|
||||||
|
|
|
@ -8,13 +8,17 @@ import inspect
|
||||||
|
|
||||||
def add_codes(err_cls):
|
def add_codes(err_cls):
|
||||||
"""Add error codes to string messages via class attribute names."""
|
"""Add error codes to string messages via class attribute names."""
|
||||||
|
|
||||||
class ErrorsWithCodes(object):
|
class ErrorsWithCodes(object):
|
||||||
def __getattribute__(self, code):
|
def __getattribute__(self, code):
|
||||||
msg = getattr(err_cls, code)
|
msg = getattr(err_cls, code)
|
||||||
return '[{code}] {msg}'.format(code=code, msg=msg)
|
return "[{code}] {msg}".format(code=code, msg=msg)
|
||||||
|
|
||||||
return ErrorsWithCodes()
|
return ErrorsWithCodes()
|
||||||
|
|
||||||
|
|
||||||
|
# fmt: off
|
||||||
|
|
||||||
@add_codes
|
@add_codes
|
||||||
class Warnings(object):
|
class Warnings(object):
|
||||||
W001 = ("As of spaCy v2.0, the keyword argument `path=` is deprecated. "
|
W001 = ("As of spaCy v2.0, the keyword argument `path=` is deprecated. "
|
||||||
|
@ -260,7 +264,7 @@ class Errors(object):
|
||||||
E095 = ("Can't write to frozen dictionary. This is likely an internal "
|
E095 = ("Can't write to frozen dictionary. This is likely an internal "
|
||||||
"error. Are you writing to a default function argument?")
|
"error. Are you writing to a default function argument?")
|
||||||
E096 = ("Invalid object passed to displaCy: Can only visualize Doc or "
|
E096 = ("Invalid object passed to displaCy: Can only visualize Doc or "
|
||||||
"Span objects, or dicts if set to manual=True.")
|
"Span objects, or dicts if set to manual=True.")
|
||||||
E097 = ("Invalid pattern: expected token pattern (list of dicts) or "
|
E097 = ("Invalid pattern: expected token pattern (list of dicts) or "
|
||||||
"phrase pattern (string) but got:\n{pattern}")
|
"phrase pattern (string) but got:\n{pattern}")
|
||||||
E098 = ("Invalid pattern specified: expected both SPEC and PATTERN.")
|
E098 = ("Invalid pattern specified: expected both SPEC and PATTERN.")
|
||||||
|
@ -275,6 +279,7 @@ class Errors(object):
|
||||||
" can only be part of one entity, so make sure the entities you're "
|
" can only be part of one entity, so make sure the entities you're "
|
||||||
"setting don't overlap.")
|
"setting don't overlap.")
|
||||||
|
|
||||||
|
|
||||||
@add_codes
|
@add_codes
|
||||||
class TempErrors(object):
|
class TempErrors(object):
|
||||||
T001 = ("Max length currently 10 for phrase matching")
|
T001 = ("Max length currently 10 for phrase matching")
|
||||||
|
@ -292,55 +297,57 @@ class TempErrors(object):
|
||||||
"(pretrained_dims) but not the new name (pretrained_vectors).")
|
"(pretrained_dims) but not the new name (pretrained_vectors).")
|
||||||
|
|
||||||
|
|
||||||
|
# fmt: on
|
||||||
|
|
||||||
|
|
||||||
class ModelsWarning(UserWarning):
|
class ModelsWarning(UserWarning):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
WARNINGS = {
|
WARNINGS = {
|
||||||
'user': UserWarning,
|
"user": UserWarning,
|
||||||
'deprecation': DeprecationWarning,
|
"deprecation": DeprecationWarning,
|
||||||
'models': ModelsWarning,
|
"models": ModelsWarning,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def _get_warn_types(arg):
|
def _get_warn_types(arg):
|
||||||
if arg == '': # don't show any warnings
|
if arg == "": # don't show any warnings
|
||||||
return []
|
return []
|
||||||
if not arg or arg == 'all': # show all available warnings
|
if not arg or arg == "all": # show all available warnings
|
||||||
return WARNINGS.keys()
|
return WARNINGS.keys()
|
||||||
return [w_type.strip() for w_type in arg.split(',')
|
return [w_type.strip() for w_type in arg.split(",") if w_type.strip() in WARNINGS]
|
||||||
if w_type.strip() in WARNINGS]
|
|
||||||
|
|
||||||
|
|
||||||
def _get_warn_excl(arg):
|
def _get_warn_excl(arg):
|
||||||
if not arg:
|
if not arg:
|
||||||
return []
|
return []
|
||||||
return [w_id.strip() for w_id in arg.split(',')]
|
return [w_id.strip() for w_id in arg.split(",")]
|
||||||
|
|
||||||
|
|
||||||
SPACY_WARNING_FILTER = os.environ.get('SPACY_WARNING_FILTER')
|
SPACY_WARNING_FILTER = os.environ.get("SPACY_WARNING_FILTER")
|
||||||
SPACY_WARNING_TYPES = _get_warn_types(os.environ.get('SPACY_WARNING_TYPES'))
|
SPACY_WARNING_TYPES = _get_warn_types(os.environ.get("SPACY_WARNING_TYPES"))
|
||||||
SPACY_WARNING_IGNORE = _get_warn_excl(os.environ.get('SPACY_WARNING_IGNORE'))
|
SPACY_WARNING_IGNORE = _get_warn_excl(os.environ.get("SPACY_WARNING_IGNORE"))
|
||||||
|
|
||||||
|
|
||||||
def user_warning(message):
|
def user_warning(message):
|
||||||
_warn(message, 'user')
|
_warn(message, "user")
|
||||||
|
|
||||||
|
|
||||||
def deprecation_warning(message):
|
def deprecation_warning(message):
|
||||||
_warn(message, 'deprecation')
|
_warn(message, "deprecation")
|
||||||
|
|
||||||
|
|
||||||
def models_warning(message):
|
def models_warning(message):
|
||||||
_warn(message, 'models')
|
_warn(message, "models")
|
||||||
|
|
||||||
|
|
||||||
def _warn(message, warn_type='user'):
|
def _warn(message, warn_type="user"):
|
||||||
"""
|
"""
|
||||||
message (unicode): The message to display.
|
message (unicode): The message to display.
|
||||||
warn_type (unicode): Key into WARNINGS selecting the warning category.
|
warn_type (unicode): Key into WARNINGS selecting the warning category.
|
||||||
"""
|
"""
|
||||||
w_id = message.split('[', 1)[1].split(']', 1)[0] # get ID from string
|
w_id = message.split("[", 1)[1].split("]", 1)[0] # get ID from string
|
||||||
if warn_type in SPACY_WARNING_TYPES and w_id not in SPACY_WARNING_IGNORE:
|
if warn_type in SPACY_WARNING_TYPES and w_id not in SPACY_WARNING_IGNORE:
|
||||||
category = WARNINGS[warn_type]
|
category = WARNINGS[warn_type]
|
||||||
stack = inspect.stack()[-1]
|
stack = inspect.stack()[-1]
|
||||||
|
|
|
@ -21,295 +21,272 @@ GLOSSARY = {
|
||||||
# POS tags
|
# POS tags
|
||||||
# Universal POS Tags
|
# Universal POS Tags
|
||||||
# http://universaldependencies.org/u/pos/
|
# http://universaldependencies.org/u/pos/
|
||||||
|
"ADJ": "adjective",
|
||||||
'ADJ': 'adjective',
|
"ADP": "adposition",
|
||||||
'ADP': 'adposition',
|
"ADV": "adverb",
|
||||||
'ADV': 'adverb',
|
"AUX": "auxiliary",
|
||||||
'AUX': 'auxiliary',
|
"CONJ": "conjunction",
|
||||||
'CONJ': 'conjunction',
|
"CCONJ": "coordinating conjunction",
|
||||||
'CCONJ': 'coordinating conjunction',
|
"DET": "determiner",
|
||||||
'DET': 'determiner',
|
"INTJ": "interjection",
|
||||||
'INTJ': 'interjection',
|
"NOUN": "noun",
|
||||||
'NOUN': 'noun',
|
"NUM": "numeral",
|
||||||
'NUM': 'numeral',
|
"PART": "particle",
|
||||||
'PART': 'particle',
|
"PRON": "pronoun",
|
||||||
'PRON': 'pronoun',
|
"PROPN": "proper noun",
|
||||||
'PROPN': 'proper noun',
|
"PUNCT": "punctuation",
|
||||||
'PUNCT': 'punctuation',
|
"SCONJ": "subordinating conjunction",
|
||||||
'SCONJ': 'subordinating conjunction',
|
"SYM": "symbol",
|
||||||
'SYM': 'symbol',
|
"VERB": "verb",
|
||||||
'VERB': 'verb',
|
"X": "other",
|
||||||
'X': 'other',
|
"EOL": "end of line",
|
||||||
'EOL': 'end of line',
|
"SPACE": "space",
|
||||||
'SPACE': 'space',
|
|
||||||
|
|
||||||
|
|
||||||
# POS tags (English)
|
# POS tags (English)
|
||||||
# OntoNotes 5 / Penn Treebank
|
# OntoNotes 5 / Penn Treebank
|
||||||
# https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
|
# https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
|
||||||
|
".": "punctuation mark, sentence closer",
|
||||||
'.': 'punctuation mark, sentence closer',
|
",": "punctuation mark, comma",
|
||||||
',': 'punctuation mark, comma',
|
"-LRB-": "left round bracket",
|
||||||
'-LRB-': 'left round bracket',
|
"-RRB-": "right round bracket",
|
||||||
'-RRB-': 'right round bracket',
|
"``": "opening quotation mark",
|
||||||
'``': 'opening quotation mark',
|
'""': "closing quotation mark",
|
||||||
'""': 'closing quotation mark',
|
"''": "closing quotation mark",
|
||||||
"''": 'closing quotation mark',
|
":": "punctuation mark, colon or ellipsis",
|
||||||
':': 'punctuation mark, colon or ellipsis',
|
"$": "symbol, currency",
|
||||||
'$': 'symbol, currency',
|
"#": "symbol, number sign",
|
||||||
'#': 'symbol, number sign',
|
"AFX": "affix",
|
||||||
'AFX': 'affix',
|
"CC": "conjunction, coordinating",
|
||||||
'CC': 'conjunction, coordinating',
|
"CD": "cardinal number",
|
||||||
'CD': 'cardinal number',
|
"DT": "determiner",
|
||||||
'DT': 'determiner',
|
"EX": "existential there",
|
||||||
'EX': 'existential there',
|
"FW": "foreign word",
|
||||||
'FW': 'foreign word',
|
"HYPH": "punctuation mark, hyphen",
|
||||||
'HYPH': 'punctuation mark, hyphen',
|
"IN": "conjunction, subordinating or preposition",
|
||||||
'IN': 'conjunction, subordinating or preposition',
|
"JJ": "adjective",
|
||||||
'JJ': 'adjective',
|
"JJR": "adjective, comparative",
|
||||||
'JJR': 'adjective, comparative',
|
"JJS": "adjective, superlative",
|
||||||
'JJS': 'adjective, superlative',
|
"LS": "list item marker",
|
||||||
'LS': 'list item marker',
|
"MD": "verb, modal auxiliary",
|
||||||
'MD': 'verb, modal auxiliary',
|
"NIL": "missing tag",
|
||||||
'NIL': 'missing tag',
|
"NN": "noun, singular or mass",
|
||||||
'NN': 'noun, singular or mass',
|
"NNP": "noun, proper singular",
|
||||||
'NNP': 'noun, proper singular',
|
"NNPS": "noun, proper plural",
|
||||||
'NNPS': 'noun, proper plural',
|
"NNS": "noun, plural",
|
||||||
'NNS': 'noun, plural',
|
"PDT": "predeterminer",
|
||||||
'PDT': 'predeterminer',
|
"POS": "possessive ending",
|
||||||
'POS': 'possessive ending',
|
"PRP": "pronoun, personal",
|
||||||
'PRP': 'pronoun, personal',
|
"PRP$": "pronoun, possessive",
|
||||||
'PRP$': 'pronoun, possessive',
|
"RB": "adverb",
|
||||||
'RB': 'adverb',
|
"RBR": "adverb, comparative",
|
||||||
'RBR': 'adverb, comparative',
|
"RBS": "adverb, superlative",
|
||||||
'RBS': 'adverb, superlative',
|
"RP": "adverb, particle",
|
||||||
'RP': 'adverb, particle',
|
"TO": "infinitival to",
|
||||||
'TO': 'infinitival to',
|
"UH": "interjection",
|
||||||
'UH': 'interjection',
|
"VB": "verb, base form",
|
||||||
'VB': 'verb, base form',
|
"VBD": "verb, past tense",
|
||||||
'VBD': 'verb, past tense',
|
"VBG": "verb, gerund or present participle",
|
||||||
'VBG': 'verb, gerund or present participle',
|
"VBN": "verb, past participle",
|
||||||
'VBN': 'verb, past participle',
|
"VBP": "verb, non-3rd person singular present",
|
||||||
'VBP': 'verb, non-3rd person singular present',
|
"VBZ": "verb, 3rd person singular present",
|
||||||
'VBZ': 'verb, 3rd person singular present',
|
"WDT": "wh-determiner",
|
||||||
'WDT': 'wh-determiner',
|
"WP": "wh-pronoun, personal",
|
||||||
'WP': 'wh-pronoun, personal',
|
"WP$": "wh-pronoun, possessive",
|
||||||
'WP$': 'wh-pronoun, possessive',
|
"WRB": "wh-adverb",
|
||||||
'WRB': 'wh-adverb',
|
"SP": "space",
|
||||||
'SP': 'space',
|
"ADD": "email",
|
||||||
'ADD': 'email',
|
"NFP": "superfluous punctuation",
|
||||||
'NFP': 'superfluous punctuation',
|
"GW": "additional word in multi-word expression",
|
||||||
'GW': 'additional word in multi-word expression',
|
"XX": "unknown",
|
||||||
'XX': 'unknown',
|
"BES": 'auxiliary "be"',
|
||||||
'BES': 'auxiliary "be"',
|
"HVS": 'forms of "have"',
|
||||||
'HVS': 'forms of "have"',
|
|
||||||
|
|
||||||
|
|
||||||
# POS Tags (German)
|
# POS Tags (German)
|
||||||
# TIGER Treebank
|
# TIGER Treebank
|
||||||
# http://www.ims.uni-stuttgart.de/forschung/ressourcen/korpora/TIGERCorpus/annotation/tiger_introduction.pdf
|
# http://www.ims.uni-stuttgart.de/forschung/ressourcen/korpora/TIGERCorpus/annotation/tiger_introduction.pdf
|
||||||
|
"$(": "other sentence-internal punctuation mark",
|
||||||
'$(': 'other sentence-internal punctuation mark',
|
"$,": "comma",
|
||||||
'$,': 'comma',
|
"$.": "sentence-final punctuation mark",
|
||||||
'$.': 'sentence-final punctuation mark',
|
"ADJA": "adjective, attributive",
|
||||||
'ADJA': 'adjective, attributive',
|
"ADJD": "adjective, adverbial or predicative",
|
||||||
'ADJD': 'adjective, adverbial or predicative',
|
"APPO": "postposition",
|
||||||
'APPO': 'postposition',
|
"APPR": "preposition; circumposition left",
|
||||||
'APPR': 'preposition; circumposition left',
|
"APPRART": "preposition with article",
|
||||||
'APPRART': 'preposition with article',
|
"APZR": "circumposition right",
|
||||||
'APZR': 'circumposition right',
|
"ART": "definite or indefinite article",
|
||||||
'ART': 'definite or indefinite article',
|
"CARD": "cardinal number",
|
||||||
'CARD': 'cardinal number',
|
"FM": "foreign language material",
|
||||||
'FM': 'foreign language material',
|
"ITJ": "interjection",
|
||||||
'ITJ': 'interjection',
|
"KOKOM": "comparative conjunction",
|
||||||
'KOKOM': 'comparative conjunction',
|
"KON": "coordinate conjunction",
|
||||||
'KON': 'coordinate conjunction',
|
"KOUI": 'subordinate conjunction with "zu" and infinitive',
|
||||||
'KOUI': 'subordinate conjunction with "zu" and infinitive',
|
"KOUS": "subordinate conjunction with sentence",
|
||||||
'KOUS': 'subordinate conjunction with sentence',
|
"NE": "proper noun",
|
||||||
'NE': 'proper noun',
|
"NNE": "proper noun",
|
||||||
'NNE': 'proper noun',
|
"PAV": "pronominal adverb",
|
||||||
'PAV': 'pronominal adverb',
|
"PROAV": "pronominal adverb",
|
||||||
'PROAV': 'pronominal adverb',
|
"PDAT": "attributive demonstrative pronoun",
|
||||||
'PDAT': 'attributive demonstrative pronoun',
|
"PDS": "substituting demonstrative pronoun",
|
||||||
'PDS': 'substituting demonstrative pronoun',
|
"PIAT": "attributive indefinite pronoun without determiner",
|
||||||
'PIAT': 'attributive indefinite pronoun without determiner',
|
"PIDAT": "attributive indefinite pronoun with determiner",
|
||||||
'PIDAT': 'attributive indefinite pronoun with determiner',
|
"PIS": "substituting indefinite pronoun",
|
||||||
'PIS': 'substituting indefinite pronoun',
|
"PPER": "non-reflexive personal pronoun",
|
||||||
'PPER': 'non-reflexive personal pronoun',
|
"PPOSAT": "attributive possessive pronoun",
|
||||||
'PPOSAT': 'attributive possessive pronoun',
|
"PPOSS": "substituting possessive pronoun",
|
||||||
'PPOSS': 'substituting possessive pronoun',
|
"PRELAT": "attributive relative pronoun",
|
||||||
'PRELAT': 'attributive relative pronoun',
|
"PRELS": "substituting relative pronoun",
|
||||||
'PRELS': 'substituting relative pronoun',
|
"PRF": "reflexive personal pronoun",
|
||||||
'PRF': 'reflexive personal pronoun',
|
"PTKA": "particle with adjective or adverb",
|
||||||
'PTKA': 'particle with adjective or adverb',
|
"PTKANT": "answer particle",
|
||||||
'PTKANT': 'answer particle',
|
"PTKNEG": "negative particle",
|
||||||
'PTKNEG': 'negative particle',
|
"PTKVZ": "separable verbal particle",
|
||||||
'PTKVZ': 'separable verbal particle',
|
"PTKZU": '"zu" before infinitive',
|
||||||
'PTKZU': '"zu" before infinitive',
|
"PWAT": "attributive interrogative pronoun",
|
||||||
'PWAT': 'attributive interrogative pronoun',
|
"PWAV": "adverbial interrogative or relative pronoun",
|
||||||
'PWAV': 'adverbial interrogative or relative pronoun',
|
"PWS": "substituting interrogative pronoun",
|
||||||
'PWS': 'substituting interrogative pronoun',
|
"TRUNC": "word remnant",
|
||||||
'TRUNC': 'word remnant',
|
"VAFIN": "finite verb, auxiliary",
|
||||||
'VAFIN': 'finite verb, auxiliary',
|
"VAIMP": "imperative, auxiliary",
|
||||||
'VAIMP': 'imperative, auxiliary',
|
"VAINF": "infinitive, auxiliary",
|
||||||
'VAINF': 'infinitive, auxiliary',
|
"VAPP": "perfect participle, auxiliary",
|
||||||
'VAPP': 'perfect participle, auxiliary',
|
"VMFIN": "finite verb, modal",
|
||||||
'VMFIN': 'finite verb, modal',
|
"VMINF": "infinitive, modal",
|
||||||
'VMINF': 'infinitive, modal',
|
"VMPP": "perfect participle, modal",
|
||||||
'VMPP': 'perfect participle, modal',
|
"VVFIN": "finite verb, full",
|
||||||
'VVFIN': 'finite verb, full',
|
"VVIMP": "imperative, full",
|
||||||
'VVIMP': 'imperative, full',
|
"VVINF": "infinitive, full",
|
||||||
'VVINF': 'infinitive, full',
|
"VVIZU": 'infinitive with "zu", full',
|
||||||
'VVIZU': 'infinitive with "zu", full',
|
"VVPP": "perfect participle, full",
|
||||||
'VVPP': 'perfect participle, full',
|
"XY": "non-word containing non-letter",
|
||||||
'XY': 'non-word containing non-letter',
|
|
||||||
|
|
||||||
|
|
||||||
# Noun chunks
|
# Noun chunks
|
||||||
|
"NP": "noun phrase",
|
||||||
'NP': 'noun phrase',
|
"PP": "prepositional phrase",
|
||||||
'PP': 'prepositional phrase',
|
"VP": "verb phrase",
|
||||||
'VP': 'verb phrase',
|
"ADVP": "adverb phrase",
|
||||||
'ADVP': 'adverb phrase',
|
"ADJP": "adjective phrase",
|
||||||
'ADJP': 'adjective phrase',
|
"SBAR": "subordinating conjunction",
|
||||||
'SBAR': 'subordinating conjunction',
|
"PRT": "particle",
|
||||||
'PRT': 'particle',
|
"PNP": "prepositional noun phrase",
|
||||||
'PNP': 'prepositional noun phrase',
|
|
||||||
|
|
||||||
|
|
||||||
# Dependency Labels (English)
|
# Dependency Labels (English)
|
||||||
# ClearNLP / Universal Dependencies
|
# ClearNLP / Universal Dependencies
|
||||||
# https://github.com/clir/clearnlp-guidelines/blob/master/md/specifications/dependency_labels.md
|
# https://github.com/clir/clearnlp-guidelines/blob/master/md/specifications/dependency_labels.md
|
||||||
|
"acomp": "adjectival complement",
|
||||||
'acomp': 'adjectival complement',
|
"advcl": "adverbial clause modifier",
|
||||||
'advcl': 'adverbial clause modifier',
|
"advmod": "adverbial modifier",
|
||||||
'advmod': 'adverbial modifier',
|
"agent": "agent",
|
||||||
'agent': 'agent',
|
"amod": "adjectival modifier",
|
||||||
'amod': 'adjectival modifier',
|
"appos": "appositional modifier",
|
||||||
'appos': 'appositional modifier',
|
"attr": "attribute",
|
||||||
'attr': 'attribute',
|
"aux": "auxiliary",
|
||||||
'aux': 'auxiliary',
|
"auxpass": "auxiliary (passive)",
|
||||||
'auxpass': 'auxiliary (passive)',
|
"cc": "coordinating conjunction",
|
||||||
'cc': 'coordinating conjunction',
|
"ccomp": "clausal complement",
|
||||||
'ccomp': 'clausal complement',
|
"complm": "complementizer",
|
||||||
'complm': 'complementizer',
|
"conj": "conjunct",
|
||||||
'conj': 'conjunct',
|
"cop": "copula",
|
||||||
'cop': 'copula',
|
"csubj": "clausal subject",
|
||||||
'csubj': 'clausal subject',
|
"csubjpass": "clausal subject (passive)",
|
||||||
'csubjpass': 'clausal subject (passive)',
|
"dep": "unclassified dependent",
|
||||||
'dep': 'unclassified dependent',
|
"det": "determiner",
|
||||||
'det': 'determiner',
|
"dobj": "direct object",
|
||||||
'dobj': 'direct object',
|
"expl": "expletive",
|
||||||
'expl': 'expletive',
|
"hmod": "modifier in hyphenation",
|
||||||
'hmod': 'modifier in hyphenation',
|
"hyph": "hyphen",
|
||||||
'hyph': 'hyphen',
|
"infmod": "infinitival modifier",
|
||||||
'infmod': 'infinitival modifier',
|
"intj": "interjection",
|
||||||
'intj': 'interjection',
|
"iobj": "indirect object",
|
||||||
'iobj': 'indirect object',
|
"mark": "marker",
|
||||||
'mark': 'marker',
|
"meta": "meta modifier",
|
||||||
'meta': 'meta modifier',
|
"neg": "negation modifier",
|
||||||
'neg': 'negation modifier',
|
"nmod": "modifier of nominal",
|
||||||
'nmod': 'modifier of nominal',
|
"nn": "noun compound modifier",
|
||||||
'nn': 'noun compound modifier',
|
"npadvmod": "noun phrase as adverbial modifier",
|
||||||
'npadvmod': 'noun phrase as adverbial modifier',
|
"nsubj": "nominal subject",
|
||||||
'nsubj': 'nominal subject',
|
"nsubjpass": "nominal subject (passive)",
|
||||||
'nsubjpass': 'nominal subject (passive)',
|
"num": "number modifier",
|
||||||
'num': 'number modifier',
|
"number": "number compound modifier",
|
||||||
'number': 'number compound modifier',
|
"oprd": "object predicate",
|
||||||
'oprd': 'object predicate',
|
"obj": "object",
|
||||||
'obj': 'object',
|
"obl": "oblique nominal",
|
||||||
'obl': 'oblique nominal',
|
"parataxis": "parataxis",
|
||||||
'parataxis': 'parataxis',
|
"partmod": "participal modifier",
|
||||||
'partmod': 'participal modifier',
|
"pcomp": "complement of preposition",
|
||||||
'pcomp': 'complement of preposition',
|
"pobj": "object of preposition",
|
||||||
'pobj': 'object of preposition',
|
"poss": "possession modifier",
|
||||||
'poss': 'possession modifier',
|
"possessive": "possessive modifier",
|
||||||
'possessive': 'possessive modifier',
|
"preconj": "pre-correlative conjunction",
|
||||||
'preconj': 'pre-correlative conjunction',
|
"prep": "prepositional modifier",
|
||||||
'prep': 'prepositional modifier',
|
"prt": "particle",
|
||||||
'prt': 'particle',
|
"punct": "punctuation",
|
||||||
'punct': 'punctuation',
|
"quantmod": "modifier of quantifier",
|
||||||
'quantmod': 'modifier of quantifier',
|
"rcmod": "relative clause modifier",
|
||||||
'rcmod': 'relative clause modifier',
|
"root": "root",
|
||||||
'root': 'root',
|
"xcomp": "open clausal complement",
|
||||||
'xcomp': 'open clausal complement',
|
|
||||||
|
|
||||||
|
|
||||||
# Dependency labels (German)
|
# Dependency labels (German)
|
||||||
# TIGER Treebank
|
# TIGER Treebank
|
||||||
# http://www.ims.uni-stuttgart.de/forschung/ressourcen/korpora/TIGERCorpus/annotation/tiger_introduction.pdf
|
# http://www.ims.uni-stuttgart.de/forschung/ressourcen/korpora/TIGERCorpus/annotation/tiger_introduction.pdf
|
||||||
# currently missing: 'cc' (comparative complement) because of conflict
|
# currently missing: 'cc' (comparative complement) because of conflict
|
||||||
# with English labels
|
# with English labels
|
||||||
|
"ac": "adpositional case marker",
|
||||||
'ac': 'adpositional case marker',
|
"adc": "adjective component",
|
||||||
'adc': 'adjective component',
|
"ag": "genitive attribute",
|
||||||
'ag': 'genitive attribute',
|
"ams": "measure argument of adjective",
|
||||||
'ams': 'measure argument of adjective',
|
"app": "apposition",
|
||||||
'app': 'apposition',
|
"avc": "adverbial phrase component",
|
||||||
'avc': 'adverbial phrase component',
|
"cd": "coordinating conjunction",
|
||||||
'cd': 'coordinating conjunction',
|
"cj": "conjunct",
|
||||||
'cj': 'conjunct',
|
"cm": "comparative conjunction",
|
||||||
'cm': 'comparative conjunction',
|
"cp": "complementizer",
|
||||||
'cp': 'complementizer',
|
"cvc": "collocational verb construction",
|
||||||
'cvc': 'collocational verb construction',
|
"da": "dative",
|
||||||
'da': 'dative',
|
"dh": "discourse-level head",
|
||||||
'dh': 'discourse-level head',
|
"dm": "discourse marker",
|
||||||
'dm': 'discourse marker',
|
"ep": "expletive es",
|
||||||
'ep': 'expletive es',
|
"hd": "head",
|
||||||
'hd': 'head',
|
"ju": "junctor",
|
||||||
'ju': 'junctor',
|
"mnr": "postnominal modifier",
|
||||||
'mnr': 'postnominal modifier',
|
"mo": "modifier",
|
||||||
'mo': 'modifier',
|
"ng": "negation",
|
||||||
'ng': 'negation',
|
"nk": "noun kernel element",
|
||||||
'nk': 'noun kernel element',
|
"nmc": "numerical component",
|
||||||
'nmc': 'numerical component',
|
"oa": "accusative object",
|
||||||
'oa': 'accusative object',
|
"oc": "clausal object",
|
||||||
'oc': 'clausal object',
|
"og": "genitive object",
|
||||||
'og': 'genitive object',
|
"op": "prepositional object",
|
||||||
'op': 'prepositional object',
|
"par": "parenthetical element",
|
||||||
'par': 'parenthetical element',
|
"pd": "predicate",
|
||||||
'pd': 'predicate',
|
"pg": "phrasal genitive",
|
||||||
'pg': 'phrasal genitive',
|
"ph": "placeholder",
|
||||||
'ph': 'placeholder',
|
"pm": "morphological particle",
|
||||||
'pm': 'morphological particle',
|
"pnc": "proper noun component",
|
||||||
'pnc': 'proper noun component',
|
"rc": "relative clause",
|
||||||
'rc': 'relative clause',
|
"re": "repeated element",
|
||||||
're': 'repeated element',
|
"rs": "reported speech",
|
||||||
'rs': 'reported speech',
|
"sb": "subject",
|
||||||
'sb': 'subject',
|
|
||||||
|
|
||||||
|
|
||||||
# Named Entity Recognition
|
# Named Entity Recognition
|
||||||
# OntoNotes 5
|
# OntoNotes 5
|
||||||
# https://catalog.ldc.upenn.edu/docs/LDC2013T19/OntoNotes-Release-5.0.pdf
|
# https://catalog.ldc.upenn.edu/docs/LDC2013T19/OntoNotes-Release-5.0.pdf
|
||||||
|
"PERSON": "People, including fictional",
|
||||||
'PERSON': 'People, including fictional',
|
"NORP": "Nationalities or religious or political groups",
|
||||||
'NORP': 'Nationalities or religious or political groups',
|
"FACILITY": "Buildings, airports, highways, bridges, etc.",
|
||||||
'FACILITY': 'Buildings, airports, highways, bridges, etc.',
|
"FAC": "Buildings, airports, highways, bridges, etc.",
|
||||||
'FAC': 'Buildings, airports, highways, bridges, etc.',
|
"ORG": "Companies, agencies, institutions, etc.",
|
||||||
'ORG': 'Companies, agencies, institutions, etc.',
|
"GPE": "Countries, cities, states",
|
||||||
'GPE': 'Countries, cities, states',
|
"LOC": "Non-GPE locations, mountain ranges, bodies of water",
|
||||||
'LOC': 'Non-GPE locations, mountain ranges, bodies of water',
|
"PRODUCT": "Objects, vehicles, foods, etc. (not services)",
|
||||||
'PRODUCT': 'Objects, vehicles, foods, etc. (not services)',
|
"EVENT": "Named hurricanes, battles, wars, sports events, etc.",
|
||||||
'EVENT': 'Named hurricanes, battles, wars, sports events, etc.',
|
"WORK_OF_ART": "Titles of books, songs, etc.",
|
||||||
'WORK_OF_ART': 'Titles of books, songs, etc.',
|
"LAW": "Named documents made into laws.",
|
||||||
'LAW': 'Named documents made into laws.',
|
"LANGUAGE": "Any named language",
|
||||||
'LANGUAGE': 'Any named language',
|
"DATE": "Absolute or relative dates or periods",
|
||||||
'DATE': 'Absolute or relative dates or periods',
|
"TIME": "Times smaller than a day",
|
||||||
'TIME': 'Times smaller than a day',
|
"PERCENT": 'Percentage, including "%"',
|
||||||
'PERCENT': 'Percentage, including "%"',
|
"MONEY": "Monetary values, including unit",
|
||||||
'MONEY': 'Monetary values, including unit',
|
"QUANTITY": "Measurements, as of weight or distance",
|
||||||
'QUANTITY': 'Measurements, as of weight or distance',
|
"ORDINAL": '"first", "second", etc.',
|
||||||
'ORDINAL': '"first", "second", etc.',
|
"CARDINAL": "Numerals that do not fall under another type",
|
||||||
'CARDINAL': 'Numerals that do not fall under another type',
|
|
||||||
|
|
||||||
|
|
||||||
# Named Entity Recognition
|
# Named Entity Recognition
|
||||||
# Wikipedia
|
# Wikipedia
|
||||||
# http://www.sciencedirect.com/science/article/pii/S0004370212000276
|
# http://www.sciencedirect.com/science/article/pii/S0004370212000276
|
||||||
# https://pdfs.semanticscholar.org/5744/578cc243d92287f47448870bb426c66cc941.pdf
|
# https://pdfs.semanticscholar.org/5744/578cc243d92287f47448870bb426c66cc941.pdf
|
||||||
|
"PER": "Named person or family.",
|
||||||
'PER': 'Named person or family.',
|
"MISC": "Miscellaneous entities, e.g. events, nationalities, products or works of art",
|
||||||
'MISC': ('Miscellaneous entities, e.g. events, nationalities, '
|
|
||||||
'products or works of art'),
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -16,16 +16,18 @@ from ...util import update_exc, add_lookups
|
||||||
class ArabicDefaults(Language.Defaults):
|
class ArabicDefaults(Language.Defaults):
|
||||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||||
lex_attr_getters.update(LEX_ATTRS)
|
lex_attr_getters.update(LEX_ATTRS)
|
||||||
lex_attr_getters[LANG] = lambda text: 'ar'
|
lex_attr_getters[LANG] = lambda text: "ar"
|
||||||
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
|
lex_attr_getters[NORM] = add_lookups(
|
||||||
|
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
|
||||||
|
)
|
||||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
suffixes = TOKENIZER_SUFFIXES
|
suffixes = TOKENIZER_SUFFIXES
|
||||||
|
|
||||||
|
|
||||||
class Arabic(Language):
|
class Arabic(Language):
|
||||||
lang = 'ar'
|
lang = "ar"
|
||||||
Defaults = ArabicDefaults
|
Defaults = ArabicDefaults
|
||||||
|
|
||||||
|
|
||||||
__all__ = ['Arabic']
|
__all__ = ["Arabic"]
|
||||||
|
|
|
@ -10,11 +10,11 @@ Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
sentences = [
|
sentences = [
|
||||||
"نال الكاتب خالد توفيق جائزة الرواية العربية في معرض الشارقة الدولي للكتاب",
|
"نال الكاتب خالد توفيق جائزة الرواية العربية في معرض الشارقة الدولي للكتاب",
|
||||||
"أين تقع دمشق ؟"
|
"أين تقع دمشق ؟",
|
||||||
"كيف حالك ؟",
|
"كيف حالك ؟",
|
||||||
"هل يمكن ان نلتقي على الساعة الثانية عشرة ظهرا ؟",
|
"هل يمكن ان نلتقي على الساعة الثانية عشرة ظهرا ؟",
|
||||||
"ماهي أبرز التطورات السياسية، الأمنية والاجتماعية في العالم ؟",
|
"ماهي أبرز التطورات السياسية، الأمنية والاجتماعية في العالم ؟",
|
||||||
"هل بالإمكان أن نلتقي غدا؟",
|
"هل بالإمكان أن نلتقي غدا؟",
|
||||||
"هناك نحو 382 مليون شخص مصاب بداء السكَّري في العالم",
|
"هناك نحو 382 مليون شخص مصاب بداء السكَّري في العالم",
|
||||||
"كشفت دراسة حديثة أن الخيل تقرأ تعبيرات الوجه وتستطيع أن تتذكر مشاعر الناس وعواطفهم"
|
"كشفت دراسة حديثة أن الخيل تقرأ تعبيرات الوجه وتستطيع أن تتذكر مشاعر الناس وعواطفهم",
|
||||||
]
|
]
|
||||||
|
|
|
@ -2,7 +2,8 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
from ...attrs import LIKE_NUM
|
from ...attrs import LIKE_NUM
|
||||||
|
|
||||||
_num_words = set("""
|
_num_words = set(
|
||||||
|
"""
|
||||||
صفر
|
صفر
|
||||||
واحد
|
واحد
|
||||||
إثنان
|
إثنان
|
||||||
|
@ -52,9 +53,11 @@ _num_words = set("""
|
||||||
مليون
|
مليون
|
||||||
مليار
|
مليار
|
||||||
مليارات
|
مليارات
|
||||||
""".split())
|
""".split()
|
||||||
|
)
|
||||||
|
|
||||||
_ordinal_words = set("""
|
_ordinal_words = set(
|
||||||
|
"""
|
||||||
اول
|
اول
|
||||||
أول
|
أول
|
||||||
حاد
|
حاد
|
||||||
|
@ -69,20 +72,21 @@ _ordinal_words = set("""
|
||||||
ثامن
|
ثامن
|
||||||
تاسع
|
تاسع
|
||||||
عاشر
|
عاشر
|
||||||
""".split())
|
""".split()
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def like_num(text):
|
def like_num(text):
|
||||||
"""
|
"""
|
||||||
check if text resembles a number
|
Check if text resembles a number
|
||||||
"""
|
"""
|
||||||
if text.startswith(('+', '-', '±', '~')):
|
if text.startswith(("+", "-", "±", "~")):
|
||||||
text = text[1:]
|
text = text[1:]
|
||||||
text = text.replace(',', '').replace('.', '')
|
text = text.replace(",", "").replace(".", "")
|
||||||
if text.isdigit():
|
if text.isdigit():
|
||||||
return True
|
return True
|
||||||
if text.count('/') == 1:
|
if text.count("/") == 1:
|
||||||
num, denom = text.split('/')
|
num, denom = text.split("/")
|
||||||
if num.isdigit() and denom.isdigit():
|
if num.isdigit() and denom.isdigit():
|
||||||
return True
|
return True
|
||||||
if text in _num_words:
|
if text in _num_words:
|
||||||
|
@ -92,6 +96,4 @@ def like_num(text):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
LEX_ATTRS = {
|
LEX_ATTRS = {LIKE_NUM: like_num}
|
||||||
LIKE_NUM: like_num
|
|
||||||
}
|
|
||||||
|
|
|
@ -1,15 +1,20 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from ..punctuation import TOKENIZER_INFIXES
|
|
||||||
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
|
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
|
||||||
from ..char_classes import QUOTES, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER
|
from ..char_classes import UNITS, ALPHA_UPPER
|
||||||
|
|
||||||
_suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES +
|
_suffixes = (
|
||||||
[r'(?<=[0-9])\+',
|
LIST_PUNCT
|
||||||
# Arabic is written from Right-To-Left
|
+ LIST_ELLIPSES
|
||||||
r'(?<=[0-9])(?:{})'.format(CURRENCY),
|
+ LIST_QUOTES
|
||||||
r'(?<=[0-9])(?:{})'.format(UNITS),
|
+ [
|
||||||
r'(?<=[{au}][{au}])\.'.format(au=ALPHA_UPPER)])
|
r"(?<=[0-9])\+",
|
||||||
|
# Arabic is written from Right-To-Left
|
||||||
|
r"(?<=[0-9])(?:{})".format(CURRENCY),
|
||||||
|
r"(?<=[0-9])(?:{})".format(UNITS),
|
||||||
|
r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
TOKENIZER_SUFFIXES = _suffixes
|
TOKENIZER_SUFFIXES = _suffixes
|
||||||
|
|
|
@ -1,7 +1,8 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
STOP_WORDS = set("""
|
STOP_WORDS = set(
|
||||||
|
"""
|
||||||
من
|
من
|
||||||
نحو
|
نحو
|
||||||
لعل
|
لعل
|
||||||
|
@ -388,4 +389,5 @@ STOP_WORDS = set("""
|
||||||
وإن
|
وإن
|
||||||
ولو
|
ولو
|
||||||
يا
|
يا
|
||||||
""".split())
|
""".split()
|
||||||
|
)
|
||||||
|
|
|
@ -1,21 +1,23 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA
|
from ...symbols import ORTH, LEMMA
|
||||||
import re
|
|
||||||
|
|
||||||
_exc = {}
|
_exc = {}
|
||||||
|
|
||||||
# time
|
|
||||||
|
# Time
|
||||||
for exc_data in [
|
for exc_data in [
|
||||||
{LEMMA: "قبل الميلاد", ORTH: "ق.م"},
|
{LEMMA: "قبل الميلاد", ORTH: "ق.م"},
|
||||||
{LEMMA: "بعد الميلاد", ORTH: "ب. م"},
|
{LEMMA: "بعد الميلاد", ORTH: "ب. م"},
|
||||||
{LEMMA: "ميلادي", ORTH: ".م"},
|
{LEMMA: "ميلادي", ORTH: ".م"},
|
||||||
{LEMMA: "هجري", ORTH: ".هـ"},
|
{LEMMA: "هجري", ORTH: ".هـ"},
|
||||||
{LEMMA: "توفي", ORTH: ".ت"}]:
|
{LEMMA: "توفي", ORTH: ".ت"},
|
||||||
|
]:
|
||||||
_exc[exc_data[ORTH]] = [exc_data]
|
_exc[exc_data[ORTH]] = [exc_data]
|
||||||
|
|
||||||
# scientific abv.
|
# Scientific abv.
|
||||||
for exc_data in [
|
for exc_data in [
|
||||||
{LEMMA: "صلى الله عليه وسلم", ORTH: "صلعم"},
|
{LEMMA: "صلى الله عليه وسلم", ORTH: "صلعم"},
|
||||||
{LEMMA: "الشارح", ORTH: "الشـ"},
|
{LEMMA: "الشارح", ORTH: "الشـ"},
|
||||||
|
@ -28,20 +30,20 @@ for exc_data in [
|
||||||
{LEMMA: "أنبأنا", ORTH: "أنا"},
|
{LEMMA: "أنبأنا", ORTH: "أنا"},
|
||||||
{LEMMA: "أخبرنا", ORTH: "نا"},
|
{LEMMA: "أخبرنا", ORTH: "نا"},
|
||||||
{LEMMA: "مصدر سابق", ORTH: "م. س"},
|
{LEMMA: "مصدر سابق", ORTH: "م. س"},
|
||||||
{LEMMA: "مصدر نفسه", ORTH: "م. ن"}]:
|
{LEMMA: "مصدر نفسه", ORTH: "م. ن"},
|
||||||
|
]:
|
||||||
_exc[exc_data[ORTH]] = [exc_data]
|
_exc[exc_data[ORTH]] = [exc_data]
|
||||||
|
|
||||||
# other abv.
|
# Other abv.
|
||||||
for exc_data in [
|
for exc_data in [
|
||||||
{LEMMA: "دكتور", ORTH: "د."},
|
{LEMMA: "دكتور", ORTH: "د."},
|
||||||
{LEMMA: "أستاذ دكتور", ORTH: "أ.د"},
|
{LEMMA: "أستاذ دكتور", ORTH: "أ.د"},
|
||||||
{LEMMA: "أستاذ", ORTH: "أ."},
|
{LEMMA: "أستاذ", ORTH: "أ."},
|
||||||
{LEMMA: "بروفيسور", ORTH: "ب."}]:
|
{LEMMA: "بروفيسور", ORTH: "ب."},
|
||||||
|
]:
|
||||||
_exc[exc_data[ORTH]] = [exc_data]
|
_exc[exc_data[ORTH]] = [exc_data]
|
||||||
|
|
||||||
for exc_data in [
|
for exc_data in [{LEMMA: "تلفون", ORTH: "ت."}, {LEMMA: "صندوق بريد", ORTH: "ص.ب"}]:
|
||||||
{LEMMA: "تلفون", ORTH: "ت."},
|
|
||||||
{LEMMA: "صندوق بريد", ORTH: "ص.ب"}]:
|
|
||||||
_exc[exc_data[ORTH]] = [exc_data]
|
_exc[exc_data[ORTH]] = [exc_data]
|
||||||
|
|
||||||
TOKENIZER_EXCEPTIONS = _exc
|
TOKENIZER_EXCEPTIONS = _exc
|
||||||
|
|
|
@ -15,7 +15,7 @@ from ...util import update_exc
|
||||||
|
|
||||||
class BengaliDefaults(Language.Defaults):
|
class BengaliDefaults(Language.Defaults):
|
||||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||||
lex_attr_getters[LANG] = lambda text: 'bn'
|
lex_attr_getters[LANG] = lambda text: "bn"
|
||||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||||
tag_map = TAG_MAP
|
tag_map = TAG_MAP
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
|
@ -26,8 +26,8 @@ class BengaliDefaults(Language.Defaults):
|
||||||
|
|
||||||
|
|
||||||
class Bengali(Language):
|
class Bengali(Language):
|
||||||
lang = 'bn'
|
lang = "bn"
|
||||||
Defaults = BengaliDefaults
|
Defaults = BengaliDefaults
|
||||||
|
|
||||||
|
|
||||||
__all__ = ['Bengali']
|
__all__ = ["Bengali"]
|
||||||
|
|
|
@ -13,11 +13,9 @@ LEMMA_RULES = {
|
||||||
["গাছা", ""],
|
["গাছা", ""],
|
||||||
["গাছি", ""],
|
["গাছি", ""],
|
||||||
["ছড়া", ""],
|
["ছড়া", ""],
|
||||||
|
|
||||||
["কে", ""],
|
["কে", ""],
|
||||||
["ে", ""],
|
["ে", ""],
|
||||||
["তে", ""],
|
["তে", ""],
|
||||||
|
|
||||||
["র", ""],
|
["র", ""],
|
||||||
["রা", ""],
|
["রা", ""],
|
||||||
["রে", ""],
|
["রে", ""],
|
||||||
|
@ -28,7 +26,6 @@ LEMMA_RULES = {
|
||||||
["গুলা", ""],
|
["গুলা", ""],
|
||||||
["গুলো", ""],
|
["গুলো", ""],
|
||||||
["গুলি", ""],
|
["গুলি", ""],
|
||||||
|
|
||||||
["কুল", ""],
|
["কুল", ""],
|
||||||
["গণ", ""],
|
["গণ", ""],
|
||||||
["দল", ""],
|
["দল", ""],
|
||||||
|
@ -45,7 +42,6 @@ LEMMA_RULES = {
|
||||||
["সকল", ""],
|
["সকল", ""],
|
||||||
["মহল", ""],
|
["মহল", ""],
|
||||||
["াবলি", ""], # আবলি
|
["াবলি", ""], # আবলি
|
||||||
|
|
||||||
# Bengali digit representations
|
# Bengali digit representations
|
||||||
["০", "0"],
|
["০", "0"],
|
||||||
["১", "1"],
|
["১", "1"],
|
||||||
|
@ -58,11 +54,5 @@ LEMMA_RULES = {
|
||||||
["৮", "8"],
|
["৮", "8"],
|
||||||
["৯", "9"],
|
["৯", "9"],
|
||||||
],
|
],
|
||||||
|
"punct": [["“", '"'], ["”", '"'], ["\u2018", "'"], ["\u2019", "'"]],
|
||||||
"punct": [
|
|
||||||
["“", "\""],
|
|
||||||
["”", "\""],
|
|
||||||
["\u2018", "'"],
|
|
||||||
["\u2019", "'"]
|
|
||||||
]
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -5,64 +5,253 @@ from ...symbols import LEMMA, PRON_LEMMA
|
||||||
|
|
||||||
|
|
||||||
MORPH_RULES = {
|
MORPH_RULES = {
|
||||||
"PRP": {
|
"PRP": {
|
||||||
'ঐ': {LEMMA: PRON_LEMMA, 'PronType': 'Dem'},
|
"ঐ": {LEMMA: PRON_LEMMA, "PronType": "Dem"},
|
||||||
'আমাকে': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'One', 'PronType': 'Prs', 'Case': 'Acc'},
|
"আমাকে": {
|
||||||
'কি': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Gender': 'Neut', 'PronType': 'Int', 'Case': 'Acc'},
|
LEMMA: PRON_LEMMA,
|
||||||
'সে': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Three', 'PronType': 'Prs', 'Case': 'Nom'},
|
"Number": "Sing",
|
||||||
'কিসে': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Gender': 'Neut', 'PronType': 'Int', 'Case': 'Acc'},
|
"Person": "One",
|
||||||
'তাকে': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Three', 'PronType': 'Prs', 'Case': 'Acc'},
|
"PronType": "Prs",
|
||||||
'স্বয়ং': {LEMMA: PRON_LEMMA, 'Reflex': 'Yes', 'PronType': 'Ref'},
|
"Case": "Acc",
|
||||||
'কোনগুলো': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Gender': 'Neut', 'PronType': 'Int', 'Case': 'Acc'},
|
},
|
||||||
'তুমি': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Nom'},
|
"কি": {
|
||||||
'তুই': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Nom'},
|
LEMMA: PRON_LEMMA,
|
||||||
'তাদেরকে': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Three', 'PronType': 'Prs', 'Case': 'Acc'},
|
"Number": "Sing",
|
||||||
'আমরা': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'One ', 'PronType': 'Prs', 'Case': 'Nom'},
|
"Gender": "Neut",
|
||||||
'যিনি': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'PronType': 'Rel', 'Case': 'Nom'},
|
"PronType": "Int",
|
||||||
'আমাদেরকে': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'One', 'PronType': 'Prs', 'Case': 'Acc'},
|
"Case": "Acc",
|
||||||
'কোন': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'PronType': 'Int', 'Case': 'Acc'},
|
},
|
||||||
'কারা': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'PronType': 'Int', 'Case': 'Acc'},
|
"সে": {
|
||||||
'তোমাকে': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Acc'},
|
LEMMA: PRON_LEMMA,
|
||||||
'তোকে': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Acc'},
|
"Number": "Sing",
|
||||||
'খোদ': {LEMMA: PRON_LEMMA, 'Reflex': 'Yes', 'PronType': 'Ref'},
|
"Person": "Three",
|
||||||
'কে': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'PronType': 'Int', 'Case': 'Acc'},
|
"PronType": "Prs",
|
||||||
'যারা': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'PronType': 'Rel', 'Case': 'Nom'},
|
"Case": "Nom",
|
||||||
'যে': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'PronType': 'Rel', 'Case': 'Nom'},
|
},
|
||||||
'তোমরা': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Nom'},
|
"কিসে": {
|
||||||
'তোরা': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Nom'},
|
LEMMA: PRON_LEMMA,
|
||||||
'তোমাদেরকে': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Acc'},
|
"Number": "Sing",
|
||||||
'তোদেরকে': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Two', 'PronType': 'Prs', 'Case': 'Acc'},
|
"Gender": "Neut",
|
||||||
'আপন': {LEMMA: PRON_LEMMA, 'Reflex': 'Yes', 'PronType': 'Ref'},
|
"PronType": "Int",
|
||||||
'এ': {LEMMA: PRON_LEMMA, 'PronType': 'Dem'},
|
"Case": "Acc",
|
||||||
'নিজ': {LEMMA: PRON_LEMMA, 'Reflex': 'Yes', 'PronType': 'Ref'},
|
},
|
||||||
'কার': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'PronType': 'Int', 'Case': 'Acc'},
|
"তাকে": {
|
||||||
'যা': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Gender': 'Neut', 'PronType': 'Rel', 'Case': 'Nom'},
|
LEMMA: PRON_LEMMA,
|
||||||
'তারা': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Three', 'PronType': 'Prs', 'Case': 'Nom'},
|
"Number": "Sing",
|
||||||
'আমি': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'One', 'PronType': 'Prs', 'Case': 'Nom'}
|
"Person": "Three",
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Case": "Acc",
|
||||||
|
},
|
||||||
|
"স্বয়ং": {LEMMA: PRON_LEMMA, "Reflex": "Yes", "PronType": "Ref"},
|
||||||
|
"কোনগুলো": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Plur",
|
||||||
|
"Gender": "Neut",
|
||||||
|
"PronType": "Int",
|
||||||
|
"Case": "Acc",
|
||||||
|
},
|
||||||
|
"তুমি": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Sing",
|
||||||
|
"Person": "Two",
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Case": "Nom",
|
||||||
|
},
|
||||||
|
"তুই": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Sing",
|
||||||
|
"Person": "Two",
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Case": "Nom",
|
||||||
|
},
|
||||||
|
"তাদেরকে": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Plur",
|
||||||
|
"Person": "Three",
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Case": "Acc",
|
||||||
|
},
|
||||||
|
"আমরা": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Plur",
|
||||||
|
"Person": "One ",
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Case": "Nom",
|
||||||
|
},
|
||||||
|
"যিনি": {LEMMA: PRON_LEMMA, "Number": "Sing", "PronType": "Rel", "Case": "Nom"},
|
||||||
|
"আমাদেরকে": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Plur",
|
||||||
|
"Person": "One",
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Case": "Acc",
|
||||||
|
},
|
||||||
|
"কোন": {LEMMA: PRON_LEMMA, "Number": "Sing", "PronType": "Int", "Case": "Acc"},
|
||||||
|
"কারা": {LEMMA: PRON_LEMMA, "Number": "Plur", "PronType": "Int", "Case": "Acc"},
|
||||||
|
"তোমাকে": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Sing",
|
||||||
|
"Person": "Two",
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Case": "Acc",
|
||||||
|
},
|
||||||
|
"তোকে": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Sing",
|
||||||
|
"Person": "Two",
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Case": "Acc",
|
||||||
|
},
|
||||||
|
"খোদ": {LEMMA: PRON_LEMMA, "Reflex": "Yes", "PronType": "Ref"},
|
||||||
|
"কে": {LEMMA: PRON_LEMMA, "Number": "Sing", "PronType": "Int", "Case": "Acc"},
|
||||||
|
"যারা": {LEMMA: PRON_LEMMA, "Number": "Plur", "PronType": "Rel", "Case": "Nom"},
|
||||||
|
"যে": {LEMMA: PRON_LEMMA, "Number": "Sing", "PronType": "Rel", "Case": "Nom"},
|
||||||
|
"তোমরা": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Plur",
|
||||||
|
"Person": "Two",
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Case": "Nom",
|
||||||
|
},
|
||||||
|
"তোরা": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Plur",
|
||||||
|
"Person": "Two",
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Case": "Nom",
|
||||||
|
},
|
||||||
|
"তোমাদেরকে": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Plur",
|
||||||
|
"Person": "Two",
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Case": "Acc",
|
||||||
|
},
|
||||||
|
"তোদেরকে": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Plur",
|
||||||
|
"Person": "Two",
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Case": "Acc",
|
||||||
|
},
|
||||||
|
"আপন": {LEMMA: PRON_LEMMA, "Reflex": "Yes", "PronType": "Ref"},
|
||||||
|
"এ": {LEMMA: PRON_LEMMA, "PronType": "Dem"},
|
||||||
|
"নিজ": {LEMMA: PRON_LEMMA, "Reflex": "Yes", "PronType": "Ref"},
|
||||||
|
"কার": {LEMMA: PRON_LEMMA, "Number": "Sing", "PronType": "Int", "Case": "Acc"},
|
||||||
|
"যা": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Sing",
|
||||||
|
"Gender": "Neut",
|
||||||
|
"PronType": "Rel",
|
||||||
|
"Case": "Nom",
|
||||||
|
},
|
||||||
|
"তারা": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Plur",
|
||||||
|
"Person": "Three",
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Case": "Nom",
|
||||||
|
},
|
||||||
|
"আমি": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Sing",
|
||||||
|
"Person": "One",
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Case": "Nom",
|
||||||
|
},
|
||||||
},
|
},
|
||||||
"PRP$": {
|
"PRP$": {
|
||||||
|
"আমার": {
|
||||||
'আমার': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'One', 'PronType': 'Prs', 'Poss': 'Yes',
|
LEMMA: PRON_LEMMA,
|
||||||
'Case': 'Nom'},
|
"Number": "Sing",
|
||||||
'মোর': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'One', 'PronType': 'Prs', 'Poss': 'Yes',
|
"Person": "One",
|
||||||
'Case': 'Nom'},
|
"PronType": "Prs",
|
||||||
'মোদের': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'One', 'PronType': 'Prs', 'Poss': 'Yes',
|
"Poss": "Yes",
|
||||||
'Case': 'Nom'},
|
"Case": "Nom",
|
||||||
'তার': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Three', 'PronType': 'Prs', 'Poss': 'Yes',
|
},
|
||||||
'Case': 'Nom'},
|
"মোর": {
|
||||||
'তোমাদের': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Two', 'PronType': 'Prs', 'Poss': 'Yes',
|
LEMMA: PRON_LEMMA,
|
||||||
'Case': 'Nom'},
|
"Number": "Sing",
|
||||||
'আমাদের': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'One', 'PronType': 'Prs', 'Poss': 'Yes',
|
"Person": "One",
|
||||||
'Case': 'Nom'},
|
"PronType": "Prs",
|
||||||
'তোমার': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Two', 'PronType': 'Prs', 'Poss': 'Yes',
|
"Poss": "Yes",
|
||||||
'Case': 'Nom'},
|
"Case": "Nom",
|
||||||
'তোর': {LEMMA: PRON_LEMMA, 'Number': 'Sing', 'Person': 'Two', 'PronType': 'Prs', 'Poss': 'Yes',
|
},
|
||||||
'Case': 'Nom'},
|
"মোদের": {
|
||||||
'তাদের': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Three', 'PronType': 'Prs', 'Poss': 'Yes',
|
LEMMA: PRON_LEMMA,
|
||||||
'Case': 'Nom'},
|
"Number": "Plur",
|
||||||
'কাদের': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'PronType': 'Int', 'Case': 'Acc'},
|
"Person": "One",
|
||||||
'তোদের': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'Person': 'Two', 'PronType': 'Prs', 'Poss': 'Yes',
|
"PronType": "Prs",
|
||||||
'Case': 'Nom'},
|
"Poss": "Yes",
|
||||||
'যাদের': {LEMMA: PRON_LEMMA, 'Number': 'Plur', 'PronType': 'Int', 'Case': 'Acc'},
|
"Case": "Nom",
|
||||||
}
|
},
|
||||||
|
"তার": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Sing",
|
||||||
|
"Person": "Three",
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Poss": "Yes",
|
||||||
|
"Case": "Nom",
|
||||||
|
},
|
||||||
|
"তোমাদের": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Plur",
|
||||||
|
"Person": "Two",
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Poss": "Yes",
|
||||||
|
"Case": "Nom",
|
||||||
|
},
|
||||||
|
"আমাদের": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Plur",
|
||||||
|
"Person": "One",
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Poss": "Yes",
|
||||||
|
"Case": "Nom",
|
||||||
|
},
|
||||||
|
"তোমার": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Sing",
|
||||||
|
"Person": "Two",
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Poss": "Yes",
|
||||||
|
"Case": "Nom",
|
||||||
|
},
|
||||||
|
"তোর": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Sing",
|
||||||
|
"Person": "Two",
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Poss": "Yes",
|
||||||
|
"Case": "Nom",
|
||||||
|
},
|
||||||
|
"তাদের": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Plur",
|
||||||
|
"Person": "Three",
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Poss": "Yes",
|
||||||
|
"Case": "Nom",
|
||||||
|
},
|
||||||
|
"কাদের": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Plur",
|
||||||
|
"PronType": "Int",
|
||||||
|
"Case": "Acc",
|
||||||
|
},
|
||||||
|
"তোদের": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Plur",
|
||||||
|
"Person": "Two",
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Poss": "Yes",
|
||||||
|
"Case": "Nom",
|
||||||
|
},
|
||||||
|
"যাদের": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Number": "Plur",
|
||||||
|
"PronType": "Int",
|
||||||
|
"Case": "Acc",
|
||||||
|
},
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
|
|
@ -2,29 +2,45 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
|
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
|
||||||
from ..char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES, UNITS
|
from ..char_classes import ALPHA_LOWER, ALPHA, HYPHENS, QUOTES, UNITS
|
||||||
|
|
||||||
|
|
||||||
_currency = r"\$|¢|£|€|¥|฿|৳"
|
_currency = r"\$|¢|£|€|¥|฿|৳"
|
||||||
_quotes = QUOTES.replace("'", '')
|
_quotes = QUOTES.replace("'", "")
|
||||||
_list_punct = LIST_PUNCT + '। ॥'.strip().split()
|
_list_punct = LIST_PUNCT + "। ॥".strip().split()
|
||||||
|
|
||||||
|
|
||||||
_prefixes = ([r'\+'] + _list_punct + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS)
|
_prefixes = [r"\+"] + _list_punct + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS
|
||||||
|
|
||||||
_suffixes = (_list_punct + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
|
_suffixes = (
|
||||||
[r'(?<=[0-9])\+',
|
_list_punct
|
||||||
r'(?<=°[FfCcKk])\.',
|
+ LIST_ELLIPSES
|
||||||
r'(?<=[0-9])(?:{})'.format(_currency),
|
+ LIST_QUOTES
|
||||||
r'(?<=[0-9])(?:{})'.format(UNITS),
|
+ LIST_ICONS
|
||||||
r'(?<=[{}(?:{})])\.'.format('|'.join([ALPHA_LOWER, r'%²\-\)\]\+', QUOTES]), _currency)])
|
+ [
|
||||||
|
r"(?<=[0-9])\+",
|
||||||
|
r"(?<=°[FfCcKk])\.",
|
||||||
|
r"(?<=[0-9])(?:{})".format(_currency),
|
||||||
|
r"(?<=[0-9])(?:{})".format(UNITS),
|
||||||
|
r"(?<=[{}(?:{})])\.".format(
|
||||||
|
"|".join([ALPHA_LOWER, r"%²\-\)\]\+", QUOTES]), _currency
|
||||||
|
),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
_infixes = (LIST_ELLIPSES + LIST_ICONS +
|
_infixes = (
|
||||||
[r'(?<=[0-9{zero}-{nine}])[+\-\*^=](?=[0-9{zero}-{nine}-])'.format(zero=u'০', nine=u'৯'),
|
LIST_ELLIPSES
|
||||||
r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
|
+ LIST_ICONS
|
||||||
r'(?<=[{a}])[{h}](?={ae})'.format(a=ALPHA, h=HYPHENS, ae=u'এ'),
|
+ [
|
||||||
r'(?<=[{a}])[?";:=,.]*(?:{h})(?=[{a}])'.format(a=ALPHA, h=HYPHENS),
|
r"(?<=[0-9{zero}-{nine}])[+\-\*^=](?=[0-9{zero}-{nine}-])".format(
|
||||||
r'(?<=[{a}"])[:<>=/](?=[{a}])'.format(a=ALPHA)])
|
zero="০", nine="৯"
|
||||||
|
),
|
||||||
|
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
|
||||||
|
r"(?<=[{a}])[{h}](?={ae})".format(a=ALPHA, h=HYPHENS, ae="এ"),
|
||||||
|
r'(?<=[{a}])[?";:=,.]*(?:{h})(?=[{a}])'.format(a=ALPHA, h=HYPHENS),
|
||||||
|
r'(?<=[{a}"])[:<>=/](?=[{a}])'.format(a=ALPHA),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
TOKENIZER_PREFIXES = _prefixes
|
TOKENIZER_PREFIXES = _prefixes
|
||||||
|
|
|
@ -2,7 +2,8 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
STOP_WORDS = set("""
|
STOP_WORDS = set(
|
||||||
|
"""
|
||||||
অতএব অথচ অথবা অনুযায়ী অনেক অনেকে অনেকেই অন্তত অবধি অবশ্য অর্থাৎ অন্য অনুযায়ী অর্ধভাগে
|
অতএব অথচ অথবা অনুযায়ী অনেক অনেকে অনেকেই অন্তত অবধি অবশ্য অর্থাৎ অন্য অনুযায়ী অর্ধভাগে
|
||||||
আগামী আগে আগেই আছে আজ আদ্যভাগে আপনার আপনি আবার আমরা আমাকে আমাদের আমার আমি আর আরও
|
আগামী আগে আগেই আছে আজ আদ্যভাগে আপনার আপনি আবার আমরা আমাকে আমাদের আমার আমি আর আরও
|
||||||
ইত্যাদি ইহা
|
ইত্যাদি ইহা
|
||||||
|
@ -41,4 +42,5 @@ STOP_WORDS = set("""
|
||||||
সাধারণ সামনে সঙ্গে সঙ্গেও সব সবার সমস্ত সম্প্রতি সময় সহ সহিত সাথে সুতরাং সে সেই সেখান সেখানে সেটা সেটাই সেটাও সেটি স্পষ্ট স্বয়ং
|
সাধারণ সামনে সঙ্গে সঙ্গেও সব সবার সমস্ত সম্প্রতি সময় সহ সহিত সাথে সুতরাং সে সেই সেখান সেখানে সেটা সেটাই সেটাও সেটি স্পষ্ট স্বয়ং
|
||||||
হইতে হইবে হইয়া হওয়া হওয়ায় হওয়ার হচ্ছে হত হতে হতেই হন হবে হবেন হয় হয়তো হয়নি হয়ে হয়েই হয়েছিল হয়েছে হাজার
|
হইতে হইবে হইয়া হওয়া হওয়ায় হওয়ার হচ্ছে হত হতে হতেই হন হবে হবেন হয় হয়তো হয়নি হয়ে হয়েই হয়েছিল হয়েছে হাজার
|
||||||
হয়েছেন হল হলে হলেই হলেও হলো হিসাবে হিসেবে হৈলে হোক হয় হয়ে হয়েছে হৈতে হইয়া হয়েছিল হয়েছেন হয়নি হয়েই হয়তো হওয়া হওয়ার হওয়ায়
|
হয়েছেন হল হলে হলেই হলেও হলো হিসাবে হিসেবে হৈলে হোক হয় হয়ে হয়েছে হৈতে হইয়া হয়েছিল হয়েছেন হয়নি হয়েই হয়তো হওয়া হওয়ার হওয়ায়
|
||||||
""".split())
|
""".split()
|
||||||
|
)
|
||||||
|
|
|
@ -6,72 +6,77 @@ from ...symbols import CCONJ, NOUN, PROPN, PART, INTJ, SPACE, PRON, AUX, SYM
|
||||||
|
|
||||||
|
|
||||||
TAG_MAP = {
|
TAG_MAP = {
|
||||||
".": {POS: PUNCT, "PunctType": "peri"},
|
".": {POS: PUNCT, "PunctType": "peri"},
|
||||||
",": {POS: PUNCT, "PunctType": "comm"},
|
",": {POS: PUNCT, "PunctType": "comm"},
|
||||||
"-LRB-": {POS: PUNCT, "PunctType": "brck", "PunctSide": "ini"},
|
"-LRB-": {POS: PUNCT, "PunctType": "brck", "PunctSide": "ini"},
|
||||||
"-RRB-": {POS: PUNCT, "PunctType": "brck", "PunctSide": "fin"},
|
"-RRB-": {POS: PUNCT, "PunctType": "brck", "PunctSide": "fin"},
|
||||||
"``": {POS: PUNCT, "PunctType": "quot", "PunctSide": "ini"},
|
"``": {POS: PUNCT, "PunctType": "quot", "PunctSide": "ini"},
|
||||||
"\"\"": {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"},
|
'""': {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"},
|
||||||
"''": {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"},
|
"''": {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"},
|
||||||
":": {POS: PUNCT},
|
":": {POS: PUNCT},
|
||||||
"৳": {POS: SYM, "Other": {"SymType": "currency"}},
|
"৳": {POS: SYM, "Other": {"SymType": "currency"}},
|
||||||
"#": {POS: SYM, "Other": {"SymType": "numbersign"}},
|
"#": {POS: SYM, "Other": {"SymType": "numbersign"}},
|
||||||
"AFX": {POS: ADJ, "Hyph": "yes"},
|
"AFX": {POS: ADJ, "Hyph": "yes"},
|
||||||
"CC": {POS: CONJ, "ConjType": "coor"},
|
"CC": {POS: CONJ, "ConjType": "coor"},
|
||||||
"CD": {POS: NUM, "NumType": "card"},
|
"CD": {POS: NUM, "NumType": "card"},
|
||||||
"DT": {POS: DET},
|
"DT": {POS: DET},
|
||||||
"EX": {POS: ADV, "AdvType": "ex"},
|
"EX": {POS: ADV, "AdvType": "ex"},
|
||||||
"FW": {POS: X, "Foreign": "yes"},
|
"FW": {POS: X, "Foreign": "yes"},
|
||||||
"HYPH": {POS: PUNCT, "PunctType": "dash"},
|
"HYPH": {POS: PUNCT, "PunctType": "dash"},
|
||||||
"IN": {POS: ADP},
|
"IN": {POS: ADP},
|
||||||
"JJ": {POS: ADJ, "Degree": "pos"},
|
"JJ": {POS: ADJ, "Degree": "pos"},
|
||||||
"JJR": {POS: ADJ, "Degree": "comp"},
|
"JJR": {POS: ADJ, "Degree": "comp"},
|
||||||
"JJS": {POS: ADJ, "Degree": "sup"},
|
"JJS": {POS: ADJ, "Degree": "sup"},
|
||||||
"LS": {POS: PUNCT, "NumType": "ord"},
|
"LS": {POS: PUNCT, "NumType": "ord"},
|
||||||
"MD": {POS: VERB, "VerbType": "mod"},
|
"MD": {POS: VERB, "VerbType": "mod"},
|
||||||
"NIL": {POS: ""},
|
"NIL": {POS: ""},
|
||||||
"NN": {POS: NOUN, "Number": "sing"},
|
"NN": {POS: NOUN, "Number": "sing"},
|
||||||
"NNP": {POS: PROPN, "NounType": "prop", "Number": "sing"},
|
"NNP": {POS: PROPN, "NounType": "prop", "Number": "sing"},
|
||||||
"NNPS": {POS: PROPN, "NounType": "prop", "Number": "plur"},
|
"NNPS": {POS: PROPN, "NounType": "prop", "Number": "plur"},
|
||||||
"NNS": {POS: NOUN, "Number": "plur"},
|
"NNS": {POS: NOUN, "Number": "plur"},
|
||||||
"PDT": {POS: ADJ, "AdjType": "pdt", "PronType": "prn"},
|
"PDT": {POS: ADJ, "AdjType": "pdt", "PronType": "prn"},
|
||||||
"POS": {POS: PART, "Poss": "yes"},
|
"POS": {POS: PART, "Poss": "yes"},
|
||||||
"PRP": {POS: PRON, "PronType": "prs"},
|
"PRP": {POS: PRON, "PronType": "prs"},
|
||||||
"PRP$": {POS: ADJ, "PronType": "prs", "Poss": "yes"},
|
"PRP$": {POS: ADJ, "PronType": "prs", "Poss": "yes"},
|
||||||
"RB": {POS: ADV, "Degree": "pos"},
|
"RB": {POS: ADV, "Degree": "pos"},
|
||||||
"RBR": {POS: ADV, "Degree": "comp"},
|
"RBR": {POS: ADV, "Degree": "comp"},
|
||||||
"RBS": {POS: ADV, "Degree": "sup"},
|
"RBS": {POS: ADV, "Degree": "sup"},
|
||||||
"RP": {POS: PART},
|
"RP": {POS: PART},
|
||||||
"SYM": {POS: SYM},
|
"TO": {POS: PART, "PartType": "inf", "VerbForm": "inf"},
|
||||||
"TO": {POS: PART, "PartType": "inf", "VerbForm": "inf"},
|
"UH": {POS: INTJ},
|
||||||
"UH": {POS: INTJ},
|
"VB": {POS: VERB, "VerbForm": "inf"},
|
||||||
"VB": {POS: VERB, "VerbForm": "inf"},
|
"VBD": {POS: VERB, "VerbForm": "fin", "Tense": "past"},
|
||||||
"VBD": {POS: VERB, "VerbForm": "fin", "Tense": "past"},
|
"VBG": {POS: VERB, "VerbForm": "part", "Tense": "pres", "Aspect": "prog"},
|
||||||
"VBG": {POS: VERB, "VerbForm": "part", "Tense": "pres", "Aspect": "prog"},
|
"VBN": {POS: VERB, "VerbForm": "part", "Tense": "past", "Aspect": "perf"},
|
||||||
"VBN": {POS: VERB, "VerbForm": "part", "Tense": "past", "Aspect": "perf"},
|
"VBP": {POS: VERB, "VerbForm": "fin", "Tense": "pres"},
|
||||||
"VBP": {POS: VERB, "VerbForm": "fin", "Tense": "pres"},
|
"VBZ": {
|
||||||
"VBZ": {POS: VERB, "VerbForm": "fin", "Tense": "pres", "Number": "sing", "Person": 3},
|
POS: VERB,
|
||||||
"WDT": {POS: ADJ, "PronType": "int|rel"},
|
"VerbForm": "fin",
|
||||||
"WP": {POS: NOUN, "PronType": "int|rel"},
|
"Tense": "pres",
|
||||||
"WP$": {POS: ADJ, "Poss": "yes", "PronType": "int|rel"},
|
"Number": "sing",
|
||||||
"WRB": {POS: ADV, "PronType": "int|rel"},
|
"Person": 3,
|
||||||
"SP": {POS: SPACE},
|
},
|
||||||
"ADV": {POS: ADV},
|
"WDT": {POS: ADJ, "PronType": "int|rel"},
|
||||||
"NOUN": {POS: NOUN},
|
"WP": {POS: NOUN, "PronType": "int|rel"},
|
||||||
"ADP": {POS: ADP},
|
"WP$": {POS: ADJ, "Poss": "yes", "PronType": "int|rel"},
|
||||||
"PRON": {POS: PRON},
|
"WRB": {POS: ADV, "PronType": "int|rel"},
|
||||||
"SCONJ": {POS: SCONJ},
|
"SP": {POS: SPACE},
|
||||||
"PROPN": {POS: PROPN},
|
"ADV": {POS: ADV},
|
||||||
"DET": {POS: DET},
|
"NOUN": {POS: NOUN},
|
||||||
"SYM": {POS: SYM},
|
"ADP": {POS: ADP},
|
||||||
"INTJ": {POS: INTJ},
|
"PRON": {POS: PRON},
|
||||||
"PUNCT": {POS: PUNCT},
|
"SCONJ": {POS: SCONJ},
|
||||||
"NUM": {POS: NUM},
|
"PROPN": {POS: PROPN},
|
||||||
"AUX": {POS: AUX},
|
"DET": {POS: DET},
|
||||||
"X": {POS: X},
|
"SYM": {POS: SYM},
|
||||||
"CONJ": {POS: CONJ},
|
"INTJ": {POS: INTJ},
|
||||||
"CCONJ": {POS: CCONJ},
|
"PUNCT": {POS: PUNCT},
|
||||||
"ADJ": {POS: ADJ},
|
"NUM": {POS: NUM},
|
||||||
"VERB": {POS: VERB},
|
"AUX": {POS: AUX},
|
||||||
"PART": {POS: PART},
|
"X": {POS: X},
|
||||||
|
"CONJ": {POS: CONJ},
|
||||||
|
"CCONJ": {POS: CCONJ},
|
||||||
|
"ADJ": {POS: ADJ},
|
||||||
|
"VERB": {POS: VERB},
|
||||||
|
"PART": {POS: PART},
|
||||||
}
|
}
|
||||||
|
|
|
@ -19,7 +19,8 @@ for exc_data in [
|
||||||
{ORTH: "কি.মি", LEMMA: "কিলোমিটার"},
|
{ORTH: "কি.মি", LEMMA: "কিলোমিটার"},
|
||||||
{ORTH: "সে.মি.", LEMMA: "সেন্টিমিটার"},
|
{ORTH: "সে.মি.", LEMMA: "সেন্টিমিটার"},
|
||||||
{ORTH: "সে.মি", LEMMA: "সেন্টিমিটার"},
|
{ORTH: "সে.মি", LEMMA: "সেন্টিমিটার"},
|
||||||
{ORTH: "মি.লি.", LEMMA: "মিলিলিটার"}]:
|
{ORTH: "মি.লি.", LEMMA: "মিলিলিটার"},
|
||||||
|
]:
|
||||||
_exc[exc_data[ORTH]] = [exc_data]
|
_exc[exc_data[ORTH]] = [exc_data]
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -4,13 +4,6 @@ from __future__ import unicode_literals
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
|
|
||||||
# uncomment if files are available
|
|
||||||
# from .norm_exceptions import NORM_EXCEPTIONS
|
|
||||||
# from .tag_map import TAG_MAP
|
|
||||||
# from .morph_rules import MORPH_RULES
|
|
||||||
|
|
||||||
# uncomment if lookup-based lemmatizer is available
|
|
||||||
from .lemmatizer import LOOKUP
|
from .lemmatizer import LOOKUP
|
||||||
|
|
||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
|
@ -19,46 +12,22 @@ from ...language import Language
|
||||||
from ...attrs import LANG, NORM
|
from ...attrs import LANG, NORM
|
||||||
from ...util import update_exc, add_lookups
|
from ...util import update_exc, add_lookups
|
||||||
|
|
||||||
# Create a Language subclass
|
|
||||||
# Documentation: https://spacy.io/docs/usage/adding-languages
|
|
||||||
|
|
||||||
# This file should be placed in spacy/lang/ca (ISO code of language).
|
|
||||||
# Before submitting a pull request, make sure to remove all comments from the
|
|
||||||
# language data files, and run at least the basic tokenizer tests. Simply add the
|
|
||||||
# language ID to the list of languages in spacy/tests/conftest.py to include it
|
|
||||||
# in the basic tokenizer sanity tests. You can optionally add a fixture for the
|
|
||||||
# language's tokenizer and add more specific tests. For more info, see the
|
|
||||||
# tests documentation: https://github.com/explosion/spaCy/tree/master/spacy/tests
|
|
||||||
|
|
||||||
|
|
||||||
class CatalanDefaults(Language.Defaults):
|
class CatalanDefaults(Language.Defaults):
|
||||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||||
lex_attr_getters[LANG] = lambda text: 'ca' # ISO code
|
lex_attr_getters[LANG] = lambda text: "ca"
|
||||||
# add more norm exception dictionaries here
|
lex_attr_getters[NORM] = add_lookups(
|
||||||
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
|
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
|
||||||
|
)
|
||||||
# overwrite functions for lexical attributes
|
|
||||||
lex_attr_getters.update(LEX_ATTRS)
|
lex_attr_getters.update(LEX_ATTRS)
|
||||||
|
|
||||||
# add custom tokenizer exceptions to base exceptions
|
|
||||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||||
|
|
||||||
# add stop words
|
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
|
|
||||||
# if available: add tag map
|
|
||||||
# tag_map = dict(TAG_MAP)
|
|
||||||
|
|
||||||
# if available: add morph rules
|
|
||||||
# morph_rules = dict(MORPH_RULES)
|
|
||||||
|
|
||||||
lemma_lookup = LOOKUP
|
lemma_lookup = LOOKUP
|
||||||
|
|
||||||
|
|
||||||
class Catalan(Language):
|
class Catalan(Language):
|
||||||
lang = 'ca' # ISO code
|
lang = "ca"
|
||||||
Defaults = CatalanDefaults # set Defaults to custom language defaults
|
Defaults = CatalanDefaults
|
||||||
|
|
||||||
|
|
||||||
# set default export – this allows the language class to be lazy-loaded
|
__all__ = ["Catalan"]
|
||||||
__all__ = ['Catalan']
|
|
||||||
|
|
|
@ -5,7 +5,7 @@ from __future__ import unicode_literals
|
||||||
"""
|
"""
|
||||||
Example sentences to test spaCy and its language models.
|
Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
>>> from spacy.lang.es.examples import sentences
|
>>> from spacy.lang.ca.examples import sentences
|
||||||
>>> docs = nlp.pipe(sentences)
|
>>> docs = nlp.pipe(sentences)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
|
@ -1,33 +1,57 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
# import the symbols for the attrs you want to overwrite
|
|
||||||
from ...attrs import LIKE_NUM
|
from ...attrs import LIKE_NUM
|
||||||
|
|
||||||
|
|
||||||
# Overwriting functions for lexical attributes
|
_num_words = [
|
||||||
# Documentation: https://spacy.io/docs/usage/adding-languages#lex-attrs
|
"zero",
|
||||||
# Most of these functions, like is_lower or like_url should be language-
|
"un",
|
||||||
# independent. Others, like like_num (which includes both digits and number
|
"dos",
|
||||||
# words), require customisation.
|
"tres",
|
||||||
|
"quatre",
|
||||||
|
"cinc",
|
||||||
# Example: check if token resembles a number
|
"sis",
|
||||||
|
"set",
|
||||||
_num_words = ['zero', 'un', 'dos', 'tres', 'quatre', 'cinc', 'sis', 'set',
|
"vuit",
|
||||||
'vuit', 'nou', 'deu', 'onze', 'dotze', 'tretze', 'catorze',
|
"nou",
|
||||||
'quinze', 'setze', 'disset', 'divuit', 'dinou', 'vint',
|
"deu",
|
||||||
'trenta', 'quaranta', 'cinquanta', 'seixanta', 'setanta', 'vuitanta', 'noranta',
|
"onze",
|
||||||
'cent', 'mil', 'milió', 'bilió', 'trilió', 'quatrilió',
|
"dotze",
|
||||||
'gazilió', 'bazilió']
|
"tretze",
|
||||||
|
"catorze",
|
||||||
|
"quinze",
|
||||||
|
"setze",
|
||||||
|
"disset",
|
||||||
|
"divuit",
|
||||||
|
"dinou",
|
||||||
|
"vint",
|
||||||
|
"trenta",
|
||||||
|
"quaranta",
|
||||||
|
"cinquanta",
|
||||||
|
"seixanta",
|
||||||
|
"setanta",
|
||||||
|
"vuitanta",
|
||||||
|
"noranta",
|
||||||
|
"cent",
|
||||||
|
"mil",
|
||||||
|
"milió",
|
||||||
|
"bilió",
|
||||||
|
"trilió",
|
||||||
|
"quatrilió",
|
||||||
|
"gazilió",
|
||||||
|
"bazilió",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
def like_num(text):
|
def like_num(text):
|
||||||
text = text.replace(',', '').replace('.', '')
|
if text.startswith(("+", "-", "±", "~")):
|
||||||
|
text = text[1:]
|
||||||
|
text = text.replace(",", "").replace(".", "")
|
||||||
if text.isdigit():
|
if text.isdigit():
|
||||||
return True
|
return True
|
||||||
if text.count('/') == 1:
|
if text.count("/") == 1:
|
||||||
num, denom = text.split('/')
|
num, denom = text.split("/")
|
||||||
if num.isdigit() and denom.isdigit():
|
if num.isdigit() and denom.isdigit():
|
||||||
return True
|
return True
|
||||||
if text in _num_words:
|
if text in _num_words:
|
||||||
|
@ -35,9 +59,4 @@ def like_num(text):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
# Create dictionary of functions to overwrite. The default lex_attr_getters are
|
LEX_ATTRS = {LIKE_NUM: like_num}
|
||||||
# updated with this one, so only the functions defined here are overwritten.
|
|
||||||
|
|
||||||
LEX_ATTRS = {
|
|
||||||
LIKE_NUM: like_num
|
|
||||||
}
|
|
||||||
|
|
|
@ -2,9 +2,8 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
# Stop words
|
STOP_WORDS = set(
|
||||||
|
"""
|
||||||
STOP_WORDS = set("""
|
|
||||||
a abans ací ah així això al aleshores algun alguna algunes alguns alhora allà allí allò
|
a abans ací ah així això al aleshores algun alguna algunes alguns alhora allà allí allò
|
||||||
als altra altre altres amb ambdues ambdós anar ans apa aquell aquella aquelles aquells
|
als altra altre altres amb ambdues ambdós anar ans apa aquell aquella aquelles aquells
|
||||||
aquest aquesta aquestes aquests aquí
|
aquest aquesta aquestes aquests aquí
|
||||||
|
@ -53,4 +52,5 @@ un una unes uns us últim ús
|
||||||
|
|
||||||
va vaig vam van vas veu vosaltres vostra vostre vostres
|
va vaig vam van vas veu vosaltres vostra vostre vostres
|
||||||
|
|
||||||
""".split())
|
""".split()
|
||||||
|
)
|
||||||
|
|
|
@ -5,32 +5,24 @@ from ..symbols import POS, ADV, NOUN, ADP, PRON, SCONJ, PROPN, DET, SYM, INTJ
|
||||||
from ..symbols import PUNCT, NUM, AUX, X, CONJ, ADJ, VERB, PART, SPACE, CCONJ
|
from ..symbols import PUNCT, NUM, AUX, X, CONJ, ADJ, VERB, PART, SPACE, CCONJ
|
||||||
|
|
||||||
|
|
||||||
# Add a tag map
|
|
||||||
# Documentation: https://spacy.io/docs/usage/adding-languages#tag-map
|
|
||||||
# Universal Dependencies: http://universaldependencies.org/u/pos/all.html
|
|
||||||
# The keys of the tag map should be strings in your tag set. The dictionary must
|
|
||||||
# have an entry POS whose value is one of the Universal Dependencies tags.
|
|
||||||
# Optionally, you can also include morphological features or other attributes.
|
|
||||||
|
|
||||||
|
|
||||||
TAG_MAP = {
|
TAG_MAP = {
|
||||||
"ADV": {POS: ADV},
|
"ADV": {POS: ADV},
|
||||||
"NOUN": {POS: NOUN},
|
"NOUN": {POS: NOUN},
|
||||||
"ADP": {POS: ADP},
|
"ADP": {POS: ADP},
|
||||||
"PRON": {POS: PRON},
|
"PRON": {POS: PRON},
|
||||||
"SCONJ": {POS: SCONJ},
|
"SCONJ": {POS: SCONJ},
|
||||||
"PROPN": {POS: PROPN},
|
"PROPN": {POS: PROPN},
|
||||||
"DET": {POS: DET},
|
"DET": {POS: DET},
|
||||||
"SYM": {POS: SYM},
|
"SYM": {POS: SYM},
|
||||||
"INTJ": {POS: INTJ},
|
"INTJ": {POS: INTJ},
|
||||||
"PUNCT": {POS: PUNCT},
|
"PUNCT": {POS: PUNCT},
|
||||||
"NUM": {POS: NUM},
|
"NUM": {POS: NUM},
|
||||||
"AUX": {POS: AUX},
|
"AUX": {POS: AUX},
|
||||||
"X": {POS: X},
|
"X": {POS: X},
|
||||||
"CONJ": {POS: CONJ},
|
"CONJ": {POS: CONJ},
|
||||||
"CCONJ": {POS: CCONJ},
|
"CCONJ": {POS: CCONJ},
|
||||||
"ADJ": {POS: ADJ},
|
"ADJ": {POS: ADJ},
|
||||||
"VERB": {POS: VERB},
|
"VERB": {POS: VERB},
|
||||||
"PART": {POS: PART},
|
"PART": {POS: PART},
|
||||||
"SP": {POS: SPACE}
|
"SP": {POS: SPACE},
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,8 +1,7 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
# import symbols – if you need to use more, add them here
|
from ...symbols import ORTH, LEMMA
|
||||||
from ...symbols import ORTH, LEMMA, TAG, NORM, ADP, DET
|
|
||||||
|
|
||||||
|
|
||||||
_exc = {}
|
_exc = {}
|
||||||
|
@ -25,27 +24,18 @@ for exc_data in [
|
||||||
{ORTH: "Srta.", LEMMA: "senyoreta"},
|
{ORTH: "Srta.", LEMMA: "senyoreta"},
|
||||||
{ORTH: "núm", LEMMA: "número"},
|
{ORTH: "núm", LEMMA: "número"},
|
||||||
{ORTH: "St.", LEMMA: "sant"},
|
{ORTH: "St.", LEMMA: "sant"},
|
||||||
{ORTH: "Sta.", LEMMA: "santa"}]:
|
{ORTH: "Sta.", LEMMA: "santa"},
|
||||||
|
]:
|
||||||
_exc[exc_data[ORTH]] = [exc_data]
|
_exc[exc_data[ORTH]] = [exc_data]
|
||||||
|
|
||||||
# Times
|
# Times
|
||||||
|
_exc["12m."] = [{ORTH: "12"}, {ORTH: "m.", LEMMA: "p.m."}]
|
||||||
_exc["12m."] = [
|
|
||||||
{ORTH: "12"},
|
|
||||||
{ORTH: "m.", LEMMA: "p.m."}]
|
|
||||||
|
|
||||||
|
|
||||||
for h in range(1, 12 + 1):
|
for h in range(1, 12 + 1):
|
||||||
for period in ["a.m.", "am"]:
|
for period in ["a.m.", "am"]:
|
||||||
_exc["%d%s" % (h, period)] = [
|
_exc["%d%s" % (h, period)] = [{ORTH: "%d" % h}, {ORTH: period, LEMMA: "a.m."}]
|
||||||
{ORTH: "%d" % h},
|
|
||||||
{ORTH: period, LEMMA: "a.m."}]
|
|
||||||
for period in ["p.m.", "pm"]:
|
for period in ["p.m.", "pm"]:
|
||||||
_exc["%d%s" % (h, period)] = [
|
_exc["%d%s" % (h, period)] = [{ORTH: "%d" % h}, {ORTH: period, LEMMA: "p.m."}]
|
||||||
{ORTH: "%d" % h},
|
|
||||||
{ORTH: period, LEMMA: "p.m."}]
|
|
||||||
|
|
||||||
# To keep things clean and readable, it's recommended to only declare the
|
|
||||||
# TOKENIZER_EXCEPTIONS at the bottom:
|
|
||||||
|
|
||||||
TOKENIZER_EXCEPTIONS = _exc
|
TOKENIZER_EXCEPTIONS = _exc
|
||||||
|
|
|
@ -4,23 +4,23 @@ from __future__ import unicode_literals
|
||||||
import regex as re
|
import regex as re
|
||||||
|
|
||||||
re.DEFAULT_VERSION = re.VERSION1
|
re.DEFAULT_VERSION = re.VERSION1
|
||||||
merge_char_classes = lambda classes: '[{}]'.format('||'.join(classes))
|
def merge_char_classes(classes):
    """Combine several character-class patterns into one bracketed class.

    The given pattern strings are joined with ``||`` and the result is
    wrapped in a single pair of square brackets.

    classes (iterable of str): The character-class patterns to combine.
    RETURNS (str): The merged pattern, e.g. ``[[a-z]||[0-9]]``.
    """
    joined = "||".join(classes)
    return "[" + joined + "]"
|
||||||
split_chars = lambda char: list(char.strip().split(' '))
|
def split_chars(char):
    """Turn a space-separated string of symbols into a list of symbols.

    Surrounding whitespace is stripped first; the remainder is split on
    every single space character (so consecutive spaces yield empty items).

    char (str): Space-separated symbols.
    RETURNS (list): The individual symbols, in order.
    """
    trimmed = char.strip()
    return trimmed.split(" ")
|
||||||
merge_chars = lambda char: char.strip().replace(' ', '|')
|
def merge_chars(char):
    """Turn a space-separated string of symbols into a ``|`` alternation.

    Surrounding whitespace is stripped first; every remaining single space
    becomes a ``|`` separator.

    char (str): Space-separated symbols.
    RETURNS (str): The symbols joined by ``|``.
    """
    pieces = char.strip().split(" ")
    return "|".join(pieces)
|
||||||
|
|
||||||
_bengali = r'[\p{L}&&\p{Bengali}]'
|
_bengali = r"[\p{L}&&\p{Bengali}]"
|
||||||
_hebrew = r'[\p{L}&&\p{Hebrew}]'
|
_hebrew = r"[\p{L}&&\p{Hebrew}]"
|
||||||
_latin_lower = r'[\p{Ll}&&\p{Latin}]'
|
_latin_lower = r"[\p{Ll}&&\p{Latin}]"
|
||||||
_latin_upper = r'[\p{Lu}&&\p{Latin}]'
|
_latin_upper = r"[\p{Lu}&&\p{Latin}]"
|
||||||
_latin = r'[[\p{Ll}||\p{Lu}]&&\p{Latin}]'
|
_latin = r"[[\p{Ll}||\p{Lu}]&&\p{Latin}]"
|
||||||
_persian = r'[\p{L}&&\p{Arabic}]'
|
_persian = r"[\p{L}&&\p{Arabic}]"
|
||||||
_russian_lower = r'[ёа-я]'
|
_russian_lower = r"[ёа-я]"
|
||||||
_russian_upper = r'[ЁА-Я]'
|
_russian_upper = r"[ЁА-Я]"
|
||||||
_sinhala = r'[\p{L}&&\p{Sinhala}]'
|
_sinhala = r"[\p{L}&&\p{Sinhala}]"
|
||||||
_tatar_lower = r'[әөүҗңһ]'
|
_tatar_lower = r"[әөүҗңһ]"
|
||||||
_tatar_upper = r'[ӘӨҮҖҢҺ]'
|
_tatar_upper = r"[ӘӨҮҖҢҺ]"
|
||||||
_greek_lower = r'[α-ωάέίόώήύ]'
|
_greek_lower = r"[α-ωάέίόώήύ]"
|
||||||
_greek_upper = r'[Α-ΩΆΈΊΌΏΉΎ]'
|
_greek_upper = r"[Α-ΩΆΈΊΌΏΉΎ]"
|
||||||
|
|
||||||
_upper = [_latin_upper, _russian_upper, _tatar_upper, _greek_upper]
|
_upper = [_latin_upper, _russian_upper, _tatar_upper, _greek_upper]
|
||||||
_lower = [_latin_lower, _russian_lower, _tatar_lower, _greek_lower]
|
_lower = [_latin_lower, _russian_lower, _tatar_lower, _greek_lower]
|
||||||
|
@ -30,23 +30,27 @@ ALPHA = merge_char_classes(_upper + _lower + _uncased)
|
||||||
ALPHA_LOWER = merge_char_classes(_lower + _uncased)
|
ALPHA_LOWER = merge_char_classes(_lower + _uncased)
|
||||||
ALPHA_UPPER = merge_char_classes(_upper + _uncased)
|
ALPHA_UPPER = merge_char_classes(_upper + _uncased)
|
||||||
|
|
||||||
_units = ('km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft '
|
_units = (
|
||||||
'kg g mg µg t lb oz m/s km/h kmh mph hPa Pa mbar mb MB kb KB gb GB tb '
|
"km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft "
|
||||||
'TB T G M K % км км² км³ м м² м³ дм дм² дм³ см см² см³ мм мм² мм³ нм '
|
"kg g mg µg t lb oz m/s km/h kmh mph hPa Pa mbar mb MB kb KB gb GB tb "
|
||||||
'кг г мг м/с км/ч кПа Па мбар Кб КБ кб Мб МБ мб Гб ГБ гб Тб ТБ тб'
|
"TB T G M K % км км² км³ м м² м³ дм дм² дм³ см см² см³ мм мм² мм³ нм "
|
||||||
'كم كم² كم³ م م² م³ سم سم² سم³ مم مم² مم³ كم غرام جرام جم كغ ملغ كوب اكواب')
|
"кг г мг м/с км/ч кПа Па мбар Кб КБ кб Мб МБ мб Гб ГБ гб Тб ТБ тб"
|
||||||
_currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$ ₽ ﷼'
|
"كم كم² كم³ م م² م³ سم سم² سم³ مم مم² مم³ كم غرام جرام جم كغ ملغ كوب اكواب"
|
||||||
|
)
|
||||||
|
_currency = r"\$ £ € ¥ ฿ US\$ C\$ A\$ ₽ ﷼"
|
||||||
|
|
||||||
# These expressions contain various unicode variations, including characters
|
# These expressions contain various unicode variations, including characters
|
||||||
# used in Chinese (see #1333, #1340, #1351) – unless there are cross-language
|
# used in Chinese (see #1333, #1340, #1351) – unless there are cross-language
|
||||||
# conflicts, spaCy's base tokenizer should handle all of those by default
|
# conflicts, spaCy's base tokenizer should handle all of those by default
|
||||||
_punct = r'… …… , : ; \! \? ¿ ؟ ¡ \( \) \[ \] \{ \} < > _ # \* & 。 ? ! , 、 ; : ~ · । ، ؛ ٪'
|
_punct = (
|
||||||
|
r"… …… , : ; \! \? ¿ ؟ ¡ \( \) \[ \] \{ \} < > _ # \* & 。 ? ! , 、 ; : ~ · । ، ؛ ٪"
|
||||||
|
)
|
||||||
_quotes = r'\' \'\' " ” “ `` ` ‘ ´ ‘‘ ’’ ‚ , „ » « 「 」 『 』 ( ) 〔 〕 【 】 《 》 〈 〉'
|
_quotes = r'\' \'\' " ” “ `` ` ‘ ´ ‘‘ ’’ ‚ , „ » « 「 」 『 』 ( ) 〔 〕 【 】 《 》 〈 〉'
|
||||||
_hyphens = '- – — -- --- —— ~'
|
_hyphens = "- – — -- --- —— ~"
|
||||||
|
|
||||||
# Various symbols like dingbats, but also emoji
|
# Various symbols like dingbats, but also emoji
|
||||||
# Details: https://www.compart.com/en/unicode/category/So
|
# Details: https://www.compart.com/en/unicode/category/So
|
||||||
_other_symbols = r'[\p{So}]'
|
_other_symbols = r"[\p{So}]"
|
||||||
|
|
||||||
UNITS = merge_chars(_units)
|
UNITS = merge_chars(_units)
|
||||||
CURRENCY = merge_chars(_currency)
|
CURRENCY = merge_chars(_currency)
|
||||||
|
@ -60,5 +64,5 @@ LIST_CURRENCY = split_chars(_currency)
|
||||||
LIST_QUOTES = split_chars(_quotes)
|
LIST_QUOTES = split_chars(_quotes)
|
||||||
LIST_PUNCT = split_chars(_punct)
|
LIST_PUNCT = split_chars(_punct)
|
||||||
LIST_HYPHENS = split_chars(_hyphens)
|
LIST_HYPHENS = split_chars(_hyphens)
|
||||||
LIST_ELLIPSES = [r'\.\.+', '…']
|
LIST_ELLIPSES = [r"\.\.+", "…"]
|
||||||
LIST_ICONS = [_other_symbols]
|
LIST_ICONS = [_other_symbols]
|
||||||
|
|
|
@ -20,9 +20,10 @@ from ...util import update_exc, add_lookups
|
||||||
class DanishDefaults(Language.Defaults):
|
class DanishDefaults(Language.Defaults):
|
||||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||||
lex_attr_getters.update(LEX_ATTRS)
|
lex_attr_getters.update(LEX_ATTRS)
|
||||||
lex_attr_getters[LANG] = lambda text: 'da'
|
lex_attr_getters[LANG] = lambda text: "da"
|
||||||
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM],
|
lex_attr_getters[NORM] = add_lookups(
|
||||||
BASE_NORMS, NORM_EXCEPTIONS)
|
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS
|
||||||
|
)
|
||||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||||
morph_rules = MORPH_RULES
|
morph_rules = MORPH_RULES
|
||||||
infixes = TOKENIZER_INFIXES
|
infixes = TOKENIZER_INFIXES
|
||||||
|
@ -33,8 +34,8 @@ class DanishDefaults(Language.Defaults):
|
||||||
|
|
||||||
|
|
||||||
class Danish(Language):
|
class Danish(Language):
|
||||||
lang = 'da'
|
lang = "da"
|
||||||
Defaults = DanishDefaults
|
Defaults = DanishDefaults
|
||||||
|
|
||||||
|
|
||||||
__all__ = ['Danish']
|
__all__ = ["Danish"]
|
||||||
|
|
|
@ -14,5 +14,5 @@ sentences = [
|
||||||
"Apple overvejer at købe et britisk startup for 1 milliard dollar",
|
"Apple overvejer at købe et britisk startup for 1 milliard dollar",
|
||||||
"Selvkørende biler flytter forsikringsansvaret over på producenterne",
|
"Selvkørende biler flytter forsikringsansvaret over på producenterne",
|
||||||
"San Francisco overvejer at forbyde udbringningsrobotter på fortov",
|
"San Francisco overvejer at forbyde udbringningsrobotter på fortov",
|
||||||
"London er en stor by i Storbritannien"
|
"London er en stor by i Storbritannien",
|
||||||
]
|
]
|
||||||
|
|
|
@ -3,8 +3,8 @@ from __future__ import unicode_literals
|
||||||
|
|
||||||
from ...attrs import LIKE_NUM
|
from ...attrs import LIKE_NUM
|
||||||
|
|
||||||
# Source http://fjern-uv.dk/tal.php
|
|
||||||
|
|
||||||
|
# Source: http://fjern-uv.dk/tal.php
|
||||||
_num_words = """nul
|
_num_words = """nul
|
||||||
en et to tre fire fem seks syv otte ni ti
|
en et to tre fire fem seks syv otte ni ti
|
||||||
elleve tolv tretten fjorten femten seksten sytten atten nitten tyve
|
elleve tolv tretten fjorten femten seksten sytten atten nitten tyve
|
||||||
|
@ -19,8 +19,8 @@ enoghalvfems tooghalvfems treoghalvfems fireoghalvfems femoghalvfems seksoghalvf
|
||||||
million milliard billion billiard trillion trilliard
|
million milliard billion billiard trillion trilliard
|
||||||
""".split()
|
""".split()
|
||||||
|
|
||||||
# source http://www.duda.dk/video/dansk/grammatik/talord/talord.html
|
|
||||||
|
|
||||||
|
# Source: http://www.duda.dk/video/dansk/grammatik/talord/talord.html
|
||||||
_ordinal_words = """nulte
|
_ordinal_words = """nulte
|
||||||
første anden tredje fjerde femte sjette syvende ottende niende tiende
|
første anden tredje fjerde femte sjette syvende ottende niende tiende
|
||||||
elfte tolvte trettende fjortende femtende sekstende syttende attende nittende tyvende
|
elfte tolvte trettende fjortende femtende sekstende syttende attende nittende tyvende
|
||||||
|
@ -33,14 +33,15 @@ enogfirsindstyvende toogfirsindstyvende treogfirsindstyvende fireogfirsindstyven
|
||||||
enoghalvfemsindstyvende tooghalvfemsindstyvende treoghalvfemsindstyvende fireoghalvfemsindstyvende femoghalvfemsindstyvende seksoghalvfemsindstyvende syvoghalvfemsindstyvende otteoghalvfemsindstyvende nioghalvfemsindstyvende
|
enoghalvfemsindstyvende tooghalvfemsindstyvende treoghalvfemsindstyvende fireoghalvfemsindstyvende femoghalvfemsindstyvende seksoghalvfemsindstyvende syvoghalvfemsindstyvende otteoghalvfemsindstyvende nioghalvfemsindstyvende
|
||||||
""".split()
|
""".split()
|
||||||
|
|
||||||
|
|
||||||
def like_num(text):
|
def like_num(text):
|
||||||
if text.startswith(('+', '-', '±', '~')):
|
if text.startswith(("+", "-", "±", "~")):
|
||||||
text = text[1:]
|
text = text[1:]
|
||||||
text = text.replace(',', '').replace('.', '')
|
text = text.replace(",", "").replace(".", "")
|
||||||
if text.isdigit():
|
if text.isdigit():
|
||||||
return True
|
return True
|
||||||
if text.count('/') == 1:
|
if text.count("/") == 1:
|
||||||
num, denom = text.split('/')
|
num, denom = text.split("/")
|
||||||
if num.isdigit() and denom.isdigit():
|
if num.isdigit() and denom.isdigit():
|
||||||
return True
|
return True
|
||||||
if text.lower() in _num_words:
|
if text.lower() in _num_words:
|
||||||
|
@ -49,6 +50,5 @@ def like_num(text):
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
LEX_ATTRS = {
|
|
||||||
LIKE_NUM: like_num
|
LEX_ATTRS = {LIKE_NUM: like_num}
|
||||||
}
|
|
||||||
|
|
|
@ -11,53 +11,299 @@ from ...symbols import LEMMA, PRON_LEMMA
|
||||||
|
|
||||||
MORPH_RULES = {
|
MORPH_RULES = {
|
||||||
"PRON": {
|
"PRON": {
|
||||||
"jeg": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Sing", "Case": "Nom", "Gender": "Com"}, # Case=Nom|Gender=Com|Number=Sing|Person=1|PronType=Prs
|
"jeg": {
|
||||||
"mig": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Sing", "Case": "Acc", "Gender": "Com"}, # Case=Acc|Gender=Com|Number=Sing|Person=1|PronType=Prs
|
LEMMA: PRON_LEMMA,
|
||||||
"min": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Sing", "Poss": "Yes", "Gender": "Com"}, # Gender=Com|Number=Sing|Number[psor]=Sing|Person=1|Poss=Yes|PronType=Prs
|
"PronType": "Prs",
|
||||||
"mit": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Sing", "Poss": "Yes", "Gender": "Neut"}, # Gender=Neut|Number=Sing|Number[psor]=Sing|Person=1|Poss=Yes|PronType=Prs
|
"Person": "One",
|
||||||
"vor": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Sing", "Poss": "Yes", "Gender": "Com"}, # Gender=Com|Number=Sing|Number[psor]=Plur|Person=1|Poss=Yes|PronType=Prs|Style=Form
|
"Number": "Sing",
|
||||||
"vort": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Sing", "Poss": "Yes", "Gender": "Neut"}, # Gender=Neut|Number=Sing|Number[psor]=Plur|Person=1|Poss=Yes|PronType=Prs|Style=Form
|
"Case": "Nom",
|
||||||
"du": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two", "Number": "Sing", "Case": "Nom", "Gender": "Com"}, # Case=Nom|Gender=Com|Number=Sing|Person=2|PronType=Prs
|
"Gender": "Com",
|
||||||
"dig": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two", "Number": "Sing", "Case": "Acc", "Gender": "Com"}, # Case=Acc|Gender=Com|Number=Sing|Person=2|PronType=Prs
|
}, # Case=Nom|Gender=Com|Number=Sing|Person=1|PronType=Prs
|
||||||
"din": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two", "Number": "Sing", "Poss": "Yes", "Gender": "Com"}, # Gender=Com|Number=Sing|Number[psor]=Sing|Person=2|Poss=Yes|PronType=Prs
|
"mig": {
|
||||||
"dit": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two", "Number": "Sing", "Poss": "Yes", "Gender": "Neut"}, # Gender=Neut|Number=Sing|Number[psor]=Sing|Person=2|Poss=Yes|PronType=Prs
|
LEMMA: PRON_LEMMA,
|
||||||
"han": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Case": "Nom", "Gender": "Com"}, # Case=Nom|Gender=Com|Number=Sing|Person=3|PronType=Prs
|
"PronType": "Prs",
|
||||||
"hun": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Case": "Nom", "Gender": "Com"}, # Case=Nom|Gender=Com|Number=Sing|Person=3|PronType=Prs
|
"Person": "One",
|
||||||
"den": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Com"}, # Case=Acc|Gender=Com|Number=Sing|Person=3|PronType=Prs, See note above.
|
"Number": "Sing",
|
||||||
"det": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Neut"}, # Case=Acc|Gender=Neut|Number=Sing|Person=3|PronType=Prs See note above.
|
"Case": "Acc",
|
||||||
"ham": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Case": "Acc", "Gender": "Com"}, # Case=Acc|Gender=Com|Number=Sing|Person=3|PronType=Prs
|
"Gender": "Com",
|
||||||
"hende": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Case": "Acc", "Gender": "Com"}, # Case=Acc|Gender=Com|Number=Sing|Person=3|PronType=Prs
|
}, # Case=Acc|Gender=Com|Number=Sing|Person=1|PronType=Prs
|
||||||
"sin": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Poss": "Yes", "Gender": "Com", "Reflex": "Yes"}, # Gender=Com|Number=Sing|Number[psor]=Sing|Person=3|Poss=Yes|PronType=Prs|Reflex=Yes
|
"min": {
|
||||||
"sit": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Poss": "Yes", "Gender": "Neut", "Reflex": "Yes"}, # Gender=Neut|Number=Sing|Number[psor]=Sing|Person=3|Poss=Yes|PronType=Prs|Reflex=Yes
|
LEMMA: PRON_LEMMA,
|
||||||
|
"PronType": "Prs",
|
||||||
"vi": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Plur", "Case": "Nom", "Gender": "Com"}, # Case=Nom|Gender=Com|Number=Plur|Person=1|PronType=Prs
|
"Person": "One",
|
||||||
"os": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Plur", "Case": "Acc", "Gender": "Com"}, # Case=Acc|Gender=Com|Number=Plur|Person=1|PronType=Prs
|
"Number": "Sing",
|
||||||
"mine": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Plur", "Poss": "Yes"}, # Number=Plur|Number[psor]=Sing|Person=1|Poss=Yes|PronType=Prs
|
"Poss": "Yes",
|
||||||
"vore": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Plur", "Poss": "Yes"}, # Number=Plur|Number[psor]=Plur|Person=1|Poss=Yes|PronType=Prs|Style=Form
|
"Gender": "Com",
|
||||||
"I": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two", "Number": "Plur", "Case": "Nom", "Gender": "Com"}, # Case=Nom|Gender=Com|Number=Plur|Person=2|PronType=Prs
|
}, # Gender=Com|Number=Sing|Number[psor]=Sing|Person=1|Poss=Yes|PronType=Prs
|
||||||
"jer": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two", "Number": "Plur", "Case": "Acc", "Gender": "Com"}, # Case=Acc|Gender=Com|Number=Plur|Person=2|PronType=Prs
|
"mit": {
|
||||||
"dine": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two", "Number": "Plur", "Poss": "Yes"}, # Number=Plur|Number[psor]=Sing|Person=2|Poss=Yes|PronType=Prs
|
LEMMA: PRON_LEMMA,
|
||||||
"de": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Plur", "Case": "Nom"}, # Case=Nom|Number=Plur|Person=3|PronType=Prs
|
"PronType": "Prs",
|
||||||
"dem": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Plur", "Case": "Acc"}, # Case=Acc|Number=Plur|Person=3|PronType=Prs
|
"Person": "One",
|
||||||
"sine": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Plur", "Poss": "Yes", "Reflex": "Yes"}, # Number=Plur|Number[psor]=Sing|Person=3|Poss=Yes|PronType=Prs|Reflex=Yes
|
"Number": "Sing",
|
||||||
|
"Poss": "Yes",
|
||||||
"vores": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Poss": "Yes"}, # Number[psor]=Plur|Person=1|Poss=Yes|PronType=Prs
|
"Gender": "Neut",
|
||||||
"De": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two", "Case": "Nom", "Gender": "Com"}, # Case=Nom|Gender=Com|Person=2|Polite=Form|PronType=Prs
|
}, # Gender=Neut|Number=Sing|Number[psor]=Sing|Person=1|Poss=Yes|PronType=Prs
|
||||||
"Dem": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two", "Case": "Acc", "Gender": "Com"}, # Case=Acc|Gender=Com|Person=2|Polite=Form|PronType=Prs
|
"vor": {
|
||||||
"Deres": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two", "Poss": "Yes"}, # Person=2|Polite=Form|Poss=Yes|PronType=Prs
|
LEMMA: PRON_LEMMA,
|
||||||
"jeres": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two", "Poss": "Yes"}, # Number[psor]=Plur|Person=2|Poss=Yes|PronType=Prs
|
"PronType": "Prs",
|
||||||
"sig": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Case": "Acc", "Reflex": "Yes"}, # Case=Acc|Person=3|PronType=Prs|Reflex=Yes
|
"Person": "One",
|
||||||
"hans": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Poss": "Yes"}, # Number[psor]=Sing|Person=3|Poss=Yes|PronType=Prs
|
"Number": "Sing",
|
||||||
"hendes": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Poss": "Yes"}, # Number[psor]=Sing|Person=3|Poss=Yes|PronType=Prs
|
"Poss": "Yes",
|
||||||
"dens": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Poss": "Yes"}, # Number[psor]=Sing|Person=3|Poss=Yes|PronType=Prs
|
"Gender": "Com",
|
||||||
"dets": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Poss": "Yes"}, # Number[psor]=Sing|Person=3|Poss=Yes|PronType=Prs
|
}, # Gender=Com|Number=Sing|Number[psor]=Plur|Person=1|Poss=Yes|PronType=Prs|Style=Form
|
||||||
"deres": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Poss": "Yes"}, # Number[psor]=Plur|Person=3|Poss=Yes|PronType=Prs
|
"vort": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Person": "One",
|
||||||
|
"Number": "Sing",
|
||||||
|
"Poss": "Yes",
|
||||||
|
"Gender": "Neut",
|
||||||
|
}, # Gender=Neut|Number=Sing|Number[psor]=Plur|Person=1|Poss=Yes|PronType=Prs|Style=Form
|
||||||
|
"du": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Person": "Two",
|
||||||
|
"Number": "Sing",
|
||||||
|
"Case": "Nom",
|
||||||
|
"Gender": "Com",
|
||||||
|
}, # Case=Nom|Gender=Com|Number=Sing|Person=2|PronType=Prs
|
||||||
|
"dig": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Person": "Two",
|
||||||
|
"Number": "Sing",
|
||||||
|
"Case": "Acc",
|
||||||
|
"Gender": "Com",
|
||||||
|
}, # Case=Acc|Gender=Com|Number=Sing|Person=2|PronType=Prs
|
||||||
|
"din": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Person": "Two",
|
||||||
|
"Number": "Sing",
|
||||||
|
"Poss": "Yes",
|
||||||
|
"Gender": "Com",
|
||||||
|
}, # Gender=Com|Number=Sing|Number[psor]=Sing|Person=2|Poss=Yes|PronType=Prs
|
||||||
|
"dit": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Person": "Two",
|
||||||
|
"Number": "Sing",
|
||||||
|
"Poss": "Yes",
|
||||||
|
"Gender": "Neut",
|
||||||
|
}, # Gender=Neut|Number=Sing|Number[psor]=Sing|Person=2|Poss=Yes|PronType=Prs
|
||||||
|
"han": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Person": "Three",
|
||||||
|
"Number": "Sing",
|
||||||
|
"Case": "Nom",
|
||||||
|
"Gender": "Com",
|
||||||
|
}, # Case=Nom|Gender=Com|Number=Sing|Person=3|PronType=Prs
|
||||||
|
"hun": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Person": "Three",
|
||||||
|
"Number": "Sing",
|
||||||
|
"Case": "Nom",
|
||||||
|
"Gender": "Com",
|
||||||
|
}, # Case=Nom|Gender=Com|Number=Sing|Person=3|PronType=Prs
|
||||||
|
"den": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Person": "Three",
|
||||||
|
"Number": "Sing",
|
||||||
|
"Gender": "Com",
|
||||||
|
}, # Case=Acc|Gender=Com|Number=Sing|Person=3|PronType=Prs, See note above.
|
||||||
|
"det": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Person": "Three",
|
||||||
|
"Number": "Sing",
|
||||||
|
"Gender": "Neut",
|
||||||
|
}, # Case=Acc|Gender=Neut|Number=Sing|Person=3|PronType=Prs See note above.
|
||||||
|
"ham": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Person": "Three",
|
||||||
|
"Number": "Sing",
|
||||||
|
"Case": "Acc",
|
||||||
|
"Gender": "Com",
|
||||||
|
}, # Case=Acc|Gender=Com|Number=Sing|Person=3|PronType=Prs
|
||||||
|
"hende": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Person": "Three",
|
||||||
|
"Number": "Sing",
|
||||||
|
"Case": "Acc",
|
||||||
|
"Gender": "Com",
|
||||||
|
}, # Case=Acc|Gender=Com|Number=Sing|Person=3|PronType=Prs
|
||||||
|
"sin": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Person": "Three",
|
||||||
|
"Number": "Sing",
|
||||||
|
"Poss": "Yes",
|
||||||
|
"Gender": "Com",
|
||||||
|
"Reflex": "Yes",
|
||||||
|
}, # Gender=Com|Number=Sing|Number[psor]=Sing|Person=3|Poss=Yes|PronType=Prs|Reflex=Yes
|
||||||
|
"sit": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Person": "Three",
|
||||||
|
"Number": "Sing",
|
||||||
|
"Poss": "Yes",
|
||||||
|
"Gender": "Neut",
|
||||||
|
"Reflex": "Yes",
|
||||||
|
}, # Gender=Neut|Number=Sing|Number[psor]=Sing|Person=3|Poss=Yes|PronType=Prs|Reflex=Yes
|
||||||
|
"vi": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Person": "One",
|
||||||
|
"Number": "Plur",
|
||||||
|
"Case": "Nom",
|
||||||
|
"Gender": "Com",
|
||||||
|
}, # Case=Nom|Gender=Com|Number=Plur|Person=1|PronType=Prs
|
||||||
|
"os": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Person": "One",
|
||||||
|
"Number": "Plur",
|
||||||
|
"Case": "Acc",
|
||||||
|
"Gender": "Com",
|
||||||
|
}, # Case=Acc|Gender=Com|Number=Plur|Person=1|PronType=Prs
|
||||||
|
"mine": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Person": "One",
|
||||||
|
"Number": "Plur",
|
||||||
|
"Poss": "Yes",
|
||||||
|
}, # Number=Plur|Number[psor]=Sing|Person=1|Poss=Yes|PronType=Prs
|
||||||
|
"vore": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Person": "One",
|
||||||
|
"Number": "Plur",
|
||||||
|
"Poss": "Yes",
|
||||||
|
}, # Number=Plur|Number[psor]=Plur|Person=1|Poss=Yes|PronType=Prs|Style=Form
|
||||||
|
"I": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Person": "Two",
|
||||||
|
"Number": "Plur",
|
||||||
|
"Case": "Nom",
|
||||||
|
"Gender": "Com",
|
||||||
|
}, # Case=Nom|Gender=Com|Number=Plur|Person=2|PronType=Prs
|
||||||
|
"jer": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Person": "Two",
|
||||||
|
"Number": "Plur",
|
||||||
|
"Case": "Acc",
|
||||||
|
"Gender": "Com",
|
||||||
|
}, # Case=Acc|Gender=Com|Number=Plur|Person=2|PronType=Prs
|
||||||
|
"dine": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Person": "Two",
|
||||||
|
"Number": "Plur",
|
||||||
|
"Poss": "Yes",
|
||||||
|
}, # Number=Plur|Number[psor]=Sing|Person=2|Poss=Yes|PronType=Prs
|
||||||
|
"de": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Person": "Three",
|
||||||
|
"Number": "Plur",
|
||||||
|
"Case": "Nom",
|
||||||
|
}, # Case=Nom|Number=Plur|Person=3|PronType=Prs
|
||||||
|
"dem": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Person": "Three",
|
||||||
|
"Number": "Plur",
|
||||||
|
"Case": "Acc",
|
||||||
|
}, # Case=Acc|Number=Plur|Person=3|PronType=Prs
|
||||||
|
"sine": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Person": "Three",
|
||||||
|
"Number": "Plur",
|
||||||
|
"Poss": "Yes",
|
||||||
|
"Reflex": "Yes",
|
||||||
|
}, # Number=Plur|Number[psor]=Sing|Person=3|Poss=Yes|PronType=Prs|Reflex=Yes
|
||||||
|
"vores": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Person": "One",
|
||||||
|
"Poss": "Yes",
|
||||||
|
}, # Number[psor]=Plur|Person=1|Poss=Yes|PronType=Prs
|
||||||
|
"De": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Person": "Two",
|
||||||
|
"Case": "Nom",
|
||||||
|
"Gender": "Com",
|
||||||
|
}, # Case=Nom|Gender=Com|Person=2|Polite=Form|PronType=Prs
|
||||||
|
"Dem": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Person": "Two",
|
||||||
|
"Case": "Acc",
|
||||||
|
"Gender": "Com",
|
||||||
|
}, # Case=Acc|Gender=Com|Person=2|Polite=Form|PronType=Prs
|
||||||
|
"Deres": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Person": "Two",
|
||||||
|
"Poss": "Yes",
|
||||||
|
}, # Person=2|Polite=Form|Poss=Yes|PronType=Prs
|
||||||
|
"jeres": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Person": "Two",
|
||||||
|
"Poss": "Yes",
|
||||||
|
}, # Number[psor]=Plur|Person=2|Poss=Yes|PronType=Prs
|
||||||
|
"sig": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Person": "Three",
|
||||||
|
"Case": "Acc",
|
||||||
|
"Reflex": "Yes",
|
||||||
|
}, # Case=Acc|Person=3|PronType=Prs|Reflex=Yes
|
||||||
|
"hans": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Person": "Three",
|
||||||
|
"Poss": "Yes",
|
||||||
|
}, # Number[psor]=Sing|Person=3|Poss=Yes|PronType=Prs
|
||||||
|
"hendes": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Person": "Three",
|
||||||
|
"Poss": "Yes",
|
||||||
|
}, # Number[psor]=Sing|Person=3|Poss=Yes|PronType=Prs
|
||||||
|
"dens": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Person": "Three",
|
||||||
|
"Poss": "Yes",
|
||||||
|
}, # Number[psor]=Sing|Person=3|Poss=Yes|PronType=Prs
|
||||||
|
"dets": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Person": "Three",
|
||||||
|
"Poss": "Yes",
|
||||||
|
}, # Number[psor]=Sing|Person=3|Poss=Yes|PronType=Prs
|
||||||
|
"deres": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Person": "Three",
|
||||||
|
"Poss": "Yes",
|
||||||
|
}, # Number[psor]=Plur|Person=3|Poss=Yes|PronType=Prs
|
||||||
},
|
},
|
||||||
|
|
||||||
"VERB": {
|
"VERB": {
|
||||||
"er": {LEMMA: "være", "VerbForm": "Fin", "Tense": "Pres"},
|
"er": {LEMMA: "være", "VerbForm": "Fin", "Tense": "Pres"},
|
||||||
"var": {LEMMA: "være", "VerbForm": "Fin", "Tense": "Past"}
|
"var": {LEMMA: "være", "VerbForm": "Fin", "Tense": "Past"},
|
||||||
}
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
for tag, rules in MORPH_RULES.items():
|
for tag, rules in MORPH_RULES.items():
|
||||||
|
|
|
@ -516,7 +516,7 @@ _exc = {
|
||||||
"øjeåbner": "øjenåbner", # 1
|
"øjeåbner": "øjenåbner", # 1
|
||||||
"økonomiministerium": "økonomiministerie", # 1
|
"økonomiministerium": "økonomiministerie", # 1
|
||||||
"ørenring": "ørering", # 2
|
"ørenring": "ørering", # 2
|
||||||
"øvehefte": "øvehæfte" # 1
|
"øvehefte": "øvehæfte", # 1
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -6,17 +6,26 @@ from ..char_classes import QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
|
||||||
from ..punctuation import TOKENIZER_SUFFIXES
|
from ..punctuation import TOKENIZER_SUFFIXES
|
||||||
|
|
||||||
|
|
||||||
_quotes = QUOTES.replace("'", '')
|
_quotes = QUOTES.replace("'", "")
|
||||||
|
|
||||||
_infixes = (LIST_ELLIPSES + LIST_ICONS +
|
_infixes = (
|
||||||
[r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
|
LIST_ELLIPSES
|
||||||
r'(?<=[{a}])[,!?](?=[{a}])'.format(a=ALPHA),
|
+ LIST_ICONS
|
||||||
r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),
|
+ [
|
||||||
r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
|
r"(?<=[{}])\.(?=[{}])".format(ALPHA_LOWER, ALPHA_UPPER),
|
||||||
r'(?<=[{a}])([{q}\)\]\(\[])(?=[\{a}])'.format(a=ALPHA, q=_quotes),
|
r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
|
||||||
r'(?<=[{a}])--(?=[{a}])'.format(a=ALPHA)])
|
r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),
|
||||||
|
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
|
||||||
|
r"(?<=[{a}])([{q}\)\]\(\[])(?=[\{a}])".format(a=ALPHA, q=_quotes),
|
||||||
|
r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
_suffixes = [suffix for suffix in TOKENIZER_SUFFIXES if suffix not in ["'s", "'S", "’s", "’S", r"\'"]]
|
_suffixes = [
|
||||||
|
suffix
|
||||||
|
for suffix in TOKENIZER_SUFFIXES
|
||||||
|
if suffix not in ["'s", "'S", "’s", "’S", r"\'"]
|
||||||
|
]
|
||||||
_suffixes += [r"(?<=[^sSxXzZ])\'"]
|
_suffixes += [r"(?<=[^sSxXzZ])\'"]
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -3,7 +3,8 @@ from __future__ import unicode_literals
|
||||||
|
|
||||||
# Source: Handpicked by Jens Dahl Møllerhøj.
|
# Source: Handpicked by Jens Dahl Møllerhøj.
|
||||||
|
|
||||||
STOP_WORDS = set("""
|
STOP_WORDS = set(
|
||||||
|
"""
|
||||||
af aldrig alene alle allerede alligevel alt altid anden andet andre at
|
af aldrig alene alle allerede alligevel alt altid anden andet andre at
|
||||||
|
|
||||||
bag begge blandt blev blive bliver burde bør
|
bag begge blandt blev blive bliver burde bør
|
||||||
|
@ -43,4 +44,5 @@ ud uden udover under undtagen
|
||||||
var ved vi via vil ville vore vores vær være været
|
var ved vi via vil ville vore vores vær være været
|
||||||
|
|
||||||
øvrigt
|
øvrigt
|
||||||
""".split())
|
""".split()
|
||||||
|
)
|
||||||
|
|
|
@ -15,130 +15,540 @@ _exc = {}
|
||||||
# (for "torsdag") are left out because they are ambiguous. The same is the case
|
# (for "torsdag") are left out because they are ambiguous. The same is the case
|
||||||
# for abbreviations "jul." and "Jul." ("juli").
|
# for abbreviations "jul." and "Jul." ("juli").
|
||||||
for exc_data in [
|
for exc_data in [
|
||||||
{ORTH: "Kbh.", LEMMA: "København", NORM: "København"},
|
{ORTH: "Kbh.", LEMMA: "København", NORM: "København"},
|
||||||
{ORTH: "jan.", LEMMA: "januar"},
|
{ORTH: "jan.", LEMMA: "januar"},
|
||||||
{ORTH: "febr.", LEMMA: "februar"},
|
{ORTH: "febr.", LEMMA: "februar"},
|
||||||
{ORTH: "feb.", LEMMA: "februar"},
|
{ORTH: "feb.", LEMMA: "februar"},
|
||||||
{ORTH: "mar.", LEMMA: "marts"},
|
{ORTH: "mar.", LEMMA: "marts"},
|
||||||
{ORTH: "apr.", LEMMA: "april"},
|
{ORTH: "apr.", LEMMA: "april"},
|
||||||
{ORTH: "jun.", LEMMA: "juni"},
|
{ORTH: "jun.", LEMMA: "juni"},
|
||||||
{ORTH: "aug.", LEMMA: "august"},
|
{ORTH: "aug.", LEMMA: "august"},
|
||||||
{ORTH: "sept.", LEMMA: "september"},
|
{ORTH: "sept.", LEMMA: "september"},
|
||||||
{ORTH: "sep.", LEMMA: "september"},
|
{ORTH: "sep.", LEMMA: "september"},
|
||||||
{ORTH: "okt.", LEMMA: "oktober"},
|
{ORTH: "okt.", LEMMA: "oktober"},
|
||||||
{ORTH: "nov.", LEMMA: "november"},
|
{ORTH: "nov.", LEMMA: "november"},
|
||||||
{ORTH: "dec.", LEMMA: "december"},
|
{ORTH: "dec.", LEMMA: "december"},
|
||||||
{ORTH: "man.", LEMMA: "mandag"},
|
{ORTH: "man.", LEMMA: "mandag"},
|
||||||
{ORTH: "tirs.", LEMMA: "tirsdag"},
|
{ORTH: "tirs.", LEMMA: "tirsdag"},
|
||||||
{ORTH: "ons.", LEMMA: "onsdag"},
|
{ORTH: "ons.", LEMMA: "onsdag"},
|
||||||
{ORTH: "tor.", LEMMA: "torsdag"},
|
{ORTH: "tor.", LEMMA: "torsdag"},
|
||||||
{ORTH: "tors.", LEMMA: "torsdag"},
|
{ORTH: "tors.", LEMMA: "torsdag"},
|
||||||
{ORTH: "fre.", LEMMA: "fredag"},
|
{ORTH: "fre.", LEMMA: "fredag"},
|
||||||
{ORTH: "lør.", LEMMA: "lørdag"},
|
{ORTH: "lør.", LEMMA: "lørdag"},
|
||||||
{ORTH: "Jan.", LEMMA: "januar"},
|
{ORTH: "Jan.", LEMMA: "januar"},
|
||||||
{ORTH: "Febr.", LEMMA: "februar"},
|
{ORTH: "Febr.", LEMMA: "februar"},
|
||||||
{ORTH: "Feb.", LEMMA: "februar"},
|
{ORTH: "Feb.", LEMMA: "februar"},
|
||||||
{ORTH: "Mar.", LEMMA: "marts"},
|
{ORTH: "Mar.", LEMMA: "marts"},
|
||||||
{ORTH: "Apr.", LEMMA: "april"},
|
{ORTH: "Apr.", LEMMA: "april"},
|
||||||
{ORTH: "Jun.", LEMMA: "juni"},
|
{ORTH: "Jun.", LEMMA: "juni"},
|
||||||
{ORTH: "Aug.", LEMMA: "august"},
|
{ORTH: "Aug.", LEMMA: "august"},
|
||||||
{ORTH: "Sept.", LEMMA: "september"},
|
{ORTH: "Sept.", LEMMA: "september"},
|
||||||
{ORTH: "Sep.", LEMMA: "september"},
|
{ORTH: "Sep.", LEMMA: "september"},
|
||||||
{ORTH: "Okt.", LEMMA: "oktober"},
|
{ORTH: "Okt.", LEMMA: "oktober"},
|
||||||
{ORTH: "Nov.", LEMMA: "november"},
|
{ORTH: "Nov.", LEMMA: "november"},
|
||||||
{ORTH: "Dec.", LEMMA: "december"},
|
{ORTH: "Dec.", LEMMA: "december"},
|
||||||
{ORTH: "Man.", LEMMA: "mandag"},
|
{ORTH: "Man.", LEMMA: "mandag"},
|
||||||
{ORTH: "Tirs.", LEMMA: "tirsdag"},
|
{ORTH: "Tirs.", LEMMA: "tirsdag"},
|
||||||
{ORTH: "Ons.", LEMMA: "onsdag"},
|
{ORTH: "Ons.", LEMMA: "onsdag"},
|
||||||
{ORTH: "Fre.", LEMMA: "fredag"},
|
{ORTH: "Fre.", LEMMA: "fredag"},
|
||||||
{ORTH: "Lør.", LEMMA: "lørdag"}]:
|
{ORTH: "Lør.", LEMMA: "lørdag"},
|
||||||
|
]:
|
||||||
_exc[exc_data[ORTH]] = [exc_data]
|
_exc[exc_data[ORTH]] = [exc_data]
|
||||||
|
|
||||||
|
|
||||||
# Specified case only
|
# Specified case only
|
||||||
for orth in [
|
for orth in [
|
||||||
"diam.", "ib.", "mia.", "mik.", "pers.", "A.D.", "A/S", "B.C.", "BK.",
|
"diam.",
|
||||||
"Dr.", "Boul.", "Chr.", "Dronn.", "H.K.H.", "H.M.", "Hf.", "i/s", "I/S",
|
"ib.",
|
||||||
"Kprs.", "L.A.", "Ll.", "m/s", "M/S", "Mag.", "Mr.", "Ndr.", "Ph.d.",
|
"mia.",
|
||||||
"Prs.", "Rcp.", "Sdr.", "Skt.", "Spl.", "Vg."]:
|
"mik.",
|
||||||
|
"pers.",
|
||||||
|
"A.D.",
|
||||||
|
"A/S",
|
||||||
|
"B.C.",
|
||||||
|
"BK.",
|
||||||
|
"Dr.",
|
||||||
|
"Boul.",
|
||||||
|
"Chr.",
|
||||||
|
"Dronn.",
|
||||||
|
"H.K.H.",
|
||||||
|
"H.M.",
|
||||||
|
"Hf.",
|
||||||
|
"i/s",
|
||||||
|
"I/S",
|
||||||
|
"Kprs.",
|
||||||
|
"L.A.",
|
||||||
|
"Ll.",
|
||||||
|
"m/s",
|
||||||
|
"M/S",
|
||||||
|
"Mag.",
|
||||||
|
"Mr.",
|
||||||
|
"Ndr.",
|
||||||
|
"Ph.d.",
|
||||||
|
"Prs.",
|
||||||
|
"Rcp.",
|
||||||
|
"Sdr.",
|
||||||
|
"Skt.",
|
||||||
|
"Spl.",
|
||||||
|
"Vg.",
|
||||||
|
]:
|
||||||
_exc[orth] = [{ORTH: orth}]
|
_exc[orth] = [{ORTH: orth}]
|
||||||
|
|
||||||
|
|
||||||
for orth in [
|
for orth in [
|
||||||
"aarh.", "ac.", "adj.", "adr.", "adsk.", "adv.", "afb.", "afd.", "afg.",
|
"aarh.",
|
||||||
"afk.", "afs.", "aht.", "alg.", "alk.", "alm.", "amer.", "ang.", "ank.",
|
"ac.",
|
||||||
"anl.", "anv.", "arb.", "arr.", "att.", "bd.", "bdt.", "beg.", "begr.",
|
"adj.",
|
||||||
"beh.", "bet.", "bev.", "bhk.", "bib.", "bibl.", "bidr.", "bildl.",
|
"adr.",
|
||||||
"bill.", "biol.", "bk.", "bl.", "bl.a.", "borgm.", "br.", "brolægn.",
|
"adsk.",
|
||||||
"bto.", "bygn.", "ca.", "cand.", "d.d.", "d.m.", "d.s.", "d.s.s.",
|
"adv.",
|
||||||
"d.y.", "d.å.", "d.æ.", "dagl.", "dat.", "dav.", "def.", "dek.", "dep.",
|
"afb.",
|
||||||
"desl.", "dir.", "disp.", "distr.", "div.", "dkr.", "dl.", "do.",
|
"afd.",
|
||||||
"dobb.", "dr.h.c", "dr.phil.", "ds.", "dvs.", "e.b.", "e.l.", "e.o.",
|
"afg.",
|
||||||
"e.v.t.", "eftf.", "eftm.", "egl.", "eks.", "eksam.", "ekskl.", "eksp.",
|
"afk.",
|
||||||
"ekspl.", "el.lign.", "emer.", "endv.", "eng.", "enk.", "etc.", "etym.",
|
"afs.",
|
||||||
"eur.", "evt.", "exam.", "f.eks.", "f.m.", "f.n.", "f.o.", "f.o.m.",
|
"aht.",
|
||||||
"f.s.v.", "f.t.", "f.v.t.", "f.å.", "fa.", "fakt.", "fam.", "ff.",
|
"alg.",
|
||||||
"fg.", "fhv.", "fig.", "filol.", "filos.", "fl.", "flg.", "fm.", "fmd.",
|
"alk.",
|
||||||
"fol.", "forb.", "foreg.", "foren.", "forf.", "fork.", "forr.", "fors.",
|
"alm.",
|
||||||
"forsk.", "forts.", "fr.", "fr.u.", "frk.", "fsva.", "fuldm.", "fung.",
|
"amer.",
|
||||||
"fx.", "fys.", "fær.", "g.d.", "g.m.", "gd.", "gdr.", "genuds.", "gl.",
|
"ang.",
|
||||||
"gn.", "gns.", "gr.", "grdl.", "gross.", "h.a.", "h.c.", "hdl.",
|
"ank.",
|
||||||
"henv.", "hhv.", "hj.hj.", "hj.spl.", "hort.", "hosp.", "hpl.", "hr.",
|
"anl.",
|
||||||
"hrs.", "hum.", "hvp.", "i.e.", "id.", "if.", "iflg.", "ifm.", "ift.",
|
"anv.",
|
||||||
"iht.", "ill.", "indb.", "indreg.", "inf.", "ing.", "inh.", "inj.",
|
"arb.",
|
||||||
"inkl.", "insp.", "instr.", "isl.", "istf.", "it.", "ital.", "iv.",
|
"arr.",
|
||||||
"jap.", "jf.", "jfr.", "jnr.", "j.nr.", "jr.", "jur.", "jvf.", "kap.",
|
"att.",
|
||||||
"kbh.", "kem.", "kgl.", "kl.", "kld.", "knsp.", "komm.", "kons.",
|
"bd.",
|
||||||
"korr.", "kp.", "kr.", "kst.", "kt.", "ktr.", "kv.", "kvt.", "l.c.",
|
"bdt.",
|
||||||
"lab.", "lat.", "lb.m.", "lb.nr.", "lejl.", "lgd.", "lic.", "lign.",
|
"beg.",
|
||||||
"lin.", "ling.merc.", "litt.", "loc.cit.", "lok.", "lrs.", "ltr.",
|
"begr.",
|
||||||
"m.a.o.", "m.fl.", "m.m.", "m.v.", "m.v.h.", "maks.", "md.", "mdr.",
|
"beh.",
|
||||||
"mdtl.", "mezz.", "mfl.", "m.h.p.", "m.h.t.", "mht.", "mill.", "mio.",
|
"bet.",
|
||||||
"modt.", "mrk.", "mul.", "mv.", "n.br.", "n.f.", "nb.", "nedenst.",
|
"bev.",
|
||||||
"nl.", "nr.", "nto.", "nuv.", "o/m", "o.a.", "o.fl.", "o.h.", "o.l.",
|
"bhk.",
|
||||||
"o.lign.", "o.m.a.", "o.s.fr.", "obl.", "obs.", "odont.", "oecon.",
|
"bib.",
|
||||||
"off.", "ofl.", "omg.", "omkr.", "omr.", "omtr.", "opg.", "opl.",
|
"bibl.",
|
||||||
"opr.", "org.", "orig.", "osv.", "ovenst.", "overs.", "ovf.", "p.a.",
|
"bidr.",
|
||||||
"p.b.a", "p.b.v", "p.c.", "p.m.", "p.m.v.", "p.n.", "p.p.", "p.p.s.",
|
"bildl.",
|
||||||
"p.s.", "p.t.", "p.v.a.", "p.v.c.", "pag.", "pass.", "pcs.", "pct.",
|
"bill.",
|
||||||
"pd.", "pens.", "pft.", "pg.", "pga.", "pgl.", "pinx.", "pk.", "pkt.",
|
"biol.",
|
||||||
"polit.", "polyt.", "pos.", "pp.", "ppm.", "pr.", "prc.", "priv.",
|
"bk.",
|
||||||
"prod.", "prof.", "pron.", "præd.", "præf.", "præt.", "psych.", "pt.",
|
"bl.",
|
||||||
"pæd.", "q.e.d.", "rad.", "red.", "ref.", "reg.", "regn.", "rel.",
|
"bl.a.",
|
||||||
"rep.", "repr.", "resp.", "rest.", "rm.", "rtg.", "russ.", "s.br.",
|
"borgm.",
|
||||||
"s.d.", "s.f.", "s.m.b.a.", "s.u.", "s.å.", "sa.", "sb.", "sc.",
|
"br.",
|
||||||
"scient.", "scil.", "sek.", "sekr.", "self.", "sem.", "shj.", "sign.",
|
"brolægn.",
|
||||||
"sing.", "sj.", "skr.", "slutn.", "sml.", "smp.", "snr.", "soc.",
|
"bto.",
|
||||||
"soc.dem.", "sp.", "spec.", "spm.", "spr.", "spsk.", "statsaut.", "st.",
|
"bygn.",
|
||||||
"stk.", "str.", "stud.", "subj.", "subst.", "suff.", "sup.", "suppl.",
|
"ca.",
|
||||||
"sv.", "såk.", "sædv.", "t/r", "t.h.", "t.o.", "t.o.m.", "t.v.", "tbl.",
|
"cand.",
|
||||||
"tcp/ip", "td.", "tdl.", "tdr.", "techn.", "tekn.", "temp.", "th.",
|
"d.d.",
|
||||||
"theol.", "tidl.", "tilf.", "tilh.", "till.", "tilsv.", "tjg.", "tkr.",
|
"d.m.",
|
||||||
"tlf.", "tlgr.", "tr.", "trp.", "tsk.", "tv.", "ty.", "u/b", "udb.",
|
"d.s.",
|
||||||
"udbet.", "ugtl.", "undt.", "v.f.", "vb.", "vedk.", "vedl.", "vedr.",
|
"d.s.s.",
|
||||||
"vejl.", "vh.", "vha.", "vs.", "vsa.", "vær.", "zool.", "ø.lgd.",
|
"d.y.",
|
||||||
"øvr.", "årg.", "årh."]:
|
"d.å.",
|
||||||
|
"d.æ.",
|
||||||
|
"dagl.",
|
||||||
|
"dat.",
|
||||||
|
"dav.",
|
||||||
|
"def.",
|
||||||
|
"dek.",
|
||||||
|
"dep.",
|
||||||
|
"desl.",
|
||||||
|
"dir.",
|
||||||
|
"disp.",
|
||||||
|
"distr.",
|
||||||
|
"div.",
|
||||||
|
"dkr.",
|
||||||
|
"dl.",
|
||||||
|
"do.",
|
||||||
|
"dobb.",
|
||||||
|
"dr.h.c",
|
||||||
|
"dr.phil.",
|
||||||
|
"ds.",
|
||||||
|
"dvs.",
|
||||||
|
"e.b.",
|
||||||
|
"e.l.",
|
||||||
|
"e.o.",
|
||||||
|
"e.v.t.",
|
||||||
|
"eftf.",
|
||||||
|
"eftm.",
|
||||||
|
"egl.",
|
||||||
|
"eks.",
|
||||||
|
"eksam.",
|
||||||
|
"ekskl.",
|
||||||
|
"eksp.",
|
||||||
|
"ekspl.",
|
||||||
|
"el.lign.",
|
||||||
|
"emer.",
|
||||||
|
"endv.",
|
||||||
|
"eng.",
|
||||||
|
"enk.",
|
||||||
|
"etc.",
|
||||||
|
"etym.",
|
||||||
|
"eur.",
|
||||||
|
"evt.",
|
||||||
|
"exam.",
|
||||||
|
"f.eks.",
|
||||||
|
"f.m.",
|
||||||
|
"f.n.",
|
||||||
|
"f.o.",
|
||||||
|
"f.o.m.",
|
||||||
|
"f.s.v.",
|
||||||
|
"f.t.",
|
||||||
|
"f.v.t.",
|
||||||
|
"f.å.",
|
||||||
|
"fa.",
|
||||||
|
"fakt.",
|
||||||
|
"fam.",
|
||||||
|
"ff.",
|
||||||
|
"fg.",
|
||||||
|
"fhv.",
|
||||||
|
"fig.",
|
||||||
|
"filol.",
|
||||||
|
"filos.",
|
||||||
|
"fl.",
|
||||||
|
"flg.",
|
||||||
|
"fm.",
|
||||||
|
"fmd.",
|
||||||
|
"fol.",
|
||||||
|
"forb.",
|
||||||
|
"foreg.",
|
||||||
|
"foren.",
|
||||||
|
"forf.",
|
||||||
|
"fork.",
|
||||||
|
"forr.",
|
||||||
|
"fors.",
|
||||||
|
"forsk.",
|
||||||
|
"forts.",
|
||||||
|
"fr.",
|
||||||
|
"fr.u.",
|
||||||
|
"frk.",
|
||||||
|
"fsva.",
|
||||||
|
"fuldm.",
|
||||||
|
"fung.",
|
||||||
|
"fx.",
|
||||||
|
"fys.",
|
||||||
|
"fær.",
|
||||||
|
"g.d.",
|
||||||
|
"g.m.",
|
||||||
|
"gd.",
|
||||||
|
"gdr.",
|
||||||
|
"genuds.",
|
||||||
|
"gl.",
|
||||||
|
"gn.",
|
||||||
|
"gns.",
|
||||||
|
"gr.",
|
||||||
|
"grdl.",
|
||||||
|
"gross.",
|
||||||
|
"h.a.",
|
||||||
|
"h.c.",
|
||||||
|
"hdl.",
|
||||||
|
"henv.",
|
||||||
|
"hhv.",
|
||||||
|
"hj.hj.",
|
||||||
|
"hj.spl.",
|
||||||
|
"hort.",
|
||||||
|
"hosp.",
|
||||||
|
"hpl.",
|
||||||
|
"hr.",
|
||||||
|
"hrs.",
|
||||||
|
"hum.",
|
||||||
|
"hvp.",
|
||||||
|
"i.e.",
|
||||||
|
"id.",
|
||||||
|
"if.",
|
||||||
|
"iflg.",
|
||||||
|
"ifm.",
|
||||||
|
"ift.",
|
||||||
|
"iht.",
|
||||||
|
"ill.",
|
||||||
|
"indb.",
|
||||||
|
"indreg.",
|
||||||
|
"inf.",
|
||||||
|
"ing.",
|
||||||
|
"inh.",
|
||||||
|
"inj.",
|
||||||
|
"inkl.",
|
||||||
|
"insp.",
|
||||||
|
"instr.",
|
||||||
|
"isl.",
|
||||||
|
"istf.",
|
||||||
|
"it.",
|
||||||
|
"ital.",
|
||||||
|
"iv.",
|
||||||
|
"jap.",
|
||||||
|
"jf.",
|
||||||
|
"jfr.",
|
||||||
|
"jnr.",
|
||||||
|
"j.nr.",
|
||||||
|
"jr.",
|
||||||
|
"jur.",
|
||||||
|
"jvf.",
|
||||||
|
"kap.",
|
||||||
|
"kbh.",
|
||||||
|
"kem.",
|
||||||
|
"kgl.",
|
||||||
|
"kl.",
|
||||||
|
"kld.",
|
||||||
|
"knsp.",
|
||||||
|
"komm.",
|
||||||
|
"kons.",
|
||||||
|
"korr.",
|
||||||
|
"kp.",
|
||||||
|
"kr.",
|
||||||
|
"kst.",
|
||||||
|
"kt.",
|
||||||
|
"ktr.",
|
||||||
|
"kv.",
|
||||||
|
"kvt.",
|
||||||
|
"l.c.",
|
||||||
|
"lab.",
|
||||||
|
"lat.",
|
||||||
|
"lb.m.",
|
||||||
|
"lb.nr.",
|
||||||
|
"lejl.",
|
||||||
|
"lgd.",
|
||||||
|
"lic.",
|
||||||
|
"lign.",
|
||||||
|
"lin.",
|
||||||
|
"ling.merc.",
|
||||||
|
"litt.",
|
||||||
|
"loc.cit.",
|
||||||
|
"lok.",
|
||||||
|
"lrs.",
|
||||||
|
"ltr.",
|
||||||
|
"m.a.o.",
|
||||||
|
"m.fl.",
|
||||||
|
"m.m.",
|
||||||
|
"m.v.",
|
||||||
|
"m.v.h.",
|
||||||
|
"maks.",
|
||||||
|
"md.",
|
||||||
|
"mdr.",
|
||||||
|
"mdtl.",
|
||||||
|
"mezz.",
|
||||||
|
"mfl.",
|
||||||
|
"m.h.p.",
|
||||||
|
"m.h.t.",
|
||||||
|
"mht.",
|
||||||
|
"mill.",
|
||||||
|
"mio.",
|
||||||
|
"modt.",
|
||||||
|
"mrk.",
|
||||||
|
"mul.",
|
||||||
|
"mv.",
|
||||||
|
"n.br.",
|
||||||
|
"n.f.",
|
||||||
|
"nb.",
|
||||||
|
"nedenst.",
|
||||||
|
"nl.",
|
||||||
|
"nr.",
|
||||||
|
"nto.",
|
||||||
|
"nuv.",
|
||||||
|
"o/m",
|
||||||
|
"o.a.",
|
||||||
|
"o.fl.",
|
||||||
|
"o.h.",
|
||||||
|
"o.l.",
|
||||||
|
"o.lign.",
|
||||||
|
"o.m.a.",
|
||||||
|
"o.s.fr.",
|
||||||
|
"obl.",
|
||||||
|
"obs.",
|
||||||
|
"odont.",
|
||||||
|
"oecon.",
|
||||||
|
"off.",
|
||||||
|
"ofl.",
|
||||||
|
"omg.",
|
||||||
|
"omkr.",
|
||||||
|
"omr.",
|
||||||
|
"omtr.",
|
||||||
|
"opg.",
|
||||||
|
"opl.",
|
||||||
|
"opr.",
|
||||||
|
"org.",
|
||||||
|
"orig.",
|
||||||
|
"osv.",
|
||||||
|
"ovenst.",
|
||||||
|
"overs.",
|
||||||
|
"ovf.",
|
||||||
|
"p.a.",
|
||||||
|
"p.b.a",
|
||||||
|
"p.b.v",
|
||||||
|
"p.c.",
|
||||||
|
"p.m.",
|
||||||
|
"p.m.v.",
|
||||||
|
"p.n.",
|
||||||
|
"p.p.",
|
||||||
|
"p.p.s.",
|
||||||
|
"p.s.",
|
||||||
|
"p.t.",
|
||||||
|
"p.v.a.",
|
||||||
|
"p.v.c.",
|
||||||
|
"pag.",
|
||||||
|
"pass.",
|
||||||
|
"pcs.",
|
||||||
|
"pct.",
|
||||||
|
"pd.",
|
||||||
|
"pens.",
|
||||||
|
"pft.",
|
||||||
|
"pg.",
|
||||||
|
"pga.",
|
||||||
|
"pgl.",
|
||||||
|
"pinx.",
|
||||||
|
"pk.",
|
||||||
|
"pkt.",
|
||||||
|
"polit.",
|
||||||
|
"polyt.",
|
||||||
|
"pos.",
|
||||||
|
"pp.",
|
||||||
|
"ppm.",
|
||||||
|
"pr.",
|
||||||
|
"prc.",
|
||||||
|
"priv.",
|
||||||
|
"prod.",
|
||||||
|
"prof.",
|
||||||
|
"pron.",
|
||||||
|
"præd.",
|
||||||
|
"præf.",
|
||||||
|
"præt.",
|
||||||
|
"psych.",
|
||||||
|
"pt.",
|
||||||
|
"pæd.",
|
||||||
|
"q.e.d.",
|
||||||
|
"rad.",
|
||||||
|
"red.",
|
||||||
|
"ref.",
|
||||||
|
"reg.",
|
||||||
|
"regn.",
|
||||||
|
"rel.",
|
||||||
|
"rep.",
|
||||||
|
"repr.",
|
||||||
|
"resp.",
|
||||||
|
"rest.",
|
||||||
|
"rm.",
|
||||||
|
"rtg.",
|
||||||
|
"russ.",
|
||||||
|
"s.br.",
|
||||||
|
"s.d.",
|
||||||
|
"s.f.",
|
||||||
|
"s.m.b.a.",
|
||||||
|
"s.u.",
|
||||||
|
"s.å.",
|
||||||
|
"sa.",
|
||||||
|
"sb.",
|
||||||
|
"sc.",
|
||||||
|
"scient.",
|
||||||
|
"scil.",
|
||||||
|
"sek.",
|
||||||
|
"sekr.",
|
||||||
|
"self.",
|
||||||
|
"sem.",
|
||||||
|
"shj.",
|
||||||
|
"sign.",
|
||||||
|
"sing.",
|
||||||
|
"sj.",
|
||||||
|
"skr.",
|
||||||
|
"slutn.",
|
||||||
|
"sml.",
|
||||||
|
"smp.",
|
||||||
|
"snr.",
|
||||||
|
"soc.",
|
||||||
|
"soc.dem.",
|
||||||
|
"sp.",
|
||||||
|
"spec.",
|
||||||
|
"spm.",
|
||||||
|
"spr.",
|
||||||
|
"spsk.",
|
||||||
|
"statsaut.",
|
||||||
|
"st.",
|
||||||
|
"stk.",
|
||||||
|
"str.",
|
||||||
|
"stud.",
|
||||||
|
"subj.",
|
||||||
|
"subst.",
|
||||||
|
"suff.",
|
||||||
|
"sup.",
|
||||||
|
"suppl.",
|
||||||
|
"sv.",
|
||||||
|
"såk.",
|
||||||
|
"sædv.",
|
||||||
|
"t/r",
|
||||||
|
"t.h.",
|
||||||
|
"t.o.",
|
||||||
|
"t.o.m.",
|
||||||
|
"t.v.",
|
||||||
|
"tbl.",
|
||||||
|
"tcp/ip",
|
||||||
|
"td.",
|
||||||
|
"tdl.",
|
||||||
|
"tdr.",
|
||||||
|
"techn.",
|
||||||
|
"tekn.",
|
||||||
|
"temp.",
|
||||||
|
"th.",
|
||||||
|
"theol.",
|
||||||
|
"tidl.",
|
||||||
|
"tilf.",
|
||||||
|
"tilh.",
|
||||||
|
"till.",
|
||||||
|
"tilsv.",
|
||||||
|
"tjg.",
|
||||||
|
"tkr.",
|
||||||
|
"tlf.",
|
||||||
|
"tlgr.",
|
||||||
|
"tr.",
|
||||||
|
"trp.",
|
||||||
|
"tsk.",
|
||||||
|
"tv.",
|
||||||
|
"ty.",
|
||||||
|
"u/b",
|
||||||
|
"udb.",
|
||||||
|
"udbet.",
|
||||||
|
"ugtl.",
|
||||||
|
"undt.",
|
||||||
|
"v.f.",
|
||||||
|
"vb.",
|
||||||
|
"vedk.",
|
||||||
|
"vedl.",
|
||||||
|
"vedr.",
|
||||||
|
"vejl.",
|
||||||
|
"vh.",
|
||||||
|
"vha.",
|
||||||
|
"vs.",
|
||||||
|
"vsa.",
|
||||||
|
"vær.",
|
||||||
|
"zool.",
|
||||||
|
"ø.lgd.",
|
||||||
|
"øvr.",
|
||||||
|
"årg.",
|
||||||
|
"årh.",
|
||||||
|
]:
|
||||||
_exc[orth] = [{ORTH: orth}]
|
_exc[orth] = [{ORTH: orth}]
|
||||||
capitalized = orth.capitalize()
|
capitalized = orth.capitalize()
|
||||||
_exc[capitalized] = [{ORTH: capitalized}]
|
_exc[capitalized] = [{ORTH: capitalized}]
|
||||||
|
|
||||||
for exc_data in [
|
for exc_data in [
|
||||||
{ORTH: "s'gu", LEMMA: "s'gu", NORM: "s'gu"},
|
{ORTH: "s'gu", LEMMA: "s'gu", NORM: "s'gu"},
|
||||||
{ORTH: "S'gu", LEMMA: "s'gu", NORM: "s'gu"},
|
{ORTH: "S'gu", LEMMA: "s'gu", NORM: "s'gu"},
|
||||||
{ORTH: "sgu'", LEMMA: "s'gu", NORM: "s'gu"},
|
{ORTH: "sgu'", LEMMA: "s'gu", NORM: "s'gu"},
|
||||||
{ORTH: "Sgu'", LEMMA: "s'gu", NORM: "s'gu"},
|
{ORTH: "Sgu'", LEMMA: "s'gu", NORM: "s'gu"},
|
||||||
{ORTH: "sku'", LEMMA: "skal", NORM: "skulle"},
|
{ORTH: "sku'", LEMMA: "skal", NORM: "skulle"},
|
||||||
{ORTH: "ku'", LEMMA: "kan", NORM: "kunne"},
|
{ORTH: "ku'", LEMMA: "kan", NORM: "kunne"},
|
||||||
{ORTH: "Ku'", LEMMA: "kan", NORM: "kunne"},
|
{ORTH: "Ku'", LEMMA: "kan", NORM: "kunne"},
|
||||||
{ORTH: "ka'", LEMMA: "kan", NORM: "kan"},
|
{ORTH: "ka'", LEMMA: "kan", NORM: "kan"},
|
||||||
{ORTH: "Ka'", LEMMA: "kan", NORM: "kan"},
|
{ORTH: "Ka'", LEMMA: "kan", NORM: "kan"},
|
||||||
{ORTH: "gi'", LEMMA: "give", NORM: "giv"},
|
{ORTH: "gi'", LEMMA: "give", NORM: "giv"},
|
||||||
{ORTH: "Gi'", LEMMA: "give", NORM: "giv"},
|
{ORTH: "Gi'", LEMMA: "give", NORM: "giv"},
|
||||||
{ORTH: "li'", LEMMA: "lide", NORM: "lide"},
|
{ORTH: "li'", LEMMA: "lide", NORM: "lide"},
|
||||||
{ORTH: "ha'", LEMMA: "have", NORM: "have"},
|
{ORTH: "ha'", LEMMA: "have", NORM: "have"},
|
||||||
{ORTH: "Ha'", LEMMA: "have", NORM: "have"},
|
{ORTH: "Ha'", LEMMA: "have", NORM: "have"},
|
||||||
{ORTH: "ik'", LEMMA: "ikke", NORM: "ikke"},
|
{ORTH: "ik'", LEMMA: "ikke", NORM: "ikke"},
|
||||||
{ORTH: "Ik'", LEMMA: "ikke", NORM: "ikke"}]:
|
{ORTH: "Ik'", LEMMA: "ikke", NORM: "ikke"},
|
||||||
|
]:
|
||||||
_exc[exc_data[ORTH]] = [exc_data]
|
_exc[exc_data[ORTH]] = [exc_data]
|
||||||
|
|
||||||
|
|
||||||
|
@ -147,11 +557,7 @@ for h in range(1, 31 + 1):
|
||||||
for period in ["."]:
|
for period in ["."]:
|
||||||
_exc["%d%s" % (h, period)] = [{ORTH: "%d." % h}]
|
_exc["%d%s" % (h, period)] = [{ORTH: "%d." % h}]
|
||||||
|
|
||||||
_custom_base_exc = {
|
_custom_base_exc = {"i.": [{ORTH: "i", LEMMA: "i", NORM: "i"}, {ORTH: ".", TAG: PUNCT}]}
|
||||||
"i.": [
|
|
||||||
{ORTH: "i", LEMMA: "i", NORM: "i"},
|
|
||||||
{ORTH: ".", TAG: PUNCT}]
|
|
||||||
}
|
|
||||||
_exc.update(_custom_base_exc)
|
_exc.update(_custom_base_exc)
|
||||||
|
|
||||||
TOKENIZER_EXCEPTIONS = _exc
|
TOKENIZER_EXCEPTIONS = _exc
|
||||||
|
|
|
@ -18,9 +18,10 @@ from ...util import update_exc, add_lookups
|
||||||
|
|
||||||
class GermanDefaults(Language.Defaults):
|
class GermanDefaults(Language.Defaults):
|
||||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||||
lex_attr_getters[LANG] = lambda text: 'de'
|
lex_attr_getters[LANG] = lambda text: "de"
|
||||||
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM],
|
lex_attr_getters[NORM] = add_lookups(
|
||||||
NORM_EXCEPTIONS, BASE_NORMS)
|
Language.Defaults.lex_attr_getters[NORM], NORM_EXCEPTIONS, BASE_NORMS
|
||||||
|
)
|
||||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||||
infixes = TOKENIZER_INFIXES
|
infixes = TOKENIZER_INFIXES
|
||||||
tag_map = TAG_MAP
|
tag_map = TAG_MAP
|
||||||
|
@ -30,8 +31,8 @@ class GermanDefaults(Language.Defaults):
|
||||||
|
|
||||||
|
|
||||||
class German(Language):
|
class German(Language):
|
||||||
lang = 'de'
|
lang = "de"
|
||||||
Defaults = GermanDefaults
|
Defaults = GermanDefaults
|
||||||
|
|
||||||
|
|
||||||
__all__ = ['German']
|
__all__ = ["German"]
|
||||||
|
|
|
@ -18,5 +18,5 @@ sentences = [
|
||||||
"San Francisco erwägt Verbot von Lieferrobotern",
|
"San Francisco erwägt Verbot von Lieferrobotern",
|
||||||
"Autonome Fahrzeuge verlagern Haftpflicht auf Hersteller",
|
"Autonome Fahrzeuge verlagern Haftpflicht auf Hersteller",
|
||||||
"Wo bist du?",
|
"Wo bist du?",
|
||||||
"Was ist die Hauptstadt von Deutschland?"
|
"Was ist die Hauptstadt von Deutschland?",
|
||||||
]
|
]
|
||||||
|
|
|
@ -6,9 +6,7 @@ from __future__ import unicode_literals
|
||||||
# old vs. new spelling rules, and all possible cases.
|
# old vs. new spelling rules, and all possible cases.
|
||||||
|
|
||||||
|
|
||||||
_exc = {
|
_exc = {"daß": "dass"}
|
||||||
"daß": "dass"
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
NORM_EXCEPTIONS = {}
|
NORM_EXCEPTIONS = {}
|
||||||
|
|
|
@ -5,16 +5,21 @@ from ..char_classes import LIST_ELLIPSES, LIST_ICONS
|
||||||
from ..char_classes import QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
|
from ..char_classes import QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
|
||||||
|
|
||||||
|
|
||||||
_quotes = QUOTES.replace("'", '')
|
_quotes = QUOTES.replace("'", "")
|
||||||
|
|
||||||
_infixes = (LIST_ELLIPSES + LIST_ICONS +
|
_infixes = (
|
||||||
[r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
|
LIST_ELLIPSES
|
||||||
r'(?<=[{a}])[,!?](?=[{a}])'.format(a=ALPHA),
|
+ LIST_ICONS
|
||||||
r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),
|
+ [
|
||||||
r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
|
r"(?<=[{}])\.(?=[{}])".format(ALPHA_LOWER, ALPHA_UPPER),
|
||||||
r'(?<=[{a}])([{q}\)\]\(\[])(?=[\{a}])'.format(a=ALPHA, q=_quotes),
|
r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
|
||||||
r'(?<=[{a}])--(?=[{a}])'.format(a=ALPHA),
|
r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),
|
||||||
r'(?<=[0-9])-(?=[0-9])'])
|
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
|
||||||
|
r"(?<=[{a}])([{q}\)\]\(\[])(?=[\{a}])".format(a=ALPHA, q=_quotes),
|
||||||
|
r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
|
||||||
|
r"(?<=[0-9])-(?=[0-9])",
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
TOKENIZER_INFIXES = _infixes
|
TOKENIZER_INFIXES = _infixes
|
||||||
|
|
|
@ -2,7 +2,8 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
STOP_WORDS = set("""
|
STOP_WORDS = set(
|
||||||
|
"""
|
||||||
á a ab aber ach acht achte achten achter achtes ag alle allein allem allen
|
á a ab aber ach acht achte achten achter achtes ag alle allein allem allen
|
||||||
aller allerdings alles allgemeinen als also am an andere anderen andern anders
|
aller allerdings alles allgemeinen als also am an andere anderen andern anders
|
||||||
auch auf aus ausser außer ausserdem außerdem
|
auch auf aus ausser außer ausserdem außerdem
|
||||||
|
@ -78,4 +79,5 @@ wollt wollte wollten worden wurde würde wurden würden
|
||||||
|
|
||||||
zehn zehnte zehnten zehnter zehntes zeit zu zuerst zugleich zum zunächst zur
|
zehn zehnte zehnten zehnter zehntes zeit zu zuerst zugleich zum zunächst zur
|
||||||
zurück zusammen zwanzig zwar zwei zweite zweiten zweiter zweites zwischen
|
zurück zusammen zwanzig zwar zwei zweite zweiten zweiter zweites zwischen
|
||||||
""".split())
|
""".split()
|
||||||
|
)
|
||||||
|
|
|
@ -13,26 +13,37 @@ def noun_chunks(obj):
|
||||||
# measurement construction, the span is sometimes extended to the right of
|
# measurement construction, the span is sometimes extended to the right of
|
||||||
# the NOUN. Example: "eine Tasse Tee" (a cup (of) tea) returns "eine Tasse Tee"
|
# the NOUN. Example: "eine Tasse Tee" (a cup (of) tea) returns "eine Tasse Tee"
|
||||||
# and not just "eine Tasse", same for "das Thema Familie".
|
# and not just "eine Tasse", same for "das Thema Familie".
|
||||||
labels = ['sb', 'oa', 'da', 'nk', 'mo', 'ag', 'ROOT', 'root', 'cj', 'pd', 'og', 'app']
|
labels = [
|
||||||
doc = obj.doc # Ensure works on both Doc and Span.
|
"sb",
|
||||||
np_label = doc.vocab.strings.add('NP')
|
"oa",
|
||||||
|
"da",
|
||||||
|
"nk",
|
||||||
|
"mo",
|
||||||
|
"ag",
|
||||||
|
"ROOT",
|
||||||
|
"root",
|
||||||
|
"cj",
|
||||||
|
"pd",
|
||||||
|
"og",
|
||||||
|
"app",
|
||||||
|
]
|
||||||
|
doc = obj.doc # Ensure works on both Doc and Span.
|
||||||
|
np_label = doc.vocab.strings.add("NP")
|
||||||
np_deps = set(doc.vocab.strings.add(label) for label in labels)
|
np_deps = set(doc.vocab.strings.add(label) for label in labels)
|
||||||
close_app = doc.vocab.strings.add('nk')
|
close_app = doc.vocab.strings.add("nk")
|
||||||
|
|
||||||
rbracket = 0
|
rbracket = 0
|
||||||
for i, word in enumerate(obj):
|
for i, word in enumerate(obj):
|
||||||
if i < rbracket:
|
if i < rbracket:
|
||||||
continue
|
continue
|
||||||
if word.pos in (NOUN, PROPN, PRON) and word.dep in np_deps:
|
if word.pos in (NOUN, PROPN, PRON) and word.dep in np_deps:
|
||||||
rbracket = word.i+1
|
rbracket = word.i + 1
|
||||||
# try to extend the span to the right
|
# try to extend the span to the right
|
||||||
# to capture close apposition/measurement constructions
|
# to capture close apposition/measurement constructions
|
||||||
for rdep in doc[word.i].rights:
|
for rdep in doc[word.i].rights:
|
||||||
if rdep.pos in (NOUN, PROPN) and rdep.dep == close_app:
|
if rdep.pos in (NOUN, PROPN) and rdep.dep == close_app:
|
||||||
rbracket = rdep.i+1
|
rbracket = rdep.i + 1
|
||||||
yield word.left_edge.i, rbracket, np_label
|
yield word.left_edge.i, rbracket, np_label
|
||||||
|
|
||||||
|
|
||||||
SYNTAX_ITERATORS = {
|
SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
|
||||||
'noun_chunks': noun_chunks
|
|
||||||
}
|
|
||||||
|
|
|
@ -6,61 +6,61 @@ from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON, AUX
|
||||||
|
|
||||||
|
|
||||||
TAG_MAP = {
|
TAG_MAP = {
|
||||||
"$(": {POS: PUNCT, "PunctType": "brck"},
|
"$(": {POS: PUNCT, "PunctType": "brck"},
|
||||||
"$,": {POS: PUNCT, "PunctType": "comm"},
|
"$,": {POS: PUNCT, "PunctType": "comm"},
|
||||||
"$.": {POS: PUNCT, "PunctType": "peri"},
|
"$.": {POS: PUNCT, "PunctType": "peri"},
|
||||||
"ADJA": {POS: ADJ},
|
"ADJA": {POS: ADJ},
|
||||||
"ADJD": {POS: ADJ, "Variant": "short"},
|
"ADJD": {POS: ADJ, "Variant": "short"},
|
||||||
"ADV": {POS: ADV},
|
"ADV": {POS: ADV},
|
||||||
"APPO": {POS: ADP, "AdpType": "post"},
|
"APPO": {POS: ADP, "AdpType": "post"},
|
||||||
"APPR": {POS: ADP, "AdpType": "prep"},
|
"APPR": {POS: ADP, "AdpType": "prep"},
|
||||||
"APPRART": {POS: ADP, "AdpType": "prep", "PronType": "art"},
|
"APPRART": {POS: ADP, "AdpType": "prep", "PronType": "art"},
|
||||||
"APZR": {POS: ADP, "AdpType": "circ"},
|
"APZR": {POS: ADP, "AdpType": "circ"},
|
||||||
"ART": {POS: DET, "PronType": "art"},
|
"ART": {POS: DET, "PronType": "art"},
|
||||||
"CARD": {POS: NUM, "NumType": "card"},
|
"CARD": {POS: NUM, "NumType": "card"},
|
||||||
"FM": {POS: X, "Foreign": "yes"},
|
"FM": {POS: X, "Foreign": "yes"},
|
||||||
"ITJ": {POS: INTJ},
|
"ITJ": {POS: INTJ},
|
||||||
"KOKOM": {POS: CONJ, "ConjType": "comp"},
|
"KOKOM": {POS: CONJ, "ConjType": "comp"},
|
||||||
"KON": {POS: CONJ},
|
"KON": {POS: CONJ},
|
||||||
"KOUI": {POS: SCONJ},
|
"KOUI": {POS: SCONJ},
|
||||||
"KOUS": {POS: SCONJ},
|
"KOUS": {POS: SCONJ},
|
||||||
"NE": {POS: PROPN},
|
"NE": {POS: PROPN},
|
||||||
"NNE": {POS: PROPN},
|
"NNE": {POS: PROPN},
|
||||||
"NN": {POS: NOUN},
|
"NN": {POS: NOUN},
|
||||||
"PAV": {POS: ADV, "PronType": "dem"},
|
"PAV": {POS: ADV, "PronType": "dem"},
|
||||||
"PROAV": {POS: ADV, "PronType": "dem"},
|
"PROAV": {POS: ADV, "PronType": "dem"},
|
||||||
"PDAT": {POS: DET, "PronType": "dem"},
|
"PDAT": {POS: DET, "PronType": "dem"},
|
||||||
"PDS": {POS: PRON, "PronType": "dem"},
|
"PDS": {POS: PRON, "PronType": "dem"},
|
||||||
"PIAT": {POS: DET, "PronType": "ind|neg|tot"},
|
"PIAT": {POS: DET, "PronType": "ind|neg|tot"},
|
||||||
"PIDAT": {POS: DET, "AdjType": "pdt", "PronType": "ind|neg|tot"},
|
"PIDAT": {POS: DET, "AdjType": "pdt", "PronType": "ind|neg|tot"},
|
||||||
"PIS": {POS: PRON, "PronType": "ind|neg|tot"},
|
"PIS": {POS: PRON, "PronType": "ind|neg|tot"},
|
||||||
"PPER": {POS: PRON, "PronType": "prs"},
|
"PPER": {POS: PRON, "PronType": "prs"},
|
||||||
"PPOSAT": {POS: DET, "Poss": "yes", "PronType": "prs"},
|
"PPOSAT": {POS: DET, "Poss": "yes", "PronType": "prs"},
|
||||||
"PPOSS": {POS: PRON, "Poss": "yes", "PronType": "prs"},
|
"PPOSS": {POS: PRON, "Poss": "yes", "PronType": "prs"},
|
||||||
"PRELAT": {POS: DET, "PronType": "rel"},
|
"PRELAT": {POS: DET, "PronType": "rel"},
|
||||||
"PRELS": {POS: PRON, "PronType": "rel"},
|
"PRELS": {POS: PRON, "PronType": "rel"},
|
||||||
"PRF": {POS: PRON, "PronType": "prs", "Reflex": "yes"},
|
"PRF": {POS: PRON, "PronType": "prs", "Reflex": "yes"},
|
||||||
"PTKA": {POS: PART},
|
"PTKA": {POS: PART},
|
||||||
"PTKANT": {POS: PART, "PartType": "res"},
|
"PTKANT": {POS: PART, "PartType": "res"},
|
||||||
"PTKNEG": {POS: PART, "Polarity": "Neg"},
|
"PTKNEG": {POS: PART, "Polarity": "Neg"},
|
||||||
"PTKVZ": {POS: PART, "PartType": "vbp"},
|
"PTKVZ": {POS: PART, "PartType": "vbp"},
|
||||||
"PTKZU": {POS: PART, "PartType": "inf"},
|
"PTKZU": {POS: PART, "PartType": "inf"},
|
||||||
"PWAT": {POS: DET, "PronType": "int"},
|
"PWAT": {POS: DET, "PronType": "int"},
|
||||||
"PWAV": {POS: ADV, "PronType": "int"},
|
"PWAV": {POS: ADV, "PronType": "int"},
|
||||||
"PWS": {POS: PRON, "PronType": "int"},
|
"PWS": {POS: PRON, "PronType": "int"},
|
||||||
"TRUNC": {POS: X, "Hyph": "yes"},
|
"TRUNC": {POS: X, "Hyph": "yes"},
|
||||||
"VAFIN": {POS: AUX, "Mood": "ind", "VerbForm": "fin"},
|
"VAFIN": {POS: AUX, "Mood": "ind", "VerbForm": "fin"},
|
||||||
"VAIMP": {POS: AUX, "Mood": "imp", "VerbForm": "fin"},
|
"VAIMP": {POS: AUX, "Mood": "imp", "VerbForm": "fin"},
|
||||||
"VAINF": {POS: AUX, "VerbForm": "inf"},
|
"VAINF": {POS: AUX, "VerbForm": "inf"},
|
||||||
"VAPP": {POS: AUX, "Aspect": "perf", "VerbForm": "part"},
|
"VAPP": {POS: AUX, "Aspect": "perf", "VerbForm": "part"},
|
||||||
"VMFIN": {POS: VERB, "Mood": "ind", "VerbForm": "fin", "VerbType": "mod"},
|
"VMFIN": {POS: VERB, "Mood": "ind", "VerbForm": "fin", "VerbType": "mod"},
|
||||||
"VMINF": {POS: VERB, "VerbForm": "inf", "VerbType": "mod"},
|
"VMINF": {POS: VERB, "VerbForm": "inf", "VerbType": "mod"},
|
||||||
"VMPP": {POS: VERB, "Aspect": "perf", "VerbForm": "part", "VerbType": "mod"},
|
"VMPP": {POS: VERB, "Aspect": "perf", "VerbForm": "part", "VerbType": "mod"},
|
||||||
"VVFIN": {POS: VERB, "Mood": "ind", "VerbForm": "fin"},
|
"VVFIN": {POS: VERB, "Mood": "ind", "VerbForm": "fin"},
|
||||||
"VVIMP": {POS: VERB, "Mood": "imp", "VerbForm": "fin"},
|
"VVIMP": {POS: VERB, "Mood": "imp", "VerbForm": "fin"},
|
||||||
"VVINF": {POS: VERB, "VerbForm": "inf"},
|
"VVINF": {POS: VERB, "VerbForm": "inf"},
|
||||||
"VVIZU": {POS: VERB, "VerbForm": "inf"},
|
"VVIZU": {POS: VERB, "VerbForm": "inf"},
|
||||||
"VVPP": {POS: VERB, "Aspect": "perf", "VerbForm": "part"},
|
"VVPP": {POS: VERB, "Aspect": "perf", "VerbForm": "part"},
|
||||||
"XY": {POS: X},
|
"XY": {POS: X},
|
||||||
"_SP": {POS: SPACE}
|
"_SP": {POS: SPACE},
|
||||||
}
|
}
|
||||||
|
|
|
@ -5,49 +5,41 @@ from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA
|
||||||
|
|
||||||
|
|
||||||
_exc = {
|
_exc = {
|
||||||
"auf'm": [
|
"auf'm": [{ORTH: "auf", LEMMA: "auf"}, {ORTH: "'m", LEMMA: "der", NORM: "dem"}],
|
||||||
{ORTH: "auf", LEMMA: "auf"},
|
|
||||||
{ORTH: "'m", LEMMA: "der", NORM: "dem"}],
|
|
||||||
|
|
||||||
"du's": [
|
"du's": [
|
||||||
{ORTH: "du", LEMMA: PRON_LEMMA, TAG: "PPER"},
|
{ORTH: "du", LEMMA: PRON_LEMMA, TAG: "PPER"},
|
||||||
{ORTH: "'s", LEMMA: PRON_LEMMA, TAG: "PPER", NORM: "es"}],
|
{ORTH: "'s", LEMMA: PRON_LEMMA, TAG: "PPER", NORM: "es"},
|
||||||
|
],
|
||||||
"er's": [
|
"er's": [
|
||||||
{ORTH: "er", LEMMA: PRON_LEMMA, TAG: "PPER"},
|
{ORTH: "er", LEMMA: PRON_LEMMA, TAG: "PPER"},
|
||||||
{ORTH: "'s", LEMMA: PRON_LEMMA, TAG: "PPER", NORM: "es"}],
|
{ORTH: "'s", LEMMA: PRON_LEMMA, TAG: "PPER", NORM: "es"},
|
||||||
|
],
|
||||||
"hinter'm": [
|
"hinter'm": [
|
||||||
{ORTH: "hinter", LEMMA: "hinter"},
|
{ORTH: "hinter", LEMMA: "hinter"},
|
||||||
{ORTH: "'m", LEMMA: "der", NORM: "dem"}],
|
{ORTH: "'m", LEMMA: "der", NORM: "dem"},
|
||||||
|
],
|
||||||
"ich's": [
|
"ich's": [
|
||||||
{ORTH: "ich", LEMMA: PRON_LEMMA, TAG: "PPER"},
|
{ORTH: "ich", LEMMA: PRON_LEMMA, TAG: "PPER"},
|
||||||
{ORTH: "'s", LEMMA: PRON_LEMMA, TAG: "PPER", NORM: "es"}],
|
{ORTH: "'s", LEMMA: PRON_LEMMA, TAG: "PPER", NORM: "es"},
|
||||||
|
],
|
||||||
"ihr's": [
|
"ihr's": [
|
||||||
{ORTH: "ihr", LEMMA: PRON_LEMMA, TAG: "PPER"},
|
{ORTH: "ihr", LEMMA: PRON_LEMMA, TAG: "PPER"},
|
||||||
{ORTH: "'s", LEMMA: PRON_LEMMA, TAG: "PPER", NORM: "es"}],
|
{ORTH: "'s", LEMMA: PRON_LEMMA, TAG: "PPER", NORM: "es"},
|
||||||
|
],
|
||||||
"sie's": [
|
"sie's": [
|
||||||
{ORTH: "sie", LEMMA: PRON_LEMMA, TAG: "PPER"},
|
{ORTH: "sie", LEMMA: PRON_LEMMA, TAG: "PPER"},
|
||||||
{ORTH: "'s", LEMMA: PRON_LEMMA, TAG: "PPER", NORM: "es"}],
|
{ORTH: "'s", LEMMA: PRON_LEMMA, TAG: "PPER", NORM: "es"},
|
||||||
|
],
|
||||||
"unter'm": [
|
"unter'm": [
|
||||||
{ORTH: "unter", LEMMA: "unter"},
|
{ORTH: "unter", LEMMA: "unter"},
|
||||||
{ORTH: "'m", LEMMA: "der", NORM: "dem"}],
|
{ORTH: "'m", LEMMA: "der", NORM: "dem"},
|
||||||
|
],
|
||||||
"vor'm": [
|
"vor'm": [{ORTH: "vor", LEMMA: "vor"}, {ORTH: "'m", LEMMA: "der", NORM: "dem"}],
|
||||||
{ORTH: "vor", LEMMA: "vor"},
|
|
||||||
{ORTH: "'m", LEMMA: "der", NORM: "dem"}],
|
|
||||||
|
|
||||||
"wir's": [
|
"wir's": [
|
||||||
{ORTH: "wir", LEMMA: PRON_LEMMA, TAG: "PPER"},
|
{ORTH: "wir", LEMMA: PRON_LEMMA, TAG: "PPER"},
|
||||||
{ORTH: "'s", LEMMA: PRON_LEMMA, TAG: "PPER", NORM: "es"}],
|
{ORTH: "'s", LEMMA: PRON_LEMMA, TAG: "PPER", NORM: "es"},
|
||||||
|
],
|
||||||
"über'm": [
|
"über'm": [{ORTH: "über", LEMMA: "über"}, {ORTH: "'m", LEMMA: "der", NORM: "dem"}],
|
||||||
{ORTH: "über", LEMMA: "über"},
|
|
||||||
{ORTH: "'m", LEMMA: "der", NORM: "dem"}]
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -162,21 +154,95 @@ for exc_data in [
|
||||||
{ORTH: "z.Zt.", LEMMA: "zur Zeit"},
|
{ORTH: "z.Zt.", LEMMA: "zur Zeit"},
|
||||||
{ORTH: "z.b.", LEMMA: "zum Beispiel"},
|
{ORTH: "z.b.", LEMMA: "zum Beispiel"},
|
||||||
{ORTH: "zzgl.", LEMMA: "zuzüglich"},
|
{ORTH: "zzgl.", LEMMA: "zuzüglich"},
|
||||||
{ORTH: "österr.", LEMMA: "österreichisch", NORM: "österreichisch"}]:
|
{ORTH: "österr.", LEMMA: "österreichisch", NORM: "österreichisch"},
|
||||||
|
]:
|
||||||
_exc[exc_data[ORTH]] = [exc_data]
|
_exc[exc_data[ORTH]] = [exc_data]
|
||||||
|
|
||||||
|
|
||||||
for orth in [
|
for orth in [
|
||||||
"A.C.", "a.D.", "A.D.", "A.G.", "a.M.", "a.Z.", "Abs.", "adv.", "al.",
|
"A.C.",
|
||||||
"B.A.", "B.Sc.", "betr.", "biol.", "Biol.", "ca.", "Chr.", "Cie.", "co.",
|
"a.D.",
|
||||||
"Co.", "D.C.", "Dipl.-Ing.", "Dipl.", "Dr.", "e.g.", "e.V.", "ehem.",
|
"A.D.",
|
||||||
"entspr.", "erm.", "etc.", "ev.", "G.m.b.H.", "geb.", "Gebr.", "gem.",
|
"A.G.",
|
||||||
"h.c.", "Hg.", "hrsg.", "Hrsg.", "i.A.", "i.e.", "i.G.", "i.Tr.", "i.V.",
|
"a.M.",
|
||||||
"Ing.", "jr.", "Jr.", "jun.", "jur.", "K.O.", "L.A.", "lat.", "M.A.",
|
"a.Z.",
|
||||||
"m.E.", "m.M.", "M.Sc.", "Mr.", "N.Y.", "N.Y.C.", "nat.", "o.a.",
|
"Abs.",
|
||||||
"o.ä.", "o.g.", "o.k.", "O.K.", "p.a.", "p.s.", "P.S.", "pers.", "phil.",
|
"adv.",
|
||||||
"q.e.d.", "R.I.P.", "rer.", "sen.", "St.", "std.", "u.a.", "U.S.", "U.S.A.",
|
"al.",
|
||||||
"U.S.S.", "Vol.", "vs.", "wiss."]:
|
"B.A.",
|
||||||
|
"B.Sc.",
|
||||||
|
"betr.",
|
||||||
|
"biol.",
|
||||||
|
"Biol.",
|
||||||
|
"ca.",
|
||||||
|
"Chr.",
|
||||||
|
"Cie.",
|
||||||
|
"co.",
|
||||||
|
"Co.",
|
||||||
|
"D.C.",
|
||||||
|
"Dipl.-Ing.",
|
||||||
|
"Dipl.",
|
||||||
|
"Dr.",
|
||||||
|
"e.g.",
|
||||||
|
"e.V.",
|
||||||
|
"ehem.",
|
||||||
|
"entspr.",
|
||||||
|
"erm.",
|
||||||
|
"etc.",
|
||||||
|
"ev.",
|
||||||
|
"G.m.b.H.",
|
||||||
|
"geb.",
|
||||||
|
"Gebr.",
|
||||||
|
"gem.",
|
||||||
|
"h.c.",
|
||||||
|
"Hg.",
|
||||||
|
"hrsg.",
|
||||||
|
"Hrsg.",
|
||||||
|
"i.A.",
|
||||||
|
"i.e.",
|
||||||
|
"i.G.",
|
||||||
|
"i.Tr.",
|
||||||
|
"i.V.",
|
||||||
|
"Ing.",
|
||||||
|
"jr.",
|
||||||
|
"Jr.",
|
||||||
|
"jun.",
|
||||||
|
"jur.",
|
||||||
|
"K.O.",
|
||||||
|
"L.A.",
|
||||||
|
"lat.",
|
||||||
|
"M.A.",
|
||||||
|
"m.E.",
|
||||||
|
"m.M.",
|
||||||
|
"M.Sc.",
|
||||||
|
"Mr.",
|
||||||
|
"N.Y.",
|
||||||
|
"N.Y.C.",
|
||||||
|
"nat.",
|
||||||
|
"o.a.",
|
||||||
|
"o.ä.",
|
||||||
|
"o.g.",
|
||||||
|
"o.k.",
|
||||||
|
"O.K.",
|
||||||
|
"p.a.",
|
||||||
|
"p.s.",
|
||||||
|
"P.S.",
|
||||||
|
"pers.",
|
||||||
|
"phil.",
|
||||||
|
"q.e.d.",
|
||||||
|
"R.I.P.",
|
||||||
|
"rer.",
|
||||||
|
"sen.",
|
||||||
|
"St.",
|
||||||
|
"std.",
|
||||||
|
"u.a.",
|
||||||
|
"U.S.",
|
||||||
|
"U.S.A.",
|
||||||
|
"U.S.S.",
|
||||||
|
"Vol.",
|
||||||
|
"vs.",
|
||||||
|
"wiss.",
|
||||||
|
]:
|
||||||
_exc[orth] = [{ORTH: orth}]
|
_exc[orth] = [{ORTH: orth}]
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -21,9 +21,10 @@ from ...util import update_exc, add_lookups
|
||||||
class GreekDefaults(Language.Defaults):
|
class GreekDefaults(Language.Defaults):
|
||||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||||
lex_attr_getters.update(LEX_ATTRS)
|
lex_attr_getters.update(LEX_ATTRS)
|
||||||
lex_attr_getters[LANG] = lambda text: 'el' # ISO code
|
lex_attr_getters[LANG] = lambda text: "el" # ISO code
|
||||||
lex_attr_getters[NORM] = add_lookups(
|
lex_attr_getters[NORM] = add_lookups(
|
||||||
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS)
|
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS
|
||||||
|
)
|
||||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
tag_map = TAG_MAP
|
tag_map = TAG_MAP
|
||||||
|
@ -37,15 +38,16 @@ class GreekDefaults(Language.Defaults):
|
||||||
lemma_rules = LEMMA_RULES
|
lemma_rules = LEMMA_RULES
|
||||||
lemma_index = LEMMA_INDEX
|
lemma_index = LEMMA_INDEX
|
||||||
lemma_exc = LEMMA_EXC
|
lemma_exc = LEMMA_EXC
|
||||||
return GreekLemmatizer(index=lemma_index, exceptions=lemma_exc,
|
return GreekLemmatizer(
|
||||||
rules=lemma_rules)
|
index=lemma_index, exceptions=lemma_exc, rules=lemma_rules
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class Greek(Language):
|
class Greek(Language):
|
||||||
|
|
||||||
lang = 'el' # ISO code
|
lang = "el" # ISO code
|
||||||
Defaults = GreekDefaults # set Defaults to custom language defaults
|
Defaults = GreekDefaults # set Defaults to custom language defaults
|
||||||
|
|
||||||
|
|
||||||
# set default export – this allows the language class to be lazy-loaded
|
# set default export – this allows the language class to be lazy-loaded
|
||||||
__all__ = ['Greek']
|
__all__ = ["Greek"]
|
||||||
|
|
|
@ -9,20 +9,20 @@ Example sentences to test spaCy and its language models.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
sentences = [
|
sentences = [
|
||||||
'''Η άνιση κατανομή του πλούτου και του εισοδήματος, η οποία έχει λάβει
|
"""Η άνιση κατανομή του πλούτου και του εισοδήματος, η οποία έχει λάβει
|
||||||
τρομερές διαστάσεις, δεν δείχνει τάσεις βελτίωσης.''',
|
τρομερές διαστάσεις, δεν δείχνει τάσεις βελτίωσης.""",
|
||||||
'''Ο στόχος της σύντομης αυτής έκθεσης είναι να συνοψίσει τα κυριότερα
|
"""Ο στόχος της σύντομης αυτής έκθεσης είναι να συνοψίσει τα κυριότερα
|
||||||
συμπεράσματα των επισκοπήσεων κάθε μιας χώρας.''',
|
συμπεράσματα των επισκοπήσεων κάθε μιας χώρας.""",
|
||||||
'''Μέχρι αργά χθες το βράδυ ο πλοιοκτήτης παρέμενε έξω από το γραφείο του
|
"""Μέχρι αργά χθες το βράδυ ο πλοιοκτήτης παρέμενε έξω από το γραφείο του
|
||||||
γενικού γραμματέα του υπουργείου, ενώ είχε μόνον τηλεφωνική επικοινωνία με
|
γενικού γραμματέα του υπουργείου, ενώ είχε μόνον τηλεφωνική επικοινωνία με
|
||||||
τον υπουργό.''',
|
τον υπουργό.""",
|
||||||
'''Σύμφωνα με καλά ενημερωμένη πηγή, από την επεξεργασία του προέκυψε ότι
|
"""Σύμφωνα με καλά ενημερωμένη πηγή, από την επεξεργασία του προέκυψε ότι
|
||||||
οι δράστες της επίθεσης ήταν δύο, καθώς και ότι προσέγγισαν και αποχώρησαν
|
οι δράστες της επίθεσης ήταν δύο, καθώς και ότι προσέγγισαν και αποχώρησαν
|
||||||
από το σημείο με μοτοσικλέτα.''',
|
από το σημείο με μοτοσικλέτα.""",
|
||||||
"Η υποδομή καταλυμάτων στην Ελλάδα είναι πλήρης και ανανεώνεται συνεχώς.",
|
"Η υποδομή καταλυμάτων στην Ελλάδα είναι πλήρης και ανανεώνεται συνεχώς.",
|
||||||
'''Το επείγον ταχυδρομείο (ήτοι το παραδοτέο εντός 48 ωρών το πολύ) μπορεί
|
"""Το επείγον ταχυδρομείο (ήτοι το παραδοτέο εντός 48 ωρών το πολύ) μπορεί
|
||||||
να μεταφέρεται αεροπορικώς μόνον εφόσον εφαρμόζονται οι κανόνες
|
να μεταφέρεται αεροπορικώς μόνον εφόσον εφαρμόζονται οι κανόνες
|
||||||
ασφαλείας''',
|
ασφαλείας""",
|
||||||
''''Στις ορεινές περιοχές του νησιού οι χιονοπτώσεις και οι παγετοί είναι
|
"""'Στις ορεινές περιοχές του νησιού οι χιονοπτώσεις και οι παγετοί είναι
|
||||||
περιορισμένοι ενώ στις παραθαλάσσιες περιοχές σημειώνονται σπανίως.'''
|
περιορισμένοι ενώ στις παραθαλάσσιες περιοχές σημειώνονται σπανίως.""",
|
||||||
]
|
]
|
||||||
|
|
|
@ -12,10 +12,19 @@ from ._verbs import VERBS
|
||||||
from ._lemma_rules import ADJECTIVE_RULES, NOUN_RULES, VERB_RULES, PUNCT_RULES
|
from ._lemma_rules import ADJECTIVE_RULES, NOUN_RULES, VERB_RULES, PUNCT_RULES
|
||||||
|
|
||||||
|
|
||||||
LEMMA_INDEX = {'adj': ADJECTIVES, 'adv': ADVERBS, 'noun': NOUNS, 'verb': VERBS}
|
LEMMA_INDEX = {"adj": ADJECTIVES, "adv": ADVERBS, "noun": NOUNS, "verb": VERBS}
|
||||||
|
|
||||||
|
|
||||||
LEMMA_RULES = {'adj': ADJECTIVE_RULES, 'noun': NOUN_RULES, 'verb': VERB_RULES,
|
LEMMA_RULES = {
|
||||||
'punct': PUNCT_RULES}
|
"adj": ADJECTIVE_RULES,
|
||||||
|
"noun": NOUN_RULES,
|
||||||
|
"verb": VERB_RULES,
|
||||||
|
"punct": PUNCT_RULES,
|
||||||
|
}
|
||||||
|
|
||||||
LEMMA_EXC = {'adj': ADJECTIVES_IRREG, 'noun': NOUNS_IRREG, 'det': DETS_IRREG, 'verb': VERBS_IRREG}
|
LEMMA_EXC = {
|
||||||
|
"adj": ADJECTIVES_IRREG,
|
||||||
|
"noun": NOUNS_IRREG,
|
||||||
|
"det": DETS_IRREG,
|
||||||
|
"verb": VERBS_IRREG,
|
||||||
|
}
|
||||||
|
|
|
@ -1,6 +1,8 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
ADJECTIVES = set("""
|
|
||||||
|
ADJECTIVES = set(
|
||||||
|
"""
|
||||||
n-διάστατος µεταφυτρωτικός άβαθος άβαλτος άβαρος άβατος άβαφος άβγαλτος άβιος
|
n-διάστατος µεταφυτρωτικός άβαθος άβαλτος άβαρος άβατος άβαφος άβγαλτος άβιος
|
||||||
άβλαπτος άβλεπτος άβολος άβουλος άβραστος άβρεχτος άβροχος άβυθος άγαμος
|
άβλαπτος άβλεπτος άβολος άβουλος άβραστος άβρεχτος άβροχος άβυθος άγαμος
|
||||||
άγγιχτος άγδαρτος άγδυτος άγευστος άγιος άγλυκος άγλωσσος άγναθος άγναντος
|
άγγιχτος άγδαρτος άγδυτος άγευστος άγιος άγλυκος άγλωσσος άγναθος άγναντος
|
||||||
|
@ -2438,4 +2440,5 @@ ADJECTIVES = set("""
|
||||||
όμορφος όνειος όξινος όρθιος όσιος όφκαιρος όψια όψιμος ύπανδρος ύπατος
|
όμορφος όνειος όξινος όρθιος όσιος όφκαιρος όψια όψιμος ύπανδρος ύπατος
|
||||||
ύπουλος ύπτιος ύστατος ύστερος ύψιστος ώριμος ώριος ἀγκυλωτός ἀκαταμέτρητος
|
ύπουλος ύπτιος ύστατος ύστερος ύψιστος ώριμος ώριος ἀγκυλωτός ἀκαταμέτρητος
|
||||||
ἄπειρος ἄτροπος ἐλαφρός ἐνεστώς ἐνυπόστατος ἔναυλος ἥττων ἰσχυρός ἵστωρ
|
ἄπειρος ἄτροπος ἐλαφρός ἐνεστώς ἐνυπόστατος ἔναυλος ἥττων ἰσχυρός ἵστωρ
|
||||||
""".split())
|
""".split()
|
||||||
|
)
|
||||||
|
|
|
@ -32,5 +32,4 @@ ADJECTIVES_IRREG = {
|
||||||
"πολύς": ("πολύ",),
|
"πολύς": ("πολύ",),
|
||||||
"πολλύ": ("πολύ",),
|
"πολλύ": ("πολύ",),
|
||||||
"πολλύς": ("πολύ",),
|
"πολλύς": ("πολύ",),
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,6 +1,8 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
ADVERBS = set("""
|
|
||||||
|
ADVERBS = set(
|
||||||
|
"""
|
||||||
άβλαβα άβολα άβουλα άγαν άγαρμπα άγγιχτα άγνωμα άγρια άγρυπνα άδηλα άδικα
|
άβλαβα άβολα άβουλα άγαν άγαρμπα άγγιχτα άγνωμα άγρια άγρυπνα άδηλα άδικα
|
||||||
άδοξα άθελα άθλια άκαιρα άκακα άκαμπτα άκαρδα άκαρπα άκεφα άκομψα άκοπα άκοσμα
|
άδοξα άθελα άθλια άκαιρα άκακα άκαμπτα άκαρδα άκαρπα άκεφα άκομψα άκοπα άκοσμα
|
||||||
άκρως άκυρα άλαλα άλιωτα άλλοθεν άλλοτε άλλως άλλωστε άλογα άλυπα άμεμπτα
|
άκρως άκυρα άλαλα άλιωτα άλλοθεν άλλοτε άλλως άλλωστε άλογα άλυπα άμεμπτα
|
||||||
|
@ -861,4 +863,5 @@ ADVERBS = set("""
|
||||||
ψυχραντικά ψωροπερήφανα ψόφια ψύχραιμα ωδικώς ωμά ωρίμως ωραία ωραιότατα
|
ψυχραντικά ψωροπερήφανα ψόφια ψύχραιμα ωδικώς ωμά ωρίμως ωραία ωραιότατα
|
||||||
ωριαία ωριαίως ως ωσαύτως ωσεί ωφέλιμα ωφελίμως ωφελιμιστικά ωχρά όθε όθεν όλο
|
ωριαία ωριαίως ως ωσαύτως ωσεί ωφέλιμα ωφελίμως ωφελιμιστικά ωχρά όθε όθεν όλο
|
||||||
όμορφα όντως όξω όπισθεν όπου όπως όρθια όρτσα όσια όσο όχι όψιμα ύπερθεν
|
όμορφα όντως όξω όπισθεν όπου όπως όρθια όρτσα όσια όσο όχι όψιμα ύπερθεν
|
||||||
""".split())
|
""".split()
|
||||||
|
)
|
||||||
|
|
|
@ -1,5 +1,8 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
# Base (lemma) forms of the Greek determiners known to the lemmatizer.
DETS = {"ένας", "η", "ο", "το", "τη"}
|
||||||
|
|
|
@ -8,5 +8,5 @@ DETS_IRREG = {
|
||||||
"τους": ("το",),
|
"τους": ("το",),
|
||||||
"τις": ("τη",),
|
"τις": ("τη",),
|
||||||
"τα": ("το",),
|
"τα": ("το",),
|
||||||
"οι": ("ο","η"),
|
"οι": ("ο", "η"),
|
||||||
}
|
}
|
||||||
|
|
|
@ -140,17 +140,7 @@ VERB_RULES = [
|
||||||
["ξουμε", "ζω"],
|
["ξουμε", "ζω"],
|
||||||
["ξετε", "ζω"],
|
["ξετε", "ζω"],
|
||||||
["ξουν", "ζω"],
|
["ξουν", "ζω"],
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
# Suffix rewrite rules for punctuation: each entry is an [old, new] pair that
# maps a typographic quotation mark onto its plain ASCII counterpart.
PUNCT_RULES = [
    ["“", '"'],  # left double quotation mark
    ["”", '"'],  # right double quotation mark
    ["\u2018", "'"],  # left single quotation mark
    ["\u2019", "'"],  # right single quotation mark
]
|
|
||||||
|
|
|
@ -1,6 +1,8 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
NOUNS = set("""
|
|
||||||
|
NOUNS = set(
|
||||||
|
"""
|
||||||
-αλγία -βατώ -βατῶ -ούλα -πληξία -ώνυμο sofa table άβακας άβατο άβατον άβυσσος
|
-αλγία -βατώ -βατῶ -ούλα -πληξία -ώνυμο sofa table άβακας άβατο άβατον άβυσσος
|
||||||
άγανο άγαρ άγγελμα άγγελος άγγιγμα άγγισμα άγγλος άγημα άγιασμα άγιο φως
|
άγανο άγαρ άγγελμα άγγελος άγγιγμα άγγισμα άγγλος άγημα άγιασμα άγιο φως
|
||||||
άγκλισμα άγκυρα άγμα άγνοια άγνωστος άγονο άγος άγουρος άγουσα άγρα άγρευμα
|
άγκλισμα άγκυρα άγμα άγνοια άγνωστος άγονο άγος άγουρος άγουσα άγρα άγρευμα
|
||||||
|
@ -6066,4 +6068,5 @@ NOUNS = set("""
|
||||||
ἐντευκτήριον ἐντόσθια ἐξοικείωσις ἐξοχή ἐξωκκλήσιον ἐπίσκεψις ἐπίσχεστρον
|
ἐντευκτήριον ἐντόσθια ἐξοικείωσις ἐξοχή ἐξωκκλήσιον ἐπίσκεψις ἐπίσχεστρον
|
||||||
ἐρωτίς ἑρμηνεία ἔκθλιψις ἔκτισις ἔκτρωμα ἔπαλξις ἱππάρχας ἱππάρχης ἴς ἵππαρχος
|
ἐρωτίς ἑρμηνεία ἔκθλιψις ἔκτισις ἔκτρωμα ἔπαλξις ἱππάρχας ἱππάρχης ἴς ἵππαρχος
|
||||||
ὑστερικός ὕστερον ὠάριον ὠοθήκη ὠοθηκῖτις ὠοθυλάκιον ὠορρηξία ὠοσκόπιον
|
ὑστερικός ὕστερον ὠάριον ὠοθήκη ὠοθηκῖτις ὠοθυλάκιον ὠορρηξία ὠοσκόπιον
|
||||||
""".split())
|
""".split()
|
||||||
|
)
|
||||||
|
|
|
@ -1,6 +1,8 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
PARTICIPLES = set("""
|
|
||||||
|
PARTICIPLES = set(
|
||||||
|
"""
|
||||||
έρποντας έχοντας αβανιάζοντας αβγατισμένος αγαπημένος αγαπώντας αγγίζοντας
|
έρποντας έχοντας αβανιάζοντας αβγατισμένος αγαπημένος αγαπώντας αγγίζοντας
|
||||||
αγγιγμένος αγιασμένος αγιογραφώντας αγιοποιημένος αγιοποιώντας αγκαζαρισμένος
|
αγγιγμένος αγιασμένος αγιογραφώντας αγιοποιημένος αγιοποιώντας αγκαζαρισμένος
|
||||||
αγκιστρωμένος αγκυλωμένος αγκυροβολημένος αγλακώντας αγνοημένος αγνοούμενος
|
αγκιστρωμένος αγκυλωμένος αγκυροβολημένος αγλακώντας αγνοημένος αγνοούμενος
|
||||||
|
@ -941,4 +943,5 @@ PARTICIPLES = set("""
|
||||||
ψιλούμενος ψοφολογώντας ψυχογραφώντας ψυχολογημένος ψυχομαχώντας ψυχομαχώντας
|
ψιλούμενος ψοφολογώντας ψυχογραφώντας ψυχολογημένος ψυχομαχώντας ψυχομαχώντας
|
||||||
ψυχορραγώντας ψυχρηλατώντας ψυχωμένος ψωμοζητώντας ψωμοζώντας ψωμωμένος
|
ψυχορραγώντας ψυχρηλατώντας ψυχωμένος ψωμοζητώντας ψωμοζώντας ψωμωμένος
|
||||||
ωθηθείς ωθώντας ωραιοποιημένος ωραιοποιώντας ωρυόμενος ωτοσκοπώντας όντας
|
ωθηθείς ωθώντας ωραιοποιημένος ωραιοποιώντας ωρυόμενος ωτοσκοπώντας όντας
|
||||||
""".split())
|
""".split()
|
||||||
|
)
|
||||||
|
|
|
@ -1,5 +1,8 @@
|
||||||
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
PROPER_NAMES = set("""
|
|
||||||
|
PROPER_NAMES = set(
|
||||||
|
"""
|
||||||
άαχεν άβαρος άβδηρα άβελ άβιλα άβολα άγγελοι άγγελος άγιο πνεύμα
|
άαχεν άβαρος άβδηρα άβελ άβιλα άβολα άγγελοι άγγελος άγιο πνεύμα
|
||||||
άγιοι τόποι άγιον όρος άγιος αθανάσιος άγιος αναστάσιος άγιος αντώνιος
|
άγιοι τόποι άγιον όρος άγιος αθανάσιος άγιος αναστάσιος άγιος αντώνιος
|
||||||
άγιος αριστείδης άγιος βαρθολομαίος άγιος βασίλειος άγιος βασίλης
|
άγιος αριστείδης άγιος βαρθολομαίος άγιος βασίλειος άγιος βασίλης
|
||||||
|
@ -641,4 +644,5 @@ PROPER_NAMES = set("""
|
||||||
ωρολόγιον ωρωπός ωσηέ όγκα όγκατα όγκι όθρυς όθων όιτα όλγα όλιβερ όλυμπος
|
ωρολόγιον ωρωπός ωσηέ όγκα όγκατα όγκι όθρυς όθων όιτα όλγα όλιβερ όλυμπος
|
||||||
όμουρα όμπιδος όνειρος όνο όρεγκον όσακι όσατο όσκαρ όσλο όταμα ότσου όφενμπαχ
|
όμουρα όμπιδος όνειρος όνο όρεγκον όσακι όσατο όσκαρ όσλο όταμα ότσου όφενμπαχ
|
||||||
όχιρα ύδρα ύδρος ύψιστος ώλενος ώρες ώρχους ώστιν ἀλεξανδρούπολις ἀμαλιούπολις
|
όχιρα ύδρα ύδρος ύψιστος ώλενος ώρες ώρχους ώστιν ἀλεξανδρούπολις ἀμαλιούπολις
|
||||||
""".split())
|
""".split()
|
||||||
|
)
|
||||||
|
|
|
@ -1,6 +1,8 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
VERBS = set("""
|
|
||||||
|
VERBS = set(
|
||||||
|
"""
|
||||||
'γγίζω άγομαι άγχομαι άγω άδω άπτομαι άπωσον άρχομαι άρχω άφτω έγκειται έκιοσε
|
'γγίζω άγομαι άγχομαι άγω άδω άπτομαι άπωσον άρχομαι άρχω άφτω έγκειται έκιοσε
|
||||||
έπομαι έρπω έρχομαι έστω έχω ήγγικεν ήθελε ίπταμαι ίσταμαι αίρομαι αίρω
|
έπομαι έρπω έρχομαι έστω έχω ήγγικεν ήθελε ίπταμαι ίσταμαι αίρομαι αίρω
|
||||||
αβαντάρω αβαντζάρω αβαντσάρω αβαράρω αβασκαίνω αβγατίζω αβγαταίνω αβγοκόβω
|
αβαντάρω αβαντζάρω αβαντσάρω αβαράρω αβασκαίνω αβγατίζω αβγαταίνω αβγοκόβω
|
||||||
|
@ -1186,4 +1188,5 @@ VERBS = set("""
|
||||||
ωρύομαι ωτακουστώ ωτοσκοπώ ωφελούμαι ωφελώ ωχραίνω ωχριώ όζω όψομαι ἀδικῶ
|
ωρύομαι ωτακουστώ ωτοσκοπώ ωφελούμαι ωφελώ ωχραίνω ωχριώ όζω όψομαι ἀδικῶ
|
||||||
ἀκροῶμαι ἀλέθω ἀμελῶ ἀναπτερυγιάζω ἀναπτερώνω ἀναπτερώνω ἀνασαίνω ἀναταράσσω
|
ἀκροῶμαι ἀλέθω ἀμελῶ ἀναπτερυγιάζω ἀναπτερώνω ἀναπτερώνω ἀνασαίνω ἀναταράσσω
|
||||||
ἀναφτερουγίζω ἀναφτερουγιάζω ἀναφτερώνω ἀναχωρίζω ἀντιμετρῶ ἀράζω ἀφοδεύω
|
ἀναφτερουγίζω ἀναφτερουγιάζω ἀναφτερώνω ἀναχωρίζω ἀντιμετρῶ ἀράζω ἀφοδεύω
|
||||||
""".split())
|
""".split()
|
||||||
|
)
|
||||||
|
|
|
@ -1,200 +1,198 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
# Irregular Greek verb forms mapped to their lemma(s).
#
# Every value MUST be a tuple of lemma strings. The lemmatizer feeds the value
# straight into ``list.extend``, so a bare string such as ("λέω") would be
# split into single characters instead of being added as one lemma.
VERBS_IRREG = {
    # είμαι
    "είσαι": ("είμαι",),
    "είναι": ("είμαι",),
    "είμαστε": ("είμαι",),
    "είστε": ("είμαι",),
    "είσαστε": ("είμαι",),
    "ήμουν": ("είμαι",),
    "ήσουν": ("είμαι",),
    "ήταν": ("είμαι",),
    "ήμαστε": ("είμαι",),
    "ήμασταν": ("είμαι",),
    # λέω
    "είπα": ("λέω",),
    "είπες": ("λέω",),
    "είπε": ("λέω",),
    "είπαμε": ("λέω",),
    "είπατε": ("λέω",),
    "είπαν": ("λέω",),
    "είπανε": ("λέω",),
    "πει": ("λέω",),  # was ("λέω"): a plain string, not a tuple
    "πω": ("λέω",),  # was ("λέω"): a plain string, not a tuple
    # πηγαίνω
    "πάω": ("πηγαίνω",),
    "πάς": ("πηγαίνω",),
    "πας": ("πηγαίνω",),
    "πάει": ("πηγαίνω",),
    "πάμε": ("πηγαίνω",),
    "πάτε": ("πηγαίνω",),
    "πάνε": ("πηγαίνω",),
    "πήγα": ("πηγαίνω",),
    "πήγες": ("πηγαίνω",),
    "πήγε": ("πηγαίνω",),
    "πήγαμε": ("πηγαίνω",),
    "πήγατε": ("πηγαίνω",),
    "πήγαν": ("πηγαίνω",),
    "πήγανε": ("πηγαίνω",),
    # παίζω
    "έπαιζα": ("παίζω",),
    "έπαιζες": ("παίζω",),
    "έπαιζε": ("παίζω",),
    "έπαιζαν": ("παίζω",),  # was "παίζω," with a stray comma in the lemma
    "έπαιξα": ("παίζω",),
    "έπαιξες": ("παίζω",),
    "έπαιξε": ("παίζω",),
    # τρώω
    "έτρωγα": ("τρώω",),
    "έτρωγες": ("τρώω",),
    "έτρωγε": ("τρώω",),
    "έτρωγαν": ("τρώω",),
    # έχω
    "είχα": ("έχω",),
    "είχες": ("έχω",),
    "είχε": ("έχω",),
    "είχαμε": ("έχω",),
    "είχατε": ("έχω",),
    "είχαν": ("έχω",),
    "είχανε": ("έχω",),
    # παίρνω
    "έπαιρνα": ("παίρνω",),
    "έπαιρνες": ("παίρνω",),
    "έπαιρνε": ("παίρνω",),
    "έπαιρναν": ("παίρνω",),
    # δίνω
    "εδίνα": ("δίνω",),
    "εδίνες": ("δίνω",),
    "εδίνε": ("δίνω",),
    "εδίναν": ("δίνω",),
    # κάνω
    "έκανα": ("κάνω",),
    "έκανες": ("κάνω",),
    "έκανε": ("κάνω",),
    "έκαναν": ("κάνω",),
    # θέλω
    "ήθελα": ("θέλω",),
    "ήθελες": ("θέλω",),
    "ήθελε": ("θέλω",),
    "ήθελαν": ("θέλω",),
    # βλέπω
    "έβλεπα": ("βλέπω",),
    "έβλεπες": ("βλέπω",),
    "έβλεπε": ("βλέπω",),
    "έβλεπαν": ("βλέπω",),
    "είδα": ("βλέπω",),
    "είδες": ("βλέπω",),
    "είδε": ("βλέπω",),
    "είδαμε": ("βλέπω",),
    "είδατε": ("βλέπω",),
    "είδαν": ("βλέπω",),
    # φέρνω
    "έφερνα": ("φέρνω",),
    "έφερνες": ("φέρνω",),
    "έφερνε": ("φέρνω",),
    "έφερναν": ("φέρνω",),
    # φέρω
    "έφερα": ("φέρω",),
    "έφερες": ("φέρω",),
    "έφερε": ("φέρω",),
    "έφεραν": ("φέρω",),
    # λαμβάνω
    "έλαβα": ("λαμβάνω",),
    "έλαβες": ("λαμβάνω",),
    "έλαβε": ("λαμβάνω",),
    "έλαβαν": ("λαμβάνω",),
    # βρίσκω
    "έβρισκα": ("βρίσκω",),
    "έβρισκες": ("βρίσκω",),
    "έβρισκε": ("βρίσκω",),
    "έβρισκαν": ("βρίσκω",),
    # ξέρω
    "ήξερα": ("ξέρω",),
    "ήξερες": ("ξέρω",),
    "ήξερε": ("ξέρω",),
    "ήξεραν": ("ξέρω",),
    # αναφέρω
    "ανέφερα": ("αναφέρω",),
    "ανέφερες": ("αναφέρω",),
    "ανέφερε": ("αναφέρω",),
    "ανέφεραν": ("αναφέρω",),
    # βάζω
    "έβαζα": ("βάζω",),
    "έβαζες": ("βάζω",),
    "έβαζε": ("βάζω",),
    "έβαζαν": ("βάζω",),
    # μένω
    "έμεινα": ("μένω",),
    "έμεινες": ("μένω",),
    "έμεινε": ("μένω",),
    "έμειναν": ("μένω",),
    # βγάζω
    "έβγαζα": ("βγάζω",),
    "έβγαζες": ("βγάζω",),
    "έβγαζε": ("βγάζω",),
    "έβγαζαν": ("βγάζω",),
    # μπαίνω
    "έμπαινα": ("μπαίνω",),
    "έμπαινες": ("μπαίνω",),
    "έμπαινε": ("μπαίνω",),
    "έμπαιναν": ("μπαίνω",),
    # βγαίνω
    "βγήκα": ("βγαίνω",),
    "βγήκες": ("βγαίνω",),
    "βγήκε": ("βγαίνω",),
    "βγήκαμε": ("βγαίνω",),
    "βγήκατε": ("βγαίνω",),
    "βγήκαν": ("βγαίνω",),
    # πέφτω
    "έπεφτα": ("πέφτω",),
    "έπεφτες": ("πέφτω",),
    "έπεφτε": ("πέφτω",),
    "έπεφταν": ("πέφτω",),
    "έπεσα": ("πέφτω",),
    "έπεσες": ("πέφτω",),
    "έπεσε": ("πέφτω",),
    "έπεσαν": ("πέφτω",),
    # στέλνω
    "έστειλα": ("στέλνω",),
    "έστειλες": ("στέλνω",),
    "έστειλε": ("στέλνω",),
    "έστειλαν": ("στέλνω",),
    # φεύγω
    "έφυγα": ("φεύγω",),
    "έφυγες": ("φεύγω",),
    "έφυγε": ("φεύγω",),  # was a second, duplicate "έφυγες" key
    "έφυγαν": ("φεύγω",),
    # μαθαίνω
    "έμαθα": ("μαθαίνω",),
    "έμαθες": ("μαθαίνω",),
    "έμαθε": ("μαθαίνω",),
    "έμαθαν": ("μαθαίνω",),
    # υποβάλλω
    "υπέβαλλα": ("υποβάλλω",),
    "υπέβαλλες": ("υποβάλλω",),
    "υπέβαλλε": ("υποβάλλω",),
    "υπέβαλλαν": ("υποβάλλω",),
    # πίνω
    "έπινα": ("πίνω",),
    "έπινες": ("πίνω",),
    "έπινε": ("πίνω",),
    "έπιναν": ("πίνω",),
    "ήπια": ("πίνω",),
    "ήπιες": ("πίνω",),
    "ήπιε": ("πίνω",),
    "ήπιαμε": ("πίνω",),
    "ήπιατε": ("πίνω",),
    "ήπιαν": ("πίνω",),
    # τυχαίνω
    "ετύχα": ("τυχαίνω",),
    "ετύχες": ("τυχαίνω",),
    "ετύχε": ("τυχαίνω",),
    "ετύχαν": ("τυχαίνω",),
    # τρώω (subjunctive / further past forms)
    "φάω": ("τρώω",),
    "φάς": ("τρώω",),
    "φάει": ("τρώω",),
    "φάμε": ("τρώω",),
    "φάτε": ("τρώω",),
    "φάνε": ("τρώω",),
    "φάν": ("τρώω",),
    "τρώγαμε": ("τρώω",),
    "τρώγατε": ("τρώω",),
    "τρώγανε": ("τρώω",),
    "τρώγαν": ("τρώω",),
    # περνώ
    "πέρασα": ("περνώ",),
    "πέρασες": ("περνώ",),
    "πέρασε": ("περνώ",),
    "πέρασαμε": ("περνώ",),
    "πέρασατε": ("περνώ",),
    "πέρασαν": ("περνώ",),
    # γδάρω
    "έγδαρα": ("γδάρω",),
    "έγδαρες": ("γδάρω",),
    "έγδαρε": ("γδάρω",),
    "έγδαραν": ("γδάρω",),
    # βγάλω
    "έβγαλα": ("βγάλω",),
    "έβγαλες": ("βγάλω",),
    "έβγαλε": ("βγάλω",),
    "έβγαλαν": ("βγάλω",),
    # φτάνω
    "έφθασα": ("φτάνω",),
    "έφθασες": ("φτάνω",),
    "έφθασε": ("φτάνω",),
    "έφθασαν": ("φτάνω",),
}
|
||||||
|
|
|
@ -1,34 +1,45 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
import re
|
import re
|
||||||
import pickle
|
|
||||||
|
|
||||||
from gensim.corpora.wikicorpus import extract_pages
|
from gensim.corpora.wikicorpus import extract_pages
|
||||||
|
|
||||||
regex = re.compile(r'==={{(\w+)\|el}}===')
|
|
||||||
regex2 = re.compile(r'==={{(\w+ \w+)\|el}}===')
|
regex = re.compile(r"==={{(\w+)\|el}}===")
|
||||||
|
regex2 = re.compile(r"==={{(\w+ \w+)\|el}}===")
|
||||||
|
|
||||||
# get words based on the Wiktionary dump
|
# get words based on the Wiktionary dump
|
||||||
# check only for specific parts
|
# check only for specific parts
|
||||||
|
|
||||||
# ==={{κύριο όνομα|el}}===
|
# ==={{κύριο όνομα|el}}===
|
||||||
expected_parts = ['μετοχή', 'ρήμα', 'επίθετο',
|
expected_parts = [
|
||||||
'επίρρημα', 'ουσιαστικό', 'κύριο όνομα', 'άρθρο']
|
"μετοχή",
|
||||||
|
"ρήμα",
|
||||||
|
"επίθετο",
|
||||||
|
"επίρρημα",
|
||||||
|
"ουσιαστικό",
|
||||||
|
"κύριο όνομα",
|
||||||
|
"άρθρο",
|
||||||
|
]
|
||||||
|
|
||||||
unwanted_parts = '''
|
unwanted_parts = """
|
||||||
{'αναγραμματισμοί': 2, 'σύνδεσμος': 94, 'απαρέμφατο': 1, 'μορφή άρθρου': 1, 'ένθημα': 1, 'μερική συνωνυμία': 57, 'ορισμός': 1, 'σημείωση': 3, 'πρόσφυμα': 3, 'ταυτόσημα': 8, 'χαρακτήρας': 51, 'μορφή επιρρήματος': 1, 'εκφράσεις': 22, 'ρηματικό σχήμα': 3, 'πολυλεκτικό επίρρημα': 2, 'μόριο': 35, 'προφορά': 412, 'ρηματική έκφραση': 15, 'λογοπαίγνια': 2, 'πρόθεση': 46, 'ρηματικό επίθετο': 1, 'κατάληξη επιρρημάτων': 10, 'συναφείς όροι': 1, 'εξωτερικοί σύνδεσμοι': 1, 'αρσενικό γένος': 1, 'πρόθημα': 169, 'κατάληξη': 3, 'υπώνυμα': 7, 'επιφώνημα': 197, 'ρηματικός τύπος': 1, 'συντομομορφή': 560, 'μορφή ρήματος': 68282, 'μορφή επιθέτου': 61779, 'μορφές': 71, 'ιδιωματισμός': 2, 'πολυλεκτικός όρος': 719, 'πολυλεκτικό ουσιαστικό': 180, 'παράγωγα': 25, 'μορφή μετοχής': 806, 'μορφή αριθμητικού': 3, 'άκλιτο': 1, 'επίθημα': 181, 'αριθμητικό': 129, 'συγγενικά': 94, 'σημειώσεις': 45, 'Ιδιωματισμός': 1, 'ρητά': 12, 'φράση': 9, 'συνώνυμα': 556, 'μεταφράσεις': 1, 'κατάληξη ρημάτων': 15, 'σύνθετα': 27, 'υπερώνυμα': 1, 'εναλλακτικός τύπος': 22, 'μορφή ουσιαστικού': 35122, 'επιρρηματική έκφραση': 12, 'αντώνυμα': 76, 'βλέπε': 7, 'μορφή αντωνυμίας': 51, 'αντωνυμία': 100, 'κλίση': 11, 'σύνθετοι τύποι': 1, 'παροιμία': 5, 'μορφή_επιθέτου': 2, 'έκφραση': 738, 'σύμβολο': 8, 'πολυλεκτικό επίθετο': 1, 'ετυμολογία': 867}
|
{'αναγραμματισμοί': 2, 'σύνδεσμος': 94, 'απαρέμφατο': 1, 'μορφή άρθρου': 1, 'ένθημα': 1, 'μερική συνωνυμία': 57, 'ορισμός': 1, 'σημείωση': 3, 'πρόσφυμα': 3, 'ταυτόσημα': 8, 'χαρακτήρας': 51, 'μορφή επιρρήματος': 1, 'εκφράσεις': 22, 'ρηματικό σχήμα': 3, 'πολυλεκτικό επίρρημα': 2, 'μόριο': 35, 'προφορά': 412, 'ρηματική έκφραση': 15, 'λογοπαίγνια': 2, 'πρόθεση': 46, 'ρηματικό επίθετο': 1, 'κατάληξη επιρρημάτων': 10, 'συναφείς όροι': 1, 'εξωτερικοί σύνδεσμοι': 1, 'αρσενικό γένος': 1, 'πρόθημα': 169, 'κατάληξη': 3, 'υπώνυμα': 7, 'επιφώνημα': 197, 'ρηματικός τύπος': 1, 'συντομομορφή': 560, 'μορφή ρήματος': 68282, 'μορφή επιθέτου': 61779, 'μορφές': 71, 'ιδιωματισμός': 2, 'πολυλεκτικός όρος': 719, 'πολυλεκτικό ουσιαστικό': 180, 'παράγωγα': 25, 'μορφή μετοχής': 806, 'μορφή αριθμητικού': 3, 'άκλιτο': 1, 'επίθημα': 181, 'αριθμητικό': 129, 'συγγενικά': 94, 'σημειώσεις': 45, 'Ιδιωματισμός': 1, 'ρητά': 12, 'φράση': 9, 'συνώνυμα': 556, 'μεταφράσεις': 1, 'κατάληξη ρημάτων': 15, 'σύνθετα': 27, 'υπερώνυμα': 1, 'εναλλακτικός τύπος': 22, 'μορφή ουσιαστικού': 35122, 'επιρρηματική έκφραση': 12, 'αντώνυμα': 76, 'βλέπε': 7, 'μορφή αντωνυμίας': 51, 'αντωνυμία': 100, 'κλίση': 11, 'σύνθετοι τύποι': 1, 'παροιμία': 5, 'μορφή_επιθέτου': 2, 'έκφραση': 738, 'σύμβολο': 8, 'πολυλεκτικό επίθετο': 1, 'ετυμολογία': 867}
|
||||||
'''
|
"""
|
||||||
|
|
||||||
|
|
||||||
wiktionary_file_path = '/data/gsoc2018-spacy/spacy/lang/el/res/elwiktionary-latest-pages-articles.xml'
|
wiktionary_file_path = (
|
||||||
|
"/data/gsoc2018-spacy/spacy/lang/el/res/elwiktionary-latest-pages-articles.xml"
|
||||||
|
)
|
||||||
|
|
||||||
proper_names_dict={
|
proper_names_dict = {
|
||||||
'ουσιαστικό':'nouns',
|
"ουσιαστικό": "nouns",
|
||||||
'επίθετο':'adjectives',
|
"επίθετο": "adjectives",
|
||||||
'άρθρο':'dets',
|
"άρθρο": "dets",
|
||||||
'επίρρημα':'adverbs',
|
"επίρρημα": "adverbs",
|
||||||
'κύριο όνομα': 'proper_names',
|
"κύριο όνομα": "proper_names",
|
||||||
'μετοχή': 'participles',
|
"μετοχή": "participles",
|
||||||
'ρήμα': 'verbs'
|
"ρήμα": "verbs",
|
||||||
}
|
}
|
||||||
expected_parts_dict = {}
|
expected_parts_dict = {}
|
||||||
for expected_part in expected_parts:
|
for expected_part in expected_parts:
|
||||||
|
@ -36,7 +47,7 @@ for expected_part in expected_parts:
|
||||||
|
|
||||||
other_parts = {}
|
other_parts = {}
|
||||||
for title, text, pageid in extract_pages(wiktionary_file_path):
|
for title, text, pageid in extract_pages(wiktionary_file_path):
|
||||||
if text.startswith('#REDIRECT'):
|
if text.startswith("#REDIRECT"):
|
||||||
continue
|
continue
|
||||||
title = title.lower()
|
title = title.lower()
|
||||||
all_regex = regex.findall(text)
|
all_regex = regex.findall(text)
|
||||||
|
@ -47,20 +58,17 @@ for title, text, pageid in extract_pages(wiktionary_file_path):
|
||||||
|
|
||||||
|
|
||||||
# Write one generated module per part of speech, e.g. ``_nouns.py`` holding
# ``NOUNS = set("""...""".split())`` with the sorted words wrapped at 79 columns.
for part in expected_parts_dict:
    module_name = proper_names_dict[part]
    # NOTE(review): the file is opened with the platform's default encoding;
    # confirm this is UTF-8 wherever the script is run.
    with open("_{0}.py".format(module_name), "w") as f:
        # The generated source contains non-ASCII text, so it needs an explicit
        # encoding declaration to be importable under Python 2.
        f.write("# coding: utf8\n")
        f.write("from __future__ import unicode_literals\n")
        f.write('{} = set("""\n'.format(module_name.upper()))
        to_write = []
        line = ""
        for word in sorted(expected_parts_dict[part]):
            candidate = line + " " + word if line else word
            if line and len(candidate) > 79:
                # The current line is full: flush it and start the next line
                # with this word, so the word is carried over, not dropped.
                to_write.append(line)
                line = word
            else:
                line = candidate
        if line:
            # Flush the last, partially filled line as well.
            to_write.append(line)
        f.write("\n".join(to_write))
        f.write('\n""".split())')
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -3,18 +3,18 @@ from __future__ import unicode_literals
|
||||||
|
|
||||||
from ....symbols import NOUN, VERB, ADJ, PUNCT
|
from ....symbols import NOUN, VERB, ADJ, PUNCT
|
||||||
|
|
||||||
'''
|
|
||||||
Greek language lemmatizer applies the default rule based lemmatization
|
|
||||||
procedure with some modifications for better Greek language support.
|
|
||||||
|
|
||||||
The first modification is that it checks if the word for lemmatization is
|
|
||||||
already a lemma and if yes, it just returns it.
|
|
||||||
The second modification is about removing the base forms function which is
|
|
||||||
not applicable for Greek language.
|
|
||||||
'''
|
|
||||||
|
|
||||||
|
|
||||||
class GreekLemmatizer(object):
|
class GreekLemmatizer(object):
|
||||||
|
"""
|
||||||
|
Greek language lemmatizer applies the default rule based lemmatization
|
||||||
|
procedure with some modifications for better Greek language support.
|
||||||
|
|
||||||
|
The first modification is that it checks if the word for lemmatization is
|
||||||
|
already a lemma and if yes, it just returns it.
|
||||||
|
The second modification is about removing the base forms function which is
|
||||||
|
not applicable for Greek language.
|
||||||
|
"""
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def load(cls, path, index=None, exc=None, rules=None, lookup=None):
|
def load(cls, path, index=None, exc=None, rules=None, lookup=None):
|
||||||
return cls(index, exc, rules, lookup)
|
return cls(index, exc, rules, lookup)
|
||||||
|
@ -28,26 +28,29 @@ class GreekLemmatizer(object):
|
||||||
def __call__(self, string, univ_pos, morphology=None):
    """Return a list of candidate lemmas for `string`.

    string: The surface form to lemmatize.
    univ_pos: The part of speech, given as a symbol ID or as an upper- or
        lower-case tag name.
    morphology: Accepted but not used by this method.

    When no rules are loaded, the lookup table is consulted instead and the
    string itself is the fallback. For parts of speech other than noun, verb,
    adjective and punctuation, the lower-cased string is returned unchanged.
    """
    if not self.rules:
        return [self.lookup_table.get(string, string)]
    # Accepted spellings of each supported part of speech, checked in order.
    pos_aliases = (
        ("noun", (NOUN, "NOUN", "noun")),
        ("verb", (VERB, "VERB", "verb")),
        ("adj", (ADJ, "ADJ", "adj")),
        ("punct", (PUNCT, "PUNCT", "punct")),
    )
    for pos_key, aliases in pos_aliases:
        if univ_pos in aliases:
            break
    else:
        # Unsupported part of speech: nothing to apply but lower-casing.
        return [string.lower()]
    return lemmatize(
        string,
        self.index.get(pos_key, {}),
        self.exc.get(pos_key, {}),
        self.rules.get(pos_key, []),
    )
|
||||||
|
|
||||||
|
|
||||||
def lemmatize(string, index, exceptions, rules):
|
def lemmatize(string, index, exceptions, rules):
|
||||||
string = string.lower()
|
string = string.lower()
|
||||||
forms = []
|
forms = []
|
||||||
if (string in index):
|
if string in index:
|
||||||
forms.append(string)
|
forms.append(string)
|
||||||
return forms
|
return forms
|
||||||
forms.extend(exceptions.get(string, []))
|
forms.extend(exceptions.get(string, []))
|
||||||
|
@ -55,7 +58,7 @@ def lemmatize(string, index, exceptions, rules):
|
||||||
if not forms:
|
if not forms:
|
||||||
for old, new in rules:
|
for old, new in rules:
|
||||||
if string.endswith(old):
|
if string.endswith(old):
|
||||||
form = string[:len(string) - len(old)] + new
|
form = string[: len(string) - len(old)] + new
|
||||||
if not form:
|
if not form:
|
||||||
pass
|
pass
|
||||||
elif form in index or not form.isalpha():
|
elif form in index or not form.isalpha():
|
||||||
|
|
|
@ -4,43 +4,100 @@ from __future__ import unicode_literals
|
||||||
|
|
||||||
from ...attrs import LIKE_NUM
|
from ...attrs import LIKE_NUM
|
||||||
|
|
||||||
# Greek number words recognised by `like_num`. All entries are lower-case,
# which is why `like_num` lower-cases its input before the lookup.
_num_words = """
μηδέν ένας δυο δυό τρεις τέσσερις πέντε έξι εφτά επτά οκτώ οχτώ εννιά εννέα
δέκα έντεκα ένδεκα δώδεκα δεκατρείς δεκατέσσερις δεκαπέντε δεκαέξι δεκαεπτά
δεκαοχτώ δεκαεννέα δεκαεννεα είκοσι τριάντα σαράντα πενήντα εξήντα εβδομήντα
ογδόντα ενενήντα εκατό διακόσιοι διακόσοι τριακόσιοι τριακόσοι τετρακόσιοι
τετρακόσοι πεντακόσιοι πεντακόσοι εξακόσιοι εξακόσοι εφτακόσιοι εφτακόσοι
επτακόσιοι επτακόσοι οχτακόσιοι οχτακόσοι οκτακόσιοι οκτακόσοι εννιακόσιοι
χίλιοι χιλιάδα εκατομμύριο δισεκατομμύριο τρισεκατομμύριο τετράκις πεντάκις
εξάκις επτάκις οκτάκις εννεάκις ένα δύο τρία τέσσερα δις χιλιάδες
""".split()


def like_num(text):
    """Return True if `text` looks like a number.

    A text counts as number-like when, after dropping one leading sign
    (+, -, ±, ~) and all "," and "." characters, it is:
      * made of digits only,
      * a fraction or power of two digit strings ("1/2", "2^10"), or
      * a Greek number word, or a phrase whose first space-separated token
        is one (case-insensitive).
    """
    if text.startswith(("+", "-", "±", "~")):
        text = text[1:]
    text = text.replace(",", "").replace(".", "")
    if text.isdigit():
        return True
    # "num/denom" and "base^exponent" share the same shape: exactly one
    # separator with digits on both sides.
    for separator in ("/", "^"):
        if text.count(separator) == 1:
            left, right = text.split(separator)
            if left.isdigit() and right.isdigit():
                return True
    lowered = text.lower()
    # A separate case-sensitive lookup is unnecessary: every table entry is
    # lower-case, so any exact match is already found via `lowered`.
    return lowered in _num_words or lowered.split(" ")[0] in _num_words
|
||||||
|
|
||||||
|
|
||||||
LEX_ATTRS = {
|
LEX_ATTRS = {LIKE_NUM: like_num}
|
||||||
LIKE_NUM: like_num
|
|
||||||
}
|
|
||||||
|
|
|
@ -3,8 +3,6 @@ from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
# These exceptions are used to add NORM values based on a token's ORTH value.
|
# These exceptions are used to add NORM values based on a token's ORTH value.
|
||||||
|
|
||||||
|
|
||||||
# Norms are only set if no alternative is provided in the tokenizer exceptions.
|
# Norms are only set if no alternative is provided in the tokenizer exceptions.
|
||||||
|
|
||||||
_exc = {
|
_exc = {
|
||||||
|
|
|
@ -6,66 +6,91 @@ from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
|
||||||
from ..char_classes import LIST_ICONS, ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS
|
from ..char_classes import LIST_ICONS, ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS
|
||||||
from ..char_classes import QUOTES, CURRENCY
|
from ..char_classes import QUOTES, CURRENCY
|
||||||
|
|
||||||
_units = ('km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft '
|
_units = (
|
||||||
'kg g mg µg t lb oz m/s km/h kmh mph hPa Pa mbar mb MB kb KB gb GB tb '
|
"km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft "
|
||||||
'TB T G M K км км² км³ м м² м³ дм дм² дм³ см см² см³ мм мм² мм³ нм '
|
"kg g mg µg t lb oz m/s km/h kmh mph hPa Pa mbar mb MB kb KB gb GB tb "
|
||||||
'кг г мг м/с км/ч кПа Па мбар Кб КБ кб Мб МБ мб Гб ГБ гб Тб ТБ тб')
|
"TB T G M K км км² км³ м м² м³ дм дм² дм³ см см² см³ мм мм² мм³ нм "
|
||||||
|
"кг г мг м/с км/ч кПа Па мбар Кб КБ кб Мб МБ мб Гб ГБ гб Тб ТБ тб"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def merge_chars(char): return char.strip().replace(' ', '|')
|
def merge_chars(char):
|
||||||
|
return char.strip().replace(" ", "|")
|
||||||
|
|
||||||
|
|
||||||
UNITS = merge_chars(_units)
|
UNITS = merge_chars(_units)
|
||||||
|
|
||||||
_prefixes = (['\'\'', '§', '%', '=', r'\+[0-9]+%', # 90%
|
_prefixes = (
|
||||||
r'\'([0-9]){2}([\-]\'([0-9]){2})*', # '12'-13
|
[
|
||||||
r'\-([0-9]){1,9}\.([0-9]){1,9}', # -12.13
|
"''",
|
||||||
r'\'([Α-Ωα-ωίϊΐόάέύϋΰήώ]+)\'', # 'αβγ'
|
"§",
|
||||||
r'([Α-Ωα-ωίϊΐόάέύϋΰήώ]){1,3}\'', # αβγ'
|
"%",
|
||||||
r'http://www.[A-Za-z]+\-[A-Za-z]+(\.[A-Za-z]+)+(\/[A-Za-z]+)*(\.[A-Za-z]+)*',
|
"=",
|
||||||
r'[ΈΆΊΑ-Ωα-ωίϊΐόάέύϋΰήώ]+\*', # όνομα*
|
r"\+[0-9]+%", # 90%
|
||||||
r'\$([0-9])+([\,\.]([0-9])+){0,1}',
|
r"\'([0-9]){2}([\-]\'([0-9]){2})*", # '12'-13
|
||||||
] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES +
|
r"\-([0-9]){1,9}\.([0-9]){1,9}", # -12.13
|
||||||
LIST_CURRENCY + LIST_ICONS)
|
r"\'([Α-Ωα-ωίϊΐόάέύϋΰήώ]+)\'", # 'αβγ'
|
||||||
|
r"([Α-Ωα-ωίϊΐόάέύϋΰήώ]){1,3}\'", # αβγ'
|
||||||
|
r"http://www.[A-Za-z]+\-[A-Za-z]+(\.[A-Za-z]+)+(\/[A-Za-z]+)*(\.[A-Za-z]+)*",
|
||||||
|
r"[ΈΆΊΑ-Ωα-ωίϊΐόάέύϋΰήώ]+\*", # όνομα*
|
||||||
|
r"\$([0-9])+([\,\.]([0-9])+){0,1}",
|
||||||
|
]
|
||||||
|
+ LIST_PUNCT
|
||||||
|
+ LIST_ELLIPSES
|
||||||
|
+ LIST_QUOTES
|
||||||
|
+ LIST_CURRENCY
|
||||||
|
+ LIST_ICONS
|
||||||
|
)
|
||||||
|
|
||||||
_suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
|
_suffixes = (
|
||||||
[r'(?<=[0-9])\+', # 12+
|
LIST_PUNCT
|
||||||
r'([0-9])+\'', # 12'
|
+ LIST_ELLIPSES
|
||||||
r'([A-Za-z])?\'', # a'
|
+ LIST_QUOTES
|
||||||
r'^([0-9]){1,2}\.', # 12.
|
+ LIST_ICONS
|
||||||
r' ([0-9]){1,2}\.', # 12.
|
+ [
|
||||||
r'([0-9]){1}\) ', # 12)
|
r"(?<=[0-9])\+", # 12+
|
||||||
r'^([0-9]){1}\)$', # 12)
|
r"([0-9])+\'", # 12'
|
||||||
r'(?<=°[FfCcKk])\.',
|
r"([A-Za-z])?\'", # a'
|
||||||
r'([0-9])+\&', # 12&
|
r"^([0-9]){1,2}\.", # 12.
|
||||||
r'(?<=[0-9])(?:{})'.format(CURRENCY),
|
r" ([0-9]){1,2}\.", # 12.
|
||||||
r'(?<=[0-9])(?:{})'.format(UNITS),
|
r"([0-9]){1}\) ", # 12)
|
||||||
r'(?<=[0-9{}{}(?:{})])\.'.format(ALPHA_LOWER, r'²\-\)\]\+', QUOTES),
|
r"^([0-9]){1}\)$", # 12)
|
||||||
r'(?<=[{a}][{a}])\.'.format(a=ALPHA_UPPER),
|
r"(?<=°[FfCcKk])\.",
|
||||||
r'(?<=[Α-Ωα-ωίϊΐόάέύϋΰήώ])\-', # όνομα-
|
r"([0-9])+\&", # 12&
|
||||||
r'(?<=[Α-Ωα-ωίϊΐόάέύϋΰήώ])\.',
|
r"(?<=[0-9])(?:{})".format(CURRENCY),
|
||||||
r'^[Α-Ω]{1}\.',
|
r"(?<=[0-9])(?:{})".format(UNITS),
|
||||||
r'\ [Α-Ω]{1}\.',
|
r"(?<=[0-9{}{}(?:{})])\.".format(ALPHA_LOWER, r"²\-\)\]\+", QUOTES),
|
||||||
# πρώτος-δεύτερος , πρώτος-δεύτερος-τρίτος
|
r"(?<=[{a}][{a}])\.".format(a=ALPHA_UPPER),
|
||||||
r'[ΈΆΊΑΌ-Ωα-ωίϊΐόάέύϋΰήώ]+([\-]([ΈΆΊΑΌ-Ωα-ωίϊΐόάέύϋΰήώ]+))+',
|
r"(?<=[Α-Ωα-ωίϊΐόάέύϋΰήώ])\-", # όνομα-
|
||||||
r'([0-9]+)mg', # 13mg
|
r"(?<=[Α-Ωα-ωίϊΐόάέύϋΰήώ])\.",
|
||||||
r'([0-9]+)\.([0-9]+)m' # 1.2m
|
r"^[Α-Ω]{1}\.",
|
||||||
])
|
r"\ [Α-Ω]{1}\.",
|
||||||
|
# πρώτος-δεύτερος , πρώτος-δεύτερος-τρίτος
|
||||||
|
r"[ΈΆΊΑΌ-Ωα-ωίϊΐόάέύϋΰήώ]+([\-]([ΈΆΊΑΌ-Ωα-ωίϊΐόάέύϋΰήώ]+))+",
|
||||||
|
r"([0-9]+)mg", # 13mg
|
||||||
|
r"([0-9]+)\.([0-9]+)m", # 1.2m
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
_infixes = (LIST_ELLIPSES + LIST_ICONS +
|
_infixes = (
|
||||||
[r'(?<=[0-9])[+\/\-\*^](?=[0-9])', # 1/2 , 1-2 , 1*2
|
LIST_ELLIPSES
|
||||||
r'([a-zA-Z]+)\/([a-zA-Z]+)\/([a-zA-Z]+)', # name1/name2/name3
|
+ LIST_ICONS
|
||||||
r'([0-9])+(\.([0-9]+))*([\-]([0-9])+)+', # 10.9 , 10.9.9 , 10.9-6
|
+ [
|
||||||
r'([0-9])+[,]([0-9])+[\-]([0-9])+[,]([0-9])+', # 10,11,12
|
r"(?<=[0-9])[+\/\-\*^](?=[0-9])", # 1/2 , 1-2 , 1*2
|
||||||
r'([0-9])+[ης]+([\-]([0-9])+)+', # 1ης-2
|
r"([a-zA-Z]+)\/([a-zA-Z]+)\/([a-zA-Z]+)", # name1/name2/name3
|
||||||
# 15/2 , 15/2/17 , 2017/2/15
|
r"([0-9])+(\.([0-9]+))*([\-]([0-9])+)+", # 10.9 , 10.9.9 , 10.9-6
|
||||||
r'([0-9]){1,4}[\/]([0-9]){1,2}([\/]([0-9]){0,4}){0,1}',
|
r"([0-9])+[,]([0-9])+[\-]([0-9])+[,]([0-9])+", # 10,11,12
|
||||||
r'[A-Za-z]+\@[A-Za-z]+(\-[A-Za-z]+)*\.[A-Za-z]+', # abc@cde-fgh.a
|
r"([0-9])+[ης]+([\-]([0-9])+)+", # 1ης-2
|
||||||
r'([a-zA-Z]+)(\-([a-zA-Z]+))+', # abc-abc
|
# 15/2 , 15/2/17 , 2017/2/15
|
||||||
r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
|
r"([0-9]){1,4}[\/]([0-9]){1,2}([\/]([0-9]){0,4}){0,1}",
|
||||||
r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
|
r"[A-Za-z]+\@[A-Za-z]+(\-[A-Za-z]+)*\.[A-Za-z]+", # abc@cde-fgh.a
|
||||||
r'(?<=[{a}])[?";:=,.]*(?:{h})(?=[{a}])'.format(a=ALPHA, h=HYPHENS),
|
r"([a-zA-Z]+)(\-([a-zA-Z]+))+", # abc-abc
|
||||||
r'(?<=[{a}"])[:<>=/](?=[{a}])'.format(a=ALPHA)])
|
r"(?<=[{}])\.(?=[{}])".format(ALPHA_LOWER, ALPHA_UPPER),
|
||||||
|
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
|
||||||
|
r'(?<=[{a}])[?";:=,.]*(?:{h})(?=[{a}])'.format(a=ALPHA, h=HYPHENS),
|
||||||
|
r'(?<=[{a}"])[:<>=/](?=[{a}])'.format(a=ALPHA),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
TOKENIZER_PREFIXES = _prefixes
|
TOKENIZER_PREFIXES = _prefixes
|
||||||
TOKENIZER_SUFFIXES = _suffixes
|
TOKENIZER_SUFFIXES = _suffixes
|
||||||
|
|
|
@ -1,13 +1,11 @@
|
||||||
# -*- coding: utf-8 -*-
|
# coding: utf8
|
||||||
|
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
# Stop words
|
# Stop words
|
||||||
|
|
||||||
# Link to greek stop words: https://www.translatum.gr/forum/index.php?topic=3550.0?topic=3550.0
|
# Link to greek stop words: https://www.translatum.gr/forum/index.php?topic=3550.0?topic=3550.0
|
||||||
|
STOP_WORDS = set(
|
||||||
|
"""
|
||||||
STOP_WORDS = set("""
|
|
||||||
αδιάκοπα αι ακόμα ακόμη ακριβώς άλλα αλλά αλλαχού άλλες άλλη άλλην
|
αδιάκοπα αι ακόμα ακόμη ακριβώς άλλα αλλά αλλαχού άλλες άλλη άλλην
|
||||||
άλλης αλλιώς αλλιώτικα άλλο άλλοι αλλοιώς αλλοιώτικα άλλον άλλος άλλοτε αλλού
|
άλλης αλλιώς αλλιώτικα άλλο άλλοι αλλοιώς αλλοιώτικα άλλον άλλος άλλοτε αλλού
|
||||||
άλλους άλλων άμα άμεσα αμέσως αν ανά ανάμεσα αναμεταξύ άνευ αντί αντίπερα αντίς
|
άλλους άλλων άμα άμεσα αμέσως αν ανά ανάμεσα αναμεταξύ άνευ αντί αντίπερα αντίς
|
||||||
|
@ -89,4 +87,5 @@ STOP_WORDS = set("""
|
||||||
χωρίς χωριστά
|
χωρίς χωριστά
|
||||||
|
|
||||||
ω ως ωσάν ωσότου ώσπου ώστε ωστόσο ωχ
|
ω ως ωσάν ωσότου ώσπου ώστε ωστόσο ωχ
|
||||||
""".split())
|
""".split()
|
||||||
|
)
|
||||||
|
|
|
@ -8,18 +8,16 @@ def noun_chunks(obj):
|
||||||
"""
|
"""
|
||||||
Detect base noun phrases. Works on both Doc and Span.
|
Detect base noun phrases. Works on both Doc and Span.
|
||||||
"""
|
"""
|
||||||
|
# It follows the logic of the noun chunks finder of English language,
|
||||||
# it follows the logic of the noun chunks finder of English language,
|
|
||||||
# adjusted to some Greek language special characteristics.
|
# adjusted to some Greek language special characteristics.
|
||||||
|
|
||||||
# obj tag corrects some DEP tagger mistakes.
|
# obj tag corrects some DEP tagger mistakes.
|
||||||
# Further improvement of the models will eliminate the need for this tag.
|
# Further improvement of the models will eliminate the need for this tag.
|
||||||
labels = ['nsubj', 'obj', 'iobj', 'appos', 'ROOT', 'obl']
|
labels = ["nsubj", "obj", "iobj", "appos", "ROOT", "obl"]
|
||||||
doc = obj.doc # Ensure works on both Doc and Span.
|
doc = obj.doc # Ensure works on both Doc and Span.
|
||||||
np_deps = [doc.vocab.strings.add(label) for label in labels]
|
np_deps = [doc.vocab.strings.add(label) for label in labels]
|
||||||
conj = doc.vocab.strings.add('conj')
|
conj = doc.vocab.strings.add("conj")
|
||||||
nmod = doc.vocab.strings.add('nmod')
|
nmod = doc.vocab.strings.add("nmod")
|
||||||
np_label = doc.vocab.strings.add('NP')
|
np_label = doc.vocab.strings.add("NP")
|
||||||
seen = set()
|
seen = set()
|
||||||
for i, word in enumerate(obj):
|
for i, word in enumerate(obj):
|
||||||
if word.pos not in (NOUN, PROPN, PRON):
|
if word.pos not in (NOUN, PROPN, PRON):
|
||||||
|
@ -31,16 +29,17 @@ def noun_chunks(obj):
|
||||||
if any(w.i in seen for w in word.subtree):
|
if any(w.i in seen for w in word.subtree):
|
||||||
continue
|
continue
|
||||||
flag = False
|
flag = False
|
||||||
if (word.pos == NOUN):
|
if word.pos == NOUN:
|
||||||
# check for patterns such as γραμμή παραγωγής
|
# check for patterns such as γραμμή παραγωγής
|
||||||
for potential_nmod in word.rights:
|
for potential_nmod in word.rights:
|
||||||
if (potential_nmod.dep == nmod):
|
if potential_nmod.dep == nmod:
|
||||||
seen.update(j for j in range(
|
seen.update(
|
||||||
word.left_edge.i, potential_nmod.i + 1))
|
j for j in range(word.left_edge.i, potential_nmod.i + 1)
|
||||||
|
)
|
||||||
yield word.left_edge.i, potential_nmod.i + 1, np_label
|
yield word.left_edge.i, potential_nmod.i + 1, np_label
|
||||||
flag = True
|
flag = True
|
||||||
break
|
break
|
||||||
if (flag is False):
|
if flag is False:
|
||||||
seen.update(j for j in range(word.left_edge.i, word.i + 1))
|
seen.update(j for j in range(word.left_edge.i, word.i + 1))
|
||||||
yield word.left_edge.i, word.i + 1, np_label
|
yield word.left_edge.i, word.i + 1, np_label
|
||||||
elif word.dep == conj:
|
elif word.dep == conj:
|
||||||
|
@ -56,6 +55,4 @@ def noun_chunks(obj):
|
||||||
yield word.left_edge.i, word.i + 1, np_label
|
yield word.left_edge.i, word.i + 1, np_label
|
||||||
|
|
||||||
|
|
||||||
SYNTAX_ITERATORS = {
|
SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
|
||||||
'noun_chunks': noun_chunks
|
|
||||||
}
|
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -1,3 +1,4 @@
|
||||||
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from ...symbols import POS, ADV, NOUN, ADP, PRON, SCONJ, PROPN, DET, SYM, INTJ
|
from ...symbols import POS, ADV, NOUN, ADP, PRON, SCONJ, PROPN, DET, SYM, INTJ
|
||||||
|
@ -22,5 +23,5 @@ TAG_MAP = {
|
||||||
"AUX": {POS: AUX},
|
"AUX": {POS: AUX},
|
||||||
"SPACE": {POS: SPACE},
|
"SPACE": {POS: SPACE},
|
||||||
"DET": {POS: DET},
|
"DET": {POS: DET},
|
||||||
"X": {POS: X}
|
"X": {POS: X},
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,303 +1,132 @@
|
||||||
# -*- coding: utf-8 -*-
|
# coding: utf8
|
||||||
|
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from ...symbols import ORTH, LEMMA, NORM
|
from ...symbols import ORTH, LEMMA, NORM
|
||||||
|
|
||||||
|
|
||||||
_exc = {}
|
_exc = {}
|
||||||
|
|
||||||
for token in ["Απ'", "ΑΠ'", "αφ'", "Αφ'"]:
|
for token in ["Απ'", "ΑΠ'", "αφ'", "Αφ'"]:
|
||||||
_exc[token] = [
|
_exc[token] = [{ORTH: token, LEMMA: "από", NORM: "από"}]
|
||||||
{ORTH: token, LEMMA: "από", NORM: "από"}
|
|
||||||
]
|
|
||||||
|
|
||||||
for token in ["Αλλ'", "αλλ'"]:
|
for token in ["Αλλ'", "αλλ'"]:
|
||||||
_exc[token] = [
|
_exc[token] = [{ORTH: token, LEMMA: "αλλά", NORM: "αλλά"}]
|
||||||
{ORTH: token, LEMMA: "αλλά", NORM: "αλλά"}
|
|
||||||
]
|
|
||||||
|
|
||||||
for token in ["παρ'", "Παρ'", "ΠΑΡ'"]:
|
for token in ["παρ'", "Παρ'", "ΠΑΡ'"]:
|
||||||
_exc[token] = [
|
_exc[token] = [{ORTH: token, LEMMA: "παρά", NORM: "παρά"}]
|
||||||
{ORTH: token, LEMMA: "παρά", NORM: "παρά"}
|
|
||||||
]
|
|
||||||
|
|
||||||
for token in ["καθ'", "Καθ'"]:
|
for token in ["καθ'", "Καθ'"]:
|
||||||
_exc[token] = [
|
_exc[token] = [{ORTH: token, LEMMA: "κάθε", NORM: "κάθε"}]
|
||||||
{ORTH: token, LEMMA: "κάθε", NORM: "κάθε"}
|
|
||||||
]
|
|
||||||
|
|
||||||
for token in ["κατ'", "Κατ'"]:
|
for token in ["κατ'", "Κατ'"]:
|
||||||
_exc[token] = [
|
_exc[token] = [{ORTH: token, LEMMA: "κατά", NORM: "κατά"}]
|
||||||
{ORTH: token, LEMMA: "κατά", NORM: "κατά"}
|
|
||||||
]
|
|
||||||
|
|
||||||
for token in ["'ΣΟΥΝ", "'ναι", "'ταν", "'τανε", "'μαστε", "'μουνα", "'μουν"]:
|
for token in ["'ΣΟΥΝ", "'ναι", "'ταν", "'τανε", "'μαστε", "'μουνα", "'μουν"]:
|
||||||
_exc[token] = [
|
_exc[token] = [{ORTH: token, LEMMA: "είμαι", NORM: "είμαι"}]
|
||||||
{ORTH: token, LEMMA: "είμαι", NORM: "είμαι"}
|
|
||||||
]
|
|
||||||
|
|
||||||
for token in ["Επ'", "επ'", "εφ'", "Εφ'"]:
|
for token in ["Επ'", "επ'", "εφ'", "Εφ'"]:
|
||||||
_exc[token] = [
|
_exc[token] = [{ORTH: token, LEMMA: "επί", NORM: "επί"}]
|
||||||
{ORTH: token, LEMMA: "επί", NORM: "επί"}
|
|
||||||
]
|
|
||||||
|
|
||||||
for token in ["Δι'", "δι'"]:
|
for token in ["Δι'", "δι'"]:
|
||||||
_exc[token] = [
|
_exc[token] = [{ORTH: token, LEMMA: "δια", NORM: "δια"}]
|
||||||
{ORTH: token, LEMMA: "δια", NORM: "δια"}
|
|
||||||
]
|
|
||||||
|
|
||||||
for token in ["'χουν", "'χουμε", "'χαμε", "'χα", "'χε", "'χεις", "'χει"]:
|
for token in ["'χουν", "'χουμε", "'χαμε", "'χα", "'χε", "'χεις", "'χει"]:
|
||||||
_exc[token] = [
|
_exc[token] = [{ORTH: token, LEMMA: "έχω", NORM: "έχω"}]
|
||||||
{ORTH: token, LEMMA: "έχω", NORM: "έχω"}
|
|
||||||
]
|
|
||||||
|
|
||||||
for token in ["υπ'", "Υπ'"]:
|
for token in ["υπ'", "Υπ'"]:
|
||||||
_exc[token] = [
|
_exc[token] = [{ORTH: token, LEMMA: "υπό", NORM: "υπό"}]
|
||||||
{ORTH: token, LEMMA: "υπό", NORM: "υπό"}
|
|
||||||
]
|
|
||||||
|
|
||||||
for token in ["Μετ'", "ΜΕΤ'", "'μετ"]:
|
for token in ["Μετ'", "ΜΕΤ'", "'μετ"]:
|
||||||
_exc[token] = [
|
_exc[token] = [{ORTH: token, LEMMA: "μετά", NORM: "μετά"}]
|
||||||
{ORTH: token, LEMMA: "μετά", NORM: "μετά"}
|
|
||||||
]
|
|
||||||
|
|
||||||
for token in ["Μ'", "μ'"]:
|
for token in ["Μ'", "μ'"]:
|
||||||
_exc[token] = [
|
_exc[token] = [{ORTH: token, LEMMA: "με", NORM: "με"}]
|
||||||
{ORTH: token, LEMMA: "με", NORM: "με"}
|
|
||||||
]
|
|
||||||
|
|
||||||
for token in ["Γι'", "ΓΙ'", "γι'"]:
|
for token in ["Γι'", "ΓΙ'", "γι'"]:
|
||||||
_exc[token] = [
|
_exc[token] = [{ORTH: token, LEMMA: "για", NORM: "για"}]
|
||||||
{ORTH: token, LEMMA: "για", NORM: "για"}
|
|
||||||
]
|
|
||||||
|
|
||||||
for token in ["Σ'", "σ'"]:
|
for token in ["Σ'", "σ'"]:
|
||||||
_exc[token] = [
|
_exc[token] = [{ORTH: token, LEMMA: "σε", NORM: "σε"}]
|
||||||
{ORTH: token, LEMMA: "σε", NORM: "σε"}
|
|
||||||
]
|
|
||||||
|
|
||||||
for token in ["Θ'", "θ'"]:
|
for token in ["Θ'", "θ'"]:
|
||||||
_exc[token] = [
|
_exc[token] = [{ORTH: token, LEMMA: "θα", NORM: "θα"}]
|
||||||
{ORTH: token, LEMMA: "θα", NORM: "θα"}
|
|
||||||
]
|
|
||||||
|
|
||||||
for token in ["Ν'", "ν'"]:
|
for token in ["Ν'", "ν'"]:
|
||||||
_exc[token] = [
|
_exc[token] = [{ORTH: token, LEMMA: "να", NORM: "να"}]
|
||||||
{ORTH: token, LEMMA: "να", NORM: "να"}
|
|
||||||
]
|
|
||||||
|
|
||||||
for token in ["Τ'", "τ'"]:
|
for token in ["Τ'", "τ'"]:
|
||||||
_exc[token] = [
|
_exc[token] = [{ORTH: token, LEMMA: "να", NORM: "να"}]
|
||||||
{ORTH: token, LEMMA: "να", NORM: "να"}
|
|
||||||
]
|
|
||||||
|
|
||||||
for token in ["'γω", "'σένα", "'μεις"]:
|
for token in ["'γω", "'σένα", "'μεις"]:
|
||||||
_exc[token] = [
|
_exc[token] = [{ORTH: token, LEMMA: "εγώ", NORM: "εγώ"}]
|
||||||
{ORTH: token, LEMMA: "εγώ", NORM: "εγώ"}
|
|
||||||
]
|
|
||||||
|
|
||||||
for token in ["Τ'", "τ'"]:
|
for token in ["Τ'", "τ'"]:
|
||||||
_exc[token] = [
|
_exc[token] = [{ORTH: token, LEMMA: "το", NORM: "το"}]
|
||||||
{ORTH: token, LEMMA: "το", NORM: "το"}
|
|
||||||
]
|
|
||||||
|
|
||||||
for token in ["Φέρ'", "Φερ'", "φέρ'", "φερ'"]:
|
for token in ["Φέρ'", "Φερ'", "φέρ'", "φερ'"]:
|
||||||
_exc[token] = [
|
_exc[token] = [{ORTH: token, LEMMA: "φέρνω", NORM: "φέρνω"}]
|
||||||
{ORTH: token, LEMMA: "φέρνω", NORM: "φέρνω"}
|
|
||||||
]
|
|
||||||
|
|
||||||
for token in ["'ρθούνε", "'ρθουν", "'ρθει", "'ρθεί", "'ρθε", "'ρχεται"]:
|
for token in ["'ρθούνε", "'ρθουν", "'ρθει", "'ρθεί", "'ρθε", "'ρχεται"]:
|
||||||
_exc[token] = [
|
_exc[token] = [{ORTH: token, LEMMA: "έρχομαι", NORM: "έρχομαι"}]
|
||||||
{ORTH: token, LEMMA: "έρχομαι", NORM: "έρχομαι"}
|
|
||||||
]
|
|
||||||
|
|
||||||
for token in ["'πανε", "'λεγε", "'λεγαν", "'πε", "'λεγα"]:
|
for token in ["'πανε", "'λεγε", "'λεγαν", "'πε", "'λεγα"]:
|
||||||
_exc[token] = [
|
_exc[token] = [{ORTH: token, LEMMA: "λέγω", NORM: "λέγω"}]
|
||||||
{ORTH: token, LEMMA: "λέγω", NORM: "λέγω"}
|
|
||||||
]
|
|
||||||
|
|
||||||
for token in ["Πάρ'", "πάρ'"]:
|
for token in ["Πάρ'", "πάρ'"]:
|
||||||
_exc[token] = [
|
_exc[token] = [{ORTH: token, LEMMA: "παίρνω", NORM: "παίρνω"}]
|
||||||
{ORTH: token, LEMMA: "παίρνω", NORM: "παίρνω"}
|
|
||||||
]
|
|
||||||
|
|
||||||
for token in ["μέσ'", "Μέσ'", "μεσ'"]:
|
for token in ["μέσ'", "Μέσ'", "μεσ'"]:
|
||||||
_exc[token] = [
|
_exc[token] = [{ORTH: token, LEMMA: "μέσα", NORM: "μέσα"}]
|
||||||
{ORTH: token, LEMMA: "μέσα", NORM: "μέσα"}
|
|
||||||
]
|
|
||||||
|
|
||||||
for token in ["Δέσ'", "Δεσ'", "δεσ'"]:
|
for token in ["Δέσ'", "Δεσ'", "δεσ'"]:
|
||||||
_exc[token] = [
|
_exc[token] = [{ORTH: token, LEMMA: "δένω", NORM: "δένω"}]
|
||||||
{ORTH: token, LEMMA: "δένω", NORM: "δένω"}
|
|
||||||
]
|
|
||||||
|
|
||||||
for token in ["'κανε", "Κάν'"]:
|
for token in ["'κανε", "Κάν'"]:
|
||||||
_exc[token] = [
|
_exc[token] = [{ORTH: token, LEMMA: "κάνω", NORM: "κάνω"}]
|
||||||
{ORTH: token, LEMMA: "κάνω", NORM: "κάνω"}
|
|
||||||
]
|
|
||||||
|
|
||||||
_other_exc = {
|
_other_exc = {
|
||||||
|
"κι": [{ORTH: "κι", LEMMA: "και", NORM: "και"}],
|
||||||
"κι": [
|
"Παίξ'": [{ORTH: "Παίξ'", LEMMA: "παίζω", NORM: "παίζω"}],
|
||||||
{ORTH: "κι", LEMMA: "και", NORM: "και"},
|
"Αντ'": [{ORTH: "Αντ'", LEMMA: "αντί", NORM: "αντί"}],
|
||||||
],
|
"ολ'": [{ORTH: "ολ'", LEMMA: "όλος", NORM: "όλος"}],
|
||||||
|
"ύστερ'": [{ORTH: "ύστερ'", LEMMA: "ύστερα", NORM: "ύστερα"}],
|
||||||
"Παίξ'": [
|
"'πρεπε": [{ORTH: "'πρεπε", LEMMA: "πρέπει", NORM: "πρέπει"}],
|
||||||
{ORTH: "Παίξ'", LEMMA: "παίζω", NORM: "παίζω"},
|
"Δύσκολ'": [{ORTH: "Δύσκολ'", LEMMA: "δύσκολος", NORM: "δύσκολος"}],
|
||||||
],
|
"'θελα": [{ORTH: "'θελα", LEMMA: "θέλω", NORM: "θέλω"}],
|
||||||
|
"'γραφα": [{ORTH: "'γραφα", LEMMA: "γράφω", NORM: "γράφω"}],
|
||||||
"Αντ'": [
|
"'παιρνα": [{ORTH: "'παιρνα", LEMMA: "παίρνω", NORM: "παίρνω"}],
|
||||||
{ORTH: "Αντ'", LEMMA: "αντί", NORM: "αντί"},
|
"'δειξε": [{ORTH: "'δειξε", LEMMA: "δείχνω", NORM: "δείχνω"}],
|
||||||
],
|
"όμουρφ'": [{ORTH: "όμουρφ'", LEMMA: "όμορφος", NORM: "όμορφος"}],
|
||||||
|
"κ'τσή": [{ORTH: "κ'τσή", LEMMA: "κουτσός", NORM: "κουτσός"}],
|
||||||
"ολ'": [
|
"μηδ'": [{ORTH: "μηδ'", LEMMA: "μήδε", NORM: "μήδε"}],
|
||||||
{ORTH: "ολ'", LEMMA: "όλος", NORM: "όλος"},
|
|
||||||
],
|
|
||||||
|
|
||||||
"ύστερ'": [
|
|
||||||
{ORTH: "ύστερ'", LEMMA: "ύστερα", NORM: "ύστερα"},
|
|
||||||
],
|
|
||||||
|
|
||||||
"'πρεπε": [
|
|
||||||
{ORTH: "'πρεπε", LEMMA: "πρέπει", NORM: "πρέπει"},
|
|
||||||
],
|
|
||||||
|
|
||||||
"Δύσκολ'": [
|
|
||||||
{ORTH: "Δύσκολ'", LEMMA: "δύσκολος", NORM: "δύσκολος"},
|
|
||||||
],
|
|
||||||
|
|
||||||
"'θελα": [
|
|
||||||
{ORTH: "'θελα", LEMMA: "θέλω", NORM: "θέλω"},
|
|
||||||
],
|
|
||||||
|
|
||||||
"'γραφα": [
|
|
||||||
{ORTH: "'γραφα", LEMMA: "γράφω", NORM: "γράφω"},
|
|
||||||
],
|
|
||||||
|
|
||||||
"'παιρνα": [
|
|
||||||
{ORTH: "'παιρνα", LEMMA: "παίρνω", NORM: "παίρνω"},
|
|
||||||
],
|
|
||||||
|
|
||||||
"'δειξε": [
|
|
||||||
{ORTH: "'δειξε", LEMMA: "δείχνω", NORM: "δείχνω"},
|
|
||||||
],
|
|
||||||
|
|
||||||
"όμουρφ'": [
|
|
||||||
{ORTH: "όμουρφ'", LEMMA: "όμορφος", NORM: "όμορφος"},
|
|
||||||
],
|
|
||||||
|
|
||||||
"κ'τσή": [
|
|
||||||
{ORTH: "κ'τσή", LEMMA: "κουτσός", NORM: "κουτσός"},
|
|
||||||
],
|
|
||||||
|
|
||||||
"μηδ'": [
|
|
||||||
{ORTH: "μηδ'", LEMMA: "μήδε", NORM: "μήδε"},
|
|
||||||
],
|
|
||||||
|
|
||||||
"'ξομολογήθηκε": [
|
"'ξομολογήθηκε": [
|
||||||
{ORTH: "'ξομολογήθηκε", LEMMA: "εξομολογούμαι", NORM: "εξομολογούμαι"},
|
{ORTH: "'ξομολογήθηκε", LEMMA: "εξομολογούμαι", NORM: "εξομολογούμαι"}
|
||||||
],
|
],
|
||||||
|
"'μας": [{ORTH: "'μας", LEMMA: "εμάς", NORM: "εμάς"}],
|
||||||
"'μας": [
|
"'ξερες": [{ORTH: "'ξερες", LEMMA: "ξέρω", NORM: "ξέρω"}],
|
||||||
{ORTH: "'μας", LEMMA: "εμάς", NORM: "εμάς"},
|
"έφθασ'": [{ORTH: "έφθασ'", LEMMA: "φθάνω", NORM: "φθάνω"}],
|
||||||
],
|
"εξ'": [{ORTH: "εξ'", LEMMA: "εκ", NORM: "εκ"}],
|
||||||
|
"δώσ'": [{ORTH: "δώσ'", LEMMA: "δίνω", NORM: "δίνω"}],
|
||||||
"'ξερες": [
|
"τίποτ'": [{ORTH: "τίποτ'", LEMMA: "τίποτα", NORM: "τίποτα"}],
|
||||||
{ORTH: "'ξερες", LEMMA: "ξέρω", NORM: "ξέρω"},
|
"Λήξ'": [{ORTH: "Λήξ'", LEMMA: "λήγω", NORM: "λήγω"}],
|
||||||
],
|
"άσ'": [{ORTH: "άσ'", LEMMA: "αφήνω", NORM: "αφήνω"}],
|
||||||
|
"Στ'": [{ORTH: "Στ'", LEMMA: "στο", NORM: "στο"}],
|
||||||
"έφθασ'": [
|
"Δωσ'": [{ORTH: "Δωσ'", LEMMA: "δίνω", NORM: "δίνω"}],
|
||||||
{ORTH: "έφθασ'", LEMMA: "φθάνω", NORM: "φθάνω"},
|
"Βάψ'": [{ORTH: "Βάψ'", LEMMA: "βάφω", NORM: "βάφω"}],
|
||||||
],
|
"Αλλ'": [{ORTH: "Αλλ'", LEMMA: "αλλά", NORM: "αλλά"}],
|
||||||
|
"Αμ'": [{ORTH: "Αμ'", LEMMA: "άμα", NORM: "άμα"}],
|
||||||
"εξ'": [
|
"Αγόρασ'": [{ORTH: "Αγόρασ'", LEMMA: "αγοράζω", NORM: "αγοράζω"}],
|
||||||
{ORTH: "εξ'", LEMMA: "εκ", NORM: "εκ"},
|
"'φύγε": [{ORTH: "'φύγε", LEMMA: "φεύγω", NORM: "φεύγω"}],
|
||||||
],
|
"'φερε": [{ORTH: "'φερε", LEMMA: "φέρνω", NORM: "φέρνω"}],
|
||||||
|
"'φαγε": [{ORTH: "'φαγε", LEMMA: "τρώω", NORM: "τρώω"}],
|
||||||
"δώσ'": [
|
"'σπαγαν": [{ORTH: "'σπαγαν", LEMMA: "σπάω", NORM: "σπάω"}],
|
||||||
{ORTH: "δώσ'", LEMMA: "δίνω", NORM: "δίνω"},
|
"'σκασε": [{ORTH: "'σκασε", LEMMA: "σκάω", NORM: "σκάω"}],
|
||||||
],
|
"'σβηνε": [{ORTH: "'σβηνε", LEMMA: "σβήνω", NORM: "σβήνω"}],
|
||||||
|
"'ριξε": [{ORTH: "'ριξε", LEMMA: "ρίχνω", NORM: "ρίχνω"}],
|
||||||
"τίποτ'": [
|
"'κλεβε": [{ORTH: "'κλεβε", LEMMA: "κλέβω", NORM: "κλέβω"}],
|
||||||
{ORTH: "τίποτ'", LEMMA: "τίποτα", NORM: "τίποτα"},
|
"'κει": [{ORTH: "'κει", LEMMA: "εκεί", NORM: "εκεί"}],
|
||||||
],
|
"'βλεπε": [{ORTH: "'βλεπε", LEMMA: "βλέπω", NORM: "βλέπω"}],
|
||||||
|
"'βγαινε": [{ORTH: "'βγαινε", LEMMA: "βγαίνω", NORM: "βγαίνω"}],
|
||||||
"Λήξ'": [
|
|
||||||
{ORTH: "Λήξ'", LEMMA: "λήγω", NORM: "λήγω"},
|
|
||||||
],
|
|
||||||
|
|
||||||
"άσ'": [
|
|
||||||
{ORTH: "άσ'", LEMMA: "αφήνω", NORM: "αφήνω"},
|
|
||||||
],
|
|
||||||
|
|
||||||
"Στ'": [
|
|
||||||
{ORTH: "Στ'", LEMMA: "στο", NORM: "στο"},
|
|
||||||
|
|
||||||
],
|
|
||||||
|
|
||||||
"Δωσ'": [
|
|
||||||
{ORTH: "Δωσ'", LEMMA: "δίνω", NORM: "δίνω"},
|
|
||||||
],
|
|
||||||
|
|
||||||
"Βάψ'": [
|
|
||||||
{ORTH: "Βάψ'", LEMMA: "βάφω", NORM: "βάφω"},
|
|
||||||
],
|
|
||||||
|
|
||||||
"Αλλ'": [
|
|
||||||
{ORTH: "Αλλ'", LEMMA: "αλλά", NORM: "αλλά"},
|
|
||||||
],
|
|
||||||
|
|
||||||
"Αμ'": [
|
|
||||||
{ORTH: "Αμ'", LEMMA: "άμα", NORM: "άμα"},
|
|
||||||
],
|
|
||||||
|
|
||||||
"Αγόρασ'": [
|
|
||||||
{ORTH: "Αγόρασ'", LEMMA: "αγοράζω", NORM: "αγοράζω"},
|
|
||||||
],
|
|
||||||
|
|
||||||
"'φύγε": [
|
|
||||||
{ORTH: "'φύγε", LEMMA: "φεύγω", NORM: "φεύγω"},
|
|
||||||
],
|
|
||||||
|
|
||||||
"'φερε": [
|
|
||||||
{ORTH: "'φερε", LEMMA: "φέρνω", NORM: "φέρνω"},
|
|
||||||
],
|
|
||||||
|
|
||||||
"'φαγε": [
|
|
||||||
{ORTH: "'φαγε", LEMMA: "τρώω", NORM: "τρώω"},
|
|
||||||
],
|
|
||||||
|
|
||||||
"'σπαγαν": [
|
|
||||||
{ORTH: "'σπαγαν", LEMMA: "σπάω", NORM: "σπάω"},
|
|
||||||
],
|
|
||||||
|
|
||||||
"'σκασε": [
|
|
||||||
{ORTH: "'σκασε", LEMMA: "σκάω", NORM: "σκάω"},
|
|
||||||
],
|
|
||||||
|
|
||||||
"'σβηνε": [
|
|
||||||
{ORTH: "'σβηνε", LEMMA: "σβήνω", NORM: "σβήνω"},
|
|
||||||
],
|
|
||||||
|
|
||||||
"'ριξε": [
|
|
||||||
{ORTH: "'ριξε", LEMMA: "ρίχνω", NORM: "ρίχνω"},
|
|
||||||
],
|
|
||||||
|
|
||||||
"'κλεβε": [
|
|
||||||
{ORTH: "'κλεβε", LEMMA: "κλέβω", NORM: "κλέβω"},
|
|
||||||
],
|
|
||||||
|
|
||||||
"'κει": [
|
|
||||||
{ORTH: "'κει", LEMMA: "εκεί", NORM: "εκεί"},
|
|
||||||
],
|
|
||||||
|
|
||||||
"'βλεπε": [
|
|
||||||
{ORTH: "'βλεπε", LEMMA: "βλέπω", NORM: "βλέπω"},
|
|
||||||
],
|
|
||||||
|
|
||||||
"'βγαινε": [
|
|
||||||
{ORTH: "'βγαινε", LEMMA: "βγαίνω", NORM: "βγαίνω"},
|
|
||||||
]
|
|
||||||
}
|
}
|
||||||
|
|
||||||
_exc.update(_other_exc)
|
_exc.update(_other_exc)
|
||||||
|
@ -307,12 +136,14 @@ for h in range(1, 12 + 1):
|
||||||
for period in ["π.μ.", "πμ"]:
|
for period in ["π.μ.", "πμ"]:
|
||||||
_exc["%d%s" % (h, period)] = [
|
_exc["%d%s" % (h, period)] = [
|
||||||
{ORTH: "%d" % h},
|
{ORTH: "%d" % h},
|
||||||
{ORTH: period, LEMMA: "π.μ.", NORM: "π.μ."}]
|
{ORTH: period, LEMMA: "π.μ.", NORM: "π.μ."},
|
||||||
|
]
|
||||||
|
|
||||||
for period in ["μ.μ.", "μμ"]:
|
for period in ["μ.μ.", "μμ"]:
|
||||||
_exc["%d%s" % (h, period)] = [
|
_exc["%d%s" % (h, period)] = [
|
||||||
{ORTH: "%d" % h},
|
{ORTH: "%d" % h},
|
||||||
{ORTH: period, LEMMA: "μ.μ.", NORM: "μ.μ."}]
|
{ORTH: period, LEMMA: "μ.μ.", NORM: "μ.μ."},
|
||||||
|
]
|
||||||
|
|
||||||
for exc_data in [
|
for exc_data in [
|
||||||
{ORTH: "ΑΓΡ.", LEMMA: "Αγροτικός", NORM: "Αγροτικός"},
|
{ORTH: "ΑΓΡ.", LEMMA: "Αγροτικός", NORM: "Αγροτικός"},
|
||||||
|
@ -339,43 +170,228 @@ for exc_data in [
|
||||||
|
|
||||||
for orth in [
|
for orth in [
|
||||||
"$ΗΠΑ",
|
"$ΗΠΑ",
|
||||||
"Α'", "Α.Ε.", "Α.Ε.Β.Ε.", "Α.Ε.Ι.", "Α.Ε.Π.", "Α.Μ.Α.", "Α.Π.Θ.", "Α.Τ.", "Α.Χ.", "ΑΝ.", "Αγ.", "Αλ.", "Αν.",
|
"Α'",
|
||||||
"Αντ.", "Απ.",
|
"Α.Ε.",
|
||||||
"Β'", "Β)", "Β.Ζ.", "Β.Ι.Ο.", "Β.Κ.", "Β.Μ.Α.", "Βασ.",
|
"Α.Ε.Β.Ε.",
|
||||||
"Γ'", "Γ)", "Γ.Γ.", "Γ.Δ.", "Γκ.",
|
"Α.Ε.Ι.",
|
||||||
"Δ.Ε.Η.", "Δ.Ε.Σ.Ε.", "Δ.Ν.", "Δ.Ο.Υ.", "Δ.Σ.", "Δ.Υ.", "ΔΙ.ΚΑ.Τ.Σ.Α.", "Δηλ.", "Διον.",
|
"Α.Ε.Π.",
|
||||||
"Ε.Α.", "Ε.Α.Κ.", "Ε.Α.Π.", "Ε.Ε.", "Ε.Κ.", "Ε.ΚΕ.ΠΙΣ.", "Ε.Λ.Α.", "Ε.Λ.Ι.Α.", "Ε.Π.Σ.", "Ε.Π.Τ.Α.", "Ε.Σ.Ε.Ε.Κ.",
|
"Α.Μ.Α.",
|
||||||
"Ε.Υ.Κ.", "ΕΕ.", "ΕΚ.", "ΕΛ.", "ΕΛ.ΑΣ.", "Εθν.", "Ελ.", "Εμ.", "Επ.", "Ευ.",
|
"Α.Π.Θ.",
|
||||||
"Η'", "Η.Π.Α.",
|
"Α.Τ.",
|
||||||
"ΘΕ.", "Θεμ.", "Θεοδ.", "Θρ.",
|
"Α.Χ.",
|
||||||
"Ι.Ε.Κ.", "Ι.Κ.Α.", "Ι.Κ.Υ.", "Ι.Σ.Θ.", "Ι.Χ.", "ΙΖ'", "ΙΧ.",
|
"ΑΝ.",
|
||||||
"Κ.Α.Α.", "Κ.Α.Ε.", "Κ.Β.Σ.", "Κ.Δ.", "Κ.Ε.", "Κ.Ε.Κ.", "Κ.Ι.", "Κ.Κ.", "Κ.Ι.Θ.", "Κ.Ι.Θ.", "Κ.ΚΕΚ.", "Κ.Ο.",
|
"Αγ.",
|
||||||
"Κ.Π.Ρ.", "ΚΑΤ.", "ΚΚ.", "Καν.", "Καρ.", "Κατ.", "Κυρ.", "Κων.",
|
"Αλ.",
|
||||||
"Λ.Α.", "Λ.χ.", "Λ.Χ.", "Λεωφ.", "Λι.",
|
"Αν.",
|
||||||
"Μ.Δ.Ε.", "Μ.Ε.Ο.", "Μ.Ζ.", "Μ.Μ.Ε.", "Μ.Ο.", "Μεγ.", "Μιλτ.", "Μιχ.",
|
"Αντ.",
|
||||||
"Ν.Δ.", "Ν.Ε.Α.", "Ν.Κ.", "Ν.Ο.", "Ν.Ο.Θ.", "Ν.Π.Δ.Δ.", "Ν.Υ.", "ΝΔ.", "Νικ.", "Ντ'", "Ντ.",
|
"Απ.",
|
||||||
"Ο'", "Ο.Α.", "Ο.Α.Ε.Δ.", "Ο.Δ.", "Ο.Ε.Ε.", "Ο.Ε.Ε.Κ.", "Ο.Η.Ε.", "Ο.Κ.",
|
"Β'",
|
||||||
"Π.Δ.", "Π.Ε.Κ.Δ.Υ.", "Π.Ε.Π.", "Π.Μ.Σ.", "ΠΟΛ.", "Π.Χ.", "Παρ.", "Πλ.", "Πρ.",
|
"Β)",
|
||||||
"Σ.Δ.Ο.Ε.", "Σ.Ε.", "Σ.Ε.Κ.", "Σ.Π.Δ.Ω.Β.", "Σ.Τ.", "Σαβ.", "Στ.", "ΣτΕ.", "Στρ.",
|
"Β.Ζ.",
|
||||||
"Τ.Α.", "Τ.Ε.Ε.", "Τ.Ε.Ι.", "ΤΡ.", "Τζ.", "Τηλ.",
|
"Β.Ι.Ο.",
|
||||||
"Υ.Γ.", "ΥΓ.", "ΥΠ.Ε.Π.Θ.",
|
"Β.Κ.",
|
||||||
"Φ.Α.Β.Ε.", "Φ.Κ.", "Φ.Σ.", "Φ.Χ.", "Φ.Π.Α.", "Φιλ.",
|
"Β.Μ.Α.",
|
||||||
"Χ.Α.Α.", "ΧΡ.", "Χ.Χ.", "Χαρ.", "Χιλ.", "Χρ.",
|
"Βασ.",
|
||||||
"άγ.", "άρθρ.", "αι.", "αν.", "απ.", "αρ.", "αριθ.", "αριθμ.",
|
"Γ'",
|
||||||
"β'", "βλ.",
|
"Γ)",
|
||||||
"γ.γ.", "γεν.", "γραμμ.",
|
"Γ.Γ.",
|
||||||
"δ.δ.", "δ.σ.", "δηλ.", "δισ.", "δολ.", "δρχ.",
|
"Γ.Δ.",
|
||||||
"εκ.", "εκατ.", "ελ.",
|
"Γκ.",
|
||||||
|
"Δ.Ε.Η.",
|
||||||
|
"Δ.Ε.Σ.Ε.",
|
||||||
|
"Δ.Ν.",
|
||||||
|
"Δ.Ο.Υ.",
|
||||||
|
"Δ.Σ.",
|
||||||
|
"Δ.Υ.",
|
||||||
|
"ΔΙ.ΚΑ.Τ.Σ.Α.",
|
||||||
|
"Δηλ.",
|
||||||
|
"Διον.",
|
||||||
|
"Ε.Α.",
|
||||||
|
"Ε.Α.Κ.",
|
||||||
|
"Ε.Α.Π.",
|
||||||
|
"Ε.Ε.",
|
||||||
|
"Ε.Κ.",
|
||||||
|
"Ε.ΚΕ.ΠΙΣ.",
|
||||||
|
"Ε.Λ.Α.",
|
||||||
|
"Ε.Λ.Ι.Α.",
|
||||||
|
"Ε.Π.Σ.",
|
||||||
|
"Ε.Π.Τ.Α.",
|
||||||
|
"Ε.Σ.Ε.Ε.Κ.",
|
||||||
|
"Ε.Υ.Κ.",
|
||||||
|
"ΕΕ.",
|
||||||
|
"ΕΚ.",
|
||||||
|
"ΕΛ.",
|
||||||
|
"ΕΛ.ΑΣ.",
|
||||||
|
"Εθν.",
|
||||||
|
"Ελ.",
|
||||||
|
"Εμ.",
|
||||||
|
"Επ.",
|
||||||
|
"Ευ.",
|
||||||
|
"Η'",
|
||||||
|
"Η.Π.Α.",
|
||||||
|
"ΘΕ.",
|
||||||
|
"Θεμ.",
|
||||||
|
"Θεοδ.",
|
||||||
|
"Θρ.",
|
||||||
|
"Ι.Ε.Κ.",
|
||||||
|
"Ι.Κ.Α.",
|
||||||
|
"Ι.Κ.Υ.",
|
||||||
|
"Ι.Σ.Θ.",
|
||||||
|
"Ι.Χ.",
|
||||||
|
"ΙΖ'",
|
||||||
|
"ΙΧ.",
|
||||||
|
"Κ.Α.Α.",
|
||||||
|
"Κ.Α.Ε.",
|
||||||
|
"Κ.Β.Σ.",
|
||||||
|
"Κ.Δ.",
|
||||||
|
"Κ.Ε.",
|
||||||
|
"Κ.Ε.Κ.",
|
||||||
|
"Κ.Ι.",
|
||||||
|
"Κ.Κ.",
|
||||||
|
"Κ.Ι.Θ.",
|
||||||
|
"Κ.Ι.Θ.",
|
||||||
|
"Κ.ΚΕΚ.",
|
||||||
|
"Κ.Ο.",
|
||||||
|
"Κ.Π.Ρ.",
|
||||||
|
"ΚΑΤ.",
|
||||||
|
"ΚΚ.",
|
||||||
|
"Καν.",
|
||||||
|
"Καρ.",
|
||||||
|
"Κατ.",
|
||||||
|
"Κυρ.",
|
||||||
|
"Κων.",
|
||||||
|
"Λ.Α.",
|
||||||
|
"Λ.χ.",
|
||||||
|
"Λ.Χ.",
|
||||||
|
"Λεωφ.",
|
||||||
|
"Λι.",
|
||||||
|
"Μ.Δ.Ε.",
|
||||||
|
"Μ.Ε.Ο.",
|
||||||
|
"Μ.Ζ.",
|
||||||
|
"Μ.Μ.Ε.",
|
||||||
|
"Μ.Ο.",
|
||||||
|
"Μεγ.",
|
||||||
|
"Μιλτ.",
|
||||||
|
"Μιχ.",
|
||||||
|
"Ν.Δ.",
|
||||||
|
"Ν.Ε.Α.",
|
||||||
|
"Ν.Κ.",
|
||||||
|
"Ν.Ο.",
|
||||||
|
"Ν.Ο.Θ.",
|
||||||
|
"Ν.Π.Δ.Δ.",
|
||||||
|
"Ν.Υ.",
|
||||||
|
"ΝΔ.",
|
||||||
|
"Νικ.",
|
||||||
|
"Ντ'",
|
||||||
|
"Ντ.",
|
||||||
|
"Ο'",
|
||||||
|
"Ο.Α.",
|
||||||
|
"Ο.Α.Ε.Δ.",
|
||||||
|
"Ο.Δ.",
|
||||||
|
"Ο.Ε.Ε.",
|
||||||
|
"Ο.Ε.Ε.Κ.",
|
||||||
|
"Ο.Η.Ε.",
|
||||||
|
"Ο.Κ.",
|
||||||
|
"Π.Δ.",
|
||||||
|
"Π.Ε.Κ.Δ.Υ.",
|
||||||
|
"Π.Ε.Π.",
|
||||||
|
"Π.Μ.Σ.",
|
||||||
|
"ΠΟΛ.",
|
||||||
|
"Π.Χ.",
|
||||||
|
"Παρ.",
|
||||||
|
"Πλ.",
|
||||||
|
"Πρ.",
|
||||||
|
"Σ.Δ.Ο.Ε.",
|
||||||
|
"Σ.Ε.",
|
||||||
|
"Σ.Ε.Κ.",
|
||||||
|
"Σ.Π.Δ.Ω.Β.",
|
||||||
|
"Σ.Τ.",
|
||||||
|
"Σαβ.",
|
||||||
|
"Στ.",
|
||||||
|
"ΣτΕ.",
|
||||||
|
"Στρ.",
|
||||||
|
"Τ.Α.",
|
||||||
|
"Τ.Ε.Ε.",
|
||||||
|
"Τ.Ε.Ι.",
|
||||||
|
"ΤΡ.",
|
||||||
|
"Τζ.",
|
||||||
|
"Τηλ.",
|
||||||
|
"Υ.Γ.",
|
||||||
|
"ΥΓ.",
|
||||||
|
"ΥΠ.Ε.Π.Θ.",
|
||||||
|
"Φ.Α.Β.Ε.",
|
||||||
|
"Φ.Κ.",
|
||||||
|
"Φ.Σ.",
|
||||||
|
"Φ.Χ.",
|
||||||
|
"Φ.Π.Α.",
|
||||||
|
"Φιλ.",
|
||||||
|
"Χ.Α.Α.",
|
||||||
|
"ΧΡ.",
|
||||||
|
"Χ.Χ.",
|
||||||
|
"Χαρ.",
|
||||||
|
"Χιλ.",
|
||||||
|
"Χρ.",
|
||||||
|
"άγ.",
|
||||||
|
"άρθρ.",
|
||||||
|
"αι.",
|
||||||
|
"αν.",
|
||||||
|
"απ.",
|
||||||
|
"αρ.",
|
||||||
|
"αριθ.",
|
||||||
|
"αριθμ.",
|
||||||
|
"β'",
|
||||||
|
"βλ.",
|
||||||
|
"γ.γ.",
|
||||||
|
"γεν.",
|
||||||
|
"γραμμ.",
|
||||||
|
"δ.δ.",
|
||||||
|
"δ.σ.",
|
||||||
|
"δηλ.",
|
||||||
|
"δισ.",
|
||||||
|
"δολ.",
|
||||||
|
"δρχ.",
|
||||||
|
"εκ.",
|
||||||
|
"εκατ.",
|
||||||
|
"ελ.",
|
||||||
"θιν'",
|
"θιν'",
|
||||||
"κ.", "κ.ά.", "κ.α.", "κ.κ.", "κ.λπ.", "κ.ο.κ.", "κ.τ.λ.", "κλπ.", "κτλ.", "κυβ.",
|
"κ.",
|
||||||
|
"κ.ά.",
|
||||||
|
"κ.α.",
|
||||||
|
"κ.κ.",
|
||||||
|
"κ.λπ.",
|
||||||
|
"κ.ο.κ.",
|
||||||
|
"κ.τ.λ.",
|
||||||
|
"κλπ.",
|
||||||
|
"κτλ.",
|
||||||
|
"κυβ.",
|
||||||
"λ.χ.",
|
"λ.χ.",
|
||||||
"μ.", "μ.Χ.", "μ.μ.", "μιλ.",
|
"μ.",
|
||||||
|
"μ.Χ.",
|
||||||
|
"μ.μ.",
|
||||||
|
"μιλ.",
|
||||||
"ντ'",
|
"ντ'",
|
||||||
"π.Χ.", "π.β.", "π.δ.", "π.μ.", "π.χ.",
|
"π.Χ.",
|
||||||
"σ.", "σ.α.λ.", "σ.σ.", "σελ.", "στρ.",
|
"π.β.",
|
||||||
"τ'ς", "τ.μ.", "τετ.", "τετρ.", "τηλ.", "τρισ.", "τόν.",
|
"π.δ.",
|
||||||
|
"π.μ.",
|
||||||
|
"π.χ.",
|
||||||
|
"σ.",
|
||||||
|
"σ.α.λ.",
|
||||||
|
"σ.σ.",
|
||||||
|
"σελ.",
|
||||||
|
"στρ.",
|
||||||
|
"τ'ς",
|
||||||
|
"τ.μ.",
|
||||||
|
"τετ.",
|
||||||
|
"τετρ.",
|
||||||
|
"τηλ.",
|
||||||
|
"τρισ.",
|
||||||
|
"τόν.",
|
||||||
"υπ.",
|
"υπ.",
|
||||||
"χ.μ.", "χγρ.", "χιλ.", "χλμ."
|
"χ.μ.",
|
||||||
|
"χγρ.",
|
||||||
|
"χιλ.",
|
||||||
|
"χλμ.",
|
||||||
]:
|
]:
|
||||||
_exc[orth] = [{ORTH: orth}]
|
_exc[orth] = [{ORTH: orth}]
|
||||||
|
|
||||||
|
|
|
@ -16,15 +16,18 @@ from ...language import Language
|
||||||
from ...attrs import LANG, NORM
|
from ...attrs import LANG, NORM
|
||||||
from ...util import update_exc, add_lookups
|
from ...util import update_exc, add_lookups
|
||||||
|
|
||||||
|
|
||||||
def _return_en(_):
|
def _return_en(_):
|
||||||
return 'en'
|
return "en"
|
||||||
|
|
||||||
|
|
||||||
class EnglishDefaults(Language.Defaults):
|
class EnglishDefaults(Language.Defaults):
|
||||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||||
lex_attr_getters.update(LEX_ATTRS)
|
lex_attr_getters.update(LEX_ATTRS)
|
||||||
lex_attr_getters[LANG] = _return_en
|
lex_attr_getters[LANG] = _return_en
|
||||||
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM],
|
lex_attr_getters[NORM] = add_lookups(
|
||||||
BASE_NORMS, NORM_EXCEPTIONS)
|
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS
|
||||||
|
)
|
||||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||||
tag_map = TAG_MAP
|
tag_map = TAG_MAP
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
|
@ -37,8 +40,8 @@ class EnglishDefaults(Language.Defaults):
|
||||||
|
|
||||||
|
|
||||||
class English(Language):
|
class English(Language):
|
||||||
lang = 'en'
|
lang = "en"
|
||||||
Defaults = EnglishDefaults
|
Defaults = EnglishDefaults
|
||||||
|
|
||||||
|
|
||||||
__all__ = ['English']
|
__all__ = ["English"]
|
||||||
|
|
|
@ -18,5 +18,5 @@ sentences = [
|
||||||
"Where are you?",
|
"Where are you?",
|
||||||
"Who is the president of France?",
|
"Who is the president of France?",
|
||||||
"What is the capital of the United States?",
|
"What is the capital of the United States?",
|
||||||
"When was Barack Obama born?"
|
"When was Barack Obama born?",
|
||||||
]
|
]
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from .lookup import LOOKUP
|
from .lookup import LOOKUP # noqa: F401
|
||||||
from ._adjectives import ADJECTIVES
|
from ._adjectives import ADJECTIVES
|
||||||
from ._adjectives_irreg import ADJECTIVES_IRREG
|
from ._adjectives_irreg import ADJECTIVES_IRREG
|
||||||
from ._adverbs import ADVERBS
|
from ._adverbs import ADVERBS
|
||||||
|
@ -13,10 +13,18 @@ from ._verbs_irreg import VERBS_IRREG
|
||||||
from ._lemma_rules import ADJECTIVE_RULES, NOUN_RULES, VERB_RULES, PUNCT_RULES
|
from ._lemma_rules import ADJECTIVE_RULES, NOUN_RULES, VERB_RULES, PUNCT_RULES
|
||||||
|
|
||||||
|
|
||||||
LEMMA_INDEX = {'adj': ADJECTIVES, 'adv': ADVERBS, 'noun': NOUNS, 'verb': VERBS}
|
LEMMA_INDEX = {"adj": ADJECTIVES, "adv": ADVERBS, "noun": NOUNS, "verb": VERBS}
|
||||||
|
|
||||||
LEMMA_EXC = {'adj': ADJECTIVES_IRREG, 'adv': ADVERBS_IRREG, 'noun': NOUNS_IRREG,
|
LEMMA_EXC = {
|
||||||
'verb': VERBS_IRREG}
|
"adj": ADJECTIVES_IRREG,
|
||||||
|
"adv": ADVERBS_IRREG,
|
||||||
|
"noun": NOUNS_IRREG,
|
||||||
|
"verb": VERBS_IRREG,
|
||||||
|
}
|
||||||
|
|
||||||
LEMMA_RULES = {'adj': ADJECTIVE_RULES, 'noun': NOUN_RULES, 'verb': VERB_RULES,
|
LEMMA_RULES = {
|
||||||
'punct': PUNCT_RULES}
|
"adj": ADJECTIVE_RULES,
|
||||||
|
"noun": NOUN_RULES,
|
||||||
|
"verb": VERB_RULES,
|
||||||
|
"punct": PUNCT_RULES,
|
||||||
|
}
|
||||||
|
|
|
@ -2,7 +2,8 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
ADJECTIVES = set("""
|
ADJECTIVES = set(
|
||||||
|
"""
|
||||||
.22-caliber .22-calibre .38-caliber .38-calibre .45-caliber .45-calibre 0 1 10
|
.22-caliber .22-calibre .38-caliber .38-calibre .45-caliber .45-calibre 0 1 10
|
||||||
10-membered 100 1000 1000th 100th 101 101st 105 105th 10th 11 110 110th 115
|
10-membered 100 1000 1000th 100th 101 101st 105 105th 10th 11 110 110th 115
|
||||||
115th 11th 12 120 120th 125 125th 12th 13 130 130th 135 135th 13th 14 140 140th
|
115th 11th 12 120 120th 125 125th 12th 13 130 130th 135 135th 13th 14 140 140th
|
||||||
|
@ -2824,4 +2825,5 @@ zealous zenithal zero zeroth zestful zesty zig-zag zigzag zillion zimbabwean
|
||||||
zionist zippy zodiacal zoftig zoic zolaesque zonal zonary zoological zoonotic
|
zionist zippy zodiacal zoftig zoic zolaesque zonal zonary zoological zoonotic
|
||||||
zoophagous zoroastrian zygodactyl zygomatic zygomorphic zygomorphous zygotic
|
zoophagous zoroastrian zygodactyl zygomatic zygomorphic zygomorphous zygotic
|
||||||
zymoid zymolytic zymotic
|
zymoid zymolytic zymotic
|
||||||
""".split())
|
""".split()
|
||||||
|
)
|
||||||
|
|
|
@ -48,8 +48,7 @@ ADJECTIVES_IRREG = {
|
||||||
"bendier": ("bendy",),
|
"bendier": ("bendy",),
|
||||||
"bendiest": ("bendy",),
|
"bendiest": ("bendy",),
|
||||||
"best": ("good",),
|
"best": ("good",),
|
||||||
"better": ("good",
|
"better": ("good", "well"),
|
||||||
"well",),
|
|
||||||
"bigger": ("big",),
|
"bigger": ("big",),
|
||||||
"biggest": ("big",),
|
"biggest": ("big",),
|
||||||
"bitchier": ("bitchy",),
|
"bitchier": ("bitchy",),
|
||||||
|
@ -289,10 +288,8 @@ ADJECTIVES_IRREG = {
|
||||||
"doughtiest": ("doughty",),
|
"doughtiest": ("doughty",),
|
||||||
"dowdier": ("dowdy",),
|
"dowdier": ("dowdy",),
|
||||||
"dowdiest": ("dowdy",),
|
"dowdiest": ("dowdy",),
|
||||||
"dowier": ("dowie",
|
"dowier": ("dowie", "dowy"),
|
||||||
"dowy",),
|
"dowiest": ("dowie", "dowy"),
|
||||||
"dowiest": ("dowie",
|
|
||||||
"dowy",),
|
|
||||||
"downer": ("downer",),
|
"downer": ("downer",),
|
||||||
"downier": ("downy",),
|
"downier": ("downy",),
|
||||||
"downiest": ("downy",),
|
"downiest": ("downy",),
|
||||||
|
@ -1494,5 +1491,5 @@ ADJECTIVES_IRREG = {
|
||||||
"zanier": ("zany",),
|
"zanier": ("zany",),
|
||||||
"zaniest": ("zany",),
|
"zaniest": ("zany",),
|
||||||
"zippier": ("zippy",),
|
"zippier": ("zippy",),
|
||||||
"zippiest": ("zippy",)
|
"zippiest": ("zippy",),
|
||||||
}
|
}
|
||||||
|
|
|
@ -2,7 +2,8 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
ADVERBS = set("""
|
ADVERBS = set(
|
||||||
|
"""
|
||||||
'tween a.d. a.k.a. a.m. aback abaft abaxially abeam abed abjectly ably
|
'tween a.d. a.k.a. a.m. aback abaft abaxially abeam abed abjectly ably
|
||||||
abnormally aboard abominably aborad abortively about above aboveboard abreast
|
abnormally aboard abominably aborad abortively about above aboveboard abreast
|
||||||
abroad abruptly absently absentmindedly absolutely abstemiously abstractedly
|
abroad abruptly absently absentmindedly absolutely abstemiously abstractedly
|
||||||
|
@ -540,4 +541,5 @@ wordlessly worriedly worryingly worse worst worthily worthlessly wrathfully
|
||||||
wretchedly wrong wrongfully wrongheadedly wrongly wryly yea yeah yearly
|
wretchedly wrong wrongfully wrongheadedly wrongly wryly yea yeah yearly
|
||||||
yearningly yesterday yet yieldingly yon yonder youthfully zealously zestfully
|
yearningly yesterday yet yieldingly yon yonder youthfully zealously zestfully
|
||||||
zestily zigzag
|
zestily zigzag
|
||||||
""".split())
|
""".split()
|
||||||
|
)
|
||||||
|
|
|
@ -9,5 +9,5 @@ ADVERBS_IRREG = {
|
||||||
"farther": ("far",),
|
"farther": ("far",),
|
||||||
"further": ("far",),
|
"further": ("far",),
|
||||||
"harder": ("hard",),
|
"harder": ("hard",),
|
||||||
"hardest": ("hard",)
|
"hardest": ("hard",),
|
||||||
}
|
}
|
||||||
|
|
|
@ -2,12 +2,7 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
ADJECTIVE_RULES = [
|
ADJECTIVE_RULES = [["er", ""], ["est", ""], ["er", "e"], ["est", "e"]]
|
||||||
["er", ""],
|
|
||||||
["est", ""],
|
|
||||||
["er", "e"],
|
|
||||||
["est", "e"]
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
NOUN_RULES = [
|
NOUN_RULES = [
|
||||||
|
@ -19,7 +14,7 @@ NOUN_RULES = [
|
||||||
["ches", "ch"],
|
["ches", "ch"],
|
||||||
["shes", "sh"],
|
["shes", "sh"],
|
||||||
["men", "man"],
|
["men", "man"],
|
||||||
["ies", "y"]
|
["ies", "y"],
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
@ -31,13 +26,8 @@ VERB_RULES = [
|
||||||
["ed", "e"],
|
["ed", "e"],
|
||||||
["ed", ""],
|
["ed", ""],
|
||||||
["ing", "e"],
|
["ing", "e"],
|
||||||
["ing", ""]
|
["ing", ""],
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
PUNCT_RULES = [
|
PUNCT_RULES = [["“", '"'], ["”", '"'], ["\u2018", "'"], ["\u2019", "'"]]
|
||||||
["“", "\""],
|
|
||||||
["”", "\""],
|
|
||||||
["\u2018", "'"],
|
|
||||||
["\u2019", "'"]
|
|
||||||
]
|
|
||||||
|
|
|
@ -2,7 +2,8 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
NOUNS = set("""
|
NOUNS = set(
|
||||||
|
"""
|
||||||
'hood .22 0 1 1-dodecanol 1-hitter 10 100 1000 10000 100000 1000000 1000000000
|
'hood .22 0 1 1-dodecanol 1-hitter 10 100 1000 10000 100000 1000000 1000000000
|
||||||
1000000000000 11 11-plus 12 120 13 14 144 15 1530s 16 17 1728 1750s 1760s 1770s
|
1000000000000 11 11-plus 12 120 13 14 144 15 1530s 16 17 1728 1750s 1760s 1770s
|
||||||
1780s 1790s 18 1820s 1830s 1840s 1850s 1860s 1870s 1880s 1890s 19 1900s 1920s
|
1780s 1790s 18 1820s 1830s 1840s 1850s 1860s 1870s 1880s 1890s 19 1900s 1920s
|
||||||
|
@ -7110,4 +7111,5 @@ zurvanism zweig zwieback zwingli zworykin zydeco zygnema zygnemales
|
||||||
zygnemataceae zygnematales zygocactus zygoma zygomatic zygomycetes zygomycota
|
zygnemataceae zygnematales zygocactus zygoma zygomatic zygomycetes zygomycota
|
||||||
zygomycotina zygophyllaceae zygophyllum zygoptera zygospore zygote zygotene
|
zygomycotina zygophyllaceae zygophyllum zygoptera zygospore zygote zygotene
|
||||||
zyloprim zymase zymogen zymology zymolysis zymosis zymurgy zyrian
|
zyloprim zymase zymogen zymology zymolysis zymosis zymurgy zyrian
|
||||||
""".split())
|
""".split()
|
||||||
|
)
|
||||||
|
|
|
@ -2,7 +2,8 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
VERBS = set("""
|
VERBS = set(
|
||||||
|
"""
|
||||||
aah abacinate abandon abase abash abate abbreviate abdicate abduce abduct
|
aah abacinate abandon abase abash abate abbreviate abdicate abduce abduct
|
||||||
aberrate abet abhor abide abjure ablactate ablate abnegate abolish abominate
|
aberrate abet abhor abide abjure ablactate ablate abnegate abolish abominate
|
||||||
abort abound about-face abrade abrase abreact abridge abrogate abscise abscond
|
abort abound about-face abrade abrase abreact abridge abrogate abscise abscond
|
||||||
|
@ -912,4 +913,5 @@ wreck wrench wrest wrestle wrick wriggle wring wrinkle write writhe wrong x-ray
|
||||||
xerox yacht yack yak yammer yank yap yarn yarn-dye yaup yaw yawl yawn yawp yearn
|
xerox yacht yack yak yammer yank yap yarn yarn-dye yaup yaw yawl yawn yawp yearn
|
||||||
yell yellow yelp yen yield yip yodel yoke yowl zap zero zest zigzag zinc zip
|
yell yellow yelp yen yield yip yodel yoke yowl zap zero zest zigzag zinc zip
|
||||||
zipper zone zoom
|
zipper zone zoom
|
||||||
""".split())
|
""".split()
|
||||||
|
)
|
||||||
|
|
|
@ -4,22 +4,54 @@ from __future__ import unicode_literals
|
||||||
from ...attrs import LIKE_NUM
|
from ...attrs import LIKE_NUM
|
||||||
|
|
||||||
|
|
||||||
_num_words = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven',
|
_num_words = [
|
||||||
'eight', 'nine', 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen',
|
"zero",
|
||||||
'fifteen', 'sixteen', 'seventeen', 'eighteen', 'nineteen', 'twenty',
|
"one",
|
||||||
'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety',
|
"two",
|
||||||
'hundred', 'thousand', 'million', 'billion', 'trillion', 'quadrillion',
|
"three",
|
||||||
'gajillion', 'bazillion']
|
"four",
|
||||||
|
"five",
|
||||||
|
"six",
|
||||||
|
"seven",
|
||||||
|
"eight",
|
||||||
|
"nine",
|
||||||
|
"ten",
|
||||||
|
"eleven",
|
||||||
|
"twelve",
|
||||||
|
"thirteen",
|
||||||
|
"fourteen",
|
||||||
|
"fifteen",
|
||||||
|
"sixteen",
|
||||||
|
"seventeen",
|
||||||
|
"eighteen",
|
||||||
|
"nineteen",
|
||||||
|
"twenty",
|
||||||
|
"thirty",
|
||||||
|
"forty",
|
||||||
|
"fifty",
|
||||||
|
"sixty",
|
||||||
|
"seventy",
|
||||||
|
"eighty",
|
||||||
|
"ninety",
|
||||||
|
"hundred",
|
||||||
|
"thousand",
|
||||||
|
"million",
|
||||||
|
"billion",
|
||||||
|
"trillion",
|
||||||
|
"quadrillion",
|
||||||
|
"gajillion",
|
||||||
|
"bazillion",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
def like_num(text):
|
def like_num(text):
|
||||||
if text.startswith(('+', '-', '±', '~')):
|
if text.startswith(("+", "-", "±", "~")):
|
||||||
text = text[1:]
|
text = text[1:]
|
||||||
text = text.replace(',', '').replace('.', '')
|
text = text.replace(",", "").replace(".", "")
|
||||||
if text.isdigit():
|
if text.isdigit():
|
||||||
return True
|
return True
|
||||||
if text.count('/') == 1:
|
if text.count("/") == 1:
|
||||||
num, denom = text.split('/')
|
num, denom = text.split("/")
|
||||||
if num.isdigit() and denom.isdigit():
|
if num.isdigit() and denom.isdigit():
|
||||||
return True
|
return True
|
||||||
if text.lower() in _num_words:
|
if text.lower() in _num_words:
|
||||||
|
@ -27,6 +59,4 @@ def like_num(text):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
LEX_ATTRS = {
|
LEX_ATTRS = {LIKE_NUM: like_num}
|
||||||
LIKE_NUM: like_num
|
|
||||||
}
|
|
||||||
|
|
|
@ -6,66 +6,321 @@ from ...symbols import LEMMA, PRON_LEMMA
|
||||||
|
|
||||||
MORPH_RULES = {
|
MORPH_RULES = {
|
||||||
"PRP": {
|
"PRP": {
|
||||||
"I": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Sing", "Case": "Nom"},
|
"I": {
|
||||||
"me": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Sing", "Case": "Acc"},
|
LEMMA: PRON_LEMMA,
|
||||||
"you": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two"},
|
"PronType": "Prs",
|
||||||
"he": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Masc", "Case": "Nom"},
|
"Person": "One",
|
||||||
"him": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Masc", "Case": "Acc"},
|
"Number": "Sing",
|
||||||
"she": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Fem", "Case": "Nom"},
|
"Case": "Nom",
|
||||||
"her": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Fem", "Case": "Acc"},
|
},
|
||||||
"it": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Neut"},
|
"me": {
|
||||||
"we": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Plur", "Case": "Nom"},
|
LEMMA: PRON_LEMMA,
|
||||||
"us": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Plur", "Case": "Acc"},
|
"PronType": "Prs",
|
||||||
"they": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Plur", "Case": "Nom"},
|
"Person": "One",
|
||||||
"them": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Plur", "Case": "Acc"},
|
"Number": "Sing",
|
||||||
|
"Case": "Acc",
|
||||||
"mine": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Sing", "Poss": "Yes", "Reflex": "Yes"},
|
},
|
||||||
"his": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Masc", "Poss": "Yes", "Reflex": "Yes"},
|
"you": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two"},
|
||||||
"hers": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Fem", "Poss": "Yes", "Reflex": "Yes"},
|
"he": {
|
||||||
"its": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Neut", "Poss": "Yes", "Reflex": "Yes"},
|
LEMMA: PRON_LEMMA,
|
||||||
"ours": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Plur", "Poss": "Yes", "Reflex": "Yes"},
|
"PronType": "Prs",
|
||||||
"yours": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two", "Number": "Plur", "Poss": "Yes", "Reflex": "Yes"},
|
"Person": "Three",
|
||||||
"theirs": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Plur", "Poss": "Yes", "Reflex": "Yes"},
|
"Number": "Sing",
|
||||||
|
"Gender": "Masc",
|
||||||
"myself": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Sing", "Case": "Acc", "Reflex": "Yes"},
|
"Case": "Nom",
|
||||||
"yourself": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two", "Case": "Acc", "Reflex": "Yes"},
|
},
|
||||||
"himself": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Case": "Acc", "Gender": "Masc", "Reflex": "Yes"},
|
"him": {
|
||||||
"herself": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Case": "Acc", "Gender": "Fem", "Reflex": "Yes"},
|
LEMMA: PRON_LEMMA,
|
||||||
"itself": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Case": "Acc", "Gender": "Neut", "Reflex": "Yes"},
|
"PronType": "Prs",
|
||||||
"themself": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Case": "Acc", "Reflex": "Yes"},
|
"Person": "Three",
|
||||||
"ourselves": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Plur", "Case": "Acc", "Reflex": "Yes"},
|
"Number": "Sing",
|
||||||
"yourselves": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two", "Case": "Acc", "Reflex": "Yes"},
|
"Gender": "Masc",
|
||||||
"themselves": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Plur", "Case": "Acc", "Reflex": "Yes"}
|
"Case": "Acc",
|
||||||
|
},
|
||||||
|
"she": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Person": "Three",
|
||||||
|
"Number": "Sing",
|
||||||
|
"Gender": "Fem",
|
||||||
|
"Case": "Nom",
|
||||||
|
},
|
||||||
|
"her": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Person": "Three",
|
||||||
|
"Number": "Sing",
|
||||||
|
"Gender": "Fem",
|
||||||
|
"Case": "Acc",
|
||||||
|
},
|
||||||
|
"it": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Person": "Three",
|
||||||
|
"Number": "Sing",
|
||||||
|
"Gender": "Neut",
|
||||||
|
},
|
||||||
|
"we": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Person": "One",
|
||||||
|
"Number": "Plur",
|
||||||
|
"Case": "Nom",
|
||||||
|
},
|
||||||
|
"us": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Person": "One",
|
||||||
|
"Number": "Plur",
|
||||||
|
"Case": "Acc",
|
||||||
|
},
|
||||||
|
"they": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Person": "Three",
|
||||||
|
"Number": "Plur",
|
||||||
|
"Case": "Nom",
|
||||||
|
},
|
||||||
|
"them": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Person": "Three",
|
||||||
|
"Number": "Plur",
|
||||||
|
"Case": "Acc",
|
||||||
|
},
|
||||||
|
"mine": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Person": "One",
|
||||||
|
"Number": "Sing",
|
||||||
|
"Poss": "Yes",
|
||||||
|
"Reflex": "Yes",
|
||||||
|
},
|
||||||
|
"his": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Person": "Three",
|
||||||
|
"Number": "Sing",
|
||||||
|
"Gender": "Masc",
|
||||||
|
"Poss": "Yes",
|
||||||
|
"Reflex": "Yes",
|
||||||
|
},
|
||||||
|
"hers": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Person": "Three",
|
||||||
|
"Number": "Sing",
|
||||||
|
"Gender": "Fem",
|
||||||
|
"Poss": "Yes",
|
||||||
|
"Reflex": "Yes",
|
||||||
|
},
|
||||||
|
"its": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Person": "Three",
|
||||||
|
"Number": "Sing",
|
||||||
|
"Gender": "Neut",
|
||||||
|
"Poss": "Yes",
|
||||||
|
"Reflex": "Yes",
|
||||||
|
},
|
||||||
|
"ours": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Person": "One",
|
||||||
|
"Number": "Plur",
|
||||||
|
"Poss": "Yes",
|
||||||
|
"Reflex": "Yes",
|
||||||
|
},
|
||||||
|
"yours": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Person": "Two",
|
||||||
|
"Number": "Plur",
|
||||||
|
"Poss": "Yes",
|
||||||
|
"Reflex": "Yes",
|
||||||
|
},
|
||||||
|
"theirs": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Person": "Three",
|
||||||
|
"Number": "Plur",
|
||||||
|
"Poss": "Yes",
|
||||||
|
"Reflex": "Yes",
|
||||||
|
},
|
||||||
|
"myself": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Person": "One",
|
||||||
|
"Number": "Sing",
|
||||||
|
"Case": "Acc",
|
||||||
|
"Reflex": "Yes",
|
||||||
|
},
|
||||||
|
"yourself": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Person": "Two",
|
||||||
|
"Case": "Acc",
|
||||||
|
"Reflex": "Yes",
|
||||||
|
},
|
||||||
|
"himself": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Person": "Three",
|
||||||
|
"Number": "Sing",
|
||||||
|
"Case": "Acc",
|
||||||
|
"Gender": "Masc",
|
||||||
|
"Reflex": "Yes",
|
||||||
|
},
|
||||||
|
"herself": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Person": "Three",
|
||||||
|
"Number": "Sing",
|
||||||
|
"Case": "Acc",
|
||||||
|
"Gender": "Fem",
|
||||||
|
"Reflex": "Yes",
|
||||||
|
},
|
||||||
|
"itself": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Person": "Three",
|
||||||
|
"Number": "Sing",
|
||||||
|
"Case": "Acc",
|
||||||
|
"Gender": "Neut",
|
||||||
|
"Reflex": "Yes",
|
||||||
|
},
|
||||||
|
"themself": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Person": "Three",
|
||||||
|
"Number": "Sing",
|
||||||
|
"Case": "Acc",
|
||||||
|
"Reflex": "Yes",
|
||||||
|
},
|
||||||
|
"ourselves": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Person": "One",
|
||||||
|
"Number": "Plur",
|
||||||
|
"Case": "Acc",
|
||||||
|
"Reflex": "Yes",
|
||||||
|
},
|
||||||
|
"yourselves": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Person": "Two",
|
||||||
|
"Case": "Acc",
|
||||||
|
"Reflex": "Yes",
|
||||||
|
},
|
||||||
|
"themselves": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Person": "Three",
|
||||||
|
"Number": "Plur",
|
||||||
|
"Case": "Acc",
|
||||||
|
"Reflex": "Yes",
|
||||||
|
},
|
||||||
},
|
},
|
||||||
|
|
||||||
"PRP$": {
|
"PRP$": {
|
||||||
"my": {LEMMA: PRON_LEMMA, "Person": "One", "Number": "Sing", "PronType": "Prs", "Poss": "Yes"},
|
"my": {
|
||||||
"your": {LEMMA: PRON_LEMMA, "Person": "Two", "PronType": "Prs", "Poss": "Yes"},
|
LEMMA: PRON_LEMMA,
|
||||||
"his": {LEMMA: PRON_LEMMA, "Person": "Three", "Number": "Sing", "Gender": "Masc", "PronType": "Prs", "Poss": "Yes"},
|
"Person": "One",
|
||||||
"her": {LEMMA: PRON_LEMMA, "Person": "Three", "Number": "Sing", "Gender": "Fem", "PronType": "Prs", "Poss": "Yes"},
|
"Number": "Sing",
|
||||||
"its": {LEMMA: PRON_LEMMA, "Person": "Three", "Number": "Sing", "Gender": "Neut", "PronType": "Prs", "Poss": "Yes"},
|
"PronType": "Prs",
|
||||||
"our": {LEMMA: PRON_LEMMA, "Person": "One", "Number": "Plur", "PronType": "Prs", "Poss": "Yes"},
|
"Poss": "Yes",
|
||||||
"their": {LEMMA: PRON_LEMMA, "Person": "Three", "Number": "Plur", "PronType": "Prs", "Poss": "Yes"}
|
},
|
||||||
|
"your": {LEMMA: PRON_LEMMA, "Person": "Two", "PronType": "Prs", "Poss": "Yes"},
|
||||||
|
"his": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Person": "Three",
|
||||||
|
"Number": "Sing",
|
||||||
|
"Gender": "Masc",
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Poss": "Yes",
|
||||||
|
},
|
||||||
|
"her": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Person": "Three",
|
||||||
|
"Number": "Sing",
|
||||||
|
"Gender": "Fem",
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Poss": "Yes",
|
||||||
|
},
|
||||||
|
"its": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Person": "Three",
|
||||||
|
"Number": "Sing",
|
||||||
|
"Gender": "Neut",
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Poss": "Yes",
|
||||||
|
},
|
||||||
|
"our": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Person": "One",
|
||||||
|
"Number": "Plur",
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Poss": "Yes",
|
||||||
|
},
|
||||||
|
"their": {
|
||||||
|
LEMMA: PRON_LEMMA,
|
||||||
|
"Person": "Three",
|
||||||
|
"Number": "Plur",
|
||||||
|
"PronType": "Prs",
|
||||||
|
"Poss": "Yes",
|
||||||
|
},
|
||||||
},
|
},
|
||||||
|
|
||||||
"VBZ": {
|
"VBZ": {
|
||||||
"am": {LEMMA: "be", "VerbForm": "Fin", "Person": "One", "Tense": "Pres", "Mood": "Ind"},
|
"am": {
|
||||||
"are": {LEMMA: "be", "VerbForm": "Fin", "Person": "Two", "Tense": "Pres", "Mood": "Ind"},
|
LEMMA: "be",
|
||||||
"is": {LEMMA: "be", "VerbForm": "Fin", "Person": "Three", "Tense": "Pres", "Mood": "Ind"},
|
"VerbForm": "Fin",
|
||||||
"'re": {LEMMA: "be", "VerbForm": "Fin", "Person": "Two", "Tense": "Pres", "Mood": "Ind"},
|
"Person": "One",
|
||||||
"'s": {LEMMA: "be", "VerbForm": "Fin", "Person": "Three", "Tense": "Pres", "Mood": "Ind"},
|
"Tense": "Pres",
|
||||||
|
"Mood": "Ind",
|
||||||
|
},
|
||||||
|
"are": {
|
||||||
|
LEMMA: "be",
|
||||||
|
"VerbForm": "Fin",
|
||||||
|
"Person": "Two",
|
||||||
|
"Tense": "Pres",
|
||||||
|
"Mood": "Ind",
|
||||||
|
},
|
||||||
|
"is": {
|
||||||
|
LEMMA: "be",
|
||||||
|
"VerbForm": "Fin",
|
||||||
|
"Person": "Three",
|
||||||
|
"Tense": "Pres",
|
||||||
|
"Mood": "Ind",
|
||||||
|
},
|
||||||
|
"'re": {
|
||||||
|
LEMMA: "be",
|
||||||
|
"VerbForm": "Fin",
|
||||||
|
"Person": "Two",
|
||||||
|
"Tense": "Pres",
|
||||||
|
"Mood": "Ind",
|
||||||
|
},
|
||||||
|
"'s": {
|
||||||
|
LEMMA: "be",
|
||||||
|
"VerbForm": "Fin",
|
||||||
|
"Person": "Three",
|
||||||
|
"Tense": "Pres",
|
||||||
|
"Mood": "Ind",
|
||||||
|
},
|
||||||
},
|
},
|
||||||
|
|
||||||
"VBP": {
|
"VBP": {
|
||||||
"are": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Pres", "Mood": "Ind"},
|
"are": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Pres", "Mood": "Ind"},
|
||||||
"'re": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Pres", "Mood": "Ind"},
|
"'re": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Pres", "Mood": "Ind"},
|
||||||
"am": {LEMMA: "be", "VerbForm": "Fin", "Person": "One", "Tense": "Pres", "Mood": "Ind"},
|
"am": {
|
||||||
|
LEMMA: "be",
|
||||||
|
"VerbForm": "Fin",
|
||||||
|
"Person": "One",
|
||||||
|
"Tense": "Pres",
|
||||||
|
"Mood": "Ind",
|
||||||
|
},
|
||||||
},
|
},
|
||||||
|
|
||||||
"VBD": {
|
"VBD": {
|
||||||
"was": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Past", "Number": "Sing"},
|
"was": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Past", "Number": "Sing"},
|
||||||
"were": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Past", "Number": "Plur"}
|
"were": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Past", "Number": "Plur"},
|
||||||
}
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -12,7 +12,6 @@ _exc = {
|
||||||
"plz": "please",
|
"plz": "please",
|
||||||
"pls": "please",
|
"pls": "please",
|
||||||
"thx": "thanks",
|
"thx": "thanks",
|
||||||
|
|
||||||
# US vs. UK spelling
|
# US vs. UK spelling
|
||||||
"accessorise": "accessorize",
|
"accessorise": "accessorize",
|
||||||
"accessorised": "accessorized",
|
"accessorised": "accessorized",
|
||||||
|
@ -690,7 +689,7 @@ _exc = {
|
||||||
"globalising": "globalizing",
|
"globalising": "globalizing",
|
||||||
"glueing ": "gluing ",
|
"glueing ": "gluing ",
|
||||||
"goin": "going",
|
"goin": "going",
|
||||||
"goin'":"going",
|
"goin'": "going",
|
||||||
"goitre": "goiter",
|
"goitre": "goiter",
|
||||||
"goitres": "goiters",
|
"goitres": "goiters",
|
||||||
"gonorrhoea": "gonorrhea",
|
"gonorrhoea": "gonorrhea",
|
||||||
|
@ -1758,7 +1757,7 @@ _exc = {
|
||||||
"yoghourt": "yogurt",
|
"yoghourt": "yogurt",
|
||||||
"yoghourts": "yogurts",
|
"yoghourts": "yogurts",
|
||||||
"yoghurt": "yogurt",
|
"yoghurt": "yogurt",
|
||||||
"yoghurts": "yogurts"
|
"yoghurts": "yogurts",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -3,8 +3,8 @@ from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
# Stop words
|
# Stop words
|
||||||
|
STOP_WORDS = set(
|
||||||
STOP_WORDS = set("""
|
"""
|
||||||
a about above across after afterwards again against all almost alone along
|
a about above across after afterwards again against all almost alone along
|
||||||
already also although always am among amongst amount an and another any anyhow
|
already also although always am among amongst amount an and another any anyhow
|
||||||
anyone anything anyway anywhere are around as at
|
anyone anything anyway anywhere are around as at
|
||||||
|
@ -68,4 +68,5 @@ whither who whoever whole whom whose why will with within without would
|
||||||
yet you your yours yourself yourselves
|
yet you your yours yourself yourselves
|
||||||
|
|
||||||
'd 'll 'm 're 's 've
|
'd 'll 'm 're 's 've
|
||||||
""".split())
|
""".split()
|
||||||
|
)
|
||||||
|
|
|
@ -8,12 +8,21 @@ def noun_chunks(obj):
|
||||||
"""
|
"""
|
||||||
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
|
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
|
||||||
"""
|
"""
|
||||||
labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', 'dative', 'appos',
|
labels = [
|
||||||
'attr', 'ROOT']
|
"nsubj",
|
||||||
doc = obj.doc # Ensure works on both Doc and Span.
|
"dobj",
|
||||||
|
"nsubjpass",
|
||||||
|
"pcomp",
|
||||||
|
"pobj",
|
||||||
|
"dative",
|
||||||
|
"appos",
|
||||||
|
"attr",
|
||||||
|
"ROOT",
|
||||||
|
]
|
||||||
|
doc = obj.doc # Ensure works on both Doc and Span.
|
||||||
np_deps = [doc.vocab.strings.add(label) for label in labels]
|
np_deps = [doc.vocab.strings.add(label) for label in labels]
|
||||||
conj = doc.vocab.strings.add('conj')
|
conj = doc.vocab.strings.add("conj")
|
||||||
np_label = doc.vocab.strings.add('NP')
|
np_label = doc.vocab.strings.add("NP")
|
||||||
seen = set()
|
seen = set()
|
||||||
for i, word in enumerate(obj):
|
for i, word in enumerate(obj):
|
||||||
if word.pos not in (NOUN, PROPN, PRON):
|
if word.pos not in (NOUN, PROPN, PRON):
|
||||||
|
@ -24,8 +33,8 @@ def noun_chunks(obj):
|
||||||
if word.dep in np_deps:
|
if word.dep in np_deps:
|
||||||
if any(w.i in seen for w in word.subtree):
|
if any(w.i in seen for w in word.subtree):
|
||||||
continue
|
continue
|
||||||
seen.update(j for j in range(word.left_edge.i, word.i+1))
|
seen.update(j for j in range(word.left_edge.i, word.i + 1))
|
||||||
yield word.left_edge.i, word.i+1, np_label
|
yield word.left_edge.i, word.i + 1, np_label
|
||||||
elif word.dep == conj:
|
elif word.dep == conj:
|
||||||
head = word.head
|
head = word.head
|
||||||
while head.dep == conj and head.head.i < head.i:
|
while head.dep == conj and head.head.i < head.i:
|
||||||
|
@ -34,10 +43,8 @@ def noun_chunks(obj):
|
||||||
if head.dep in np_deps:
|
if head.dep in np_deps:
|
||||||
if any(w.i in seen for w in word.subtree):
|
if any(w.i in seen for w in word.subtree):
|
||||||
continue
|
continue
|
||||||
seen.update(j for j in range(word.left_edge.i, word.i+1))
|
seen.update(j for j in range(word.left_edge.i, word.i + 1))
|
||||||
yield word.left_edge.i, word.i+1, np_label
|
yield word.left_edge.i, word.i + 1, np_label
|
||||||
|
|
||||||
|
|
||||||
SYNTAX_ITERATORS = {
|
SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
|
||||||
'noun_chunks': noun_chunks
|
|
||||||
}
|
|
||||||
|
|
|
@ -6,61 +6,67 @@ from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON
|
||||||
|
|
||||||
|
|
||||||
TAG_MAP = {
|
TAG_MAP = {
|
||||||
".": {POS: PUNCT, "PunctType": "peri"},
|
".": {POS: PUNCT, "PunctType": "peri"},
|
||||||
",": {POS: PUNCT, "PunctType": "comm"},
|
",": {POS: PUNCT, "PunctType": "comm"},
|
||||||
"-LRB-": {POS: PUNCT, "PunctType": "brck", "PunctSide": "ini"},
|
"-LRB-": {POS: PUNCT, "PunctType": "brck", "PunctSide": "ini"},
|
||||||
"-RRB-": {POS: PUNCT, "PunctType": "brck", "PunctSide": "fin"},
|
"-RRB-": {POS: PUNCT, "PunctType": "brck", "PunctSide": "fin"},
|
||||||
"``": {POS: PUNCT, "PunctType": "quot", "PunctSide": "ini"},
|
"``": {POS: PUNCT, "PunctType": "quot", "PunctSide": "ini"},
|
||||||
"\"\"": {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"},
|
'""': {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"},
|
||||||
"''": {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"},
|
"''": {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"},
|
||||||
":": {POS: PUNCT},
|
":": {POS: PUNCT},
|
||||||
"$": {POS: SYM, "Other": {"SymType": "currency"}},
|
"$": {POS: SYM, "Other": {"SymType": "currency"}},
|
||||||
"#": {POS: SYM, "Other": {"SymType": "numbersign"}},
|
"#": {POS: SYM, "Other": {"SymType": "numbersign"}},
|
||||||
"AFX": {POS: ADJ, "Hyph": "yes"},
|
"AFX": {POS: ADJ, "Hyph": "yes"},
|
||||||
"CC": {POS: CCONJ, "ConjType": "coor"},
|
"CC": {POS: CCONJ, "ConjType": "coor"},
|
||||||
"CD": {POS: NUM, "NumType": "card"},
|
"CD": {POS: NUM, "NumType": "card"},
|
||||||
"DT": {POS: DET},
|
"DT": {POS: DET},
|
||||||
"EX": {POS: ADV, "AdvType": "ex"},
|
"EX": {POS: ADV, "AdvType": "ex"},
|
||||||
"FW": {POS: X, "Foreign": "yes"},
|
"FW": {POS: X, "Foreign": "yes"},
|
||||||
"HYPH": {POS: PUNCT, "PunctType": "dash"},
|
"HYPH": {POS: PUNCT, "PunctType": "dash"},
|
||||||
"IN": {POS: ADP},
|
"IN": {POS: ADP},
|
||||||
"JJ": {POS: ADJ, "Degree": "pos"},
|
"JJ": {POS: ADJ, "Degree": "pos"},
|
||||||
"JJR": {POS: ADJ, "Degree": "comp"},
|
"JJR": {POS: ADJ, "Degree": "comp"},
|
||||||
"JJS": {POS: ADJ, "Degree": "sup"},
|
"JJS": {POS: ADJ, "Degree": "sup"},
|
||||||
"LS": {POS: PUNCT, "NumType": "ord"},
|
"LS": {POS: PUNCT, "NumType": "ord"},
|
||||||
"MD": {POS: VERB, "VerbType": "mod"},
|
"MD": {POS: VERB, "VerbType": "mod"},
|
||||||
"NIL": {POS: ""},
|
"NIL": {POS: ""},
|
||||||
"NN": {POS: NOUN, "Number": "sing"},
|
"NN": {POS: NOUN, "Number": "sing"},
|
||||||
"NNP": {POS: PROPN, "NounType": "prop", "Number": "sing"},
|
"NNP": {POS: PROPN, "NounType": "prop", "Number": "sing"},
|
||||||
"NNPS": {POS: PROPN, "NounType": "prop", "Number": "plur"},
|
"NNPS": {POS: PROPN, "NounType": "prop", "Number": "plur"},
|
||||||
"NNS": {POS: NOUN, "Number": "plur"},
|
"NNS": {POS: NOUN, "Number": "plur"},
|
||||||
"PDT": {POS: ADJ, "AdjType": "pdt", "PronType": "prn"},
|
"PDT": {POS: ADJ, "AdjType": "pdt", "PronType": "prn"},
|
||||||
"POS": {POS: PART, "Poss": "yes"},
|
"POS": {POS: PART, "Poss": "yes"},
|
||||||
"PRP": {POS: PRON, "PronType": "prs"},
|
"PRP": {POS: PRON, "PronType": "prs"},
|
||||||
"PRP$": {POS: ADJ, "PronType": "prs", "Poss": "yes"},
|
"PRP$": {POS: ADJ, "PronType": "prs", "Poss": "yes"},
|
||||||
"RB": {POS: ADV, "Degree": "pos"},
|
"RB": {POS: ADV, "Degree": "pos"},
|
||||||
"RBR": {POS: ADV, "Degree": "comp"},
|
"RBR": {POS: ADV, "Degree": "comp"},
|
||||||
"RBS": {POS: ADV, "Degree": "sup"},
|
"RBS": {POS: ADV, "Degree": "sup"},
|
||||||
"RP": {POS: PART},
|
"RP": {POS: PART},
|
||||||
"SP": {POS: SPACE},
|
"SP": {POS: SPACE},
|
||||||
"SYM": {POS: SYM},
|
"SYM": {POS: SYM},
|
||||||
"TO": {POS: PART, "PartType": "inf", "VerbForm": "inf"},
|
"TO": {POS: PART, "PartType": "inf", "VerbForm": "inf"},
|
||||||
"UH": {POS: INTJ},
|
"UH": {POS: INTJ},
|
||||||
"VB": {POS: VERB, "VerbForm": "inf"},
|
"VB": {POS: VERB, "VerbForm": "inf"},
|
||||||
"VBD": {POS: VERB, "VerbForm": "fin", "Tense": "past"},
|
"VBD": {POS: VERB, "VerbForm": "fin", "Tense": "past"},
|
||||||
"VBG": {POS: VERB, "VerbForm": "part", "Tense": "pres", "Aspect": "prog"},
|
"VBG": {POS: VERB, "VerbForm": "part", "Tense": "pres", "Aspect": "prog"},
|
||||||
"VBN": {POS: VERB, "VerbForm": "part", "Tense": "past", "Aspect": "perf"},
|
"VBN": {POS: VERB, "VerbForm": "part", "Tense": "past", "Aspect": "perf"},
|
||||||
"VBP": {POS: VERB, "VerbForm": "fin", "Tense": "pres"},
|
"VBP": {POS: VERB, "VerbForm": "fin", "Tense": "pres"},
|
||||||
"VBZ": {POS: VERB, "VerbForm": "fin", "Tense": "pres", "Number": "sing", "Person": 3},
|
"VBZ": {
|
||||||
"WDT": {POS: ADJ, "PronType": "int|rel"},
|
POS: VERB,
|
||||||
"WP": {POS: NOUN, "PronType": "int|rel"},
|
"VerbForm": "fin",
|
||||||
"WP$": {POS: ADJ, "Poss": "yes", "PronType": "int|rel"},
|
"Tense": "pres",
|
||||||
"WRB": {POS: ADV, "PronType": "int|rel"},
|
"Number": "sing",
|
||||||
"ADD": {POS: X},
|
"Person": 3,
|
||||||
"NFP": {POS: PUNCT},
|
},
|
||||||
"GW": {POS: X},
|
"WDT": {POS: ADJ, "PronType": "int|rel"},
|
||||||
"XX": {POS: X},
|
"WP": {POS: NOUN, "PronType": "int|rel"},
|
||||||
"BES": {POS: VERB},
|
"WP$": {POS: ADJ, "Poss": "yes", "PronType": "int|rel"},
|
||||||
"HVS": {POS: VERB},
|
"WRB": {POS: ADV, "PronType": "int|rel"},
|
||||||
"_SP": {POS: SPACE},
|
"ADD": {POS: X},
|
||||||
|
"NFP": {POS: PUNCT},
|
||||||
|
"GW": {POS: X},
|
||||||
|
"XX": {POS: X},
|
||||||
|
"BES": {POS: VERB},
|
||||||
|
"HVS": {POS: VERB},
|
||||||
|
"_SP": {POS: SPACE},
|
||||||
}
|
}
|
||||||
|
|
|
@ -5,103 +5,143 @@ from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA
|
||||||
|
|
||||||
|
|
||||||
_exc = {}
|
_exc = {}
|
||||||
_exclude = ["Ill", "ill", "Its", "its", "Hell", "hell", "Shell", "shell",
|
_exclude = [
|
||||||
"Shed", "shed", "were", "Were", "Well", "well", "Whore", "whore"]
|
"Ill",
|
||||||
|
"ill",
|
||||||
|
"Its",
|
||||||
|
"its",
|
||||||
|
"Hell",
|
||||||
|
"hell",
|
||||||
|
"Shell",
|
||||||
|
"shell",
|
||||||
|
"Shed",
|
||||||
|
"shed",
|
||||||
|
"were",
|
||||||
|
"Were",
|
||||||
|
"Well",
|
||||||
|
"well",
|
||||||
|
"Whore",
|
||||||
|
"whore",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
# Pronouns
|
# Pronouns
|
||||||
|
|
||||||
for pron in ["i"]:
|
for pron in ["i"]:
|
||||||
for orth in [pron, pron.title()]:
|
for orth in [pron, pron.title()]:
|
||||||
_exc[orth + "'m"] = [
|
_exc[orth + "'m"] = [
|
||||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||||
{ORTH: "'m", LEMMA: "be", NORM: "am", TAG: "VBP", "tenspect": 1, "number": 1}]
|
{
|
||||||
|
ORTH: "'m",
|
||||||
|
LEMMA: "be",
|
||||||
|
NORM: "am",
|
||||||
|
TAG: "VBP",
|
||||||
|
"tenspect": 1,
|
||||||
|
"number": 1,
|
||||||
|
},
|
||||||
|
]
|
||||||
|
|
||||||
_exc[orth + "m"] = [
|
_exc[orth + "m"] = [
|
||||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||||
{ORTH: "m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1 }]
|
{ORTH: "m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1},
|
||||||
|
]
|
||||||
|
|
||||||
_exc[orth + "'ma"] = [
|
_exc[orth + "'ma"] = [
|
||||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||||
{ORTH: "'m", LEMMA: "be", NORM: "am"},
|
{ORTH: "'m", LEMMA: "be", NORM: "am"},
|
||||||
{ORTH: "a", LEMMA: "going to", NORM: "gonna"}]
|
{ORTH: "a", LEMMA: "going to", NORM: "gonna"},
|
||||||
|
]
|
||||||
|
|
||||||
_exc[orth + "ma"] = [
|
_exc[orth + "ma"] = [
|
||||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||||
{ORTH: "m", LEMMA: "be", NORM: "am"},
|
{ORTH: "m", LEMMA: "be", NORM: "am"},
|
||||||
{ORTH: "a", LEMMA: "going to", NORM: "gonna"}]
|
{ORTH: "a", LEMMA: "going to", NORM: "gonna"},
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
for pron in ["i", "you", "he", "she", "it", "we", "they"]:
|
for pron in ["i", "you", "he", "she", "it", "we", "they"]:
|
||||||
for orth in [pron, pron.title()]:
|
for orth in [pron, pron.title()]:
|
||||||
_exc[orth + "'ll"] = [
|
_exc[orth + "'ll"] = [
|
||||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||||
{ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"}]
|
{ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"},
|
||||||
|
]
|
||||||
|
|
||||||
_exc[orth + "ll"] = [
|
_exc[orth + "ll"] = [
|
||||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||||
{ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"}]
|
{ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"},
|
||||||
|
]
|
||||||
|
|
||||||
_exc[orth + "'ll've"] = [
|
_exc[orth + "'ll've"] = [
|
||||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||||
{ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"},
|
{ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"},
|
||||||
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
|
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"},
|
||||||
|
]
|
||||||
|
|
||||||
_exc[orth + "llve"] = [
|
_exc[orth + "llve"] = [
|
||||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||||
{ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"},
|
{ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"},
|
||||||
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
|
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"},
|
||||||
|
]
|
||||||
|
|
||||||
_exc[orth + "'d"] = [
|
_exc[orth + "'d"] = [
|
||||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||||
{ORTH: "'d", LEMMA: "would", NORM: "would", TAG: "MD"}]
|
{ORTH: "'d", LEMMA: "would", NORM: "would", TAG: "MD"},
|
||||||
|
]
|
||||||
|
|
||||||
_exc[orth + "d"] = [
|
_exc[orth + "d"] = [
|
||||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||||
{ORTH: "d", LEMMA: "would", NORM: "would", TAG: "MD"}]
|
{ORTH: "d", LEMMA: "would", NORM: "would", TAG: "MD"},
|
||||||
|
]
|
||||||
|
|
||||||
_exc[orth + "'d've"] = [
|
_exc[orth + "'d've"] = [
|
||||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||||
{ORTH: "'d", LEMMA: "would", NORM: "would", TAG: "MD"},
|
{ORTH: "'d", LEMMA: "would", NORM: "would", TAG: "MD"},
|
||||||
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
|
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"},
|
||||||
|
]
|
||||||
|
|
||||||
_exc[orth + "dve"] = [
|
_exc[orth + "dve"] = [
|
||||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||||
{ORTH: "d", LEMMA: "would", NORM: "would", TAG: "MD"},
|
{ORTH: "d", LEMMA: "would", NORM: "would", TAG: "MD"},
|
||||||
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
|
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"},
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
for pron in ["i", "you", "we", "they"]:
|
for pron in ["i", "you", "we", "they"]:
|
||||||
for orth in [pron, pron.title()]:
|
for orth in [pron, pron.title()]:
|
||||||
_exc[orth + "'ve"] = [
|
_exc[orth + "'ve"] = [
|
||||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||||
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
|
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"},
|
||||||
|
]
|
||||||
|
|
||||||
_exc[orth + "ve"] = [
|
_exc[orth + "ve"] = [
|
||||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||||
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
|
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"},
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
for pron in ["you", "we", "they"]:
|
for pron in ["you", "we", "they"]:
|
||||||
for orth in [pron, pron.title()]:
|
for orth in [pron, pron.title()]:
|
||||||
_exc[orth + "'re"] = [
|
_exc[orth + "'re"] = [
|
||||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||||
{ORTH: "'re", LEMMA: "be", NORM: "are"}]
|
{ORTH: "'re", LEMMA: "be", NORM: "are"},
|
||||||
|
]
|
||||||
|
|
||||||
_exc[orth + "re"] = [
|
_exc[orth + "re"] = [
|
||||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||||
{ORTH: "re", LEMMA: "be", NORM: "are", TAG: "VBZ"}]
|
{ORTH: "re", LEMMA: "be", NORM: "are", TAG: "VBZ"},
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
for pron in ["he", "she", "it"]:
|
for pron in ["he", "she", "it"]:
|
||||||
for orth in [pron, pron.title()]:
|
for orth in [pron, pron.title()]:
|
||||||
_exc[orth + "'s"] = [
|
_exc[orth + "'s"] = [
|
||||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||||
{ORTH: "'s", NORM: "'s"}]
|
{ORTH: "'s", NORM: "'s"},
|
||||||
|
]
|
||||||
|
|
||||||
_exc[orth + "s"] = [
|
_exc[orth + "s"] = [
|
||||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||||
{ORTH: "s"}]
|
{ORTH: "s"},
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
# W-words, relative pronouns, prepositions etc.
|
# W-words, relative pronouns, prepositions etc.
|
||||||
|
@ -110,63 +150,71 @@ for word in ["who", "what", "when", "where", "why", "how", "there", "that"]:
|
||||||
for orth in [word, word.title()]:
|
for orth in [word, word.title()]:
|
||||||
_exc[orth + "'s"] = [
|
_exc[orth + "'s"] = [
|
||||||
{ORTH: orth, LEMMA: word, NORM: word},
|
{ORTH: orth, LEMMA: word, NORM: word},
|
||||||
{ORTH: "'s", NORM: "'s"}]
|
{ORTH: "'s", NORM: "'s"},
|
||||||
|
]
|
||||||
|
|
||||||
_exc[orth + "s"] = [
|
_exc[orth + "s"] = [{ORTH: orth, LEMMA: word, NORM: word}, {ORTH: "s"}]
|
||||||
{ORTH: orth, LEMMA: word, NORM: word},
|
|
||||||
{ORTH: "s"}]
|
|
||||||
|
|
||||||
_exc[orth + "'ll"] = [
|
_exc[orth + "'ll"] = [
|
||||||
{ORTH: orth, LEMMA: word, NORM: word},
|
{ORTH: orth, LEMMA: word, NORM: word},
|
||||||
{ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"}]
|
{ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"},
|
||||||
|
]
|
||||||
|
|
||||||
_exc[orth + "ll"] = [
|
_exc[orth + "ll"] = [
|
||||||
{ORTH: orth, LEMMA: word, NORM: word},
|
{ORTH: orth, LEMMA: word, NORM: word},
|
||||||
{ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"}]
|
{ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"},
|
||||||
|
]
|
||||||
|
|
||||||
_exc[orth + "'ll've"] = [
|
_exc[orth + "'ll've"] = [
|
||||||
{ORTH: orth, LEMMA: word, NORM: word},
|
{ORTH: orth, LEMMA: word, NORM: word},
|
||||||
{ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"},
|
{ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"},
|
||||||
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
|
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"},
|
||||||
|
]
|
||||||
|
|
||||||
_exc[orth + "llve"] = [
|
_exc[orth + "llve"] = [
|
||||||
{ORTH: orth, LEMMA: word, NORM: word},
|
{ORTH: orth, LEMMA: word, NORM: word},
|
||||||
{ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"},
|
{ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"},
|
||||||
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
|
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"},
|
||||||
|
]
|
||||||
|
|
||||||
_exc[orth + "'re"] = [
|
_exc[orth + "'re"] = [
|
||||||
{ORTH: orth, LEMMA: word, NORM: word},
|
{ORTH: orth, LEMMA: word, NORM: word},
|
||||||
{ORTH: "'re", LEMMA: "be", NORM: "are"}]
|
{ORTH: "'re", LEMMA: "be", NORM: "are"},
|
||||||
|
]
|
||||||
|
|
||||||
_exc[orth + "re"] = [
|
_exc[orth + "re"] = [
|
||||||
{ORTH: orth, LEMMA: word, NORM: word},
|
{ORTH: orth, LEMMA: word, NORM: word},
|
||||||
{ORTH: "re", LEMMA: "be", NORM: "are"}]
|
{ORTH: "re", LEMMA: "be", NORM: "are"},
|
||||||
|
]
|
||||||
|
|
||||||
_exc[orth + "'ve"] = [
|
_exc[orth + "'ve"] = [
|
||||||
{ORTH: orth, LEMMA: word, NORM: word},
|
{ORTH: orth, LEMMA: word, NORM: word},
|
||||||
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
|
{ORTH: "'ve", LEMMA: "have", TAG: "VB"},
|
||||||
|
]
|
||||||
|
|
||||||
_exc[orth + "ve"] = [
|
_exc[orth + "ve"] = [
|
||||||
{ORTH: orth, LEMMA: word},
|
{ORTH: orth, LEMMA: word},
|
||||||
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
|
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"},
|
||||||
|
]
|
||||||
|
|
||||||
_exc[orth + "'d"] = [
|
_exc[orth + "'d"] = [
|
||||||
{ORTH: orth, LEMMA: word, NORM: word},
|
{ORTH: orth, LEMMA: word, NORM: word},
|
||||||
{ORTH: "'d", NORM: "'d"}]
|
{ORTH: "'d", NORM: "'d"},
|
||||||
|
]
|
||||||
|
|
||||||
_exc[orth + "d"] = [
|
_exc[orth + "d"] = [{ORTH: orth, LEMMA: word, NORM: word}, {ORTH: "d"}]
|
||||||
{ORTH: orth, LEMMA: word, NORM: word},
|
|
||||||
{ORTH: "d"}]
|
|
||||||
|
|
||||||
_exc[orth + "'d've"] = [
|
_exc[orth + "'d've"] = [
|
||||||
{ORTH: orth, LEMMA: word, NORM: word},
|
{ORTH: orth, LEMMA: word, NORM: word},
|
||||||
{ORTH: "'d", LEMMA: "would", NORM: "would", TAG: "MD"},
|
{ORTH: "'d", LEMMA: "would", NORM: "would", TAG: "MD"},
|
||||||
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
|
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"},
|
||||||
|
]
|
||||||
|
|
||||||
_exc[orth + "dve"] = [
|
_exc[orth + "dve"] = [
|
||||||
{ORTH: orth, LEMMA: word, NORM: word},
|
{ORTH: orth, LEMMA: word, NORM: word},
|
||||||
{ORTH: "d", LEMMA: "would", NORM: "would", TAG: "MD"},
|
{ORTH: "d", LEMMA: "would", NORM: "would", TAG: "MD"},
|
||||||
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
|
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"},
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
# Verbs
|
# Verbs
|
||||||
|
@ -186,27 +234,32 @@ for verb_data in [
|
||||||
{ORTH: "sha", LEMMA: "shall", NORM: "shall", TAG: "MD"},
|
{ORTH: "sha", LEMMA: "shall", NORM: "shall", TAG: "MD"},
|
||||||
{ORTH: "should", NORM: "should", TAG: "MD"},
|
{ORTH: "should", NORM: "should", TAG: "MD"},
|
||||||
{ORTH: "wo", LEMMA: "will", NORM: "will", TAG: "MD"},
|
{ORTH: "wo", LEMMA: "will", NORM: "will", TAG: "MD"},
|
||||||
{ORTH: "would", NORM: "would", TAG: "MD"}]:
|
{ORTH: "would", NORM: "would", TAG: "MD"},
|
||||||
|
]:
|
||||||
verb_data_tc = dict(verb_data)
|
verb_data_tc = dict(verb_data)
|
||||||
verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
|
verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
|
||||||
for data in [verb_data, verb_data_tc]:
|
for data in [verb_data, verb_data_tc]:
|
||||||
_exc[data[ORTH] + "n't"] = [
|
_exc[data[ORTH] + "n't"] = [
|
||||||
dict(data),
|
dict(data),
|
||||||
{ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"}]
|
{ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"},
|
||||||
|
]
|
||||||
|
|
||||||
_exc[data[ORTH] + "nt"] = [
|
_exc[data[ORTH] + "nt"] = [
|
||||||
dict(data),
|
dict(data),
|
||||||
{ORTH: "nt", LEMMA: "not", NORM: "not", TAG: "RB"}]
|
{ORTH: "nt", LEMMA: "not", NORM: "not", TAG: "RB"},
|
||||||
|
]
|
||||||
|
|
||||||
_exc[data[ORTH] + "n't've"] = [
|
_exc[data[ORTH] + "n't've"] = [
|
||||||
dict(data),
|
dict(data),
|
||||||
{ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"},
|
{ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"},
|
||||||
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
|
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"},
|
||||||
|
]
|
||||||
|
|
||||||
_exc[data[ORTH] + "ntve"] = [
|
_exc[data[ORTH] + "ntve"] = [
|
||||||
dict(data),
|
dict(data),
|
||||||
{ORTH: "nt", LEMMA: "not", NORM: "not", TAG: "RB"},
|
{ORTH: "nt", LEMMA: "not", NORM: "not", TAG: "RB"},
|
||||||
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
|
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"},
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
for verb_data in [
|
for verb_data in [
|
||||||
|
@ -214,17 +267,14 @@ for verb_data in [
|
||||||
{ORTH: "might", NORM: "might", TAG: "MD"},
|
{ORTH: "might", NORM: "might", TAG: "MD"},
|
||||||
{ORTH: "must", NORM: "must", TAG: "MD"},
|
{ORTH: "must", NORM: "must", TAG: "MD"},
|
||||||
{ORTH: "should", NORM: "should", TAG: "MD"},
|
{ORTH: "should", NORM: "should", TAG: "MD"},
|
||||||
{ORTH: "would", NORM: "would", TAG: "MD"}]:
|
{ORTH: "would", NORM: "would", TAG: "MD"},
|
||||||
|
]:
|
||||||
verb_data_tc = dict(verb_data)
|
verb_data_tc = dict(verb_data)
|
||||||
verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
|
verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
|
||||||
for data in [verb_data, verb_data_tc]:
|
for data in [verb_data, verb_data_tc]:
|
||||||
_exc[data[ORTH] + "'ve"] = [
|
_exc[data[ORTH] + "'ve"] = [dict(data), {ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
|
||||||
dict(data),
|
|
||||||
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
|
|
||||||
|
|
||||||
_exc[data[ORTH] + "ve"] = [
|
_exc[data[ORTH] + "ve"] = [dict(data), {ORTH: "ve", LEMMA: "have", TAG: "VB"}]
|
||||||
dict(data),
|
|
||||||
{ORTH: "ve", LEMMA: "have", TAG: "VB"}]
|
|
||||||
|
|
||||||
|
|
||||||
for verb_data in [
|
for verb_data in [
|
||||||
|
@ -235,17 +285,20 @@ for verb_data in [
|
||||||
{ORTH: "were", LEMMA: "be", NORM: "were"},
|
{ORTH: "were", LEMMA: "be", NORM: "were"},
|
||||||
{ORTH: "have", NORM: "have"},
|
{ORTH: "have", NORM: "have"},
|
||||||
{ORTH: "has", LEMMA: "have", NORM: "has"},
|
{ORTH: "has", LEMMA: "have", NORM: "has"},
|
||||||
{ORTH: "dare", NORM: "dare"}]:
|
{ORTH: "dare", NORM: "dare"},
|
||||||
|
]:
|
||||||
verb_data_tc = dict(verb_data)
|
verb_data_tc = dict(verb_data)
|
||||||
verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
|
verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
|
||||||
for data in [verb_data, verb_data_tc]:
|
for data in [verb_data, verb_data_tc]:
|
||||||
_exc[data[ORTH] + "n't"] = [
|
_exc[data[ORTH] + "n't"] = [
|
||||||
dict(data),
|
dict(data),
|
||||||
{ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"}]
|
{ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"},
|
||||||
|
]
|
||||||
|
|
||||||
_exc[data[ORTH] + "nt"] = [
|
_exc[data[ORTH] + "nt"] = [
|
||||||
dict(data),
|
dict(data),
|
||||||
{ORTH: "nt", LEMMA: "not", NORM: "not", TAG: "RB"}]
|
{ORTH: "nt", LEMMA: "not", NORM: "not", TAG: "RB"},
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
# Other contractions with trailing apostrophe
|
# Other contractions with trailing apostrophe
|
||||||
|
@ -256,7 +309,8 @@ for exc_data in [
|
||||||
{ORTH: "nothin", LEMMA: "nothing", NORM: "nothing"},
|
{ORTH: "nothin", LEMMA: "nothing", NORM: "nothing"},
|
||||||
{ORTH: "nuthin", LEMMA: "nothing", NORM: "nothing"},
|
{ORTH: "nuthin", LEMMA: "nothing", NORM: "nothing"},
|
||||||
{ORTH: "ol", LEMMA: "old", NORM: "old"},
|
{ORTH: "ol", LEMMA: "old", NORM: "old"},
|
||||||
{ORTH: "somethin", LEMMA: "something", NORM: "something"}]:
|
{ORTH: "somethin", LEMMA: "something", NORM: "something"},
|
||||||
|
]:
|
||||||
exc_data_tc = dict(exc_data)
|
exc_data_tc = dict(exc_data)
|
||||||
exc_data_tc[ORTH] = exc_data_tc[ORTH].title()
|
exc_data_tc[ORTH] = exc_data_tc[ORTH].title()
|
||||||
for data in [exc_data, exc_data_tc]:
|
for data in [exc_data, exc_data_tc]:
|
||||||
|
@ -272,7 +326,8 @@ for exc_data in [
|
||||||
{ORTH: "cause", LEMMA: "because", NORM: "because"},
|
{ORTH: "cause", LEMMA: "because", NORM: "because"},
|
||||||
{ORTH: "em", LEMMA: PRON_LEMMA, NORM: "them"},
|
{ORTH: "em", LEMMA: PRON_LEMMA, NORM: "them"},
|
||||||
{ORTH: "ll", LEMMA: "will", NORM: "will"},
|
{ORTH: "ll", LEMMA: "will", NORM: "will"},
|
||||||
{ORTH: "nuff", LEMMA: "enough", NORM: "enough"}]:
|
{ORTH: "nuff", LEMMA: "enough", NORM: "enough"},
|
||||||
|
]:
|
||||||
exc_data_apos = dict(exc_data)
|
exc_data_apos = dict(exc_data)
|
||||||
exc_data_apos[ORTH] = "'" + exc_data_apos[ORTH]
|
exc_data_apos[ORTH] = "'" + exc_data_apos[ORTH]
|
||||||
for data in [exc_data, exc_data_apos]:
|
for data in [exc_data, exc_data_apos]:
|
||||||
|
@ -285,81 +340,69 @@ for h in range(1, 12 + 1):
|
||||||
for period in ["a.m.", "am"]:
|
for period in ["a.m.", "am"]:
|
||||||
_exc["%d%s" % (h, period)] = [
|
_exc["%d%s" % (h, period)] = [
|
||||||
{ORTH: "%d" % h},
|
{ORTH: "%d" % h},
|
||||||
{ORTH: period, LEMMA: "a.m.", NORM: "a.m."}]
|
{ORTH: period, LEMMA: "a.m.", NORM: "a.m."},
|
||||||
|
]
|
||||||
for period in ["p.m.", "pm"]:
|
for period in ["p.m.", "pm"]:
|
||||||
_exc["%d%s" % (h, period)] = [
|
_exc["%d%s" % (h, period)] = [
|
||||||
{ORTH: "%d" % h},
|
{ORTH: "%d" % h},
|
||||||
{ORTH: period, LEMMA: "p.m.", NORM: "p.m."}]
|
{ORTH: period, LEMMA: "p.m.", NORM: "p.m."},
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
# Rest
|
# Rest
|
||||||
|
|
||||||
_other_exc = {
|
_other_exc = {
|
||||||
"y'all": [
|
"y'all": [{ORTH: "y'", LEMMA: PRON_LEMMA, NORM: "you"}, {ORTH: "all"}],
|
||||||
{ORTH: "y'", LEMMA: PRON_LEMMA, NORM: "you"},
|
"yall": [{ORTH: "y", LEMMA: PRON_LEMMA, NORM: "you"}, {ORTH: "all"}],
|
||||||
{ORTH: "all"}],
|
|
||||||
|
|
||||||
"yall": [
|
|
||||||
{ORTH: "y", LEMMA: PRON_LEMMA, NORM: "you"},
|
|
||||||
{ORTH: "all"}],
|
|
||||||
|
|
||||||
"how'd'y": [
|
"how'd'y": [
|
||||||
{ORTH: "how", LEMMA: "how"},
|
{ORTH: "how", LEMMA: "how"},
|
||||||
{ORTH: "'d", LEMMA: "do"},
|
{ORTH: "'d", LEMMA: "do"},
|
||||||
{ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"}],
|
{ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"},
|
||||||
|
],
|
||||||
"How'd'y": [
|
"How'd'y": [
|
||||||
{ORTH: "How", LEMMA: "how", NORM: "how"},
|
{ORTH: "How", LEMMA: "how", NORM: "how"},
|
||||||
{ORTH: "'d", LEMMA: "do"},
|
{ORTH: "'d", LEMMA: "do"},
|
||||||
{ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"}],
|
{ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"},
|
||||||
|
],
|
||||||
"not've": [
|
"not've": [
|
||||||
{ORTH: "not", LEMMA: "not", TAG: "RB"},
|
{ORTH: "not", LEMMA: "not", TAG: "RB"},
|
||||||
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}],
|
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"},
|
||||||
|
],
|
||||||
"notve": [
|
"notve": [
|
||||||
{ORTH: "not", LEMMA: "not", TAG: "RB"},
|
{ORTH: "not", LEMMA: "not", TAG: "RB"},
|
||||||
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}],
|
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"},
|
||||||
|
],
|
||||||
"Not've": [
|
"Not've": [
|
||||||
{ORTH: "Not", LEMMA: "not", NORM: "not", TAG: "RB"},
|
{ORTH: "Not", LEMMA: "not", NORM: "not", TAG: "RB"},
|
||||||
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}],
|
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"},
|
||||||
|
],
|
||||||
"Notve": [
|
"Notve": [
|
||||||
{ORTH: "Not", LEMMA: "not", NORM: "not", TAG: "RB"},
|
{ORTH: "Not", LEMMA: "not", NORM: "not", TAG: "RB"},
|
||||||
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}],
|
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"},
|
||||||
|
],
|
||||||
"cannot": [
|
"cannot": [
|
||||||
{ORTH: "can", LEMMA: "can", TAG: "MD"},
|
{ORTH: "can", LEMMA: "can", TAG: "MD"},
|
||||||
{ORTH: "not", LEMMA: "not", TAG: "RB"}],
|
{ORTH: "not", LEMMA: "not", TAG: "RB"},
|
||||||
|
],
|
||||||
"Cannot": [
|
"Cannot": [
|
||||||
{ORTH: "Can", LEMMA: "can", NORM: "can", TAG: "MD"},
|
{ORTH: "Can", LEMMA: "can", NORM: "can", TAG: "MD"},
|
||||||
{ORTH: "not", LEMMA: "not", TAG: "RB"}],
|
{ORTH: "not", LEMMA: "not", TAG: "RB"},
|
||||||
|
],
|
||||||
"gonna": [
|
"gonna": [
|
||||||
{ORTH: "gon", LEMMA: "go", NORM: "going"},
|
{ORTH: "gon", LEMMA: "go", NORM: "going"},
|
||||||
{ORTH: "na", LEMMA: "to", NORM: "to"}],
|
{ORTH: "na", LEMMA: "to", NORM: "to"},
|
||||||
|
],
|
||||||
"Gonna": [
|
"Gonna": [
|
||||||
{ORTH: "Gon", LEMMA: "go", NORM: "going"},
|
{ORTH: "Gon", LEMMA: "go", NORM: "going"},
|
||||||
{ORTH: "na", LEMMA: "to", NORM: "to"}],
|
{ORTH: "na", LEMMA: "to", NORM: "to"},
|
||||||
|
],
|
||||||
"gotta": [
|
"gotta": [{ORTH: "got"}, {ORTH: "ta", LEMMA: "to", NORM: "to"}],
|
||||||
{ORTH: "got"},
|
"Gotta": [{ORTH: "Got", NORM: "got"}, {ORTH: "ta", LEMMA: "to", NORM: "to"}],
|
||||||
{ORTH: "ta", LEMMA: "to", NORM: "to"}],
|
"let's": [{ORTH: "let"}, {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}],
|
||||||
|
|
||||||
"Gotta": [
|
|
||||||
{ORTH: "Got", NORM: "got"},
|
|
||||||
{ORTH: "ta", LEMMA: "to", NORM: "to"}],
|
|
||||||
|
|
||||||
"let's": [
|
|
||||||
{ORTH: "let"},
|
|
||||||
{ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}],
|
|
||||||
|
|
||||||
"Let's": [
|
"Let's": [
|
||||||
{ORTH: "Let", LEMMA: "let", NORM: "let"},
|
{ORTH: "Let", LEMMA: "let", NORM: "let"},
|
||||||
{ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}]
|
{ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"},
|
||||||
|
],
|
||||||
}
|
}
|
||||||
|
|
||||||
_exc.update(_other_exc)
|
_exc.update(_other_exc)
|
||||||
|
@ -402,8 +445,6 @@ for exc_data in [
|
||||||
{ORTH: "Goin'", LEMMA: "go", NORM: "going"},
|
{ORTH: "Goin'", LEMMA: "go", NORM: "going"},
|
||||||
{ORTH: "goin", LEMMA: "go", NORM: "going"},
|
{ORTH: "goin", LEMMA: "go", NORM: "going"},
|
||||||
{ORTH: "Goin", LEMMA: "go", NORM: "going"},
|
{ORTH: "Goin", LEMMA: "go", NORM: "going"},
|
||||||
|
|
||||||
|
|
||||||
{ORTH: "Mt.", LEMMA: "Mount", NORM: "Mount"},
|
{ORTH: "Mt.", LEMMA: "Mount", NORM: "Mount"},
|
||||||
{ORTH: "Ak.", LEMMA: "Alaska", NORM: "Alaska"},
|
{ORTH: "Ak.", LEMMA: "Alaska", NORM: "Alaska"},
|
||||||
{ORTH: "Ala.", LEMMA: "Alabama", NORM: "Alabama"},
|
{ORTH: "Ala.", LEMMA: "Alabama", NORM: "Alabama"},
|
||||||
|
@ -456,15 +497,47 @@ for exc_data in [
|
||||||
{ORTH: "Tenn.", LEMMA: "Tennessee", NORM: "Tennessee"},
|
{ORTH: "Tenn.", LEMMA: "Tennessee", NORM: "Tennessee"},
|
||||||
{ORTH: "Va.", LEMMA: "Virginia", NORM: "Virginia"},
|
{ORTH: "Va.", LEMMA: "Virginia", NORM: "Virginia"},
|
||||||
{ORTH: "Wash.", LEMMA: "Washington", NORM: "Washington"},
|
{ORTH: "Wash.", LEMMA: "Washington", NORM: "Washington"},
|
||||||
{ORTH: "Wis.", LEMMA: "Wisconsin", NORM: "Wisconsin"}]:
|
{ORTH: "Wis.", LEMMA: "Wisconsin", NORM: "Wisconsin"},
|
||||||
|
]:
|
||||||
_exc[exc_data[ORTH]] = [exc_data]
|
_exc[exc_data[ORTH]] = [exc_data]
|
||||||
|
|
||||||
|
|
||||||
for orth in [
|
for orth in [
|
||||||
"'d", "a.m.", "Adm.", "Bros.", "co.", "Co.", "Corp.", "D.C.", "Dr.", "e.g.",
|
"'d",
|
||||||
"E.g.", "E.G.", "Gen.", "Gov.", "i.e.", "I.e.", "I.E.", "Inc.", "Jr.",
|
"a.m.",
|
||||||
"Ltd.", "Md.", "Messrs.", "Mo.", "Mont.", "Mr.", "Mrs.", "Ms.", "p.m.",
|
"Adm.",
|
||||||
"Ph.D.", "Rep.", "Rev.", "Sen.", "St.", "vs."]:
|
"Bros.",
|
||||||
|
"co.",
|
||||||
|
"Co.",
|
||||||
|
"Corp.",
|
||||||
|
"D.C.",
|
||||||
|
"Dr.",
|
||||||
|
"e.g.",
|
||||||
|
"E.g.",
|
||||||
|
"E.G.",
|
||||||
|
"Gen.",
|
||||||
|
"Gov.",
|
||||||
|
"i.e.",
|
||||||
|
"I.e.",
|
||||||
|
"I.E.",
|
||||||
|
"Inc.",
|
||||||
|
"Jr.",
|
||||||
|
"Ltd.",
|
||||||
|
"Md.",
|
||||||
|
"Messrs.",
|
||||||
|
"Mo.",
|
||||||
|
"Mont.",
|
||||||
|
"Mr.",
|
||||||
|
"Mrs.",
|
||||||
|
"Ms.",
|
||||||
|
"p.m.",
|
||||||
|
"Ph.D.",
|
||||||
|
"Rep.",
|
||||||
|
"Rev.",
|
||||||
|
"Sen.",
|
||||||
|
"St.",
|
||||||
|
"vs.",
|
||||||
|
]:
|
||||||
_exc[orth] = [{ORTH: orth}]
|
_exc[orth] = [{ORTH: orth}]
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -30,8 +30,9 @@ for name, tag, patterns in [
|
||||||
("Facebook", "ORG", [[{LOWER: "facebook"}]]),
|
("Facebook", "ORG", [[{LOWER: "facebook"}]]),
|
||||||
("Blizzard", "ORG", [[{LOWER: "blizzard"}]]),
|
("Blizzard", "ORG", [[{LOWER: "blizzard"}]]),
|
||||||
("Ubuntu", "ORG", [[{LOWER: "ubuntu"}]]),
|
("Ubuntu", "ORG", [[{LOWER: "ubuntu"}]]),
|
||||||
("YouTube", "PRODUCT", [[{LOWER: "youtube"}]]),]:
|
("YouTube", "PRODUCT", [[{LOWER: "youtube"}]]),
|
||||||
ENTITY_RULES.append({ENT_ID: name, 'attrs': {ENT_TYPE: tag}, 'patterns': patterns})
|
]:
|
||||||
|
ENTITY_RULES.append({ENT_ID: name, "attrs": {ENT_TYPE: tag}, "patterns": patterns})
|
||||||
|
|
||||||
|
|
||||||
FALSE_POSITIVES = [
|
FALSE_POSITIVES = [
|
||||||
|
@ -46,5 +47,5 @@ FALSE_POSITIVES = [
|
||||||
[{ORTH: "Yay"}],
|
[{ORTH: "Yay"}],
|
||||||
[{ORTH: "Ahh"}],
|
[{ORTH: "Ahh"}],
|
||||||
[{ORTH: "Yea"}],
|
[{ORTH: "Yea"}],
|
||||||
[{ORTH: "Bah"}]
|
[{ORTH: "Bah"}],
|
||||||
]
|
]
|
||||||
|
|
|
@ -16,8 +16,10 @@ from ...util import update_exc, add_lookups
|
||||||
|
|
||||||
class SpanishDefaults(Language.Defaults):
|
class SpanishDefaults(Language.Defaults):
|
||||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||||
lex_attr_getters[LANG] = lambda text: 'es'
|
lex_attr_getters[LANG] = lambda text: "es"
|
||||||
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
|
lex_attr_getters[NORM] = add_lookups(
|
||||||
|
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
|
||||||
|
)
|
||||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||||
tag_map = TAG_MAP
|
tag_map = TAG_MAP
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
|
@ -26,8 +28,8 @@ class SpanishDefaults(Language.Defaults):
|
||||||
|
|
||||||
|
|
||||||
class Spanish(Language):
|
class Spanish(Language):
|
||||||
lang = 'es'
|
lang = "es"
|
||||||
Defaults = SpanishDefaults
|
Defaults = SpanishDefaults
|
||||||
|
|
||||||
|
|
||||||
__all__ = ['Spanish']
|
__all__ = ["Spanish"]
|
||||||
|
|
|
@ -18,5 +18,5 @@ sentences = [
|
||||||
"El gato come pescado",
|
"El gato come pescado",
|
||||||
"Veo al hombre con el telescopio",
|
"Veo al hombre con el telescopio",
|
||||||
"La araña come moscas",
|
"La araña come moscas",
|
||||||
"El pingüino incuba en su nido"
|
"El pingüino incuba en su nido",
|
||||||
]
|
]
|
||||||
|
|
|
@ -2,7 +2,8 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
STOP_WORDS = set("""
|
STOP_WORDS = set(
|
||||||
|
"""
|
||||||
actualmente acuerdo adelante ademas además adrede afirmó agregó ahi ahora ahí
|
actualmente acuerdo adelante ademas además adrede afirmó agregó ahi ahora ahí
|
||||||
al algo alguna algunas alguno algunos algún alli allí alrededor ambos ampleamos
|
al algo alguna algunas alguno algunos algún alli allí alrededor ambos ampleamos
|
||||||
antano antaño ante anterior antes apenas aproximadamente aquel aquella aquellas
|
antano antaño ante anterior antes apenas aproximadamente aquel aquella aquellas
|
||||||
|
@ -81,4 +82,5 @@ va vais valor vamos van varias varios vaya veces ver verdad verdadera verdadero
|
||||||
vez vosotras vosotros voy vuestra vuestras vuestro vuestros
|
vez vosotras vosotros voy vuestra vuestras vuestro vuestros
|
||||||
|
|
||||||
ya yo
|
ya yo
|
||||||
""".split())
|
""".split()
|
||||||
|
)
|
||||||
|
|
|
@ -8,18 +8,20 @@ def noun_chunks(obj):
|
||||||
doc = obj.doc
|
doc = obj.doc
|
||||||
if not len(doc):
|
if not len(doc):
|
||||||
return
|
return
|
||||||
np_label = doc.vocab.strings.add('NP')
|
np_label = doc.vocab.strings.add("NP")
|
||||||
left_labels = ['det', 'fixed', 'neg'] #['nunmod', 'det', 'appos', 'fixed']
|
left_labels = ["det", "fixed", "neg"] # ['nunmod', 'det', 'appos', 'fixed']
|
||||||
right_labels = ['flat', 'fixed', 'compound', 'neg']
|
right_labels = ["flat", "fixed", "compound", "neg"]
|
||||||
stop_labels = ['punct']
|
stop_labels = ["punct"]
|
||||||
np_left_deps = [doc.vocab.strings.add(label) for label in left_labels]
|
np_left_deps = [doc.vocab.strings.add(label) for label in left_labels]
|
||||||
np_right_deps = [doc.vocab.strings.add(label) for label in right_labels]
|
np_right_deps = [doc.vocab.strings.add(label) for label in right_labels]
|
||||||
stop_deps = [doc.vocab.strings.add(label) for label in stop_labels]
|
stop_deps = [doc.vocab.strings.add(label) for label in stop_labels]
|
||||||
token = doc[0]
|
token = doc[0]
|
||||||
while token and token.i < len(doc):
|
while token and token.i < len(doc):
|
||||||
if token.pos in [PROPN, NOUN, PRON]:
|
if token.pos in [PROPN, NOUN, PRON]:
|
||||||
left, right = noun_bounds(doc, token, np_left_deps, np_right_deps, stop_deps)
|
left, right = noun_bounds(
|
||||||
yield left.i, right.i+1, np_label
|
doc, token, np_left_deps, np_right_deps, stop_deps
|
||||||
|
)
|
||||||
|
yield left.i, right.i + 1, np_label
|
||||||
token = right
|
token = right
|
||||||
token = next_token(token)
|
token = next_token(token)
|
||||||
|
|
||||||
|
@ -31,7 +33,7 @@ def is_verb_token(token):
|
||||||
def next_token(token):
|
def next_token(token):
|
||||||
try:
|
try:
|
||||||
return token.nbor()
|
return token.nbor()
|
||||||
except:
|
except IndexError:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
@ -42,16 +44,20 @@ def noun_bounds(doc, root, np_left_deps, np_right_deps, stop_deps):
|
||||||
left_bound = token
|
left_bound = token
|
||||||
right_bound = root
|
right_bound = root
|
||||||
for token in root.rights:
|
for token in root.rights:
|
||||||
if (token.dep in np_right_deps):
|
if token.dep in np_right_deps:
|
||||||
left, right = noun_bounds(doc, token, np_left_deps, np_right_deps, stop_deps)
|
left, right = noun_bounds(
|
||||||
if list(filter(lambda t: is_verb_token(t) or t.dep in stop_deps,
|
doc, token, np_left_deps, np_right_deps, stop_deps
|
||||||
doc[left_bound.i: right.i])):
|
)
|
||||||
|
if list(
|
||||||
|
filter(
|
||||||
|
lambda t: is_verb_token(t) or t.dep in stop_deps,
|
||||||
|
doc[left_bound.i : right.i],
|
||||||
|
)
|
||||||
|
):
|
||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
right_bound = right
|
right_bound = right
|
||||||
return left_bound, right_bound
|
return left_bound, right_bound
|
||||||
|
|
||||||
|
|
||||||
SYNTAX_ITERATORS = {
|
SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
|
||||||
'noun_chunks': noun_chunks
|
|
||||||
}
|
|
||||||
|
|
|
@ -4,7 +4,7 @@ from __future__ import unicode_literals
|
||||||
from ...symbols import POS, PUNCT, SYM, ADJ, NUM, DET, ADV, ADP, X, VERB
|
from ...symbols import POS, PUNCT, SYM, ADJ, NUM, DET, ADV, ADP, X, VERB
|
||||||
from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON, SCONJ, AUX, CONJ
|
from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON, SCONJ, AUX, CONJ
|
||||||
|
|
||||||
|
# fmt: off
|
||||||
TAG_MAP = {
|
TAG_MAP = {
|
||||||
"ADJ___": {"morph": "_", POS: ADJ},
|
"ADJ___": {"morph": "_", POS: ADJ},
|
||||||
"ADJ__AdpType=Prep": {"morph": "AdpType=Prep", POS: ADJ},
|
"ADJ__AdpType=Prep": {"morph": "AdpType=Prep", POS: ADJ},
|
||||||
|
@ -29,7 +29,7 @@ TAG_MAP = {
|
||||||
"ADP__AdpType=Preppron|Gender=Fem|Number=Sing": {"morph": "AdpType=Preppron|Gender=Fem|Number=Sing", POS: ADP},
|
"ADP__AdpType=Preppron|Gender=Fem|Number=Sing": {"morph": "AdpType=Preppron|Gender=Fem|Number=Sing", POS: ADP},
|
||||||
"ADP__AdpType=Preppron|Gender=Masc|Number=Plur": {"morph": "AdpType=Preppron|Gender=Masc|Number=Plur", POS: ADP},
|
"ADP__AdpType=Preppron|Gender=Masc|Number=Plur": {"morph": "AdpType=Preppron|Gender=Masc|Number=Plur", POS: ADP},
|
||||||
"ADP__AdpType=Preppron|Gender=Masc|Number=Sing": {"morph": "AdpType=Preppron|Gender=Masc|Number=Sing", POS: ADP},
|
"ADP__AdpType=Preppron|Gender=Masc|Number=Sing": {"morph": "AdpType=Preppron|Gender=Masc|Number=Sing", POS: ADP},
|
||||||
"ADP": { POS: ADP},
|
"ADP": {POS: ADP},
|
||||||
"ADV___": {"morph": "_", POS: ADV},
|
"ADV___": {"morph": "_", POS: ADV},
|
||||||
"ADV__AdpType=Prep": {"morph": "AdpType=Prep", POS: ADV},
|
"ADV__AdpType=Prep": {"morph": "AdpType=Prep", POS: ADV},
|
||||||
"ADV__AdpType=Preppron|Gender=Masc|Number=Sing": {"morph": "AdpType=Preppron|Gender=Masc|Number=Sing", POS: ADV},
|
"ADV__AdpType=Preppron|Gender=Masc|Number=Sing": {"morph": "AdpType=Preppron|Gender=Masc|Number=Sing", POS: ADV},
|
||||||
|
@ -135,7 +135,7 @@ TAG_MAP = {
|
||||||
"DET__Number=Sing|PronType=Ind": {"morph": "Number=Sing|PronType=Ind", POS: DET},
|
"DET__Number=Sing|PronType=Ind": {"morph": "Number=Sing|PronType=Ind", POS: DET},
|
||||||
"DET__PronType=Int": {"morph": "PronType=Int", POS: DET},
|
"DET__PronType=Int": {"morph": "PronType=Int", POS: DET},
|
||||||
"DET__PronType=Rel": {"morph": "PronType=Rel", POS: DET},
|
"DET__PronType=Rel": {"morph": "PronType=Rel", POS: DET},
|
||||||
"DET": { POS: DET},
|
"DET": {POS: DET},
|
||||||
"INTJ___": {"morph": "_", POS: INTJ},
|
"INTJ___": {"morph": "_", POS: INTJ},
|
||||||
"NOUN___": {"morph": "_", POS: NOUN},
|
"NOUN___": {"morph": "_", POS: NOUN},
|
||||||
"NOUN__AdvType=Tim": {"morph": "AdvType=Tim", POS: NOUN},
|
"NOUN__AdvType=Tim": {"morph": "AdvType=Tim", POS: NOUN},
|
||||||
|
@ -307,3 +307,4 @@ TAG_MAP = {
|
||||||
"X___": {"morph": "_", POS: X},
|
"X___": {"morph": "_", POS: X},
|
||||||
"_SP": {"morph": "_", POS: SPACE},
|
"_SP": {"morph": "_", POS: SPACE},
|
||||||
}
|
}
|
||||||
|
# fmt: on
|
||||||
|
|
|
@ -1,17 +1,12 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from ...symbols import ORTH, LEMMA, TAG, NORM, ADP, DET, PRON_LEMMA
|
from ...symbols import ORTH, LEMMA, NORM, PRON_LEMMA
|
||||||
|
|
||||||
|
|
||||||
_exc = {
|
_exc = {
|
||||||
"pal": [
|
"pal": [{ORTH: "pa", LEMMA: "para"}, {ORTH: "l", LEMMA: "el", NORM: "el"}],
|
||||||
{ORTH: "pa", LEMMA: "para"},
|
"pala": [{ORTH: "pa", LEMMA: "para"}, {ORTH: "la", LEMMA: "la", NORM: "la"}],
|
||||||
{ORTH: "l", LEMMA: "el", NORM: "el"}],
|
|
||||||
|
|
||||||
"pala": [
|
|
||||||
{ORTH: "pa", LEMMA: "para"},
|
|
||||||
{ORTH: "la", LEMMA: "la", NORM: "la"}]
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -24,32 +19,50 @@ for exc_data in [
|
||||||
{ORTH: "Ud.", LEMMA: PRON_LEMMA, NORM: "usted"},
|
{ORTH: "Ud.", LEMMA: PRON_LEMMA, NORM: "usted"},
|
||||||
{ORTH: "Vd.", LEMMA: PRON_LEMMA, NORM: "usted"},
|
{ORTH: "Vd.", LEMMA: PRON_LEMMA, NORM: "usted"},
|
||||||
{ORTH: "Uds.", LEMMA: PRON_LEMMA, NORM: "ustedes"},
|
{ORTH: "Uds.", LEMMA: PRON_LEMMA, NORM: "ustedes"},
|
||||||
{ORTH: "Vds.", LEMMA: PRON_LEMMA, NORM: "ustedes"}]:
|
{ORTH: "Vds.", LEMMA: PRON_LEMMA, NORM: "ustedes"},
|
||||||
|
]:
|
||||||
_exc[exc_data[ORTH]] = [exc_data]
|
_exc[exc_data[ORTH]] = [exc_data]
|
||||||
|
|
||||||
|
|
||||||
# Times
|
# Times
|
||||||
|
|
||||||
_exc["12m."] = [
|
_exc["12m."] = [{ORTH: "12"}, {ORTH: "m.", LEMMA: "p.m."}]
|
||||||
{ORTH: "12"},
|
|
||||||
{ORTH: "m.", LEMMA: "p.m."}]
|
|
||||||
|
|
||||||
|
|
||||||
for h in range(1, 12 + 1):
|
for h in range(1, 12 + 1):
|
||||||
for period in ["a.m.", "am"]:
|
for period in ["a.m.", "am"]:
|
||||||
_exc["%d%s" % (h, period)] = [
|
_exc["%d%s" % (h, period)] = [{ORTH: "%d" % h}, {ORTH: period, LEMMA: "a.m."}]
|
||||||
{ORTH: "%d" % h},
|
|
||||||
{ORTH: period, LEMMA: "a.m."}]
|
|
||||||
for period in ["p.m.", "pm"]:
|
for period in ["p.m.", "pm"]:
|
||||||
_exc["%d%s" % (h, period)] = [
|
_exc["%d%s" % (h, period)] = [{ORTH: "%d" % h}, {ORTH: period, LEMMA: "p.m."}]
|
||||||
{ORTH: "%d" % h},
|
|
||||||
{ORTH: period, LEMMA: "p.m."}]
|
|
||||||
|
|
||||||
|
|
||||||
for orth in [
|
for orth in [
|
||||||
"a.C.", "a.J.C.", "apdo.", "Av.", "Avda.", "Cía.", "etc.", "Gob.", "Gral.",
|
"a.C.",
|
||||||
"Ing.", "J.C.", "Lic.", "m.n.", "no.", "núm.", "P.D.", "Prof.", "Profa.",
|
"a.J.C.",
|
||||||
"q.e.p.d.", "S.A.", "S.L.", "s.s.s.", "Sr.", "Sra.", "Srta."]:
|
"apdo.",
|
||||||
|
"Av.",
|
||||||
|
"Avda.",
|
||||||
|
"Cía.",
|
||||||
|
"etc.",
|
||||||
|
"Gob.",
|
||||||
|
"Gral.",
|
||||||
|
"Ing.",
|
||||||
|
"J.C.",
|
||||||
|
"Lic.",
|
||||||
|
"m.n.",
|
||||||
|
"no.",
|
||||||
|
"núm.",
|
||||||
|
"P.D.",
|
||||||
|
"Prof.",
|
||||||
|
"Profa.",
|
||||||
|
"q.e.p.d.",
|
||||||
|
"S.A.",
|
||||||
|
"S.L.",
|
||||||
|
"s.s.s.",
|
||||||
|
"Sr.",
|
||||||
|
"Sra.",
|
||||||
|
"Srta.",
|
||||||
|
]:
|
||||||
_exc[orth] = [{ORTH: orth}]
|
_exc[orth] = [{ORTH: orth}]
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -12,11 +12,14 @@ from .tag_map import TAG_MAP
|
||||||
from .punctuation import TOKENIZER_SUFFIXES
|
from .punctuation import TOKENIZER_SUFFIXES
|
||||||
from .lemmatizer import LEMMA_RULES, LEMMA_INDEX, LEMMA_EXC
|
from .lemmatizer import LEMMA_RULES, LEMMA_INDEX, LEMMA_EXC
|
||||||
|
|
||||||
|
|
||||||
class PersianDefaults(Language.Defaults):
|
class PersianDefaults(Language.Defaults):
|
||||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||||
lex_attr_getters.update(LEX_ATTRS)
|
lex_attr_getters.update(LEX_ATTRS)
|
||||||
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
|
lex_attr_getters[NORM] = add_lookups(
|
||||||
lex_attr_getters[LANG] = lambda text: 'fa'
|
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
|
||||||
|
)
|
||||||
|
lex_attr_getters[LANG] = lambda text: "fa"
|
||||||
tokenizer_exceptions = update_exc(TOKENIZER_EXCEPTIONS)
|
tokenizer_exceptions = update_exc(TOKENIZER_EXCEPTIONS)
|
||||||
lemma_rules = LEMMA_RULES
|
lemma_rules = LEMMA_RULES
|
||||||
lemma_index = LEMMA_INDEX
|
lemma_index = LEMMA_INDEX
|
||||||
|
@ -27,8 +30,8 @@ class PersianDefaults(Language.Defaults):
|
||||||
|
|
||||||
|
|
||||||
class Persian(Language):
|
class Persian(Language):
|
||||||
lang = 'fa'
|
lang = "fa"
|
||||||
Defaults = PersianDefaults
|
Defaults = PersianDefaults
|
||||||
|
|
||||||
|
|
||||||
__all__ = ['Persian']
|
__all__ = ["Persian"]
|
||||||
|
|
|
@ -12,8 +12,8 @@ Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
sentences = [
|
sentences = [
|
||||||
"این یک جمله نمونه می باشد.",
|
"این یک جمله نمونه می باشد.",
|
||||||
"قرار ما، امروز ساعت ۲:۳۰ بعدازظهر هست!"
|
"قرار ما، امروز ساعت ۲:۳۰ بعدازظهر هست!",
|
||||||
"دیروز علی به من ۲۰۰۰.۱﷼ پول نقد داد.",
|
"دیروز علی به من ۲۰۰۰.۱﷼ پول نقد داد.",
|
||||||
"چطور میتوان از تهران به کاشان رفت؟"
|
"چطور میتوان از تهران به کاشان رفت؟",
|
||||||
"حدود ۸۰٪ هوا از نیتروژن تشکیل شده است."
|
"حدود ۸۰٪ هوا از نیتروژن تشکیل شده است.",
|
||||||
]
|
]
|
||||||
|
|
|
@ -10,23 +10,13 @@ from ._verbs_exc import VERBS_EXC
|
||||||
from ._lemma_rules import ADJECTIVE_RULES, NOUN_RULES, VERB_RULES, PUNCT_RULES
|
from ._lemma_rules import ADJECTIVE_RULES, NOUN_RULES, VERB_RULES, PUNCT_RULES
|
||||||
|
|
||||||
|
|
||||||
LEMMA_INDEX = {
|
LEMMA_INDEX = {"adj": ADJECTIVES, "noun": NOUNS, "verb": VERBS}
|
||||||
'adj': ADJECTIVES,
|
|
||||||
'noun': NOUNS,
|
|
||||||
'verb': VERBS
|
|
||||||
}
|
|
||||||
|
|
||||||
LEMMA_RULES = {
|
LEMMA_RULES = {
|
||||||
'adj': ADJECTIVE_RULES,
|
"adj": ADJECTIVE_RULES,
|
||||||
'noun': NOUN_RULES,
|
"noun": NOUN_RULES,
|
||||||
'verb': VERB_RULES,
|
"verb": VERB_RULES,
|
||||||
'punct': PUNCT_RULES
|
"punct": PUNCT_RULES,
|
||||||
}
|
}
|
||||||
|
|
||||||
LEMMA_EXC = {
|
LEMMA_EXC = {"adj": ADJECTIVES_EXC, "noun": NOUNS_EXC, "verb": VERBS_EXC}
|
||||||
'adj': ADJECTIVES_EXC,
|
|
||||||
'noun': NOUNS_EXC,
|
|
||||||
'verb': VERBS_EXC
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user