Mirror of https://github.com/explosion/spaCy.git, synced 2025-01-28 18:24:08 +03:00

Commit 94267ec50f: Fix merge conflict in printer
.appveyor.yml (new file, 1 line)

build: off
.gitignore (vendored, 96 lines changed)

@@ -1,50 +1,45 @@
-# Vim
-*.swp
-*.sw*
-Profile.prof
-tmp/
-.dev
-.denv
-.pypyenv
-.eggs
-*.tgz
-.sass-cache
-.python-version
-
-MANIFEST
-
+# spaCy
+spacy/data/
 corpora/
 models/
 keys/

-spacy/syntax/*.cpp
-spacy/syntax/*.html
-spacy/en/*.cpp
-spacy/tokens/*.cpp
-spacy/serialize/*.cpp
-spacy/en/data/*
-spacy/*.cpp
-spacy/ner/*.cpp
-spacy/orthography/*.cpp
-ext/murmurhash.cpp
-ext/sparsehash.cpp
-
-/spacy/data/
-
-_build/
-.env/
-tmp/
+# Website
+website/www/
+website/_deploy.sh
+website/package.json
+website/announcement.jade
+website/.gitignore
+
+# Cython / C extensions
 cythonize.json
-
-# Byte-compiled / optimized / DLL files
-__pycache__/
-*.py[cod]
-
-# C extensions
+spacy/*.html
+*.cpp
+
 *.so

-# Distribution / packaging
+# Vim / VSCode / editors
+*.swp
+*.sw*
+Profile.prof
+.vscode
+.sass-cache
+
+# Python
 .Python
+.python-version
+__pycache__/
+*.py[cod]
+.env/
+.env2/
+.env3/
+.~env/
+.venv
+venv/
+.dev
+.denv
+.pypyenv
+
+# Distribution / packaging
 env/
 bin/
 build/

@@ -59,6 +54,12 @@ var/
 *.egg-info/
 .installed.cfg
 *.egg
+.eggs
+MANIFEST
+
+# Temporary files
+*.~*
+tmp/

 # Installer logs
 pip-log.txt

@@ -87,25 +88,16 @@ coverage.xml
 *.log
 *.pot

-# Windows local helper files
+# Windows
 *.bat
+Thumbs.db
+Desktop.ini

 # Mac OS X
 *.DS_Store

-# Temporary files / Dropbox hack
-*.~*
-
 # Komodo project files
 *.komodoproject

-# Website
-website/_deploy.sh
-website/package.json
-website/announcement.jade
-website/www/
-website/.gitignore
-
-# Python virtualenv
-venv
-venv/*
+# Other
+*.tgz
@@ -16,6 +16,7 @@ This is a list of everyone who has made significant contributions to spaCy, in a
 * Daniel Vila Suero, [@dvsrepo](https://github.com/dvsrepo)
 * Dmytro Sadovnychyi, [@sadovnychyi](https://github.com/sadovnychyi)
 * Eric Zhao, [@ericzhao28](https://github.com/ericzhao28)
+* Francisco Aranda, [@frascuchon](https://github.com/frascuchon)
 * Greg Baker, [@solresol](https://github.com/solresol)
 * Grégory Howard, [@Gregory-Howard](https://github.com/Gregory-Howard)
 * György Orosz, [@oroszgy](https://github.com/oroszgy)

@@ -24,6 +25,7 @@ This is a list of everyone who has made significant contributions to spaCy, in a
 * Ines Montani, [@ines](https://github.com/ines)
 * J Nicolas Schrading, [@NSchrading](https://github.com/NSchrading)
 * Janneke van der Zwaan, [@jvdzwaan](https://github.com/jvdzwaan)
+* Jim Regan, [@jimregan](https://github.com/jimregan)
 * Jordan Suchow, [@suchow](https://github.com/suchow)
 * Josh Reeter, [@jreeter](https://github.com/jreeter)
 * Juan Miguel Cejuela, [@juanmirocks](https://github.com/juanmirocks)
README.rst (12 lines changed)

@@ -4,18 +4,22 @@ spaCy: Industrial-strength NLP
 spaCy is a library for advanced natural language processing in Python and
 Cython. spaCy is built on the very latest research, but it isn't researchware.
 It was designed from day one to be used in real products. spaCy currently supports
-English, German and French, as well as tokenization for Spanish, Italian,
+English, German, French and Spanish, as well as tokenization for Italian,
 Portuguese, Dutch, Swedish, Finnish, Norwegian, Hungarian, Bengali, Hebrew,
 Chinese and Japanese. It's commercial open-source software, released under the
 MIT license.

-📊 **Help us improve the library!** `Take the spaCy user survey <https://survey.spacy.io>`_.
+⭐️ **Test spaCy v2.0.0 alpha and the new models!** `Read the release notes here. <https://github.com/explosion/spaCy/releases/tag/v2.0.0-alpha>`_

 💫 **Version 1.8 out now!** `Read the release notes here. <https://github.com/explosion/spaCy/releases/>`_

 .. image:: https://img.shields.io/travis/explosion/spaCy/master.svg?style=flat-square
     :target: https://travis-ci.org/explosion/spaCy
-    :alt: Build Status
+    :alt: Travis Build Status
+
+.. image:: https://img.shields.io/appveyor/ci/explosion/spacy/master.svg?style=flat-square
+    :target: https://ci.appveyor.com/project/explosion/spacy
+    :alt: Appveyor Build Status

 .. image:: https://img.shields.io/github/release/explosion/spacy.svg?style=flat-square
     :target: https://github.com/explosion/spaCy/releases

@@ -85,7 +89,7 @@ Features
 * GIL-free **multi-threading**
 * Efficient binary serialization
 * Easy **deep learning** integration
-* Statistical models for **English** and **German**
+* Statistical models for **English**, **German**, **French** and **Spanish**
 * State-of-the-art speed
 * Robust, rigorously evaluated accuracy
@@ -52,6 +52,7 @@ def train_ner(nlp, train_data, output_dir):
         random.shuffle(train_data)
         loss = 0.
         for raw_text, entity_offsets in train_data:
+            doc = nlp.make_doc(raw_text)
             gold = GoldParse(doc, entities=entity_offsets)
             # By default, the GoldParse class assumes that the entities
             # described by offset are complete, and all other words should

@@ -63,7 +64,6 @@ def train_ner(nlp, train_data, output_dir):
             #for i in range(len(gold.ner)):
                 #if not gold.ner[i].endswith('ANIMAL'):
                 #    gold.ner[i] = '-'
-            doc = nlp.make_doc(raw_text)
             nlp.tagger(doc)
             # As of 1.9, spaCy's parser now lets you supply a dropout probability
             # This might help the model generalize better from only a few
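The reordering above matters because GoldParse is constructed against a specific Doc, so the Doc has to exist before the gold annotations that reference it. A minimal sketch of the corrected update order, assuming a loaded pipeline with an entity recognizer and assuming EntityRecognizer.update() accepts the dropout probability mentioned in the comment (spaCy 1.9+):

    # Hedged sketch of one training pass in the corrected order.
    import random
    from spacy.gold import GoldParse

    def update_ner_once(nlp, train_data, drop=0.35):
        random.shuffle(train_data)
        loss = 0.
        for raw_text, entity_offsets in train_data:
            doc = nlp.make_doc(raw_text)                     # build the Doc first ...
            gold = GoldParse(doc, entities=entity_offsets)   # ... then the gold parse that refers to it
            nlp.tagger(doc)
            loss += nlp.entity.update(doc, gold, drop=drop)  # assumed signature, see note above
        return loss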
@@ -7,9 +7,11 @@ thinc>=6.5.0,<6.6.0
 murmurhash>=0.26,<0.27
 plac<1.0.0,>=0.9.6
 six
+html5lib==1.0b8
 ujson>=1.35
 dill>=0.2,<0.3
 requests>=2.13.0,<3.0.0
 regex==2017.4.5
 ftfy>=4.4.2,<5.0.0
 pytest>=3.0.6,<4.0.0
+pip>=9.0.0,<10.0.0
setup.py (1 line added)

@@ -197,6 +197,7 @@ def setup_package():
             'preshed>=1.0.0,<2.0.0',
             'thinc>=6.5.0,<6.6.0',
             'plac<1.0.0,>=0.9.6',
+            'pip>=9.0.0,<10.0.0',
             'six',
             'pathlib',
             'ujson>=1.35',
@@ -10,7 +10,7 @@ __author__ = 'Matthew Honnibal'
 __email__ = 'matt@explosion.ai'
 __license__ = 'MIT'

-__docs__ = 'https://spacy.io/docs/usage'
+__docs_models__ = 'https://spacy.io/docs/usage'
 __download_url__ = 'https://github.com/explosion/spacy-models/releases/download'
 __compatibility__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json'
 __shortcuts__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/shortcuts.json'
@@ -79,5 +79,5 @@ def check_error_depr(model):
         "As of v1.7.0, the download all command is deprecated. Please "
         "download the models individually via spacy.download [model name] "
         "or pip install. For more info on this, see the documentation: "
-        "{d}".format(d=about.__docs__),
+        "{d}".format(d=about.__docs_models__),
         title="Deprecated command")
@@ -47,7 +47,7 @@ def package(input_dir, output_dir, meta_path, force):

 def check_dirs(input_path, output_path, meta_path):
     if not input_path.exists():
-        util.sys_exit(unicode_(input_path.as_poisx), title="Model directory not found")
+        util.sys_exit(unicode_(input_path.as_posix()), title="Model directory not found")
     if not output_path.exists():
         util.sys_exit(unicode_(output_path), title="Output directory not found")
     if meta_path and not meta_path.exists():
@@ -146,7 +146,7 @@ class ModelDownload():
             "The spacy.{l}.download command is now deprecated. Please use "
             "python -m spacy download [model name or shortcut] instead. For more "
             "info and available models, see the documentation: {d}. "
-            "Downloading default '{l}' model now...".format(d=about.__docs__, l=lang),
+            "Downloading default '{l}' model now...".format(d=about.__docs_models__, l=lang),
             title="Warning: deprecated command")
         download(lang)
@@ -178,7 +178,7 @@ for word in ["who", "what", "when", "where", "why", "how", "there", "that"]:

    EXC[orth + "ve"] = [
        {ORTH: orth, LEMMA: word},
-       {ORTH: "'ve", LEMMA: "have", TAG: "VB"}
+       {ORTH: "ve", LEMMA: "have", TAG: "VB"}
    ]

    EXC[orth + "'d"] = [
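The fix above targets the apostrophe-less contraction variants ("whatve", "whove" and so on): the ORTH values of an exception's sub-tokens must concatenate to exactly the surface form, so the variant without an apostrophe needs "ve" rather than "'ve". A hedged sketch of the same mechanism through the public tokenizer API, assuming spaCy 1.x's Tokenizer.add_special_case and an installed English model:

    # Hedged sketch: how a special case splits one surface form into sub-tokens.
    from spacy.attrs import ORTH, LEMMA, TAG
    import spacy

    nlp = spacy.load('en')
    nlp.tokenizer.add_special_case(u'whatve', [
        {ORTH: u'what', LEMMA: u'what'},
        {ORTH: u've', LEMMA: u'have', TAG: u'VB'},  # no apostrophe in the surface form
    ])
    print([t.text for t in nlp(u'whatve you done?')])  # expected: what, ve, you, done, ?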
@@ -6,36 +6,6 @@ from ..language_data import PRON_LEMMA, DET_LEMMA


 TOKENIZER_EXCEPTIONS = {
-    "al": [
-        {ORTH: "a", LEMMA: "a", TAG: ADP},
-        {ORTH: "el", LEMMA: "el", TAG: DET}
-    ],
-
-    "consigo": [
-        {ORTH: "con", LEMMA: "con"},
-        {ORTH: "sigo", LEMMA: PRON_LEMMA, NORM: "sí"}
-    ],
-
-    "conmigo": [
-        {ORTH: "con", LEMMA: "con"},
-        {ORTH: "migo", LEMMA: PRON_LEMMA, NORM: "mí"}
-    ],
-
-    "contigo": [
-        {ORTH: "con", LEMMA: "con"},
-        {ORTH: "tigo", LEMMA: PRON_LEMMA, NORM: "ti"}
-    ],
-
-    "del": [
-        {ORTH: "de", LEMMA: "de", TAG: ADP},
-        {ORTH: "l", LEMMA: "el", TAG: DET}
-    ],
-
-    "pel": [
-        {ORTH: "pe", LEMMA: "per", TAG: ADP},
-        {ORTH: "l", LEMMA: "el", TAG: DET}
-    ],
-
     "pal": [
         {ORTH: "pa", LEMMA: "para"},
         {ORTH: "l", LEMMA: DET_LEMMA, NORM: "el"}

@@ -43,7 +13,7 @@ TOKENIZER_EXCEPTIONS = {

     "pala": [
         {ORTH: "pa", LEMMA: "para"},
-        {ORTH: "la", LEMMA: DET_LEMMA}
+        {ORTH: "la", LEMMA: DET_LEMMA, NORM: "la"}
     ],

     "aprox.": [
@@ -3,21 +3,39 @@ from __future__ import unicode_literals, print_function

 from os import path

-from ..language import Language
+from ..language import Language, BaseDefaults
+from ..tokenizer import Tokenizer
 from ..attrs import LANG
 from ..tokens import Doc

 from .language_data import *


-class Japanese(Language):
-    lang = 'ja'
-
-    def make_doc(self, text):
+class JapaneseTokenizer(object):
+    def __init__(self, cls, nlp=None):
+        self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
         try:
             from janome.tokenizer import Tokenizer
         except ImportError:
             raise ImportError("The Japanese tokenizer requires the Janome library: "
                               "https://github.com/mocobeta/janome")
-        words = [x.surface for x in Tokenizer().tokenize(text)]
+        self.tokenizer = Tokenizer()
+
+    def __call__(self, text):
+        words = [x.surface for x in self.tokenizer.tokenize(text)]
         return Doc(self.vocab, words=words, spaces=[False]*len(words))
+
+
+class JapaneseDefaults(BaseDefaults):
+    @classmethod
+    def create_tokenizer(cls, nlp=None):
+        return JapaneseTokenizer(cls, nlp)
+
+
+class Japanese(Language):
+    lang = 'ja'
+
+    Defaults = JapaneseDefaults
+
+    def make_doc(self, text):
+        words = self.tokenizer(text)
+        return Doc(self.vocab, words=words, spaces=[False]*len(words))
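With this refactor the Janome-backed tokenizer is created once through JapaneseDefaults.create_tokenizer() instead of importing and instantiating Janome on every make_doc() call. A minimal usage sketch, mirroring the new ja_tokenizer test fixture and assuming the janome package is installed (pip install janome):

    # Hedged usage sketch of the new Japanese tokenizer entry point.
    from spacy.ja import Japanese

    tokenizer = Japanese.Defaults.create_tokenizer()
    doc = tokenizer(u'日本語だよ')
    print([t.text for t in doc])   # per the new tests: ['日本語', 'だ', 'よ']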
@@ -24,6 +24,7 @@ from .attrs cimport IS_QUOTE
 from .attrs cimport IS_LEFT_PUNCT
 from .attrs cimport IS_RIGHT_PUNCT
 from .attrs cimport IS_OOV
+from . import about


 memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))

@@ -137,11 +138,10 @@ cdef class Lexeme:
             cdef int length = self.vocab.vectors_length
             if length == 0:
                 raise ValueError(
-                    "Word vectors set to length 0. This may be because the "
-                    "data is not installed. If you haven't already, run"
-                    "\npython -m spacy download %s\n"
-                    "to install the data." % self.vocab.lang
-                )
+                    "Word vectors set to length 0. This may be because you "
+                    "don't have a model installed or loaded, or because your "
+                    "model doesn't include word vectors. For more info, see "
+                    "the documentation: \n%s\n" % about.__docs_models__)
             vector_view = <float[:length,]>self.c.vector
             return numpy.asarray(vector_view)
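The new message points users at the models documentation rather than a hard-coded download command. A hedged sketch of guarding against this ValueError from the caller's side, assuming a spaCy 1.x pipeline whose model ships word vectors:

    # Hedged sketch: check that vectors are available before reading them.
    import spacy

    nlp = spacy.load('en')                      # needs a model with word vectors
    if nlp.vocab.vectors_length == 0:
        print("No word vectors loaded; see the models documentation.")
    else:
        apple = nlp.vocab[u'apple']             # Lexeme lookup
        print(apple.has_vector, apple.vector.shape)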
@@ -1,7 +1,7 @@
 # coding: utf-8
 from __future__ import unicode_literals

-from ..parts_of_speech cimport NOUN, PROPN, PRON
+from ..parts_of_speech cimport NOUN, PROPN, PRON, VERB, AUX


 def english_noun_chunks(obj):

@@ -66,4 +66,55 @@ def german_noun_chunks(obj):
         yield word.left_edge.i, rbracket, np_label


-CHUNKERS = {'en': english_noun_chunks, 'de': german_noun_chunks}
+def es_noun_chunks(obj):
+
+    doc = obj.doc
+    np_label = doc.vocab.strings['NP']
+
+    left_labels = ['det', 'fixed', 'neg'] #['nunmod', 'det', 'appos', 'fixed']
+    right_labels = ['flat', 'fixed', 'compound', 'neg']
+    stop_labels = ['punct']
+
+    np_left_deps = [doc.vocab.strings[label] for label in left_labels]
+    np_right_deps = [doc.vocab.strings[label] for label in right_labels]
+    stop_deps = [doc.vocab.strings[label] for label in stop_labels]
+
+    def next_token(token):
+        try:
+            return token.nbor()
+        except:
+            return None
+
+    def noun_bounds(root):
+
+        def is_verb_token(token):
+            return token.pos in [VERB, AUX]
+
+        left_bound = root
+        for token in reversed(list(root.lefts)):
+            if token.dep in np_left_deps:
+                left_bound = token
+
+        right_bound = root
+        for token in root.rights:
+            if (token.dep in np_right_deps):
+                left, right = noun_bounds(token)
+
+                if list(filter(lambda t: is_verb_token(t) or t.dep in stop_deps, doc[left_bound.i: right.i])):
+                    break
+                else:
+                    right_bound = right
+
+        return left_bound, right_bound
+
+
+    token = doc[0]
+    while token and token.i < len(doc):
+        if token.pos in [PROPN, NOUN, PRON]:
+            left, right = noun_bounds(token)
+            yield left.i, right.i+1, np_label
+            token = right
+        token = next_token(token)
+
+
+CHUNKERS = {'en': english_noun_chunks, 'de': german_noun_chunks, 'es': es_noun_chunks}
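Registering es_noun_chunks in CHUNKERS gives Spanish Docs a working doc.noun_chunks iterator, provided the Doc carries a dependency parse. A minimal sketch, assuming the newly released Spanish model is installed (python -m spacy download es):

    # Hedged usage sketch of Spanish noun chunks.
    import spacy

    nlp = spacy.load('es')
    doc = nlp(u'La cerveza artesanal se vende en el mercado central.')
    for chunk in doc.noun_chunks:
        print(chunk.text, chunk.root.dep_)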
@@ -5,6 +5,7 @@ from ..en import English
 from ..de import German
 from ..es import Spanish
 from ..it import Italian
+from ..ja import Japanese
 from ..fr import French
 from ..pt import Portuguese
 from ..nl import Dutch

@@ -26,7 +27,7 @@ from pathlib import Path
 import os
 import pytest

-
+# These languages get run through generic tokenizer tests
 LANGUAGES = [English, German, Spanish, Italian, French, Portuguese, Dutch,
              Swedish, Hungarian, Finnish, Bengali, Norwegian]

@@ -76,6 +77,12 @@ def fi_tokenizer():
     return Finnish.Defaults.create_tokenizer()


+@pytest.fixture
+def ja_tokenizer():
+    janome = pytest.importorskip("janome")
+    return Japanese.Defaults.create_tokenizer()
+
+
 @pytest.fixture
 def sv_tokenizer():
     return Swedish.Defaults.create_tokenizer()
@@ -217,10 +217,13 @@ def test_doc_api_has_vector(en_tokenizer, text_file, text, vectors):
     assert doc.has_vector


-def test_parse_tree(EN):
+def test_parse_tree(en_tokenizer):
+    """Tests doc.print_tree() method."""
     text = 'I like New York in Autumn.'
-    doc = EN(text, tag=True)
-    doc.from_array([HEAD], numpy.asarray([[1, 0, 1, -2, -3, -1, -5]], dtype='int32').T)
+    heads = [1, 0, 1, -2, -3, -1, -5]
+    tags = ['PRP', 'IN', 'NNP', 'NNP', 'IN', 'NNP', '.']
+    tokens = en_tokenizer(text)
+    doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads, tags=tags)
     # full method parse_tree(text) is a trivial composition
     trees = doc.print_tree()
     assert len(trees) > 0
spacy/tests/ja/__init__.py (new empty file)

spacy/tests/ja/test_tokenizer.py (new file, 17 lines)

# coding: utf-8
from __future__ import unicode_literals

import pytest

TOKENIZER_TESTS = [
    ("日本語だよ", ['日本語', 'だ', 'よ']),
    ("東京タワーの近くに住んでいます。", ['東京', 'タワー', 'の', '近く', 'に', '住ん', 'で', 'い', 'ます', '。']),
    ("吾輩は猫である。", ['吾輩', 'は', '猫', 'で', 'ある', '。']),
    ("月に代わって、お仕置きよ!", ['月', 'に', '代わっ', 'て', '、', 'お仕置き', 'よ', '!']),
    ("すもももももももものうち", ['すもも', 'も', 'もも', 'も', 'もも', 'の', 'うち'])
]

@pytest.mark.parametrize('text,expected_tokens', TOKENIZER_TESTS)
def test_japanese_tokenizer(ja_tokenizer, text, expected_tokens):
    tokens = [token.text for token in ja_tokenizer(text)]
    assert tokens == expected_tokens
@@ -29,6 +29,7 @@ from ..serialize.bits cimport BitArray
 from ..util import normalize_slice
 from ..syntax.iterators import CHUNKERS
 from ..compat import is_config
+from .. import about


 DEF PADDING = 5

@@ -403,9 +404,8 @@ cdef class Doc:
             if not self.is_parsed:
                 raise ValueError(
                     "noun_chunks requires the dependency parse, which "
-                    "requires data to be installed. If you haven't done so, run: "
-                    "\npython -m spacy download %s\n"
-                    "to install the data" % self.vocab.lang)
+                    "requires data to be installed. For more info, see the "
+                    "documentation: \n%s\n" % about.__docs_models__)
             # Accumulate the result before beginning to iterate over it. This prevents
             # the tokenisation from being changed out from under us during the iteration.
             # The tricky thing here is that Span accepts its tokenisation changing,

@@ -431,14 +431,14 @@ cdef class Doc:
         """
         def __get__(self):
             if 'sents' in self.user_hooks:
-                return self.user_hooks['sents'](self)
+                yield from self.user_hooks['sents'](self)
+                return
+
             if not self.is_parsed:
                 raise ValueError(
-                    "sentence boundary detection requires the dependency parse, which "
-                    "requires data to be installed. If you haven't done so, run: "
-                    "\npython -m spacy download %s\n"
-                    "to install the data" % self.vocab.lang)
+                    "Sentence boundary detection requires the dependency parse, which "
+                    "requires data to be installed. For more info, see the "
+                    "documentation: \n%s\n" % about.__docs_models__)
             cdef int i
             start = 0
             for i in range(1, self.length):
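The switch from return to yield from matters because this __get__ is itself a generator (the parsed branch below yields spans), so a plain return value was silently discarded and custom sentence hooks never took effect. A hedged sketch of wiring up such a hook, assuming a spaCy 1.x Doc:

    # Hedged sketch: a custom sentence-boundary hook. The hook receives the
    # Doc and must yield Span objects.
    import spacy

    nlp = spacy.load('en')
    doc = nlp(u'Hello world. This is sentence two.')

    def split_on_periods(doc):
        start = 0
        for i, token in enumerate(doc):
            if token.text == '.':
                yield doc[start:i + 1]
                start = i + 1
        if start < len(doc):
            yield doc[start:len(doc)]

    doc.user_hooks['sents'] = split_on_periods
    print([sent.text for sent in doc.sents])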
@@ -1,13 +1,23 @@
-from copy import deepcopy
+# coding: utf8
+from __future__ import unicode_literals
+
+from .doc import Doc
+from ..symbols import HEAD, TAG, DEP, ENT_IOB, ENT_TYPE


 def merge_ents(doc):
-    '''Helper: merge adjacent entities into single tokens; modifies the doc.'''
+    """
+    Helper: merge adjacent entities into single tokens; modifies the doc.
+    """
     for ent in doc.ents:
         ent.merge(ent.root.tag_, ent.text, ent.label_)
     return doc


 def format_POS(token, light, flat):
-    '''helper: form the POS output for a token'''
+    """
+    Helper: form the POS output for a token.
+    """
     subtree = dict([
         ("word", token.text),
         ("lemma", token.lemma_), # trigger

@@ -25,17 +35,22 @@ def format_POS(token, light, flat):
         subtree.pop("modifiers")
     return subtree

-def POS_tree(root, light, flat):
-    '''Helper: generate a POS tree for a root token.
-    The doc must have merge_ents(doc) ran on it.
-    '''
+
+def POS_tree(root, light=False, flat=False):
+    """
+    Helper: generate a POS tree for a root token. The doc must have
+    merge_ents(doc) ran on it.
+    """
     subtree = format_POS(root, light=light, flat=flat)
     for c in root.children:
         subtree["modifiers"].append(POS_tree(c))
     return subtree


 def parse_tree(doc, light=False, flat=False):
-    """Makes a copy of the doc, then construct a syntactic parse tree, similar to the one used in displaCy. Generates the POS tree for all sentences in a doc
+    """
+    Makes a copy of the doc, then construct a syntactic parse tree, similar to
+    the one used in displaCy. Generates the POS tree for all sentences in a doc.

     Args:
         doc: The doc for parsing.

@@ -50,6 +65,8 @@ def parse_tree(doc, light=False, flat=False):
     [{'modifiers': [{'modifiers': [], 'NE': 'PERSON', 'word': 'Bob', 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Bob'}, {'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'dobj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'}, {'modifiers': [{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'lemma': 'pizza'}, {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}], 'NE': '', 'word': 'brought', 'arc': 'ROOT', 'POS_coarse': 'VERB', 'POS_fine': 'VBD', 'lemma': 'bring'}, {'modifiers': [{'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'}, {'modifiers': [{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'lemma': 'pizza'}, {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}], 'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB', 'POS_fine': 'VBD', 'lemma': 'eat'}]
     """
     doc_clone = Doc(doc.vocab, words=[w.text for w in doc])
-    doc_clone.from_array(doc.to_array([HEAD, DEP, TAG, ENT_IOB, ENT_TYPE])
+    doc_clone = Doc(doc.vocab, words=[w.text for w in doc])
+    doc_clone.from_array([HEAD, TAG, DEP, ENT_IOB, ENT_TYPE],
+                         doc.to_array([HEAD, TAG, DEP, ENT_IOB, ENT_TYPE]))
     merge_ents(doc_clone)  # merge the entities into single tokens first
     return [POS_tree(sent.root, light=light, flat=flat) for sent in doc_clone.sents]
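With the from_array() call fixed (attribute list first, data array second), parse_tree() and Doc.print_tree() work again, which is the merge conflict this commit resolves. A hedged usage sketch, assuming an installed English model with tagger, parser and entity recognizer:

    # Hedged usage sketch of the repaired parse tree printer.
    import spacy

    nlp = spacy.load('en')
    doc = nlp(u'Bob brought Alice the pizza. Alice ate the pizza.')
    trees = doc.print_tree()                 # one dict per sentence, as in the docstring above
    print(trees[0]['word'], trees[0]['arc']) # the root token of the first sentence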
@@ -16,6 +16,7 @@ from ..util import normalize_slice
 from ..attrs cimport IS_PUNCT, IS_SPACE
 from ..lexeme cimport Lexeme
 from ..compat import is_config
+from .. import about


 cdef class Span:

@@ -221,9 +222,8 @@ cdef class Span:
             if not self.doc.is_parsed:
                 raise ValueError(
                     "noun_chunks requires the dependency parse, which "
-                    "requires data to be installed. If you haven't done so, run: "
-                    "\npython -m spacy download %s\n"
-                    "to install the data" % self.vocab.lang)
+                    "requires data to be installed. For more info, see the "
+                    "documentation: \n%s\n" % about.__docs_models__)
             # Accumulate the result before beginning to iterate over it. This prevents
             # the tokenisation from being changed out from under us during the iteration.
             # The tricky thing here is that Span accepts its tokenisation changing,
@@ -26,6 +26,7 @@ from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
 from ..attrs cimport IS_OOV
 from ..lexeme cimport Lexeme
 from ..compat import is_config
+from .. import about


 cdef class Token:

@@ -237,11 +238,10 @@ cdef class Token:
             cdef int length = self.vocab.vectors_length
             if length == 0:
                 raise ValueError(
-                    "Word vectors set to length 0. This may be because the "
-                    "data is not installed. If you haven't already, run"
-                    "\npython -m spacy download %s\n"
-                    "to install the data." % self.vocab.lang
-                )
+                    "Word vectors set to length 0. This may be because you "
+                    "don't have a model installed or loaded, or because your "
+                    "model doesn't include word vectors. For more info, see "
+                    "the documentation: \n%s\n" % about.__docs_models__)
             vector_view = <float[:length,]>self.c.lex.vector
             return numpy.asarray(vector_view)
@@ -8,4 +8,5 @@ class Chinese(Language):
     def make_doc(self, text):
         import jieba
         words = list(jieba.cut(text, cut_all=True))
+        words=[x for x in words if x]
        return Doc(self.vocab, words=words, spaces=[False]*len(words))
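jieba's full mode (cut_all=True) can emit empty strings around punctuation, and Doc(words=...) does not accept empty tokens, hence the added filter. A hedged standalone sketch of the behaviour being guarded against, assuming the jieba package is installed:

    # Hedged sketch: full-mode segmentation may include empty strings.
    import jieba

    segments = list(jieba.cut(u'我爱北京天安门。', cut_all=True))
    print(segments)                       # may contain '' entries around punctuation
    print([w for w in segments if w])     # the same filter as in the change above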
@@ -14,8 +14,8 @@

     "SPACY_VERSION": "1.8",
     "LATEST_NEWS": {
-        "url": "https://survey.spacy.io/",
-        "title": "Take the spaCy user survey and help us improve the library!"
+        "url": "/docs/usage/models",
+        "title": "The first official Spanish model is here!"
     },

     "SOCIAL": {

@@ -55,7 +55,33 @@
         }
     },

-    "V_CSS": "1.6",
+    "QUICKSTART": [
+        { "id": "os", "title": "Operating system", "options": [
+            { "id": "mac", "title": "macOS / OSX", "checked": true },
+            { "id": "windows", "title": "Windows" },
+            { "id": "linux", "title": "Linux" }]
+        },
+        { "id": "package", "title": "Package manager", "options": [
+            { "id": "pip", "title": "pip", "checked": true },
+            { "id": "conda", "title": "conda" },
+            { "id": "source", "title": "from source" }]
+        },
+        { "id": "python", "title": "Python version", "options": [
+            { "id": 2, "title": "2.x" },
+            { "id": 3, "title": "3.x", "checked": true }]
+        },
+        { "id": "config", "title": "Configuration", "multiple": true, "options": [
+            {"id": "venv", "title": "virtualenv", "help": "Use a virtual environment and install spaCy into a user directory" }]
+        },
+        { "id": "model", "title": "Models", "multiple": true, "options": [
+            { "id": "en", "title": "English", "meta": "50MB" },
+            { "id": "de", "title": "German", "meta": "645MB" },
+            { "id": "fr", "title": "French", "meta": "1.33GB" },
+            { "id": "es", "title": "Spanish", "meta": "377MB"}]
+        }
+    ],
+
+    "V_CSS": "1.7",
     "V_JS": "1.2",
     "DEFAULT_SYNTAX": "python",
     "ANALYTICS": "UA-58931649-1",
@@ -121,6 +121,47 @@ mixin badge(name)
         img(src=site.badge alt="{name} version" height="20")


+//- Quickstart widget
+    quickstart.js with manual markup, inspired by PyTorch's "Getting started"
+    groups - [object] option groups, uses global variable QUICKSTART
+    headline - [string] optional text to be rendered as widget headline
+
+mixin quickstart(groups, headline)
+    .c-quickstart.o-block-small#qs
+        .c-quickstart__content
+            if headline
+                +h(2)=headline
+            for group in groups
+                .c-quickstart__group.u-text-small(data-qs-group=group.id)
+                    .c-quickstart__legend=group.title
+                    .c-quickstart__fields
+                        for option in group.options
+                            input.c-quickstart__input(class="c-quickstart__input--" + (group.multiple ? "check" : "radio") type=group.multiple ? "checkbox" : "radio" name=group.id id="qs-#{option.id}" value=option.id checked=option.checked)
+                            label.c-quickstart__label(for="qs-#{option.id}")=option.title
+                            if option.meta
+                                |  #[span.c-quickstart__label__meta (#{option.meta})]
+                            if option.help
+                                |  #[+help(option.help).c-quickstart__label__meta]
+
+        pre.c-code-block
+            code.c-code-block__content.c-quickstart__code(data-qs-results="")
+                block
+
+    .c-quickstart__info.u-text-tiny.o-block.u-text-right
+        | Like this widget? Check out #[+a("https://github.com/ines/quickstart").u-link quickstart.js]!
+
+
+//- Quickstart code item
+    data [object] - Rendering conditions (keyed by option group ID, value: option)
+
+mixin qs(data)
+    - args = {}
+    for value, setting in data
+        - args['data-qs-' + setting] = value
+    span.c-quickstart__line&attributes(args)
+        block
+
+
 //- Logo

 mixin logo()
@@ -47,6 +47,14 @@ mixin api(path)
         |  #[+icon("book", 18).o-icon--inline.u-color-subtle]


+//- Help icon with tooltip
+    tooltip - [string] Tooltip text
+
+mixin help(tooltip)
+    span(data-tooltip=tooltip)&attributes(attributes)
+        +icon("help", 16).i-icon--inline
+
+
 //- Aside for text
     label - [string] aside title (optional)
@@ -1,9 +1,13 @@
 //- 💫 INCLUDES > SCRIPTS

-script(src="/assets/js/main.js?v#{V_JS}", type="text/javascript")
-script(src="/assets/js/prism.js", type="text/javascript")
+script(src="/assets/js/main.js?v#{V_JS}")
+script(src="/assets/js/prism.js")

 if SECTION == "docs"
+    if quickstart
+        script(src="/assets/js/quickstart.js")
+        script var qs = new Quickstart("#qs");
+
     script.
         ((window.gitter = {}).chat = {}).options = {
             useStyles: false,
@@ -18,7 +18,7 @@

 .c-code-block__content
     display: block
-    font: normal normal 1.1rem/#{2} $font-code
+    font: normal 600 1.1rem/#{2} $font-code
     padding: 1em 2em
website/assets/css/_components/_quickstart.sass (new file, 90 lines)

//- 💫 CSS > COMPONENTS > QUICKSTART

.c-quickstart
    border: 1px solid $color-subtle
    border-radius: 2px
    display: none
    background: $color-subtle-light

    &:not([style]) + .c-quickstart__info
        display: none

.c-quickstart__content
    padding: 2rem 3rem

.c-quickstart__input
    @include size(0)
    opacity: 0
    position: absolute
    left: -9999px

.c-quickstart__label
    cursor: pointer
    background: $color-back
    border: 1px solid $color-subtle
    border-radius: 2px
    display: inline-block
    padding: 0.75rem 1.25rem
    margin: 0 0.5rem 0.5rem 0
    font-weight: bold

    &:hover
        background: lighten($color-theme-light, 5)

    .c-quickstart__input:focus + &
        border: 1px solid $color-theme

    .c-quickstart__input--radio:checked + &
        color: $color-back
        border-color: $color-theme
        background: $color-theme

    .c-quickstart__input--check + &:before
        content: ""
        background: $color-back
        display: inline-block
        width: 20px
        height: 20px
        border: 1px solid $color-subtle
        vertical-align: middle
        margin-right: 1rem
        cursor: pointer
        border-radius: 50%

    .c-quickstart__input--check:checked + &:before
        background: $color-theme url(data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIyNCIgaGVpZ2h0PSIyNCIgdmlld0JveD0iMCAwIDI0IDI0Ij4gICAgPHBhdGggZmlsbD0iI2ZmZiIgZD0iTTkgMTYuMTcybDEwLjU5NC0xMC41OTQgMS40MDYgMS40MDYtMTIgMTItNS41NzgtNS41NzggMS40MDYtMS40MDZ6Ii8+PC9zdmc+)
        background-size: contain
        border-color: $color-theme

.c-quickstart__label__meta
    font-weight: normal
    color: $color-subtle-dark

.c-quickstart__group
    @include breakpoint(min, md)
        display: flex
        flex-flow: row nowrap

    &:not(:last-child)
        margin-bottom: 1rem

.c-quickstart__fields
    flex: 100%

.c-quickstart__legend
    color: $color-subtle-dark
    margin-right: 2rem
    padding-top: 0.75rem
    flex: 1 1 35%
    font-weight: bold

.c-quickstart__line
    display: block

    &:before
        color: $color-theme
        margin-right: 1em
        content: "$"

.c-quickstart__code
    font-size: 1.6rem
website/assets/css/_components/_tooltips.sass (new file, 29 lines)

//- 💫 CSS > COMPONENTS > TOOLTIPS

[data-tooltip]
    position: relative

    @include breakpoint(min, sm)
        &:before
            @include position(absolute, top, left, 125%, 50%)
            display: inline-block
            content: attr(data-tooltip)
            background: $color-front
            border-radius: 2px
            color: $color-back
            font-family: inherit
            font-size: 1.3rem
            line-height: 1.25
            opacity: 0
            padding: 0.5em 0.75em
            transform: translateX(-50%) translateY(-2px)
            transition: opacity 0.1s ease-out, transform 0.1s ease-out
            visibility: hidden
            min-width: 200px
            max-width: 300px
            z-index: 200

        &:hover:before
            opacity: 1
            transform: translateX(-50%) translateY(0)
            visibility: visible
@@ -27,6 +27,7 @@ $font-code: 'Source Code Pro', Consolas, 'Andale Mono', Menlo, Monaco, Courier,
 // Colors

 $colors: ( blue: #09a3d5, red: #d9515d )
+$colors-light: (blue: #cceaf4, red: #f9d7da)

 $color-back: #fff !default
 $color-front: #1a1e23 !default

@@ -34,7 +35,7 @@ $color-dark: lighten($color-front, 20) !default

 $color-theme: map-get($colors, $theme)
 $color-theme-dark: darken(map-get($colors, $theme), 5)
-$color-theme-light: saturate(lighten(map-get($colors, $theme), 35), 5)
+$color-theme-light: map-get($colors-light, $theme)

 $color-subtle: #ddd !default
 $color-subtle-light: #f6f6f6 !default
@@ -32,3 +32,5 @@ $theme: blue !default
 @import _components/navigation
 @import _components/sidebar
 @import _components/tables
+@import _components/tooltips
+@import _components/quickstart
@@ -1,5 +1,16 @@
 <svg style="position: absolute; width: 0; height: 0;" width="0" height="0" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
     <defs>
+        <symbol id="v2alpha" viewBox="0 0 200 111">
+            <title>spaCy v2.0.0 alpha</title>
+            [several <path> elements with the banner artwork and lettering]
+        </symbol>
+
         <symbol id="usersurvey" viewBox="0 0 200 111">
             <title>spaCy user survey 2017</title>
             <path fill="#ddd" d="M183.3 89.2l-164.6-40-1-29.2 164.6 40M3.8 106.8l41.6-1.4-1-29.2-41.6 1.4L13.2 92"/>

(Image size before: 18 KiB, after: 21 KiB)
|
@ -27,5 +27,8 @@
|
||||||
<symbol id="star" viewBox="0 0 24 24">
|
<symbol id="star" viewBox="0 0 24 24">
|
||||||
<path d="M12 17.25l-6.188 3.75 1.641-7.031-5.438-4.734 7.172-0.609 2.813-6.609 2.813 6.609 7.172 0.609-5.438 4.734 1.641 7.031z"></path>
|
<path d="M12 17.25l-6.188 3.75 1.641-7.031-5.438-4.734 7.172-0.609 2.813-6.609 2.813 6.609 7.172 0.609-5.438 4.734 1.641 7.031z"></path>
|
||||||
</symbol>
|
</symbol>
|
||||||
|
<symbol id="help" viewBox="0 0 24 24">
|
||||||
|
<path d="M12 6c2.203 0 3.984 1.781 3.984 3.984 0 2.484-3 2.766-3 5.016h-1.969c0-3.234 3-3 3-5.016 0-1.078-0.938-1.969-2.016-1.969s-2.016 0.891-2.016 1.969h-1.969c0-2.203 1.781-3.984 3.984-3.984zM12 20.016c4.406 0 8.016-3.609 8.016-8.016s-3.609-8.016-8.016-8.016-8.016 3.609-8.016 8.016 3.609 8.016 8.016 8.016zM12 2.016c5.531 0 9.984 4.453 9.984 9.984s-4.453 9.984-9.984 9.984-9.984-4.453-9.984-9.984 4.453-9.984 9.984-9.984zM11.016 18v-2.016h1.969v2.016h-1.969z"/>
|
||||||
|
</symbol>
|
||||||
</defs>
|
</defs>
|
||||||
</svg>
|
</svg>
|
||||||
|
|
Before Width: | Height: | Size: 4.9 KiB After Width: | Height: | Size: 5.4 KiB |
website/assets/js/quickstart.js (new file, 8 lines)

/**
 * quickstart.js
 * A micro-form for user-specific installation instructions
 *
 * @author Ines Montani <ines@ines.io>
 * @version 0.0.1
 * @license MIT
 */
[remainder of the file: the minified ES5 source of the Quickstart class, with init, initGroup, updateGroup, update, updateContainer, createGroup, createStyles and childNodes methods plus small DOM helpers for rendering the option groups and toggling the matching code lines via generated data-qs-* style rules]
@ -1,10 +1,5 @@
|
||||||
//- 💫 DOCS > API > ANNOTATION > DEPENDENCY LABELS
|
//- 💫 DOCS > API > ANNOTATION > DEPENDENCY LABELS
|
||||||
|
|
||||||
+infobox("Tip")
|
|
||||||
| In spaCy v1.8.3+, you can also use #[code spacy.explain()] to get the
|
|
||||||
| description for the string representation of a label. For example,
|
|
||||||
| #[code spacy.explain("prt")] will return "particle".
|
|
||||||
|
|
||||||
+h(3, "dependency-parsing-english") English dependency labels
|
+h(3, "dependency-parsing-english") English dependency labels
|
||||||
|
|
||||||
p
|
p
|
||||||
|
|
|
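The tip removed from these annotation pages documents #[code spacy.explain()]. For reference, a minimal sketch of the behaviour it describes, using the example strings quoted in the hunks themselves:

    import spacy

    # spacy.explain() maps a scheme string to a human-readable description
    # and returns None for unknown strings.
    print(spacy.explain("prt"))       # 'particle'            (dependency label)
    print(spacy.explain("RB"))        # 'adverb'              (part-of-speech tag)
    print(spacy.explain("LANGUAGE"))  # 'any named language'  (entity type)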
@@ -1,10 +1,5 @@
 //- 💫 DOCS > API > ANNOTATION > NAMED ENTITIES

-+infobox("Tip")
-    | In spaCy v1.8.3+, you can also use #[code spacy.explain()] to get the
-    | description for the string representation of an entity label. For example,
-    | #[code spacy.explain("LANGUAGE")] will return "any named language".
-
 +table([ "Type", "Description" ])
     +row
         +cell #[code PERSON]
@@ -1,10 +1,5 @@
 //- 💫 DOCS > API > ANNOTATION > POS TAGS

-+infobox("Tip")
-    | In spaCy v1.8.3+, you can also use #[code spacy.explain()] to get the
-    | description for the string representation of a tag. For example,
-    | #[code spacy.explain("RB")] will return "adverb".
-
 +h(3, "pos-tagging-english") English part-of-speech tag scheme

 p
@@ -103,7 +103,7 @@ p Get a #[code Token] object.
     doc = nlp(u'Give it back! He pleaded.')
     assert doc[0].text == 'Give'
     assert doc[-1].text == '.'
-    span = doc[1:1]
+    span = doc[1:3]
     assert span.text == 'it back'

 +table(["Name", "Type", "Description"])
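For context on the #[code doc[1:1]] to #[code doc[1:3]] fix: slices of a #[code Doc] are end-exclusive, so #[code doc[1:1]] is an empty span. A short sketch:

    import spacy

    nlp = spacy.load('en')
    doc = nlp(u'Give it back! He pleaded.')

    # Doc slicing is end-exclusive, like Python list slicing.
    assert doc[1:3].text == 'it back'   # tokens 1 and 2
    assert len(doc[1:1]) == 0           # empty span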
@@ -272,7 +272,7 @@ p Import the document contents from a binary string.
 p
     | Retokenize the document, such that the span at
     | #[code doc.text[start_idx : end_idx]] is merged into a single token. If
-    | #[code start_idx] and #[end_idx] do not mark start and end token
+    | #[code start_idx] and #[code end_idx] do not mark start and end token
     | boundaries, the document remains unchanged.

 +table(["Name", "Type", "Description"])
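A sketch of the behaviour described above, assuming the v1-style #[code Doc.merge(start_idx, end_idx, tag, lemma, ent_type)] call with character offsets; the text and labels are illustrative:

    import spacy

    nlp = spacy.load('en')
    doc = nlp(u'I like New York in Autumn.')

    # Character offsets that line up with token boundaries are merged into one
    # token; offsets that cut through a token leave the Doc unchanged.
    start_idx = doc.text.index('New')
    end_idx = start_idx + len('New York')
    doc.merge(start_idx, end_idx, u'NNP', u'New York', u'GPE')
    assert doc[2].text == 'New York'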
@@ -67,6 +67,16 @@ p An individual token — i.e. a word, punctuation symbol, whitespace, etc.
         +cell unicode
         +cell Base form of the word, with no inflectional suffixes.

+    +row
+        +cell #[code orth]
+        +cell int
+        +cell word's string.
+
+    +row
+        +cell #[code orth_]
+        +cell unicode
+        +cell word's string.
+
     +row
         +cell #[code lower]
         +cell int
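For reference on the new #[code orth] / #[code orth_] rows: the underscored variant is the unicode view of the attribute, the plain one is its integer ID. A small sketch:

    import spacy

    nlp = spacy.load('en')
    token = nlp(u'Give it back!')[0]

    # orth_ is the verbatim string; orth is the corresponding integer ID
    # in the vocabulary's string store.
    assert token.orth_ == token.text == 'Give'
    assert nlp.vocab.strings[token.orth] == 'Give'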
@@ -238,11 +248,6 @@ p An individual token — i.e. a word, punctuation symbol, whitespace, etc.
         +cell #[code text_with_ws]
         +cell unicode
         +cell Text content, with trailing space character if present.

-    +row
-        +cell #[code whitespace]
-        +cell int
-        +cell Trailing space character if present.
-
     +row
         +cell #[code whitespace_]
         +cell unicode
@@ -124,7 +124,7 @@ p
     +cell #[code Lexeme]
     +cell The lexeme indicated by the given ID.

-+h(2, "iter") Span.__iter__
++h(2, "iter") Vocab.__iter__
     +tag method

 p Iterate over the lexemes in the vocabulary.
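A minimal sketch of the iteration documented here; iterating over the vocab yields #[code Lexeme] objects (the attributes used are illustrative):

    import spacy

    nlp = spacy.load('en')

    # Vocab.__iter__ yields Lexeme objects, so the vocab can be filtered directly.
    lower_case_words = [lex.orth_ for lex in nlp.vocab if lex.is_alpha and lex.is_lower]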
@@ -33,6 +33,7 @@

     "index": {
         "title": "Install spaCy",
+        "quickstart": true,
         "next": "models"
     },

@@ -25,3 +25,4 @@ p
     +model-row("en_vectors_glove_md", "English", [1, 0, 0, 1], "727 MB", "CC BY-SA")
     +model-row("de_core_news_md", "German", [1, 1, 1, 1], "645 MB", "CC BY-SA", true, true)
     +model-row("fr_depvec_web_lg", "French", [1, 1, 0, 1], "1.33 GB", "CC BY-NC", true, true)
+    +model-row("es_core_web_md", "Spanish", [1, 1, 1, 1], "377 MB", "CC BY-SA", true, true)
@@ -113,7 +113,7 @@ p
         else:
             tokens.append(substring)
             substring = ''
-    tokens.extend(suffixes)
+    tokens.extend(reversed(suffixes))
     return tokens

 p
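Why #[code reversed(suffixes)]: in the tokenizer pseudo-code above, suffixes are peeled off the end of the substring one at a time, so they are collected in reverse reading order. A standalone illustration (not the spaCy internals):

    # Suffix characters are collected back-to-front, so they must be
    # reversed before extending the token list.
    substring = 'world!"'
    suffixes = []
    while substring and substring[-1] in ('!', '"'):
        suffixes.append(substring[-1])   # collects ['"', '!']
        substring = substring[:-1]

    tokens = [substring]
    tokens.extend(reversed(suffixes))    # restores reading order
    assert tokens == ['world', '!', '"']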
@@ -214,7 +214,7 @@ p
     def __call__(self, text):
         words = text.split(' ')
         # All tokens 'own' a subsequent space character in this tokenizer
-        spaces = [True] * len(word)
+        spaces = [True] * len(words)
         return Doc(self.vocab, words=words, spaces=spaces)

 p
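For context, a fuller sketch of the whitespace tokenizer this hunk fixes; the class name and wiring are hypothetical, but the #[code Doc(words=..., spaces=...)] call matches the snippet above:

    from spacy.tokens import Doc

    class WhitespaceTokenizer(object):
        """Hypothetical example tokenizer: every token 'owns' a following space."""
        def __init__(self, vocab):
            self.vocab = vocab

        def __call__(self, text):
            words = text.split(' ')
            # One flag per word - hence len(words), not len(word).
            spaces = [True] * len(words)
            return Doc(self.vocab, words=words, spaces=spaces)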
@@ -36,7 +36,7 @@ p
     | to #[code spacy.load()]. The function should take a
     | #[code spacy.language.Language] object as its only argument, and return
     | a sequence of callables. Each callable should accept a
-    | #[+api("docs") #[code Doc]] object, modify it in place, and return
+    | #[+api("doc") #[code Doc]] object, modify it in place, and return
     | #[code None].

 p
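A minimal sketch of the factory shape described above; #[code my_pipeline] and #[code count_tokens] are made-up names, and how the factory is passed to #[code spacy.load()] is covered by the surrounding page:

    from collections import Counter

    doc_lengths = Counter()

    def count_tokens(doc):
        # Each component receives a Doc, may modify it in place, and returns None.
        # This toy component only records statistics.
        doc_lengths[len(doc)] += 1

    def my_pipeline(nlp):
        # The factory takes the Language object and returns the sequence of
        # callables that will be applied to every Doc, in order.
        return [nlp.tagger, nlp.parser, count_tokens]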
@@ -12,6 +12,40 @@ p
     | #[a(href="#source-ubuntu") Ubuntu], #[a(href="#source-osx") macOS/OS X]
     | and #[a(href="#source-windows") Windows] for details.

++quickstart(QUICKSTART, "Quickstart")
+    +qs({config: 'venv', python: 2}) python -m pip install -U virtualenv
+    +qs({config: 'venv', python: 3}) python -m pip install -U venv
+    +qs({config: 'venv', python: 2}) virtualenv .env
+    +qs({config: 'venv', python: 3}) venv .env
+    +qs({config: 'venv', os: 'mac'}) source .env/bin/activate
+    +qs({config: 'venv', os: 'linux'}) source .env/bin/activate
+    +qs({config: 'venv', os: 'windows'}) .env\Scripts\activate
+
+    +qs({package: 'pip'}) pip install -U spacy
+
+    +qs({package: 'conda'}) conda config --add channels conda-forge
+    +qs({package: 'conda'}) conda install spacy
+
+    +qs({package: 'source'}) git clone https://github.com/explosion/spaCy
+    +qs({package: 'source'}) cd spaCy
+    +qs({package: 'source'}) pip install -r requirements.txt
+    +qs({package: 'source'}) pip install -e .
+
+    +qs({model: 'en'}) python -m spacy download en
+    +qs({model: 'de'}) python -m spacy download de
+    +qs({model: 'fr'}) python -m spacy download fr
+    +qs({model: 'es'}) python -m spacy download es
+
++h(2, "installation") Installation instructions
+
++h(3, "pip") pip
+    +badge("pipy")
+
+p Using pip, spaCy releases are currently only available as source packages.
+
++code(false, "bash").
+    pip install -U spacy
+
 +aside("Download models")
     | After installation you need to download a language model. For more info
     | and available models, see the #[+a("/docs/usage/models") docs on models].
|
@@ -22,14 +56,6 @@ p
     >>> import spacy
     >>> nlp = spacy.load('en')

-+h(2, "pip") pip
-    +badge("pipy")
-
-p Using pip, spaCy releases are currently only available as source packages.
-
-+code(false, "bash").
-    pip install -U spacy
-
 p
     | When using pip it is generally recommended to install packages in a
     | #[code virtualenv] to avoid modifying system state:
|
||||||
source .env/bin/activate
|
source .env/bin/activate
|
||||||
pip install spacy
|
pip install spacy
|
||||||
|
|
||||||
+h(2, "conda") conda
|
+h(3, "conda") conda
|
||||||
+badge("conda")
|
+badge("conda")
|
||||||
|
|
||||||
p
|
p
|
||||||
|
|
|
@@ -17,10 +17,10 @@ p
     | trying to do.

 +code.
     import spacy                          # See "Installing spaCy"
     nlp = spacy.load('en')                # You are here.
     doc = nlp(u'Hello, spacy!')           # See "Using the pipeline"
-    print((w.text, w.pos_) for w in doc)  # See "Doc, Span and Token"
+    print([(w.text, w.pos_) for w in doc])  # See "Doc, Span and Token"

 +aside("Why do we have to preload?")
     | Loading the models takes ~200x longer than
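For context on the #[code print()] change: a generator expression passed to #[code print()] just prints the generator object, while a list comprehension prints the actual pairs. Roughly, with output abbreviated and tags illustrative:

    # nlp = spacy.load('en') as in the snippet above
    doc = nlp(u'Hello, spacy!')

    print((w.text, w.pos_) for w in doc)    # <generator object <genexpr> at 0x...>
    print([(w.text, w.pos_) for w in doc])  # [('Hello', 'INTJ'), (',', 'PUNCT'), ...]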
@@ -83,7 +83,7 @@ p
 +h(2, "examples-word-vectors") Word vectors

 +code.
-    doc = nlp("Apples and oranges are similar. Boots and hippos aren't.")
+    doc = nlp(u"Apples and oranges are similar. Boots and hippos aren't.")

     apples = doc[0]
     oranges = doc[2]
@@ -148,24 +148,20 @@ p

 +code.
     def put_spans_around_tokens(doc, get_classes):
-        '''Given some function to compute class names, put each token in a
-        span element, with the appropriate classes computed.
-
-        All whitespace is preserved, outside of the spans. (Yes, I know HTML
-        won't display it. But the point is no information is lost, so you can
-        calculate what you need, e.g. <br /> tags, <p> tags, etc.)
-        '''
+        """Given some function to compute class names, put each token in a
+        span element, with the appropriate classes computed. All whitespace is
+        preserved, outside of the spans. (Of course, HTML won't display more than
+        one whitespace character it – but the point is, no information is lost
+        and you can calculate what you need, e.g. <br />, <p> etc.)
+        """
         output = []
-        template = '<span classes="{classes}">{word}</span>{space}'
+        html = '<span class="{classes}">{word}</span>{space}'
         for token in doc:
             if token.is_space:
-                output.append(token.orth_)
+                output.append(token.text)
             else:
-                output.append(
-                    template.format(
-                        classes=' '.join(get_classes(token)),
-                        word=token.orth_,
-                        space=token.whitespace_))
+                classes = ' '.join(get_classes(token))
+                output.append(html.format(classes=classes, word=token.text, space=token.whitespace_))
         string = ''.join(output)
         string = string.replace('\n', '')
         string = string.replace('\t', ' ')
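A possible usage sketch for the rewritten helper; #[code get_classes] here is a made-up classifier and the printed markup is abbreviated:

    def get_classes(token):
        # Made-up classifier: label punctuation and stop words for styling.
        classes = []
        if token.is_punct:
            classes.append('punct')
        if token.is_stop:
            classes.append('stop')
        return classes or ['token']

    doc = nlp(u'This is a test.')
    markup = put_spans_around_tokens(doc, get_classes)
    print(markup)  # <span class="...">This</span> <span class="...">is</span> ...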
@@ -203,7 +203,7 @@ p
 p
     | If you've trained your own model, for example for
     | #[+a("/docs/usage/adding-languages") additional languages] or
-    | #[+a("/docs/usage/train-ner") custom named entities], you can save its
+    | #[+a("/docs/usage/training-ner") custom named entities], you can save its
     | state using the #[code Language.save_to_directory()] method. To make the
     | model more convenient to deploy, we recommend wrapping it as a Python
     | package.
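A minimal sketch of the save step the paragraph refers to; the directory path is a placeholder:

    # Persist the pipeline's current state, including any custom training.
    # The saved directory can then be wrapped as a Python package.
    nlp.save_to_directory('/path/to/my_model')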
@@ -19,11 +19,11 @@ p Here's a minimal example. We first add a pattern that specifies three tokens:
 p
     | Once we've added the pattern, we can use the #[code matcher] as a
     | callable, to receive a list of #[code (ent_id, start, end)] tuples.
-    | Note that #[code LOWER] and #[code IS_PUNCT] are data attributes
-    | of #[code spacy.attrs].

 +code.
     from spacy.matcher import Matcher
+    from spacy.attrs import IS_PUNCT, LOWER

     matcher = Matcher(nlp.vocab)
     matcher.add_pattern("HelloWorld", [{LOWER: "hello"}, {IS_PUNCT: True}, {LOWER: "world"}])
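Following on from the pattern above, a sketch of calling the matcher; the unpacking follows the #[code (ent_id, start, end)] shape described in the text:

    doc = nlp(u'Hello, world!')

    # The pattern [{LOWER: "hello"}, {IS_PUNCT: True}, {LOWER: "world"}] matches here.
    for ent_id, start, end in matcher(doc):
        span = doc[start : end]
        print(span.text)   # 'Hello, world'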
@@ -28,7 +28,7 @@ p
     | and walk you through generating the meta data. You can also create the
     | meta.json manually and place it in the model data directory, or supply a
     | path to it using the #[code --meta] flag. For more info on this, see the
-    | #[+a("/docs/usage/cli/#package") #[code package] command] documentation.
+    | #[+a("/docs/usage/cli#package") #[code package] command] documentation.

 +aside-code("meta.json", "json").
     {
@@ -150,8 +150,8 @@ p
     for itn in range(20):
         random.shuffle(train_data)
         for raw_text, entity_offsets in train_data:
-            gold = GoldParse(doc, entities=entity_offsets)
             doc = nlp.make_doc(raw_text)
+            gold = GoldParse(doc, entities=entity_offsets)
             nlp.tagger(doc)
             loss = nlp.entity.update(doc, gold)
     nlp.end_training()
@@ -11,7 +11,7 @@ include _includes/_mixins
         h2.c-landing__title.o-block.u-heading-1
             | in Python

-    +landing-badge("https://survey.spacy.io", "usersurvey", "Take the user survey!")
+    +landing-badge(gh("spaCy") + "/releases/tag/v2.0.0-alpha", "v2alpha", "Try spaCy v2.0.0 alpha!")

 +grid.o-content
     +grid-col("third").o-card