mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 17:24:41 +03:00
Fix merge conflict in printer
This commit is contained in:
commit
94267ec50f
1
.appveyor.yml
Normal file
1
.appveyor.yml
Normal file
|
@ -0,0 +1 @@
|
|||
build: off
|
96
.gitignore
vendored
96
.gitignore
vendored
|
@ -1,50 +1,45 @@
|
|||
# Vim
|
||||
*.swp
|
||||
*.sw*
|
||||
Profile.prof
|
||||
tmp/
|
||||
.dev
|
||||
.denv
|
||||
.pypyenv
|
||||
.eggs
|
||||
*.tgz
|
||||
.sass-cache
|
||||
.python-version
|
||||
|
||||
MANIFEST
|
||||
|
||||
# spaCy
|
||||
spacy/data/
|
||||
corpora/
|
||||
models/
|
||||
keys/
|
||||
|
||||
spacy/syntax/*.cpp
|
||||
spacy/syntax/*.html
|
||||
spacy/en/*.cpp
|
||||
spacy/tokens/*.cpp
|
||||
spacy/serialize/*.cpp
|
||||
spacy/en/data/*
|
||||
spacy/*.cpp
|
||||
spacy/ner/*.cpp
|
||||
spacy/orthography/*.cpp
|
||||
ext/murmurhash.cpp
|
||||
ext/sparsehash.cpp
|
||||
# Website
|
||||
website/www/
|
||||
website/_deploy.sh
|
||||
website/package.json
|
||||
website/announcement.jade
|
||||
website/.gitignore
|
||||
|
||||
/spacy/data/
|
||||
|
||||
_build/
|
||||
.env/
|
||||
tmp/
|
||||
# Cython / C extensions
|
||||
cythonize.json
|
||||
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
|
||||
# C extensions
|
||||
spacy/*.html
|
||||
*.cpp
|
||||
*.so
|
||||
|
||||
# Distribution / packaging
|
||||
# Vim / VSCode / editors
|
||||
*.swp
|
||||
*.sw*
|
||||
Profile.prof
|
||||
.vscode
|
||||
.sass-cache
|
||||
|
||||
# Python
|
||||
.Python
|
||||
.python-version
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
.env/
|
||||
.env2/
|
||||
.env3/
|
||||
.~env/
|
||||
.venv
|
||||
venv/
|
||||
.dev
|
||||
.denv
|
||||
.pypyenv
|
||||
|
||||
# Distribution / packaging
|
||||
env/
|
||||
bin/
|
||||
build/
|
||||
|
@ -59,6 +54,12 @@ var/
|
|||
*.egg-info/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
.eggs
|
||||
MANIFEST
|
||||
|
||||
# Temporary files
|
||||
*.~*
|
||||
tmp/
|
||||
|
||||
# Installer logs
|
||||
pip-log.txt
|
||||
|
@ -87,25 +88,16 @@ coverage.xml
|
|||
*.log
|
||||
*.pot
|
||||
|
||||
# Windows local helper files
|
||||
# Windows
|
||||
*.bat
|
||||
Thumbs.db
|
||||
Desktop.ini
|
||||
|
||||
# Mac OS X
|
||||
*.DS_Store
|
||||
|
||||
# Temporary files / Dropbox hack
|
||||
*.~*
|
||||
|
||||
# Komodo project files
|
||||
*.komodoproject
|
||||
|
||||
# Website
|
||||
website/_deploy.sh
|
||||
website/package.json
|
||||
website/announcement.jade
|
||||
website/www/
|
||||
website/.gitignore
|
||||
|
||||
# Python virtualenv
|
||||
venv
|
||||
venv/*
|
||||
# Other
|
||||
*.tgz
|
||||
|
|
|
@ -16,6 +16,7 @@ This is a list of everyone who has made significant contributions to spaCy, in a
|
|||
* Daniel Vila Suero, [@dvsrepo](https://github.com/dvsrepo)
|
||||
* Dmytro Sadovnychyi, [@sadovnychyi](https://github.com/sadovnychyi)
|
||||
* Eric Zhao, [@ericzhao28](https://github.com/ericzhao28)
|
||||
* Francisco Aranda, [@frascuchon](https://github.com/frascuchon)
|
||||
* Greg Baker, [@solresol](https://github.com/solresol)
|
||||
* Grégory Howard, [@Gregory-Howard](https://github.com/Gregory-Howard)
|
||||
* György Orosz, [@oroszgy](https://github.com/oroszgy)
|
||||
|
@ -24,6 +25,7 @@ This is a list of everyone who has made significant contributions to spaCy, in a
|
|||
* Ines Montani, [@ines](https://github.com/ines)
|
||||
* J Nicolas Schrading, [@NSchrading](https://github.com/NSchrading)
|
||||
* Janneke van der Zwaan, [@jvdzwaan](https://github.com/jvdzwaan)
|
||||
* Jim Regan, [@jimregan](https://github.com/jimregan)
|
||||
* Jordan Suchow, [@suchow](https://github.com/suchow)
|
||||
* Josh Reeter, [@jreeter](https://github.com/jreeter)
|
||||
* Juan Miguel Cejuela, [@juanmirocks](https://github.com/juanmirocks)
|
||||
|
|
12
README.rst
12
README.rst
|
@ -4,18 +4,22 @@ spaCy: Industrial-strength NLP
|
|||
spaCy is a library for advanced natural language processing in Python and
|
||||
Cython. spaCy is built on the very latest research, but it isn't researchware.
|
||||
It was designed from day one to be used in real products. spaCy currently supports
|
||||
English, German and French, as well as tokenization for Spanish, Italian,
|
||||
English, German, French and Spanish, as well as tokenization for Italian,
|
||||
Portuguese, Dutch, Swedish, Finnish, Norwegian, Hungarian, Bengali, Hebrew,
|
||||
Chinese and Japanese. It's commercial open-source software, released under the
|
||||
MIT license.
|
||||
|
||||
📊 **Help us improve the library!** `Take the spaCy user survey <https://survey.spacy.io>`_.
|
||||
⭐️ **Test spaCy v2.0.0 alpha and the new models!** `Read the release notes here. <https://github.com/explosion/spaCy/releases/tag/v2.0.0-alpha>`_
|
||||
|
||||
💫 **Version 1.8 out now!** `Read the release notes here. <https://github.com/explosion/spaCy/releases/>`_
|
||||
|
||||
.. image:: https://img.shields.io/travis/explosion/spaCy/master.svg?style=flat-square
|
||||
:target: https://travis-ci.org/explosion/spaCy
|
||||
:alt: Build Status
|
||||
:alt: Travis Build Status
|
||||
|
||||
.. image:: https://img.shields.io/appveyor/ci/explosion/spacy/master.svg?style=flat-square
|
||||
:target: https://ci.appveyor.com/project/explosion/spacy
|
||||
:alt: Appveyor Build Status
|
||||
|
||||
.. image:: https://img.shields.io/github/release/explosion/spacy.svg?style=flat-square
|
||||
:target: https://github.com/explosion/spaCy/releases
|
||||
|
@ -85,7 +89,7 @@ Features
|
|||
* GIL-free **multi-threading**
|
||||
* Efficient binary serialization
|
||||
* Easy **deep learning** integration
|
||||
* Statistical models for **English** and **German**
|
||||
* Statistical models for **English**, **German**, **French** and **Spanish**
|
||||
* State-of-the-art speed
|
||||
* Robust, rigorously evaluated accuracy
|
||||
|
||||
|
|
|
@ -52,6 +52,7 @@ def train_ner(nlp, train_data, output_dir):
|
|||
random.shuffle(train_data)
|
||||
loss = 0.
|
||||
for raw_text, entity_offsets in train_data:
|
||||
doc = nlp.make_doc(raw_text)
|
||||
gold = GoldParse(doc, entities=entity_offsets)
|
||||
# By default, the GoldParse class assumes that the entities
|
||||
# described by offset are complete, and all other words should
|
||||
|
@ -63,7 +64,6 @@ def train_ner(nlp, train_data, output_dir):
|
|||
#for i in range(len(gold.ner)):
|
||||
#if not gold.ner[i].endswith('ANIMAL'):
|
||||
# gold.ner[i] = '-'
|
||||
doc = nlp.make_doc(raw_text)
|
||||
nlp.tagger(doc)
|
||||
# As of 1.9, spaCy's parser now lets you supply a dropout probability
|
||||
# This might help the model generalize better from only a few
|
||||
|
|
|
@ -7,9 +7,11 @@ thinc>=6.5.0,<6.6.0
|
|||
murmurhash>=0.26,<0.27
|
||||
plac<1.0.0,>=0.9.6
|
||||
six
|
||||
html5lib==1.0b8
|
||||
ujson>=1.35
|
||||
dill>=0.2,<0.3
|
||||
requests>=2.13.0,<3.0.0
|
||||
regex==2017.4.5
|
||||
ftfy>=4.4.2,<5.0.0
|
||||
pytest>=3.0.6,<4.0.0
|
||||
pip>=9.0.0,<10.0.0
|
||||
|
|
1
setup.py
1
setup.py
|
@ -197,6 +197,7 @@ def setup_package():
|
|||
'preshed>=1.0.0,<2.0.0',
|
||||
'thinc>=6.5.0,<6.6.0',
|
||||
'plac<1.0.0,>=0.9.6',
|
||||
'pip>=9.0.0,<10.0.0',
|
||||
'six',
|
||||
'pathlib',
|
||||
'ujson>=1.35',
|
||||
|
|
|
@ -10,7 +10,7 @@ __author__ = 'Matthew Honnibal'
|
|||
__email__ = 'matt@explosion.ai'
|
||||
__license__ = 'MIT'
|
||||
|
||||
__docs__ = 'https://spacy.io/docs/usage'
|
||||
__docs_models__ = 'https://spacy.io/docs/usage'
|
||||
__download_url__ = 'https://github.com/explosion/spacy-models/releases/download'
|
||||
__compatibility__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json'
|
||||
__shortcuts__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/shortcuts.json'
|
||||
|
|
|
@ -79,5 +79,5 @@ def check_error_depr(model):
|
|||
"As of v1.7.0, the download all command is deprecated. Please "
|
||||
"download the models individually via spacy.download [model name] "
|
||||
"or pip install. For more info on this, see the documentation: "
|
||||
"{d}".format(d=about.__docs__),
|
||||
"{d}".format(d=about.__docs_models__),
|
||||
title="Deprecated command")
|
||||
|
|
|
@ -47,7 +47,7 @@ def package(input_dir, output_dir, meta_path, force):
|
|||
|
||||
def check_dirs(input_path, output_path, meta_path):
|
||||
if not input_path.exists():
|
||||
util.sys_exit(unicode_(input_path.as_poisx), title="Model directory not found")
|
||||
util.sys_exit(unicode_(input_path.as_posix()), title="Model directory not found")
|
||||
if not output_path.exists():
|
||||
util.sys_exit(unicode_(output_path), title="Output directory not found")
|
||||
if meta_path and not meta_path.exists():
|
||||
|
|
|
@ -146,7 +146,7 @@ class ModelDownload():
|
|||
"The spacy.{l}.download command is now deprecated. Please use "
|
||||
"python -m spacy download [model name or shortcut] instead. For more "
|
||||
"info and available models, see the documentation: {d}. "
|
||||
"Downloading default '{l}' model now...".format(d=about.__docs__, l=lang),
|
||||
"Downloading default '{l}' model now...".format(d=about.__docs_models__, l=lang),
|
||||
title="Warning: deprecated command")
|
||||
download(lang)
|
||||
|
||||
|
|
|
@ -178,7 +178,7 @@ for word in ["who", "what", "when", "where", "why", "how", "there", "that"]:
|
|||
|
||||
EXC[orth + "ve"] = [
|
||||
{ORTH: orth, LEMMA: word},
|
||||
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}
|
||||
{ORTH: "ve", LEMMA: "have", TAG: "VB"}
|
||||
]
|
||||
|
||||
EXC[orth + "'d"] = [
|
||||
|
|
|
@ -6,36 +6,6 @@ from ..language_data import PRON_LEMMA, DET_LEMMA
|
|||
|
||||
|
||||
TOKENIZER_EXCEPTIONS = {
|
||||
"al": [
|
||||
{ORTH: "a", LEMMA: "a", TAG: ADP},
|
||||
{ORTH: "el", LEMMA: "el", TAG: DET}
|
||||
],
|
||||
|
||||
"consigo": [
|
||||
{ORTH: "con", LEMMA: "con"},
|
||||
{ORTH: "sigo", LEMMA: PRON_LEMMA, NORM: "sí"}
|
||||
],
|
||||
|
||||
"conmigo": [
|
||||
{ORTH: "con", LEMMA: "con"},
|
||||
{ORTH: "migo", LEMMA: PRON_LEMMA, NORM: "mí"}
|
||||
],
|
||||
|
||||
"contigo": [
|
||||
{ORTH: "con", LEMMA: "con"},
|
||||
{ORTH: "tigo", LEMMA: PRON_LEMMA, NORM: "ti"}
|
||||
],
|
||||
|
||||
"del": [
|
||||
{ORTH: "de", LEMMA: "de", TAG: ADP},
|
||||
{ORTH: "l", LEMMA: "el", TAG: DET}
|
||||
],
|
||||
|
||||
"pel": [
|
||||
{ORTH: "pe", LEMMA: "per", TAG: ADP},
|
||||
{ORTH: "l", LEMMA: "el", TAG: DET}
|
||||
],
|
||||
|
||||
"pal": [
|
||||
{ORTH: "pa", LEMMA: "para"},
|
||||
{ORTH: "l", LEMMA: DET_LEMMA, NORM: "el"}
|
||||
|
@ -43,7 +13,7 @@ TOKENIZER_EXCEPTIONS = {
|
|||
|
||||
"pala": [
|
||||
{ORTH: "pa", LEMMA: "para"},
|
||||
{ORTH: "la", LEMMA: DET_LEMMA}
|
||||
{ORTH: "la", LEMMA: DET_LEMMA, NORM: "la"}
|
||||
],
|
||||
|
||||
"aprox.": [
|
||||
|
|
|
@ -3,21 +3,39 @@ from __future__ import unicode_literals, print_function
|
|||
|
||||
from os import path
|
||||
|
||||
from ..language import Language
|
||||
from ..language import Language, BaseDefaults
|
||||
from ..tokenizer import Tokenizer
|
||||
from ..attrs import LANG
|
||||
from ..tokens import Doc
|
||||
|
||||
from .language_data import *
|
||||
|
||||
|
||||
class Japanese(Language):
|
||||
lang = 'ja'
|
||||
|
||||
def make_doc(self, text):
|
||||
class JapaneseTokenizer(object):
    """Tokenizer for Japanese text backed by the Janome library.

    Calling an instance on a string returns a ``Doc`` whose words are the
    surface forms Janome produces; every token is created with
    ``spaces=False``.
    """

    def __init__(self, cls, nlp=None):
        """Set up the vocab and the underlying Janome tokenizer.

        cls: Defaults class; its ``create_vocab`` is used when ``nlp`` is None.
        nlp: Optional language object whose ``vocab`` is reused.
        Raises ImportError when Janome cannot be imported.
        """
        self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
        try:
            from janome.tokenizer import Tokenizer
        except ImportError:
            raise ImportError("The Japanese tokenizer requires the Janome library: "
                              "https://github.com/mocobeta/janome")
        # Instantiate Janome once here; __call__ reuses this instance.
        self.tokenizer = Tokenizer()

    def __call__(self, text):
        """Tokenize ``text`` and return the resulting Doc."""
        words = [x.surface for x in self.tokenizer.tokenize(text)]
        return Doc(self.vocab, words=words, spaces=[False]*len(words))
|
||||
|
||||
class JapaneseDefaults(BaseDefaults):
    """Language defaults that swap in the Janome-backed tokenizer."""

    @classmethod
    def create_tokenizer(cls, nlp=None):
        """Return a JapaneseTokenizer built from this class and ``nlp``."""
        tokenizer = JapaneseTokenizer(cls, nlp)
        return tokenizer
|
||||
|
||||
class Japanese(Language):
    """Japanese language class; tokenization is configured via JapaneseDefaults."""
    lang = 'ja'

    Defaults = JapaneseDefaults

    def make_doc(self, text):
        """Turn raw ``text`` into a Doc.

        JapaneseTokenizer.__call__ already returns a fully built Doc, so it
        is returned as-is rather than being passed as ``words=`` to a second
        Doc constructor.
        """
        # NOTE(review): assumes self.tokenizer is the object produced by
        # Defaults.create_tokenizer -- confirm in Language.__init__.
        return self.tokenizer(text)
|
||||
|
||||
|
||||
|
|
|
@ -24,6 +24,7 @@ from .attrs cimport IS_QUOTE
|
|||
from .attrs cimport IS_LEFT_PUNCT
|
||||
from .attrs cimport IS_RIGHT_PUNCT
|
||||
from .attrs cimport IS_OOV
|
||||
from . import about
|
||||
|
||||
|
||||
memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
|
||||
|
@ -137,11 +138,10 @@ cdef class Lexeme:
|
|||
cdef int length = self.vocab.vectors_length
|
||||
if length == 0:
|
||||
raise ValueError(
|
||||
"Word vectors set to length 0. This may be because the "
|
||||
"data is not installed. If you haven't already, run"
|
||||
"\npython -m spacy download %s\n"
|
||||
"to install the data." % self.vocab.lang
|
||||
)
|
||||
"Word vectors set to length 0. This may be because you "
|
||||
"don't have a model installed or loaded, or because your "
|
||||
"model doesn't include word vectors. For more info, see "
|
||||
"the documentation: \n%s\n" % about.__docs_models__)
|
||||
|
||||
vector_view = <float[:length,]>self.c.vector
|
||||
return numpy.asarray(vector_view)
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ..parts_of_speech cimport NOUN, PROPN, PRON
|
||||
from ..parts_of_speech cimport NOUN, PROPN, PRON, VERB, AUX
|
||||
|
||||
|
||||
def english_noun_chunks(obj):
|
||||
|
@ -66,4 +66,55 @@ def german_noun_chunks(obj):
|
|||
yield word.left_edge.i, rbracket, np_label
|
||||
|
||||
|
||||
CHUNKERS = {'en': english_noun_chunks, 'de': german_noun_chunks}
|
||||
def es_noun_chunks(obj):
    """Detect base noun phrases in a Spanish dependency parse.

    obj: Object exposing ``.doc``, the parsed document to scan.
    Yields (start, end, label) tuples: the index of the chunk's first token,
    the index one past its last token, and the ID of the 'NP' string.
    """
    doc = obj.doc
    # An empty document has no doc[0] to start from, so there is nothing to yield.
    if len(doc) == 0:
        return

    np_label = doc.vocab.strings['NP']

    left_labels = ['det', 'fixed', 'neg'] #['nunmod', 'det', 'appos', 'fixed']
    right_labels = ['flat', 'fixed', 'compound', 'neg']
    stop_labels = ['punct']

    np_left_deps = [doc.vocab.strings[label] for label in left_labels]
    np_right_deps = [doc.vocab.strings[label] for label in right_labels]
    stop_deps = [doc.vocab.strings[label] for label in stop_labels]

    def next_token(token):
        """Return the token following ``token``, or None at the end of the doc."""
        try:
            return token.nbor()
        except IndexError:
            # NOTE(review): nbor() is expected to raise IndexError past the
            # last token -- confirm against Token.nbor.
            return None

    def noun_bounds(root):
        """Return the (leftmost, rightmost) tokens of the chunk headed by ``root``."""

        def is_verb_token(token):
            return token.pos in [VERB, AUX]

        left_bound = root
        # Walk left children from nearest to farthest; the last match wins,
        # i.e. the farthest qualifying left dependent.
        for token in reversed(list(root.lefts)):
            if token.dep in np_left_deps:
                left_bound = token

        right_bound = root
        for token in root.rights:
            if token.dep in np_right_deps:
                _, right = noun_bounds(token)

                # Stop extending once a verb or stop dependency lies between
                # the left edge and the candidate right edge.
                if any(is_verb_token(t) or t.dep in stop_deps
                       for t in doc[left_bound.i: right.i]):
                    break
                else:
                    right_bound = right

        return left_bound, right_bound

    token = doc[0]
    # Compare against None explicitly so the loop does not depend on the
    # truthiness of a token object.
    while token is not None and token.i < len(doc):
        if token.pos in [PROPN, NOUN, PRON]:
            left, right = noun_bounds(token)
            yield left.i, right.i+1, np_label
            # Resume scanning after the chunk that was just emitted.
            token = right
        token = next_token(token)
|
||||
|
||||
|
||||
CHUNKERS = {'en': english_noun_chunks, 'de': german_noun_chunks, 'es': es_noun_chunks}
|
||||
|
|
|
@ -5,6 +5,7 @@ from ..en import English
|
|||
from ..de import German
|
||||
from ..es import Spanish
|
||||
from ..it import Italian
|
||||
from ..ja import Japanese
|
||||
from ..fr import French
|
||||
from ..pt import Portuguese
|
||||
from ..nl import Dutch
|
||||
|
@ -26,7 +27,7 @@ from pathlib import Path
|
|||
import os
|
||||
import pytest
|
||||
|
||||
|
||||
# These languages get run through generic tokenizer tests
|
||||
LANGUAGES = [English, German, Spanish, Italian, French, Portuguese, Dutch,
|
||||
Swedish, Hungarian, Finnish, Bengali, Norwegian]
|
||||
|
||||
|
@ -76,6 +77,12 @@ def fi_tokenizer():
|
|||
return Finnish.Defaults.create_tokenizer()
|
||||
|
||||
|
||||
@pytest.fixture
def ja_tokenizer():
    """Japanese tokenizer fixture; the requesting test is skipped when
    ``janome`` cannot be imported."""
    # Only the skip side effect is needed; the imported module is not used.
    pytest.importorskip("janome")
    return Japanese.Defaults.create_tokenizer()
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def sv_tokenizer():
|
||||
return Swedish.Defaults.create_tokenizer()
|
||||
|
|
|
@ -217,10 +217,13 @@ def test_doc_api_has_vector(en_tokenizer, text_file, text, vectors):
|
|||
assert doc.has_vector
|
||||
|
||||
|
||||
def test_parse_tree(EN):
|
||||
def test_parse_tree(en_tokenizer):
|
||||
"""Tests doc.print_tree() method."""
|
||||
text = 'I like New York in Autumn.'
|
||||
doc = EN(text, tag=True)
|
||||
doc.from_array([HEAD], numpy.asarray([[1, 0, 1, -2, -3, -1, -5]], dtype='int32').T)
|
||||
heads = [1, 0, 1, -2, -3, -1, -5]
|
||||
tags = ['PRP', 'IN', 'NNP', 'NNP', 'IN', 'NNP', '.']
|
||||
tokens = en_tokenizer(text)
|
||||
doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads, tags=tags)
|
||||
# full method parse_tree(text) is a trivial composition
|
||||
trees = doc.print_tree()
|
||||
assert len(trees) > 0
|
||||
|
|
0
spacy/tests/ja/__init__.py
Normal file
0
spacy/tests/ja/__init__.py
Normal file
17
spacy/tests/ja/test_tokenizer.py
Normal file
17
spacy/tests/ja/test_tokenizer.py
Normal file
|
@ -0,0 +1,17 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
|
||||
TOKENIZER_TESTS = [
|
||||
("日本語だよ", ['日本語', 'だ', 'よ']),
|
||||
("東京タワーの近くに住んでいます。", ['東京', 'タワー', 'の', '近く', 'に', '住ん', 'で', 'い', 'ます', '。']),
|
||||
("吾輩は猫である。", ['吾輩', 'は', '猫', 'で', 'ある', '。']),
|
||||
("月に代わって、お仕置きよ!", ['月', 'に', '代わっ', 'て', '、', 'お仕置き', 'よ', '!']),
|
||||
("すもももももももものうち", ['すもも', 'も', 'もも', 'も', 'もも', 'の', 'うち'])
|
||||
]
|
||||
|
||||
@pytest.mark.parametrize('text,expected_tokens', TOKENIZER_TESTS)
def test_japanese_tokenizer(ja_tokenizer, text, expected_tokens):
    """The token texts produced for ``text`` must equal ``expected_tokens``."""
    doc = ja_tokenizer(text)
    assert [tok.text for tok in doc] == expected_tokens
|
|
@ -29,6 +29,7 @@ from ..serialize.bits cimport BitArray
|
|||
from ..util import normalize_slice
|
||||
from ..syntax.iterators import CHUNKERS
|
||||
from ..compat import is_config
|
||||
from .. import about
|
||||
|
||||
|
||||
DEF PADDING = 5
|
||||
|
@ -403,9 +404,8 @@ cdef class Doc:
|
|||
if not self.is_parsed:
|
||||
raise ValueError(
|
||||
"noun_chunks requires the dependency parse, which "
|
||||
"requires data to be installed. If you haven't done so, run: "
|
||||
"\npython -m spacy download %s\n"
|
||||
"to install the data" % self.vocab.lang)
|
||||
"requires data to be installed. For more info, see the "
|
||||
"documentation: \n%s\n" % about.__docs_models__)
|
||||
# Accumulate the result before beginning to iterate over it. This prevents
|
||||
# the tokenisation from being changed out from under us during the iteration.
|
||||
# The tricky thing here is that Span accepts its tokenisation changing,
|
||||
|
@ -431,14 +431,14 @@ cdef class Doc:
|
|||
"""
|
||||
def __get__(self):
|
||||
if 'sents' in self.user_hooks:
|
||||
return self.user_hooks['sents'](self)
|
||||
yield from self.user_hooks['sents'](self)
|
||||
return
|
||||
|
||||
if not self.is_parsed:
|
||||
raise ValueError(
|
||||
"sentence boundary detection requires the dependency parse, which "
|
||||
"requires data to be installed. If you haven't done so, run: "
|
||||
"\npython -m spacy download %s\n"
|
||||
"to install the data" % self.vocab.lang)
|
||||
"Sentence boundary detection requires the dependency parse, which "
|
||||
"requires data to be installed. For more info, see the "
|
||||
"documentation: \n%s\n" % about.__docs_models__)
|
||||
cdef int i
|
||||
start = 0
|
||||
for i in range(1, self.length):
|
||||
|
|
|
@ -1,13 +1,23 @@
|
|||
from copy import deepcopy
|
||||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from .doc import Doc
|
||||
from ..symbols import HEAD, TAG, DEP, ENT_IOB, ENT_TYPE
|
||||
|
||||
|
||||
def merge_ents(doc):
    """
    Helper: merge adjacent entities into single tokens; modifies the doc.

    doc: Document whose entity spans are merged in place; each span is merged
        with its root's tag, its own text and its own label.
    Returns the same doc object.
    """
    for ent in doc.ents:
        ent.merge(ent.root.tag_, ent.text, ent.label_)
    return doc
|
||||
|
||||
|
||||
def format_POS(token, light, flat):
|
||||
'''helper: form the POS output for a token'''
|
||||
"""
|
||||
Helper: form the POS output for a token.
|
||||
"""
|
||||
subtree = dict([
|
||||
("word", token.text),
|
||||
("lemma", token.lemma_), # trigger
|
||||
|
@ -25,17 +35,22 @@ def format_POS(token, light, flat):
|
|||
subtree.pop("modifiers")
|
||||
return subtree
|
||||
|
||||
def POS_tree(root, light=False, flat=False):
    """
    Helper: generate a POS tree for a root token. The doc must have had
    merge_ents(doc) run on it.

    root: Token at the top of the (sub)tree.
    light, flat: Forwarded to format_POS for this node and for every
        descendant, so the whole tree is rendered with the same options.
    Returns the dict built by format_POS for ``root`` with one nested tree
    per child appended under "modifiers".
    """
    subtree = format_POS(root, light=light, flat=flat)
    # NOTE(review): format_POS is seen popping "modifiers" in one branch; if
    # that applies here the lookup below raises KeyError -- confirm how flat
    # output is meant to handle children.
    for c in root.children:
        # Forward the flags; calling POS_tree(c) alone would fall back to the
        # defaults for every child.
        subtree["modifiers"].append(POS_tree(c, light=light, flat=flat))
    return subtree
|
||||
|
||||
|
||||
def parse_tree(doc, light=False, flat=False):
|
||||
"""Makes a copy of the doc, then construct a syntactic parse tree, similar to the one used in displaCy. Generates the POS tree for all sentences in a doc
|
||||
"""
|
||||
Makes a copy of the doc, then construct a syntactic parse tree, similar to
|
||||
the one used in displaCy. Generates the POS tree for all sentences in a doc.
|
||||
|
||||
Args:
|
||||
doc: The doc for parsing.
|
||||
|
@ -50,6 +65,8 @@ def parse_tree(doc, light=False, flat=False):
|
|||
[{'modifiers': [{'modifiers': [], 'NE': 'PERSON', 'word': 'Bob', 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Bob'}, {'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'dobj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'}, {'modifiers': [{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'lemma': 'pizza'}, {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}], 'NE': '', 'word': 'brought', 'arc': 'ROOT', 'POS_coarse': 'VERB', 'POS_fine': 'VBD', 'lemma': 'bring'}, {'modifiers': [{'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'}, {'modifiers': [{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'lemma': 'pizza'}, {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}], 'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB', 'POS_fine': 'VBD', 'lemma': 'eat'}]
|
||||
"""
|
||||
doc_clone = Doc(doc.vocab, words=[w.text for w in doc])
|
||||
doc_clone.from_array(doc.to_array([HEAD, DEP, TAG, ENT_IOB, ENT_TYPE])
|
||||
doc_clone = Doc(doc.vocab, words=[w.text for w in doc])
|
||||
doc_clone.from_array([HEAD, TAG, DEP, ENT_IOB, ENT_TYPE],
|
||||
doc.to_array([HEAD, TAG, DEP, ENT_IOB, ENT_TYPE]))
|
||||
merge_ents(doc_clone) # merge the entities into single tokens first
|
||||
return [POS_tree(sent.root, light=light, flat=flat) for sent in doc_clone.sents]
|
||||
|
|
|
@ -16,6 +16,7 @@ from ..util import normalize_slice
|
|||
from ..attrs cimport IS_PUNCT, IS_SPACE
|
||||
from ..lexeme cimport Lexeme
|
||||
from ..compat import is_config
|
||||
from .. import about
|
||||
|
||||
|
||||
cdef class Span:
|
||||
|
@ -221,9 +222,8 @@ cdef class Span:
|
|||
if not self.doc.is_parsed:
|
||||
raise ValueError(
|
||||
"noun_chunks requires the dependency parse, which "
|
||||
"requires data to be installed. If you haven't done so, run: "
|
||||
"\npython -m spacy download %s\n"
|
||||
"to install the data" % self.vocab.lang)
|
||||
"requires data to be installed. For more info, see the "
|
||||
"documentation: \n%s\n" % about.__docs_models__)
|
||||
# Accumulate the result before beginning to iterate over it. This prevents
|
||||
# the tokenisation from being changed out from under us during the iteration.
|
||||
# The tricky thing here is that Span accepts its tokenisation changing,
|
||||
|
|
|
@ -26,6 +26,7 @@ from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
|
|||
from ..attrs cimport IS_OOV
|
||||
from ..lexeme cimport Lexeme
|
||||
from ..compat import is_config
|
||||
from .. import about
|
||||
|
||||
|
||||
cdef class Token:
|
||||
|
@ -237,11 +238,10 @@ cdef class Token:
|
|||
cdef int length = self.vocab.vectors_length
|
||||
if length == 0:
|
||||
raise ValueError(
|
||||
"Word vectors set to length 0. This may be because the "
|
||||
"data is not installed. If you haven't already, run"
|
||||
"\npython -m spacy download %s\n"
|
||||
"to install the data." % self.vocab.lang
|
||||
)
|
||||
"Word vectors set to length 0. This may be because you "
|
||||
"don't have a model installed or loaded, or because your "
|
||||
"model doesn't include word vectors. For more info, see "
|
||||
"the documentation: \n%s\n" % about.__docs_models__)
|
||||
vector_view = <float[:length,]>self.c.lex.vector
|
||||
return numpy.asarray(vector_view)
|
||||
|
||||
|
|
|
@ -8,4 +8,5 @@ class Chinese(Language):
|
|||
def make_doc(self, text):
    """Segment ``text`` with jieba (cut_all mode) and build a Doc from the
    non-empty pieces; every token is created with ``spaces=False``."""
    import jieba
    pieces = [piece for piece in jieba.cut(text, cut_all=True) if piece]
    return Doc(self.vocab, words=pieces, spaces=[False]*len(pieces))
|
||||
|
|
|
@ -14,8 +14,8 @@
|
|||
|
||||
"SPACY_VERSION": "1.8",
|
||||
"LATEST_NEWS": {
|
||||
"url": "https://survey.spacy.io/",
|
||||
"title": "Take the spaCy user survey and help us improve the library!"
|
||||
"url": "/docs/usage/models",
|
||||
"title": "The first official Spanish model is here!"
|
||||
},
|
||||
|
||||
"SOCIAL": {
|
||||
|
@ -55,7 +55,33 @@
|
|||
}
|
||||
},
|
||||
|
||||
"V_CSS": "1.6",
|
||||
"QUICKSTART": [
|
||||
{ "id": "os", "title": "Operating system", "options": [
|
||||
{ "id": "mac", "title": "macOS / OSX", "checked": true },
|
||||
{ "id": "windows", "title": "Windows" },
|
||||
{ "id": "linux", "title": "Linux" }]
|
||||
},
|
||||
{ "id": "package", "title": "Package manager", "options": [
|
||||
{ "id": "pip", "title": "pip", "checked": true },
|
||||
{ "id": "conda", "title": "conda" },
|
||||
{ "id": "source", "title": "from source" }]
|
||||
},
|
||||
{ "id": "python", "title": "Python version", "options": [
|
||||
{ "id": 2, "title": "2.x" },
|
||||
{ "id": 3, "title": "3.x", "checked": true }]
|
||||
},
|
||||
{ "id": "config", "title": "Configuration", "multiple": true, "options": [
|
||||
{"id": "venv", "title": "virtualenv", "help": "Use a virtual environment and install spaCy into a user directory" }]
|
||||
},
|
||||
{ "id": "model", "title": "Models", "multiple": true, "options": [
|
||||
{ "id": "en", "title": "English", "meta": "50MB" },
|
||||
{ "id": "de", "title": "German", "meta": "645MB" },
|
||||
{ "id": "fr", "title": "French", "meta": "1.33GB" },
|
||||
{ "id": "es", "title": "Spanish", "meta": "377MB"}]
|
||||
}
|
||||
],
|
||||
|
||||
"V_CSS": "1.7",
|
||||
"V_JS": "1.2",
|
||||
"DEFAULT_SYNTAX": "python",
|
||||
"ANALYTICS": "UA-58931649-1",
|
||||
|
|
|
@ -121,6 +121,47 @@ mixin badge(name)
|
|||
img(src=site.badge alt="{name} version" height="20")
|
||||
|
||||
|
||||
//- Quickstart widget
|
||||
quickstart.js with manual markup, inspired by PyTorch's "Getting started"
|
||||
groups - [object] option groups, uses global variable QUICKSTART
|
||||
headline - [string] optional text to be rendered as widget headline
|
||||
|
||||
mixin quickstart(groups, headline)
|
||||
.c-quickstart.o-block-small#qs
|
||||
.c-quickstart__content
|
||||
if headline
|
||||
+h(2)=headline
|
||||
for group in groups
|
||||
.c-quickstart__group.u-text-small(data-qs-group=group.id)
|
||||
.c-quickstart__legend=group.title
|
||||
.c-quickstart__fields
|
||||
for option in group.options
|
||||
input.c-quickstart__input(class="c-quickstart__input--" + (group.multiple ? "check" : "radio") type=group.multiple ? "checkbox" : "radio" name=group.id id="qs-#{option.id}" value=option.id checked=option.checked)
|
||||
label.c-quickstart__label(for="qs-#{option.id}")=option.title
|
||||
if option.meta
|
||||
| #[span.c-quickstart__label__meta (#{option.meta})]
|
||||
if option.help
|
||||
| #[+help(option.help).c-quickstart__label__meta]
|
||||
|
||||
pre.c-code-block
|
||||
code.c-code-block__content.c-quickstart__code(data-qs-results="")
|
||||
block
|
||||
|
||||
.c-quickstart__info.u-text-tiny.o-block.u-text-right
|
||||
| Like this widget? Check out #[+a("https://github.com/ines/quickstart").u-link quickstart.js]!
|
||||
|
||||
|
||||
//- Quickstart code item
|
||||
data [object] - Rendering conditions (keyed by option group ID, value: option)
|
||||
|
||||
mixin qs(data)
|
||||
- args = {}
|
||||
for value, setting in data
|
||||
- args['data-qs-' + setting] = value
|
||||
span.c-quickstart__line&attributes(args)
|
||||
block
|
||||
|
||||
|
||||
//- Logo
|
||||
|
||||
mixin logo()
|
||||
|
|
|
@ -47,6 +47,14 @@ mixin api(path)
|
|||
| #[+icon("book", 18).o-icon--inline.u-color-subtle]
|
||||
|
||||
|
||||
//- Help icon with tooltip
|
||||
tooltip - [string] Tooltip text
|
||||
|
||||
mixin help(tooltip)
|
||||
span(data-tooltip=tooltip)&attributes(attributes)
|
||||
+icon("help", 16).i-icon--inline
|
||||
|
||||
|
||||
//- Aside for text
|
||||
label - [string] aside title (optional)
|
||||
|
||||
|
|
|
@ -1,9 +1,13 @@
|
|||
//- 💫 INCLUDES > SCRIPTS
|
||||
|
||||
script(src="/assets/js/main.js?v#{V_JS}", type="text/javascript")
|
||||
script(src="/assets/js/prism.js", type="text/javascript")
|
||||
script(src="/assets/js/main.js?v#{V_JS}")
|
||||
script(src="/assets/js/prism.js")
|
||||
|
||||
if SECTION == "docs"
|
||||
if quickstart
|
||||
script(src="/assets/js/quickstart.js")
|
||||
script var qs = new Quickstart("#qs");
|
||||
|
||||
script.
|
||||
((window.gitter = {}).chat = {}).options = {
|
||||
useStyles: false,
|
||||
|
|
|
@ -18,7 +18,7 @@
|
|||
|
||||
.c-code-block__content
|
||||
display: block
|
||||
font: normal normal 1.1rem/#{2} $font-code
|
||||
font: normal 600 1.1rem/#{2} $font-code
|
||||
padding: 1em 2em
|
||||
|
||||
|
||||
|
|
90
website/assets/css/_components/_quickstart.sass
Normal file
90
website/assets/css/_components/_quickstart.sass
Normal file
|
@ -0,0 +1,90 @@
|
|||
//- 💫 CSS > COMPONENTS > QUICKSTART
|
||||
|
||||
.c-quickstart
|
||||
border: 1px solid $color-subtle
|
||||
border-radius: 2px
|
||||
display: none
|
||||
background: $color-subtle-light
|
||||
|
||||
&:not([style]) + .c-quickstart__info
|
||||
display: none
|
||||
|
||||
.c-quickstart__content
|
||||
padding: 2rem 3rem
|
||||
|
||||
.c-quickstart__input
|
||||
@include size(0)
|
||||
opacity: 0
|
||||
position: absolute
|
||||
left: -9999px
|
||||
|
||||
.c-quickstart__label
|
||||
cursor: pointer
|
||||
background: $color-back
|
||||
border: 1px solid $color-subtle
|
||||
border-radius: 2px
|
||||
display: inline-block
|
||||
padding: 0.75rem 1.25rem
|
||||
margin: 0 0.5rem 0.5rem 0
|
||||
font-weight: bold
|
||||
|
||||
&:hover
|
||||
background: lighten($color-theme-light, 5)
|
||||
|
||||
.c-quickstart__input:focus + &
|
||||
border: 1px solid $color-theme
|
||||
|
||||
.c-quickstart__input--radio:checked + &
|
||||
color: $color-back
|
||||
border-color: $color-theme
|
||||
background: $color-theme
|
||||
|
||||
.c-quickstart__input--check + &:before
|
||||
content: ""
|
||||
background: $color-back
|
||||
display: inline-block
|
||||
width: 20px
|
||||
height: 20px
|
||||
border: 1px solid $color-subtle
|
||||
vertical-align: middle
|
||||
margin-right: 1rem
|
||||
cursor: pointer
|
||||
border-radius: 50%
|
||||
|
||||
.c-quickstart__input--check:checked + &:before
|
||||
background: $color-theme url()
|
||||
background-size: contain
|
||||
border-color: $color-theme
|
||||
|
||||
.c-quickstart__label__meta
|
||||
font-weight: normal
|
||||
color: $color-subtle-dark
|
||||
|
||||
.c-quickstart__group
|
||||
@include breakpoint(min, md)
|
||||
display: flex
|
||||
flex-flow: row nowrap
|
||||
|
||||
&:not(:last-child)
|
||||
margin-bottom: 1rem
|
||||
|
||||
.c-quickstart__fields
|
||||
flex: 100%
|
||||
|
||||
.c-quickstart__legend
|
||||
color: $color-subtle-dark
|
||||
margin-right: 2rem
|
||||
padding-top: 0.75rem
|
||||
flex: 1 1 35%
|
||||
font-weight: bold
|
||||
|
||||
.c-quickstart__line
|
||||
display: block
|
||||
|
||||
&:before
|
||||
color: $color-theme
|
||||
margin-right: 1em
|
||||
content: "$"
|
||||
|
||||
.c-quickstart__code
|
||||
font-size: 1.6rem
|
29
website/assets/css/_components/_tooltips.sass
Normal file
29
website/assets/css/_components/_tooltips.sass
Normal file
|
@ -0,0 +1,29 @@
|
|||
//- 💫 CSS > COMPONENTS > TOOLTIPS
|
||||
|
||||
[data-tooltip]
|
||||
position: relative
|
||||
|
||||
@include breakpoint(min, sm)
|
||||
&:before
|
||||
@include position(absolute, top, left, 125%, 50%)
|
||||
display: inline-block
|
||||
content: attr(data-tooltip)
|
||||
background: $color-front
|
||||
border-radius: 2px
|
||||
color: $color-back
|
||||
font-family: inherit
|
||||
font-size: 1.3rem
|
||||
line-height: 1.25
|
||||
opacity: 0
|
||||
padding: 0.5em 0.75em
|
||||
transform: translateX(-50%) translateY(-2px)
|
||||
transition: opacity 0.1s ease-out, transform 0.1s ease-out
|
||||
visibility: hidden
|
||||
min-width: 200px
|
||||
max-width: 300px
|
||||
z-index: 200
|
||||
|
||||
&:hover:before
|
||||
opacity: 1
|
||||
transform: translateX(-50%) translateY(0)
|
||||
visibility: visible
|
|
@ -27,6 +27,7 @@ $font-code: 'Source Code Pro', Consolas, 'Andale Mono', Menlo, Monaco, Courier,
|
|||
// Colors
|
||||
|
||||
$colors: ( blue: #09a3d5, red: #d9515d )
|
||||
$colors-light: (blue: #cceaf4, red: #f9d7da)
|
||||
|
||||
$color-back: #fff !default
|
||||
$color-front: #1a1e23 !default
|
||||
|
@ -34,7 +35,7 @@ $color-dark: lighten($color-front, 20) !default
|
|||
|
||||
$color-theme: map-get($colors, $theme)
|
||||
$color-theme-dark: darken(map-get($colors, $theme), 5)
|
||||
$color-theme-light: saturate(lighten(map-get($colors, $theme), 35), 5)
|
||||
$color-theme-light: map-get($colors-light, $theme)
|
||||
|
||||
$color-subtle: #ddd !default
|
||||
$color-subtle-light: #f6f6f6 !default
|
||||
|
|
|
@ -32,3 +32,5 @@ $theme: blue !default
|
|||
@import _components/navigation
|
||||
@import _components/sidebar
|
||||
@import _components/tables
|
||||
@import _components/tooltips
|
||||
@import _components/quickstart
|
||||
|
|
|
@ -1,5 +1,16 @@
|
|||
<svg style="position: absolute; width: 0; height: 0;" width="0" height="0" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
|
||||
<defs>
|
||||
<symbol id="v2alpha" viewBox="0 0 200 111">
|
||||
<title>spaCy v2.0.0 alpha</title>
|
||||
<path fill="#ddd" d="M183.3 89.2l-164.6-40-1-29.2 164.6 40M3.8 106.8l41.6-1.4-1-29.2-41.6 1.4L13.2 92"/>
|
||||
<path fill="#a3cad3" d="M45.4 105.4L19.6 94.6l25.4-1"/>
|
||||
<path fill="#ddd" d="M196.6 2L155 3.4l1 29.2 41.6-1.4L187.2 17"/>
|
||||
<path fill="#a3cad3" d="M155 3.4l25.8 10.8-25.4 1"/>
|
||||
<path fill="#fff" d="M17.6 19.4l163-5.6 1 29.2-163 5.6zM19.2 65.6l163-5.6 1 29.2-163 5.6z"/>
|
||||
<path fill="#008EBC" d="M45.8 29h-3.6v-2.4l10-.4.2 2.5h-3.6l.4 10.8h-3L45.8 29zM62 39L59 34.5h-1.6l.2 5h-3l-.5-13.2L59 26c3 0 5.2.8 5.3 4 0 1.8-.8 3-2.2 3.8l3.3 5.2H62zm-4.5-6.8H59c1.6-.2 2.4-.8 2.3-2 0-1.4-1-1.8-2.5-1.8h-1.5l.2 3.8zM69 34.2l-4.3-8.4H68l1.2 3 1.2 2.8c.4-1 .8-2 1-3l1.2-3 3-.2L72 34l.2 4.7h-3l-.2-4.5zM79.5 25.3h3.2l1.8 6 1.2 4.2c.5-1.5.7-2.8 1-4.3L88 25h3L87.7 38H84l-4.5-13zM92.4 25l8.3-.4V27l-5.2.3V30l4.6-.3.2 2.5-4.5.2v3l5.6-.2v2.5L93 38l-.6-13zM111 37.4l-2.6-4.7h-1.6l.2 5h-3l-.5-13.2 4.8-.2c2.8 0 5 .8 5.2 4 0 1.8-.8 3-2.2 3.8l3.2 5.3H111zm-4.3-7h1.5c1.6 0 2.4-.7 2.3-2 0-1.3-1-1.7-2.5-1.7h-1.5l.2 3.8zM116.8 33.5c1 .8 2.2 1.3 3.3 1.3 1.3 0 2-.5 2-1.3s-1-1-2-1.5l-1.8-.7c-1.4-.5-2.7-1.6-2.8-3.5 0-2.2 1.8-4 4.6-4 1.5-.2 3 .4 4.3 1.5l-1.4 2c-1-.7-1.8-1-3-1-1 0-1.6.4-1.5 1.2 0 .8 1 1 2 1.5l1.8.6c1.6.6 2.7 1.6 2.7 3.5 0 2.3-1.7 4.2-4.8 4.4-1.7 0-3.6-.5-5-1.7l1.6-2.2zM126.8 23.7h3l.5 13-3 .2-.5-13.3zM132.5 30c0-4.3 2.2-7 5.8-7 3.6 0 6 2.3 6.2 6.6 0 4.3-2.2 7-5.8 7-3.5.3-6-2.3-6.2-6.6zm9-.3c-.2-2.6-1.4-4.2-3.2-4-1.8 0-3 1.6-2.8 4.2 0 2.5 1.3 4.2 3 4 2 0 3-1.6 3-4.3zM146.7 23h3l3.8 6.3 1.4 3c-.2-1.5-.5-3.3-.5-5l-.2-4.6h2.8l.6 13-3 .2-3.8-6.6-1.4-2.8c0 1.5.4 3.2.4 4.8l.2 4.7-3 .2-.3-13.2z"/>
|
||||
<path fill="#1A1E23" d="M50.2 84.7c3.2-3.2 5.4-5.5 5.3-7.3 0-1.3-.8-2-2-2-.8 0-1.5.8-2 1.5l-1.8-1.6c1.2-1.4 2.4-2 4.2-2.2 2.4 0 4.2 1.5 4.3 4 0 2-2 4.4-4 6.7.7-.2 1.6-.3 2.2-.3H59l.2 2.4-9 .4v-1.7zM63 82.4c1 0 2 .7 2 1.8 0 1-.7 2-1.7 2s-1.8-.8-2-2c0-1 .7-1.8 1.8-1.8zM66.7 79.3c-.2-4.4 1.6-6.7 4.4-6.8 3 0 4.8 2 5 6.5s-1.7 6.8-4.5 7c-2.7 0-4.6-2.3-4.8-6.7zM73 79c0-3.4-.8-4.2-1.8-4-1 0-1.8.7-1.6 4.3 0 3.5 1 4.4 2 4.3 1 0 1.6-1 1.5-4.5zM79.8 81.8c1 0 1.8.7 2 1.8 0 1-.8 2-1.8 2s-1.8-.8-2-2c0-1 .8-1.7 1.8-1.8zM83.5 78.7C83.3 74.3 85 72 88 72c2.7-.2 4.6 2 4.7 6.4s-1.6 6.8-4.4 7c-2.8 0-4.7-2.3-4.8-6.7zm6.3-.2c0-3.5-1-4.3-2-4.2-1 0-1.7.8-1.5 4.4 0 3.5 1 4.4 2 4.3 1 0 1.7-1 1.5-4.5zM105.5 81.3h-4l-.7 3.3h-3l3.7-13.2h3.6l4.7 13h-3.2l-1-3zm-.7-2.3l-.4-1.2-1.2-4.2-1 4.3-.3 1h2.8zM110.5 71h3l.4 10.7 5-.2.2 2.5-8.2.3-.5-13.2zM121 70.7l4.7-.2c3 0 5.2 1 5.3 4 0 3.2-2.2 4.7-5 4.7h-1.8l.2 4.6h-3l-.5-13zm4.7 6.2c1.6-.2 2.4-1 2.4-2.3 0-1.4-.8-2-2.4-1.8H124v4h1.7zM133 70.3h3l.3 5 4.5-.2-.2-5h3l.5 13-3 .2v-5.5l-4.6.2.2 5.4h-3l-.5-13zM153.3 79.7h-4l-.7 3.3h-3l3.7-13.2h3.6l4.5 13h-3.2l-1-3zm-.7-2.3l-.4-1.2L151 72l-1 4.3-.3 1.2h3z"/>
|
||||
</symbol>
|
||||
|
||||
<symbol id="usersurvey" viewBox="0 0 200 111">
|
||||
<title>spaCy user survey 2017</title>
|
||||
<path fill="#ddd" d="M183.3 89.2l-164.6-40-1-29.2 164.6 40M3.8 106.8l41.6-1.4-1-29.2-41.6 1.4L13.2 92"/>
|
||||
|
|
Before Width: | Height: | Size: 18 KiB After Width: | Height: | Size: 21 KiB |
|
@ -27,5 +27,8 @@
|
|||
<symbol id="star" viewBox="0 0 24 24">
|
||||
<path d="M12 17.25l-6.188 3.75 1.641-7.031-5.438-4.734 7.172-0.609 2.813-6.609 2.813 6.609 7.172 0.609-5.438 4.734 1.641 7.031z"></path>
|
||||
</symbol>
|
||||
<symbol id="help" viewBox="0 0 24 24">
|
||||
<path d="M12 6c2.203 0 3.984 1.781 3.984 3.984 0 2.484-3 2.766-3 5.016h-1.969c0-3.234 3-3 3-5.016 0-1.078-0.938-1.969-2.016-1.969s-2.016 0.891-2.016 1.969h-1.969c0-2.203 1.781-3.984 3.984-3.984zM12 20.016c4.406 0 8.016-3.609 8.016-8.016s-3.609-8.016-8.016-8.016-8.016 3.609-8.016 8.016 3.609 8.016 8.016 8.016zM12 2.016c5.531 0 9.984 4.453 9.984 9.984s-4.453 9.984-9.984 9.984-9.984-4.453-9.984-9.984 4.453-9.984 9.984-9.984zM11.016 18v-2.016h1.969v2.016h-1.969z"/>
|
||||
</symbol>
|
||||
</defs>
|
||||
</svg>
|
||||
|
|
Before Width: | Height: | Size: 4.9 KiB After Width: | Height: | Size: 5.4 KiB |
8
website/assets/js/quickstart.js
Normal file
8
website/assets/js/quickstart.js
Normal file
|
@ -0,0 +1,8 @@
|
|||
/**
|
||||
* quickstart.js
|
||||
* A micro-form for user-specific installation instructions
|
||||
*
|
||||
* @author Ines Montani <ines@ines.io>
|
||||
* @version 0.0.1
|
||||
* @license MIT
|
||||
*
* NOTE: everything below this header is minified build output. The helper
* names (_createClass, _toConsumableArray, _classCallCheck) suggest it was
* transpiled from an ES2015 class - presumably by Babel; confirm against the
* unminified source before editing by hand.
*
* Usage: new Quickstart(container, groups, options)
*   container - CSS selector string (resolved with document.querySelector)
*               or an element; defaults to '#quickstart'.
*   groups    - optional Array of group configs, each read as
*               { id, title, multiple, options: [{ id, title, checked }] }.
*               When it is not an Array, existing [data-PREFIX-group]
*               elements in the document are wired up instead.
*   options   - { prefix, noInit }. prefix defaults to 'qs' and is used for
*               class names (PREFIX-group, PREFIX-input, ...) and data
*               attributes (data-PREFIX-...). Unless noInit is truthy, init()
*               is registered as a DOMContentLoaded listener.
*
* Mechanism: every group owns a style element tagged data-PREFIX-style="ID".
* On each "change" event, update() rewrites that element's text so that
* children of [data-PREFIX-results] carrying a data-PREFIX-ID attribute are
* hidden (display: none) unless the attribute value equals the value of a
* checked input of that group.
*
* NOTE(review): init() calls groups.reverse(), which reverses the array
* passed in by the caller in place.
* NOTE(review): childNodes() tests "b.hasChildNodes" without calling it; on
* DOM nodes that property is a method and therefore always truthy, so the
* early return looks dead. Empty results are still reported as false by the
* final "!!e.length&&e" expression.
*/'use strict';var _createClass=function(){function a(b,c){for(var e,d=0;d<c.length;d++)e=c[d],e.enumerable=e.enumerable||!1,e.configurable=!0,'value'in e&&(e.writable=!0),Object.defineProperty(b,e.key,e)}return function(b,c,d){return c&&a(b.prototype,c),d&&a(b,d),b}}();function _toConsumableArray(a){if(Array.isArray(a)){for(var b=0,c=Array(a.length);b<a.length;b++)c[b]=a[b];return c}return Array.from(a)}function _classCallCheck(a,b){if(!(a instanceof b))throw new TypeError('Cannot call a class as a function')}var Quickstart=function(){function a(){var b=0<arguments.length&&void 0!==arguments[0]?arguments[0]:'#quickstart',d=arguments[1],c=2<arguments.length&&void 0!==arguments[2]?arguments[2]:{};_classCallCheck(this,a),this.container='string'==typeof b?this._$(b):b,this.groups=d,this.pfx=c.prefix||'qs',this.dpfx='data-'+this.pfx,this.init=this.init.bind(this),c.noInit||document.addEventListener('DOMContentLoaded',this.init)}return _createClass(a,[{key:'init',value:function init(){this.updateContainer(),this.container.style.display='block',this.container.classList.add(''+this.pfx);var b=this.groups;b instanceof Array?b.reverse().forEach(this.createGroup.bind(this)):this._$$('['+this.dpfx+'-group]').forEach(this.updateGroup.bind(this))}},{key:'initGroup',value:function initGroup(b,c){b.addEventListener('change',this.update.bind(this)),b.dispatchEvent(new CustomEvent('change',{detail:c}))}},{key:'updateGroup',value:function updateGroup(b){var c=b.getAttribute(this.dpfx+'-group'),d=this.createStyles(c);b.insertBefore(d,b.firstChild),this.initGroup(b,c)}},{key:'update',value:function update(b){var f=this,c=b.detail||b.target.name,d=this._$$('[name='+c+']:checked').map(function(h){return h.value}),e=d.map(function(h){return':not(['+f.dpfx+'-'+c+'="'+h+'"])'}).join(''),g='['+this.dpfx+'-results]>['+this.dpfx+'-'+c+']'+e+' {display: none}';this._$('['+this.dpfx+'-style="'+c+'"]').textContent=g}},{key:'updateContainer',value:function 
updateContainer(){if(!this._$('['+this.dpfx+'-results]')){var b=this.childNodes(this.container,'pre'),c=b?b[0]:this._c('pre',this.pfx+'-code'),d=this.childNodes(c,'code')||this.childNodes(this.container,'code'),e=d?d[0]:this._c('code',this.pfx+'-results');e.setAttribute(this.dpfx+'-results','');var f=this.childNodes(e,'span')||this.childNodes(c,'span')||this.childNodes(this.container,'span');f&&f.forEach(function(g){return e.appendChild(g)}),c.appendChild(e),this.container.appendChild(c)}}},{key:'createGroup',value:function createGroup(b){var d=this,c=this._c('fieldset',this.pfx+'-group');c.setAttribute(this.dpfx+'-group',b.id),c.innerHTML=this.createStyles(b.id).outerHTML,c.innerHTML+='<legend class="'+this.pfx+'-legend">'+b.title+'</legend>',c.innerHTML+=b.options.map(function(e){var f=b.multiple?'checkbox':'radio';return'<input class="'+d.pfx+'-input '+d.pfx+'-input--'+f+'" type="'+f+'" name="'+b.id+'" id="'+e.id+'" value="'+e.id+'" '+(e.checked?'checked':'')+' /><label class="'+d.pfx+'-label" for="'+e.id+'">'+e.title+'</label>'}).join(''),this.container.insertBefore(c,this.container.firstChild),this.initGroup(c,b.id)}},{key:'createStyles',value:function createStyles(b){var c=this._c('style');return c.setAttribute(this.dpfx+'-style',b),c.textContent='['+this.dpfx+'-results]>['+this.dpfx+'-'+b+'] {display: none}',c}},{key:'childNodes',value:function childNodes(b,c){var d=c.toUpperCase();if(!b.hasChildNodes)return!1;var e=[].concat(_toConsumableArray(b.childNodes)).filter(function(f){return f.nodeName===d});return!!e.length&&e}},{key:'_$',value:function _$(b){return document.querySelector(b)}},{key:'_$$',value:function _$$(b){return[].concat(_toConsumableArray(document.querySelectorAll(b)))}},{key:'_c',value:function _c(b,c){var d=document.createElement(b);return c&&(d.className=c),d}}]),a}();
|
|
@ -1,10 +1,5 @@
|
|||
//- 💫 DOCS > API > ANNOTATION > DEPENDENCY LABELS
|
||||
|
||||
+infobox("Tip")
|
||||
| In spaCy v1.8.3+, you can also use #[code spacy.explain()] to get the
|
||||
| description for the string representation of a label. For example,
|
||||
| #[code spacy.explain("prt")] will return "particle".
|
||||
|
||||
+h(3, "dependency-parsing-english") English dependency labels
|
||||
|
||||
p
|
||||
|
|
|
@ -1,10 +1,5 @@
|
|||
//- 💫 DOCS > API > ANNOTATION > NAMED ENTITIES
|
||||
|
||||
+infobox("Tip")
|
||||
| In spaCy v1.8.3+, you can also use #[code spacy.explain()] to get the
|
||||
| description for the string representation of an entity label. For example,
|
||||
| #[code spacy.explain("LANGUAGE")] will return "any named language".
|
||||
|
||||
+table([ "Type", "Description" ])
|
||||
+row
|
||||
+cell #[code PERSON]
|
||||
|
|
|
@ -1,10 +1,5 @@
|
|||
//- 💫 DOCS > API > ANNOTATION > POS TAGS
|
||||
|
||||
+infobox("Tip")
|
||||
| In spaCy v1.8.3+, you can also use #[code spacy.explain()] to get the
|
||||
| description for the string representation of a tag. For example,
|
||||
| #[code spacy.explain("RB")] will return "adverb".
|
||||
|
||||
+h(3, "pos-tagging-english") English part-of-speech tag scheme
|
||||
|
||||
p
|
||||
|
|
|
@ -103,7 +103,7 @@ p Get a #[code Token] object.
|
|||
doc = nlp(u'Give it back! He pleaded.')
|
||||
assert doc[0].text == 'Give'
|
||||
assert doc[-1].text == '.'
|
||||
span = doc[1:1]
|
||||
span = doc[1:3]
|
||||
assert span.text == 'it back'
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
|
@ -272,7 +272,7 @@ p Import the document contents from a binary string.
|
|||
p
|
||||
| Retokenize the document, such that the span at
|
||||
| #[code doc.text[start_idx : end_idx]] is merged into a single token. If
|
||||
| #[code start_idx] and #[end_idx] do not mark start and end token
|
||||
| #[code start_idx] and #[code end_idx] do not mark start and end token
|
||||
| boundaries, the document remains unchanged.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
|
|
|
@ -67,6 +67,16 @@ p An individual token — i.e. a word, punctuation symbol, whitespace, etc.
|
|||
+cell unicode
|
||||
+cell Base form of the word, with no inflectional suffixes.
|
||||
|
||||
+row
|
||||
+cell #[code orth]
|
||||
+cell int
|
||||
+cell ID of the word's string.
|
||||
|
||||
+row
|
||||
+cell #[code orth_]
|
||||
+cell unicode
|
||||
+cell word's string.
|
||||
|
||||
+row
|
||||
+cell #[code lower]
|
||||
+cell int
|
||||
|
@ -238,11 +248,6 @@ p An individual token — i.e. a word, punctuation symbol, whitespace, etc.
|
|||
+cell #[code text_with_ws]
|
||||
+cell unicode
|
||||
+cell Text content, with trailing space character if present.
|
||||
|
||||
+row
|
||||
+cell #[code whitespace]
|
||||
+cell int
|
||||
+cell Trailing space character if present.
|
||||
+row
|
||||
+cell #[code whitespace_]
|
||||
+cell unicode
|
||||
|
|
|
@ -124,7 +124,7 @@ p
|
|||
+cell #[code Lexeme]
|
||||
+cell The lexeme indicated by the given ID.
|
||||
|
||||
+h(2, "iter") Span.__iter__
|
||||
+h(2, "iter") Vocab.__iter__
|
||||
+tag method
|
||||
|
||||
p Iterate over the lexemes in the vocabulary.
|
||||
|
|
|
@ -33,6 +33,7 @@
|
|||
|
||||
"index": {
|
||||
"title": "Install spaCy",
|
||||
"quickstart": true,
|
||||
"next": "models"
|
||||
},
|
||||
|
||||
|
|
|
@ -25,3 +25,4 @@ p
|
|||
+model-row("en_vectors_glove_md", "English", [1, 0, 0, 1], "727 MB", "CC BY-SA")
|
||||
+model-row("de_core_news_md", "German", [1, 1, 1, 1], "645 MB", "CC BY-SA", true, true)
|
||||
+model-row("fr_depvec_web_lg", "French", [1, 1, 0, 1], "1.33 GB", "CC BY-NC", true, true)
|
||||
+model-row("es_core_web_md", "Spanish", [1, 1, 1, 1], "377 MB", "CC BY-SA", true, true)
|
||||
|
|
|
@ -113,7 +113,7 @@ p
|
|||
else:
|
||||
tokens.append(substring)
|
||||
substring = ''
|
||||
tokens.extend(suffixes)
|
||||
tokens.extend(reversed(suffixes))
|
||||
return tokens
|
||||
|
||||
p
|
||||
|
@ -214,7 +214,7 @@ p
|
|||
def __call__(self, text):
|
||||
words = text.split(' ')
|
||||
# All tokens 'own' a subsequent space character in this tokenizer
|
||||
spaces = [True] * len(word)
|
||||
spaces = [True] * len(words)
|
||||
return Doc(self.vocab, words=words, spaces=spaces)
|
||||
|
||||
p
|
||||
|
|
|
@ -36,7 +36,7 @@ p
|
|||
| to #[code spacy.load()]. The function should take a
|
||||
| #[code spacy.language.Language] object as its only argument, and return
|
||||
| a sequence of callables. Each callable should accept a
|
||||
| #[+api("docs") #[code Doc]] object, modify it in place, and return
|
||||
| #[+api("doc") #[code Doc]] object, modify it in place, and return
|
||||
| #[code None].
|
||||
|
||||
p
|
||||
|
|
|
@ -12,6 +12,40 @@ p
|
|||
| #[a(href="#source-ubuntu") Ubuntu], #[a(href="#source-osx") macOS/OS X]
|
||||
| and #[a(href="#source-windows") Windows] for details.
|
||||
|
||||
+quickstart(QUICKSTART, "Quickstart")
|
||||
+qs({config: 'venv', python: 2}) python -m pip install -U virtualenv
|
||||
+qs({config: 'venv', python: 3}) python -m pip install -U venv
|
||||
+qs({config: 'venv', python: 2}) virtualenv .env
|
||||
+qs({config: 'venv', python: 3}) python -m venv .env
|
||||
+qs({config: 'venv', os: 'mac'}) source .env/bin/activate
|
||||
+qs({config: 'venv', os: 'linux'}) source .env/bin/activate
|
||||
+qs({config: 'venv', os: 'windows'}) .env\Scripts\activate
|
||||
|
||||
+qs({package: 'pip'}) pip install -U spacy
|
||||
|
||||
+qs({package: 'conda'}) conda config --add channels conda-forge
|
||||
+qs({package: 'conda'}) conda install spacy
|
||||
|
||||
+qs({package: 'source'}) git clone https://github.com/explosion/spaCy
|
||||
+qs({package: 'source'}) cd spaCy
|
||||
+qs({package: 'source'}) pip install -r requirements.txt
|
||||
+qs({package: 'source'}) pip install -e .
|
||||
|
||||
+qs({model: 'en'}) python -m spacy download en
|
||||
+qs({model: 'de'}) python -m spacy download de
|
||||
+qs({model: 'fr'}) python -m spacy download fr
|
||||
+qs({model: 'es'}) python -m spacy download es
|
||||
|
||||
+h(2, "installation") Installation instructions
|
||||
|
||||
+h(3, "pip") pip
|
||||
+badge("pipy")
|
||||
|
||||
p Using pip, spaCy releases are currently only available as source packages.
|
||||
|
||||
+code(false, "bash").
|
||||
pip install -U spacy
|
||||
|
||||
+aside("Download models")
|
||||
| After installation you need to download a language model. For more info
|
||||
| and available models, see the #[+a("/docs/usage/models") docs on models].
|
||||
|
@ -22,14 +56,6 @@ p
|
|||
>>> import spacy
|
||||
>>> nlp = spacy.load('en')
|
||||
|
||||
+h(2, "pip") pip
|
||||
+badge("pipy")
|
||||
|
||||
p Using pip, spaCy releases are currently only available as source packages.
|
||||
|
||||
+code(false, "bash").
|
||||
pip install -U spacy
|
||||
|
||||
p
|
||||
| When using pip it is generally recommended to install packages in a
|
||||
| #[code virtualenv] to avoid modifying system state:
|
||||
|
@ -39,7 +65,7 @@ p
|
|||
source .env/bin/activate
|
||||
pip install spacy
|
||||
|
||||
+h(2, "conda") conda
|
||||
+h(3, "conda") conda
|
||||
+badge("conda")
|
||||
|
||||
p
|
||||
|
|
|
@ -17,10 +17,10 @@ p
|
|||
| trying to do.
|
||||
|
||||
+code.
|
||||
import spacy # See "Installing spaCy"
|
||||
nlp = spacy.load('en') # You are here.
|
||||
doc = nlp(u'Hello, spacy!') # See "Using the pipeline"
|
||||
print((w.text, w.pos_) for w in doc) # See "Doc, Span and Token"
|
||||
import spacy # See "Installing spaCy"
|
||||
nlp = spacy.load('en') # You are here.
|
||||
doc = nlp(u'Hello, spacy!') # See "Using the pipeline"
|
||||
print([(w.text, w.pos_) for w in doc]) # See "Doc, Span and Token"
|
||||
|
||||
+aside("Why do we have to preload?")
|
||||
| Loading the models takes ~200x longer than
|
||||
|
|
|
@ -83,7 +83,7 @@ p
|
|||
+h(2, "examples-word-vectors") Word vectors
|
||||
|
||||
+code.
|
||||
doc = nlp("Apples and oranges are similar. Boots and hippos aren't.")
|
||||
doc = nlp(u"Apples and oranges are similar. Boots and hippos aren't.")
|
||||
|
||||
apples = doc[0]
|
||||
oranges = doc[2]
|
||||
|
@ -148,24 +148,20 @@ p
|
|||
|
||||
+code.
|
||||
def put_spans_around_tokens(doc, get_classes):
|
||||
'''Given some function to compute class names, put each token in a
|
||||
span element, with the appropriate classes computed.
|
||||
|
||||
All whitespace is preserved, outside of the spans. (Yes, I know HTML
|
||||
won't display it. But the point is no information is lost, so you can
|
||||
calculate what you need, e.g. <br /> tags, <p> tags, etc.)
|
||||
'''
|
||||
"""Given some function to compute class names, put each token in a
|
||||
span element, with the appropriate classes computed. All whitespace is
|
||||
preserved, outside of the spans. (Of course, HTML won't display more than
|
||||
one whitespace character – but the point is, no information is lost
|
||||
and you can calculate what you need, e.g. <br />, <p> etc.)
|
||||
"""
|
||||
output = []
|
||||
template = '<span classes="{classes}">{word}</span>{space}'
|
||||
html = '<span class="{classes}">{word}</span>{space}'
|
||||
for token in doc:
|
||||
if token.is_space:
|
||||
output.append(token.orth_)
|
||||
output.append(token.text)
|
||||
else:
|
||||
output.append(
|
||||
template.format(
|
||||
classes=' '.join(get_classes(token)),
|
||||
word=token.orth_,
|
||||
space=token.whitespace_))
|
||||
classes = ' '.join(get_classes(token))
|
||||
output.append(html.format(classes=classes, word=token.text, space=token.whitespace_))
|
||||
string = ''.join(output)
|
||||
string = string.replace('\n', '')
|
||||
string = string.replace('\t', ' ')
|
||||
|
|
|
@ -203,7 +203,7 @@ p
|
|||
p
|
||||
| If you've trained your own model, for example for
|
||||
| #[+a("/docs/usage/adding-languages") additional languages] or
|
||||
| #[+a("/docs/usage/train-ner") custom named entities], you can save its
|
||||
| #[+a("/docs/usage/training-ner") custom named entities], you can save its
|
||||
| state using the #[code Language.save_to_directory()] method. To make the
|
||||
| model more convenient to deploy, we recommend wrapping it as a Python
|
||||
| package.
|
||||
|
|
|
@ -19,11 +19,11 @@ p Here's a minimal example. We first add a pattern that specifies three tokens:
|
|||
p
|
||||
| Once we've added the pattern, we can use the #[code matcher] as a
|
||||
| callable, to receive a list of #[code (ent_id, start, end)] tuples.
|
||||
| Note that #[code LOWER] and #[code IS_PUNCT] are data attributes
|
||||
| of #[code spacy.attrs].
|
||||
|
||||
+code.
|
||||
from spacy.matcher import Matcher
|
||||
from spacy.attrs import IS_PUNCT, LOWER
|
||||
|
||||
matcher = Matcher(nlp.vocab)
|
||||
matcher.add_pattern("HelloWorld", [{LOWER: "hello"}, {IS_PUNCT: True}, {LOWER: "world"}])
|
||||
|
||||
|
|
|
@ -28,7 +28,7 @@ p
|
|||
| and walk you through generating the meta data. You can also create the
|
||||
| meta.json manually and place it in the model data directory, or supply a
|
||||
| path to it using the #[code --meta] flag. For more info on this, see the
|
||||
| #[+a("/docs/usage/cli/#package") #[code package] command] documentation.
|
||||
| #[+a("/docs/usage/cli#package") #[code package] command] documentation.
|
||||
|
||||
+aside-code("meta.json", "json").
|
||||
{
|
||||
|
|
|
@ -150,8 +150,8 @@ p
|
|||
for itn in range(20):
|
||||
random.shuffle(train_data)
|
||||
for raw_text, entity_offsets in train_data:
|
||||
gold = GoldParse(doc, entities=entity_offsets)
|
||||
doc = nlp.make_doc(raw_text)
|
||||
gold = GoldParse(doc, entities=entity_offsets)
|
||||
nlp.tagger(doc)
|
||||
loss = nlp.entity.update(doc, gold)
|
||||
nlp.end_training()
|
||||
|
|
|
@ -11,7 +11,7 @@ include _includes/_mixins
|
|||
h2.c-landing__title.o-block.u-heading-1
|
||||
| in Python
|
||||
|
||||
+landing-badge("https://survey.spacy.io", "usersurvey", "Take the user survey!")
|
||||
+landing-badge(gh("spaCy") + "/releases/tag/v2.0.0-alpha", "v2alpha", "Try spaCy v2.0.0 alpha!")
|
||||
|
||||
+grid.o-content
|
||||
+grid-col("third").o-card
|
||||
|
|
Loading…
Reference in New Issue
Block a user