* Remove old docs files

Matthew Honnibal 2015-09-06 20:20:55 +02:00
commit a03e2a0b65
99 changed files with 3740 additions and 4166 deletions

.gitignore vendored

@@ -13,6 +13,8 @@ MANIFEST
corpora/
models/
examples/
keys/
spacy/syntax/*.cpp
spacy/syntax/*.html


@@ -20,6 +20,7 @@ from __future__ import unicode_literals
from ast import literal_eval
import math
import gzip
import json
import plac
from pathlib import Path
@@ -29,8 +30,6 @@ from shutil import copytree
import codecs
from collections import defaultdict
from spacy.en import get_lex_props
from spacy.en.lemmatizer import Lemmatizer
from spacy.vocab import Vocab
from spacy.vocab import write_binary_vectors
from spacy.strings import hash_string
@@ -38,6 +37,13 @@ from preshed.counter import PreshCounter
from spacy.parts_of_speech import NOUN, VERB, ADJ
import spacy.en
import spacy.de
import spacy.fi
import spacy.it
def setup_tokenizer(lang_data_dir, tok_dir):
if not tok_dir.exists():
@@ -139,7 +145,7 @@ def _read_senses(loc):
return lexicon
def setup_vocab(src_dir, dst_dir):
def setup_vocab(get_lex_attr, tag_map, src_dir, dst_dir):
if not dst_dir.exists():
dst_dir.mkdir()
@@ -148,13 +154,13 @@ def setup_vocab(src_dir, dst_dir):
write_binary_vectors(str(vectors_src), str(dst_dir / 'vec.bin'))
else:
print("Warning: Word vectors file not found")
vocab = Vocab(data_dir=None, get_lex_props=get_lex_props)
vocab = Vocab(get_lex_attr=get_lex_attr, tag_map=tag_map)
clusters = _read_clusters(src_dir / 'clusters.txt')
probs, oov_prob = _read_probs(src_dir / 'words.sgt.prob')
if not probs:
probs, oov_prob = _read_freqs(src_dir / 'freqs.txt')
if not probs:
oov_prob = 0.0
oov_prob = -20
else:
oov_prob = min(probs.values())
for word in clusters:
@@ -163,23 +169,32 @@ def setup_vocab(src_dir, dst_dir):
lexicon = []
for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])):
entry = get_lex_props(word)
entry['prob'] = float(prob)
cluster = clusters.get(word, '0')
lexeme = vocab[word]
lexeme.prob = prob
lexeme.is_oov = False
# Decode as a little-endian string, so that we can do & 15 to get
# the first 4 bits. See _parse_features.pyx
entry['cluster'] = int(cluster[::-1], 2)
vocab[word] = entry
if word in clusters:
lexeme.cluster = int(clusters[word][::-1], 2)
else:
lexeme.cluster = 0
vocab.dump(str(dst_dir / 'lexemes.bin'))
vocab.strings.dump(str(dst_dir / 'strings.txt'))
with (dst_dir / 'oov_prob').open('w') as file_:
file_.write('%f' % oov_prob)
def main(lang_data_dir, corpora_dir, model_dir):
def main(lang_id, lang_data_dir, corpora_dir, model_dir):
languages = {
'en': spacy.en.English.default_lex_attrs(),
'de': spacy.de.German.default_lex_attrs(),
'fi': spacy.fi.Finnish.default_lex_attrs(),
'it': spacy.it.Italian.default_lex_attrs(),
}
model_dir = Path(model_dir)
lang_data_dir = Path(lang_data_dir)
corpora_dir = Path(corpora_dir)
lang_data_dir = Path(lang_data_dir) / lang_id
corpora_dir = Path(corpora_dir) / lang_id
assert corpora_dir.exists()
assert lang_data_dir.exists()
@@ -187,9 +202,19 @@ def main(lang_data_dir, corpora_dir, model_dir):
if not model_dir.exists():
model_dir.mkdir()
tag_map = json.load((lang_data_dir / 'tag_map.json').open())
setup_tokenizer(lang_data_dir, model_dir / 'tokenizer')
setup_vocab(corpora_dir, model_dir / 'vocab')
if not (model_dir / 'wordnet').exists():
setup_vocab(languages[lang_id], tag_map, corpora_dir, model_dir / 'vocab')
if (lang_data_dir / 'gazetteer.json').exists():
copyfile(str(lang_data_dir / 'gazetteer.json'),
str(model_dir / 'vocab' / 'gazetteer.json'))
if (lang_data_dir / 'lemma_rules.json').exists():
copyfile(str(lang_data_dir / 'lemma_rules.json'),
str(model_dir / 'vocab' / 'lemma_rules.json'))
if not (model_dir / 'wordnet').exists() and (corpora_dir / 'wordnet').exists():
copytree(str(corpora_dir / 'wordnet' / 'dict'), str(model_dir / 'wordnet'))
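
The hunk above stores each Brown cluster bitstring reversed before parsing it as binary, so that ``cluster & 15`` picks out the first four bits of the cluster path (see the comment referencing _parse_features.pyx). A minimal sketch of that conversion, outside the diff; ``decode_cluster`` and the example strings are illustrative, not part of the script:

.. code:: python

    def decode_cluster(bitstring):
        # Reverse the bitstring before parsing, so the integer's low-order
        # bits correspond to the *first* bits of the Brown cluster path.
        return int(bitstring[::-1], 2) if bitstring else 0

    clusters = {u'apple': u'1010', u'banana': u'1011'}  # hypothetical entries
    print(decode_cluster(clusters[u'apple']) & 15)      # first four path bits -> 5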


@@ -14,7 +14,6 @@ import re
import spacy.util
from spacy.en import English
from spacy.en.pos import POS_TEMPLATES, POS_TAGS, setup_model_dir
from spacy.syntax.util import Config
from spacy.gold import read_json_file
@@ -22,6 +21,11 @@ from spacy.gold import GoldParse
from spacy.scorer import Scorer
from spacy.syntax.arc_eager import ArcEager
from spacy.syntax.ner import BiluoPushDown
from spacy.tagger import Tagger
from spacy.syntax.parser import Parser
def _corrupt(c, noise_level):
if random.random() >= noise_level:
@@ -80,32 +84,28 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
beam_width=1, verbose=False,
use_orig_arc_eager=False):
dep_model_dir = path.join(model_dir, 'deps')
pos_model_dir = path.join(model_dir, 'pos')
ner_model_dir = path.join(model_dir, 'ner')
if path.exists(dep_model_dir):
shutil.rmtree(dep_model_dir)
if path.exists(pos_model_dir):
shutil.rmtree(pos_model_dir)
if path.exists(ner_model_dir):
shutil.rmtree(ner_model_dir)
os.mkdir(dep_model_dir)
os.mkdir(pos_model_dir)
os.mkdir(ner_model_dir)
setup_model_dir(sorted(POS_TAGS.keys()), POS_TAGS, POS_TEMPLATES, pos_model_dir)
Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
labels=Language.ParserTransitionSystem.get_labels(gold_tuples),
labels=ArcEager.get_labels(gold_tuples),
beam_width=beam_width)
Config.write(ner_model_dir, 'config', features='ner', seed=seed,
labels=Language.EntityTransitionSystem.get_labels(gold_tuples),
labels=BiluoPushDown.get_labels(gold_tuples),
beam_width=0)
if n_sents > 0:
gold_tuples = gold_tuples[:n_sents]
nlp = Language(data_dir=model_dir)
nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False)
nlp.tagger = Tagger.blank(nlp.vocab, Tagger.default_templates())
nlp.parser = Parser.from_dir(dep_model_dir, nlp.vocab.strings, ArcEager)
nlp.entity = Parser.from_dir(ner_model_dir, nlp.vocab.strings, BiluoPushDown)
print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %")
for itn in range(n_iter):
scorer = Scorer()
@@ -140,7 +140,7 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
print('%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f,
scorer.tags_acc,
scorer.token_acc))
nlp.end_training()
nlp.end_training(model_dir)
def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False,
beam_width=None):
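
The hunk above replaces the implicit ``Language.ParserTransitionSystem`` / ``Language.EntityTransitionSystem`` wiring with explicitly constructed components. Roughly, the new training setup reads as follows; this is a sketch assembled from the changed lines, and the model directory paths are placeholders:

.. code:: python

    from os import path
    from spacy.en import English
    from spacy.tagger import Tagger
    from spacy.syntax.parser import Parser
    from spacy.syntax.arc_eager import ArcEager
    from spacy.syntax.ner import BiluoPushDown

    model_dir = 'models/en'                     # placeholder paths
    dep_model_dir = path.join(model_dir, 'deps')
    ner_model_dir = path.join(model_dir, 'ner')

    # Start from a pipeline with no statistical components loaded ...
    nlp = English(data_dir=model_dir, tagger=False, parser=False, entity=False)
    # ... then attach a blank tagger and parsers built from the written configs.
    nlp.tagger = Tagger.blank(nlp.vocab, Tagger.default_templates())
    nlp.parser = Parser.from_dir(dep_model_dir, nlp.vocab.strings, ArcEager)
    nlp.entity = Parser.from_dir(ner_model_dir, nlp.vocab.strings, BiluoPushDown)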

bin/tagger/train.py Executable file

@@ -0,0 +1,175 @@
#!/usr/bin/env python
from __future__ import division
from __future__ import unicode_literals
from __future__ import print_function
import os
from os import path
import shutil
import codecs
import random
import plac
import re
import spacy.util
from spacy.en import English
from spacy.tagger import Tagger
from spacy.syntax.util import Config
from spacy.gold import read_json_file
from spacy.gold import GoldParse
from spacy.scorer import Scorer
def score_model(scorer, nlp, raw_text, annot_tuples):
if raw_text is None:
tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
else:
tokens = nlp.tokenizer(raw_text)
nlp.tagger(tokens)
gold = GoldParse(tokens, annot_tuples)
scorer.score(tokens, gold)
def _merge_sents(sents):
m_deps = [[], [], [], [], [], []]
m_brackets = []
i = 0
for (ids, words, tags, heads, labels, ner), brackets in sents:
m_deps[0].extend(id_ + i for id_ in ids)
m_deps[1].extend(words)
m_deps[2].extend(tags)
m_deps[3].extend(head + i for head in heads)
m_deps[4].extend(labels)
m_deps[5].extend(ner)
m_brackets.extend((b['first'] + i, b['last'] + i, b['label']) for b in brackets)
i += len(ids)
return [(m_deps, m_brackets)]
def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
seed=0, gold_preproc=False, n_sents=0, corruption_level=0,
beam_width=1, verbose=False,
use_orig_arc_eager=False):
if n_sents > 0:
gold_tuples = gold_tuples[:n_sents]
templates = Tagger.default_templates()
nlp = Language(data_dir=model_dir, tagger=False)
nlp.tagger = Tagger.blank(nlp.vocab, templates)
print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %")
for itn in range(n_iter):
scorer = Scorer()
loss = 0
for raw_text, sents in gold_tuples:
if gold_preproc:
raw_text = None
else:
sents = _merge_sents(sents)
for annot_tuples, ctnt in sents:
words = annot_tuples[1]
gold_tags = annot_tuples[2]
score_model(scorer, nlp, raw_text, annot_tuples)
if raw_text is None:
tokens = nlp.tokenizer.tokens_from_list(words)
else:
tokens = nlp.tokenizer(raw_text)
loss += nlp.tagger.train(tokens, gold_tags)
random.shuffle(gold_tuples)
print('%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f,
scorer.tags_acc,
scorer.token_acc))
nlp.end_training(model_dir)
def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False,
beam_width=None):
nlp = Language(data_dir=model_dir)
if beam_width is not None:
nlp.parser.cfg.beam_width = beam_width
scorer = Scorer()
for raw_text, sents in gold_tuples:
if gold_preproc:
raw_text = None
else:
sents = _merge_sents(sents)
for annot_tuples, brackets in sents:
if raw_text is None:
tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
nlp.tagger(tokens)
nlp.entity(tokens)
nlp.parser(tokens)
else:
tokens = nlp(raw_text, merge_mwes=False)
gold = GoldParse(tokens, annot_tuples)
scorer.score(tokens, gold, verbose=verbose)
return scorer
def write_parses(Language, dev_loc, model_dir, out_loc, beam_width=None):
nlp = Language(data_dir=model_dir)
if beam_width is not None:
nlp.parser.cfg.beam_width = beam_width
gold_tuples = read_json_file(dev_loc)
scorer = Scorer()
out_file = codecs.open(out_loc, 'w', 'utf8')
for raw_text, sents in gold_tuples:
sents = _merge_sents(sents)
for annot_tuples, brackets in sents:
if raw_text is None:
tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
nlp.tagger(tokens)
nlp.entity(tokens)
nlp.parser(tokens)
else:
tokens = nlp(raw_text, merge_mwes=False)
gold = GoldParse(tokens, annot_tuples)
scorer.score(tokens, gold, verbose=False)
for t in tokens:
out_file.write(
'%s\t%s\t%s\t%s\n' % (t.orth_, t.tag_, t.head.orth_, t.dep_)
)
return scorer
@plac.annotations(
train_loc=("Location of training file or directory"),
dev_loc=("Location of development file or directory"),
model_dir=("Location of output model directory",),
eval_only=("Skip training, and only evaluate", "flag", "e", bool),
corruption_level=("Amount of noise to add to training data", "option", "c", float),
gold_preproc=("Use gold-standard sentence boundaries in training?", "flag", "g", bool),
out_loc=("Out location", "option", "o", str),
n_sents=("Number of training sentences", "option", "n", int),
n_iter=("Number of training iterations", "option", "i", int),
verbose=("Verbose error reporting", "flag", "v", bool),
debug=("Debug mode", "flag", "d", bool),
)
def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False,
debug=False, corruption_level=0.0, gold_preproc=False, eval_only=False):
if not eval_only:
gold_train = list(read_json_file(train_loc))
train(English, gold_train, model_dir,
feat_set='basic' if not debug else 'debug',
gold_preproc=gold_preproc, n_sents=n_sents,
corruption_level=corruption_level, n_iter=n_iter,
verbose=verbose)
#if out_loc:
# write_parses(English, dev_loc, model_dir, out_loc, beam_width=beam_width)
scorer = evaluate(English, list(read_json_file(dev_loc)),
model_dir, gold_preproc=gold_preproc, verbose=verbose)
print('TOK', scorer.token_acc)
print('POS', scorer.tags_acc)
print('UAS', scorer.uas)
print('LAS', scorer.las)
print('NER P', scorer.ents_p)
print('NER R', scorer.ents_r)
print('NER F', scorer.ents_f)
if __name__ == '__main__':
plac.call(main)
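
Because the entry point is wrapped with plac, the script can be driven from the shell or programmatically. A hypothetical invocation, with placeholder paths, might look like this:

.. code:: python

    # Equivalent to: python bin/tagger/train.py train.json dev.json models/tagger -i 10
    plac.call(main, ['train.json', 'dev.json', 'models/tagger', '-i', '10'])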


@@ -1,177 +0,0 @@
# Makefile for Sphinx documentation
#
# You can set these variables from the command line.
SPHINXOPTS =
SPHINXBUILD = sphinx-build
PAPER =
BUILDDIR = ../../docs-spacy
# User-friendly check for sphinx-build
ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
endif
# Internal variables.
PAPEROPT_a4 = -D latex_paper_size=a4
PAPEROPT_letter = -D latex_paper_size=letter
ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source
# the i18n builder cannot share the environment and doctrees with the others
I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source
.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext
help:
@echo "Please use \`make <target>' where <target> is one of"
@echo " html to make standalone HTML files"
@echo " dirhtml to make HTML files named index.html in directories"
@echo " singlehtml to make a single large HTML file"
@echo " pickle to make pickle files"
@echo " json to make JSON files"
@echo " htmlhelp to make HTML files and a HTML help project"
@echo " qthelp to make HTML files and a qthelp project"
@echo " devhelp to make HTML files and a Devhelp project"
@echo " epub to make an epub"
@echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
@echo " latexpdf to make LaTeX files and run them through pdflatex"
@echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
@echo " text to make text files"
@echo " man to make manual pages"
@echo " texinfo to make Texinfo files"
@echo " info to make Texinfo files and run them through makeinfo"
@echo " gettext to make PO message catalogs"
@echo " changes to make an overview of all changed/added/deprecated items"
@echo " xml to make Docutils-native XML files"
@echo " pseudoxml to make pseudoxml-XML files for display purposes"
@echo " linkcheck to check all external links for integrity"
@echo " doctest to run all doctests embedded in the documentation (if enabled)"
clean:
rm -rf $(BUILDDIR)/*
html:
$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
@echo
@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
dirhtml:
$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
@echo
@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
singlehtml:
$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
@echo
@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
pickle:
$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
@echo
@echo "Build finished; now you can process the pickle files."
json:
$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
@echo
@echo "Build finished; now you can process the JSON files."
htmlhelp:
$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
@echo
@echo "Build finished; now you can run HTML Help Workshop with the" \
".hhp project file in $(BUILDDIR)/htmlhelp."
qthelp:
$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
@echo
@echo "Build finished; now you can run "qcollectiongenerator" with the" \
".qhcp project file in $(BUILDDIR)/qthelp, like this:"
@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/spaCy.qhcp"
@echo "To view the help file:"
@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/spaCy.qhc"
devhelp:
$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
@echo
@echo "Build finished."
@echo "To view the help file:"
@echo "# mkdir -p $$HOME/.local/share/devhelp/spaCy"
@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/spaCy"
@echo "# devhelp"
epub:
$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
@echo
@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
latex:
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
@echo
@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
@echo "Run \`make' in that directory to run these through (pdf)latex" \
"(use \`make latexpdf' here to do that automatically)."
latexpdf:
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
@echo "Running LaTeX files through pdflatex..."
$(MAKE) -C $(BUILDDIR)/latex all-pdf
@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
latexpdfja:
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
@echo "Running LaTeX files through platex and dvipdfmx..."
$(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
text:
$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
@echo
@echo "Build finished. The text files are in $(BUILDDIR)/text."
man:
$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
@echo
@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
texinfo:
$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
@echo
@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
@echo "Run \`make' in that directory to run these through makeinfo" \
"(use \`make info' here to do that automatically)."
info:
$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
@echo "Running Texinfo files through makeinfo..."
make -C $(BUILDDIR)/texinfo info
@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
gettext:
$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
@echo
@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
changes:
$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
@echo
@echo "The overview file is in $(BUILDDIR)/changes."
linkcheck:
$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
@echo
@echo "Link check complete; look for any errors in the above output " \
"or in $(BUILDDIR)/linkcheck/output.txt."
doctest:
$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
@echo "Testing of doctests in the sources finished, look at the " \
"results in $(BUILDDIR)/doctest/output.txt."
xml:
$(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
@echo
@echo "Build finished. The XML files are in $(BUILDDIR)/xml."
pseudoxml:
$(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
@echo
@echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."


@@ -1,271 +0,0 @@
# -*- coding: utf-8 -*-
#
# spaCy documentation build configuration file, created by
# sphinx-quickstart on Thu Sep 25 17:47:15 2014.
#
# This file is execfile()d with the current directory set to its
# containing dir.
#
# Note that not all possible configuration values are present in this
# autogenerated file.
#
# All configuration values have a default; values that are commented out
# serve to show the default.
import sys
import os
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#sys.path.insert(0, os.path.abspath('.'))
# -- General configuration ------------------------------------------------
# If your documentation needs a minimal Sphinx version, state it here.
#needs_sphinx = '1.0'
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
'sphinx.ext.autodoc',
'sphinx.ext.doctest',
'sphinx.ext.intersphinx',
'sphinx.ext.viewcode',
'sphinxcontrib.napoleon',
]
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
# The suffix of source filenames.
source_suffix = '.rst'
# The encoding of source files.
#source_encoding = 'utf-8-sig'
# The master toctree document.
master_doc = 'index'
# General information about the project.
project = u'spaCy'
copyright = u'2015, Matthew Honnibal'
# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# The short X.Y version.
version = '0.85'
# The full version, including alpha/beta/rc tags.
release = '0.85'
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#language = None
# There are two options for replacing |today|: either, you set today to some
# non-false value, then it is used:
#today = ''
# Else, today_fmt is used as the format for a strftime call.
#today_fmt = '%B %d, %Y'
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
exclude_patterns = []
# The reST default role (used for this markup: `text`) to use for all
# documents.
#default_role = None
# If true, '()' will be appended to :func: etc. cross-reference text.
#add_function_parentheses = True
# If true, the current module name will be prepended to all description
# unit titles (such as .. function::).
#add_module_names = True
# If true, sectionauthor and moduleauthor directives will be shown in the
# output. They are ignored by default.
#show_authors = False
# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'
# A list of ignored prefixes for module index sorting.
#modindex_common_prefix = []
# If true, keep warnings as "system message" paragraphs in the built documents.
#keep_warnings = False
# -- Options for HTML output ----------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
html_theme = 'sphinx_rtd_theme'
# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
html_theme_options = {
'google_analytics_id': 'UA-58931649-1'
}
# Add any paths that contain custom themes here, relative to this directory.
html_theme_path = ["../_themes"]
# The name for this set of Sphinx documents. If None, it defaults to
# "<project> v<release> documentation".
#html_title = None
# A shorter title for the navigation bar. Default is the same as html_title.
#html_short_title = None
# The name of an image file (relative to this directory) to place at the top
# of the sidebar.
#html_logo = None
# The name of an image file (within the static path) to use as favicon of the
# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
# pixels large.
#html_favicon = None
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
# Add any extra paths that contain custom files (such as robots.txt or
# .htaccess) here, relative to this directory. These files are copied
# directly to the root of the documentation.
#html_extra_path = []
# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
# using the given strftime format.
#html_last_updated_fmt = '%b %d, %Y'
# If true, SmartyPants will be used to convert quotes and dashes to
# typographically correct entities.
#html_use_smartypants = True
# Custom sidebar templates, maps document names to template names.
#html_sidebars = {}
# Additional templates that should be rendered to pages, maps page names to
# template names.
#html_additional_pages = {}
# If false, no module index is generated.
#html_domain_indices = True
# If false, no index is generated.
#html_use_index = True
# If true, the index is split into individual pages for each letter.
#html_split_index = False
# If true, links to the reST sources are added to the pages.
#html_show_sourcelink = True
# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
#html_show_sphinx = True
# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
#html_show_copyright = True
# If true, an OpenSearch description file will be output, and all pages will
# contain a <link> tag referring to it. The value of this option must be the
# base URL from which the finished HTML is served.
#html_use_opensearch = ''
# This is the file name suffix for HTML files (e.g. ".xhtml").
#html_file_suffix = None
# Output file base name for HTML help builder.
htmlhelp_basename = 'spaCydoc'
# -- Options for LaTeX output ---------------------------------------------
latex_elements = {
# The paper size ('letterpaper' or 'a4paper').
#'papersize': 'letterpaper',
# The font size ('10pt', '11pt' or '12pt').
#'pointsize': '10pt',
# Additional stuff for the LaTeX preamble.
#'preamble': '',
}
# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
# author, documentclass [howto, manual, or own class]).
latex_documents = [
('index', 'spaCy.tex', u'spaCy Documentation',
u'Matthew Honnibal', 'manual'),
]
# The name of an image file (relative to this directory) to place at the top of
# the title page.
#latex_logo = None
# For "manual" documents, if this is true, then toplevel headings are parts,
# not chapters.
#latex_use_parts = False
# If true, show page references after internal links.
#latex_show_pagerefs = False
# If true, show URL addresses after external links.
#latex_show_urls = False
# Documents to append as an appendix to all manuals.
#latex_appendices = []
# If false, no module index is generated.
#latex_domain_indices = True
# -- Options for manual page output ---------------------------------------
# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
('index', 'spacy', u'spaCy Documentation',
[u'Matthew Honnibal'], 1)
]
# If true, show URL addresses after external links.
#man_show_urls = False
# -- Options for Texinfo output -------------------------------------------
# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
# dir menu entry, description, category)
texinfo_documents = [
('index', 'spaCy', u'spaCy Documentation',
u'Matthew Honnibal', 'spaCy', 'One line description of project.',
'Miscellaneous'),
]
# Documents to append as an appendix to all manuals.
#texinfo_appendices = []
# If false, no module index is generated.
#texinfo_domain_indices = True
# How to display URL addresses: 'footnote', 'no', or 'inline'.
#texinfo_show_urls = 'footnote'
# If true, do not generate a @detailmenu in the "Top" node's menu.
#texinfo_no_detailmenu = False
# Example configuration for intersphinx: refer to the Python standard library.
intersphinx_mapping = {'http://docs.python.org/': None}


@@ -1,116 +0,0 @@
====================
Annotation Standards
====================
This document describes the target annotations spaCy is trained to predict.
This is currently a work in progress. Please ask questions on the issue tracker,
so that the answers can be integrated here to improve the documentation.
https://github.com/honnibal/spaCy/issues
English
=======
Tokenization
------------
Tokenization standards are based on the OntoNotes 5 corpus.
The tokenizer differs from most by including tokens for significant whitespace.
Any sequence of whitespace characters beyond a single space (' ') is included
as a token. For instance:
>>> from spacy.en import English
>>> nlp = English(parse=False)
>>> tokens = nlp(u'Some\nspaces and\ttab characters')
>>> print [t.orth_ for t in tokens]
[u'Some', u'\n', u'spaces', u' ', u'and', u'\t', u'tab', u'characters']
The whitespace tokens are useful for much the same reason punctuation is --- it's
often an important delimiter in the text. By preserving it in the token output,
we are able to maintain a simple alignment between the tokens and the original
string, and we ensure that the token stream does not lose information.
Sentence boundary detection
---------------------------
Sentence boundaries are calculated from the syntactic parse tree, so features
such as punctuation and capitalisation play an important but non-decisive role
in determining the sentence boundaries. Usually this means that the sentence
boundaries will at least coincide with clause boundaries, even given poorly
punctuated text.
Part-of-speech Tagging
----------------------
The part-of-speech tagger uses the OntoNotes 5 version of the Penn Treebank
tag set. We also map the tags to the simpler Google Universal POS Tag set.
Details here: https://github.com/honnibal/spaCy/blob/master/spacy/en/pos.pyx#L124
Lemmatization
-------------
A "lemma" is the uninflected form of a word. In English, this means:
* Adjectives: The form like "happy", not "happier" or "happiest"
* Adverbs: The form like "badly", not "worse" or "worst"
* Nouns: The form like "dog", not "dogs"; like "child", not "children"
* Verbs: The form like "write", not "writes", "writing", "wrote" or "written"
The lemmatization data is taken from WordNet. However, we also add a special
case for pronouns: all pronouns are lemmatized to the special token -PRON-.
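For example, assuming the `nlp` object from the tokenization example above and correct part-of-speech tags, the lemmas come out roughly as follows:
>>> tokens = nlp(u'I wrote the books')
>>> [t.lemma_ for t in tokens]
[u'-PRON-', u'write', u'the', u'book']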
Syntactic Dependency Parsing
----------------------------
The parser is trained on data produced by the ClearNLP converter. Details of
the annotation scheme can be found here:
http://www.mathcs.emory.edu/~choi/doc/clear-dependency-2012.pdf
Named Entity Recognition
------------------------
+--------------+-----------------------------------------------------+
| PERSON | People, including fictional |
+--------------+-----------------------------------------------------+
| NORP | Nationalities or religious or political groups |
+--------------+-----------------------------------------------------+
| FACILITY | Buildings, airports, highways, bridges, etc. |
+--------------+-----------------------------------------------------+
| ORGANIZATION | Companies, agencies, institutions, etc. |
+--------------+-----------------------------------------------------+
| GPE | Countries, cities, states |
+--------------+-----------------------------------------------------+
| LOCATION | Non-GPE locations, mountain ranges, bodies of water |
+--------------+-----------------------------------------------------+
| PRODUCT | Vehicles, weapons, foods, etc. (Not services) |
+--------------+-----------------------------------------------------+
| EVENT | Named hurricanes, battles, wars, sports events, etc.|
+--------------+-----------------------------------------------------+
| WORK OF ART | Titles of books, songs, etc. |
+--------------+-----------------------------------------------------+
| LAW | Named documents made into laws |
+--------------+-----------------------------------------------------+
| LANGUAGE | Any named language |
+--------------+-----------------------------------------------------+
The following values are also annotated in a style similar to names:
+--------------+---------------------------------------------+
| DATE | Absolute or relative dates or periods |
+--------------+---------------------------------------------+
| TIME | Times smaller than a day |
+--------------+---------------------------------------------+
| PERCENT | Percentage (including “%”) |
+--------------+---------------------------------------------+
| MONEY | Monetary values, including unit |
+--------------+---------------------------------------------+
| QUANTITY | Measurements, as of weight or distance |
+--------------+---------------------------------------------+
| ORDINAL | "first", "second" |
+--------------+---------------------------------------------+
| CARDINAL | Numerals that do not fall under another type|
+--------------+---------------------------------------------+


@@ -1 +0,0 @@


@@ -1,77 +0,0 @@
Lexeme Features
===============
A lexeme is an entry in the lexicon --- the vocabulary --- for a word, punctuation
symbol, whitespace unit, etc. Lexemes come with lots of pre-computed information
that helps you write good feature functions. Features are integer-valued where
possible --- instead of strings, spaCy refers to strings by consecutive ID numbers,
which you can use to look up the string values if necessary.
String features
---------------
+---------+-------------------------------------------------------------------+
| SIC | The word as it appeared in the sentence, unaltered. |
+---------+-------------------------------------------------------------------+
| NORM | For frequent words, case normalization is applied. |
| | Otherwise, back-off to SHAPE. |
+---------+-------------------------------------------------------------------+
| SHAPE | Remap the characters of the word as follows: |
| | |
| | a-z --> x, A-Z --> X, 0-9 --> d, ,.;:"'?!$- --> self, other --> \*|
| | |
| | Trim sequences of length 3+ to 3, e.g |
| | |
| | apples --> xxx, Apples --> Xxxx, app9LES@ --> xxx9XXX* |
+---------+-------------------------------------------------------------------+
| ASCIIED | Use unidecode.unidecode(sic) to approximate the word using the |
| | ascii characters. |
+---------+-------------------------------------------------------------------+
| PREFIX | sic_unicode_string[:1] |
+---------+-------------------------------------------------------------------+
| SUFFIX | sic_unicode_string[-3:] |
+---------+-------------------------------------------------------------------+
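The SHAPE transform described above is easy to reproduce in pure Python. Here is a minimal sketch following the table's mapping (not the library's implementation; the digit example in the table is handled per the stated 0-9 --> d rule):

.. code:: python

    def word_shape(string):
        shape = []
        for c in string:
            if c.islower():
                c = 'x'
            elif c.isupper():
                c = 'X'
            elif c.isdigit():
                c = 'd'
            elif c not in ',.;:"\'?!$-':
                c = '*'
            # Trim runs of the same shape character to length 3.
            if len(shape) >= 3 and shape[-1] == shape[-2] == shape[-3] == c:
                continue
            shape.append(c)
        return ''.join(shape)

    assert word_shape('apples') == 'xxx'
    assert word_shape('Apples') == 'Xxxx'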
Integer features
----------------
+--------------+--------------------------------------------------------------+
| LENGTH | Length of the string, in unicode |
+--------------+--------------------------------------------------------------+
| CLUSTER | Brown cluster |
+--------------+--------------------------------------------------------------+
| POS_TYPE | K-means cluster of word's tag affinities |
+--------------+--------------------------------------------------------------+
| SENSE_TYPE | K-means cluster of word's sense affinities |
+--------------+--------------------------------------------------------------+
Boolean features
----------------
+-------------+--------------------------------------------------------------+
| IS_ALPHA | The result of sic.isalpha() |
+-------------+--------------------------------------------------------------+
| IS_ASCII | Check whether all the word's characters are ascii characters |
+-------------+--------------------------------------------------------------+
| IS_DIGIT | The result of sic.isdigit() |
+-------------+--------------------------------------------------------------+
| IS_LOWER | The result of sic.islower() |
+-------------+--------------------------------------------------------------+
| IS_PUNCT | Check whether all characters are in the class TODO |
+-------------+--------------------------------------------------------------+
| IS_SPACE | The result of sic.isspace() |
+-------------+--------------------------------------------------------------+
| IS_TITLE | The result of sic.istitle() |
+-------------+--------------------------------------------------------------+
| IS_UPPER | The result of sic.isupper() |
+-------------+--------------------------------------------------------------+
| LIKE_URL | Check whether the string looks like it could be a URL. Aims |
| | for low false negative rate. |
+-------------+--------------------------------------------------------------+
| LIKE_NUMBER | Check whether the string looks like it could be a numeric |
| | entity, e.g. 10,000 10th .10 . Skews for low false negative |
| | rate. |
+-------------+--------------------------------------------------------------+
| IN_LIST | Facility for loading arbitrary run-time word lists? |
+-------------+--------------------------------------------------------------+


@@ -1,337 +0,0 @@
{
"id": "wsj_0001",
"paragraphs": [
{
"raw": "Pierre Vinken, 61 years old, will join the board as a nonexecutive director Nov. 29. Mr. Vinken is chairman of Elsevier N.V., the Dutch publishing group.",
"segmented": "Pierre Vinken<SEP>, 61 years old<SEP>, will join the board as a nonexecutive director Nov. 29<SEP>.<SENT>Mr. Vinken is chairman of Elsevier N.V.<SEP>, the Dutch publishing group<SEP>.",
"sents": [
0,
85
],
"tokens": [
{
"dep": "NMOD",
"start": 0,
"head": 7,
"tag": "NNP",
"orth": "Pierre"
},
{
"dep": "SUB",
"start": 7,
"head": 29,
"tag": "NNP",
"orth": "Vinken"
},
{
"dep": "P",
"start": 13,
"head": 7,
"tag": ",",
"orth": ","
},
{
"dep": "NMOD",
"start": 15,
"head": 18,
"tag": "CD",
"orth": "61"
},
{
"dep": "AMOD",
"start": 18,
"head": 24,
"tag": "NNS",
"orth": "years"
},
{
"dep": "NMOD",
"start": 24,
"head": 7,
"tag": "JJ",
"orth": "old"
},
{
"dep": "P",
"start": 27,
"head": 7,
"tag": ",",
"orth": ","
},
{
"dep": "ROOT",
"start": 29,
"head": -1,
"tag": "MD",
"orth": "will"
},
{
"dep": "VC",
"start": 34,
"head": 29,
"tag": "VB",
"orth": "join"
},
{
"dep": "NMOD",
"start": 39,
"head": 43,
"tag": "DT",
"orth": "the"
},
{
"dep": "OBJ",
"start": 43,
"head": 34,
"tag": "NN",
"orth": "board"
},
{
"dep": "VMOD",
"start": 49,
"head": 34,
"tag": "IN",
"orth": "as"
},
{
"dep": "NMOD",
"start": 52,
"head": 67,
"tag": "DT",
"orth": "a"
},
{
"dep": "NMOD",
"start": 54,
"head": 67,
"tag": "JJ",
"orth": "nonexecutive"
},
{
"dep": "PMOD",
"start": 67,
"head": 49,
"tag": "NN",
"orth": "director"
},
{
"dep": "VMOD",
"start": 76,
"head": 34,
"tag": "NNP",
"orth": "Nov."
},
{
"dep": "NMOD",
"start": 81,
"head": 76,
"tag": "CD",
"orth": "29"
},
{
"dep": "P",
"start": 83,
"head": 29,
"tag": ".",
"orth": "."
},
{
"dep": "NMOD",
"start": 85,
"head": 89,
"tag": "NNP",
"orth": "Mr."
},
{
"dep": "SUB",
"start": 89,
"head": 96,
"tag": "NNP",
"orth": "Vinken"
},
{
"dep": "ROOT",
"start": 96,
"head": -1,
"tag": "VBZ",
"orth": "is"
},
{
"dep": "PRD",
"start": 99,
"head": 96,
"tag": "NN",
"orth": "chairman"
},
{
"dep": "NMOD",
"start": 108,
"head": 99,
"tag": "IN",
"orth": "of"
},
{
"dep": "NMOD",
"start": 111,
"head": 120,
"tag": "NNP",
"orth": "Elsevier"
},
{
"dep": "NMOD",
"start": 120,
"head": 147,
"tag": "NNP",
"orth": "N.V."
},
{
"dep": "P",
"start": 124,
"head": 147,
"tag": ",",
"orth": ","
},
{
"dep": "NMOD",
"start": 126,
"head": 147,
"tag": "DT",
"orth": "the"
},
{
"dep": "NMOD",
"start": 130,
"head": 147,
"tag": "NNP",
"orth": "Dutch"
},
{
"dep": "NMOD",
"start": 136,
"head": 147,
"tag": "VBG",
"orth": "publishing"
},
{
"dep": "PMOD",
"start": 147,
"head": 108,
"tag": "NN",
"orth": "group"
},
{
"dep": "P",
"start": 152,
"head": 96,
"tag": ".",
"orth": "."
}
],
"brackets": [
{
"start": 0,
"end": 7,
"label": "NP"
},
{
"start": 15,
"end": 18,
"label": "NP"
},
{
"start": 15,
"end": 24,
"label": "ADJP"
},
{
"start": 0,
"end": 27,
"label": "NP-SBJ"
},
{
"start": 39,
"end": 43,
"label": "NP"
},
{
"start": 52,
"end": 67,
"label": "NP"
},
{
"start": 49,
"end": 67,
"label": "PP-CLR"
},
{
"start": 76,
"end": 81,
"label": "NP-TMP"
},
{
"start": 34,
"end": 81,
"label": "VP"
},
{
"start": 29,
"end": 81,
"label": "VP"
},
{
"start": 0,
"end": 83,
"label": "S"
},
{
"start": 85,
"end": 89,
"label": "NP-SBJ"
},
{
"start": 99,
"end": 99,
"label": "NP"
},
{
"start": 111,
"end": 120,
"label": "NP"
},
{
"start": 126,
"end": 147,
"label": "NP"
},
{
"start": 111,
"end": 147,
"label": "NP"
},
{
"start": 108,
"end": 147,
"label": "PP"
},
{
"start": 99,
"end": 147,
"label": "NP-PRD"
},
{
"start": 96,
"end": 147,
"label": "VP"
},
{
"start": 85,
"end": 152,
"label": "S"
}
]
}
]
}
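
A sketch of how a file in this format might be walked with the standard library; the path is a placeholder, and the real training scripts go through spacy.gold.read_json_file instead:

.. code:: python

    import json

    with open('wsj_0001.json') as file_:
        doc = json.load(file_)
    for paragraph in doc['paragraphs']:
        for token in paragraph['tokens']:
            # 'head' is -1 for the root, otherwise the start offset of the head token.
            print('%s\t%s\t%s\t%s' % (token['orth'], token['tag'],
                                      token['dep'], token['head']))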


@@ -1,262 +0,0 @@
How spaCy Works
===============
The following are some hasty preliminary notes on how spaCy works. The short
story is, there are no new killer algorithms. The way that the tokenizer works
is novel and a bit neat, and the parser has a new feature set, but otherwise
the key algorithms are well known in the recent literature.
Some might also wonder how I get Python code to run so fast. I don't --- spaCy
is written in `Cython`_, an optionally statically-typed language that compiles
to C or C++, which is then loaded as a C extension module.
This makes it `easy to achieve the performance of native C code`_, but allows the
use of Python language features, via the Python C API. The Python unicode
library was particularly useful to me. I think it would have been much more
difficult to write spaCy in another language.
.. _Cython: http://cython.org/
.. _easy to achieve the performance of native C code: https://honnibal.wordpress.com/2014/10/21/writing-c-in-cython/
Tokenizer and Lexicon
---------------------
Tokenization is the task of splitting a string into meaningful pieces, called
tokens, which you can then compute with. In practice, the task is usually to
match the tokenization performed in some treebank, or other corpus. If we want
to apply a tagger, entity recogniser, parser etc, then we want our run-time
text to match the training conventions. If we want to use a model that's been
trained to expect "isn't" to be split into two tokens, ["is", "n't"], then that's
how we need to prepare our data.
In order to train spaCy's models with the best data available, I therefore
tokenize English according to the Penn Treebank scheme. It's not perfect, but
it's what everybody is using, and it's good enough.
What we don't do
################
The Penn Treebank was distributed with a script called tokenizer.sed, which
tokenizes ASCII newswire text roughly according to the Penn Treebank standard.
Almost all tokenizers are based on these regular expressions, with various
updates to account for unicode characters, and the fact that it's no longer
1986 --- today's text has URLs, emails, emoji, etc.
Usually, the resulting regular expressions are applied in multiple passes, which
is quite inefficient. Often no care is taken to preserve indices into the original
string. If you lose these indices, it'll be difficult to calculate mark-up based
on your annotations.
Tokenizer Algorithm
###################
spaCy's tokenizer assumes that no tokens will cross whitespace --- there will
be no multi-word tokens. If we want these, we can post-process the
token-stream later, merging as necessary. This assumption allows us to deal
only with small chunks of text. We can cache the processing of these, and
simplify our expressions somewhat.
Here is what the outer-loop would look like in Python. (You can see the
production implementation, in Cython, here.)
.. code:: python
    cache = {}

    def tokenize(text):
        tokens = []
        for substring in text.split(' '):
            if substring in cache:
                tokens.extend(cache[substring])
            else:
                subtokens = _tokenize_substring(substring)
                tokens.extend(subtokens)
                cache[substring] = subtokens
        return tokens
The actual work is performed in _tokenize_substring. For this, I divide the
tokenization rules into three pieces:
1. A prefixes expression, which matches from the start of the string;
2. A suffixes expression, which matches from the end of the string;
3. A special-cases table, which matches the whole string.
The algorithm then proceeds roughly like this (consider this like pseudo-code;
this was written quickly and has not been executed):
.. code:: python
    # Tokens which can be attached at the beginning or end of another
    prefix_re = _make_re([",", '"', '(', ...])
    suffix_re = _make_re([",", "'", ":", "'s", ...])

    # Contractions etc are simply enumerated, since they're a finite set. We
    # can also specify anything we like here, which is nice --- different data
    # has different quirks, so we want to be able to add ad hoc exceptions.
    special_cases = {
        "can't": ("ca", "n't"),
        "won't": ("wo", "n't"),
        "he'd've": ("he", "'d", "'ve"),
        ...
        ":)": (":)",)  # We can add any arbitrary thing to this list.
    }

    def _tokenize_substring(substring):
        prefixes = []
        suffixes = []
        while substring not in special_cases:
            prefix, substring = _apply_re(substring, prefix_re)
            if prefix:
                prefixes.append(prefix)
            else:
                suffix, substring = _apply_re(substring, suffix_re)
                if suffix:
                    suffixes.append(suffix)
                else:
                    break
        # Sketch of the return step described in the text below: expand any
        # special-case match and stitch the pieces back together in order.
        return prefixes + list(special_cases.get(substring, (substring,))) + suffixes[::-1]
This procedure splits off tokens from the start and end of the string, at each
point checking whether the remaining string is in our special-cases table. If
it is, we stop splitting, and return the tokenization at that point.
The advantage of this design is that the prefixes, suffixes and special-cases
can be declared separately, in easy-to-understand files. If a new entry is
added to the special-cases, you can be sure that it won't have some unforeseen
consequence to a complicated regular-expression grammar.
Coupling the Tokenizer and Lexicon
##################################
As mentioned above, the tokenizer is designed to support easy caching. If all
we were caching were the matched substrings, this would not be so advantageous.
Instead, what we do is create a struct which houses all of our lexical
features, and cache *that*. The tokens are then simply pointers to these rich
lexical types.
In a sample of text, vocabulary size grows exponentially slower than word
count. So any computations we can perform over the vocabulary and apply to the
word count are efficient.
Part-of-speech Tagger
---------------------
.. _how to write a good part of speech tagger: https://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/ .
In 2013, I wrote a blog post describing `how to write a good part of speech
tagger`_.
My recommendation then was to use greedy decoding with the averaged perceptron.
I think this is still the best approach, so it's what I implemented in spaCy.
The tutorial also recommends the use of Brown cluster features, and case
normalization features, as these make the model more robust and domain
independent. spaCy's tagger makes heavy use of these features.
Dependency Parser
-----------------
.. _2014 blog post: https://honnibal.wordpress.com/2013/12/18/a-simple-fast-algorithm-for-natural-language-dependency-parsing/
The parser uses the algorithm described in my `2014 blog post`_.
This algorithm, shift-reduce dependency parsing, is becoming widely adopted due
to its compelling speed/accuracy trade-off.
Some quick details about spaCy's take on this, for those who happen to know
these models well. I'll write up a better description shortly.
1. I use greedy decoding, not beam search;
2. I use the arc-eager transition system;
3. I use the Goldberg and Nivre (2012) dynamic oracle.
4. I use the non-monotonic update from my CoNLL 2013 paper (Honnibal, Goldberg
and Johnson 2013).
So far, this is exactly the configuration from the CoNLL 2013 paper, which
scored 91.0. So how have I gotten it to 92.4? The following tweaks:
1. I use Brown cluster features --- these help a lot;
2. I redesigned the feature set. I've long known that the Zhang and Nivre
(2011) feature set was suboptimal, but a few features don't make a very
compelling publication. Still, they're important.
3. When I do the dynamic oracle training, I also make
the update cost-sensitive: if the oracle determines that the move the parser
took has a cost of N, then the weights for the gold class are incremented by
+N, and the weights for the predicted class are incremented by -N. This
only made a small (0.1-0.2%) difference.
Implementation
##############
I don't do anything algorithmically novel to improve the efficiency of the
parser. However, I was very careful in the implementation.
A greedy shift-reduce parser with a linear model boils down to the following
loop:
.. code:: python
    def parse(words, model, feature_funcs, n_classes):
        state = init_state(words)
        for _ in range(len(words) * 2):
            features = [templ(state) for templ in feature_funcs]
            scores = [0 for _ in range(n_classes)]
            for feat in features:
                weights = model[feat]
                for i, weight in enumerate(weights):
                    scores[i] += weight
            class_, score = max(enumerate(scores), key=lambda item: item[1])
            transition(state, class_)
The parser makes 2N transitions for a sentence of length N. In order to select
the transition, it extracts a vector of K features from the state. Each feature
is used as a key into a hash table managed by the model. The features map to
a vector of weights, of length C. We then dot product the feature weights to the
scores vector we are building for that instance.
The inner-most loop here is not so bad: we only have a few dozen classes, so
it's just a short dot product. Both of the vectors are in the cache, so this
is a snack to a modern CPU.
The bottle-neck in this algorithm is the 2NK look-ups into the hash-table that
we must make, as these almost always have to hit main memory. The feature-set
is enormously large, because all of our features are one-hot boolean
indicators. Some of the features will be common, so they'll lurk around in the
CPU's cache hierarchy. But a lot of them won't be, and accessing main memory
takes a lot of cycles.
.. _Jeff Preshing's excellent post: http://preshing.com/20130107/this-hash-table-is-faster-than-a-judy-array/ .
I used to use the Google dense_hash_map implementation. This seemed a solid
choice: it came from a big brand, it was in C++, and it seemed very
complicated. Later, I read `Jeff Preshing's excellent post`_ on open-addressing
with linear probing.
This really spoke to me. I had assumed that a fast hash table implementation
would necessarily be very complicated, but no --- this is another situation
where the simple strategy wins.
I've packaged my Cython implementation separately from spaCy, in the package
`preshed`_ --- for "pre-hashed", but also as a nod to Preshing. I've also taken
great care over the feature extraction and perceptron code, which I'm distributing
in a package named `thinc`_ (since it's for learning very sparse models with
Cython).
.. _preshed: https://github.com/syllog1sm/preshed
.. _thinc: https://github.com/honnibal/thinc
By the way: from comparing notes with a few people, it seems common to
implement linear models in a way that's suboptimal for multi-class
classification. The mistake is to store in the hash-table one weight per
(feature, class) pair, rather than mapping the feature to a vector of weights,
for all of the classes. This is bad because it means you need to hit the table
C times, one per class, as you always need to evaluate a feature against all of
the classes. In the case of the parser, this means the hash table is accessed
2NKC times, instead of the 2NK times if you have a weights vector. You should
also be careful to store the weights contiguously in memory --- you don't want
a linked list here. I use a block-sparse format, because my problems tend to
have a few dozen classes.
I guess if I had to summarize my experience, I'd say that the efficiency of
these models is really all about the data structures. We want to stay small,
and stay contiguous. Minimize redundancy and minimize pointer chasing.
That's why Cython is so well suited to this: we get to lay out our data
structures, and manage the memory ourselves, with full C-level control.
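
To make the point about weight layout concrete, here is a toy comparison in plain Python dictionaries; it is illustrative only, and ignores the contiguous block-sparse storage discussed above:

.. code:: python

    # Layout 1: one hash lookup per (feature, class) pair -- 2NKC lookups per sentence.
    def score_per_pair(weights, features, n_classes):
        return [sum(weights.get((f, c), 0.0) for f in features)
                for c in range(n_classes)]

    # Layout 2: one lookup per feature, returning a C-length weight vector -- 2NK lookups.
    def score_per_feature(weights, features, n_classes):
        scores = [0.0] * n_classes
        for f in features:
            for c, w in enumerate(weights.get(f, [0.0] * n_classes)):
                scores[c] += w
        return scores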


@@ -1,51 +0,0 @@
.. spaCy documentation master file, created by
sphinx-quickstart on Tue Aug 19 16:27:38 2014.
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
==============================
spaCy: Industrial-strength NLP
==============================
`spaCy`_ is a library for building tomorrow's language technology products.
It's like Stanford's CoreNLP for Python, but with a fundamentally different
objective. While CoreNLP is primarily built for conducting research, spaCy is
designed for application.
If you're a small company doing NLP, I think spaCy will seem like a minor miracle.
It's by far the fastest NLP software ever released.
The full processing pipeline completes in under 50ms per document, including accurate
tagging, entity recognition and parsing. All strings are mapped to integer IDs,
tokens are linked to embedded word representations, and a range of useful features
are pre-calculated and cached. The full analysis can be exported to numpy
arrays, or losslessly serialized into binary data smaller than the raw text.
If none of that made any sense to you, here's the gist of it. Computers don't
understand text. This is unfortunate, because that's what the web almost entirely
consists of. We want to recommend people text based on other text they liked.
We want to shorten text to display it on a mobile screen. We want to aggregate
it, link it, filter it, categorise it, generate it and correct it.
spaCy provides a library of utility functions that help programmers build such
products. It's commercial open source software: you can either use it under
the AGPL, or you can `buy a commercial license`_ for a one-time fee.
.. _spaCy: https://github.com/honnibal/spaCy/
.. _Issue Tracker: https://github.com/honnibal/spaCy/issues
**2015-07-08**: `Version 0.89 released`_
.. _Version 0.89 released: updates.html
.. _buy a commercial license: license.html
.. toctree::
:maxdepth: 4
:hidden:
quickstart.rst
reference/index.rst
license.rst
updates.rst


@@ -1,126 +0,0 @@
=======
License
=======
* Download the `license agreement`_
* Get in touch: matt@spacy.io
.. _license agreement: spacy_trial_free.docx
+------------+-----------+----------+-------------------------------------+
| License | Price | Term | Suitable for |
+============+===========+==========+=====================================+
| Commercial | $5,000 | Life | Production use |
+------------+-----------+----------+-------------------------------------+
| Trial | $0 | 90 days | Evaluation, seed startup |
+------------+-----------+----------+-------------------------------------+
| AGPLv3 | Free | Life | Research, teaching, hobbyists, FOSS |
+------------+-----------+----------+-------------------------------------+
To make spaCy as valuable as possible, licenses to it are for life. You get
complete transparency, certainty and control.
If you need to use spaCy as an API, it's trivial to host it yourself --- and
you don't need to worry about the service changing or disappearing.
And if you're ever in acquisition or IPO talks, the story is simple.
spaCy can also be used as free open-source software, under the Affero GPL
license. If you use it this way, you must comply with the AGPL license terms.
When you distribute your project, or offer it as a network service, you must
distribute the source-code and grant users an AGPL license to it.
.. I left academia in June 2014, just when I should have been submitting my first
grant proposal. Grant writing seemed a bad business model. I wasn't sure
exactly what I would do instead, but I knew that the work I could do was
valuable, and that it would make sense for people to pay me to do it, and that
it's often easy to convince smart people of things that are true.
.. I left because I don't like the grant system. It's not the
best way to create value, and it's not the best way to get paid.
Examples
--------
In order to clarify how spaCy's license structure might apply to you, I've
written a few examples, in the form of user-stories.
Ashley and Casey: Seed stage start-up
#####################################
Ashley and Casey have an idea for a start-up. To explore their idea, they want
to build a minimum viable product they can put in front of potential users and
investors.
They have two options.
1. **Trial commercial license.** With a simple form, they can use spaCy for 90
days, for a nominal fee of $1. They are free to modify spaCy, and they
will own the copyright to their modifications for the duration of the license.
After the trial period elapses, they can either pay the license fee, stop
using spaCy, or release their project under the AGPL.
2. **AGPL.** Ashley and Casey can instead use spaCy under the AGPL license.
However, they must then release any code that statically or dynamically
links to spaCy under the AGPL as well (e.g. if they import the module, or
import a module that imports it, etc). They also cannot use spaCy as
a network resource, by running it as a service --- this is the
loophole that the "A" part of the AGPL is designed to close.
Ashley and Casey find the AGPL license unattractive for commercial use.
They decide to take up the trial commercial license.
However, over the next 90 days, Ashley has to move house twice, and Casey gets
sick. By the time the trial expires, they still don't have a demo they can show
investors. They send an email explaining the situation, and a 90 day extension
to their trial license is granted.
By the time the extension period has elapsed, spaCy has helped them secure
funding, and they even have a little revenue. They are glad to pay the $5,000
commercial license fee.
spaCy is now permanently licensed for the product Ashley and Casey are
developing. They own the copyright to any modifications they make to spaCy,
but not to the original spaCy code.
No additional fees will be due when they hire new developers, run spaCy on
additional internal servers, etc. If their company is acquired, the license will
be transferred to the company acquiring them. However, to use spaCy in another
product, they will have to buy a second license.
Alex and Sasha: University Academics
####################################
Alex and Sasha are post-doctoral researchers working for a university. Part of
their funding comes from a grant from Google, but Google will not own any part
of the work that they produce. Their mission is just to write papers.
Alex and Sasha find spaCy convenient, so they use it in their system under the
AGPL. This means that their system must also be released under the AGPL, but they're
cool with that --- they were going to release their code anyway, as it's the only
way to ensure their experiments are properly repeatable.
Alex and Sasha find and fix a few bugs in spaCy. They must release these
modifications, and they ask that they be accepted into the main spaCy repo.
In order to do this, they must sign a contributor agreement, ceding their
copyright. When commercial licenses to spaCy are sold, Alex and Sasha will
not be able to claim any royalties from their contributions.
Later, Alex and Sasha implement new features into spaCy, for another paper. The
code was quite rushed, and they don't want to take the time to put together a
proper pull request. They must release their modifications under the AGPL, but
they are not obliged to contribute it to the spaCy repository, or concede their
copyright.
Phuong and Jessie: Open Source developers
#########################################
Phuong and Jessie use the open-source software Calibre to manage their e-book
libraries. They have an idea for a search feature, and they want to use spaCy
to implement it. Calibre is released under the GPLv3. The AGPL has additional
restrictions for projects used as a network resource, but they don't apply to
this project, so Phuong and Jessie can use spaCy to improve Calibre. They'll
have to release their code, but that was always their intention anyway.

View File

@ -1,236 +0,0 @@
Quick Start
===========
Install
-------
.. py:currentmodule:: spacy
With Python 2.7 or Python 3, using Linux or OSX, run:
.. code:: bash
$ pip install spacy
$ python -m spacy.en.download
.. _300 mb of data: http://s3-us-west-1.amazonaws.com/media.spacynlp.com/en_data_all-0.4.tgz
The download command fetches and installs about 300mb of data, for the
parser model and word vectors, which it installs within the spacy.en package directory.
If you're stuck using a server with an old version of Python, and you don't
have root access, I've prepared a bootstrap script to help you compile a local
Python install. Run:
.. code:: bash
$ curl https://raw.githubusercontent.com/honnibal/spaCy/master/bootstrap_python_env.sh | bash && source .env/bin/activate
The other way to install the package is to clone the github repository, and
build it from source. This installs an additional dependency, Cython.
If you're using Python 2, I also recommend installing fabric and fabtools ---
this is how I build the project.
.. code:: bash
$ git clone https://github.com/honnibal/spaCy.git
$ cd spaCy
$ virtualenv .env && source .env/bin/activate
$ export PYTHONPATH=`pwd`
$ pip install -r requirements.txt
$ python setup.py build_ext --inplace
$ python -m spacy.en.download
$ pip install pytest
$ py.test tests/
Python packaging is awkward at the best of times, and it's particularly tricky
with C extensions, built via Cython, requiring large data files. So, please
report issues as you encounter them, and bear with me :)
Usage
-----
The main entry-point is :meth:`en.English.__call__`, which accepts a unicode string
as an argument, and returns a :py:class:`tokens.Doc` object. You can
iterate over it to get :py:class:`tokens.Token` objects, which provide
a convenient API:
>>> from __future__ import unicode_literals # If Python 2
>>> from spacy.en import English
>>> nlp = English()
>>> tokens = nlp(u'I ate the pizza with anchovies.')
>>> pizza = tokens[3]
>>> (pizza.orth, pizza.orth_, pizza.head.lemma, pizza.head.lemma_)
... (14702, u'pizza', 14702, u'eat')
spaCy maps all strings to sequential integer IDs --- a common trick in NLP.
If an attribute `Token.foo` is an integer ID, then `Token.foo_` is the string,
e.g. `pizza.orth` and `pizza.orth_` provide the integer ID and the string of
the original orthographic form of the word.
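For example, you can map between the two representations via the string store (a small sketch, reusing the `pizza` token from above; the integer IDs will differ on your installation):

>>> nlp.vocab.strings[u'pizza'] == pizza.orth
True
>>> nlp.vocab.strings[pizza.orth]
u'pizza'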
.. note:: en.English.__call__ is stateful --- it has an important **side-effect**.
When it processes a previously unseen word, it increments the ID counter,
assigns the ID to the string, and writes the mapping in
:py:data:`English.vocab.strings` (instance of
:py:class:`strings.StringStore`).
Future releases will feature a way to reconcile mappings, but for now, you
should only work with one instance of the pipeline at a time.
(Most of the) API at a glance
-----------------------------
**Process the string:**
.. py:class:: spacy.en.English(self, data_dir=join(dirname(__file__), 'data'))
.. py:method:: __call__(self, text: unicode, tag=True, parse=True, entity=True, merge_mwes=False) --> Doc
+-----------------+--------------+--------------+
| Attribute | Type | Its API |
+=================+==============+==============+
| vocab | Vocab | __getitem__ |
+-----------------+--------------+--------------+
| vocab.strings   | StringStore  | __getitem__  |
+-----------------+--------------+--------------+
| tokenizer | Tokenizer | __call__ |
+-----------------+--------------+--------------+
| tagger | EnPosTagger | __call__ |
+-----------------+--------------+--------------+
| parser | GreedyParser | __call__ |
+-----------------+--------------+--------------+
| entity | GreedyParser | __call__ |
+-----------------+--------------+--------------+
**Get dict or numpy array:**
.. py:method:: tokens.Doc.to_array(self, attr_ids: List[int]) --> ndarray[ndim=2, dtype=long]
.. py:method:: tokens.Doc.count_by(self, attr_id: int) --> Dict[int, int]
**Get Token objects**
.. py:method:: tokens.Doc.__getitem__(self, i) --> Token
.. py:method:: tokens.Doc.__iter__(self) --> Iterator[Token]
**Get sentence or named entity spans**
.. py:attribute:: tokens.Doc.sents --> Iterator[Span]
.. py:attribute:: tokens.Doc.ents --> Iterator[Span]
You can iterate over a Span to access individual Token objects, or access its
start, end or label.
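For example (a small sketch --- exactly which sentence and entity spans you get depends on the model's predictions):

>>> tokens = nlp(u'London is a big city in the United Kingdom.')
>>> for sent in tokens.sents:
...     print(sent.string)
>>> for ent in tokens.ents:
...     print(ent.label_, ent.string)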
**Embedded word representations**
.. py:attribute:: tokens.Token.repvec
.. py:attribute:: lexeme.Lexeme.repvec
**Navigate to tree- or string-neighbor tokens**
.. py:method:: nbor(self, i=1) --> Token
.. py:method:: child(self, i=1) --> Token
.. py:method:: sibling(self, i=1) --> Token
.. py:attribute:: head: Token
.. py:attribute:: dep: int
**Align to original string**
.. py:attribute:: string: unicode
Padded with original whitespace.
.. py:attribute:: length: int
Length, in unicode code-points. Equal to len(self.orth_).
.. py:attribute:: idx: int
Starting offset of word in the original string.
Features
--------
**Boolean features**
>>> lexeme = nlp.vocab[u'Apple']
>>> lexeme.is_alpha, lexeme.is_upper
(True, False)
>>> tokens = nlp(u'Apple computers')
>>> tokens[0].is_alpha, tokens[0].is_upper
(True, False)
>>> from spacy.en.attrs import IS_ALPHA, IS_UPPER
>>> tokens.to_array((IS_ALPHA, IS_UPPER))[0]
array([1, 0])
+----------+---------------------------------------------------------------+
| is_alpha | :py:meth:`str.isalpha` |
+----------+---------------------------------------------------------------+
| is_digit | :py:meth:`str.isdigit` |
+----------+---------------------------------------------------------------+
| is_lower | :py:meth:`str.islower` |
+----------+---------------------------------------------------------------+
| is_title | :py:meth:`str.istitle` |
+----------+---------------------------------------------------------------+
| is_upper | :py:meth:`str.isupper` |
+----------+---------------------------------------------------------------+
| is_ascii | all(ord(c) < 128 for c in string) |
+----------+---------------------------------------------------------------+
| is_punct | all(unicodedata.category(c).startswith('P') for c in string) |
+----------+---------------------------------------------------------------+
| like_url | Using various heuristics, does the string resemble a URL? |
+----------+---------------------------------------------------------------+
| like_num | "Two", "10", "1,000", "10.54", "1/2" etc all match |
+----------+---------------------------------------------------------------+
**String-transform Features**
+----------+---------------------------------------------------------------+
| orth | The original string, unmodified. |
+----------+---------------------------------------------------------------+
| lower | The original string, forced to lower-case |
+----------+---------------------------------------------------------------+
| norm | The string after additional normalization |
+----------+---------------------------------------------------------------+
| shape | Word shape, e.g. 10 --> dd, Garden --> Xxxx, Hi!5 --> Xx!d |
+----------+---------------------------------------------------------------+
| prefix | A short slice from the start of the string. |
+----------+---------------------------------------------------------------+
| suffix | A short slice from the end of the string. |
+----------+---------------------------------------------------------------+
| lemma | The word's lemma, i.e. morphological suffixes removed |
+----------+---------------------------------------------------------------+
**Syntactic labels**
+----------+---------------------------------------------------------------+
| pos | The word's part-of-speech, from the Google Universal Tag Set |
+----------+---------------------------------------------------------------+
| tag | A fine-grained morphosyntactic tag, e.g. VBZ, NNS, etc |
+----------+---------------------------------------------------------------+
| dep | Dependency type label between word and its head, e.g. subj |
+----------+---------------------------------------------------------------+
**Distributional**
+---------+-----------------------------------------------------------+
| cluster | Brown cluster ID of the word |
+---------+-----------------------------------------------------------+
| prob | Log probability of word, smoothed with Simple Good-Turing |
+---------+-----------------------------------------------------------+
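Both values are exposed on Lexeme and Token objects. A small sketch (the numbers come from the model data, so they vary between releases):

>>> apple = nlp.vocab[u'apple']
>>> apple.prob      # unigram log-probability, a negative float
>>> apple.cluster   # Brown cluster ID, an integer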

View File

@ -1,116 +0,0 @@
====================
Annotation Standards
====================
This document describes the target annotations spaCy is trained to predict.
This is currently a work in progress. Please ask questions on the issue tracker,
so that the answers can be integrated here to improve the documentation.
https://github.com/honnibal/spaCy/issues
English
=======
Tokenization
------------
Tokenization standards are based on the OntoNotes 5 corpus.
The tokenizer differs from most by including tokens for significant whitespace.
Any sequence of whitespace characters beyond a single space (' ') is included
as a token. For instance:
>>> from spacy.en import English
>>> nlp = English(parse=False)
>>> tokens = nlp(u'Some\nspaces and\ttab characters')
>>> print [t.orth_ for t in tokens]
[u'Some', u'\n', u'spaces', u' ', u'and', u'\t', u'tab', u'characters']
The whitespace tokens are useful for much the same reason punctuation is --- it's
often an important delimiter in the text. By preserving it in the token output,
we are able to maintain a simple alignment between the tokens and the original
string, and we ensure that the token stream does not lose information.
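As a rough sketch of what this buys you (assuming, as documented for the Token object, that each token's `string` attribute is padded with its trailing whitespace):

>>> text = u'Some\nspaces and\ttab characters'
>>> tokens = nlp(text)
>>> u''.join(t.string for t in tokens) == text
True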
Sentence boundary detection
---------------------------
Sentence boundaries are calculated from the syntactic parse tree, so features
such as punctuation and capitalisation play an important but non-decisive role
in determining the sentence boundaries. Usually this means that the sentence
boundaries will at least coincide with clause boundaries, even given poorly
punctuated text.
Part-of-speech Tagging
----------------------
The part-of-speech tagger uses the OntoNotes 5 version of the Penn Treebank
tag set. We also map the tags to the simpler Google Universal POS Tag set.
Details here: https://github.com/honnibal/spaCy/blob/master/spacy/en/pos.pyx#L124
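Both tag sets are available on the token, via the `tag` and `pos` attributes (and their string views `tag_` and `pos_`). A small sketch --- treat the exact values as illustrative:

>>> tokens = nlp(u'Give it back')
>>> tokens[0].tag_   # fine-grained Treebank tag, e.g. u'VB'
>>> tokens[0].pos_   # coarse Universal tag, e.g. u'VERB'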
Lemmatization
-------------
A "lemma" is the uninflected form of a word. In English, this means:
* Adjectives: The form like "happy", not "happier" or "happiest"
* Adverbs: The form like "badly", not "worse" or "worst"
* Nouns: The form like "dog", not "dogs"; like "child", not "children"
* Verbs: The form like "write", not "writes", "writing", "wrote" or "written"
The lemmatization data is taken from WordNet. However, we also add a special
case for pronouns: all pronouns are lemmatized to the special token -PRON-.
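For example (a small sketch --- lemmas depend on the tagger's predictions, so verify the output on your installation):

>>> tokens = nlp(u'his written words')
>>> [t.lemma_ for t in tokens]   # expected: [u'-PRON-', u'write', u'word']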
Syntactic Dependency Parsing
----------------------------
The parser is trained on data produced by the ClearNLP converter. Details of
the annotation scheme can be found here:
http://www.mathcs.emory.edu/~choi/doc/clear-dependency-2012.pdf
Named Entity Recognition
------------------------
+--------------+-----------------------------------------------------+
| PERSON | People, including fictional |
+--------------+-----------------------------------------------------+
| NORP | Nationalities or religious or political groups |
+--------------+-----------------------------------------------------+
| FACILITY | Buildings, airports, highways, bridges, etc. |
+--------------+-----------------------------------------------------+
| ORGANIZATION | Companies, agencies, institutions, etc. |
+--------------+-----------------------------------------------------+
| GPE | Countries, cities, states |
+--------------+-----------------------------------------------------+
| LOCATION | Non-GPE locations, mountain ranges, bodies of water |
+--------------+-----------------------------------------------------+
| PRODUCT | Vehicles, weapons, foods, etc. (Not services) |
+--------------+-----------------------------------------------------+
| EVENT | Named hurricanes, battles, wars, sports events, etc.|
+--------------+-----------------------------------------------------+
| WORK OF ART | Titles of books, songs, etc. |
+--------------+-----------------------------------------------------+
| LAW | Named documents made into laws |
+--------------+-----------------------------------------------------+
| LANGUAGE | Any named language |
+--------------+-----------------------------------------------------+
The following values are also annotated in a style similar to names:
+--------------+---------------------------------------------+
| DATE | Absolute or relative dates or periods |
+--------------+---------------------------------------------+
| TIME | Times smaller than a day |
+--------------+---------------------------------------------+
| PERCENT | Percentage (including “%”) |
+--------------+---------------------------------------------+
| MONEY | Monetary values, including unit |
+--------------+---------------------------------------------+
| QUANTITY | Measurements, as of weight or distance |
+--------------+---------------------------------------------+
| ORDINAL | "first", "second" |
+--------------+---------------------------------------------+
| CARDINAL | Numerals that do not fall under another type|
+--------------+---------------------------------------------+
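Entity annotations are exposed as spans, via `Doc.ents`. A small sketch (which entities are found, and with which labels, depends on the model):

>>> tokens = nlp(u'Ada Lovelace was born in London in 1815.')
>>> [(ent.label_, ent.string) for ent in tokens.ents]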

View File

@ -1,112 +0,0 @@
=============
Documentation
=============
The table below shows every class in spaCy: a link to its documentation, implementation,
and a small usage snippet.
+----------------+--------------------------+--------------------------------+
| Class name     | Usage                    | Implementation                 |
+================+==========================+================================+
| `English`_ | doc = English() | `spacy/en/__init__.py`_ |
+----------------+--------------------------+--------------------------------+
| Data objects |
+----------------+--------------------------+--------------------------------+
| `Doc`_ | doc = nlp(text) | `spacy/doc.pyx`_ |
+----------------+--------------------------+--------------------------------+
| `Token`_ | token = doc[10] | `spacy/token.pyx`_ |
+----------------+--------------------------+--------------------------------+
| `Span`_ | sent = doc.sents.next() | `spacy/span.pyx`_ |
+----------------+--------------------------+--------------------------------+
| `Lexeme`_ | lex = nlp.vocab[u'word'] | `spacy/lexeme.pyx`_ |
+----------------+--------------------------+--------------------------------+
| Lookup tables |
+----------------+--------------------------+--------------------------------+
| `Vocab`_ | nlp.vocab | `spacy/vocab.pyx`_ |
+----------------+--------------------------+--------------------------------+
| `StringStore`_ | nlp.vocab.strings | `spacy/strings.pyx`_ |
+----------------+--------------------------+--------------------------------+
| Processing modules |
+----------------+--------------------------+--------------------------------+
| `Tokenizer`_ | nlp.tokenizer | `spacy/tokenizer.pyx`_ |
+----------------+--------------------------+--------------------------------+
| `EnPosTagger`_ | nlp.tagger | `spacy/en/pos.pyx`_ |
+----------------+--------------------------+--------------------------------+
| `Parser`_ | nlp.parser | `spacy/syntax/parser.pyx`_ |
+----------------+--------------------------+--------------------------------+
| Parser internals |
+----------------+--------------------------+--------------------------------+
| ArcEager | | spacy/syntax/arc_eager.pyx |
+----------------+--------------------------+--------------------------------+
| BiluoPushDown | | spacy/syntax/ner.pyx |
+----------------+--------------------------+--------------------------------+
| StateClass | | spacy/syntax/stateclass.pyx |
+----------------+--------------------------+--------------------------------+
| Research Utilities |
+----------------+--------------------------+--------------------------------+
| `GoldParse`_ | | `spacy/gold.pyx`_ |
+----------------+--------------------------+--------------------------------+
| `Scorer`_ | | `spacy/scorer.py`_ |
+----------------+--------------------------+--------------------------------+
.. toctree::
:maxdepth: 4
loading.rst
processing.rst
using/document.rst
using/span.rst
using/token.rst
using/lexeme.rst
.. _English: processing.html
.. _Doc: using/doc.html
.. _Token: using/token.html
.. _Span: using/span.html
.. _Lexeme: using/lexeme.html
.. _Vocab: lookup.html
.. _StringStore: lookup.html
.. _Tokenizer: processing.html
.. _EnPosTagger: processing.html
.. _Parser: processing.html
.. _Scorer: misc.html
.. _GoldParse: misc.html
.. _spacy/en/__init__.py: https://github.com/honnibal/spaCy/tree/master/spacy/en/__init__.py
.. _spacy/doc.pyx: https://github.com/honnibal/spaCy/tree/master/spacy/tokens.pyx
.. _spacy/token.pyx: https://github.com/honnibal/spaCy/tree/master/spacy/tokens.pyx
.. _spacy/span.pyx: https://github.com/honnibal/spaCy/tree/master/spacy/spans.pyx
.. _spacy/vocab.pyx: https://github.com/honnibal/spaCy/tree/master/spacy/vocab.pyx
.. _spacy/strings.pyx: https://github.com/honnibal/spaCy/tree/master/spacy/strings.pyx
.. _spacy/tokenizer.pyx: https://github.com/honnibal/spaCy/tree/master/spacy/tokenizer.pyx
.. _spacy/en/pos.pyx: https://github.com/honnibal/spaCy/tree/master/spacy/en/pos.pyx
.. _spacy/syntax/parser.pyx: https://github.com/honnibal/spaCy/tree/master/spacy/syntax/parser.pyx
.. _spacy/lexeme.pyx: https://github.com/honnibal/spaCy/tree/master/spacy/lexeme.pyx
.. _spacy/gold.pyx: https://github.com/honnibal/spaCy/tree/master/spacy/gold.pyx
.. _spacy/scorer.py: https://github.com/honnibal/spaCy/tree/master/spacy/scorer.py

View File

@ -1,62 +0,0 @@
=================
Loading Resources
=================
99\% of the time, you will load spaCy's resources using a language pipeline class,
e.g. `spacy.en.English`. The pipeline class reads the data from disk, from a
specified directory. By default, spaCy installs data into each language's
package directory, and loads it from there.
Usually, this is all you will need:
>>> from spacy.en import English
>>> nlp = English()
If you need to replace some of the components, you may want to just make your
own pipeline class --- the English class itself does almost no work; it just
applies the modules in order. You can also provide a function or class that
produces a tokenizer, tagger, parser or entity recognizer to :code:`English.__init__`,
to customize the pipeline:
>>> from spacy.en import English
>>> from my_module import MyTagger
>>> nlp = English(Tagger=MyTagger)
In more detail:
.. code::
class English(object):
def __init__(self,
data_dir=path.join(path.dirname(__file__), 'data'),
Tokenizer=Tokenizer.from_dir,
Tagger=EnPosTagger,
Parser=CreateParser(ArcEager),
Entity=CreateParser(BiluoNER),
load_vectors=True
):
:code:`data_dir`
:code:`unicode path`
The data directory. May be None, to disable any data loading (including
the vocabulary).
:code:`Tokenizer`
:code:`(Vocab vocab, unicode data_dir)(unicode) --> Doc`
A class/function that creates the tokenizer.
:code:`Tagger` / :code:`Parser` / :code:`Entity`
:code:`(Vocab vocab, unicode data_dir)(Doc) --> None`
A class/function that creates the part-of-speech tagger /
syntactic dependency parser / named entity recogniser.
May be None or False, to disable tagging.
:code:`load_vectors`
:code:`bool`
A boolean value to control whether the word vectors are loaded.
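For example, here is a sketch of a slimmed-down pipeline that skips the word vectors and the dependency parser, using the keyword arguments shown in the signature above:

>>> from spacy.en import English
>>> nlp = English(load_vectors=False, Parser=None)
>>> tokens = nlp(u'This text is still tokenized and tagged.')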

View File

@ -1,49 +0,0 @@
Lexical Lookup
--------------
Where possible, spaCy computes information over lexical *types*, rather than
*tokens*. If you process a large batch of text, the number of unique types
you see grows far more slowly than the number of tokens --- so
it's much more efficient to compute over types. And, in small samples, we generally
want to know about the distribution of a word in the language at large ---
which again, is type-based information.
You can access the lexical features via the Token object, but you can also look them
up in the vocabulary directly:
>>> from spacy.en import English
>>> nlp = English()
>>> lexeme = nlp.vocab[u'Amazon']
.. py:class:: vocab.Vocab(self, data_dir=None, lex_props_getter=None)
.. py:method:: __len__(self) --> int
.. py:method:: __getitem__(self, id: int) --> unicode
.. py:method:: __getitem__(self, string: unicode) --> int
.. py:method:: __setitem__(self, py_str: unicode, props: Dict[str, Union[int, float]]) --> None
.. py:method:: dump(self, loc: unicode) --> None
.. py:method:: load_lexemes(self, loc: unicode) --> None
.. py:method:: load_vectors(self, loc: unicode) --> None
.. py:class:: strings.StringStore(self)
.. py:method:: __len__(self) --> int
.. py:method:: __getitem__(self, id: int) --> unicode
.. py:method:: __getitem__(self, string: bytes) --> id
.. py:method:: __getitem__(self, string: unicode) --> id
.. py:method:: dump(self, loc: unicode) --> None
.. py:method:: load(self, loc: unicode) --> None
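A small sketch tying the two together --- lexeme lookup by string, and the string store's two-way mapping between strings and integer IDs:

>>> from spacy.en import English
>>> nlp = English()
>>> nlp.vocab[u'Amazon'].is_title
True
>>> word_id = nlp.vocab.strings[u'Amazon']
>>> nlp.vocab.strings[word_id]
u'Amazon'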

View File

@ -1,67 +0,0 @@
===============
Processing Text
===============
The text processing API is very small and simple. Everything is a callable object,
and you will almost always apply the pipeline all at once.
Applying a pipeline
-------------------
.. py:method:: English.__call__(text, tag=True, parse=True, entity=True) --> Doc
text (unicode)
The text to be processed. No pre-processing needs to be applied, and any
length of text can be submitted. Usually you will submit a whole document.
Text may be zero-length. An exception is raised if byte strings are supplied.
tag (bool)
Whether to apply the part-of-speech tagger. Required for parsing and entity recognition.
parse (bool)
Whether to apply the syntactic dependency parser.
entity (bool)
Whether to apply the named entity recognizer.
**Examples**
>>> from spacy.en import English
>>> nlp = English()
>>> doc = nlp(u'Some text.') # Applies tagger, parser, entity
>>> doc = nlp(u'Some text.', parse=False) # Applies tagger and entity, not parser
>>> doc = nlp(u'Some text.', entity=False) # Applies tagger and parser, not entity
>>> doc = nlp(u'Some text.', tag=False) # Does not apply tagger, entity or parser
>>> doc = nlp(u'') # Zero-length tokens, not an error
>>> doc = nlp(b'Some text') # Error: need unicode
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "spacy/en/__init__.py", line 128, in __call__
tokens = self.tokenizer(text)
TypeError: Argument 'string' has incorrect type (expected unicode, got str)
>>> doc = nlp(b'Some text'.decode('utf8')) # Decode to unicode first.
>>>
Tokenizer
---------
.. autoclass:: spacy.tokenizer.Tokenizer
:members:
Tagger
------
.. autoclass:: spacy.en.pos.EnPosTagger
:members:
Parser and Entity Recognizer
----------------------------
.. autoclass:: spacy.syntax.parser.Parser
:members:

View File

@ -1,100 +0,0 @@
==============
The Doc Object
==============
.. autoclass:: spacy.tokens.Tokens
:code:`__getitem__`, :code:`__iter__`, :code:`__len__`
The Tokens class behaves as a Python sequence, supporting the usual operators,
len(), etc. Negative indexing is supported. Slices are supported as of v0.89
.. code::
>>> tokens = nlp(u'Zero one two three four five six')
>>> tokens[0].orth_
u'Zero'
>>> tokens[-1].orth_
u'six'
>>> span = tokens[0:4]
>>> [w.orth_ for w in span]
[u'Zero', u'one', u'two', u'three']
>>> span.string
u'Zero one two three'
:code:`sents`
Iterate over sentences in the document. Each sentence is a Span object.
:code:`ents`
Iterate over entities in the document. Each entity is a Span object.
:code:`to_array`
Given a list of M attribute IDs, export the tokens to a numpy ndarray
of shape N*M, where N is the number of tokens in the document.
Arguments:
attr_ids (list[int]): A list of attribute ID ints.
Returns:
feat_array (numpy.ndarray[long, ndim=2]):
A feature matrix, with one row per word, and one column per attribute
indicated in the input attr_ids.
:code:`count_by`
Produce a dict of {attribute (int): count (ints)} frequencies, keyed
by the values of the given attribute ID.
>>> from spacy.en import English, attrs
>>> nlp = English()
>>> tokens = nlp(u'apple apple orange banana')
>>> tokens.count_by(attrs.ORTH)
{12800L: 1, 11880L: 2, 7561L: 1}
>>> tokens.to_array([attrs.ORTH])
array([[11880],
[11880],
[ 7561],
[12800]])
:code:`merge`
Merge a multi-word expression into a single token. Currently
experimental; API is likely to change.
:code:`to_bytes()`
Get a byte-string representation of the document, i.e. serialize.
:code:`from_bytes(self, byte_string)`
Load data from a byte-string, i.e. deserialize
:code:`Doc.read_bytes`
A staticmethod, used to read bytes from a file.
Example of serialization:
::
doc1 = EN(u'This is a simple test. With a couple of sentences.')
doc2 = EN(u'This is another test document.')
with open('/tmp/spacy_docs.bin', 'wb') as file_:
file_.write(doc1.to_bytes())
file_.write(doc2.to_bytes())
with open('/tmp/spacy_docs.bin', 'rb') as file_:
bytes1, bytes2 = Doc.read_bytes(file_)
r1 = Doc(EN.vocab).from_bytes(bytes1)
r2 = Doc(EN.vocab).from_bytes(bytes2)
assert r1.string == doc1.string
assert r2.string == doc2.string
Internals
A Tokens instance stores the annotations in a C-array of `TokenC` structs.
Each TokenC struct holds a const pointer to a LexemeC struct, which describes
a vocabulary item.
The Token objects are built lazily, from this underlying C-data.
For faster access, the underlying C data can be accessed from Cython. You
can also export the data to a numpy array, via `Tokens.to_array`, if pure Python
access is required, and you need slightly better performance.

View File

@ -1,11 +0,0 @@
==================
Annotation Objects
==================
.. toctree::
:maxdepth: 3
document.rst
token.rst
span.rst

View File

@ -1,32 +0,0 @@
===============
The Span Object
===============
.. autoclass:: spacy.spans.Span
:code:`__getitem__`, :code:`__iter__`, :code:`__len__`
Sequence API
:code:`head`
Syntactic head, or None
:code:`left`
Tokens to the left of the span
:code:`rights`
Tokens to the right of the span
:code:`orth` / :code:`orth_`
Orth string
:code:`lemma` / :code:`lemma_`
Lemma string
:code:`string`
String
:code:`label` / :code:`label_`
Label
:code:`subtree`
Lefts + [self] + Rights
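A short sketch of typical usage (slicing a Doc returns a Span, as of v0.89):

>>> tokens = nlp(u'I like green eggs and ham.')
>>> span = tokens[2:5]
>>> [w.orth_ for w in span]
[u'green', u'eggs', u'and']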

View File

@ -1,166 +0,0 @@
================
The Token Object
================
A Token represents a single word, punctuation or significant whitespace symbol.
Integer IDs are provided for all string features. The (unicode) string is
provided by an attribute of the same name followed by an underscore, e.g.
token.orth is an integer ID, token.orth\_ is the unicode value.
The only exception is the Token.string attribute, which is (unicode)
string-typed.
**String Features**
:code:`orth` / :code:`orth_`
The form of the word with no string normalization or processing, as it
appears in the string, without trailing whitespace.
:code:`lemma` / :code:`lemma_`
The "base" of the word, with no inflectional suffixes, e.g. the lemma of
"developing" is "develop", the lemma of "geese" is "goose", etc. Note that
*derivational* suffixes are not stripped, e.g. the lemma of "institutions"
is "institution", not "institute". Lemmatization is performed using the
WordNet data, but extended to also cover closed-class words such as
pronouns. By default, the WN lemmatizer returns "hi" as the lemma of "his".
We assign pronouns the lemma -PRON-.
:code:`lower` / :code:`lower_`
The form of the word, but forced to lower-case, i.e. lower = word.orth\_.lower()
:code:`norm` / :code:`norm_`
The form of the word, after language-specific normalizations have been
applied.
:code:`shape` / :code:`shape_`
A transform of the word's string, to show orthographic features. The
characters a-z are mapped to x, A-Z is mapped to X, 0-9 is mapped to d.
After these mappings, sequences of 4 or more of the same character are
truncated to length 4. Examples: C3Po --> XdXx, favorite --> xxxx,
:) --> :)
:code:`prefix` / :code:`prefix_`
A length-N substring from the start of the word. Length may vary by
language; currently for English n=1, i.e. prefix = word.orth\_[:1]
:code:`suffix` / :code:`suffix_`
A length-N substring from the end of the word. Length may vary by
language; currently for English n=3, i.e. suffix = word.orth\_[-3:]
:code:`string`
The form of the word as it appears in the string, **including trailing
whitespace**. This is useful when you need to use linguistic features to
add inline mark-up to the string.
**Boolean Features**
:code:`is_oov`
Is the word out-of-vocabulary?
:code:`is_alpha`
Equivalent to `word.orth_.isalpha()`
:code:`is_ascii`
Equivalent to `all(ord(c) < 128 for c in word.orth_)`
:code:`is_digit`
Equivalent to `word.orth_.isdigit()`
:code:`is_lower`
Equivalent to `word.orth_.islower()`
:code:`is_title`
Equivalent to `word.orth_.istitle()`
:code:`is_punct`
Equivalent to `all(unicodedata.category(c).startswith('P') for c in word.orth_)`
:code:`is_space`
Equivalent to `word.orth_.isspace()`
:code:`like_url`
Does the word resemble a URL?
:code:`like_num`
Does the word represent a number? e.g. "10.9", "10", "ten", etc
:code:`like_email`
Does the word resemble an email?
**Distributional Features**
:code:`prob`
The unigram log-probability of the word, estimated from counts from a
large corpus, smoothed using Simple Good Turing estimation.
:code:`cluster`
The Brown cluster ID of the word. These are often useful features for
linear models. If you're using a non-linear model, particularly
a neural net or random forest, consider using the real-valued word
representation vector, in Token.repvec, instead.
:code:`repvec`
A "word embedding" representation: a dense real-valued vector that supports
similarity queries between words. By default, spaCy currently loads
vectors produced by the Levy and Goldberg (2014) dependency-based word2vec
model.
**Syntactic Features**
:code:`tag`
A morphosyntactic tag, e.g. NN, VBZ, DT, etc. These tags are
language/corpus specific, and typically describe part-of-speech and some
amount of morphological information. For instance, in the Penn Treebank
tag set, VBZ is assigned to a present-tense singular verb.
:code:`pos`
A part-of-speech tag, from the Google Universal Tag Set, e.g. NOUN, VERB,
ADV. Constants for the 17 tag values are provided in spacy.parts\_of\_speech.
:code:`dep`
The type of syntactic dependency relation between the word and its
syntactic head.
:code:`n_lefts`
The number of immediate syntactic children preceding the word in the
string.
:code:`n_rights`
The number of immediate syntactic children following the word in the
string.
**Navigating the Dependency Tree**
:code:`head`
The Token that is the immediate syntactic head of the word. If the word is
the root of the dependency tree, the same word is returned.
:code:`lefts`
An iterator for the immediate leftward syntactic children of the word.
:code:`rights`
An iterator for the immediate rightward syntactic children of the word.
:code:`children`
An iterator that yields from lefts, and then yields from rights.
:code:`subtree`
An iterator for the part of the sentence syntactically governed by the
word, including the word itself.
:code:`left_edge`
The leftmost descendant of the word's subtree. Equivalent to `list(word.subtree)[0]`
:code:`right_edge`
The rightmost descendant of the word's subtree. Equivalent to `list(word.subtree)[-1]`
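A short sketch of navigating the tree (the exact output depends on the parse the model assigns):

>>> tokens = nlp(u'The cat sat on the mat.')
>>> cat = tokens[1]
>>> cat.head.orth_                    # e.g. u'sat'
>>> [w.orth_ for w in cat.subtree]    # e.g. [u'The', u'cat']
>>> [w.orth_ for w in cat.lefts]      # e.g. [u'The']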
**Named Entities**
:code:`ent_type`
If the token is part of an entity, its entity type
:code:`ent_iob`
The IOB (inside, outside, begin) entity recognition tag for the token

View File

@ -1,280 +0,0 @@
===================================
Tutorial: Extractive Summarization
===================================
This tutorial will go through the implementation of several extractive
summarization models with spaCy.
An *extractive* summarization system is a filter over the original document/s:
most of the text is removed, and the remaining text is formatted as a summary.
In contrast, an *abstractive* summarization system generates new text.
Application Context
-------------------
Extractive summarization systems need an application context. We can't ask how
to design the system without some concept of what sort of summary will be
useful for a given application. (Contrast with speech recognition, where
a notion of "correct" is much less application-sensitive.)
For this, I've adopted the application context that `Flipboard`_ discuss in a
recent blog post: they want to display lead-text to readers on mobile devices,
so that readers can easily choose interesting links.
I've chosen this application context for two reasons. First, `Flipboard`_ say
they're putting something like this into production. Second, there's a ready
source of evaluation data. We can look at the lead-text that human editors
have chosen, and evaluate whether our automatic system chooses similar text.
Experimental Setup
------------------
Instead of scraping data, I'm using articles from the New York Times Annotated
Corpus, which is a handy dump of XML-annotated articles distributed by the LDC.
The annotations come with a field named "online lead paragraph". Our
summarization systems will be evaluated on their Rouge-1 overlap with this
field.
Further details of the experimental setup can be found in the appendices.
.. _newyorktimes.com: http://newyorktimes.com
.. _Flipboard: http://engineering.flipboard.com/2014/10/summarization/
.. _vector-space model: https://en.wikipedia.org/wiki/Vector_space_model
.. _LexRank algorithm: https://www.cs.cmu.edu/afs/cs/project/jair/pub/volume22/erkan04a-html/erkan04a.html
.. _PageRank: https://en.wikipedia.org/wiki/PageRank
Summarizer API
--------------
Each summarization model will have the following API:
.. py:func:`summarize(nlp: spacy.en.English, headline: unicode, paragraphs: List[unicode], target_length: int) --> summary: unicode`
We receive the headline and a list of paragraphs, and a target length. We have
to produce a block of text where len(text) < target_length. We want summaries
that users will click-on, and not bounce back out of. Long-term, we want
summaries that would keep people using the app.
Baselines: Truncate
-------------------
.. code:: python
def truncate_chars(nlp, headline, paragraphs, target_length):
text = ' '.join(paragraphs)
return text[:target_length - 3] + '...'
def truncate_words(nlp, headline, paragraphs, target_length):
    text = ' '.join(paragraphs)
    tokens = text.split()
    n_words = 0
    n_chars = 0
    while n_words < len(tokens) and n_chars < target_length - 3:
        n_chars += len(tokens[n_words])
        n_chars += 1 # Space
        n_words += 1
    return ' '.join(tokens[:n_words]) + '...'
def truncate_sentences(nlp, headline, paragraphs, target_length):
    summary = ''
    for para in paragraphs:
        tokens = nlp(para)
        for sentence in tokens.sentences():
            # Compare lengths in characters, not tokens
            if len(summary) + len(sentence.string) >= target_length:
                return summary
            summary += sentence.string
    return summary
I'd be surprised if Flipboard never had something like this in production. Details
like lead-text take a while to float up the priority list. This strategy also has
the advantage of transparency: it's obvious to users how the decision is being
made, so nobody is likely to complain about the feature if it works this way.
Instead of cutting off the text mid-word, we can tokenize the text and truncate
at a token boundary instead --- or, using the sentence segmentation, at a
sentence boundary. The three baselines compare as follows:
+----------------+-----------+
| System | Rouge-1 R |
+----------------+-----------+
| Truncate chars | 69.3 |
+----------------+-----------+
| Truncate words | 69.8 |
+----------------+-----------+
| Truncate sents | 48.5 |
+----------------+-----------+
Sentence Vectors
----------------
A simple bag-of-words model can be created using the `count_by` method, which
produces a dictionary of frequencies, keyed by string IDs:
.. code:: python
>>> from spacy.en import English
>>> from spacy.en.attrs import SIC
>>> nlp = English()
>>> tokens = nlp(u'a a a. b b b b.')
>>> tokens.count_by(SIC)
{41L: 4, 11L: 3, 5L: 2}
>>> [s.count_by(SIC) for s in tokens.sentences()]
[{11L: 3, 5L: 1}, {41L: 4, 5L: 1}]
Similar functionality is provided by `scikit-learn`_, but with a different
style of API design. With spaCy, functions generally have more limited
responsibility. The advantage of this is that spaCy's APIs are much simpler,
and it's often easier to compose functions in a more flexible way.
One particularly powerful feature of spaCy is its support for
`word embeddings`_ --- the dense vectors introduced by deep learning models, and
now commonly produced by `word2vec`_ and related systems.
Once a set of word embeddings has been installed, the vectors are available
from any token:
>>> from spacy.en import English
>>> from spacy.en.attrs import SIC
>>> from scipy.spatial.distance import cosine
>>> nlp = English()
>>> tokens = nlp(u'Apple banana Batman hero')
>>> cosine(tokens[0].vec, tokens[1].vec)
.. _word embeddings: https://colah.github.io/posts/2014-07-NLP-RNNs-Representations/
.. _word2vec: https://code.google.com/p/word2vec/
.. code:: python
def main(db_loc, stops_loc, output_dir, feat_type="tfidf"):
nlp = spacy.en.English()
# Read stop list and make TF-IDF weights --- data needed for the
# feature extraction.
with open(stops_loc) as file_:
stop_words = set(nlp.vocab.strings[word.strip()] for word in file_)
idf_weights = get_idf_weights(nlp, iter_docs(db_loc))
if feat_type == 'tfidf':
feature_extractor = tfidf_extractor(stop_words, idf_weights)
elif feat_type == 'vec':
feature_extractor = vec_extractor(stop_words, idf_weights)
for i, text in enumerate(iter_docs(db_loc)):
        tokens = nlp(text)
sentences = tokens.sentences()
summary = summarize(sentences, feature_extractor)
write_output(summary, output_dir, i)
.. _scikit-learn: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.feature_extraction.text
The LexRank Algorithm
----------------------
LexRank is described as a graph-based algorithm, derived from Google's `PageRank`_.
The nodes are sentences, and the edges are the similarities between one
sentence and another. The "graph" is fully-connected, and its edges are
undirected --- so, it's natural to represent this as a matrix:
.. code:: python
from scipy.spatial.distance import cosine
import numpy
def lexrank(sent_vectors):
n = len(sent_vectors)
    # Build the sentence-similarity matrix. Note that scipy's cosine()
    # returns a *distance*, so we convert it to a similarity.
    matrix = numpy.ndarray(shape=(n, n))
    for i in range(n):
        for j in range(n):
            matrix[i, j] = 1.0 - cosine(sent_vectors[i], sent_vectors[j])
# Normalize
for i in range(n):
matrix[i] /= sum(matrix[i])
return _pagerank(matrix)
The rows are normalized (i.e. rows sum to 1), allowing the PageRank algorithm
to be applied. Unfortunately the PageRank implementation is rather opaque ---
it's easier to just read the Wikipedia page:
.. code:: python
def _pagerank(matrix, d=0.85):
# This is admittedly opaque --- just read the Wikipedia page.
n = len(matrix)
rank = numpy.ones(shape=(n,)) / n
new_rank = numpy.zeros(shape=(n,))
while not _has_converged(rank, new_rank):
rank, new_rank = new_rank, rank
for i in range(n):
new_rank[i] = ((1.0 - d) / n) + (d * sum(rank * matrix[i]))
return rank
def _has_converged(x, y, epsilon=0.0001):
    return all(abs(x[i] - y[i]) < epsilon for i in range(len(x)))
Initial Processing
------------------
Feature Extraction
------------------
.. code:: python
def sentence_vectors(sentences, idf_weights):
    # Map each sentence to a sparse tf-idf vector, keyed by lemma ID.
    vectors = []
    for sent in sentences:
        tf_idf = {}
        for term, freq in sent.count_by(LEMMA).items():
            tf_idf[term] = freq * idf_weights[term]
        vectors.append(tf_idf)
    return vectors
The LexRank paper models each sentence as a bag-of-words vector.
This is simple and fairly standard, but often gives
underwhelming results. My idea is to instead calculate vectors from
`word embeddings`_, which have been one of the exciting outcomes of the recent
work on deep-learning. I had a quick look at the literature, and found
a `recent workshop paper`_ that suggested the idea was plausible.
Taking the feature representation and similarity function as parameters, the
LexRank function looks like this:
Given a list of N sentences, a function that maps a sentence to a feature
vector, and a function that computes a similarity measure of two feature
vectors, this produces a vector of N floats, which indicate how well each
sentence represents the document as a whole.
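Here is one way that parameterised version might look --- a sketch that reuses the `_pagerank` helper above; the argument names are illustrative:

.. code:: python

    def lexrank(sentences, feature_extractor, similarity, d=0.85):
        # Map each sentence to a feature vector, build the pairwise
        # similarity matrix, row-normalize it, and run PageRank over it.
        vectors = [feature_extractor(sent) for sent in sentences]
        n = len(vectors)
        matrix = numpy.ndarray(shape=(n, n))
        for i in range(n):
            for j in range(n):
                matrix[i, j] = similarity(vectors[i], vectors[j])
            matrix[i] /= sum(matrix[i])
        return _pagerank(matrix, d=d)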
.. _Rouge: https://en.wikipedia.org/wiki/ROUGE_%28metric%29
.. _word embeddings: https://colah.github.io/posts/2014-07-NLP-RNNs-Representations/
.. _recent workshop paper: https://www.aclweb.org/anthology/W/W14/W14-1504.pdf
Document Model
--------------

View File

@ -1,298 +0,0 @@
Updates
=======
To update your installation:
.. code:: bash
$ pip install --upgrade spacy
$ python -m spacy.en.download all
Most updates ship a new model, so you will usually have to redownload the data.
2015-07-28 v0.89
----------------
Major update!
* Support efficient binary serialization. The dependency tree,
part-of-speech tags, named entities, tokenization and text can be dumped to a
byte string smaller than the original text representation. Serialization is
lossless, so there's no need to separately store the original text.
Serialize:
.. code-block:: python
byte_string = doc.to_bytes()
Deserialize by first creating a Doc object, and then loading the bytes:
.. code-block:: python
doc = Doc(nlp.vocab)
doc.from_bytes(byte_string)
If you have a binary file with several parses saved, you can iterate over
them using the staticmethod `Doc.read_bytes`. Putting it all together:
.. code-block:: python
import codecs
from spacy.en import English
from spacy.tokens import Doc
def serialize(nlp, texts, out_loc):
with open(out_loc, 'wb') as out_file:
for text in texts:
doc = nlp(text)
out_file.write(doc.to_bytes())
def deserialize(nlp, file_loc):
docs = []
with open(file_loc, 'rb') as read_file:
        for byte_string in Doc.read_bytes(read_file):
doc = Doc(nlp.vocab).from_bytes(byte_string)
docs.append(doc)
return docs
Full tutorial coming soon.
* Fix probability estimates, and base them off counts from the 2015 Reddit Comments
dump. The probability estimates are now very reliable, and out-of-vocabulary
words now receive an accurate smoothed probability estimate.
* Fix regression in parse times on very long texts. Recent versions were
calculating parse features in a way that was polynomial in input length.
* Allow slicing into the Doc object, so that you can do e.g. doc[2:4]. Returns
a Span object.
* Add tag SP (coarse tag SPACE) for whitespace tokens. Fix bug where
whitespace was sometimes marked as an entity.
* Reduce memory usage. Memory usage now under 2GB per process.
* Rename :code:`Span.head` to :code:`Span.root`, fix its documentation, and make
it more efficient. I considered adding Span.head, Span.dep and Span.dep\_ as
well, but for now I leave these as accessible via :code:`Span.root.head`,
:code:`Span.head.dep`, and :code:`Span.head.dep\_`, to keep the API smaller.
* Add boolean features to Token and Lexeme objects.
* Main parse function now marked **nogil**. This
means I'll be able to add a Worker class that allows multi-threaded
processing. This will be available in the next version. In the meantime,
you should continue to use multiprocessing for parallelization.
2015-07-08 v0.88
----------------
Refactoring release.
If you have the data for v0.87, you don't need to redownload the data for this
release.
* You can now set tag=False, parse=False or entity=False when creating the pipeline,
to disable some of the models. See the documentation for details.
* Models no longer lazy-loaded.
* Warning emitted when parse=True or entity=True but model not loaded.
* Rename the tokens.Tokens class to tokens.Doc. An alias has been made to assist
backwards compatibility, but you should update your code to refer to the new
class name.
* Various bits of internal refactoring
2015-07-01 v0.87
----------------
* Changed weights data structure. Memory use should be reduced 30-40%.
* Fixed speed regressions introduced in the last few versions.
* Models should now be slightly more robust to noise in the input text, as I'm
now training on data with a small amount of noise added, e.g. I randomly corrupt
capitalization, swap spaces for newlines, etc. This is bringing a small
benefit on out-of-domain data. I think this strategy could yield better
results with a better noise-generation function. If you think you have a good
way to make clean text resemble the kind of noisy input you're seeing in your
domain, get in touch.
2015-06-24 v0.86
----------------
* Parser now more accurate, using novel non-monotonic transition system that's
currently under review.
2015-05-12 v0.85
----------------
* Parser produces richer dependency labels following the `ClearNLP scheme`_
* Training data now includes text from a variety of genres.
* Parser now uses more memory and the data is slightly larger, due to the additional
labels. Impact on efficiency is minimal: entire process still takes
<10ms per document.
Most users should see a substantial increase in accuracy from the new model.
Long post on accuracy evaluation and model details coming soon.
.. _ClearNLP scheme: https://github.com/clir/clearnlp-guidelines/blob/master/md/dependency/dependency_guidelines.md
2015-05-12 v0.84
----------------
* Bug fixes for parsing
* Bug fixes for named entity recognition
2015-04-13 v0.80
----------------
* Preliminary support for named-entity recognition. Its accuracy is substantially behind the state-of-the-art. I'm working on improvements.
* Better sentence boundary detection, drawn from the syntactic structure.
* Lots of bug fixes.
2015-03-05 v0.70
----------------
* Improved parse navigation API
* Bug fixes to labelled parsing
2015-01-30 spaCy v0.4: Still alpha, improving quickly
-----------------------------------------------------
Five days ago I presented the alpha release of spaCy, a natural language
processing library that brings state-of-the-art technology to small companies.
spaCy has been well received, and there are now a lot of eyes on the project.
Naturally, lots of issues have surfaced. I'm grateful to those who've reported
them. I've worked hard to address them as quickly as I could.
Bug Fixes
----------
* Lexemes.bin data file had a platform-specific encoding.
This was a silly error: instead of the string, or an index into the
list of strings, I was storing the 64-bit hash of the string. On
wide-unicode builds, a unicode string hashes differently. This meant that
all look-ups into the vocabulary failed on wide unicode builds, which
further meant that the part-of-speech tagger and parser features were not
computed correctly.
The fix is simple: we already have to read in a list of all the strings, so
just store an index into that list, instead of a hash.
* Parse tree navigation API was rough, and buggy.
The parse-tree navigation API was the last thing I added before v0.3. I've
now replaced it with something better. The previous API design was flawed,
and the implementation was buggy --- Token.child() and Token.head were
sometimes inconsistent.
I've addressed the most immediate problems, but this part of the design is
still a work in progress. It's a difficult problem. The parse is a tree,
and we want to freely navigate up and down it without creating reference
cycles that inhibit garbage collection, and without doing a lot of copying,
creating and deleting.
I think I've got a promising solution to this, but I suspect there's
currently a memory leak. Please get in touch on the tracker if you want to
know more, especially if you think you can help.
Known Issues
------------
Some systems are still experiencing memory errors, which I'm having trouble
pinning down or reproducing. Please send details of your system to the
`Issue Tracker`_ if this is happening to you.
.. _Issue Tracker: https://github.com/honnibal/spaCy/issues
Enhancements: Train and evaluate on whole paragraphs
----------------------------------------------------
.. note:: tl;dr: I shipped the wrong parsing model with 0.3. That model expected input to be segmented into sentences. 0.4 ships the correct model, which uses some algorithmic tricks to minimize the impact of tokenization and sentence segmentation errors on the parser.
Most English parsing research is performed on text with perfect pre-processing:
one newline between every sentence, one space between every token.
It's always been done this way, and it's good. It's a useful idealisation,
because the pre-processing has few algorithmic implications.
But, for practical performance, this stuff can matter a lot.
Dridan and Oepen (2013) did a simple but rare thing: they actually ran a few
parsers on raw text. Even on the standard Wall Street Journal corpus,
where pre-processing tools are quite good, the quality of pre-processing
made a big difference:
+-------------+-------+----------+
| Preprocess | BLLIP | Berkeley |
+-------------+-------+----------+
| Gold | 90.9 | 89.8 |
+-------------+-------+----------+
| Default | 86.4 | 88.4 |
+-------------+-------+----------+
| Corrected | 89.9 | 88.8 |
+-------------+-------+----------+
.. note:: spaCy is evaluated on unlabelled dependencies, where the above accuracy figures refer to phrase-structure trees. Accuracies are non-comparable.
In the standard experimental condition --- gold pre-processing --- the
BLLIP parser is better. But, it turns out it ships with lousy pre-processing
tools: when you evaluate the parsers on raw text, the BLLIP parser falls way
behind. To verify that this was due to the quality of the pre-processing
tools, and not some particular algorithmic sensitivity, Dridan and Oepen ran
both parsers with their high-quality tokenizer and sentence segmenter. This
confirmed that with equal pre-processing, the BLLIP parser is better.
The Dridan and Oepen paper really convinced me to take pre-processing seriously
in spaCy. In fact, spaCy started life as just a tokenizer --- hence the name.
The spaCy parser has a special trick up its sleeve. Because both the tagger
and parser run in linear time, it doesn't require that the input be divided
into sentences. This is nice because it avoids error-cascades: if you segment
first, then the parser just has to live with whatever decision the segmenter
made.
But, even though I designed the system with this consideration in mind,
I decided to present the initial results using the standard methodology, using
gold-standard inputs. But...then I made a mistake.
Unfortunately, with all the other things I was doing before launch, I forgot
all about this problem. spaCy launched with a parsing model that expected the
input to be segmented into sentences, but with no sentence segmenter. This
caused a drop in parse accuracy of 4%!
Over the last five days, I've worked hard to correct this. I implemented the
modifications to the parsing algorithm I had planned, from Dongdong Zhang et al.
(2013), and trained and evaluated the parser on raw text, using the version of
the WSJ distributed by Read et al. (2012), and used in Dridan and Oepen's
experiments.
I'm pleased to say that on the WSJ at least, spaCy 0.4 performs almost exactly
as well on raw text as text with gold-standard tokenization and sentence
boundary detection.
I still need to evaluate this on web text, and I need to compare against the
Stanford CoreNLP and other parsers. I suspect that most other parsers will
decline in accuracy by 1% --- we'll see.
+-------------+---------+
| Preprocess | spaCy |
+-------------+---------+
| Gold | 92.4% |
+-------------+---------+
| Default | 92.2% |
+-------------+---------+
2015-01-25
----------
spaCy v0.33 launched --- first alpha build.

View File

@ -0,0 +1,36 @@
# encoding: utf8
from __future__ import unicode_literals, print_function
import plac
import codecs
import pathlib
import random
import twython
import spacy.en
import _handler
class Connection(twython.TwythonStreamer):
def __init__(self, keys_dir, nlp, query):
keys_dir = pathlib.Path(keys_dir)
read = lambda fn: (keys_dir / (fn + '.txt')).open().read().strip()
api_key = map(read, ['key', 'secret', 'token', 'token_secret'])
twython.TwythonStreamer.__init__(self, *api_key)
self.nlp = nlp
self.query = query
def on_success(self, data):
_handler.handle_tweet(self.nlp, data, self.query)
if random.random() >= 0.1:
reload(_handler)
def main(keys_dir, term):
nlp = spacy.en.English()
twitter = Connection(keys_dir, nlp, term)
twitter.statuses.filter(track=term, language='en')
if __name__ == '__main__':
plac.call(main)

3
lang_data/de/infix.txt Normal file
View File

@ -0,0 +1,3 @@
\.\.\.
(?<=[a-z])\.(?=[A-Z])
(?<=[a-zA-Z])-(?=[a-zA-Z])

View File

0
lang_data/de/morphs.json Normal file
View File

21
lang_data/de/prefix.txt Normal file
View File

@ -0,0 +1,21 @@
,
"
(
[
{
*
<
$
£
'
``
`
#
US$
C$
A$
a-
....
...

3
lang_data/de/sample.txt Normal file
View File

@ -0,0 +1,3 @@
Biografie: Ein Spiel ist ein Theaterstück des Schweizer Schriftstellers Max Frisch, das 1967 entstand und am 1. Februar 1968 im Schauspielhaus Zürich uraufgeführt wurde. 1984 legte Frisch eine überarbeitete Neufassung vor. Das von Frisch als Komödie bezeichnete Stück greift eines seiner zentralen Themen auf: die Möglichkeit oder Unmöglichkeit des Menschen, seine Identität zu verändern.
Mit Biografie: Ein Spiel wandte sich Frisch von der Parabelform seiner Erfolgsstücke Biedermann und die Brandstifter und Andorra ab und postulierte eine „Dramaturgie der Permutation“. Darin sollte nicht, wie im klassischen Theater, Sinn und Schicksal im Mittelpunkt stehen, sondern die Zufälligkeit von Ereignissen und die Möglichkeit ihrer Variation. Dennoch handelt Biografie: Ein Spiel gerade von der Unmöglichkeit seines Protagonisten, seinen Lebenslauf grundlegend zu verändern. Frisch empfand die Wirkung des Stücks im Nachhinein als zu fatalistisch und die Umsetzung seiner theoretischen Absichten als nicht geglückt. Obwohl das Stück 1968 als unpolitisch und nicht zeitgemäß kritisiert wurde und auch später eine geteilte Rezeption erfuhr, gehört es an deutschsprachigen Bühnen zu den häufiger aufgeführten Stücken Frischs.

149
lang_data/de/specials.json Normal file
View File

@ -0,0 +1,149 @@
{
"a.m.": [{"F": "a.m."}],
"p.m.": [{"F": "p.m."}],
"1a.m.": [{"F": "1"}, {"F": "a.m."}],
"2a.m.": [{"F": "2"}, {"F": "a.m."}],
"3a.m.": [{"F": "3"}, {"F": "a.m."}],
"4a.m.": [{"F": "4"}, {"F": "a.m."}],
"5a.m.": [{"F": "5"}, {"F": "a.m."}],
"6a.m.": [{"F": "6"}, {"F": "a.m."}],
"7a.m.": [{"F": "7"}, {"F": "a.m."}],
"8a.m.": [{"F": "8"}, {"F": "a.m."}],
"9a.m.": [{"F": "9"}, {"F": "a.m."}],
"10a.m.": [{"F": "10"}, {"F": "a.m."}],
"11a.m.": [{"F": "11"}, {"F": "a.m."}],
"12a.m.": [{"F": "12"}, {"F": "a.m."}],
"1am": [{"F": "1"}, {"F": "am", "L": "a.m."}],
"2am": [{"F": "2"}, {"F": "am", "L": "a.m."}],
"3am": [{"F": "3"}, {"F": "am", "L": "a.m."}],
"4am": [{"F": "4"}, {"F": "am", "L": "a.m."}],
"5am": [{"F": "5"}, {"F": "am", "L": "a.m."}],
"6am": [{"F": "6"}, {"F": "am", "L": "a.m."}],
"7am": [{"F": "7"}, {"F": "am", "L": "a.m."}],
"8am": [{"F": "8"}, {"F": "am", "L": "a.m."}],
"9am": [{"F": "9"}, {"F": "am", "L": "a.m."}],
"10am": [{"F": "10"}, {"F": "am", "L": "a.m."}],
"11am": [{"F": "11"}, {"F": "am", "L": "a.m."}],
"12am": [{"F": "12"}, {"F": "am", "L": "a.m."}],
"1p.m.": [{"F": "1"}, {"F": "p.m."}],
"2p.m.": [{"F": "2"}, {"F": "p.m."}],
"3p.m.": [{"F": "3"}, {"F": "p.m."}],
"4p.m.": [{"F": "4"}, {"F": "p.m."}],
"5p.m.": [{"F": "5"}, {"F": "p.m."}],
"6p.m.": [{"F": "6"}, {"F": "p.m."}],
"7p.m.": [{"F": "7"}, {"F": "p.m."}],
"8p.m.": [{"F": "8"}, {"F": "p.m."}],
"9p.m.": [{"F": "9"}, {"F": "p.m."}],
"10p.m.": [{"F": "10"}, {"F": "p.m."}],
"11p.m.": [{"F": "11"}, {"F": "p.m."}],
"12p.m.": [{"F": "12"}, {"F": "p.m."}],
"1pm": [{"F": "1"}, {"F": "pm", "L": "p.m."}],
"2pm": [{"F": "2"}, {"F": "pm", "L": "p.m."}],
"3pm": [{"F": "3"}, {"F": "pm", "L": "p.m."}],
"4pm": [{"F": "4"}, {"F": "pm", "L": "p.m."}],
"5pm": [{"F": "5"}, {"F": "pm", "L": "p.m."}],
"6pm": [{"F": "6"}, {"F": "pm", "L": "p.m."}],
"7pm": [{"F": "7"}, {"F": "pm", "L": "p.m."}],
"8pm": [{"F": "8"}, {"F": "pm", "L": "p.m."}],
"9pm": [{"F": "9"}, {"F": "pm", "L": "p.m."}],
"10pm": [{"F": "10"}, {"F": "pm", "L": "p.m."}],
"11pm": [{"F": "11"}, {"F": "pm", "L": "p.m."}],
"12pm": [{"F": "12"}, {"F": "pm", "L": "p.m."}],
"Jan.": [{"F": "Jan.", "L": "Januar"}],
"Feb.": [{"F": "Feb.", "L": "Februar"}],
"Mär.": [{"F": "Mär.", "L": "März"}],
"Apr.": [{"F": "Apr.", "L": "April"}],
"Mai.": [{"F": "Mai.", "L": "Mai"}],
"Jun.": [{"F": "Jun.", "L": "Juni"}],
"Jul.": [{"F": "Jul.", "L": "Juli"}],
"Aug.": [{"F": "Aug.", "L": "August"}],
"Sep.": [{"F": "Sep.", "L": "September"}],
"Sept.": [{"F": "Sept.", "L": "September"}],
"Okt.": [{"F": "Okt.", "L": "Oktober"}],
"Nov.": [{"F": "Nov.", "L": "November"}],
"Dez.": [{"F": "Dez.", "L": "Dezember"}],
":)": [{"F": ":)"}],
"<3": [{"F": "<3"}],
";)": [{"F": ";)"}],
"(:": [{"F": "(:"}],
":(": [{"F": ":("}],
"-_-": [{"F": "-_-"}],
"=)": [{"F": "=)"}],
":/": [{"F": ":/"}],
":>": [{"F": ":>"}],
";-)": [{"F": ";-)"}],
":Y": [{"F": ":Y"}],
":P": [{"F": ":P"}],
":-P": [{"F": ":-P"}],
":3": [{"F": ":3"}],
"=3": [{"F": "=3"}],
"xD": [{"F": "xD"}],
"^_^": [{"F": "^_^"}],
"=]": [{"F": "=]"}],
"=D": [{"F": "=D"}],
"<333": [{"F": "<333"}],
":))": [{"F": ":))"}],
":0": [{"F": ":0"}],
"-__-": [{"F": "-__-"}],
"xDD": [{"F": "xDD"}],
"o_o": [{"F": "o_o"}],
"o_O": [{"F": "o_O"}],
"V_V": [{"F": "V_V"}],
"=[[": [{"F": "=[["}],
"<33": [{"F": "<33"}],
";p": [{"F": ";p"}],
";D": [{"F": ";D"}],
";-p": [{"F": ";-p"}],
";(": [{"F": ";("}],
":p": [{"F": ":p"}],
":]": [{"F": ":]"}],
":O": [{"F": ":O"}],
":-/": [{"F": ":-/"}],
":-)": [{"F": ":-)"}],
":(((": [{"F": ":((("}],
":((": [{"F": ":(("}],
":')": [{"F": ":')"}],
"(^_^)": [{"F": "(^_^)"}],
"(=": [{"F": "(="}],
"o.O": [{"F": "o.O"}],
"\")": [{"F": "\")"}],
"a.": [{"F": "a."}],
"b.": [{"F": "b."}],
"c.": [{"F": "c."}],
"d.": [{"F": "d."}],
"e.": [{"F": "e."}],
"f.": [{"F": "f."}],
"g.": [{"F": "g."}],
"h.": [{"F": "h."}],
"i.": [{"F": "i."}],
"j.": [{"F": "j."}],
"k.": [{"F": "k."}],
"l.": [{"F": "l."}],
"m.": [{"F": "m."}],
"n.": [{"F": "n."}],
"o.": [{"F": "o."}],
"p.": [{"F": "p."}],
"q.": [{"F": "q."}],
"s.": [{"F": "s."}],
"t.": [{"F": "t."}],
"u.": [{"F": "u."}],
"v.": [{"F": "v."}],
"w.": [{"F": "w."}],
"x.": [{"F": "x."}],
"y.": [{"F": "y."}],
"z.": [{"F": "z."}],
"z.b.": [{"F": "z.b."}],
"e.h.": [{"F": "I.e."}],
"o.ä.": [{"F": "I.E."}],
"bzw.": [{"F": "bzw."}],
"usw.": [{"F": "usw."}],
"\n": [{"F": "\n", "pos": "SP"}],
"\t": [{"F": "\t", "pos": "SP"}],
" ": [{"F": " ", "pos": "SP"}]
}
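
Each entry in specials.json maps an exact string to the tokens it should be split into: "F" is the surface form, "L" an optional lemma override, and "pos" an optional tag (here "SP" for whitespace). A minimal sketch of how such a table could be consulted, assuming a plain json.load and a naive whitespace pre-split; this is illustrative only, the real tokenizer applies these exceptions inside its Cython loop.

import io
import json

def tokenize_with_specials(text, specials_path='lang_data/de/specials.json'):
    # Exception table: raw string -> list of sub-token dicts ({"F": form, "L": lemma, "pos": tag}).
    with io.open(specials_path, encoding='utf8') as file_:
        specials = json.load(file_)
    tokens = []
    for chunk in text.split(' '):
        if chunk in specials:
            # Emit the pre-defined sub-tokens, e.g. "1am" -> "1", "am" (lemma "a.m.").
            tokens.extend(sub['F'] for sub in specials[chunk])
        else:
            tokens.append(chunk)
    return tokens

print(tokenize_with_specials('Treffen um Jan. 1am'))   # ['Treffen', 'um', 'Jan.', '1', 'am']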

26
lang_data/de/suffix.txt Normal file
View File

@ -0,0 +1,26 @@
,
\"
\)
\]
\}
\*
\!
\?
%
\$
>
:
;
'
''
's
'S
s
S
\.\.
\.\.\.
\.\.\.\.
(?<=[a-z0-9)\]"'%\)])\.
(?<=[0-9])km
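
prefix.txt, suffix.txt and infix.txt hold one pattern per line: prefixes and suffixes are peeled off the edges of each whitespace-delimited chunk, while infixes split inside it. A rough sketch of how the suffix rules above might be compiled and applied, under the assumption that the lines are simply joined into one alternation anchored at the end of the string; the shipped tokenizer does the equivalent work in Cython and caches its results.

import io
import re

def compile_suffix_regex(path='lang_data/de/suffix.txt'):
    # Each non-empty line becomes one alternative in a regex anchored at the string end.
    with io.open(path, encoding='utf8') as file_:
        pieces = [line.strip() for line in file_ if line.strip()]
    return re.compile('(?:%s)$' % '|'.join(pieces))

suffix_re = compile_suffix_regex()

def split_suffixes(chunk):
    # Repeatedly peel matching suffixes off the end of the chunk.
    suffixes = []
    match = suffix_re.search(chunk)
    while match:
        suffixes.insert(0, chunk[match.start():])
        chunk = chunk[:match.start()]
        match = suffix_re.search(chunk)
    return chunk, suffixes

print(split_suffixes('10km.'))   # ('10', ['km', '.']) under the rules above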

56
lang_data/de/tag_map.json Normal file
View File

@ -0,0 +1,56 @@
{
"$(": {"pos": "PUNCT", "PunctType": "Brck"},
"$,": {"pos": "PUNCT", "PunctType": "Comm"},
"$.": {"pos": "PUNCT", "PunctType": "Peri"},
"ADJA": {"pos": "ADJ"},
"ADJD": {"pos": "ADJ", "Variant": "Short"},
"ADV": {"pos": "ADV"},
"APPO": {"pos": "ADP", "AdpType": "Post"},
"APPR": {"pos": "ADP", "AdpType": "Prep"},
"APPRART": {"pos": "ADP", "AdpType": "Prep", "PronType": "Art"},
"APZR": {"pos": "ADP", "AdpType": "Circ"},
"ART": {"pos": "DET", "PronType": "Art"},
"CARD": {"pos": "NUM", "NumType": "Card"},
"FM": {"pos": "X", "Foreign": "Yes"},
"ITJ": {"pos": "INTJ"},
"KOKOM": {"pos": "CONJ", "ConjType": "Comp"},
"KON": {"pos": "CONJ"},
"KOUI": {"pos": "SCONJ"},
"KOUS": {"pos": "SCONJ"},
"NE": {"pos": "PROPN"},
"NN": {"pos": "NOUN"},
"PAV": {"pos": "ADV", "PronType": "Dem"},
"PDAT": {"pos": "DET", "PronType": "Dem"},
"PDS": {"pos": "PRON", "PronType": "Dem"},
"PIAT": {"pos": "DET", "PronType": "Ind,Neg,Tot"},
"PIDAT": {"pos": "DET", "AdjType": "Pdt", "PronType": "Ind,Neg,Tot"},
"PIS": {"pos": "PRON", "PronType": "Ind,Neg,Tot"},
"PPER": {"pos": "PRON", "PronType": "Prs"},
"PPOSAT": {"pos": "DET", "Poss": "Yes", "PronType": "Prs"},
"PPOSS": {"pos": "PRON", "Poss": "Yes", "PronType": "Prs"},
"PRELAT": {"pos": "DET", "PronType": "Rel"},
"PRELS": {"pos": "PRON", "PronType": "Rel"},
"PRF": {"pos": "PRON", "PronType": "Prs", "Reflex": "Yes"},
"PTKA": {"pos": "PART"},
"PTKANT": {"pos": "PART", "PartType": "Res"},
"PTKNEG": {"pos": "PART", "Negative": "Neg"},
"PTKVZ": {"pos": "PART", "PartType": "Vbp"},
"PTKZU": {"pos": "PART", "PartType": "Inf"},
"PWAT": {"pos": "DET", "PronType": "Int"},
"PWAV": {"pos": "ADV", "PronType": "Int"},
"PWS": {"pos": "PRON", "PronType": "Int"},
"TRUNC": {"pos": "X", "Hyph": "Yes"},
"VAFIN": {"pos": "AUX", "Mood": "Ind", "VerbForm": "Fin"},
"VAIMP": {"pos": "AUX", "Mood": "Imp", "VerbForm": "Fin"},
"VAINF": {"pos": "AUX", "VerbForm": "Inf"},
"VAPP": {"pos": "AUX", "Aspect": "Perf", "VerbForm": "Part"},
"VMFIN": {"pos": "VERB", "Mood": "Ind", "VerbForm": "Fin", "VerbType": "Mod"},
"VMINF": {"pos": "VERB", "VerbForm": "Inf", "VerbType": "Mod"},
"VMPP": {"pos": "VERB", "Aspect": "Perf", "VerbForm": "Part", "VerbType": "Mod"},
"VVFIN": {"pos": "VERB", "Mood": "Ind", "VerbForm": "Fin"},
"VVIMP": {"pos": "VERB", "Mood": "Imp", "VerbForm": "Fin"},
"VVINF": {"pos": "VERB", "VerbForm": "Inf"},
"VVIZU": {"pos": "VERB", "VerbForm": "Inf"},
"VVPP": {"pos": "VERB", "Aspect": "Perf", "VerbForm": "Part"},
"XY": {"pos": "X"}
}
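
tag_map.json maps each treebank tag (the TIGER/STTS tagset for German) to a coarse "pos" value plus morphological features; main() above reads it with json.load and hands it to the Vocab. A small lookup sketch using only the standard library; the fallback to "X" for unknown tags is an assumption made for illustration, not necessarily what the tagger does.

import io
import json

with io.open('lang_data/de/tag_map.json', encoding='utf8') as file_:
    tag_map = json.load(file_)

def coarse_pos(fine_tag):
    entry = tag_map.get(fine_tag, {'pos': 'X'})   # assumed fallback for missing tags
    features = {k: v for k, v in entry.items() if k != 'pos'}
    return entry['pos'], features

print(coarse_pos('APPRART'))   # ('ADP', {'AdpType': 'Prep', 'PronType': 'Art'})
print(coarse_pos('VVFIN'))     # ('VERB', {'Mood': 'Ind', 'VerbForm': 'Fin'})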

198
lang_data/en/gazetteer.json Normal file
View File

@ -0,0 +1,198 @@
{
"Reddit": [
"PRODUCT",
{},
[
[{"lower": "reddit"}]
]
],
"SeptemberElevenAttacks": [
"EVENT",
{},
[
[
{"orth": "9/11"}
],
[
{"lower": "septmber"},
{"lower": "eleven"}
],
[
{"lower": "september"},
{"orth": "11"}
]
]
],
"Linux": [
"PRODUCT",
{},
[
[{"lower": "linux"}]
]
],
"Haskell": [
"PRODUCT",
{},
[
[{"lower": "haskell"}]
]
],
"HaskellCurry": [
"PERSON",
{},
[
[
{"lower": "haskell"},
{"lower": "curry"}
]
]
],
"Javascript": [
"PRODUCT",
{},
[
[{"lower": "javascript"}]
]
],
"CSS": [
"PRODUCT",
{},
[
[{"lower": "css"}],
[{"lower": "css3"}]
]
],
"displaCy": [
"PRODUCT",
{},
[
[{"lower": "displacy"}]
]
],
"spaCy": [
"PRODUCT",
{},
[
[{"orth": "spaCy"}]
]
],
"HTML": [
"PRODUCT",
{},
[
[{"lower": "html"}],
[{"lower": "html5"}]
]
],
"Python": [
"PRODUCT",
{},
[
[{"orth": "Python"}]
]
],
"Ruby": [
"PRODUCT",
{},
[
[{"orth": "Ruby"}]
]
],
"Digg": [
"PRODUCT",
{},
[
[{"lower": "digg"}]
]
],
"FoxNews": [
"ORG",
{},
[
[{"orth": "Fox"}],
[{"orth": "News"}]
]
],
"Google": [
"ORG",
{},
[
[{"lower": "google"}]
]
],
"Mac": [
"PRODUCT",
{},
[
[{"lower": "mac"}]
]
],
"Wikipedia": [
"PRODUCT",
{},
[
[{"lower": "wikipedia"}]
]
],
"Windows": [
"PRODUCT",
{},
[
[{"orth": "Windows"}]
]
],
"Dell": [
"ORG",
{},
[
[{"lower": "dell"}]
]
],
"Facebook": [
"ORG",
{},
[
[{"lower": "facebook"}]
]
],
"Blizzard": [
"ORG",
{},
[
[{"orth": "Facebook"}]
]
],
"Ubuntu": [
"ORG",
{},
[
[{"orth": "Ubuntu"}]
]
],
"Youtube": [
"PRODUCT",
{},
[
[{"lower": "youtube"}]
]
],
"false_positives": [
null,
{},
[
[{"orth": "Shit"}],
[{"orth": "Weed"}],
[{"orth": "Cool"}],
[{"orth": "Btw"}],
[{"orth": "Bah"}],
[{"orth": "Bullshit"}],
[{"orth": "Lol"}],
[{"orth": "Yo"}, {"lower": "dawg"}],
[{"orth": "Yay"}],
[{"orth": "Ahh"}],
[{"orth": "Yea"}],
[{"orth": "Bah"}]
]
]
}
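
Each gazetteer entry is a three-element list: the entity label to assign (null marks false positives), a dict of extra attributes (empty here), and a list of token-pattern alternatives, where every alternative is a list of per-token constraints such as {"lower": "css"} or {"orth": "spaCy"}. main() copies this file into the model's vocab directory, presumably for the Matcher added later in this commit; its loading code is not shown here, so the following is only a toy matcher over plain strings, assuming "lower" and "orth" are the only attributes used (which is true of the file above).

import io
import json

with io.open('lang_data/en/gazetteer.json', encoding='utf8') as file_:
    gazetteer = json.load(file_)

def toy_match(words):
    # words: an already-tokenized sentence as a list of strings.
    hits = []
    for name, (label, _attrs, alternatives) in gazetteer.items():
        for pattern in alternatives:
            n = len(pattern)
            for i in range(len(words) - n + 1):
                window = words[i:i + n]
                if all(('orth' in spec and word == spec['orth']) or
                       ('lower' in spec and word.lower() == spec['lower'])
                       for word, spec in zip(window, pattern)):
                    hits.append((label, name, i, i + n))
    return hits

# Finds displaCy and Linux as PRODUCT spans.
print(toy_match('I ship displaCy demos on Linux'.split()))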

View File

@ -0,0 +1,31 @@
{
"noun": [
["s", ""],
["ses", "s"],
["ves", "f"],
["xes", "x"],
["zes", "z"],
["ches", "ch"],
["shes", "sh"],
["men", "man"],
["ies", "y"]
],
"verb": [
["s", ""],
["ies", "y"],
["es", "e"],
["es", ""],
["ed", "e"],
["ed", ""],
["ing", "e"],
["ing", ""]
],
"adj": [
["er", ""],
["est", ""],
["er", "e"],
["est", "e"]
]
}

View File

@ -16,6 +16,8 @@
''
's
'S
s
S
\.\.
\.\.\.

60
lang_data/en/tag_map.json Normal file
View File

@ -0,0 +1,60 @@
{
".": {"pos": "punct", "puncttype": "peri"},
",": {"pos": "punct", "puncttype": "comm"},
"-LRB-": {"pos": "punct", "puncttype": "brck", "punctside": "ini"},
"-RRB-": {"pos": "punct", "puncttype": "brck", "punctside": "fin"},
"``": {"pos": "punct", "puncttype": "quot", "punctside": "ini"},
"\"\"": {"pos": "punct", "puncttype": "quot", "punctside": "fin"},
"''": {"pos": "punct", "puncttype": "quot", "punctside": "fin"},
":": {"pos": "punct"},
"$": {"pos": "sym", "other": {"symtype": "currency"}},
"#": {"pos": "sym", "other": {"symtype": "numbersign"}},
"AFX": {"pos": "adj", "hyph": "hyph"},
"CC": {"pos": "conj", "conjtype": "coor"},
"CD": {"pos": "num", "numtype": "card"},
"DT": {"pos": "adj", "prontype": "prn"},
"EX": {"pos": "adv", "advtype": "ex"},
"FW": {"pos": "x", "foreign": "foreign"},
"HYPH": {"pos": "punct", "puncttype": "dash"},
"IN": {"pos": "adp"},
"JJ": {"pos": "adj", "degree": "pos"},
"JJR": {"pos": "adj", "degree": "comp"},
"JJS": {"pos": "adj", "degree": "sup"},
"LS": {"pos": "punct", "numtype": "ord"},
"MD": {"pos": "verb", "verbtype": "mod"},
"NIL": {"pos": "no_tag"},
"NN": {"pos": "noun", "number": "sing"},
"NNP": {"pos": "noun", "nountype": "prop", "number": "sing"},
"NNPS": {"pos": "noun", "nountype": "prop", "number": "plur"},
"NNS": {"pos": "noun", "number": "plur"},
"PDT": {"pos": "adj", "adjtype": "pdt", "prontype": "prn"},
"POS": {"pos": "part", "poss": "poss"},
"PRP": {"pos": "noun", "prontype": "prs"},
"PRP$": {"pos": "adj", "prontype": "prs", "poss": "poss"},
"RB": {"pos": "adv", "degree": "pos"},
"RBR": {"pos": "adv", "degree": "comp"},
"RBS": {"pos": "adv", "degree": "sup"},
"RP": {"pos": "part"},
"SYM": {"pos": "sym"},
"TO": {"pos": "part", "parttype": "inf", "verbform": "inf"},
"UH": {"pos": "intJ"},
"VB": {"pos": "verb", "verbform": "inf"},
"VBD": {"pos": "verb", "verbform": "fin", "tense": "past"},
"VBG": {"pos": "verb", "verbform": "part", "tense": "pres", "aspect": "prog"},
"VBN": {"pos": "verb", "verbform": "part", "tense": "past", "aspect": "perf"},
"VBP": {"pos": "verb", "verbform": "fin", "tense": "pres"},
"VBZ": {"pos": "verb", "verbform": "fin", "tense": "pres", "number": "sing", "person": 3},
"WDT": {"pos": "adj", "prontype": "int|rel"},
"WP": {"pos": "noun", "prontype": "int|rel"},
"WP$": {"pos": "adj", "poss": "poss", "prontype": "int|rel"},
"WRB": {"pos": "adv", "prontype": "int|rel"},
"SP": {"pos": "space"},
"ADD": {"pos": "x"},
"NFP": {"pos": "punct"},
"GW": {"pos": "x"},
"AFX": {"pos": "x"},
"HYPH": {"pos": "punct"},
"XX": {"pos": "x"},
"BES": {"pos": "verb"},
"HVS": {"pos": "verb"}
}

3
lang_data/fi/infix.txt Normal file
View File

@ -0,0 +1,3 @@
\.\.\.
(?<=[a-z])\.(?=[A-Z])
(?<=[a-zA-Z])-(?=[a-zA-z])

View File

@ -0,0 +1 @@
{}

0
lang_data/fi/morphs.json Normal file
View File

21
lang_data/fi/prefix.txt Normal file
View File

@ -0,0 +1,21 @@
,
"
(
[
{
*
<
$
£
'
``
`
#
US$
C$
A$
a-
....
...

3
lang_data/fi/sample.txt Normal file
View File

@ -0,0 +1,3 @@
Biografie: Ein Spiel ist ein Theaterstück des Schweizer Schriftstellers Max Frisch, das 1967 entstand und am 1. Februar 1968 im Schauspielhaus Zürich uraufgeführt wurde. 1984 legte Frisch eine überarbeitete Neufassung vor. Das von Frisch als Komödie bezeichnete Stück greift eines seiner zentralen Themen auf: die Möglichkeit oder Unmöglichkeit des Menschen, seine Identität zu verändern.
Mit Biografie: Ein Spiel wandte sich Frisch von der Parabelform seiner Erfolgsstücke Biedermann und die Brandstifter und Andorra ab und postulierte eine „Dramaturgie der Permutation“. Darin sollte nicht, wie im klassischen Theater, Sinn und Schicksal im Mittelpunkt stehen, sondern die Zufälligkeit von Ereignissen und die Möglichkeit ihrer Variation. Dennoch handelt Biografie: Ein Spiel gerade von der Unmöglichkeit seines Protagonisten, seinen Lebenslauf grundlegend zu verändern. Frisch empfand die Wirkung des Stücks im Nachhinein als zu fatalistisch und die Umsetzung seiner theoretischen Absichten als nicht geglückt. Obwohl das Stück 1968 als unpolitisch und nicht zeitgemäß kritisiert wurde und auch später eine geteilte Rezeption erfuhr, gehört es an deutschsprachigen Bühnen zu den häufiger aufgeführten Stücken Frischs.

149
lang_data/fi/specials.json Normal file
View File

@ -0,0 +1,149 @@
{
"a.m.": [{"F": "a.m."}],
"p.m.": [{"F": "p.m."}],
"1a.m.": [{"F": "1"}, {"F": "a.m."}],
"2a.m.": [{"F": "2"}, {"F": "a.m."}],
"3a.m.": [{"F": "3"}, {"F": "a.m."}],
"4a.m.": [{"F": "4"}, {"F": "a.m."}],
"5a.m.": [{"F": "5"}, {"F": "a.m."}],
"6a.m.": [{"F": "6"}, {"F": "a.m."}],
"7a.m.": [{"F": "7"}, {"F": "a.m."}],
"8a.m.": [{"F": "8"}, {"F": "a.m."}],
"9a.m.": [{"F": "9"}, {"F": "a.m."}],
"10a.m.": [{"F": "10"}, {"F": "a.m."}],
"11a.m.": [{"F": "11"}, {"F": "a.m."}],
"12a.m.": [{"F": "12"}, {"F": "a.m."}],
"1am": [{"F": "1"}, {"F": "am", "L": "a.m."}],
"2am": [{"F": "2"}, {"F": "am", "L": "a.m."}],
"3am": [{"F": "3"}, {"F": "am", "L": "a.m."}],
"4am": [{"F": "4"}, {"F": "am", "L": "a.m."}],
"5am": [{"F": "5"}, {"F": "am", "L": "a.m."}],
"6am": [{"F": "6"}, {"F": "am", "L": "a.m."}],
"7am": [{"F": "7"}, {"F": "am", "L": "a.m."}],
"8am": [{"F": "8"}, {"F": "am", "L": "a.m."}],
"9am": [{"F": "9"}, {"F": "am", "L": "a.m."}],
"10am": [{"F": "10"}, {"F": "am", "L": "a.m."}],
"11am": [{"F": "11"}, {"F": "am", "L": "a.m."}],
"12am": [{"F": "12"}, {"F": "am", "L": "a.m."}],
"1p.m.": [{"F": "1"}, {"F": "p.m."}],
"2p.m.": [{"F": "2"}, {"F": "p.m."}],
"3p.m.": [{"F": "3"}, {"F": "p.m."}],
"4p.m.": [{"F": "4"}, {"F": "p.m."}],
"5p.m.": [{"F": "5"}, {"F": "p.m."}],
"6p.m.": [{"F": "6"}, {"F": "p.m."}],
"7p.m.": [{"F": "7"}, {"F": "p.m."}],
"8p.m.": [{"F": "8"}, {"F": "p.m."}],
"9p.m.": [{"F": "9"}, {"F": "p.m."}],
"10p.m.": [{"F": "10"}, {"F": "p.m."}],
"11p.m.": [{"F": "11"}, {"F": "p.m."}],
"12p.m.": [{"F": "12"}, {"F": "p.m."}],
"1pm": [{"F": "1"}, {"F": "pm", "L": "p.m."}],
"2pm": [{"F": "2"}, {"F": "pm", "L": "p.m."}],
"3pm": [{"F": "3"}, {"F": "pm", "L": "p.m."}],
"4pm": [{"F": "4"}, {"F": "pm", "L": "p.m."}],
"5pm": [{"F": "5"}, {"F": "pm", "L": "p.m."}],
"6pm": [{"F": "6"}, {"F": "pm", "L": "p.m."}],
"7pm": [{"F": "7"}, {"F": "pm", "L": "p.m."}],
"8pm": [{"F": "8"}, {"F": "pm", "L": "p.m."}],
"9pm": [{"F": "9"}, {"F": "pm", "L": "p.m."}],
"10pm": [{"F": "10"}, {"F": "pm", "L": "p.m."}],
"11pm": [{"F": "11"}, {"F": "pm", "L": "p.m."}],
"12pm": [{"F": "12"}, {"F": "pm", "L": "p.m."}],
"Jan.": [{"F": "Jan.", "L": "Januar"}],
"Feb.": [{"F": "Feb.", "L": "Februar"}],
"Mär.": [{"F": "Mär.", "L": "März"}],
"Apr.": [{"F": "Apr.", "L": "April"}],
"Mai.": [{"F": "Mai.", "L": "Mai"}],
"Jun.": [{"F": "Jun.", "L": "Juni"}],
"Jul.": [{"F": "Jul.", "L": "Juli"}],
"Aug.": [{"F": "Aug.", "L": "August"}],
"Sep.": [{"F": "Sep.", "L": "September"}],
"Sept.": [{"F": "Sept.", "L": "September"}],
"Okt.": [{"F": "Okt.", "L": "Oktober"}],
"Nov.": [{"F": "Nov.", "L": "November"}],
"Dez.": [{"F": "Dez.", "L": "Dezember"}],
":)": [{"F": ":)"}],
"<3": [{"F": "<3"}],
";)": [{"F": ";)"}],
"(:": [{"F": "(:"}],
":(": [{"F": ":("}],
"-_-": [{"F": "-_-"}],
"=)": [{"F": "=)"}],
":/": [{"F": ":/"}],
":>": [{"F": ":>"}],
";-)": [{"F": ";-)"}],
":Y": [{"F": ":Y"}],
":P": [{"F": ":P"}],
":-P": [{"F": ":-P"}],
":3": [{"F": ":3"}],
"=3": [{"F": "=3"}],
"xD": [{"F": "xD"}],
"^_^": [{"F": "^_^"}],
"=]": [{"F": "=]"}],
"=D": [{"F": "=D"}],
"<333": [{"F": "<333"}],
":))": [{"F": ":))"}],
":0": [{"F": ":0"}],
"-__-": [{"F": "-__-"}],
"xDD": [{"F": "xDD"}],
"o_o": [{"F": "o_o"}],
"o_O": [{"F": "o_O"}],
"V_V": [{"F": "V_V"}],
"=[[": [{"F": "=[["}],
"<33": [{"F": "<33"}],
";p": [{"F": ";p"}],
";D": [{"F": ";D"}],
";-p": [{"F": ";-p"}],
";(": [{"F": ";("}],
":p": [{"F": ":p"}],
":]": [{"F": ":]"}],
":O": [{"F": ":O"}],
":-/": [{"F": ":-/"}],
":-)": [{"F": ":-)"}],
":(((": [{"F": ":((("}],
":((": [{"F": ":(("}],
":')": [{"F": ":')"}],
"(^_^)": [{"F": "(^_^)"}],
"(=": [{"F": "(="}],
"o.O": [{"F": "o.O"}],
"\")": [{"F": "\")"}],
"a.": [{"F": "a."}],
"b.": [{"F": "b."}],
"c.": [{"F": "c."}],
"d.": [{"F": "d."}],
"e.": [{"F": "e."}],
"f.": [{"F": "f."}],
"g.": [{"F": "g."}],
"h.": [{"F": "h."}],
"i.": [{"F": "i."}],
"j.": [{"F": "j."}],
"k.": [{"F": "k."}],
"l.": [{"F": "l."}],
"m.": [{"F": "m."}],
"n.": [{"F": "n."}],
"o.": [{"F": "o."}],
"p.": [{"F": "p."}],
"q.": [{"F": "q."}],
"s.": [{"F": "s."}],
"t.": [{"F": "t."}],
"u.": [{"F": "u."}],
"v.": [{"F": "v."}],
"w.": [{"F": "w."}],
"x.": [{"F": "x."}],
"y.": [{"F": "y."}],
"z.": [{"F": "z."}],
"z.b.": [{"F": "z.b."}],
"e.h.": [{"F": "I.e."}],
"o.ä.": [{"F": "I.E."}],
"bzw.": [{"F": "bzw."}],
"usw.": [{"F": "usw."}],
"\n": [{"F": "\n", "pos": "SP"}],
"\t": [{"F": "\t", "pos": "SP"}],
" ": [{"F": " ", "pos": "SP"}]
}

26
lang_data/fi/suffix.txt Normal file
View File

@ -0,0 +1,26 @@
,
\"
\)
\]
\}
\*
\!
\?
%
\$
>
:
;
'
''
's
'S
s
S
\.\.
\.\.\.
\.\.\.\.
(?<=[a-z0-9)\]"'%\)])\.
(?<=[0-9])km

17
lang_data/fi/tag_map.json Normal file
View File

@ -0,0 +1,17 @@
{
"NOUN": {"pos": "NOUN"},
"VERB": {"pos": "VERB"},
"PUNCT": {"pos": "PUNCT"},
"ADV": {"pos": "ADV"},
"ADJ": {"pos": "ADJ"},
"PRON": {"pos": "PRON"},
"PROPN": {"pos": "PROPN"},
"CONJ": {"pos": "CONJ"},
"NUM": {"pos": "NUM"},
"AUX": {"pos": "AUX"},
"SCONJ": {"pos": "SCONJ"},
"ADP": {"pos": "ADP"},
"SYM": {"pos": "SYM"},
"X": {"pos": "X"},
"INTJ": {"pos": "INTJ"}
}

3
lang_data/it/infix.txt Normal file
View File

@ -0,0 +1,3 @@
\.\.\.
(?<=[a-z])\.(?=[A-Z])
(?<=[a-zA-Z])-(?=[a-zA-z])

0
lang_data/it/morphs.json Normal file
View File

21
lang_data/it/prefix.txt Normal file
View File

@ -0,0 +1,21 @@
,
"
(
[
{
*
<
$
£
'
``
`
#
US$
C$
A$
a-
....
...

149
lang_data/it/specials.json Normal file
View File

@ -0,0 +1,149 @@
{
"a.m.": [{"F": "a.m."}],
"p.m.": [{"F": "p.m."}],
"1a.m.": [{"F": "1"}, {"F": "a.m."}],
"2a.m.": [{"F": "2"}, {"F": "a.m."}],
"3a.m.": [{"F": "3"}, {"F": "a.m."}],
"4a.m.": [{"F": "4"}, {"F": "a.m."}],
"5a.m.": [{"F": "5"}, {"F": "a.m."}],
"6a.m.": [{"F": "6"}, {"F": "a.m."}],
"7a.m.": [{"F": "7"}, {"F": "a.m."}],
"8a.m.": [{"F": "8"}, {"F": "a.m."}],
"9a.m.": [{"F": "9"}, {"F": "a.m."}],
"10a.m.": [{"F": "10"}, {"F": "a.m."}],
"11a.m.": [{"F": "11"}, {"F": "a.m."}],
"12a.m.": [{"F": "12"}, {"F": "a.m."}],
"1am": [{"F": "1"}, {"F": "am", "L": "a.m."}],
"2am": [{"F": "2"}, {"F": "am", "L": "a.m."}],
"3am": [{"F": "3"}, {"F": "am", "L": "a.m."}],
"4am": [{"F": "4"}, {"F": "am", "L": "a.m."}],
"5am": [{"F": "5"}, {"F": "am", "L": "a.m."}],
"6am": [{"F": "6"}, {"F": "am", "L": "a.m."}],
"7am": [{"F": "7"}, {"F": "am", "L": "a.m."}],
"8am": [{"F": "8"}, {"F": "am", "L": "a.m."}],
"9am": [{"F": "9"}, {"F": "am", "L": "a.m."}],
"10am": [{"F": "10"}, {"F": "am", "L": "a.m."}],
"11am": [{"F": "11"}, {"F": "am", "L": "a.m."}],
"12am": [{"F": "12"}, {"F": "am", "L": "a.m."}],
"1p.m.": [{"F": "1"}, {"F": "p.m."}],
"2p.m.": [{"F": "2"}, {"F": "p.m."}],
"3p.m.": [{"F": "3"}, {"F": "p.m."}],
"4p.m.": [{"F": "4"}, {"F": "p.m."}],
"5p.m.": [{"F": "5"}, {"F": "p.m."}],
"6p.m.": [{"F": "6"}, {"F": "p.m."}],
"7p.m.": [{"F": "7"}, {"F": "p.m."}],
"8p.m.": [{"F": "8"}, {"F": "p.m."}],
"9p.m.": [{"F": "9"}, {"F": "p.m."}],
"10p.m.": [{"F": "10"}, {"F": "p.m."}],
"11p.m.": [{"F": "11"}, {"F": "p.m."}],
"12p.m.": [{"F": "12"}, {"F": "p.m."}],
"1pm": [{"F": "1"}, {"F": "pm", "L": "p.m."}],
"2pm": [{"F": "2"}, {"F": "pm", "L": "p.m."}],
"3pm": [{"F": "3"}, {"F": "pm", "L": "p.m."}],
"4pm": [{"F": "4"}, {"F": "pm", "L": "p.m."}],
"5pm": [{"F": "5"}, {"F": "pm", "L": "p.m."}],
"6pm": [{"F": "6"}, {"F": "pm", "L": "p.m."}],
"7pm": [{"F": "7"}, {"F": "pm", "L": "p.m."}],
"8pm": [{"F": "8"}, {"F": "pm", "L": "p.m."}],
"9pm": [{"F": "9"}, {"F": "pm", "L": "p.m."}],
"10pm": [{"F": "10"}, {"F": "pm", "L": "p.m."}],
"11pm": [{"F": "11"}, {"F": "pm", "L": "p.m."}],
"12pm": [{"F": "12"}, {"F": "pm", "L": "p.m."}],
"Jan.": [{"F": "Jan.", "L": "Januar"}],
"Feb.": [{"F": "Feb.", "L": "Februar"}],
"Mär.": [{"F": "Mär.", "L": "März"}],
"Apr.": [{"F": "Apr.", "L": "April"}],
"Mai.": [{"F": "Mai.", "L": "Mai"}],
"Jun.": [{"F": "Jun.", "L": "Juni"}],
"Jul.": [{"F": "Jul.", "L": "Juli"}],
"Aug.": [{"F": "Aug.", "L": "August"}],
"Sep.": [{"F": "Sep.", "L": "September"}],
"Sept.": [{"F": "Sept.", "L": "September"}],
"Okt.": [{"F": "Okt.", "L": "Oktober"}],
"Nov.": [{"F": "Nov.", "L": "November"}],
"Dez.": [{"F": "Dez.", "L": "Dezember"}],
":)": [{"F": ":)"}],
"<3": [{"F": "<3"}],
";)": [{"F": ";)"}],
"(:": [{"F": "(:"}],
":(": [{"F": ":("}],
"-_-": [{"F": "-_-"}],
"=)": [{"F": "=)"}],
":/": [{"F": ":/"}],
":>": [{"F": ":>"}],
";-)": [{"F": ";-)"}],
":Y": [{"F": ":Y"}],
":P": [{"F": ":P"}],
":-P": [{"F": ":-P"}],
":3": [{"F": ":3"}],
"=3": [{"F": "=3"}],
"xD": [{"F": "xD"}],
"^_^": [{"F": "^_^"}],
"=]": [{"F": "=]"}],
"=D": [{"F": "=D"}],
"<333": [{"F": "<333"}],
":))": [{"F": ":))"}],
":0": [{"F": ":0"}],
"-__-": [{"F": "-__-"}],
"xDD": [{"F": "xDD"}],
"o_o": [{"F": "o_o"}],
"o_O": [{"F": "o_O"}],
"V_V": [{"F": "V_V"}],
"=[[": [{"F": "=[["}],
"<33": [{"F": "<33"}],
";p": [{"F": ";p"}],
";D": [{"F": ";D"}],
";-p": [{"F": ";-p"}],
";(": [{"F": ";("}],
":p": [{"F": ":p"}],
":]": [{"F": ":]"}],
":O": [{"F": ":O"}],
":-/": [{"F": ":-/"}],
":-)": [{"F": ":-)"}],
":(((": [{"F": ":((("}],
":((": [{"F": ":(("}],
":')": [{"F": ":')"}],
"(^_^)": [{"F": "(^_^)"}],
"(=": [{"F": "(="}],
"o.O": [{"F": "o.O"}],
"\")": [{"F": "\")"}],
"a.": [{"F": "a."}],
"b.": [{"F": "b."}],
"c.": [{"F": "c."}],
"d.": [{"F": "d."}],
"e.": [{"F": "e."}],
"f.": [{"F": "f."}],
"g.": [{"F": "g."}],
"h.": [{"F": "h."}],
"i.": [{"F": "i."}],
"j.": [{"F": "j."}],
"k.": [{"F": "k."}],
"l.": [{"F": "l."}],
"m.": [{"F": "m."}],
"n.": [{"F": "n."}],
"o.": [{"F": "o."}],
"p.": [{"F": "p."}],
"q.": [{"F": "q."}],
"s.": [{"F": "s."}],
"t.": [{"F": "t."}],
"u.": [{"F": "u."}],
"v.": [{"F": "v."}],
"w.": [{"F": "w."}],
"x.": [{"F": "x."}],
"y.": [{"F": "y."}],
"z.": [{"F": "z."}],
"z.b.": [{"F": "z.b."}],
"e.h.": [{"F": "I.e."}],
"o.ä.": [{"F": "I.E."}],
"bzw.": [{"F": "bzw."}],
"usw.": [{"F": "usw."}],
"\n": [{"F": "\n", "pos": "SP"}],
"\t": [{"F": "\t", "pos": "SP"}],
" ": [{"F": " ", "pos": "SP"}]
}

26
lang_data/it/suffix.txt Normal file
View File

@ -0,0 +1,26 @@
,
\"
\)
\]
\}
\*
\!
\?
%
\$
>
:
;
'
''
's
'S
s
S
\.\.
\.\.\.
\.\.\.\.
(?<=[a-z0-9)\]"'%\)])\.
(?<=[0-9])km

44
lang_data/it/tag_map.json Normal file
View File

@ -0,0 +1,44 @@
{
"S": {"pos": "NOUN"},
"E": {"pos": "ADP"},
"RD": {"pos": "DET"},
"V": {"pos": "VER"},
"_": {"pos": "_"},
"A": {"pos": "ADJ"},
"SP": {"pos": "PROP"},
"FF": {"pos": "PUNC"},
"FS": {"pos": "PUNC"},
"B": {"pos": "ADV"},
"CC": {"pos": "CON"},
"FB": {"pos": "PUNC"},
"VA": {"pos": "AUX"},
"PC": {"pos": "PRO"},
"N": {"pos": "NUM"},
"RI": {"pos": "DET"},
"PR": {"pos": "PRO"},
"CS": {"pos": "SCON"},
"BN": {"pos": "ADV"},
"AP": {"pos": "DET"},
"VM": {"pos": "AUX"},
"DI": {"pos": "DET"},
"FC": {"pos": "PUNC"},
"PI": {"pos": "PRO"},
"DD": {"pos": "DET"},
"DQ": {"pos": "DET"},
"PQ": {"pos": "PRO"},
"PD": {"pos": "PRO"},
"NO": {"pos": "ADJ"},
"PE": {"pos": "PRO"},
"T": {"pos": "DET"},
"X": {"pos": "SYM"},
"SW": {"pos": "X"},
"NO": {"pos": "PRO"},
"I": {"pos": "INT"},
"X": {"pos": "X"},
"DR": {"pos": "DET"},
"EA": {"pos": "ADP"},
"PP": {"pos": "PRO"},
"X": {"pos": "NUM"},
"DE": {"pos": "DET"},
"X": {"pos": "PAR"}
}

View File

@ -153,7 +153,7 @@ def main(modules, is_pypy):
MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings',
'spacy.lexeme', 'spacy.vocab', 'spacy.attrs',
'spacy.morphology',
'spacy.morphology', 'spacy.tagger',
'spacy.syntax.stateclass',
'spacy._ml', 'spacy._theano',
'spacy.tokenizer', 'spacy.en.attrs',
@ -164,7 +164,7 @@ MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings',
'spacy.gold', 'spacy.orth',
'spacy.tokens.doc', 'spacy.tokens.spans', 'spacy.tokens.token',
'spacy.serialize.packer', 'spacy.serialize.huffman', 'spacy.serialize.bits',
'spacy.cfile',
'spacy.cfile', 'spacy.matcher',
'spacy.syntax.ner']

View File

@ -91,6 +91,8 @@ cdef class Model:
count_feats(counts[guess], feats, n_feats, -cost)
self._model.update(counts)
def end_training(self):
def end_training(self, model_loc=None):
if model_loc is None:
model_loc = self.model_loc
self._model.end_training()
self._model.dump(self.model_loc, freq_thresh=0)
self._model.dump(model_loc, freq_thresh=0)

View File

@ -14,7 +14,7 @@ cpdef enum attr_id_t:
IS_STOP
IS_OOV
FLAG13
FLAG13 = 13
FLAG14
FLAG15
FLAG16
@ -84,3 +84,4 @@ cpdef enum attr_id_t:
ENT_TYPE
HEAD
SPACY
PROB

View File

@ -1,174 +1,12 @@
from __future__ import unicode_literals
from __future__ import unicode_literals, print_function
from os import path
import re
import struct
import json
from .. import orth
from ..vocab import Vocab
from ..tokenizer import Tokenizer
from ..syntax.arc_eager import ArcEager
from ..syntax.ner import BiluoPushDown
from ..syntax.parser import ParserFactory
from ..serialize.bits import BitArray
from ..language import Language
from ..tokens import Doc
from ..multi_words import RegexMerger
from .pos import EnPosTagger
from .pos import POS_TAGS
from .attrs import get_flags
from . import regexes
from ..util import read_lang_data
from ..attrs import TAG, HEAD, DEP, ENT_TYPE, ENT_IOB
def get_lex_props(string, oov_prob=-30, is_oov=False):
return {
'flags': get_flags(string, is_oov=is_oov),
'length': len(string),
'orth': string,
'lower': string.lower(),
'norm': string,
'shape': orth.word_shape(string),
'prefix': string[0],
'suffix': string[-3:],
'cluster': 0,
'prob': oov_prob,
'sentiment': 0
}
if_model_present = -1
LOCAL_DATA_DIR = path.join(path.dirname(__file__), 'data')
class English(object):
"""The English NLP pipeline.
Example:
Load data from default directory:
>>> nlp = English()
>>> nlp = English(data_dir=u'')
Load data from specified directory:
>>> nlp = English(data_dir=u'path/to/data_directory')
Disable (and avoid loading) parts of the processing pipeline:
>>> nlp = English(vectors=False, parser=False, tagger=False, entity=False)
Start with nothing loaded:
>>> nlp = English(data_dir=None)
"""
ParserTransitionSystem = ArcEager
EntityTransitionSystem = BiluoPushDown
def __init__(self,
data_dir=LOCAL_DATA_DIR,
Tokenizer=Tokenizer.from_dir,
Tagger=EnPosTagger,
Parser=ParserFactory(ParserTransitionSystem),
Entity=ParserFactory(EntityTransitionSystem),
Packer=None,
load_vectors=True
):
self.data_dir = data_dir
if path.exists(path.join(data_dir, 'vocab', 'oov_prob')):
oov_prob = float(open(path.join(data_dir, 'vocab', 'oov_prob')).read())
else:
oov_prob = None
self.vocab = Vocab(data_dir=path.join(data_dir, 'vocab') if data_dir else None,
get_lex_props=get_lex_props, load_vectors=load_vectors,
pos_tags=POS_TAGS,
oov_prob=oov_prob)
if Tagger is True:
Tagger = EnPosTagger
if Parser is True:
transition_system = self.ParserTransitionSystem
Parser = lambda s, d: parser.Parser(s, d, transition_system)
if Entity is True:
transition_system = self.EntityTransitionSystem
Entity = lambda s, d: parser.Parser(s, d, transition_system)
self.tokenizer = Tokenizer(self.vocab, path.join(data_dir, 'tokenizer'))
if Tagger and path.exists(path.join(data_dir, 'pos')):
self.tagger = Tagger(self.vocab.strings, data_dir)
else:
self.tagger = None
if Parser and path.exists(path.join(data_dir, 'deps')):
self.parser = Parser(self.vocab.strings, path.join(data_dir, 'deps'))
else:
self.parser = None
if Entity and path.exists(path.join(data_dir, 'ner')):
self.entity = Entity(self.vocab.strings, path.join(data_dir, 'ner'))
else:
self.entity = None
if Packer:
self.packer = Packer(self.vocab, data_dir)
else:
self.packer = None
self.mwe_merger = RegexMerger([
('IN', 'O', regexes.MW_PREPOSITIONS_RE),
('CD', 'TIME', regexes.TIME_RE),
('NNP', 'DATE', regexes.DAYS_RE),
('CD', 'MONEY', regexes.MONEY_RE)])
def __call__(self, text, tag=True, parse=True, entity=True, merge_mwes=False):
"""Apply the pipeline to some text. The text can span multiple sentences,
and can contain arbitrary whitespace. Alignment into the original string
is preserved.
Args:
text (unicode): The text to be processed.
Returns:
tokens (spacy.tokens.Doc):
>>> from spacy.en import English
>>> nlp = English()
>>> tokens = nlp('An example sentence. Another example sentence.')
>>> tokens[0].orth_, tokens[0].head.tag_
('An', 'NN')
"""
tokens = self.tokenizer(text)
if self.tagger and tag:
self.tagger(tokens)
if self.parser and parse:
self.parser(tokens)
if self.entity and entity:
self.entity(tokens)
if merge_mwes and self.mwe_merger is not None:
self.mwe_merger(tokens)
return tokens
def end_training(self, data_dir=None):
if data_dir is None:
data_dir = self.data_dir
self.parser.model.end_training()
self.entity.model.end_training()
self.tagger.model.end_training()
self.vocab.strings.dump(path.join(data_dir, 'vocab', 'strings.txt'))
with open(path.join(data_dir, 'vocab', 'serializer.json'), 'w') as file_:
file_.write(
json.dumps([
(TAG, list(self.tagger.freqs[TAG].items())),
(DEP, list(self.parser.moves.freqs[DEP].items())),
(ENT_IOB, list(self.entity.moves.freqs[ENT_IOB].items())),
(ENT_TYPE, list(self.entity.moves.freqs[ENT_TYPE].items())),
(HEAD, list(self.parser.moves.freqs[HEAD].items()))]))
@property
def tags(self):
"""Deprecated. List of part-of-speech tag names."""
return self.tagger.tag_names
class English(Language):
@classmethod
def default_data_dir(cls):
return LOCAL_DATA_DIR

View File

@ -1,105 +0,0 @@
from __future__ import unicode_literals
from os import path
import codecs
NOUN_RULES = (
('s', ''),
('ses', 's'),
('ves', 'f'),
('xes', 'x'),
('zes', 'z'),
('ches', 'ch'),
('shes', 'sh'),
('men', 'man'),
('ies', 'y')
)
VERB_RULES = (
("s", ""),
("ies", "y"),
("es", "e"),
("es", ""),
("ed", "e"),
("ed", ""),
("ing", "e"),
("ing", "")
)
ADJ_RULES = (
("er", ""),
("est", ""),
("er", "e"),
("est", "e")
)
class Lemmatizer(object):
def __init__(self, wn_dict_dir, noun_id, verb_id, adj_id):
self.noun_id = noun_id
self.verb_id = verb_id
self.adj_id = adj_id
self.index = {}
self.exc = {}
for pos in ['adj', 'adv', 'noun', 'verb']:
self.index[pos] = read_index(path.join(wn_dict_dir, 'index.%s' % pos))
self.exc[pos] = read_exc(path.join(wn_dict_dir, '%s.exc' % pos))
def __call__(self, string, pos):
if pos == self.noun_id:
return self.noun(string)
elif pos == self.verb_id:
return self.verb(string)
elif pos == self.adj_id:
return self.adj(string)
else:
raise Exception("Cannot lemmatize with unknown pos: %s" % pos)
def noun(self, string):
return lemmatize(string, self.index['noun'], self.exc['noun'], NOUN_RULES)
def verb(self, string):
return lemmatize(string, self.index['verb'], self.exc['verb'], VERB_RULES)
def adj(self, string):
return lemmatize(string, self.index['adj'], self.exc['adj'], ADJ_RULES)
def lemmatize(string, index, exceptions, rules):
string = string.lower()
forms = []
if string in index:
forms.append(string)
forms.extend(exceptions.get(string, []))
for old, new in rules:
if string.endswith(old):
form = string[:len(string) - len(old)] + new
if form in index:
forms.append(form)
if not forms:
forms.append(string)
return set(forms)
def read_index(loc):
index = set()
for line in codecs.open(loc, 'r', 'utf8'):
if line.startswith(' '):
continue
pieces = line.split()
word = pieces[0]
if word.count('_') == 0:
index.add(word)
return index
def read_exc(loc):
exceptions = {}
for line in codecs.open(loc, 'r', 'utf8'):
if line.startswith(' '):
continue
pieces = line.split()
exceptions[pieces[0]] = tuple(pieces[1:])
return exceptions

View File

@ -1,26 +1,5 @@
from preshed.maps cimport PreshMapArray
from preshed.counter cimport PreshCounter
from cymem.cymem cimport Pool
from .._ml cimport Model
from ..strings cimport StringStore
from ..structs cimport TokenC, LexemeC, Morphology, PosTag
from ..parts_of_speech cimport univ_pos_t
from .lemmatizer import Lemmatizer
from ..tagger cimport Tagger
cdef class EnPosTagger:
cdef readonly Pool mem
cdef readonly StringStore strings
cdef readonly Model model
cdef public object lemmatizer
cdef PreshMapArray _morph_cache
cdef public dict freqs
cdef PosTag* tags
cdef readonly object tag_names
cdef readonly object tag_map
cdef readonly int n_tags
cdef int set_morph(self, const int i, const PosTag* tag, TokenC* tokens) except -1
cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1
cdef class EnPosTagger(Tagger):
pass

View File

@ -1,389 +1,11 @@
from os import path
import json
import os
import shutil
from libc.string cimport memset
from ..parts_of_speech cimport NOUN, VERB, ADJ
from cymem.cymem cimport Address
from thinc.typedefs cimport atom_t, weight_t
from collections import defaultdict
from ..parts_of_speech cimport univ_pos_t
from ..parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON
from ..parts_of_speech cimport PRT, VERB, X, PUNCT, EOL, SPACE
from ..structs cimport TokenC, Morphology, LexemeC
from ..tokens.doc cimport Doc
from ..morphology cimport set_morph_from_dict
from .._ml cimport arg_max
from .attrs cimport TAG, IS_ALPHA, IS_PUNCT, LIKE_NUM, LIKE_URL
from ..typedefs cimport attr_t
from .lemmatizer import Lemmatizer
from ..lemmatizer import Lemmatizer
cpdef enum en_person_t:
NO_PERSON
FIRST
SECOND
THIRD
NON_THIRD
cpdef enum en_number_t:
NO_NUMBER
SINGULAR
PLURAL
MASS
cpdef enum en_gender_t:
NO_GENDER
MASCULINE
FEMININE
NEUTER
cpdef enum en_case_t:
NO_CASE
NOMINATIVE
GENITIVE
ACCUSATIVE
REFLEXIVE
DEMONYM
cpdef enum en_tenspect_t:
NO_TENSE
BASE_VERB
PRESENT
PAST
PASSIVE
ING
MODAL
cpdef enum misc_t:
NO_MISC
COMPARATIVE
SUPERLATIVE
RELATIVE
NAME
cpdef enum:
P2_orth
P2_cluster
P2_shape
P2_prefix
P2_suffix
P2_pos
P2_lemma
P2_flags
P1_orth
P1_cluster
P1_shape
P1_prefix
P1_suffix
P1_pos
P1_lemma
P1_flags
W_orth
W_cluster
W_shape
W_prefix
W_suffix
W_pos
W_lemma
W_flags
N1_orth
N1_cluster
N1_shape
N1_prefix
N1_suffix
N1_pos
N1_lemma
N1_flags
N2_orth
N2_cluster
N2_shape
N2_prefix
N2_suffix
N2_pos
N2_lemma
N2_flags
N_CONTEXT_FIELDS
POS_TAGS = {
'NULL': (NO_TAG, {}),
'EOL': (EOL, {}),
'CC': (CONJ, {}),
'CD': (NUM, {}),
'DT': (DET, {}),
'EX': (DET, {}),
'FW': (X, {}),
'IN': (ADP, {}),
'JJ': (ADJ, {}),
'JJR': (ADJ, {'misc': COMPARATIVE}),
'JJS': (ADJ, {'misc': SUPERLATIVE}),
'LS': (X, {}),
'MD': (VERB, {'tenspect': MODAL}),
'NN': (NOUN, {}),
'NNS': (NOUN, {'number': PLURAL}),
'NNP': (NOUN, {'misc': NAME}),
'NNPS': (NOUN, {'misc': NAME, 'number': PLURAL}),
'PDT': (DET, {}),
'POS': (PRT, {'case': GENITIVE}),
'PRP': (PRON, {}),
'PRP$': (PRON, {'case': GENITIVE}),
'RB': (ADV, {}),
'RBR': (ADV, {'misc': COMPARATIVE}),
'RBS': (ADV, {'misc': SUPERLATIVE}),
'RP': (PRT, {}),
'SYM': (X, {}),
'TO': (PRT, {}),
'UH': (X, {}),
'VB': (VERB, {}),
'VBD': (VERB, {'tenspect': PAST}),
'VBG': (VERB, {'tenspect': ING}),
'VBN': (VERB, {'tenspect': PASSIVE}),
'VBP': (VERB, {'tenspect': PRESENT}),
'VBZ': (VERB, {'tenspect': PRESENT, 'person': THIRD}),
'WDT': (DET, {'misc': RELATIVE}),
'WP': (PRON, {'misc': RELATIVE}),
'WP$': (PRON, {'misc': RELATIVE, 'case': GENITIVE}),
'WRB': (ADV, {'misc': RELATIVE}),
'!': (PUNCT, {}),
'#': (PUNCT, {}),
'$': (PUNCT, {}),
"''": (PUNCT, {}),
"(": (PUNCT, {}),
")": (PUNCT, {}),
"-LRB-": (PUNCT, {}),
"-RRB-": (PUNCT, {}),
".": (PUNCT, {}),
",": (PUNCT, {}),
"``": (PUNCT, {}),
":": (PUNCT, {}),
"?": (PUNCT, {}),
"ADD": (X, {}),
"NFP": (PUNCT, {}),
"GW": (X, {}),
"AFX": (X, {}),
"HYPH": (PUNCT, {}),
"XX": (X, {}),
"BES": (VERB, {'tenspect': PRESENT, 'person': THIRD}),
"HVS": (VERB, {'tenspect': PRESENT, 'person': THIRD}),
"SP": (SPACE, {})
}
POS_TEMPLATES = (
(W_orth,),
(P1_lemma, P1_pos),
(P2_lemma, P2_pos),
(N1_orth,),
(N2_orth,),
(W_suffix,),
(W_prefix,),
(P1_pos,),
(P2_pos,),
(P1_pos, P2_pos),
(P1_pos, W_orth),
(P1_suffix,),
(N1_suffix,),
(W_shape,),
(W_cluster,),
(N1_cluster,),
(N2_cluster,),
(P1_cluster,),
(P2_cluster,),
(W_flags,),
(N1_flags,),
(N2_flags,),
(P1_flags,),
(P2_flags,),
)
cdef struct _CachedMorph:
Morphology morph
int lemma
def setup_model_dir(tag_names, tag_map, templates, model_dir):
if path.exists(model_dir):
shutil.rmtree(model_dir)
os.mkdir(model_dir)
config = {
'templates': templates,
'tag_names': tag_names,
'tag_map': tag_map
}
with open(path.join(model_dir, 'config.json'), 'w') as file_:
json.dump(config, file_)
cdef class EnPosTagger:
cdef class EnPosTagger(Tagger):
"""A part-of-speech tagger for English"""
def __init__(self, StringStore strings, data_dir):
self.mem = Pool()
model_dir = path.join(data_dir, 'pos')
self.strings = strings
cfg = json.load(open(path.join(data_dir, 'pos', 'config.json')))
self.tag_names = sorted(cfg['tag_names'])
assert self.tag_names
self.n_tags = len(self.tag_names)
self.tag_map = cfg['tag_map']
cdef int n_tags = len(self.tag_names) + 1
self.model = Model(n_tags, cfg['templates'], model_dir)
self._morph_cache = PreshMapArray(n_tags)
self.tags = <PosTag*>self.mem.alloc(n_tags, sizeof(PosTag))
for i, tag in enumerate(sorted(self.tag_names)):
pos, props = self.tag_map[tag]
self.tags[i].id = i
self.tags[i].pos = pos
set_morph_from_dict(&self.tags[i].morph, props)
if path.exists(path.join(data_dir, 'tokenizer', 'morphs.json')):
self.load_morph_exceptions(json.load(open(path.join(data_dir, 'tokenizer',
'morphs.json'))))
self.lemmatizer = Lemmatizer(path.join(data_dir, 'wordnet'), NOUN, VERB, ADJ)
self.freqs = {TAG: defaultdict(int)}
for tag in self.tag_names:
self.freqs[TAG][self.strings[tag]] = 1
self.freqs[TAG][0] = 1
def __call__(self, Doc tokens):
"""Apply the tagger, setting the POS tags onto the Doc object.
Args:
tokens (Doc): The tokens to be tagged.
"""
if tokens.length == 0:
return 0
cdef int i
cdef atom_t[N_CONTEXT_FIELDS] context
cdef const weight_t* scores
for i in range(tokens.length):
if tokens.data[i].pos == 0:
fill_context(context, i, tokens.data)
scores = self.model.score(context)
guess = arg_max(scores, self.model.n_classes)
tokens.data[i].tag = self.strings[self.tag_names[guess]]
self.set_morph(i, &self.tags[guess], tokens.data)
tokens.is_tagged = True
tokens._py_tokens = [None] * tokens.length
def tag_from_strings(self, Doc tokens, object tag_strs):
cdef int i
for i in range(tokens.length):
tokens.data[i].tag = self.strings[tag_strs[i]]
self.set_morph(i, &self.tags[self.tag_names.index(tag_strs[i])],
tokens.data)
tokens.is_tagged = True
tokens._py_tokens = [None] * tokens.length
def train(self, Doc tokens, object gold_tag_strs):
cdef int i
cdef int loss
cdef atom_t[N_CONTEXT_FIELDS] context
cdef const weight_t* scores
golds = [self.tag_names.index(g) if g is not None else -1
for g in gold_tag_strs]
correct = 0
for i in range(tokens.length):
fill_context(context, i, tokens.data)
scores = self.model.score(context)
guess = arg_max(scores, self.model.n_classes)
loss = guess != golds[i] if golds[i] != -1 else 0
self.model.update(context, guess, golds[i], loss)
tokens.data[i].tag = self.strings[self.tag_names[guess]]
self.set_morph(i, &self.tags[guess], tokens.data)
correct += loss == 0
self.freqs[TAG][tokens.data[i].tag] += 1
return correct
cdef int set_morph(self, const int i, const PosTag* tag, TokenC* tokens) except -1:
tokens[i].pos = tag.pos
cached = <_CachedMorph*>self._morph_cache.get(tag.id, tokens[i].lex.orth)
if cached is NULL:
cached = <_CachedMorph*>self.mem.alloc(1, sizeof(_CachedMorph))
cached.lemma = self.lemmatize(tag.pos, tokens[i].lex)
cached.morph = tag.morph
self._morph_cache.set(tag.id, tokens[i].lex.orth, <void*>cached)
tokens[i].lemma = cached.lemma
tokens[i].morph = cached.morph
cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1:
if self.lemmatizer is None:
return lex.orth
cdef unicode py_string = self.strings[lex.orth]
if pos != NOUN and pos != VERB and pos != ADJ:
return lex.orth
cdef set lemma_strings
cdef unicode lemma_string
lemma_strings = self.lemmatizer(py_string, pos)
lemma_string = sorted(lemma_strings)[0]
lemma = self.strings[lemma_string]
return lemma
def load_morph_exceptions(self, dict exc):
cdef unicode pos_str
cdef unicode form_str
cdef unicode lemma_str
cdef dict entries
cdef dict props
cdef int lemma
cdef attr_t orth
cdef int pos
for pos_str, entries in exc.items():
pos = self.tag_names.index(pos_str)
for form_str, props in entries.items():
lemma_str = props.get('L', form_str)
orth = self.strings[form_str]
cached = <_CachedMorph*>self.mem.alloc(1, sizeof(_CachedMorph))
cached.lemma = self.strings[lemma_str]
set_morph_from_dict(&cached.morph, props)
self._morph_cache.set(pos, orth, <void*>cached)
cdef int fill_context(atom_t* context, const int i, const TokenC* tokens) except -1:
_fill_from_token(&context[P2_orth], &tokens[i-2])
_fill_from_token(&context[P1_orth], &tokens[i-1])
_fill_from_token(&context[W_orth], &tokens[i])
_fill_from_token(&context[N1_orth], &tokens[i+1])
_fill_from_token(&context[N2_orth], &tokens[i+2])
cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
context[0] = t.lex.lower
context[1] = t.lex.cluster
context[2] = t.lex.shape
context[3] = t.lex.prefix
context[4] = t.lex.suffix
context[5] = t.tag
context[6] = t.lemma
if t.lex.flags & (1 << IS_ALPHA):
context[7] = 1
elif t.lex.flags & (1 << IS_PUNCT):
context[7] = 2
elif t.lex.flags & (1 << LIKE_URL):
context[7] = 3
elif t.lex.flags & (1 << LIKE_NUM):
context[7] = 4
else:
context[7] = 0
def make_lemmatizer(self, data_dir):
return Lemmatizer(path.join(data_dir, 'wordnet'), NOUN, VERB, ADJ)

11
spacy/fi/__init__.py Normal file
View File

@ -0,0 +1,11 @@
from __future__ import unicode_literals, print_function
from os import path
from ..language import Language
class Finnish(Language):
@classmethod
def default_data_dir(cls):
return path.join(path.dirname(__file__), 'data')

252
spacy/language.py Normal file
View File

@ -0,0 +1,252 @@
from os import path
try:
import ujson as json
except ImportError:
import json
from .tokenizer import Tokenizer
from .vocab import Vocab
from .syntax.parser import Parser
from .tagger import Tagger
from .matcher import Matcher
from .serialize.packer import Packer
from ._ml import Model
from . import attrs
from . import orth
from .syntax.ner import BiluoPushDown
from .syntax.arc_eager import ArcEager
from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD
class Language(object):
@staticmethod
def lower(string):
return string.lower()
@staticmethod
def norm(string):
return string
@staticmethod
def shape(string):
return orth.word_shape(string)
@staticmethod
def prefix(string):
return string[0]
@staticmethod
def suffix(string):
return string[-3:]
@staticmethod
def prob(string):
return -30
@staticmethod
def cluster(string):
return 0
@staticmethod
def is_alpha(string):
return orth.is_alpha(string)
@staticmethod
def is_ascii(string):
return orth.is_ascii(string)
@staticmethod
def is_digit(string):
return string.isdigit()
@staticmethod
def is_lower(string):
return orth.is_lower(string)
@staticmethod
def is_punct(string):
return orth.is_punct(string)
@staticmethod
def is_space(string):
return string.isspace()
@staticmethod
def is_title(string):
return orth.is_title(string)
@staticmethod
def is_upper(string):
return orth.is_upper(string)
@staticmethod
def like_url(string):
return orth.like_url(string)
@staticmethod
def like_number(string):
return orth.like_number(string)
@staticmethod
def like_email(string):
return orth.like_email(string)
@classmethod
def default_lex_attrs(cls, data_dir=None):
return {
attrs.LOWER: cls.lower,
attrs.NORM: cls.norm,
attrs.SHAPE: cls.shape,
attrs.PREFIX: cls.prefix,
attrs.SUFFIX: cls.suffix,
attrs.CLUSTER: cls.cluster,
attrs.PROB: lambda string: -10.0,
attrs.IS_ALPHA: cls.is_alpha,
attrs.IS_ASCII: cls.is_ascii,
attrs.IS_DIGIT: cls.is_digit,
attrs.IS_LOWER: cls.is_lower,
attrs.IS_PUNCT: cls.is_punct,
attrs.IS_SPACE: cls.is_space,
attrs.IS_TITLE: cls.is_title,
attrs.IS_UPPER: cls.is_upper,
attrs.LIKE_URL: cls.like_url,
attrs.LIKE_NUM: cls.like_number,
attrs.LIKE_EMAIL: cls.like_email,
attrs.IS_STOP: lambda string: False,
attrs.IS_OOV: lambda string: True
}
@classmethod
def default_dep_labels(cls):
return {0: {'ROOT': True}}
@classmethod
def default_ner_labels(cls):
return {0: {'PER': True, 'LOC': True, 'ORG': True, 'MISC': True}}
@classmethod
def default_data_dir(cls):
return path.join(path.dirname(__file__), 'data')
@classmethod
def default_vectors(cls, data_dir):
return None
@classmethod
def default_vocab(cls, data_dir=None, get_lex_attr=None, vectors=None):
if data_dir is None:
data_dir = cls.default_data_dir()
if vectors is None:
vectors = cls.default_vectors(data_dir)
if get_lex_attr is None:
get_lex_attr = cls.default_lex_attrs(data_dir)
return Vocab.from_dir(
path.join(data_dir, 'vocab'),
get_lex_attr=get_lex_attr,
vectors=vectors)
@classmethod
def default_tokenizer(cls, vocab, data_dir):
if path.exists(data_dir):
return Tokenizer.from_dir(vocab, data_dir)
else:
return Tokenizer(vocab, {}, None, None, None)
@classmethod
def default_tagger(cls, vocab, data_dir):
if path.exists(data_dir):
return Tagger.from_dir(data_dir, vocab)
else:
return None
@classmethod
def default_parser(cls, vocab, data_dir):
if path.exists(data_dir):
return Parser.from_dir(data_dir, vocab.strings, ArcEager)
else:
return None
@classmethod
def default_entity(cls, vocab, data_dir):
if path.exists(data_dir):
return Parser.from_dir(data_dir, vocab.strings, BiluoPushDown)
else:
return None
@classmethod
def default_matcher(cls, vocab, data_dir):
if path.exists(data_dir):
return Matcher.from_dir(data_dir, vocab)
else:
return None
def __init__(self, data_dir=None, vocab=None, tokenizer=None, tagger=None,
parser=None, entity=None, matcher=None, serializer=None):
if data_dir is None:
data_dir = self.default_data_dir()
if vocab is None:
vocab = self.default_vocab(data_dir)
if tokenizer is None:
tokenizer = self.default_tokenizer(vocab, data_dir=path.join(data_dir, 'tokenizer'))
if tagger is None:
tagger = self.default_tagger(vocab, data_dir=path.join(data_dir, 'pos'))
if entity is None:
entity = self.default_entity(vocab, data_dir=path.join(data_dir, 'ner'))
if parser is None:
parser = self.default_parser(vocab, data_dir=path.join(data_dir, 'deps'))
if matcher is None:
matcher = self.default_matcher(vocab, data_dir=data_dir)
self.vocab = vocab
self.tokenizer = tokenizer
self.tagger = tagger
self.parser = parser
self.entity = entity
self.matcher = matcher
def __call__(self, text, tag=True, parse=True, entity=True, merge_mwes=False):
"""Apply the pipeline to some text. The text can span multiple sentences,
and can contain arbitrary whitespace. Alignment into the original string
is preserved.
Args:
text (unicode): The text to be processed.
Returns:
tokens (spacy.tokens.Doc):
>>> from spacy.en import English
>>> nlp = English()
>>> tokens = nlp('An example sentence. Another example sentence.')
>>> tokens[0].orth_, tokens[0].head.tag_
('An', 'NN')
"""
tokens = self.tokenizer(text)
if self.tagger and tag:
self.tagger(tokens)
if self.matcher and entity:
self.matcher(tokens)
if self.parser and parse:
self.parser(tokens)
if self.entity and entity:
self.entity(tokens)
return tokens
def end_training(self, data_dir=None):
if data_dir is None:
data_dir = self.data_dir
self.parser.model.end_training(path.join(data_dir, 'deps', 'model'))
self.entity.model.end_training(path.join(data_dir, 'ner', 'model'))
self.tagger.model.end_training(path.join(data_dir, 'pos', 'model'))
self.vocab.strings.dump(path.join(data_dir, 'vocab', 'strings.txt'))
with open(path.join(data_dir, 'vocab', 'serializer.json'), 'w') as file_:
file_.write(
json.dumps([
(TAG, list(self.tagger.freqs[TAG].items())),
(DEP, list(self.parser.moves.freqs[DEP].items())),
(ENT_IOB, list(self.entity.moves.freqs[ENT_IOB].items())),
(ENT_TYPE, list(self.entity.moves.freqs[ENT_TYPE].items())),
(HEAD, list(self.parser.moves.freqs[HEAD].items()))]))
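
The new Language base class wires the whole pipeline together from a single data directory: the vocab and tokenizer always load, while the tagger, parser and entity recognizer are created only when their sub-directories (pos, deps, ner) exist and are left as None otherwise, so a half-built model still tokenizes. A usage sketch following the docstring above, assuming an installed English data directory.

from spacy.en import English

nlp = English()   # English is now just a Language subclass pointing at its data dir
doc = nlp('An example sentence. Another example sentence.')
print(doc[0].orth_, doc[0].head.tag_)   # 'An', 'NN' per the docstring

# Components that could not be loaded are None and are skipped inside __call__,
# and each step can also be switched off per call:
doc = nlp('Just tokenize this.', tag=False, parse=False, entity=False)
print([t.orth_ for t in doc])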

86
spacy/lemmatizer.py Normal file
View File

@ -0,0 +1,86 @@
from __future__ import unicode_literals
from os import path
import codecs
try:
import ujson as json
except ImportError:
import json
from .parts_of_speech import NOUN, VERB, ADJ
class Lemmatizer(object):
@classmethod
def from_dir(cls, data_dir):
index = {}
exc = {}
for pos in ['adj', 'adv', 'noun', 'verb']:
index[pos] = read_index(path.join(data_dir, 'index.%s' % pos))
exc[pos] = read_exc(path.join(data_dir, '%s.exc' % pos))
rules = json.load(open(path.join(data_dir, 'lemma_rules.json')))
return cls(index, exc, rules)
def __init__(self, index, exceptions, rules):
self.index = index
self.exc = exceptions
self.rules = rules
def __call__(self, string, pos):
if pos == NOUN:
pos = 'noun'
elif pos == VERB:
pos = 'verb'
elif pos == ADJ:
pos = 'adj'
else:
return string
lemmas = lemmatize(string, self.index[pos], self.exc[pos], self.rules.get(pos, []))
return min(lemmas)
def noun(self, string):
return self(string, 'noun')
def verb(self, string):
return self(string, 'verb')
def adj(self, string):
return self(string, 'adj')
def lemmatize(string, index, exceptions, rules):
string = string.lower()
forms = []
if string in index:
forms.append(string)
forms.extend(exceptions.get(string, []))
for old, new in rules:
if string.endswith(old):
form = string[:len(string) - len(old)] + new
if form in index:
forms.append(form)
if not forms:
forms.append(string)
return set(forms)
def read_index(loc):
index = set()
for line in codecs.open(loc, 'r', 'utf8'):
if line.startswith(' '):
continue
pieces = line.split()
word = pieces[0]
if word.count('_') == 0:
index.add(word)
return index
def read_exc(loc):
exceptions = {}
for line in codecs.open(loc, 'r', 'utf8'):
if line.startswith(' '):
continue
pieces = line.split()
exceptions[pieces[0]] = tuple(pieces[1:])
return exceptions
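
The rewritten Lemmatizer is language-independent: it holds an index of attested forms, an exception table and the suffix-rewrite rules loaded from lemma_rules.json, and lemmatize() keeps every candidate produced by an (old, new) suffix pair that appears in the index. A self-contained sketch with toy data in place of the WordNet files that from_dir expects.

from spacy.lemmatizer import Lemmatizer
from spacy.parts_of_speech import NOUN

# Tiny stand-in data; a real model loads these via Lemmatizer.from_dir(wordnet_dir).
index = {'noun': {'church', 'man'}, 'verb': set(), 'adj': set()}
exc = {'noun': {'men': ('man',)}, 'verb': {}, 'adj': {}}
rules = {'noun': [['s', ''], ['ches', 'ch'], ['men', 'man']]}

lemmatizer = Lemmatizer(index, exc, rules)
print(lemmatizer('churches', NOUN))   # 'church' ('churche' is rejected: not in the index)
print(lemmatizer('men', NOUN))        # 'man', via the exception table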

View File

@ -4,101 +4,80 @@ from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTE
from .structs cimport LexemeC
from .strings cimport StringStore
from .vocab cimport Vocab
from numpy cimport ndarray
cdef LexemeC EMPTY_LEXEME
cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore strings,
const float* empty_vec) except -1
cdef class Lexeme:
cdef readonly ndarray repvec
cdef readonly flags_t flags
cdef readonly attr_t id
cdef readonly attr_t length
cdef LexemeC* c
cdef readonly Vocab vocab
cdef readonly attr_t orth
cdef readonly attr_t lower
cdef readonly attr_t norm
cdef readonly attr_t shape
cdef readonly attr_t prefix
cdef readonly attr_t suffix
cdef readonly unicode orth_
cdef readonly unicode lower_
cdef readonly unicode norm_
cdef readonly unicode shape_
cdef readonly unicode prefix_
cdef readonly unicode suffix_
cdef readonly attr_t cluster
cdef readonly float prob
cdef readonly float sentiment
cdef readonly float l2_norm
# Workaround for an apparent bug in the way the decorator is handled ---
# TODO: post bug report / patch to Cython.
@staticmethod
cdef inline Lexeme from_ptr(const LexemeC* ptr, StringStore strings, int repvec_length):
cdef Lexeme py = Lexeme.__new__(Lexeme, repvec_length)
for i in range(repvec_length):
py.repvec[i] = ptr.repvec[i]
py.l2_norm = ptr.l2_norm
py.flags = ptr.flags
py.id = ptr.id
py.length = ptr.length
py.orth = ptr.orth
py.lower = ptr.lower
py.norm = ptr.norm
py.shape = ptr.shape
py.prefix = ptr.prefix
py.suffix = ptr.suffix
py.orth_ = strings[ptr.orth]
py.lower_ = strings[ptr.lower]
py.norm_ = strings[ptr.norm]
py.shape_ = strings[ptr.shape]
py.prefix_ = strings[ptr.prefix]
py.suffix_ = strings[ptr.suffix]
py.cluster = ptr.cluster
py.prob = ptr.prob
py.sentiment = ptr.sentiment
return py
cpdef bint check_flag(self, attr_id_t flag_id) except -1
cdef inline Lexeme from_ptr(LexemeC* lex, Vocab vocab, int vector_length):
cdef Lexeme self = Lexeme.__new__(Lexeme, vocab, lex.orth)
self.c = lex
self.vocab = vocab
self.orth = lex.orth
@staticmethod
cdef inline void set_struct_attr(LexemeC* lex, attr_id_t name, attr_t value) nogil:
if name < (sizeof(flags_t) * 8):
Lexeme.set_flag(lex, name, value)
elif name == ID:
lex.id = value
elif name == LOWER:
lex.lower = value
elif name == NORM:
lex.norm = value
elif name == SHAPE:
lex.shape = value
elif name == PREFIX:
lex.prefix = value
elif name == SUFFIX:
lex.suffix = value
elif name == CLUSTER:
lex.cluster = value
cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
return lexeme.flags & (1 << flag_id)
@staticmethod
cdef inline attr_t get_struct_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
if feat_name < (sizeof(flags_t) * 8):
if Lexeme.check_flag(lex, feat_name):
return 1
else:
return 0
elif feat_name == ID:
return lex.id
elif feat_name == ORTH:
return lex.orth
elif feat_name == LOWER:
return lex.lower
elif feat_name == NORM:
return lex.norm
elif feat_name == SHAPE:
return lex.shape
elif feat_name == PREFIX:
return lex.prefix
elif feat_name == SUFFIX:
return lex.suffix
elif feat_name == LENGTH:
return lex.length
elif feat_name == CLUSTER:
return lex.cluster
else:
return 0
@staticmethod
cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
return lexeme.flags & (1 << flag_id)
cdef inline attr_t get_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
if feat_name < (sizeof(flags_t) * 8):
return check_flag(lex, feat_name)
elif feat_name == ID:
return lex.id
elif feat_name == ORTH:
return lex.orth
elif feat_name == LOWER:
return lex.norm
elif feat_name == NORM:
return lex.norm
elif feat_name == SHAPE:
return lex.shape
elif feat_name == PREFIX:
return lex.prefix
elif feat_name == SUFFIX:
return lex.suffix
elif feat_name == LENGTH:
return lex.length
elif feat_name == CLUSTER:
return lex.cluster
else:
return 0
@staticmethod
cdef inline bint set_flag(LexemeC* lex, attr_id_t flag_id, int value) nogil:
cdef flags_t one = 1
if value:
lex.flags |= one << flag_id
else:
lex.flags &= ~(one << flag_id)
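
All boolean lexical attributes (IS_ALPHA, LIKE_URL, the spare FLAG13 to FLAG25 slots, and so on) share a single flags_t bitfield on LexemeC: set_flag and check_flag above set or test one bit, and get_struct_attr treats any attribute id below sizeof(flags_t) * 8 as a flag lookup. The same arithmetic in plain Python, with made-up bit positions rather than the real attribute ids.

IS_ALPHA, IS_DIGIT, LIKE_URL = 1, 4, 14   # example bit positions, not the real attr ids

def set_flag(flags, flag_id, value):
    # Mirror of Lexeme.set_flag: set or clear one bit of the flags word.
    if value:
        return flags | (1 << flag_id)
    return flags & ~(1 << flag_id)

def check_flag(flags, flag_id):
    # Mirror of Lexeme.check_flag: non-zero iff the bit is set.
    return bool(flags & (1 << flag_id))

flags = 0
flags = set_flag(flags, IS_ALPHA, True)
flags = set_flag(flags, LIKE_URL, True)
print(check_flag(flags, IS_ALPHA), check_flag(flags, IS_DIGIT))   # True False
flags = set_flag(flags, IS_ALPHA, False)
print(check_flag(flags, IS_ALPHA))                                # False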

View File

@ -17,70 +17,120 @@ from .attrs cimport IS_OOV
memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore string_store,
const float* empty_vec) except -1:
lex.length = props['length']
lex.orth = string_store[props['orth']]
lex.lower = string_store[props['lower']]
lex.norm = string_store[props['norm']]
lex.shape = string_store[props['shape']]
lex.prefix = string_store[props['prefix']]
lex.suffix = string_store[props['suffix']]
lex.cluster = props['cluster']
lex.prob = props['prob']
lex.sentiment = props['sentiment']
lex.flags = props['flags']
lex.repvec = empty_vec
cdef class Lexeme:
"""An entry in the vocabulary. A Lexeme has no string context --- it's a
word-type, as opposed to a word token. It therefore has no part-of-speech
tag, dependency parse, or lemma (lemmatization depends on the part-of-speech
tag).
"""
def __cinit__(self, int vec_size):
self.repvec = numpy.ndarray(shape=(vec_size,), dtype=numpy.float32)
def __init__(self, Vocab vocab, int orth):
self.vocab = vocab
self.orth = orth
self.c = <LexemeC*><void*>vocab.get_by_orth(vocab.mem, orth)
assert self.c.orth == orth
@property
def has_repvec(self):
return self.l2_norm != 0
def py_set_flag(self, attr_id_t flag_id):
Lexeme.set_flag(self.c, flag_id, True)
def py_check_flag(self, attr_id_t flag_id):
return True if Lexeme.check_flag(self.c, flag_id) else False
cpdef bint check_flag(self, attr_id_t flag_id) except -1:
cdef flags_t one = 1
return self.flags & (one << flag_id)
property orth_:
def __get__(self):
return self.vocab.strings[self.c.orth]
property lower:
def __get__(self): return self.c.lower
def __set__(self, int x): self.c.lower = x
property norm:
def __get__(self): return self.c.norm
def __set__(self, int x): self.c.norm = x
property shape:
def __get__(self): return self.c.shape
def __set__(self, int x): self.c.shape = x
property prefix:
def __get__(self): return self.c.prefix
def __set__(self, int x): self.c.prefix = x
property suffix:
def __get__(self): return self.c.suffix
def __set__(self, int x): self.c.suffix = x
property cluster:
def __get__(self): return self.c.cluster
def __set__(self, int x): self.c.cluster = x
property prob:
def __get__(self): return self.c.prob
def __set__(self, float x): self.c.prob = x
property lower_:
def __get__(self): return self.vocab.strings[self.c.lower]
def __set__(self, unicode x): self.c.lower = self.vocab.strings[x]
property norm_:
def __get__(self): return self.vocab.strings[self.c.norm]
def __set__(self, unicode x): self.c.norm = self.vocab.strings[x]
property shape_:
def __get__(self): return self.vocab.strings[self.c.shape]
def __set__(self, unicode x): self.c.shape = self.vocab.strings[x]
property prefix_:
def __get__(self): return self.vocab.strings[self.c.prefix]
def __set__(self, unicode x): self.c.prefix = self.vocab.strings[x]
property suffix_:
def __get__(self): return self.vocab.strings[self.c.suffix]
def __set__(self, unicode x): self.c.suffix = self.vocab.strings[x]
property flags:
def __get__(self): return self.c.flags
def __set__(self, flags_t x): self.c.flags = x
property is_oov:
def __get__(self): return self.check_flag(IS_OOV)
def __get__(self): return Lexeme.check_flag(self.c, IS_OOV)
def __set__(self, bint x): Lexeme.set_flag(self.c, IS_OOV, x)
property is_alpha:
def __get__(self): return self.check_flag(IS_ALPHA)
def __get__(self): return Lexeme.check_flag(self.c, IS_ALPHA)
def __set__(self, bint x): Lexeme.set_flag(self.c, IS_ALPHA, x)
property is_ascii:
def __get__(self): return self.check_flag(IS_ASCII)
def __get__(self): return Lexeme.check_flag(self.c, IS_ASCII)
def __set__(self, bint x): Lexeme.set_flag(self.c, IS_ASCII, x)
property is_digit:
def __get__(self): return self.check_flag(IS_DIGIT)
def __get__(self): return Lexeme.check_flag(self.c, IS_DIGIT)
def __set__(self, bint x): Lexeme.set_flag(self.c, IS_DIGIT, x)
property is_lower:
def __get__(self): return self.check_flag(IS_LOWER)
def __get__(self): return Lexeme.check_flag(self.c, IS_LOWER)
def __set__(self, bint x): Lexeme.set_flag(self.c, IS_LOWER, x)
property is_title:
def __get__(self): return self.check_flag(IS_TITLE)
def __get__(self): return Lexeme.check_flag(self.c, IS_TITLE)
def __set__(self, bint x): Lexeme.set_flag(self.c, IS_TITLE, x)
property is_punct:
def __get__(self): return self.check_flag(IS_PUNCT)
def __get__(self): return Lexeme.check_flag(self.c, IS_PUNCT)
def __set__(self, bint x): Lexeme.set_flag(self.c, IS_PUNCT, x)
property is_space:
def __get__(self): return self.check_flag(IS_SPACE)
def __get__(self): return Lexeme.check_flag(self.c, IS_SPACE)
def __set__(self, bint x): Lexeme.set_flag(self.c, IS_SPACE, x)
property like_url:
def __get__(self): return self.check_flag(LIKE_URL)
def __get__(self): return Lexeme.check_flag(self.c, LIKE_URL)
def __set__(self, bint x): Lexeme.set_flag(self.c, LIKE_URL, x)
property like_num:
def __get__(self): return self.check_flag(LIKE_NUM)
def __get__(self): return Lexeme.check_flag(self.c, LIKE_NUM)
def __set__(self, bint x): Lexeme.set_flag(self.c, LIKE_NUM, x)
property like_email:
def __get__(self): return self.check_flag(LIKE_EMAIL)
def __get__(self): return Lexeme.check_flag(self.c, LIKE_EMAIL)
def __set__(self, bint x): Lexeme.set_flag(self.c, LIKE_EMAIL, x)
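As a usage note for the properties above: a minimal sketch of reading Lexeme attributes from a loaded vocabulary, assuming an installed English model; the printed values are illustrative only.
# Usage sketch: reading Lexeme attributes from the vocabulary.
# Assumes spacy.en.English() can load a model; printed values are illustrative.
from spacy.en import English

nlp = English()
apple = nlp.vocab[u'apple']                     # look up a word-type by string
print(apple.orth_, apple.lower_, apple.shape_)  # e.g. u'apple', u'apple', u'xxxx'
print(apple.prob, apple.cluster)                # unigram log-probability and Brown cluster
print(apple.is_alpha, apple.is_oov)             # boolean lexical flags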

200
spacy/matcher.pyx Normal file
View File

@ -0,0 +1,200 @@
from os import path
from .typedefs cimport attr_t
from .attrs cimport attr_id_t
from .structs cimport TokenC
from cymem.cymem cimport Pool
from libcpp.vector cimport vector
from .attrs cimport LENGTH, ENT_TYPE, ORTH, NORM, LEMMA, LOWER, SHAPE
from .attrs cimport FLAG13, FLAG14, FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21, FLAG22, FLAG23, FLAG24, FLAG25
from .tokens.doc cimport get_token_attr
from .tokens.doc cimport Doc
from .vocab cimport Vocab
from libcpp.vector cimport vector
try:
import ujson as json
except ImportError:
import json
cdef struct AttrValue:
attr_id_t attr
attr_t value
cdef struct Pattern:
AttrValue* spec
int length
cdef Pattern* init_pattern(Pool mem, object token_specs, attr_t entity_type) except NULL:
pattern = <Pattern*>mem.alloc(len(token_specs) + 1, sizeof(Pattern))
cdef int i
for i, spec in enumerate(token_specs):
pattern[i].spec = <AttrValue*>mem.alloc(len(spec), sizeof(AttrValue))
pattern[i].length = len(spec)
for j, (attr, value) in enumerate(spec):
pattern[i].spec[j].attr = attr
pattern[i].spec[j].value = value
i = len(token_specs)
pattern[i].spec = <AttrValue*>mem.alloc(2, sizeof(AttrValue))
pattern[i].spec[0].attr = ENT_TYPE
pattern[i].spec[0].value = entity_type
pattern[i].spec[1].attr = LENGTH
pattern[i].spec[1].value = len(token_specs)
pattern[i].length = 0
return pattern
cdef int match(const Pattern* pattern, const TokenC* token) except -1:
cdef int i
for i in range(pattern.length):
if get_token_attr(token, pattern.spec[i].attr) != pattern.spec[i].value:
print "Pattern fail", pattern.spec[i].attr, pattern.spec[i].value
print get_token_attr(token, pattern.spec[i].attr)
return False
return True
cdef int is_final(const Pattern* pattern) except -1:
return (pattern + 1).length == 0
cdef object get_entity(const Pattern* pattern, const TokenC* tokens, int i):
pattern += 1
i += 1
return (pattern.spec[0].value, i - pattern.spec[1].value, i)
def _convert_strings(token_specs, string_store):
converted = []
for spec in token_specs:
converted.append([])
for attr, value in spec.items():
if isinstance(attr, basestring):
attr = map_attr_name(attr)
if isinstance(value, basestring):
value = string_store[value]
if isinstance(value, bool):
value = int(value)
converted[-1].append((attr, value))
print "Converted", converted[-1]
return converted
def map_attr_name(attr):
attr = attr.upper()
if attr == 'ORTH':
return ORTH
elif attr == 'LEMMA':
return LEMMA
elif attr == 'LOWER':
return LOWER
elif attr == 'SHAPE':
return SHAPE
elif attr == 'NORM':
return NORM
elif attr == 'FLAG13':
return FLAG13
elif attr == 'FLAG14':
return FLAG14
elif attr == 'FLAG15':
return FLAG15
elif attr == 'FLAG16':
return FLAG16
elif attr == 'FLAG17':
return FLAG17
elif attr == 'FLAG18':
return FLAG18
elif attr == 'FLAG19':
return FLAG19
elif attr == 'FLAG20':
return FLAG20
elif attr == 'FLAG21':
return FLAG21
elif attr == 'FLAG22':
return FLAG22
elif attr == 'FLAG23':
return FLAG23
elif attr == 'FLAG24':
return FLAG24
elif attr == 'FLAG25':
return FLAG25
else:
raise Exception("TODO: Finish supporting attr mapping %s" % attr)
cdef class Matcher:
cdef Pool mem
cdef vector[Pattern*] patterns
cdef readonly Vocab vocab
def __init__(self, vocab, patterns):
self.vocab = vocab
self.mem = Pool()
for entity_key, (etype, attrs, specs) in sorted(patterns.items()):
self.add(entity_key, etype, attrs, specs)
@classmethod
def from_dir(cls, data_dir, Vocab vocab):
patterns_loc = path.join(data_dir, 'vocab', 'gazetteer.json')
if path.exists(patterns_loc):
patterns_data = open(patterns_loc).read()
patterns = json.loads(patterns_data)
return cls(vocab, patterns)
else:
return cls(vocab, {})
property n_patterns:
def __get__(self): return self.patterns.size()
def add(self, entity_key, etype, attrs, specs):
if isinstance(entity_key, basestring):
entity_key = self.vocab.strings[entity_key]
if isinstance(etype, basestring):
etype = self.vocab.strings[etype]
elif etype is None:
etype = -1
# TODO: Do something more clever about multiple patterns for single
# entity
for spec in specs:
spec = _convert_strings(spec, self.vocab.strings)
self.patterns.push_back(init_pattern(self.mem, spec, etype))
def __call__(self, Doc doc):
cdef vector[Pattern*] partials
cdef int n_partials = 0
cdef int q = 0
cdef int i, token_i
cdef const TokenC* token
cdef Pattern* state
matches = []
for token_i in range(doc.length):
print 'check', doc[token_i].orth_
token = &doc.data[token_i]
q = 0
for i in range(partials.size()):
state = partials.at(i)
if match(state, token):
print 'match!'
if is_final(state):
matches.append(get_entity(state, token, token_i))
else:
partials[q] = state + 1
q += 1
partials.resize(q)
for i in range(self.n_patterns):
state = self.patterns[i]
if match(state, token):
print 'match!'
if is_final(state):
matches.append(get_entity(state, token, token_i))
else:
partials.push_back(state + 1)
doc.ents = [(e.label, e.start, e.end) for e in doc.ents] + matches
return matches
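For context, a minimal usage sketch of the Matcher above. The pattern value format mirrors what from_dir() expects from vocab/gazetteer.json: an entity key mapped to (entity type, attrs, list of token-spec sequences). The 'GoogleNow' pattern and the English pipeline here are illustrative assumptions, not part of this commit.
# Usage sketch for the Matcher above; pattern data is illustrative.
from spacy.en import English
from spacy.matcher import Matcher

nlp = English()
patterns = {
    'GoogleNow': ('PRODUCT', {}, [[{'ORTH': 'Google'}, {'ORTH': 'Now'}]]),
}
matcher = Matcher(nlp.vocab, patterns)
doc = nlp(u'I like Google Now.')
matches = matcher(doc)            # list of (entity type id, start, end); also extends doc.ents
for ent_type, start, end in matches:
    print(doc.vocab.strings[ent_type], start, end)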

View File

@ -1,4 +1,755 @@
from .structs cimport TokenC, Morphology, PosTag
from cymem.cymem cimport Pool
from preshed.maps cimport PreshMapArray
from libc.stdint cimport uint64_t
from .structs cimport TokenC
from .strings cimport StringStore
from .typedefs cimport attr_t
from .parts_of_speech cimport univ_pos_t
cdef int set_morph_from_dict(Morphology* morph, dict props) except -1
cdef struct RichTagC:
uint64_t morph
int id
univ_pos_t pos
attr_t name
cdef struct MorphAnalysisC:
RichTagC tag
attr_t lemma
cdef class Morphology:
cdef readonly Pool mem
cdef readonly StringStore strings
cdef public object lemmatizer
cdef public object n_tags
cdef public object reverse_index
cdef public object tag_names
cdef RichTagC* rich_tags
cdef PreshMapArray _cache
cdef int assign_tag(self, TokenC* token, tag) except -1
cdef int assign_feature(self, uint64_t* morph, feature, value) except -1
#
#cpdef enum Feature_t:
# Abbr
# AdpType
# AdvType
# ConjType
# Connegative
# Derivation
# Echo
# Foreign
# Gender_dat
# Gender_erg
# Gender_psor
# Hyph
# InfForm
# NameType
# NounType
# NumberAbs
# NumberDat
# NumberErg
# NumberPsee
# NumberPsor
# NumForm
# NumValue
# PartForm
# PartType
# Person_abs
# Person_dat
# Person_psor
# Polite
# Polite_abs
# Polite_dat
# Prefix
# PrepCase
# PunctSide
# PunctType
# Style
# Typo
# Variant
# VerbType
#
#
#cpdef enum Animacy:
# Anim
# Inam
#
#
#cpdef enum Aspect:
# Freq
# Imp
# Mod
# None_
# Perf
#
#
#cpdef enum Case1:
# Nom
# Gen
# Acc
# Dat
# Voc
# Abl
#
#cdef enum Case2:
# Abe
# Abs
# Ade
# All
# Cau
# Com
# Del
# Dis
#
#cdef enum Case3:
# Ela
# Ess
# Ill
# Ine
# Ins
# Loc
# Lat
# Par
#
#cdef enum Case4:
# Sub
# Sup
# Tem
# Ter
# Tra
#
#
#cpdef enum Definite:
# Two
# Def
# Red
# Ind
#
#
#cpdef enum Degree:
# Cmp
# Comp
# None_
# Pos
# Sup
# Abs
# Com
# Degree # du
#
#
#cpdef enum Gender:
# Com
# Fem
# Masc
# Neut
#
#
#cpdef enum Mood:
# Cnd
# Imp
# Ind
# N
# Pot
# Sub
# Opt
#
#
#cpdef enum Negative:
# Neg
# Pos
# Yes
#
#
#cpdef enum Number:
# Com
# Dual
# None_
# Plur
# Sing
# Ptan # bg
# Count # bg
#
#
#cpdef enum NumType:
# Card
# Dist
# Frac
# Gen
# Mult
# None_
# Ord
# Sets
#
#
#cpdef enum Person:
# One
# Two
# Three
# None_
#
#
#cpdef enum Poss:
# Yes
#
#
#cpdef enum PronType1:
# AdvPart
# Art
# Default
# Dem
# Ind
# Int
# Neg
#
#cpdef enum PronType2:
# Prs
# Rcp
# Rel
# Tot
# Clit
# Exc # es, ca, it, fa
# Clit # it
#
#
#cpdef enum Reflex:
# Yes
#
#
#cpdef enum Tense:
# Fut
# Imp
# Past
# Pres
#
#cpdef enum VerbForm1:
# Fin
# Ger
# Inf
# None_
# Part
# PartFut
# PartPast
#
#cpdef enum VerbForm2:
# PartPres
# Sup
# Trans
# Gdv # la
#
#
#cpdef enum Voice:
# Act
# Cau
# Pass
# Mid # gkc
# Int # hb
#
#
#cpdef enum Abbr:
# Yes # cz, fi, sl, U
#
#cpdef enum AdpType:
# Prep # cz, U
# Post # U
# Voc # cz
# Comprep # cz
# Circ # U
# Voc # U
#
#
#cpdef enum AdvType1:
# # U
# Man
# Loc
# Tim
# Deg
# Cau
# Mod
# Sta
# Ex
#
#cpdef enum AdvType2:
# Adadj
#
#cpdef enum ConjType:
# Oper # cz, U
# Comp # cz, U
#
#cpdef enum Connegative:
# Yes # fi
#
#
#cpdef enum Derivation1:
# Minen # fi
# Sti # fi
# Inen # fi
# Lainen # fi
# Ja # fi
# Ton # fi
# Vs # fi
# Ttain # fi
#
#cpdef enum Derivation2:
# Ttaa
#
#
#cpdef enum Echo:
# Rdp # U
# Ech # U
#
#
#cpdef enum Foreign:
# Foreign # cz, fi, U
# Fscript # cz, fi, U
# Tscript # cz, U
# Yes # sl
#
#
#cpdef enum Gender_dat:
# Masc # bq, U
# Fem # bq, U
#
#
#cpdef enum Gender_erg:
# Masc # bq
# Fem # bq
#
#
#cpdef enum Gender_psor:
# Masc # cz, sl, U
# Fem # cz, sl, U
# Neut # sl
#
#
#cpdef enum Hyph:
# Yes # cz, U
#
#
#cpdef enum InfForm:
# One # fi
# Two # fi
# Three # fi
#
#
#cpdef enum NameType:
# Geo # U, cz
# Prs # U, cz
# Giv # U, cz
# Sur # U, cz
# Nat # U, cz
# Com # U, cz
# Pro # U, cz
# Oth # U, cz
#
#
#cpdef enum NounType:
# Com # U
# Prop # U
# Class # U
#
#cpdef enum Number_abs:
# Sing # bq, U
# Plur # bq, U
#
#cpdef enum Number_dat:
# Sing # bq, U
# Plur # bq, U
#
#cpdef enum Number_erg:
# Sing # bq, U
# Plur # bq, U
#
#cpdef enum Number_psee:
# Sing # U
# Plur # U
#
#
#cpdef enum Number_psor:
# Sing # cz, fi, sl, U
# Plur # cz, fi, sl, U
#
#
#cpdef enum NumForm:
# Digit # cz, sl, U
# Roman # cz, sl, U
# Word # cz, sl, U
#
#
#cpdef enum NumValue:
# One # cz, U
# Two # cz, U
# Three # cz, U
#
#
#cpdef enum PartForm:
# Pres # fi
# Past # fi
# Agt # fi
# Neg # fi
#
#
#cpdef enum PartType:
# Mod # U
# Emp # U
# Res # U
# Inf # U
# Vbp # U
#
#cpdef enum Person_abs:
# One # bq, U
# Two # bq, U
# Three # bq, U
#
#
#cpdef enum Person_dat:
# One # bq, U
# Two # bq, U
# Three # bq, U
#
#
#cpdef enum Person_erg:
# One # bq, U
# Two # bq, U
# Three # bq, U
#
#
#cpdef enum Person_psor:
# One # fi, U
# Two # fi, U
# Three # fi, U
#
#
#cpdef enum Polite:
# Inf # bq, U
# Pol # bq, U
#
#
#cpdef enum Polite_abs:
# Inf # bq, U
# Pol # bq, U
#
#
#cpdef enum Polite_erg:
# Inf # bq, U
# Pol # bq, U
#
#
#cpdef enum Polite_dat:
# Inf # bq, U
# Pol # bq, U
#
#
#cpdef enum Prefix:
# Yes # U
#
#
#cpdef enum PrepCase:
# Npr # cz
# Pre # U
#
#
#cpdef enum PunctSide:
# Ini # U
# Fin # U
#
#cpdef enum PunctType1:
# Peri # U
# Qest # U
# Excl # U
# Quot # U
# Brck # U
# Comm # U
# Colo # U
# Semi # U
#
#cpdef enum PunctType2:
# Dash # U
#
#
#cpdef enum Style1:
# Arch # cz, fi, U
# Rare # cz, fi, U
# Poet # cz, U
# Norm # cz, U
# Coll # cz, U
# Vrnc # cz, U
# Sing # cz, U
# Expr # cz, U
#
#
#cpdef enum Style2:
# Derg # cz, U
# Vulg # cz, U
#
#
#cpdef enum Typo:
# Yes # fi, U
#
#
#cpdef enum Variant:
# Short # cz
# Bound # cz, sl
#
#
#cpdef enum VerbType:
# Aux # U
# Cop # U
# Mod # U
# Light # U
#
cpdef enum Value_t:
Animacy_Anim
Animacy_Inam
Aspect_Freq
Aspect_Imp
Aspect_Mod
Aspect_None_
Aspect_Perf
Case_Abe
Case_Abl
Case_Abs
Case_Acc
Case_Ade
Case_All
Case_Cau
Case_Com
Case_Dat
Case_Del
Case_Dis
Case_Ela
Case_Ess
Case_Gen
Case_Ill
Case_Ine
Case_Ins
Case_Loc
Case_Lat
Case_Nom
Case_Par
Case_Sub
Case_Sup
Case_Tem
Case_Ter
Case_Tra
Case_Voc
Definite_Two
Definite_Def
Definite_Red
Definite_Ind
Degree_Cmp
Degree_Comp
Degree_None
Degree_Pos
Degree_Sup
Degree_Abs
Degree_Com
Degree_Dim # du
Gender_Com
Gender_Fem
Gender_Masc
Gender_Neut
Mood_Cnd
Mood_Imp
Mood_Ind
Mood_N
Mood_Pot
Mood_Sub
Mood_Opt
Negative_Neg
Negative_Pos
Negative_Yes
Number_Com
Number_Dual
Number_None
Number_Plur
Number_Sing
Number_Ptan # bg
Number_Count # bg
NumType_Card
NumType_Dist
NumType_Frac
NumType_Gen
NumType_Mult
NumType_None
NumType_Ord
NumType_Sets
Person_One
Person_Two
Person_Three
Person_None
Poss_Yes
PronType_AdvPart
PronType_Art
PronType_Default
PronType_Dem
PronType_Ind
PronType_Int
PronType_Neg
PronType_Prs
PronType_Rcp
PronType_Rel
PronType_Tot
PronType_Clit
PronType_Exc # es, ca, it, fa
Reflex_Yes
Tense_Fut
Tense_Imp
Tense_Past
Tense_Pres
VerbForm_Fin
VerbForm_Ger
VerbForm_Inf
VerbForm_None
VerbForm_Part
VerbForm_PartFut
VerbForm_PartPast
VerbForm_PartPres
VerbForm_Sup
VerbForm_Trans
VerbForm_Gdv # la
Voice_Act
Voice_Cau
Voice_Pass
Voice_Mid # gkc
Voice_Int # hb
Abbr_Yes # cz, fi, sl, U
AdpType_Prep # cz, U
AdpType_Post # U
AdpType_Voc # cz
AdpType_Comprep # cz
AdpType_Circ # U
AdvType_Man
AdvType_Loc
AdvType_Tim
AdvType_Deg
AdvType_Cau
AdvType_Mod
AdvType_Sta
AdvType_Ex
AdvType_Adadj
ConjType_Oper # cz, U
ConjType_Comp # cz, U
Connegative_Yes # fi
Derivation_Minen # fi
Derivation_Sti # fi
Derivation_Inen # fi
Derivation_Lainen # fi
Derivation_Ja # fi
Derivation_Ton # fi
Derivation_Vs # fi
Derivation_Ttain # fi
Derivation_Ttaa # fi
Echo_Rdp # U
Echo_Ech # U
Foreign_Foreign # cz, fi, U
Foreign_Fscript # cz, fi, U
Foreign_Tscript # cz, U
Foreign_Yes # sl
Gender_dat_Masc # bq, U
Gender_dat_Fem # bq, U
Gender_erg_Masc # bq
Gender_erg_Fem # bq
Gender_psor_Masc # cz, sl, U
Gender_psor_Fem # cz, sl, U
Gender_psor_Neut # sl
Hyph_Yes # cz, U
InfForm_One # fi
InfForm_Two # fi
InfForm_Three # fi
NameType_Geo # U, cz
NameType_Prs # U, cz
NameType_Giv # U, cz
NameType_Sur # U, cz
NameType_Nat # U, cz
NameType_Com # U, cz
NameType_Pro # U, cz
NameType_Oth # U, cz
NounType_Com # U
NounType_Prop # U
NounType_Class # U
Number_abs_Sing # bq, U
Number_abs_Plur # bq, U
Number_dat_Sing # bq, U
Number_dat_Plur # bq, U
Number_erg_Sing # bq, U
Number_erg_Plur # bq, U
Number_psee_Sing # U
Number_psee_Plur # U
Number_psor_Sing # cz, fi, sl, U
Number_psor_Plur # cz, fi, sl, U
NumForm_Digit # cz, sl, U
NumForm_Roman # cz, sl, U
NumForm_Word # cz, sl, U
NumValue_One # cz, U
NumValue_Two # cz, U
NumValue_Three # cz, U
PartForm_Pres # fi
PartForm_Past # fi
PartForm_Agt # fi
PartForm_Neg # fi
PartType_Mod # U
PartType_Emp # U
PartType_Res # U
PartType_Inf # U
PartType_Vbp # U
Person_abs_One # bq, U
Person_abs_Two # bq, U
Person_abs_Three # bq, U
Person_dat_One # bq, U
Person_dat_Two # bq, U
Person_dat_Three # bq, U
Person_erg_One # bq, U
Person_erg_Two # bq, U
Person_erg_Three # bq, U
Person_psor_One # fi, U
Person_psor_Two # fi, U
Person_psor_Three # fi, U
Polite_Inf # bq, U
Polite_Pol # bq, U
Polite_abs_Inf # bq, U
Polite_abs_Pol # bq, U
Polite_erg_Inf # bq, U
Polite_erg_Pol # bq, U
Polite_dat_Inf # bq, U
Polite_dat_Pol # bq, U
Prefix_Yes # U
PrepCase_Npr # cz
PrepCase_Pre # U
PunctSide_Ini # U
PunctSide_Fin # U
PunctType_Peri # U
PunctType_Qest # U
PunctType_Excl # U
PunctType_Quot # U
PunctType_Brck # U
PunctType_Comm # U
PunctType_Colo # U
PunctType_Semi # U
PunctType_Dash # U
Style_Arch # cz, fi, U
Style_Rare # cz, fi, U
Style_Poet # cz, U
Style_Norm # cz, U
Style_Coll # cz, U
Style_Vrnc # cz, U
Style_Sing # cz, U
Style_Expr # cz, U
Style_Derg # cz, U
Style_Vulg # cz, U
Style_Yes # fi, U
StyleVariant_StyleShort # cz
StyleVariant_StyleBound # cz, sl
VerbType_Aux # U
VerbType_Cop # U
VerbType_Mod # U
VerbType_Light # U

View File

@ -1,11 +1,89 @@
# cython: embedsignature=True
from os import path
from .lemmatizer import Lemmatizer
try:
import ujson as json
except ImportError:
import json
from .parts_of_speech import UNIV_POS_NAMES
from .parts_of_speech cimport ADJ, VERB, NOUN
cdef int set_morph_from_dict(Morphology* morph, dict props) except -1:
morph.number = props.get('number', 0)
morph.tenspect = props.get('tenspect', 0)
morph.mood = props.get('mood', 0)
morph.gender = props.get('gender', 0)
morph.person = props.get('person', 0)
morph.case = props.get('case', 0)
morph.misc = props.get('misc', 0)
cdef class Morphology:
def __init__(self, StringStore string_store, tag_map, lemmatizer):
self.mem = Pool()
self.strings = string_store
self.lemmatizer = lemmatizer
self.n_tags = len(tag_map) + 1
self.tag_names = tuple(sorted(tag_map.keys()))
self.reverse_index = {}
self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags, sizeof(RichTagC))
for i, (tag_str, props) in enumerate(sorted(tag_map.items())):
self.rich_tags[i].id = i
self.rich_tags[i].name = self.strings[tag_str]
self.rich_tags[i].morph = 0
self.reverse_index[self.rich_tags[i].name] = i
self._cache = PreshMapArray(self.n_tags)
cdef int assign_tag(self, TokenC* token, tag) except -1:
cdef int tag_id
if isinstance(tag, basestring):
try:
tag_id = self.reverse_index[self.strings[tag]]
except KeyError:
print tag
raise
else:
tag_id = tag
analysis = <MorphAnalysisC*>self._cache.get(tag_id, token.lex.orth)
if analysis is NULL:
analysis = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
analysis.tag = self.rich_tags[tag_id]
analysis.lemma = self.lemmatize(analysis.tag.pos, token.lex.orth)
token.lemma = analysis.lemma
token.pos = analysis.tag.pos
token.tag = analysis.tag.name
token.morph = analysis.tag.morph
cdef int assign_feature(self, uint64_t* morph, feature, value) except -1:
pass
def load_morph_exceptions(self, dict exc):
# Map (form, pos) to (lemma, rich tag)
cdef unicode pos_str
cdef unicode form_str
cdef unicode lemma_str
cdef dict entries
cdef dict props
cdef int lemma
cdef attr_t orth
cdef int pos
for tag_str, entries in exc.items():
tag = self.strings[tag_str]
rich_tag = self.rich_tags[self.reverse_index[tag]]
for form_str, props in entries.items():
cached = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
orth = self.strings[form_str]
for name_str, value_str in props.items():
if name_str == 'L':
cached.lemma = self.strings[value_str]
else:
self.assign_feature(&cached.tag.morph, name_str, value_str)
if cached.lemma == 0:
cached.lemma = self.lemmatize(rich_tag.pos, orth)
self._cache.set(rich_tag.pos, orth, <void*>cached)
def lemmatize(self, const univ_pos_t pos, attr_t orth):
if self.lemmatizer is None:
return orth
cdef unicode py_string = self.strings[orth]
if pos != NOUN and pos != VERB and pos != ADJ:
return orth
cdef set lemma_strings
cdef unicode lemma_string
lemma_strings = self.lemmatizer(py_string, pos)
lemma_string = sorted(lemma_strings)[0]
lemma = self.strings[lemma_string]
return lemma
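To make the control flow of lemmatize() explicit, here is a minimal pure-Python restatement; the strings and lemmatizer arguments are stand-ins (assumptions) for the StringStore and Lemmatizer objects used above.
# Pure-Python sketch of Morphology.lemmatize() above. `strings` stands in
# for the StringStore (maps id <-> unicode) and `lemmatizer` for the
# Lemmatizer callable; both are assumptions for illustration.
def lemmatize(lemmatizer, strings, pos, orth, open_classes=(u'NOUN', u'VERB', u'ADJ')):
    if lemmatizer is None:
        return orth                          # no lemmatizer: keep the surface form
    if pos not in open_classes:
        return orth                          # only open-class words are lemmatized
    py_string = strings[orth]                # id -> unicode
    candidates = lemmatizer(py_string, pos)  # set of candidate lemma strings
    lemma_string = sorted(candidates)[0]     # deterministic choice among candidates
    return strings[lemma_string]             # unicode -> id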

View File

@ -69,7 +69,7 @@ TLDs = set("com|org|edu|gov|net|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|mu
cpdef bint like_url(unicode string):
# We're looking for things that function in text like URLs. So, valid URL
# or not, anything they say http:// is going to be good.
if string.startswith('http://'):
if string.startswith('http://') or string.startswith('https://'):
return True
elif string.startswith('www.') and len(string) >= 5:
return True
@ -92,6 +92,7 @@ cpdef bint like_url(unicode string):
return False
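A few illustrative inputs for like_url(), exercising only the prefix branches visible in this hunk:
# Quick illustrative checks for like_url(); only the prefix branches shown
# above (http://, https://, and www. with length >= 5) are exercised.
assert like_url(u'http://example.com')
assert like_url(u'https://example.com')
assert like_url(u'www.python.org')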
# TODO: This should live in the language.orth
NUM_WORDS = set('zero one two three four five six seven eight nine ten '
'eleven twelve thirteen fourteen fifteen sixteen seventeen '
'eighteen nineteen twenty thirty forty fifty sixty seventy '

View File

@ -2,17 +2,22 @@
cpdef enum univ_pos_t:
NO_TAG
ADJ
ADV
ADP
ADV
AUX
CONJ
DET
INTJ
NOUN
NUM
PART
PRON
PRT
PROPN
PUNCT
SCONJ
SYM
VERB
X
PUNCT
EOL
SPACE
N_UNIV_TAGS

View File

@ -4,17 +4,22 @@ from __future__ import unicode_literals
UNIV_POS_NAMES = {
"NO_TAG": NO_TAG,
"ADJ": ADJ,
"ADV": ADV,
"ADP": ADP,
"ADV": ADV,
"AUX": AUX,
"CONJ": CONJ,
"DET": DET,
"INTJ": INTJ,
"NOUN": NOUN,
"NUM": NUM,
"PART": PART,
"PRON": PRON,
"PRT": PRT,
"PROPN": PROPN,
"PUNCT": PUNCT,
"SCONJ": SCONJ,
"SYM": SYM,
"VERB": VERB,
"X": X,
"PUNCT": PUNCT,
"SPACE": SPACE,
"EOL": EOL
"EOL": EOL,
"SPACE": SPACE
}

View File

@ -108,6 +108,11 @@ cdef class StringStore:
else:
raise TypeError(type(string_or_id))
def __iter__(self):
cdef int i
for i in range(self.size):
yield self[i]
cdef const Utf8Str* intern(self, unsigned char* chars, int length) except NULL:
# 0 means missing, but we don't bother offsetting the index.
key = hash64(chars, length * sizeof(char), 0)
@ -137,6 +142,8 @@ cdef class StringStore:
def load(self, loc):
with codecs.open(loc, 'r', 'utf8') as file_:
strings = file_.read().split(SEPARATOR)
if strings == ['']:
return None
cdef unicode string
cdef bytes byte_string
for string in strings:

View File

@ -1,4 +1,4 @@
from libc.stdint cimport uint8_t, uint32_t, int32_t
from libc.stdint cimport uint8_t, uint32_t, int32_t, uint64_t
from .typedefs cimport flags_t, attr_t, hash_t
from .parts_of_speech cimport univ_pos_t
@ -26,22 +26,6 @@ cdef struct LexemeC:
float l2_norm
cdef struct Morphology:
uint8_t number
uint8_t tenspect # Tense/aspect/voice
uint8_t mood
uint8_t gender
uint8_t person
uint8_t case
uint8_t misc
cdef struct PosTag:
Morphology morph
int id
univ_pos_t pos
cdef struct Entity:
int start
int end
@ -59,8 +43,8 @@ cdef struct Constituent:
cdef struct TokenC:
const LexemeC* lex
Morphology morph
const Constituent* ctnt
uint64_t morph
univ_pos_t pos
bint spacy
int tag

View File

@ -109,7 +109,7 @@ cdef bint label_is_gold(const GoldParseC* gold, int head, int child, int label)
cdef bint _is_gold_root(const GoldParseC* gold, int word) nogil:
return gold.labels[word] == -1 or gold.heads[word] == word
cdef class Shift:
@staticmethod
@ -267,7 +267,7 @@ cdef class Break:
return cost
else:
return cost + 1
@staticmethod
cdef inline int label_cost(StateClass s, const GoldParseC* gold, int label) nogil:
return 0
@ -279,7 +279,7 @@ cdef int _get_root(int word, const GoldParseC* gold) nogil:
return -1
else:
return word
cdef class ArcEager(TransitionSystem):
@classmethod
@ -322,8 +322,9 @@ cdef class ArcEager(TransitionSystem):
cdef Transition lookup_transition(self, object name) except *:
if '-' in name:
move_str, label_str = name.split('-', 1)
label = self.label_ids[label_str]
label = self.strings[label_str]
else:
move_str = name
label = 0
move = MOVE_NAMES.index(move_str)
for i in range(self.n_moves):

View File

@ -47,6 +47,7 @@ cdef bint _entity_is_sunk(StateClass st, Transition* golds) nogil:
else:
return False
cdef class BiluoPushDown(TransitionSystem):
@classmethod
def get_labels(cls, gold_tuples):
@ -160,7 +161,17 @@ cdef class Missing:
cdef class Begin:
@staticmethod
cdef bint is_valid(StateClass st, int label) nogil:
return label != 0 and not st.entity_is_open()
# Ensure we don't clobber preset entities. If no entity preset,
# ent_iob is 0
cdef int preset_ent_iob = st.B_(0).ent_iob
if preset_ent_iob == 1:
return False
elif preset_ent_iob == 2:
return False
elif preset_ent_iob == 3 and st.B_(0).ent_type != label:
return False
else:
return label != 0 and not st.entity_is_open()
@staticmethod
cdef int transition(StateClass st, int label) nogil:
@ -190,6 +201,14 @@ cdef class Begin:
cdef class In:
@staticmethod
cdef bint is_valid(StateClass st, int label) nogil:
cdef int preset_ent_iob = st.B_(0).ent_iob
if preset_ent_iob == 2:
return False
elif preset_ent_iob == 3:
return False
# TODO: Is this quite right?
elif st.B_(1).ent_iob != preset_ent_iob:
return False
return st.entity_is_open() and label != 0 and st.E_(0).ent_type == label
@staticmethod
@ -230,6 +249,14 @@ cdef class In:
cdef class Last:
@staticmethod
cdef bint is_valid(StateClass st, int label) nogil:
cdef int preset_ent_iob = st.B_(0).ent_iob
if preset_ent_iob == 2:
return False
elif preset_ent_iob == 3:
return False
elif st.B_(1).ent_iob == 1:
return False
return st.entity_is_open() and label != 0 and st.E_(0).ent_type == label
@staticmethod
@ -269,6 +296,15 @@ cdef class Last:
cdef class Unit:
@staticmethod
cdef bint is_valid(StateClass st, int label) nogil:
cdef int preset_ent_iob = st.B_(0).ent_iob
if preset_ent_iob == 2:
return False
elif preset_ent_iob == 1:
return False
elif preset_ent_iob == 3 and st.B_(0).ent_type != label:
return False
elif st.B_(1).ent_iob == 1:
return False
return label != 0 and not st.entity_is_open()
@staticmethod
@ -300,6 +336,11 @@ cdef class Unit:
cdef class Out:
@staticmethod
cdef bint is_valid(StateClass st, int label) nogil:
cdef int preset_ent_iob = st.B_(0).ent_iob
if preset_ent_iob == 3:
return False
elif preset_ent_iob == 1:
return False
return not st.entity_is_open()
@staticmethod

View File

@ -11,8 +11,8 @@ from .stateclass cimport StateClass
cdef class Parser:
cdef readonly object cfg
cdef readonly Model model
cdef readonly TransitionSystem moves
cdef void parse(self, StateClass stcls, ExampleC eg) nogil
cdef void predict(self, StateClass stcls, ExampleC* eg) nogil

View File

@ -67,16 +67,22 @@ def ParserFactory(transition_system):
cdef class Parser:
def __init__(self, StringStore strings, model_dir, transition_system):
def __init__(self, StringStore strings, transition_system, model):
self.moves = transition_system
self.model = model
@classmethod
def from_dir(cls, model_dir, strings, transition_system):
if not os.path.exists(model_dir):
print >> sys.stderr, "Warning: No model found at", model_dir
elif not os.path.isdir(model_dir):
print >> sys.stderr, "Warning: model path:", model_dir, "is not a directory"
else:
self.cfg = Config.read(model_dir, 'config')
self.moves = transition_system(strings, self.cfg.labels)
templates = get_templates(self.cfg.features)
self.model = Model(self.moves.n_moves, templates, model_dir)
cfg = Config.read(model_dir, 'config')
moves = transition_system(strings, cfg.labels)
templates = get_templates(cfg.features)
model = Model(moves.n_moves, templates, model_dir)
return cls(strings, moves, model)
def __call__(self, Doc tokens):
cdef StateClass stcls = StateClass.init(tokens.data, tokens.length)
@ -84,17 +90,21 @@ cdef class Parser:
cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE,
self.model.n_feats, self.model.n_feats)
with nogil:
self.parse(stcls, eg.c)
self.parse(stcls, eg.c)
tokens.set_parse(stcls._sent)
cdef void predict(self, StateClass stcls, ExampleC* eg) nogil:
memset(eg.scores, 0, eg.nr_class * sizeof(weight_t))
self.moves.set_valid(eg.is_valid, stcls)
fill_context(eg.atoms, stcls)
self.model.set_scores(eg.scores, eg.atoms)
eg.guess = arg_max_if_true(eg.scores, eg.is_valid, self.model.n_classes)
cdef void parse(self, StateClass stcls, ExampleC eg) nogil:
while not stcls.is_final():
memset(eg.scores, 0, eg.nr_class * sizeof(weight_t))
self.moves.set_valid(eg.is_valid, stcls)
fill_context(eg.atoms, stcls)
self.model.set_scores(eg.scores, eg.atoms)
eg.guess = arg_max_if_true(eg.scores, eg.is_valid, self.model.n_classes)
self.predict(stcls, &eg)
if not eg.is_valid[eg.guess]:
break
self.moves.c[eg.guess].do(stcls, self.moves.c[eg.guess].label)
self.moves.finalize_state(stcls)
@ -109,15 +119,93 @@ cdef class Parser:
cdef Transition G
while not stcls.is_final():
memset(eg.c.scores, 0, eg.c.nr_class * sizeof(weight_t))
self.moves.set_costs(eg.c.is_valid, eg.c.costs, stcls, gold)
fill_context(eg.c.atoms, stcls)
self.model.train(eg)
G = self.moves.c[eg.c.guess]
self.moves.c[eg.c.guess].do(stcls, self.moves.c[eg.c.guess].label)
loss += eg.c.loss
return loss
def step_through(self, Doc doc):
return StepwiseState(self, doc)
cdef class StepwiseState:
cdef readonly StateClass stcls
cdef readonly Example eg
cdef readonly Doc doc
cdef readonly Parser parser
def __init__(self, Parser parser, Doc doc):
self.parser = parser
self.doc = doc
self.stcls = StateClass.init(doc.data, doc.length)
self.parser.moves.initialize_state(self.stcls)
self.eg = Example(self.parser.model.n_classes, CONTEXT_SIZE,
self.parser.model.n_feats, self.parser.model.n_feats)
def __enter__(self):
return self
def __exit__(self, type, value, traceback):
self.finish()
@property
def is_final(self):
return self.stcls.is_final()
@property
def stack(self):
return self.stcls.stack
@property
def queue(self):
return self.stcls.queue
@property
def heads(self):
return [self.stcls.H(i) for i in range(self.stcls.length)]
@property
def deps(self):
return [self.doc.vocab.strings[self.stcls._sent[i].dep]
for i in range(self.stcls.length)]
def predict(self):
self.parser.predict(self.stcls, &self.eg.c)
action = self.parser.moves.c[self.eg.c.guess]
return self.parser.moves.move_name(action.move, action.label)
def transition(self, action_name):
moves = {'S': 0, 'D': 1, 'L': 2, 'R': 3}
if action_name == '_':
action_name = self.predict()
action = self.parser.moves.lookup_transition(action_name)
elif action_name == 'L' or action_name == 'R':
self.predict()
move = moves[action_name]
clas = _arg_max_clas(self.eg.c.scores, move, self.parser.moves.c,
self.eg.c.nr_class)
action = self.parser.moves.c[clas]
else:
action = self.parser.moves.lookup_transition(action_name)
action.do(self.stcls, action.label)
def finish(self):
if self.stcls.is_final():
self.parser.moves.finalize_state(self.stcls)
self.doc.set_parse(self.stcls._sent)
cdef int _arg_max_clas(const weight_t* scores, int move, const Transition* actions,
int nr_class) except -1:
cdef weight_t score = 0
cdef int mode = -1
cdef int i
for i in range(nr_class):
if actions[i].move == move and (mode == -1 or scores[i] >= score):
mode = i
score = scores[i]
return mode
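A minimal usage sketch for step_through()/StepwiseState above, assuming a loaded English pipeline that exposes its dependency parser as nlp.parser and accepts a parse=False keyword; the sentence is illustrative.
# Usage sketch for step_through()/StepwiseState; pipeline names are assumptions.
from spacy.en import English

nlp = English()
doc = nlp(u'This is a sentence.', parse=False)
with nlp.parser.step_through(doc) as state:
    while not state.is_final:
        action = state.predict()   # e.g. u'S' or u'L-nsubj'
        state.transition(action)   # or pass u'_' to apply the predicted move
print(state.heads)                 # absolute head index per token
print(state.deps)                  # dependency label strings per token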

View File

@ -71,7 +71,6 @@ cdef class StateClass:
return -1
return self._sent[i].head + i
cdef int E(self, int i) nogil
cdef int R(self, int i, int idx) nogil
@ -125,7 +124,7 @@ cdef class StateClass:
cdef void add_arc(self, int head, int child, int label) nogil
cdef void del_arc(self, int head, int child) nogil
cdef void open_ent(self, int label) nogil
cdef void close_ent(self) nogil

View File

@ -36,6 +36,14 @@ cdef class StateClass:
self._buffer[i] = i
self._empty_token.lex = &EMPTY_LEXEME
@property
def stack(self):
return {self.S(i) for i in range(self._s_i)}
@property
def queue(self):
return {self.B(i) for i in range(self._b_i)}
cdef int E(self, int i) nogil:
if self._e_i <= 0 or self._e_i >= self.length:
return 0

View File

@ -47,6 +47,6 @@ cdef class TransitionSystem:
cdef Transition init_transition(self, int clas, int move, int label) except *
cdef int set_valid(self, int* output, StateClass state) nogil
cdef int set_costs(self, int* is_valid, int* costs,
StateClass state, GoldParse gold) except -1

View File

@ -54,6 +54,10 @@ cdef class TransitionSystem:
cdef Transition init_transition(self, int clas, int move, int label) except *:
raise NotImplementedError
def is_valid(self, StateClass stcls, move_name):
action = self.lookup_transition(move_name)
return action.is_valid(stcls, action.label)
cdef int set_valid(self, int* is_valid, StateClass stcls) nogil:
cdef int i
for i in range(self.n_moves):

12
spacy/tagger.pxd Normal file
View File

@ -0,0 +1,12 @@
from ._ml cimport Model
from .structs cimport TokenC
from .vocab cimport Vocab
cdef class Tagger:
cdef readonly Vocab vocab
cdef readonly Model model
cdef public dict freqs
cdef int predict(self, int i, const TokenC* tokens) except -1
cdef int update(self, int i, const TokenC* tokens, int gold) except -1

220
spacy/tagger.pyx Normal file
View File

@ -0,0 +1,220 @@
import json
from os import path
from collections import defaultdict
from thinc.typedefs cimport atom_t, weight_t
from .typedefs cimport attr_t
from .tokens.doc cimport Doc
from .attrs cimport TAG
from .parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON
from .parts_of_speech cimport VERB, X, PUNCT, EOL, SPACE
from .attrs cimport *
from ._ml cimport arg_max
cpdef enum:
P2_orth
P2_cluster
P2_shape
P2_prefix
P2_suffix
P2_pos
P2_lemma
P2_flags
P1_orth
P1_cluster
P1_shape
P1_prefix
P1_suffix
P1_pos
P1_lemma
P1_flags
W_orth
W_cluster
W_shape
W_prefix
W_suffix
W_pos
W_lemma
W_flags
N1_orth
N1_cluster
N1_shape
N1_prefix
N1_suffix
N1_pos
N1_lemma
N1_flags
N2_orth
N2_cluster
N2_shape
N2_prefix
N2_suffix
N2_pos
N2_lemma
N2_flags
N_CONTEXT_FIELDS
cdef class Tagger:
"""A part-of-speech tagger for English"""
@classmethod
def read_config(cls, data_dir):
return json.load(open(path.join(data_dir, 'pos', 'config.json')))
@classmethod
def default_templates(cls):
return (
(W_orth,),
(P1_lemma, P1_pos),
(P2_lemma, P2_pos),
(N1_orth,),
(N2_orth,),
(W_suffix,),
(W_prefix,),
(P1_pos,),
(P2_pos,),
(P1_pos, P2_pos),
(P1_pos, W_orth),
(P1_suffix,),
(N1_suffix,),
(W_shape,),
(W_cluster,),
(N1_cluster,),
(N2_cluster,),
(P1_cluster,),
(P2_cluster,),
(W_flags,),
(N1_flags,),
(N2_flags,),
(P1_flags,),
(P2_flags,),
)
@classmethod
def blank(cls, vocab, templates):
model = Model(vocab.morphology.n_tags, templates, model_loc=None)
return cls(vocab, model)
@classmethod
def from_dir(cls, data_dir, vocab):
if path.exists(path.join(data_dir, 'templates.json')):
templates = json.load(open(path.join(data_dir, 'templates.json')))
else:
templates = cls.default_templates()
model = Model(vocab.morphology.n_tags, templates, data_dir)
return cls(vocab, model)
def __init__(self, Vocab vocab, model):
self.vocab = vocab
self.model = model
# TODO: Move this to tag map
self.freqs = {TAG: defaultdict(int)}
for tag in self.tag_names:
self.freqs[TAG][self.vocab.strings[tag]] = 1
self.freqs[TAG][0] = 1
@property
def tag_names(self):
return self.vocab.morphology.tag_names
def __call__(self, Doc tokens):
"""Apply the tagger, setting the POS tags onto the Doc object.
Args:
tokens (Doc): The tokens to be tagged.
"""
if tokens.length == 0:
return 0
cdef int i
cdef const weight_t* scores
for i in range(tokens.length):
if tokens.data[i].pos == 0:
guess = self.predict(i, tokens.data)
self.vocab.morphology.assign_tag(&tokens.data[i], guess)
tokens.is_tagged = True
tokens._py_tokens = [None] * tokens.length
def tag_from_strings(self, Doc tokens, object tag_strs):
cdef int i
for i in range(tokens.length):
self.vocab.morphology.assign_tag(&tokens.data[i], tag_strs[i])
tokens.is_tagged = True
tokens._py_tokens = [None] * tokens.length
def train(self, Doc tokens, object gold_tag_strs):
assert len(tokens) == len(gold_tag_strs)
cdef int i
cdef int loss
cdef const weight_t* scores
try:
golds = [self.tag_names.index(g) if g is not None else -1 for g in gold_tag_strs]
except ValueError:
raise ValueError(
[g for g in gold_tag_strs if g is not None and g not in self.tag_names])
correct = 0
for i in range(tokens.length):
guess = self.update(i, tokens.data, golds[i])
loss = golds[i] != -1 and guess != golds[i]
self.vocab.morphology.assign_tag(&tokens.data[i], guess)
correct += loss == 0
self.freqs[TAG][tokens.data[i].tag] += 1
return correct
cdef int predict(self, int i, const TokenC* tokens) except -1:
cdef atom_t[N_CONTEXT_FIELDS] context
_fill_from_token(&context[P2_orth], &tokens[i-2])
_fill_from_token(&context[P1_orth], &tokens[i-1])
_fill_from_token(&context[W_orth], &tokens[i])
_fill_from_token(&context[N1_orth], &tokens[i+1])
_fill_from_token(&context[N2_orth], &tokens[i+2])
scores = self.model.score(context)
return arg_max(scores, self.model.n_classes)
cdef int update(self, int i, const TokenC* tokens, int gold) except -1:
cdef atom_t[N_CONTEXT_FIELDS] context
_fill_from_token(&context[P2_orth], &tokens[i-2])
_fill_from_token(&context[P1_orth], &tokens[i-1])
_fill_from_token(&context[W_orth], &tokens[i])
_fill_from_token(&context[N1_orth], &tokens[i+1])
_fill_from_token(&context[N2_orth], &tokens[i+2])
scores = self.model.score(context)
guess = arg_max(scores, self.model.n_classes)
loss = guess != gold if gold != -1 else 0
self.model.update(context, guess, gold, loss)
return guess
cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
context[0] = t.lex.lower
context[1] = t.lex.cluster
context[2] = t.lex.shape
context[3] = t.lex.prefix
context[4] = t.lex.suffix
context[5] = t.tag
context[6] = t.lemma
if t.lex.flags & (1 << IS_ALPHA):
context[7] = 1
elif t.lex.flags & (1 << IS_PUNCT):
context[7] = 2
elif t.lex.flags & (1 << LIKE_URL):
context[7] = 3
elif t.lex.flags & (1 << LIKE_NUM):
context[7] = 4
else:
context[7] = 0
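A minimal usage sketch for the Tagger above: build a blank model with the default templates and train it from gold tag strings. The vocab and training_data names are assumptions standing in for a prepared Vocab and an iterable of (Doc, tag list) pairs.
# Usage sketch for the Tagger above; `vocab` and `training_data` are assumed
# to exist (a Vocab built with a tag_map, and (Doc, gold tag list) pairs).
from spacy.tagger import Tagger

tagger = Tagger.blank(vocab, Tagger.default_templates())
for doc, gold_tags in training_data:
    n_correct = tagger.train(doc, gold_tags)   # updates the model in place
tagger(doc)                                    # predict tags onto a Doc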

View File

@ -4,15 +4,10 @@ from preshed.maps cimport PreshMap
from cymem.cymem cimport Pool
from .typedefs cimport hash_t
from .structs cimport LexemeC, TokenC, Morphology
from .structs cimport LexemeC, TokenC
from .strings cimport StringStore
from .tokens.doc cimport Doc
from .vocab cimport Vocab, _Cached
cdef union LexemesOrTokens:
const LexemeC* const* lexemes
TokenC* tokens
from .vocab cimport Vocab, LexemesOrTokens, _Cached
cdef class Tokenizer:

View File

@ -11,7 +11,6 @@ from cpython cimport Py_UNICODE_ISSPACE
from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
from .morphology cimport set_morph_from_dict
from .strings cimport hash_string
cimport cython
@ -29,7 +28,7 @@ cdef class Tokenizer:
self._suffix_re = suffix_re
self._infix_re = infix_re
self.vocab = vocab
self._load_special_tokenization(rules, self.vocab.pos_tags)
self._load_special_tokenization(rules)
@classmethod
def from_dir(cls, Vocab vocab, data_dir):
@ -193,9 +192,7 @@ cdef class Tokenizer:
tokens.push_back(prefixes[0][i], False)
if string:
cache_hit = self._try_cache(hash_string(string), tokens)
if cache_hit:
pass
else:
if not cache_hit:
match = self.find_infix(string)
if match is None:
tokens.push_back(self.vocab.get(tokens.mem, string), False)
@ -242,7 +239,7 @@ cdef class Tokenizer:
match = self._suffix_re.search(string)
return (match.end() - match.start()) if match is not None else 0
def _load_special_tokenization(self, object rules, object tag_map):
def _load_special_tokenization(self, special_cases):
'''Add a special-case tokenization rule.
'''
cdef int i
@ -253,29 +250,11 @@ cdef class Tokenizer:
cdef dict props
cdef LexemeC** lexemes
cdef hash_t hashed
for chunk, substrings in sorted(rules.items()):
tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
for i, props in enumerate(substrings):
form = props['F']
lemma = props.get("L", None)
tokens[i].lex = <LexemeC*>self.vocab.get(self.vocab.mem, form)
if lemma is not None:
tokens[i].lemma = self.vocab.strings[lemma]
else:
tokens[i].lemma = 0
if 'pos' in props:
tokens[i].tag = self.vocab.strings[props['pos']]
tokens[i].pos = tag_map[props['pos']][0]
# These are defaults, which can be over-ridden by the
# token-specific props.
set_morph_from_dict(&tokens[i].morph, tag_map[props['pos']][1])
if tokens[i].lemma == 0:
tokens[i].lemma = tokens[i].lex.orth
set_morph_from_dict(&tokens[i].morph, props)
for chunk, substrings in sorted(special_cases.items()):
cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
cached.length = len(substrings)
cached.is_lex = False
cached.data.tokens = tokens
hashed = hash_string(chunk)
self._specials.set(hashed, cached)
self._cache.set(hashed, cached)
cached.data.tokens = self.vocab.make_fused_token(substrings)
key = hash_string(chunk)
self._specials.set(key, cached)
self._cache.set(key, cached)

View File

@ -4,14 +4,19 @@ from preshed.counter cimport PreshCounter
from ..vocab cimport Vocab
from ..structs cimport TokenC, LexemeC
from ..typedefs cimport attr_t
from ..attrs cimport attr_id_t
cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil
ctypedef const LexemeC* const_Lexeme_ptr
ctypedef TokenC* TokenC_ptr
ctypedef const TokenC* const_TokenC_ptr
ctypedef fused LexemeOrToken:
const_Lexeme_ptr
TokenC_ptr
const_TokenC_ptr
cdef class Doc:

View File

@ -5,16 +5,17 @@ from libc.stdint cimport uint32_t
import numpy
import struct
from ..lexeme cimport Lexeme
from ..lexeme cimport EMPTY_LEXEME
from ..typedefs cimport attr_t, flags_t
from ..attrs cimport attr_id_t
from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
from ..parts_of_speech import UNIV_POS_NAMES
from ..parts_of_speech cimport CONJ, PUNCT
from ..lexeme cimport check_flag
from ..lexeme cimport get_attr as get_lex_attr
from .spans import Span
from ..parts_of_speech cimport CONJ, PUNCT, NOUN
from ..parts_of_speech cimport univ_pos_t
from ..lexeme cimport Lexeme
from .spans cimport Span
from .token cimport Token
from ..serialize.bits cimport BitArray
@ -47,7 +48,7 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil:
elif feat_name == ENT_TYPE:
return token.ent_type
else:
return get_lex_attr(token.lex, feat_name)
return Lexeme.get_struct_attr(token.lex, feat_name)
cdef class Doc:
@ -119,40 +120,79 @@ cdef class Doc:
def string(self):
return u''.join([t.string for t in self])
@property
def ents(self):
"""Yields named-entity Span objects.
property ents:
def __get__(self):
"""Yields named-entity Span objects.
Iterate over the span to get individual Token objects, or access the label:
Iterate over the span to get individual Token objects, or access the label:
>>> from spacy.en import English
>>> nlp = English()
>>> tokens = nlp(u'Mr. Best flew to New York on Saturday morning.')
>>> ents = list(tokens.ents)
>>> ents[0].label, ents[0].label_, ''.join(t.orth_ for t in ents[0])
(112504, u'PERSON', u'Best ')
"""
cdef int i
cdef const TokenC* token
cdef int start = -1
cdef int label = 0
>>> from spacy.en import English
>>> nlp = English()
>>> tokens = nlp(u'Mr. Best flew to New York on Saturday morning.')
>>> ents = list(tokens.ents)
>>> ents[0].label, ents[0].label_, ''.join(t.orth_ for t in ents[0])
(112504, u'PERSON', u'Best ')
"""
cdef int i
cdef const TokenC* token
cdef int start = -1
cdef int label = 0
output = []
for i in range(self.length):
token = &self.data[i]
if token.ent_iob == 1:
assert start != -1
elif token.ent_iob == 2 or token.ent_iob == 0:
if start != -1:
output.append(Span(self, start, i, label=label))
start = -1
label = 0
elif token.ent_iob == 3:
if start != -1:
output.append(Span(self, start, i, label=label))
start = i
label = token.ent_type
if start != -1:
output.append(Span(self, start, self.length, label=label))
return tuple(output)
def __set__(self, ents):
# TODO:
# 1. Allow negative matches
# 2. Ensure pre-set NERs are not over-written during statistical prediction
# 3. Test basic data-driven ORTH gazetteer
# 4. Test more nuanced date and currency regex
cdef int i
for i in range(self.length):
self.data[i].ent_type = 0
self.data[i].ent_iob = 0
cdef attr_t ent_type
cdef int start, end
for ent_type, start, end in ents:
if ent_type is None or ent_type < 0:
# Mark as O
for i in range(start, end):
self.data[i].ent_type = 0
self.data[i].ent_iob = 2
else:
# Mark (inside) as I
for i in range(start, end):
self.data[i].ent_type = ent_type
self.data[i].ent_iob = 1
# Set start as B
self.data[start].ent_iob = 3
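A minimal usage sketch for the ents setter above, reusing the sentence from the getter's docstring; nlp is assumed to be a loaded English pipeline.
# Usage sketch for the ents setter: pre-set tokens 0..1 as a PERSON entity.
doc = nlp(u'Mr. Best flew to New York on Saturday morning.')
person = doc.vocab.strings[u'PERSON']
doc.ents = [(person, 0, 2)]        # (label id, start token, end token)
ents = list(doc.ents)
print(ents[0].label_, [t.orth_ for t in ents[0]])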
@property
def noun_chunks(self):
"""Yield spans for base noun phrases."""
cdef const TokenC* word
labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', 'attr']
np_deps = [self.vocab.strings[label] for label in labels]
np_label = self.vocab.strings['NP']
for i in range(self.length):
token = &self.data[i]
if token.ent_iob == 1:
assert start != -1
pass
elif token.ent_iob == 2:
if start != -1:
yield Span(self, start, i, label=label)
start = -1
label = 0
elif token.ent_iob == 3:
if start != -1:
yield Span(self, start, i, label=label)
start = i
label = token.ent_type
if start != -1:
yield Span(self, start, self.length, label=label)
word = &self.data[i]
if word.pos == NOUN and word.dep in np_deps:
yield Span(self, word.l_edge, i+1, label=np_label)
@property
def sents(self):
@ -171,7 +211,7 @@ cdef class Doc:
if self.length == self.max_length:
self._realloc(self.length * 2)
cdef TokenC* t = &self.data[self.length]
if LexemeOrToken is TokenC_ptr:
if LexemeOrToken is const_TokenC_ptr:
t[0] = lex_or_tok[0]
else:
t.lex = lex_or_tok
@ -179,6 +219,7 @@ cdef class Doc:
t.idx = 0
else:
t.idx = (t-1).idx + (t-1).lex.length + (t-1).spacy
assert t.lex.orth != 0
t.spacy = has_space
self.length += 1
self._py_tokens.append(None)
@ -288,6 +329,9 @@ cdef class Doc:
elif attr_id == TAG:
for i in range(length):
tokens[i].tag = values[i]
elif attr_id == POS:
for i in range(length):
tokens[i].pos = <univ_pos_t>values[i]
elif attr_id == DEP:
for i in range(length):
tokens[i].dep = values[i]
@ -297,20 +341,7 @@ cdef class Doc:
elif attr_id == ENT_TYPE:
for i in range(length):
tokens[i].ent_type = values[i]
cdef TokenC* head
cdef TokenC* child
# Set left edges
for i in range(length):
child = &tokens[i]
head = &tokens[i + child.head]
if child < head and child.l_edge < head.l_edge:
head.l_edge = child.l_edge
# Set right edges --- same as above, but iterate in reverse
for i in range(length-1, -1, -1):
child = &tokens[i]
head = &tokens[i + child.head]
if child > head and child.r_edge > head.r_edge:
head.r_edge = child.r_edge
set_children_from_heads(self.data, self.length)
return self
def to_bytes(self):
@ -354,14 +385,18 @@ cdef class Doc:
break
else:
return None
cdef unicode string = self.string
cdef Span span = self[start:end]
# Get LexemeC for newly merged token
new_orth = string[start_idx:end_idx]
new_orth = ''.join([t.string for t in span])
if span[-1].whitespace_:
new_orth = new_orth[:-1]
cdef const LexemeC* lex = self.vocab.get(self.mem, new_orth)
# House the new merged token where it starts
cdef TokenC* token = &self.data[start]
# Update fields
token.lex = lex
token.spacy = self.data[end].spacy
# What to do about morphology??
# TODO: token.morph = ???
token.tag = self.vocab.strings[tag]
@ -372,30 +407,16 @@ cdef class Doc:
else:
token.ent_iob = 3
token.ent_type = self.vocab.strings[ent_type]
# Fix dependencies
# Begin by setting all the head indices to absolute token positions
# This is easier to work with for now than the offsets
# Before thinking of something simpler, beware the case where a dependency
# bridges over the entity. Here the alignment of the tokens changes.
span_root = span.root.i
token.dep = span.root.dep
for i in range(self.length):
self.data[i].head += i
# Find the head of the merged token, and its dep relation
outer_heads = {}
for i in range(start, end):
head_idx = self.data[i].head
if head_idx == i or head_idx < start or head_idx >= end:
# Don't consider "heads" which are actually dominated by a word
# in the region we're merging
gp = head_idx
while self.data[gp].head != gp:
if start <= gp < end:
break
gp = self.data[gp].head
else:
# If we have multiple words attaching to the same head,
# but with different dep labels, we're preferring the last
# occurring dep label. Shrug. What else could we do, I guess?
outer_heads[head_idx] = self.data[i].dep
token.head, token.dep = max(outer_heads.items())
# Set the head of the merged token, and its dep relation, from the Span
token.head = self.data[span_root].head
# Adjust deps before shrinking tokens
# Tokens which point into the merged token should now point to it
# Subtract the offset from all tokens which point to >= end
@ -406,7 +427,6 @@ cdef class Doc:
self.data[i].head = start
elif head_idx >= end:
self.data[i].head -= offset
# TODO: Fix left and right deps
# Now compress the token array
for i in range(end, self.length):
self.data[i - offset] = self.data[i]
@ -417,6 +437,28 @@ cdef class Doc:
for i in range(self.length):
# ...And, set heads back to a relative position
self.data[i].head -= i
# Set the left/right children, left/right edges
set_children_from_heads(self.data, self.length)
# Clear the cached Python objects
self._py_tokens = [None] * self.length
# Return the merged Python object
return self[start]
cdef int set_children_from_heads(TokenC* tokens, int length) except -1:
cdef TokenC* head
cdef TokenC* child
cdef int i
# Set left edges
for i in range(length):
child = &tokens[i]
head = &tokens[i + child.head]
if child < head and child.l_edge < head.l_edge:
head.l_edge = child.l_edge
# Set right edges --- same as above, but iterate in reverse
for i in range(length-1, -1, -1):
child = &tokens[i]
head = &tokens[i + child.head]
if child > head and child.r_edge > head.r_edge:
head.r_edge = child.r_edge

View File

@ -1,7 +1,7 @@
from __future__ import unicode_literals
from collections import defaultdict
from ..structs cimport Morphology, TokenC, LexemeC
from ..structs cimport TokenC, LexemeC
from ..typedefs cimport flags_t, attr_t
from ..attrs cimport attr_id_t
from ..parts_of_speech cimport univ_pos_t
@ -40,12 +40,18 @@ cdef class Span:
return self.end - self.start
def __getitem__(self, int i):
return self._seq[self.start + i]
if i < 0:
return self._seq[self.end + i]
else:
return self._seq[self.start + i]
def __iter__(self):
for i in range(self.start, self.end):
yield self._seq[i]
def merge(self, unicode tag, unicode lemma, unicode ent_type):
self._seq.merge(self[0].idx, self[-1].idx + len(self[-1]), tag, lemma, ent_type)
property root:
"""The first ancestor of the first word of the span that has its head
outside the span.

View File

@ -1,6 +1,5 @@
from libc.string cimport memcpy
from cpython.mem cimport PyMem_Malloc, PyMem_Free
from ..lexeme cimport check_flag
# Compiler crashes on memory view coercion without this. Should report bug.
from cython.view cimport array as cvarray
cimport numpy as np
@ -9,6 +8,7 @@ np.import_array()
import numpy
from ..lexeme cimport Lexeme
from ..parts_of_speech import UNIV_POS_NAMES
from ..attrs cimport LEMMA
@ -20,6 +20,8 @@ from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
from ..attrs cimport IS_OOV
from ..lexeme cimport Lexeme
cdef class Token:
"""An individual token --- i.e. a word, a punctuation symbol, etc. Created
@ -42,7 +44,7 @@ cdef class Token:
return self.string
cpdef bint check_flag(self, attr_id_t flag_id) except -1:
return check_flag(self.c.lex, flag_id)
return Lexeme.check_flag(self.c.lex, flag_id)
def nbor(self, int i=1):
return self.doc[self.i+i]
@ -142,7 +144,7 @@ cdef class Token:
"""The leftward immediate children of the word, in the syntactic
dependency parse.
"""
cdef const TokenC* ptr = self.c - self.i
cdef const TokenC* ptr = self.c - (self.i - self.c.l_edge)
while ptr < self.c:
# If this head is still to the right of us, we can skip to it
# No token that's between this token and this head could be our
@ -160,7 +162,7 @@ cdef class Token:
def __get__(self):
"""The rightward immediate children of the word, in the syntactic
dependency parse."""
cdef const TokenC* ptr = (self.c - self.i) + (self.array_len - 1)
cdef const TokenC* ptr = self.c + (self.c.r_edge - self.i)
tokens = []
while ptr > self.c:
# If this head is still to the right of us, we can skip to it
@ -193,7 +195,7 @@ cdef class Token:
property left_edge:
def __get__(self):
return self.doc[self.c.l_edge]
property right_edge:
def __get__(self):
return self.doc[self.c.r_edge]
@ -202,7 +204,7 @@ cdef class Token:
def __get__(self):
"""The token predicted by the parser to be the head of the current token."""
return self.doc[self.i + self.c.head]
property conjuncts:
def __get__(self):
"""Get a list of conjoined words"""
@ -286,37 +288,37 @@ cdef class Token:
return self.vocab.strings[self.c.dep]
property is_oov:
def __get__(self): return check_flag(self.c.lex, IS_OOV)
def __get__(self): return Lexeme.check_flag(self.c.lex, IS_OOV)
property is_alpha:
def __get__(self): return check_flag(self.c.lex, IS_ALPHA)
def __get__(self): return Lexeme.check_flag(self.c.lex, IS_ALPHA)
property is_ascii:
def __get__(self): return check_flag(self.c.lex, IS_ASCII)
def __get__(self): return Lexeme.check_flag(self.c.lex, IS_ASCII)
property is_digit:
def __get__(self): return check_flag(self.c.lex, IS_DIGIT)
def __get__(self): return Lexeme.check_flag(self.c.lex, IS_DIGIT)
property is_lower:
def __get__(self): return check_flag(self.c.lex, IS_LOWER)
def __get__(self): return Lexeme.check_flag(self.c.lex, IS_LOWER)
property is_title:
def __get__(self): return check_flag(self.c.lex, IS_TITLE)
def __get__(self): return Lexeme.check_flag(self.c.lex, IS_TITLE)
property is_punct:
def __get__(self): return check_flag(self.c.lex, IS_PUNCT)
def __get__(self): return Lexeme.check_flag(self.c.lex, IS_PUNCT)
property is_space:
def __get__(self): return check_flag(self.c.lex, IS_SPACE)
def __get__(self): return Lexeme.check_flag(self.c.lex, IS_SPACE)
property like_url:
def __get__(self): return check_flag(self.c.lex, LIKE_URL)
def __get__(self): return Lexeme.check_flag(self.c.lex, LIKE_URL)
property like_num:
def __get__(self): return check_flag(self.c.lex, LIKE_NUM)
def __get__(self): return Lexeme.check_flag(self.c.lex, LIKE_NUM)
property like_email:
def __get__(self): return check_flag(self.c.lex, LIKE_EMAIL)
def __get__(self): return Lexeme.check_flag(self.c.lex, LIKE_EMAIL)
_pos_id_to_string = {id_: string for string, id_ in UNIV_POS_NAMES.items()}

View File

@ -7,6 +7,7 @@ from murmurhash.mrmr cimport hash64
from .structs cimport LexemeC, TokenC
from .typedefs cimport utf8_t, attr_t, hash_t
from .strings cimport StringStore
from .morphology cimport Morphology
cdef LexemeC EMPTY_LEXEME
@ -14,7 +15,7 @@ cdef LexemeC EMPTY_LEXEME
cdef union LexemesOrTokens:
const LexemeC* const* lexemes
TokenC* tokens
const TokenC* tokens
cdef struct _Cached:
@ -27,16 +28,20 @@ cdef class Vocab:
cpdef public lexeme_props_getter
cdef Pool mem
cpdef readonly StringStore strings
cdef readonly object pos_tags
cpdef readonly Morphology morphology
cdef readonly int length
cdef public object _serializer
cdef public object data_dir
cdef public float oov_prob
cdef public object get_lex_attr
cdef public object pos_tags
cdef const LexemeC* get(self, Pool mem, unicode string) except NULL
cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL
cdef const TokenC* make_fused_token(self, substrings) except NULL
cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL
cdef PreshMap _by_hash
cdef PreshMap _by_orth

View File

@ -12,16 +12,17 @@ import math
import json
from .lexeme cimport EMPTY_LEXEME
from .lexeme cimport set_lex_struct_props
from .lexeme cimport Lexeme
from .strings cimport hash_string
from .orth cimport word_shape
from .typedefs cimport attr_t
from .cfile cimport CFile
from .lemmatizer import Lemmatizer
from cymem.cymem cimport Address
from . import util
from .serialize.packer cimport Packer
from .attrs cimport PROB
DEF MAX_VEC_SIZE = 100000
@ -36,34 +37,33 @@ EMPTY_LEXEME.repvec = EMPTY_VEC
cdef class Vocab:
'''A map container for a language's LexemeC structs.
'''
def __init__(self, data_dir=None, get_lex_props=None, load_vectors=True,
pos_tags=None, oov_prob=-30):
if oov_prob is None:
oov_prob = -30
def __init__(self, get_lex_attr=None, tag_map=None, vectors=None):
if tag_map is None:
tag_map = {}
self.mem = Pool()
self._by_hash = PreshMap()
self._by_orth = PreshMap()
self.strings = StringStore()
self.pos_tags = pos_tags if pos_tags is not None else {}
self.lexeme_props_getter = get_lex_props
self.repvec_length = 0
self.length = 0
self._add_lex_to_vocab(0, &EMPTY_LEXEME)
if data_dir is not None:
if not path.exists(data_dir):
raise IOError("Directory %s not found -- cannot load Vocab." % data_dir)
if data_dir is not None:
if not path.isdir(data_dir):
raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir)
self.load_lexemes(path.join(data_dir, 'strings.txt'),
path.join(data_dir, 'lexemes.bin'))
if load_vectors and path.exists(path.join(data_dir, 'vec.bin')):
self.repvec_length = self.load_rep_vectors(path.join(data_dir, 'vec.bin'))
self.get_lex_attr = get_lex_attr
self.morphology = Morphology(self.strings, tag_map, Lemmatizer({}, {}, {}))
self.length = 1
self._serializer = None
self.data_dir = data_dir
self.oov_prob = oov_prob
@classmethod
def from_dir(cls, data_dir, get_lex_attr=None, vectors=None):
if not path.exists(data_dir):
raise IOError("Directory %s not found -- cannot load Vocab." % data_dir)
if not path.isdir(data_dir):
raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir)
tag_map = json.load(open(path.join(data_dir, 'tag_map.json')))
cdef Vocab self = cls(get_lex_attr=get_lex_attr, vectors=vectors, tag_map=tag_map)
self.load_lexemes(path.join(data_dir, 'strings.txt'), path.join(data_dir, 'lexemes.bin'))
if vectors is None and path.exists(path.join(data_dir, 'vec.bin')):
self.repvec_length = self.load_rep_vectors(path.join(data_dir, 'vec.bin'))
return self
property serializer:
def __get__(self):
@ -89,20 +89,12 @@ cdef class Vocab:
cdef LexemeC* lex
cdef hash_t key = hash_string(string)
lex = <LexemeC*>self._by_hash.get(key)
cdef size_t addr
if lex != NULL:
assert lex.orth == self.strings[string]
return lex
cdef bint is_oov = mem is not self.mem
if len(string) < 3:
mem = self.mem
lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
props = self.lexeme_props_getter(string, self.oov_prob, is_oov=is_oov)
set_lex_struct_props(lex, props, self.strings, EMPTY_VEC)
if is_oov:
lex.id = 0
else:
self._add_lex_to_vocab(key, lex)
assert lex != NULL, string
return lex
return self._new_lexeme(mem, string)
cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL:
'''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
@ -114,18 +106,34 @@ cdef class Vocab:
lex = <LexemeC*>self._by_orth.get(orth)
if lex != NULL:
return lex
cdef unicode string = self.strings[orth]
else:
return self._new_lexeme(mem, self.strings[orth])
cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL:
cdef hash_t key
cdef bint is_oov = mem is not self.mem
mem = self.mem
if len(string) < 3:
mem = self.mem
lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
props = self.lexeme_props_getter(string, self.oov_prob, is_oov=is_oov)
set_lex_struct_props(lex, props, self.strings, EMPTY_VEC)
lex.orth = self.strings[string]
lex.length = len(string)
lex.id = self.length
if self.get_lex_attr is not None:
for attr, func in self.get_lex_attr.items():
value = func(string)
if isinstance(value, unicode):
value = self.strings[value]
if attr == PROB:
lex.prob = value
else:
Lexeme.set_struct_attr(lex, attr, value)
if is_oov:
lex.id = 0
else:
self._add_lex_to_vocab(hash_string(string), lex)
assert lex != NULL, orth
key = hash_string(string)
self._add_lex_to_vocab(key, lex)
assert lex != NULL, string
return lex
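A rough illustration, assumed rather than taken from the commit, of the get_lex_attr table the loop above consumes: attribute IDs mapped to functions of the word string. Unicode return values are interned through self.strings, and a PROB entry is written straight to lex.prob.

from spacy.attrs import LOWER
from spacy.vocab import Vocab

# Hypothetical getter table; real tables come from e.g. Language.default_lex_attrs().
get_lex_attr = {
    LOWER: lambda string: string.lower(),  # unicode result, interned via vocab.strings
}
vocab = Vocab(get_lex_attr=get_lex_attr)
lex = vocab[u'Dog']  # _new_lexeme fills the LOWER attribute from the getter above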
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1:
@ -137,7 +145,7 @@ cdef class Vocab:
cdef attr_t orth
cdef size_t addr
for orth, addr in self._by_orth.items():
yield Lexeme.from_ptr(<LexemeC*>addr, self.strings, self.repvec_length)
yield Lexeme(self, orth)
def __getitem__(self, id_or_string):
'''Retrieve a lexeme, given an int ID or a unicode string. If a previously
@ -154,32 +162,29 @@ cdef class Vocab:
An instance of the Lexeme Python class, with data copied on
instantiation.
'''
cdef const LexemeC* lexeme
cdef attr_t orth
if type(id_or_string) == int:
orth = id_or_string
lexeme = <LexemeC*>self._by_orth.get(orth)
if lexeme == NULL:
raise KeyError(id_or_string)
assert lexeme.orth == orth, ('%d vs %d' % (lexeme.orth, orth))
elif type(id_or_string) == unicode:
lexeme = self.get(self.mem, id_or_string)
assert lexeme.orth == self.strings[id_or_string]
if type(id_or_string) == unicode:
orth = self.strings[id_or_string]
else:
raise ValueError("Vocab unable to map type: "
"%s. Maps unicode --> Lexeme or "
"int --> Lexeme" % str(type(id_or_string)))
return Lexeme.from_ptr(lexeme, self.strings, self.repvec_length)
def __setitem__(self, unicode string, dict props):
cdef hash_t key = hash_string(string)
cdef LexemeC* lex
lex = <LexemeC*>self._by_hash.get(key)
if lex == NULL:
lex = <LexemeC*>self.mem.alloc(sizeof(LexemeC), 1)
set_lex_struct_props(lex, props, self.strings, EMPTY_VEC)
self._add_lex_to_vocab(key, lex)
orth = id_or_string
return Lexeme(self, orth)
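A quick usage sketch of the lookup behaviour described in the docstring above, assuming a populated vocab: both key types resolve to the same lexeme.

lex_from_string = vocab[u'dog']              # unicode string -> Lexeme
lex_from_id = vocab[vocab.strings[u'dog']]   # integer orth ID -> Lexeme
assert lex_from_string.orth == lex_from_id.orth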
cdef const TokenC* make_fused_token(self, substrings) except NULL:
cdef int i
tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
for i, props in enumerate(substrings):
token = &tokens[i]
# Set the special tokens up to have morphology and lemmas if
# specified, otherwise use the part-of-speech tag (if specified)
token.lex = <LexemeC*>self.get(self.mem, props['F'])
if 'pos' in props:
self.morphology.assign_tag(token, props['pos'])
if 'L' in props:
tokens[i].lemma = self.strings[props['L']]
for feature, value in props.get('morph', {}).items():
self.morphology.assign_feature(&token.morph, feature, value)
return tokens
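The substrings argument is easiest to read as data. The sketch below is an illustrative guess at its shape, based only on the keys consumed above ('F' for the surface form, optional 'pos', 'L' for the lemma, and a 'morph' feature dict); the real payload comes from the tokenizer's special-case rules.

substrings = [
    {'F': u'do'},
    {'F': u"n't", 'L': u'not', 'pos': u'RB', 'morph': {}},
]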
def dump(self, loc):
if path.exists(loc):
assert not path.isdir(loc)


@ -0,0 +1,40 @@
import pytest
@pytest.mark.models
def test_nsubj(EN):
sent = EN(u'A base phrase should be recognized.')
base_nps = list(sent.noun_chunks)
assert len(base_nps) == 1
assert base_nps[0].string == 'A base phrase '
@pytest.mark.models
def test_coord(EN):
sent = EN(u'A base phrase and a good phrase are often the same.')
base_nps = list(sent.noun_chunks)
assert len(base_nps) == 2
assert base_nps[0].string == 'A base phrase '
assert base_nps[1].string == 'a good phrase '
@pytest.mark.models
def test_pp(EN):
sent = EN(u'A phrase with another phrase occurs')
base_nps = list(sent.noun_chunks)
assert len(base_nps) == 2
assert base_nps[0].string == 'A phrase '
assert base_nps[1].string == 'another phrase '
@pytest.mark.models
def test_merge_pp(EN):
sent = EN(u'A phrase with another phrase occurs')
    nps = [(np[0].idx, np[-1].idx + len(np[-1]), np.lemma_, np[0].ent_type_) for np in sent.noun_chunks]
    for start, end, lemma, ent_type in nps:
        sent.merge(start, end, u'NP', lemma, ent_type)
assert sent[0].string == 'A phrase '
assert sent[1].string == 'with '
assert sent[2].string == 'another phrase '
assert sent[3].string == 'occurs'


@ -0,0 +1,16 @@
import pytest
@pytest.mark.models
def test_initial(EN):
doc = EN.tokenizer(u'I ate the pizza with anchovies.')
EN.tagger(doc)
with EN.parser.step_through(doc) as stepwise:
stepwise.transition('L-nsubj')
stepwise.transition('S')
stepwise.transition('L-det')
assert doc[0].head.i == 1
assert doc[1].head.i == 1
assert doc[2].head.i == 3
assert doc[3].head.i == 3
assert doc


@ -41,25 +41,10 @@ def test_attribute():
def test_vocab_codec():
def get_lex_props(string, prob):
return {
'flags': 0,
'length': len(string),
'orth': string,
'lower': string,
'norm': string,
'shape': string,
'prefix': string[0],
'suffix': string[-3:],
'cluster': 0,
'prob': prob,
'sentiment': 0
}
vocab = Vocab()
vocab['dog'] = get_lex_props('dog', 0.001)
vocab['the'] = get_lex_props('the', 0.05)
vocab['jumped'] = get_lex_props('jumped', 0.005)
lex = vocab['dog']
lex = vocab['the']
lex = vocab['jumped']
codec = HuffmanCodec([(lex.orth, lex.prob) for lex in vocab])


@ -5,6 +5,7 @@ import re
import pytest
import numpy
from spacy.language import Language
from spacy.vocab import Vocab
from spacy.tokens.doc import Doc
from spacy.tokenizer import Tokenizer
@ -17,30 +18,14 @@ from spacy.serialize.packer import Packer
from spacy.serialize.bits import BitArray
def get_lex_props(string, prob=-22, is_oov=False):
return {
'flags': 0,
'length': len(string),
'orth': string,
'lower': string,
'norm': string,
'shape': string,
'prefix': string[0],
'suffix': string[-3:],
'cluster': 0,
'prob': prob,
'sentiment': 0
}
@pytest.fixture
def vocab():
vocab = Vocab(get_lex_props=get_lex_props)
vocab['dog'] = get_lex_props('dog', 0.001)
vocab = Vocab(Language.default_lex_attrs())
lex = vocab['dog']
assert vocab[vocab.strings['dog']].orth_ == 'dog'
vocab['the'] = get_lex_props('the', 0.01)
vocab['quick'] = get_lex_props('quick', 0.005)
vocab['jumped'] = get_lex_props('jumped', 0.007)
lex = vocab['the']
lex = vocab['quick']
lex = vocab['jumped']
return vocab


@ -1,7 +1,6 @@
from __future__ import unicode_literals
import pytest
@pytest.mark.models
def test_merge_tokens(EN):
tokens = EN(u'Los Angeles start.')
@ -32,3 +31,19 @@ def test_merge_heads(EN):
def test_issue_54(EN):
text = u'Talks given by women had a slightly higher number of questions asked (3.2$\pm$0.2) than talks given by men (2.6$\pm$0.1).'
tokens = EN(text, merge_mwes=True)
@pytest.mark.models
def test_np_merges(EN):
text = u'displaCy is a parse tool built with Javascript'
tokens = EN(text)
assert tokens[4].head.i == 1
tokens.merge(tokens[2].idx, tokens[4].idx + len(tokens[4]), u'NP', u'tool', u'O')
assert tokens[2].head.i == 1
tokens = EN('displaCy is a lightweight and modern dependency parse tree visualization tool built with CSS3 and JavaScript.')
ents = [(e[0].idx, e[-1].idx + len(e[-1]), e.label_, e.lemma_)
for e in tokens.ents]
for start, end, label, lemma in ents:
merged = tokens.merge(start, end, label, lemma, label)
        assert merged is not None, (start, end, label, lemma)


@ -14,6 +14,7 @@ def tagged(EN):
tokens = EN(string, tag=True, parse=False)
return tokens
@pytest.mark.models
def test_spaces(tagged):
assert tagged[0].pos != SPACE
assert tagged[0].pos_ != 'SPACE'


@ -1,82 +1,81 @@
# -*- coding: utf-8 -*-
"""Sphinx doctest is just too hard. Manually paste doctest examples here"""
from spacy.en.attrs import IS_LOWER
import pytest
@pytest.mark.models
def test_1():
import spacy.en
from spacy.parts_of_speech import ADV
# Load the pipeline, and call it with some text.
nlp = spacy.en.English()
tokens = nlp(u"Give it back, he pleaded abjectly, its mine.",
tag=True, parse=False)
o = u''.join(tok.string.upper() if tok.pos == ADV else tok.string for tok in tokens)
assert u"Give it BACK, he pleaded ABJECTLY, its mine."
o = nlp.vocab[u'back'].prob
assert o == -7.033305644989014
o = nlp.vocab[u'not'].prob
assert o == -5.332601070404053
o = nlp.vocab[u'quietly'].prob
assert o == -11.994928359985352
@pytest.mark.models
def test2():
import spacy.en
from spacy.parts_of_speech import ADV
nlp = spacy.en.English()
# Find log probability of Nth most frequent word
probs = [lex.prob for lex in nlp.vocab]
probs.sort()
is_adverb = lambda tok: tok.pos == ADV and tok.prob < probs[-1000]
tokens = nlp(u"Give it back, he pleaded abjectly, its mine.")
o = u''.join(tok.string.upper() if is_adverb(tok) else tok.string for tok in tokens)
o == u'Give it back, he pleaded ABJECTLY, its mine.'
@pytest.mark.models
def test3():
import spacy.en
from spacy.parts_of_speech import ADV
nlp = spacy.en.English()
# Find log probability of Nth most frequent word
probs = [lex.prob for lex in nlp.vocab]
probs.sort()
is_adverb = lambda tok: tok.pos == ADV and tok.prob < probs[-1000]
tokens = nlp(u"Give it back, he pleaded abjectly, its mine.")
o = u''.join(tok.string.upper() if is_adverb(tok) else tok.string for tok in tokens)
assert o == u'Give it back, he pleaded ABJECTLY, its mine.'
pleaded = tokens[7]
assert pleaded.repvec.shape == (300,)
o = pleaded.repvec[:5]
assert sum(o) != 0
from numpy import dot
from numpy.linalg import norm
cosine = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2))
words = [w for w in nlp.vocab if w.is_lower and w.has_repvec]
words.sort(key=lambda w: cosine(w.repvec, pleaded.repvec))
words.reverse()
o = [w.orth_ for w in words[0:20]]
assert o == [u'pleaded', u'pled', u'plead', u'confessed', u'interceded',
u'pleads', u'testified', u'conspired', u'motioned', u'demurred',
u'countersued', u'remonstrated', u'begged', u'apologised',
u'consented', u'acquiesced', u'petitioned', u'quarreled',
u'appealed', u'pleading']
o = [w.orth_ for w in words[50:60]]
assert o == [u'martialed', u'counselled', u'bragged',
u'backtracked', u'caucused', u'refiled', u'dueled', u'mused',
u'dissented', u'yearned']
o = [w.orth_ for w in words[100:110]]
assert o == [u'acquits', u'cabled', u'ducked', u'sentenced',
u'gaoled', u'perjured', u'absconded', u'bargained', u'overstayed',
u'clerked']
#o = [w.orth_ for w in words[1000:1010]]
#assert o == [u'scorned', u'baled', u'righted', u'requested', u'swindled',
# u'posited', u'firebombed', u'slimed', u'deferred', u'sagged']
#o = [w.orth_ for w in words[50000:50010]]
#assert o == [u'fb', u'ford', u'systems', u'puck', u'anglers', u'ik', u'tabloid',
# u'dirty', u'rims', u'artists']
#@pytest.mark.models
#def test_1():
# import spacy.en
# from spacy.parts_of_speech import ADV
# # Load the pipeline, and call it with some text.
# nlp = spacy.en.English()
# tokens = nlp(u"Give it back, he pleaded abjectly, its mine.",
# tag=True, parse=False)
# o = u''.join(tok.string.upper() if tok.pos == ADV else tok.string for tok in tokens)
# assert u"Give it BACK, he pleaded ABJECTLY, its mine."
#
# o = nlp.vocab[u'back'].prob
# assert o == -7.033305644989014
# o = nlp.vocab[u'not'].prob
# assert o == -5.332601070404053
# o = nlp.vocab[u'quietly'].prob
# assert o == -11.994928359985352
#
#
#@pytest.mark.m
#def test2():
# import spacy.en
# from spacy.parts_of_speech import ADV
# nlp = spacy.en.English()
# # Find log probability of Nth most frequent word
# probs = [lex.prob for lex in nlp.vocab]
# probs.sort()
# is_adverb = lambda tok: tok.pos == ADV and tok.prob < probs[-1000]
# tokens = nlp(u"Give it back, he pleaded abjectly, its mine.")
# o = u''.join(tok.string.upper() if is_adverb(tok) else tok.string for tok in tokens)
# o == u'Give it back, he pleaded ABJECTLY, its mine.'
#
#@pytest.mark.models
#def test3():
# import spacy.en
# from spacy.parts_of_speech import ADV
# nlp = spacy.en.English()
# # Find log probability of Nth most frequent word
# probs = [lex.prob for lex in nlp.vocab]
# probs.sort()
# is_adverb = lambda tok: tok.pos == ADV and tok.prob < probs[-1000]
# tokens = nlp(u"Give it back, he pleaded abjectly, its mine.")
# o = u''.join(tok.string.upper() if is_adverb(tok) else tok.string for tok in tokens)
# assert o == u'Give it back, he pleaded ABJECTLY, its mine.'
#
# pleaded = tokens[7]
# assert pleaded.repvec.shape == (300,)
# o = pleaded.repvec[:5]
# assert sum(o) != 0
# from numpy import dot
# from numpy.linalg import norm
#
# cosine = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2))
# words = [w for w in nlp.vocab if w.is_lower and w.has_repvec]
# words.sort(key=lambda w: cosine(w.repvec, pleaded.repvec))
# words.reverse()
# o = [w.orth_ for w in words[0:20]]
# assert o == [u'pleaded', u'pled', u'plead', u'confessed', u'interceded',
# u'pleads', u'testified', u'conspired', u'motioned', u'demurred',
# u'countersued', u'remonstrated', u'begged', u'apologised',
# u'consented', u'acquiesced', u'petitioned', u'quarreled',
# u'appealed', u'pleading']
# o = [w.orth_ for w in words[50:60]]
# assert o == [u'martialed', u'counselled', u'bragged',
# u'backtracked', u'caucused', u'refiled', u'dueled', u'mused',
# u'dissented', u'yearned']
# o = [w.orth_ for w in words[100:110]]
# assert o == [u'acquits', u'cabled', u'ducked', u'sentenced',
# u'gaoled', u'perjured', u'absconded', u'bargained', u'overstayed',
# u'clerked']
#
# #o = [w.orth_ for w in words[1000:1010]]
# #assert o == [u'scorned', u'baled', u'righted', u'requested', u'swindled',
# # u'posited', u'firebombed', u'slimed', u'deferred', u'sagged']
# #o = [w.orth_ for w in words[50000:50010]]
# #assert o == [u'fb', u'ford', u'systems', u'puck', u'anglers', u'ik', u'tabloid',
# # u'dirty', u'rims', u'artists']

tests/test_matcher.py (new file, 61 lines)

@ -0,0 +1,61 @@
from __future__ import unicode_literals
import pytest
from spacy.strings import StringStore
from spacy.matcher import *
from spacy.attrs import LOWER
from spacy.tokens.doc import Doc
from spacy.vocab import Vocab
@pytest.fixture
def matcher(EN):
patterns = {
'Javascript': ['PRODUCT', {}, [[{'ORTH': 'JavaScript'}]]],
'GoogleNow': ['PRODUCT', {}, [[{'ORTH': 'Google'}, {'ORTH': 'Now'}]]],
'Java': ['PRODUCT', {}, [[{'LOWER': 'java'}]]],
}
return Matcher(EN.vocab, patterns)
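For reference, each value in the patterns dict above is [entity label, entity attributes, list of token-specification lists], and each match comes back as (label ID, start token, end token). A small sketch, assuming the same EN pipeline fixture these tests rely on:

matches = matcher(EN(u'I write JavaScript every day'))
label_id, start, end = matches[0]
assert EN.vocab.strings[label_id] == 'PRODUCT'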
def test_compile(matcher):
assert matcher.n_patterns == 3
def test_no_match(matcher, EN):
tokens = EN('I like cheese')
assert matcher(tokens) == []
def test_match_start(matcher, EN):
tokens = EN('JavaScript is good')
assert matcher(tokens) == [(EN.vocab.strings['PRODUCT'], 0, 1)]
def test_match_end(matcher, EN):
tokens = EN('I like java')
assert matcher(tokens) == [(EN.vocab.strings['PRODUCT'], 2, 3)]
def test_match_middle(matcher, EN):
tokens = EN('I like Google Now best')
assert matcher(tokens) == [(EN.vocab.strings['PRODUCT'], 2, 4)]
def test_match_multi(matcher, EN):
tokens = EN('I like Google Now and java best')
assert matcher(tokens) == [(EN.vocab.strings['PRODUCT'], 2, 4),
(EN.vocab.strings['PRODUCT'], 5, 6)]
@pytest.mark.models
def test_match_preserved(matcher, EN):
doc = EN.tokenizer('I like java')
EN.tagger(doc)
assert len(doc.ents) == 0
doc = EN.tokenizer('I like java')
matcher(doc)
assert len(doc.ents) == 1
EN.tagger(doc)
EN.entity(doc)
assert len(doc.ents) == 1


@ -4,7 +4,6 @@ from spacy.tokens import Doc
import pytest
@pytest.mark.models
def test_getitem(EN):
tokens = EN(u'Give it back! He pleaded.')
@ -32,3 +31,15 @@ def test_serialize_whitespace(EN):
assert tokens.string == new_tokens.string
assert [t.orth_ for t in tokens] == [t.orth_ for t in new_tokens]
assert [t.orth for t in tokens] == [t.orth for t in new_tokens]
def test_set_ents(EN):
tokens = EN.tokenizer(u'I use goggle chrone to surf the web')
assert len(tokens.ents) == 0
tokens.ents = [(EN.vocab.strings['PRODUCT'], 2, 4)]
assert len(list(tokens.ents)) == 1
assert [t.ent_iob for t in tokens] == [0, 0, 3, 1, 0, 0, 0, 0]
ent = tokens.ents[0]
assert ent.label_ == 'PRODUCT'
assert ent.start == 2
assert ent.end == 4