diff --git a/.gitignore b/.gitignore index 2644ca342..dc6568914 100644 --- a/.gitignore +++ b/.gitignore @@ -13,6 +13,8 @@ MANIFEST corpora/ models/ +examples/ +keys/ spacy/syntax/*.cpp spacy/syntax/*.html diff --git a/bin/init_model.py b/bin/init_model.py index db01885b3..0780d3c4b 100644 --- a/bin/init_model.py +++ b/bin/init_model.py @@ -20,6 +20,7 @@ from __future__ import unicode_literals from ast import literal_eval import math import gzip +import json import plac from pathlib import Path @@ -29,8 +30,6 @@ from shutil import copytree import codecs from collections import defaultdict -from spacy.en import get_lex_props -from spacy.en.lemmatizer import Lemmatizer from spacy.vocab import Vocab from spacy.vocab import write_binary_vectors from spacy.strings import hash_string @@ -38,6 +37,13 @@ from preshed.counter import PreshCounter from spacy.parts_of_speech import NOUN, VERB, ADJ +import spacy.en +import spacy.de +import spacy.fi +import spacy.it + + + def setup_tokenizer(lang_data_dir, tok_dir): if not tok_dir.exists(): @@ -139,7 +145,7 @@ def _read_senses(loc): return lexicon -def setup_vocab(src_dir, dst_dir): +def setup_vocab(get_lex_attr, tag_map, src_dir, dst_dir): if not dst_dir.exists(): dst_dir.mkdir() @@ -148,13 +154,13 @@ def setup_vocab(src_dir, dst_dir): write_binary_vectors(str(vectors_src), str(dst_dir / 'vec.bin')) else: print("Warning: Word vectors file not found") - vocab = Vocab(data_dir=None, get_lex_props=get_lex_props) + vocab = Vocab(get_lex_attr=get_lex_attr, tag_map=tag_map) clusters = _read_clusters(src_dir / 'clusters.txt') probs, oov_prob = _read_probs(src_dir / 'words.sgt.prob') if not probs: probs, oov_prob = _read_freqs(src_dir / 'freqs.txt') if not probs: - oov_prob = 0.0 + oov_prob = -20 else: oov_prob = min(probs.values()) for word in clusters: @@ -163,23 +169,32 @@ def setup_vocab(src_dir, dst_dir): lexicon = [] for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])): - entry = get_lex_props(word) - 
entry['prob'] = float(prob) - cluster = clusters.get(word, '0') + lexeme = vocab[word] + lexeme.prob = prob + lexeme.is_oov = False # Decode as a little-endian string, so that we can do & 15 to get # the first 4 bits. See _parse_features.pyx - entry['cluster'] = int(cluster[::-1], 2) - vocab[word] = entry + if word in clusters: + lexeme.cluster = int(clusters[word][::-1], 2) + else: + lexeme.cluster = 0 vocab.dump(str(dst_dir / 'lexemes.bin')) vocab.strings.dump(str(dst_dir / 'strings.txt')) with (dst_dir / 'oov_prob').open('w') as file_: file_.write('%f' % oov_prob) -def main(lang_data_dir, corpora_dir, model_dir): +def main(lang_id, lang_data_dir, corpora_dir, model_dir): + languages = { + 'en': spacy.en.English.default_lex_attrs(), + 'de': spacy.de.German.default_lex_attrs(), + 'fi': spacy.fi.Finnish.default_lex_attrs(), + 'it': spacy.it.Italian.default_lex_attrs(), + } + model_dir = Path(model_dir) - lang_data_dir = Path(lang_data_dir) - corpora_dir = Path(corpora_dir) + lang_data_dir = Path(lang_data_dir) / lang_id + corpora_dir = Path(corpora_dir) / lang_id assert corpora_dir.exists() assert lang_data_dir.exists() @@ -187,9 +202,19 @@ def main(lang_data_dir, corpora_dir, model_dir): if not model_dir.exists(): model_dir.mkdir() + tag_map = json.load((lang_data_dir / 'tag_map.json').open()) setup_tokenizer(lang_data_dir, model_dir / 'tokenizer') - setup_vocab(corpora_dir, model_dir / 'vocab') - if not (model_dir / 'wordnet').exists(): + setup_vocab(languages[lang_id], tag_map, corpora_dir, model_dir / 'vocab') + + if (lang_data_dir / 'gazetteer.json').exists(): + copyfile(str(lang_data_dir / 'gazetteer.json'), + str(model_dir / 'vocab' / 'gazetteer.json')) + + if (lang_data_dir / 'lemma_rules.json').exists(): + copyfile(str(lang_data_dir / 'lemma_rules.json'), + str(model_dir / 'vocab' / 'lemma_rules.json')) + + if not (model_dir / 'wordnet').exists() and (corpora_dir / 'wordnet').exists(): copytree(str(corpora_dir / 'wordnet' / 'dict'), str(model_dir / 
'wordnet')) diff --git a/bin/parser/train.py b/bin/parser/train.py index 68217fcb3..abd5eb16e 100755 --- a/bin/parser/train.py +++ b/bin/parser/train.py @@ -14,7 +14,6 @@ import re import spacy.util from spacy.en import English -from spacy.en.pos import POS_TEMPLATES, POS_TAGS, setup_model_dir from spacy.syntax.util import Config from spacy.gold import read_json_file @@ -22,6 +21,11 @@ from spacy.gold import GoldParse from spacy.scorer import Scorer +from spacy.syntax.arc_eager import ArcEager +from spacy.syntax.ner import BiluoPushDown +from spacy.tagger import Tagger +from spacy.syntax.parser import Parser + def _corrupt(c, noise_level): if random.random() >= noise_level: @@ -80,32 +84,28 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', beam_width=1, verbose=False, use_orig_arc_eager=False): dep_model_dir = path.join(model_dir, 'deps') - pos_model_dir = path.join(model_dir, 'pos') ner_model_dir = path.join(model_dir, 'ner') if path.exists(dep_model_dir): shutil.rmtree(dep_model_dir) - if path.exists(pos_model_dir): - shutil.rmtree(pos_model_dir) if path.exists(ner_model_dir): shutil.rmtree(ner_model_dir) os.mkdir(dep_model_dir) - os.mkdir(pos_model_dir) os.mkdir(ner_model_dir) - setup_model_dir(sorted(POS_TAGS.keys()), POS_TAGS, POS_TEMPLATES, pos_model_dir) - Config.write(dep_model_dir, 'config', features=feat_set, seed=seed, - labels=Language.ParserTransitionSystem.get_labels(gold_tuples), + labels=ArcEager.get_labels(gold_tuples), beam_width=beam_width) Config.write(ner_model_dir, 'config', features='ner', seed=seed, - labels=Language.EntityTransitionSystem.get_labels(gold_tuples), + labels=BiluoPushDown.get_labels(gold_tuples), beam_width=0) if n_sents > 0: gold_tuples = gold_tuples[:n_sents] - nlp = Language(data_dir=model_dir) - + nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False) + nlp.tagger = Tagger.blank(nlp.vocab, Tagger.default_templates()) + nlp.parser = Parser.from_dir(dep_model_dir, 
nlp.vocab.strings, ArcEager) + nlp.entity = Parser.from_dir(ner_model_dir, nlp.vocab.strings, BiluoPushDown) print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %") for itn in range(n_iter): scorer = Scorer() @@ -140,7 +140,7 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', print('%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f, scorer.tags_acc, scorer.token_acc)) - nlp.end_training() + nlp.end_training(model_dir) def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False, beam_width=None): diff --git a/bin/tagger/train.py b/bin/tagger/train.py new file mode 100755 index 000000000..9cd8cc011 --- /dev/null +++ b/bin/tagger/train.py @@ -0,0 +1,175 @@ +#!/usr/bin/env python +from __future__ import division +from __future__ import unicode_literals +from __future__ import print_function + +import os +from os import path +import shutil +import codecs +import random + +import plac +import re + +import spacy.util +from spacy.en import English + +from spacy.tagger import Tagger + +from spacy.syntax.util import Config +from spacy.gold import read_json_file +from spacy.gold import GoldParse + +from spacy.scorer import Scorer + + +def score_model(scorer, nlp, raw_text, annot_tuples): + if raw_text is None: + tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1]) + else: + tokens = nlp.tokenizer(raw_text) + nlp.tagger(tokens) + gold = GoldParse(tokens, annot_tuples) + scorer.score(tokens, gold) + + +def _merge_sents(sents): + m_deps = [[], [], [], [], [], []] + m_brackets = [] + i = 0 + for (ids, words, tags, heads, labels, ner), brackets in sents: + m_deps[0].extend(id_ + i for id_ in ids) + m_deps[1].extend(words) + m_deps[2].extend(tags) + m_deps[3].extend(head + i for head in heads) + m_deps[4].extend(labels) + m_deps[5].extend(ner) + m_brackets.extend((b['first'] + i, b['last'] + i, b['label']) for b in brackets) + i += len(ids) + return [(m_deps, m_brackets)] + + +def train(Language, gold_tuples, 
model_dir, n_iter=15, feat_set=u'basic', + seed=0, gold_preproc=False, n_sents=0, corruption_level=0, + beam_width=1, verbose=False, + use_orig_arc_eager=False): + if n_sents > 0: + gold_tuples = gold_tuples[:n_sents] + + templates = Tagger.default_templates() + nlp = Language(data_dir=model_dir, tagger=False) + nlp.tagger = Tagger.blank(nlp.vocab, templates) + + print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %") + for itn in range(n_iter): + scorer = Scorer() + loss = 0 + for raw_text, sents in gold_tuples: + if gold_preproc: + raw_text = None + else: + sents = _merge_sents(sents) + for annot_tuples, ctnt in sents: + words = annot_tuples[1] + gold_tags = annot_tuples[2] + score_model(scorer, nlp, raw_text, annot_tuples) + if raw_text is None: + tokens = nlp.tokenizer.tokens_from_list(words) + else: + tokens = nlp.tokenizer(raw_text) + loss += nlp.tagger.train(tokens, gold_tags) + random.shuffle(gold_tuples) + print('%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f, + scorer.tags_acc, + scorer.token_acc)) + nlp.end_training(model_dir) + +def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False, + beam_width=None): + nlp = Language(data_dir=model_dir) + if beam_width is not None: + nlp.parser.cfg.beam_width = beam_width + scorer = Scorer() + for raw_text, sents in gold_tuples: + if gold_preproc: + raw_text = None + else: + sents = _merge_sents(sents) + for annot_tuples, brackets in sents: + if raw_text is None: + tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1]) + nlp.tagger(tokens) + nlp.entity(tokens) + nlp.parser(tokens) + else: + tokens = nlp(raw_text, merge_mwes=False) + gold = GoldParse(tokens, annot_tuples) + scorer.score(tokens, gold, verbose=verbose) + return scorer + + +def write_parses(Language, dev_loc, model_dir, out_loc, beam_width=None): + nlp = Language(data_dir=model_dir) + if beam_width is not None: + nlp.parser.cfg.beam_width = beam_width + gold_tuples = read_json_file(dev_loc) + scorer = 
Scorer() + out_file = codecs.open(out_loc, 'w', 'utf8') + for raw_text, sents in gold_tuples: + sents = _merge_sents(sents) + for annot_tuples, brackets in sents: + if raw_text is None: + tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1]) + nlp.tagger(tokens) + nlp.entity(tokens) + nlp.parser(tokens) + else: + tokens = nlp(raw_text, merge_mwes=False) + gold = GoldParse(tokens, annot_tuples) + scorer.score(tokens, gold, verbose=False) + for t in tokens: + out_file.write( + '%s\t%s\t%s\t%s\n' % (t.orth_, t.tag_, t.head.orth_, t.dep_) + ) + return scorer + + +@plac.annotations( + train_loc=("Location of training file or directory"), + dev_loc=("Location of development file or directory"), + model_dir=("Location of output model directory",), + eval_only=("Skip training, and only evaluate", "flag", "e", bool), + corruption_level=("Amount of noise to add to training data", "option", "c", float), + gold_preproc=("Use gold-standard sentence boundaries in training?", "flag", "g", bool), + out_loc=("Out location", "option", "o", str), + n_sents=("Number of training sentences", "option", "n", int), + n_iter=("Number of training iterations", "option", "i", int), + verbose=("Verbose error reporting", "flag", "v", bool), + debug=("Debug mode", "flag", "d", bool), +) +def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False, + debug=False, corruption_level=0.0, gold_preproc=False, eval_only=False): + if not eval_only: + gold_train = list(read_json_file(train_loc)) + train(English, gold_train, model_dir, + feat_set='basic' if not debug else 'debug', + gold_preproc=gold_preproc, n_sents=n_sents, + corruption_level=corruption_level, n_iter=n_iter, + verbose=verbose) + #if out_loc: + # write_parses(English, dev_loc, model_dir, out_loc, beam_width=beam_width) + scorer = evaluate(English, list(read_json_file(dev_loc)), + model_dir, gold_preproc=gold_preproc, verbose=verbose) + print('TOK', scorer.token_acc) + print('POS', scorer.tags_acc) + 
print('UAS', scorer.uas) + print('LAS', scorer.las) + + print('NER P', scorer.ents_p) + print('NER R', scorer.ents_r) + print('NER F', scorer.ents_f) + + +if __name__ == '__main__': + plac.call(main) diff --git a/docs/Makefile b/docs/Makefile deleted file mode 100644 index bb80f0928..000000000 --- a/docs/Makefile +++ /dev/null @@ -1,177 +0,0 @@ -# Makefile for Sphinx documentation -# - -# You can set these variables from the command line. -SPHINXOPTS = -SPHINXBUILD = sphinx-build -PAPER = -BUILDDIR = ../../docs-spacy - -# User-friendly check for sphinx-build -ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) -$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) -endif - -# Internal variables. 
-PAPEROPT_a4 = -D latex_paper_size=a4 -PAPEROPT_letter = -D latex_paper_size=letter -ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source -# the i18n builder cannot share the environment and doctrees with the others -I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source - -.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext - -help: - @echo "Please use \`make ' where is one of" - @echo " html to make standalone HTML files" - @echo " dirhtml to make HTML files named index.html in directories" - @echo " singlehtml to make a single large HTML file" - @echo " pickle to make pickle files" - @echo " json to make JSON files" - @echo " htmlhelp to make HTML files and a HTML help project" - @echo " qthelp to make HTML files and a qthelp project" - @echo " devhelp to make HTML files and a Devhelp project" - @echo " epub to make an epub" - @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" - @echo " latexpdf to make LaTeX files and run them through pdflatex" - @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" - @echo " text to make text files" - @echo " man to make manual pages" - @echo " texinfo to make Texinfo files" - @echo " info to make Texinfo files and run them through makeinfo" - @echo " gettext to make PO message catalogs" - @echo " changes to make an overview of all changed/added/deprecated items" - @echo " xml to make Docutils-native XML files" - @echo " pseudoxml to make pseudoxml-XML files for display purposes" - @echo " linkcheck to check all external links for integrity" - @echo " doctest to run all doctests embedded in the documentation (if enabled)" - -clean: - rm -rf $(BUILDDIR)/* - -html: - $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html - @echo - @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 
- -dirhtml: - $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml - @echo - @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." - -singlehtml: - $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml - @echo - @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." - -pickle: - $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle - @echo - @echo "Build finished; now you can process the pickle files." - -json: - $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json - @echo - @echo "Build finished; now you can process the JSON files." - -htmlhelp: - $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp - @echo - @echo "Build finished; now you can run HTML Help Workshop with the" \ - ".hhp project file in $(BUILDDIR)/htmlhelp." - -qthelp: - $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp - @echo - @echo "Build finished; now you can run "qcollectiongenerator" with the" \ - ".qhcp project file in $(BUILDDIR)/qthelp, like this:" - @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/spaCy.qhcp" - @echo "To view the help file:" - @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/spaCy.qhc" - -devhelp: - $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp - @echo - @echo "Build finished." - @echo "To view the help file:" - @echo "# mkdir -p $$HOME/.local/share/devhelp/spaCy" - @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/spaCy" - @echo "# devhelp" - -epub: - $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub - @echo - @echo "Build finished. The epub file is in $(BUILDDIR)/epub." - -latex: - $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex - @echo - @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." - @echo "Run \`make' in that directory to run these through (pdf)latex" \ - "(use \`make latexpdf' here to do that automatically)." 
- -latexpdf: - $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex - @echo "Running LaTeX files through pdflatex..." - $(MAKE) -C $(BUILDDIR)/latex all-pdf - @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." - -latexpdfja: - $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex - @echo "Running LaTeX files through platex and dvipdfmx..." - $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja - @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." - -text: - $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text - @echo - @echo "Build finished. The text files are in $(BUILDDIR)/text." - -man: - $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man - @echo - @echo "Build finished. The manual pages are in $(BUILDDIR)/man." - -texinfo: - $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo - @echo - @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." - @echo "Run \`make' in that directory to run these through makeinfo" \ - "(use \`make info' here to do that automatically)." - -info: - $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo - @echo "Running Texinfo files through makeinfo..." - make -C $(BUILDDIR)/texinfo info - @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." - -gettext: - $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale - @echo - @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." - -changes: - $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes - @echo - @echo "The overview file is in $(BUILDDIR)/changes." - -linkcheck: - $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck - @echo - @echo "Link check complete; look for any errors in the above output " \ - "or in $(BUILDDIR)/linkcheck/output.txt." - -doctest: - $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest - @echo "Testing of doctests in the sources finished, look at the " \ - "results in $(BUILDDIR)/doctest/output.txt." 
- -xml: - $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml - @echo - @echo "Build finished. The XML files are in $(BUILDDIR)/xml." - -pseudoxml: - $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml - @echo - @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." diff --git a/docs/source/conf.py b/docs/source/conf.py deleted file mode 100644 index ac6849abd..000000000 --- a/docs/source/conf.py +++ /dev/null @@ -1,271 +0,0 @@ -# -*- coding: utf-8 -*- -# -# spaCy documentation build configuration file, created by -# sphinx-quickstart on Thu Sep 25 17:47:15 2014. -# -# This file is execfile()d with the current directory set to its -# containing dir. -# -# Note that not all possible configuration values are present in this -# autogenerated file. -# -# All configuration values have a default; values that are commented out -# serve to show the default. - -import sys -import os - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. -#sys.path.insert(0, os.path.abspath('.')) - -# -- General configuration ------------------------------------------------ - -# If your documentation needs a minimal Sphinx version, state it here. -#needs_sphinx = '1.0' - -# Add any Sphinx extension module names here, as strings. They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom -# ones. -extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.doctest', - 'sphinx.ext.intersphinx', - 'sphinx.ext.viewcode', - 'sphinxcontrib.napoleon', -] - -# Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] - -# The suffix of source filenames. -source_suffix = '.rst' - -# The encoding of source files. -#source_encoding = 'utf-8-sig' - -# The master toctree document. 
-master_doc = 'index' - -# General information about the project. -project = u'spaCy' -copyright = u'2015, Matthew Honnibal' - - -# The version info for the project you're documenting, acts as replacement for -# |version| and |release|, also used in various other places throughout the -# built documents. -# -# The short X.Y version. -version = '0.85' -# The full version, including alpha/beta/rc tags. -release = '0.85' - -# The language for content autogenerated by Sphinx. Refer to documentation -# for a list of supported languages. -#language = None - -# There are two options for replacing |today|: either, you set today to some -# non-false value, then it is used: -#today = '' -# Else, today_fmt is used as the format for a strftime call. -#today_fmt = '%B %d, %Y' - -# List of patterns, relative to source directory, that match files and -# directories to ignore when looking for source files. -exclude_patterns = [] - -# The reST default role (used for this markup: `text`) to use for all -# documents. -#default_role = None - -# If true, '()' will be appended to :func: etc. cross-reference text. -#add_function_parentheses = True - -# If true, the current module name will be prepended to all description -# unit titles (such as .. function::). -#add_module_names = True - -# If true, sectionauthor and moduleauthor directives will be shown in the -# output. They are ignored by default. -#show_authors = False - -# The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' - -# A list of ignored prefixes for module index sorting. -#modindex_common_prefix = [] - -# If true, keep warnings as "system message" paragraphs in the built documents. -#keep_warnings = False - - -# -- Options for HTML output ---------------------------------------------- - -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. 
-html_theme = 'sphinx_rtd_theme' - -# Theme options are theme-specific and customize the look and feel of a theme -# further. For a list of options available for each theme, see the -# documentation. -html_theme_options = { - 'google_analytics_id': 'UA-58931649-1' -} - -# Add any paths that contain custom themes here, relative to this directory. -html_theme_path = ["../_themes"] - -# The name for this set of Sphinx documents. If None, it defaults to -# " v documentation". -#html_title = None - -# A shorter title for the navigation bar. Default is the same as html_title. -#html_short_title = None - -# The name of an image file (relative to this directory) to place at the top -# of the sidebar. -#html_logo = None - -# The name of an image file (within the static path) to use as favicon of the -# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 -# pixels large. -#html_favicon = None - -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] - -# Add any extra paths that contain custom files (such as robots.txt or -# .htaccess) here, relative to this directory. These files are copied -# directly to the root of the documentation. -#html_extra_path = [] - -# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, -# using the given strftime format. -#html_last_updated_fmt = '%b %d, %Y' - -# If true, SmartyPants will be used to convert quotes and dashes to -# typographically correct entities. -#html_use_smartypants = True - -# Custom sidebar templates, maps document names to template names. -#html_sidebars = {} - -# Additional templates that should be rendered to pages, maps page names to -# template names. -#html_additional_pages = {} - -# If false, no module index is generated. 
-#html_domain_indices = True - -# If false, no index is generated. -#html_use_index = True - -# If true, the index is split into individual pages for each letter. -#html_split_index = False - -# If true, links to the reST sources are added to the pages. -#html_show_sourcelink = True - -# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. -#html_show_sphinx = True - -# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. -#html_show_copyright = True - -# If true, an OpenSearch description file will be output, and all pages will -# contain a tag referring to it. The value of this option must be the -# base URL from which the finished HTML is served. -#html_use_opensearch = '' - -# This is the file name suffix for HTML files (e.g. ".xhtml"). -#html_file_suffix = None - -# Output file base name for HTML help builder. -htmlhelp_basename = 'spaCydoc' - - -# -- Options for LaTeX output --------------------------------------------- - -latex_elements = { -# The paper size ('letterpaper' or 'a4paper'). -#'papersize': 'letterpaper', - -# The font size ('10pt', '11pt' or '12pt'). -#'pointsize': '10pt', - -# Additional stuff for the LaTeX preamble. -#'preamble': '', -} - -# Grouping the document tree into LaTeX files. List of tuples -# (source start file, target name, title, -# author, documentclass [howto, manual, or own class]). -latex_documents = [ - ('index', 'spaCy.tex', u'spaCy Documentation', - u'Matthew Honnibal', 'manual'), -] - -# The name of an image file (relative to this directory) to place at the top of -# the title page. -#latex_logo = None - -# For "manual" documents, if this is true, then toplevel headings are parts, -# not chapters. -#latex_use_parts = False - -# If true, show page references after internal links. -#latex_show_pagerefs = False - -# If true, show URL addresses after external links. -#latex_show_urls = False - -# Documents to append as an appendix to all manuals. 
-#latex_appendices = [] - -# If false, no module index is generated. -#latex_domain_indices = True - - -# -- Options for manual page output --------------------------------------- - -# One entry per manual page. List of tuples -# (source start file, name, description, authors, manual section). -man_pages = [ - ('index', 'spacy', u'spaCy Documentation', - [u'Matthew Honnibal'], 1) -] - -# If true, show URL addresses after external links. -#man_show_urls = False - - -# -- Options for Texinfo output ------------------------------------------- - -# Grouping the document tree into Texinfo files. List of tuples -# (source start file, target name, title, author, -# dir menu entry, description, category) -texinfo_documents = [ - ('index', 'spaCy', u'spaCy Documentation', - u'Matthew Honnibal', 'spaCy', 'One line description of project.', - 'Miscellaneous'), -] - -# Documents to append as an appendix to all manuals. -#texinfo_appendices = [] - -# If false, no module index is generated. -#texinfo_domain_indices = True - -# How to display URL addresses: 'footnote', 'no', or 'inline'. -#texinfo_show_urls = 'footnote' - -# If true, do not generate a @detailmenu in the "Top" node's menu. -#texinfo_no_detailmenu = False - - -# Example configuration for intersphinx: refer to the Python standard library. -intersphinx_mapping = {'http://docs.python.org/': None} diff --git a/docs/source/depr/annotation.rst b/docs/source/depr/annotation.rst deleted file mode 100644 index c19e70bbd..000000000 --- a/docs/source/depr/annotation.rst +++ /dev/null @@ -1,116 +0,0 @@ -==================== -Annotation Standards -==================== - -This document describes the target annotations spaCy is trained to predict. - -This is currently a work in progress. Please ask questions on the issue tracker, -so that the answers can be integrated here to improve the documentation. 
- -https://github.com/honnibal/spaCy/issues - -English -======= - -Tokenization ------------- - -Tokenization standards are based on the OntoNotes 5 corpus. - -The tokenizer differs from most by including tokens for significant whitespace. -Any sequence of whitespace characters beyond a single space (' ') is included -as a token. For instance: - - >>> from spacy.en import English - >>> nlp = English(parse=False) - >>> tokens = nlp(u'Some\nspaces and\ttab characters') - >>> print [t.orth_ for t in tokens] - [u'Some', u'\n', u'spaces', u' ', u'and', u'\t', u'tab', u'characters'] - -The whitespace tokens are useful for much the same reason punctuation is --- it's -often an important delimiter in the text. By preserving it in the token output, -we are able to maintain a simple alignment between the tokens and the original -string, and we ensure that the token stream does not lose information. - -Sentence boundary detection ---------------------------- - -Sentence boundaries are calculated from the syntactic parse tree, so features -such as punctuation and capitalisation play an important but non-decisive role -in determining the sentence boundaries. Usually this means that the sentence -boundaries will at least coincide with clause boundaries, even given poorly -punctuated text. - -Part-of-speech Tagging ----------------------- - -The part-of-speech tagger uses the OntoNotes 5 version of the Penn Treebank -tag set. We also map the tags to the simpler Google Universal POS Tag set. - -Details here: https://github.com/honnibal/spaCy/blob/master/spacy/en/pos.pyx#L124 - -Lemmatization -------------- - -A "lemma" is the uninflected form of a word. 
In English, this means: - -* Adjectives: The form like "happy", not "happier" or "happiest" -* Adverbs: The form like "badly", not "worse" or "worst" -* Nouns: The form like "dog", not "dogs"; like "child", not "children" -* Verbs: The form like "write", not "writes", "writing", "wrote" or "written" - -The lemmatization data is taken from WordNet. However, we also add a special -case for pronouns: all pronouns are lemmatized to the special token -PRON-. - -Syntactic Dependency Parsing ----------------------------- - -The parser is trained on data produced by the ClearNLP converter. Details of -the annotation scheme can be found here: - -http://www.mathcs.emory.edu/~choi/doc/clear-dependency-2012.pdf - -Named Entity Recognition ------------------------- - - +--------------+-----------------------------------------------------+ - | PERSON | People, including fictional | - +--------------+-----------------------------------------------------+ - | NORP | Nationalities or religious or political groups | - +--------------+-----------------------------------------------------+ - | FACILITY | Buildings, airports, highways, bridges, etc. | - +--------------+-----------------------------------------------------+ - | ORGANIZATION | Companies, agencies, institutions, etc. | - +--------------+-----------------------------------------------------+ - | GPE | Countries, cities, states | - +--------------+-----------------------------------------------------+ - | LOCATION | Non-GPE locations, mountain ranges, bodies of water | - +--------------+-----------------------------------------------------+ - | PRODUCT | Vehicles, weapons, foods, etc. (Not services) | - +--------------+-----------------------------------------------------+ - | EVENT | Named hurricanes, battles, wars, sports events, etc.| - +--------------+-----------------------------------------------------+ - | WORK OF ART | Titles of books, songs, etc. 
| - +--------------+-----------------------------------------------------+ - | LAW | Named documents made into laws | - +--------------+-----------------------------------------------------+ - | LANGUAGE | Any named language | - +--------------+-----------------------------------------------------+ - -The following values are also annotated in a style similar to names: - - +--------------+---------------------------------------------+ - | DATE | Absolute or relative dates or periods | - +--------------+---------------------------------------------+ - | TIME | Times smaller than a day | - +--------------+---------------------------------------------+ - | PERCENT | Percentage (including “%”) | - +--------------+---------------------------------------------+ - | MONEY | Monetary values, including unit | - +--------------+---------------------------------------------+ - | QUANTITY | Measurements, as of weight or distance | - +--------------+---------------------------------------------+ - | ORDINAL | "first", "second" | - +--------------+---------------------------------------------+ - | CARDINAL | Numerals that do not fall under another type| - +--------------+---------------------------------------------+ diff --git a/docs/source/depr/api.rst b/docs/source/depr/api.rst deleted file mode 100644 index 8b1378917..000000000 --- a/docs/source/depr/api.rst +++ /dev/null @@ -1 +0,0 @@ - diff --git a/docs/source/depr/features.rst b/docs/source/depr/features.rst deleted file mode 100644 index ecd465182..000000000 --- a/docs/source/depr/features.rst +++ /dev/null @@ -1,77 +0,0 @@ -Lexeme Features -=============== - -A lexeme is an entry in the lexicon --- the vocabulary --- for a word, punctuation -symbol, whitespace unit, etc. Lexemes come with lots of pre-computed information, -that helps you write good feature functions.
Features are integer-valued where -possible --- instead of strings, spaCy refers to strings by consecutive ID numbers, -which you can use to look up the string values if necessary. - -String features ---------------- - -+---------+-------------------------------------------------------------------+ -| SIC | The word as it appeared in the sentence, unaltered. | -+---------+-------------------------------------------------------------------+ -| NORM | For frequent words, case normalization is applied. | -| | Otherwise, back-off to SHAPE. | -+---------+-------------------------------------------------------------------+ -| SHAPE | Remap the characters of the word as follows: | -| | | -| | a-z --> x, A-Z --> X, 0-9 --> d, ,.;:"'?!$- --> self, other --> \*| -| | | -| | Trim sequences of length 3+ to 3, e.g | -| | | -| | apples --> xxx, Apples --> Xxxx, app9LES@ --> xxx9XXX* | -+---------+-------------------------------------------------------------------+ -| ASCIIED | Use unidecode.unidecode(sic) to approximate the word using the | -| | ascii characters. 
| -+---------+-------------------------------------------------------------------+ -| PREFIX | sic_unicode_string[:1] | -+---------+-------------------------------------------------------------------+ -| SUFFIX | sic_unicode_string[-3:] | -+---------+-------------------------------------------------------------------+ - - -Integer features ----------------- - -+--------------+--------------------------------------------------------------+ -| LENGTH | Length of the string, in unicode | -+--------------+--------------------------------------------------------------+ -| CLUSTER | Brown cluster | -+--------------+--------------------------------------------------------------+ -| POS_TYPE | K-means cluster of word's tag affinities | -+--------------+--------------------------------------------------------------+ -| SENSE_TYPE | K-means cluster of word's sense affinities | -+--------------+--------------------------------------------------------------+ - -Boolean features ----------------- - -+-------------+--------------------------------------------------------------+ -| IS_ALPHA | The result of sic.isalpha() | -+-------------+--------------------------------------------------------------+ -| IS_ASCII | Check whether all the word's characters are ascii characters | -+-------------+--------------------------------------------------------------+ -| IS_DIGIT | The result of sic.isdigit() | -+-------------+--------------------------------------------------------------+ -| IS_LOWER | The result of sic.islower() | -+-------------+--------------------------------------------------------------+ -| IS_PUNCT | Check whether all characters are in the class TODO | -+-------------+--------------------------------------------------------------+ -| IS_SPACE | The result of sic.isspace() | -+-------------+--------------------------------------------------------------+ -| IS_TITLE | The result of sic.istitle() | 
-+-------------+--------------------------------------------------------------+ -| IS_UPPER | The result of sic.isupper() | -+-------------+--------------------------------------------------------------+ -| LIKE_URL | Check whether the string looks like it could be a URL. Aims | -| | for low false negative rate. | -+-------------+--------------------------------------------------------------+ -| LIKE_NUMBER | Check whether the string looks like it could be a numeric | -| | entity, e.g. 10,000 10th .10 . Skews for low false negative | -| | rate. | -+-------------+--------------------------------------------------------------+ -| IN_LIST | Facility for loading arbitrary run-time word lists? | -+-------------+--------------------------------------------------------------+ diff --git a/docs/source/example_wsj0001.json b/docs/source/example_wsj0001.json deleted file mode 100644 index 25d1cf5c7..000000000 --- a/docs/source/example_wsj0001.json +++ /dev/null @@ -1,337 +0,0 @@ -{ - "id": "wsj_0001", - "paragraphs": [ - { - "raw": "Pierre Vinken, 61 years old, will join the board as a nonexecutive director Nov. 29. Mr. Vinken is chairman of Elsevier N.V., the Dutch publishing group.", - - "segmented": "Pierre Vinken, 61 years old, will join the board as a nonexecutive director Nov. 29.Mr. 
Vinken is chairman of Elsevier N.V., the Dutch publishing group.", - - "sents": [ - 0, - 85 - ], - - "tokens": [ - { - "dep": "NMOD", - "start": 0, - "head": 7, - "tag": "NNP", - "orth": "Pierre" - }, - { - "dep": "SUB", - "start": 7, - "head": 29, - "tag": "NNP", - "orth": "Vinken" - }, - { - "dep": "P", - "start": 13, - "head": 7, - "tag": ",", - "orth": "," - }, - { - "dep": "NMOD", - "start": 15, - "head": 18, - "tag": "CD", - "orth": "61" - }, - { - "dep": "AMOD", - "start": 18, - "head": 24, - "tag": "NNS", - "orth": "years" - }, - { - "dep": "NMOD", - "start": 24, - "head": 7, - "tag": "JJ", - "orth": "old" - }, - { - "dep": "P", - "start": 27, - "head": 7, - "tag": ",", - "orth": "," - }, - { - "dep": "ROOT", - "start": 29, - "head": -1, - "tag": "MD", - "orth": "will" - }, - { - "dep": "VC", - "start": 34, - "head": 29, - "tag": "VB", - "orth": "join" - }, - { - "dep": "NMOD", - "start": 39, - "head": 43, - "tag": "DT", - "orth": "the" - }, - { - "dep": "OBJ", - "start": 43, - "head": 34, - "tag": "NN", - "orth": "board" - }, - { - "dep": "VMOD", - "start": 49, - "head": 34, - "tag": "IN", - "orth": "as" - }, - { - "dep": "NMOD", - "start": 52, - "head": 67, - "tag": "DT", - "orth": "a" - }, - { - "dep": "NMOD", - "start": 54, - "head": 67, - "tag": "JJ", - "orth": "nonexecutive" - }, - { - "dep": "PMOD", - "start": 67, - "head": 49, - "tag": "NN", - "orth": "director" - }, - { - "dep": "VMOD", - "start": 76, - "head": 34, - "tag": "NNP", - "orth": "Nov." - }, - { - "dep": "NMOD", - "start": 81, - "head": 76, - "tag": "CD", - "orth": "29" - }, - { - "dep": "P", - "start": 83, - "head": 29, - "tag": ".", - "orth": "." - }, - { - "dep": "NMOD", - "start": 85, - "head": 89, - "tag": "NNP", - "orth": "Mr." 
- }, - { - "dep": "SUB", - "start": 89, - "head": 96, - "tag": "NNP", - "orth": "Vinken" - }, - { - "dep": "ROOT", - "start": 96, - "head": -1, - "tag": "VBZ", - "orth": "is" - }, - { - "dep": "PRD", - "start": 99, - "head": 96, - "tag": "NN", - "orth": "chairman" - }, - { - "dep": "NMOD", - "start": 108, - "head": 99, - "tag": "IN", - "orth": "of" - }, - { - "dep": "NMOD", - "start": 111, - "head": 120, - "tag": "NNP", - "orth": "Elsevier" - }, - { - "dep": "NMOD", - "start": 120, - "head": 147, - "tag": "NNP", - "orth": "N.V." - }, - { - "dep": "P", - "start": 124, - "head": 147, - "tag": ",", - "orth": "," - }, - { - "dep": "NMOD", - "start": 126, - "head": 147, - "tag": "DT", - "orth": "the" - }, - { - "dep": "NMOD", - "start": 130, - "head": 147, - "tag": "NNP", - "orth": "Dutch" - }, - { - "dep": "NMOD", - "start": 136, - "head": 147, - "tag": "VBG", - "orth": "publishing" - }, - { - "dep": "PMOD", - "start": 147, - "head": 108, - "tag": "NN", - "orth": "group" - }, - { - "dep": "P", - "start": 152, - "head": 96, - "tag": ".", - "orth": "." 
- } - ], - "brackets": [ - { - "start": 0, - "end": 7, - "label": "NP" - }, - { - "start": 15, - "end": 18, - "label": "NP" - }, - { - "start": 15, - "end": 24, - "label": "ADJP" - }, - { - "start": 0, - "end": 27, - "label": "NP-SBJ" - }, - { - "start": 39, - "end": 43, - "label": "NP" - }, - { - "start": 52, - "end": 67, - "label": "NP" - }, - { - "start": 49, - "end": 67, - "label": "PP-CLR" - }, - { - "start": 76, - "end": 81, - "label": "NP-TMP" - }, - { - "start": 34, - "end": 81, - "label": "VP" - }, - { - "start": 29, - "end": 81, - "label": "VP" - }, - { - "start": 0, - "end": 83, - "label": "S" - }, - { - "start": 85, - "end": 89, - "label": "NP-SBJ" - }, - { - "start": 99, - "end": 99, - "label": "NP" - }, - { - "start": 111, - "end": 120, - "label": "NP" - }, - { - "start": 126, - "end": 147, - "label": "NP" - }, - { - "start": 111, - "end": 147, - "label": "NP" - }, - { - "start": 108, - "end": 147, - "label": "PP" - }, - { - "start": 99, - "end": 147, - "label": "NP-PRD" - }, - { - "start": 96, - "end": 147, - "label": "VP" - }, - { - "start": 85, - "end": 152, - "label": "S" - } - ] - } - ] -} diff --git a/docs/source/howworks.rst b/docs/source/howworks.rst deleted file mode 100644 index 00d61d66d..000000000 --- a/docs/source/howworks.rst +++ /dev/null @@ -1,262 +0,0 @@ -How spaCy Works -=============== - -The following are some hasty preliminary notes on how spaCy works. The short -story is, there are no new killer algorithms. The way that the tokenizer works -is novel and a bit neat, and the parser has a new feature set, but otherwise -the key algorithms are well known in the recent literature. - -Some might also wonder how I get Python code to run so fast. I don't --- spaCy -is written in `Cython`_, an optionally statically-typed language that compiles -to C or C++, which is then loaded as a C extension module. -This makes it `easy to achieve the performance of native C code`_, but allows the -use of Python language features, via the Python C API. 
The Python unicode -library was particularly useful to me. I think it would have been much more -difficult to write spaCy in another language. - -.. _Cython: http://cython.org/ - -.. _easy to achieve the performance of native C code: https://honnibal.wordpress.com/2014/10/21/writing-c-in-cython/ - -Tokenizer and Lexicon ---------------------- - -Tokenization is the task of splitting a string into meaningful pieces, called -tokens, which you can then compute with. In practice, the task is usually to -match the tokenization performed in some treebank, or other corpus. If we want -to apply a tagger, entity recogniser, parser etc, then we want our run-time -text to match the training conventions. If we want to use a model that's been -trained to expect "isn't" to be split into two tokens, ["is", "n't"], then that's -how we need to prepare our data. - -In order to train spaCy's models with the best data available, I therefore -tokenize English according to the Penn Treebank scheme. It's not perfect, but -it's what everybody is using, and it's good enough. - -What we don't do -################ - -The Penn Treebank was distributed with a script called tokenizer.sed, which -tokenizes ASCII newswire text roughly according to the Penn Treebank standard. -Almost all tokenizers are based on these regular expressions, with various -updates to account for unicode characters, and the fact that it's no longer -1986 --- today's text has URLs, emails, emoji, etc. - -Usually, the resulting regular expressions are applied in multiple passes, which -is quite inefficient. Often no care is taken to preserve indices into the original -string. If you lose these indices, it'll be difficult to calculate mark-up based -on your annotations. - -Tokenizer Algorithm -################### - -spaCy's tokenizer assumes that no tokens will cross whitespace --- there will -be no multi-word tokens. If we want these, we can post-process the -token-stream later, merging as necessary. 
This assumption allows us to deal -only with small chunks of text. We can cache the processing of these, and -simplify our expressions somewhat. - -Here is what the outer-loop would look like in Python. (You can see the -production implementation, in Cython, here.) - -.. code:: python - - cache = {} - def tokenize(text): - tokens = [] - for substring in text.split(' '): - if substring in cache: - tokens.extend(cache[substring]) - else: - subtokens = _tokenize_substring(substring) - tokens.extend(subtokens) - cache[substring] = subtokens - return tokens - -The actual work is performed in _tokenize_substring. For this, I divide the -tokenization rules into three pieces: - -1. A prefixes expression, which matches from the start of the string; -2. A suffixes expression, which matches from the end of the string; -3. A special-cases table, which matches the whole string. - -The algorithm then proceeds roughly like this (consider this like pseudo-code; -this was written quickly and has not been executed): - -.. code:: python - - # Tokens which can be attached at the beginning or end of another - prefix_re = _make_re([",", '"', '(', ...]) - suffix_re = _make_re([",", "'", ":", "'s", ...]) - - # Contractions etc are simply enumerated, since they're a finite set. We - # can also specify anything we like here, which is nice --- different data - # has different quirks, so we want to be able to add ad hoc exceptions. - special_cases = { - "can't": ("ca", "n't"), - "won't": ("wo", "n't"), - "he'd've": ("he", "'d", "'ve"), - ... - ":)": (":)",) # We can add any arbitrary thing to this list.
- } - - def _tokenize_substring(substring): - prefixes = [] - suffixes = [] - while substring not in special_cases: - prefix, substring = _apply_re(substring, prefix_re) - if prefix: - prefixes.append(prefix) - else: - suffix, substring = _apply_re(substring, suffix_re) - if suffix: - suffixes.append(suffix) - else: - break - - -This procedure splits off tokens from the start and end of the string, at each -point checking whether the remaining string is in our special-cases table. If -it is, we stop splitting, and return the tokenization at that point. - -The advantage of this design is that the prefixes, suffixes and special-cases -can be declared separately, in easy-to-understand files. If a new entry is -added to the special-cases, you can be sure that it won't have some unforeseen -consequence to a complicated regular-expression grammar. - -Coupling the Tokenizer and Lexicon -################################## - -As mentioned above, the tokenizer is designed to support easy caching. If all -we were caching were the matched substrings, this would not be so advantageous. -Instead, what we do is create a struct which houses all of our lexical -features, and cache *that*. The tokens are then simply pointers to these rich -lexical types. - -In a sample of text, vocabulary size grows exponentially slower than word -count. So any computations we can perform over the vocabulary and apply to the -word count are efficient. - - -Part-of-speech Tagger ---------------------- - -.. _how to write a good part of speech tagger: https://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/ . - -In 2013, I wrote a blog post describing `how to write a good part of speech -tagger`_. -My recommendation then was to use greedy decoding with the averaged perceptron. -I think this is still the best approach, so it's what I implemented in spaCy. 
- -The tutorial also recommends the use of Brown cluster features, and case -normalization features, as these make the model more robust and domain -independent. spaCy's tagger makes heavy use of these features. - -Dependency Parser ------------------ - -.. _2014 blog post: https://honnibal.wordpress.com/2013/12/18/a-simple-fast-algorithm-for-natural-language-dependency-parsing/ - -The parser uses the algorithm described in my `2014 blog post`_. -This algorithm, shift-reduce dependency parsing, is becoming widely adopted due -to its compelling speed/accuracy trade-off. - -Some quick details about spaCy's take on this, for those who happen to know -these models well. I'll write up a better description shortly. - -1. I use greedy decoding, not beam search; -2. I use the arc-eager transition system; -3. I use the Goldberg and Nivre (2012) dynamic oracle. -4. I use the non-monotonic update from my CoNLL 2013 paper (Honnibal, Goldberg - and Johnson 2013). - -So far, this is exactly the configuration from the CoNLL 2013 paper, which -scored 91.0. So how have I gotten it to 92.4? The following tweaks: - -1. I use Brown cluster features --- these help a lot; -2. I redesigned the feature set. I've long known that the Zhang and Nivre - (2011) feature set was suboptimal, but a few features don't make a very - compelling publication. Still, they're important. -3. When I do the dynamic oracle training, I also make - the update cost-sensitive: if the oracle determines that the move the parser - took has a cost of N, then the weights for the gold class are incremented by - +N, and the weights for the predicted class are incremented by -N. This - only made a small (0.1-0.2%) difference. - -Implementation -############## - -I don't do anything algorithmically novel to improve the efficiency of the -parser. However, I was very careful in the implementation. - -A greedy shift-reduce parser with a linear model boils down to the following -loop: - -..
code:: python - - def parse(words, model, feature_funcs, n_classes): - state = init_state(words) - for _ in range(len(words) * 2): - features = [templ(state) for templ in feature_funcs] - scores = [0 for _ in range(n_classes)] - for feat in features: - weights = model[feat] - for i, weight in enumerate(weights): - scores[i] += weight - class_, score = max(enumerate(scores), key=lambda item: item[1]) - transition(state, class_) - -The parser makes 2N transitions for a sentence of length N. In order to select -the transition, it extracts a vector of K features from the state. Each feature -is used as a key into a hash table managed by the model. The features map to -a vector of weights, of length C. We then dot product the feature weights to the -scores vector we are building for that instance. - -The inner-most loop here is not so bad: we only have a few dozen classes, so -it's just a short dot product. Both of the vectors are in the cache, so this -is a snack to a modern CPU. - -The bottle-neck in this algorithm is the 2NK look-ups into the hash-table that -we must make, as these almost always have to hit main memory. The feature-set -is enormously large, because all of our features are one-hot boolean -indicators. Some of the features will be common, so they'll lurk around in the -CPU's cache hierarchy. But a lot of them won't be, and accessing main memory -takes a lot of cycles. - -.. _Jeff Preshing's excellent post: http://preshing.com/20130107/this-hash-table-is-faster-than-a-judy-array/ . - -I used to use the Google dense_hash_map implementation. This seemed a solid -choice: it came from a big brand, it was in C++, and it seemed very -complicated. Later, I read `Jeff Preshing's excellent post`_ on open-addressing -with linear probing. -This really spoke to me. I had assumed that a fast hash table implementation -would necessarily be very complicated, but no --- this is another situation -where the simple strategy wins. 
- -I've packaged my Cython implementation separately from spaCy, in the package -`preshed`_ --- for "pre-hashed", but also as a nod to Preshing. I've also taken -great care over the feature extraction and perceptron code, which I'm distributing -in a package named `thinc`_ (since it's for learning very sparse models with -Cython). - -.. _preshed: https://github.com/syllog1sm/preshed - -.. _thinc: https://github.com/honnibal/thinc - -By the way: from comparing notes with a few people, it seems common to -implement linear models in a way that's suboptimal for multi-class -classification. The mistake is to store in the hash-table one weight per -(feature, class) pair, rather than mapping the feature to a vector of weights, -for all of the classes. This is bad because it means you need to hit the table -C times, one per class, as you always need to evaluate a feature against all of -the classes. In the case of the parser, this means the hash table is accessed -2NKC times, instead of the 2NK times if you have a weights vector. You should -also be careful to store the weights contiguously in memory --- you don't want -a linked list here. I use a block-sparse format, because my problems tend to -have a few dozen classes. - -I guess if I had to summarize my experience, I'd say that the efficiency of -these models is really all about the data structures. We want to stay small, -and stay contiguous. Minimize redundancy and minimize pointer chasing. -That's why Cython is so well suited to this: we get to lay out our data -structures, and manage the memory ourselves, with full C-level control. diff --git a/docs/source/index.rst b/docs/source/index.rst deleted file mode 100644 index 50333f74b..000000000 --- a/docs/source/index.rst +++ /dev/null @@ -1,51 +0,0 @@ -.. spaCy documentation master file, created by - sphinx-quickstart on Tue Aug 19 16:27:38 2014. - You can adapt this file completely to your liking, but it should at least - contain the root `toctree` directive. 
- -============================== -spaCy: Industrial-strength NLP -============================== - -`spaCy`_ is a library for building tomorrow's language technology products. -It's like Stanford's CoreNLP for Python, but with a fundamentally different -objective. While CoreNLP is primarily built for conducting research, spaCy is -designed for application. - -If you're a small company doing NLP, I think spaCy will seem like a minor miracle. -It's by far the fastest NLP software ever released. -The full processing pipeline completes in under 50ms per document, including accurate -tagging, entity recognition and parsing. All strings are mapped to integer IDs, -tokens are linked to embedded word representations, and a range of useful features -are pre-calculated and cached. The full analysis can be exported to numpy -arrays, or losslessly serialized into binary data smaller than the raw text. - -If none of that made any sense to you, here's the gist of it. Computers don't -understand text. This is unfortunate, because that's what the web almost entirely -consists of. We want to recommend people text based on other text they liked. -We want to shorten text to display it on a mobile screen. We want to aggregate -it, link it, filter it, categorise it, generate it and correct it. - -spaCy provides a library of utility functions that help programmers build such -products. It's commercial open source software: you can either use it under -the AGPL, or you can `buy a commercial license`_ for a one-time fee. - - -.. _spaCy: https://github.com/honnibal/spaCy/ - -.. _Issue Tracker: https://github.com/honnibal/spaCy/issues - -**2015-07-08**: `Version 0.89 released`_ - -.. _Version 0.89 released: updates.html - -.. _buy a commercial license: license.html - -.. 
toctree:: - :maxdepth: 4 - :hidden: - - quickstart.rst - reference/index.rst - license.rst - updates.rst diff --git a/docs/source/license.rst b/docs/source/license.rst deleted file mode 100644 index 7f3b55418..000000000 --- a/docs/source/license.rst +++ /dev/null @@ -1,126 +0,0 @@ -======= -License -======= - -* Download the `license agreement`_ -* Get in touch: matt@spacy.io - -.. _license agreement: spacy_trial_free.docx - - - +------------+-----------+----------+-------------------------------------+ - | License | Price | Term | Suitable for | - +============+===========+==========+=====================================+ - | Commercial | $5,000 | Life | Production use | - +------------+-----------+----------+-------------------------------------+ - | Trial | $0 | 90 days | Evaluation, seed startup | - +------------+-----------+----------+-------------------------------------+ - | AGPLv3 | Free | Life | Research, teaching, hobbyists, FOSS | - +------------+-----------+----------+-------------------------------------+ - - -To make spaCy as valuable as possible, licenses to it are for life. You get -complete transparency, certainty and control. -If you need to use spaCy as an API, it's trivial to host it yourself --- and -you don't need to worry about the service changing or disappearing. -And if you're ever in acquisition or IPO talks, the story is simple. - -spaCy can also be used as free open-source software, under the Affero GPL -license. If you use it this way, you must comply with the AGPL license terms. -When you distribute your project, or offer it as a network service, you must -distribute the source-code and grant users an AGPL license to it. - - -.. I left academia in June 2014, just when I should have been submitting my first - grant proposal. Grant writing seemed a bad business model.
I wasn't sure - exactly what I would do instead, but I knew that the work I could do was - valuable, and that it would make sense for people to pay me to do it, and that - it's often easy to convince smart people of things that are true. - -.. I left because I don't like the grant system. It's not the - best way to create value, and it's not the best way to get paid. - - -Examples --------- - -In order to clarify how spaCy's license structure might apply to you, I've -written a few examples, in the form of user-stories. - -Ashley and Casey: Seed stage start-up -##################################### - -Ashley and Casey have an idea for a start-up. To explore their idea, they want -to build a minimum viable product they can put in front of potential users and -investors. - -They have two options. - - 1. **Trial commercial license.** With a simple form, they can use spaCy for 90 - days, for a nominal fee of $1. They are free to modify spaCy, and they - will own the copyright to their modifications for the duration of the license. - After the trial period elapses, they can either pay the license fee, stop - using spaCy, or release their project under the AGPL. - - 2. **AGPL.** Ashley and Casey can instead use spaCy under the AGPL license. - However, they must then release any code that statically or dynamically - links to spaCy under the AGPL as well (e.g. if they import the module, or - import a module that imports it, etc). They also cannot use spaCy as - a network resource, by running it as a service --- this is the - loophole that the "A" part of the AGPL is designed to close. - -Ashley and Casey find the AGPL license unattractive for commercial use. -They decide to take up the trial commercial license. -However, over the next 90 days, Ashley has to move house twice, and Casey gets -sick. By the time the trial expires, they still don't have a demo they can show -investors.
They send an email explaining the situation, and a 90 day extension -to their trial license is granted. - -By the time the extension period has elapsed, spaCy has helped them secure -funding, and they even have a little revenue. They are glad to pay the $5,000 -commercial license fee. - -spaCy is now permanently licensed for the product Ashley and Casey are -developing. They own the copyright to any modifications they make to spaCy, -but not to the original spaCy code. - -No additional fees will be due when they hire new developers, run spaCy on -additional internal servers, etc. If their company is acquired, the license will -be transferred to the company acquiring them. However, to use spaCy in another -product, they will have to buy a second license. - - -Alex and Sasha: University Academics -#################################### - -Alex and Sasha are post-doctoral researchers working for a university. Part of -their funding comes from a grant from Google, but Google will not own any part -of the work that they produce. Their mission is just to write papers. - -Alex and Sasha find spaCy convenient, so they use it in their system under the -AGPL. This means that their system must also be released under the AGPL, but they're -cool with that --- they were going to release their code anyway, as it's the only -way to ensure their experiments are properly repeatable. - -Alex and Sasha find and fix a few bugs in spaCy. They must release these -modifications, and they ask that they be accepted into the main spaCy repo. -In order to do this, they must sign a contributor agreement, ceding their -copyright. When commercial licenses to spaCy are sold, Alex and Sasha will -not be able to claim any royalties from their contributions. - -Later, Alex and Sasha implement new features into spaCy, for another paper. The -code was quite rushed, and they don't want to take the time to put together a -proper pull request. 
They must release their modifications under the AGPL, but -they are not obliged to contribute it to the spaCy repository, or concede their -copyright. - - -Phuong and Jessie: Open Source developers -######################################### - -Phuong and Jessie use the open-source software Calibre to manage their e-book -libraries. They have an idea for a search feature, and they want to use spaCy -to implement it. Calibre is released under the GPLv3. The AGPL has additional -restrictions for projects used as a network resource, but they don't apply to -this project, so Phuong and Jessie can use spaCy to improve Calibre. They'll -have to release their code, but that was always their intention anyway. diff --git a/docs/source/quickstart.rst b/docs/source/quickstart.rst deleted file mode 100644 index ba1c24626..000000000 --- a/docs/source/quickstart.rst +++ /dev/null @@ -1,236 +0,0 @@ -Quick Start -=========== - - -Install -------- - -.. py:currentmodule:: spacy - - -With Python 2.7 or Python 3, using Linux or OSX, run: - -.. code:: bash - - $ pip install spacy - $ python -m spacy.en.download - -.. _300 mb of data: http://s3-us-west-1.amazonaws.com/media.spacynlp.com/en_data_all-0.4.tgz - - -The download command fetches and installs about 300mb of data, for the -parser model and word vectors, which it installs within the spacy.en package directory. - -If you're stuck using a server with an old version of Python, and you don't -have root access, I've prepared a bootstrap script to help you compile a local -Python install. Run: - -.. code:: bash - - $ curl https://raw.githubusercontent.com/honnibal/spaCy/master/bootstrap_python_env.sh | bash && source .env/bin/activate - -The other way to install the package is to clone the github repository, and -build it from source. This installs an additional dependency, Cython. -If you're using Python 2, I also recommend installing fabric and fabtools --- -this is how I build the project. - -.. 
code:: bash - - $ git clone https://github.com/honnibal/spaCy.git - $ cd spaCy - $ virtualenv .env && source .env/bin/activate - $ export PYTHONPATH=`pwd` - $ pip install -r requirements.txt - $ python setup.py build_ext --inplace - $ python -m spacy.en.download - $ pip install pytest - $ py.test tests/ - -Python packaging is awkward at the best of times, and it's particularly tricky -with C extensions, built via Cython, requiring large data files. So, please -report issues as you encounter them, and bear with me :) - -Usage ------ - -The main entry-point is :meth:`en.English.__call__`, which accepts a unicode string -as an argument, and returns a :py:class:`tokens.Doc` object. You can -iterate over it to get :py:class:`tokens.Token` objects, which provide -a convenient API: - - >>> from __future__ import unicode_literals # If Python 2 - >>> from spacy.en import English - >>> nlp = English() - >>> tokens = nlp(u'I ate the pizza with anchovies.') - >>> pizza = tokens[3] - >>> (pizza.orth, pizza.orth_, pizza.head.lemma, pizza.head.lemma_) - ... (14702, u'pizza', 14702, u'eat') - -spaCy maps all strings to sequential integer IDs --- a common trick in NLP. -If an attribute `Token.foo` is an integer ID, then `Token.foo_` is the string, -e.g. `pizza.orth` and `pizza.orth_` provide the integer ID and the string of -the original orthographic form of the word. - - .. note:: en.English.__call__ is stateful --- it has an important **side-effect**. - - When it processes a previously unseen word, it increments the ID counter, - assigns the ID to the string, and writes the mapping in - :py:data:`English.vocab.strings` (instance of - :py:class:`strings.StringStore`). - Future releases will feature a way to reconcile mappings, but for now, you - should only work with one instance of the pipeline at a time. - - -(Most of the) API at a glance ------------------------------ - -**Process the string:** - - .. 
py:class:: spacy.en.English(self, data_dir=join(dirname(__file__), 'data')) - - .. py:method:: __call__(self, text: unicode, tag=True, parse=True, entity=True, merge_mwes=False) --> Doc - - +-----------------+--------------+--------------+ - | Attribute | Type | Its API | - +=================+==============+==============+ - | vocab | Vocab | __getitem__ | - +-----------------+--------------+--------------+ - | vocab.strings | StringStore | __getitem__ | - +-----------------+--------------+--------------+ - | tokenizer | Tokenizer | __call__ | - +-----------------+--------------+--------------+ - | tagger | EnPosTagger | __call__ | - +-----------------+--------------+--------------+ - | parser | GreedyParser | __call__ | - +-----------------+--------------+--------------+ - | entity | GreedyParser | __call__ | - +-----------------+--------------+--------------+ - -**Get dict or numpy array:** - - .. py:method:: tokens.Doc.to_array(self, attr_ids: List[int]) --> ndarray[ndim=2, dtype=long] - - .. py:method:: tokens.Doc.count_by(self, attr_id: int) --> Dict[int, int] - -**Get Token objects** - - .. py:method:: tokens.Doc.__getitem__(self, i) --> Token - - .. py:method:: tokens.Doc.__iter__(self) --> Iterator[Token] - -**Get sentence or named entity spans** - - .. py:attribute:: tokens.Doc.sents --> Iterator[Span] - - .. py:attribute:: tokens.Doc.ents --> Iterator[Span] - - You can iterate over a Span to access individual Token objects, or access its - start, end or label. - - -**Embedded word representations** - - .. py:attribute:: tokens.Token.repvec - - .. py:attribute:: lexeme.Lexeme.repvec - - -**Navigate to tree- or string-neighbor tokens** - - .. py:method:: nbor(self, i=1) --> Token - - .. py:method:: child(self, i=1) --> Token - - .. py:method:: sibling(self, i=1) --> Token - - .. py:attribute:: head: Token - - .. py:attribute:: dep: int - -**Align to original string** - - .. py:attribute:: string: unicode - - Padded with original whitespace. - - ..
py:attribute:: length: int - - Length, in unicode code-points. Equal to len(self.orth_). - - .. py:attribute:: idx: int - - Starting offset of word in the original string. - - -Features --------- - - -**Boolean features** - - >>> lexeme = nlp.vocab[u'Apple'] - >>> lexeme.is_alpha, lexeme.is_upper - True, False - >>> tokens = nlp(u'Apple computers') - >>> tokens[0].is_alpha, tokens[0].is_upper - True, False - >>> from spacy.en.attrs import IS_ALPHA, IS_UPPER - >>> tokens.to_array((IS_ALPHA, IS_UPPER))[0] - array([1, 0]) - - +----------+---------------------------------------------------------------+ - | is_alpha | :py:meth:`str.isalpha` | - +----------+---------------------------------------------------------------+ - | is_digit | :py:meth:`str.isdigit` | - +----------+---------------------------------------------------------------+ - | is_lower | :py:meth:`str.islower` | - +----------+---------------------------------------------------------------+ - | is_title | :py:meth:`str.istitle` | - +----------+---------------------------------------------------------------+ - | is_upper | :py:meth:`str.isupper` | - +----------+---------------------------------------------------------------+ - | is_ascii | all(ord(c) < 128 for c in string) | - +----------+---------------------------------------------------------------+ - | is_punct | all(unicodedata.category(c).startswith('P') for c in string) | - +----------+---------------------------------------------------------------+ - | like_url | Using various heuristics, does the string resemble a URL? | - +----------+---------------------------------------------------------------+ - | like_num | "Two", "10", "1,000", "10.54", "1/2" etc all match | - +----------+---------------------------------------------------------------+ - -**String-transform Features** - - - +----------+---------------------------------------------------------------+ - | orth | The original string, unmodified.
| - +----------+---------------------------------------------------------------+ - | lower | The original string, forced to lower-case | - +----------+---------------------------------------------------------------+ - | norm | The string after additional normalization | - +----------+---------------------------------------------------------------+ - | shape | Word shape, e.g. 10 --> dd, Garden --> Xxxx, Hi!5 --> Xx!d | - +----------+---------------------------------------------------------------+ - | prefix | A short slice from the start of the string. | - +----------+---------------------------------------------------------------+ - | suffix | A short slice from the end of the string. | - +----------+---------------------------------------------------------------+ - | lemma | The word's lemma, i.e. morphological suffixes removed | - +----------+---------------------------------------------------------------+ - -**Syntactic labels** - - +----------+---------------------------------------------------------------+ - | pos | The word's part-of-speech, from the Google Universal Tag Set | - +----------+---------------------------------------------------------------+ - | tag | A fine-grained morphosyntactic tag, e.g. VBZ, NNS, etc | - +----------+---------------------------------------------------------------+ - | dep | Dependency type label between word and its head, e.g. 
subj | - +----------+---------------------------------------------------------------+ - -**Distributional** - - +---------+-----------------------------------------------------------+ - | cluster | Brown cluster ID of the word | - +---------+-----------------------------------------------------------+ - | prob | Log probability of word, smoothed with Simple Good-Turing | - +---------+-----------------------------------------------------------+ diff --git a/docs/source/reference/annotation.rst b/docs/source/reference/annotation.rst deleted file mode 100644 index c19e70bbd..000000000 --- a/docs/source/reference/annotation.rst +++ /dev/null @@ -1,116 +0,0 @@ -==================== -Annotation Standards -==================== - -This document describes the target annotations spaCy is trained to predict. - -This is currently a work in progress. Please ask questions on the issue tracker, -so that the answers can be integrated here to improve the documentation. - -https://github.com/honnibal/spaCy/issues - -English -======= - -Tokenization ------------- - -Tokenization standards are based on the OntoNotes 5 corpus. - -The tokenizer differs from most by including tokens for significant whitespace. -Any sequence of whitespace characters beyond a single space (' ') is included -as a token. For instance: - - >>> from spacy.en import English - >>> nlp = English(parse=False) - >>> tokens = nlp(u'Some\nspaces and\ttab characters') - >>> print [t.orth_ for t in tokens] - [u'Some', u'\n', u'spaces', u' ', u'and', u'\t', u'tab', u'characters'] - -The whitespace tokens are useful for much the same reason punctuation is --- it's -often an important delimiter in the text. By preserving it in the token output, -we are able to maintain a simple alignment between the tokens and the original -string, and we ensure that the token stream does not lose information. 
- -Sentence boundary detection ---------------------------- - -Sentence boundaries are calculated from the syntactic parse tree, so features -such as punctuation and capitalisation play an important but non-decisive role -in determining the sentence boundaries. Usually this means that the sentence -boundaries will at least coincide with clause boundaries, even given poorly -punctuated text. - -Part-of-speech Tagging ----------------------- - -The part-of-speech tagger uses the OntoNotes 5 version of the Penn Treebank -tag set. We also map the tags to the simpler Google Universal POS Tag set. - -Details here: https://github.com/honnibal/spaCy/blob/master/spacy/en/pos.pyx#L124 - -Lemmatization -------------- - -A "lemma" is the uninflected form of a word. In English, this means: - -* Adjectives: The form like "happy", not "happier" or "happiest" -* Adverbs: The form like "badly", not "worse" or "worst" -* Nouns: The form like "dog", not "dogs"; like "child", not "children" -* Verbs: The form like "write", not "writes", "writing", "wrote" or "written" - -The lemmatization data is taken from WordNet. However, we also add a special -case for pronouns: all pronouns are lemmatized to the special token -PRON-. - -Syntactic Dependency Parsing ----------------------------- - -The parser is trained on data produced by the ClearNLP converter. Details of -the annotation scheme can be found here: - -http://www.mathcs.emory.edu/~choi/doc/clear-dependency-2012.pdf - -Named Entity Recognition ------------------------- - - +--------------+-----------------------------------------------------+ - | PERSON | People, including fictional | - +--------------+-----------------------------------------------------+ - | NORP | Nationalities or religious or political groups | - +--------------+-----------------------------------------------------+ - | FACILITY | Buildings, airports, highways, bridges, etc. 
| - +--------------+-----------------------------------------------------+ - | ORGANIZATION | Companies, agencies, institutions, etc. | - +--------------+-----------------------------------------------------+ - | GPE | Countries, cities, states | - +--------------+-----------------------------------------------------+ - | LOCATION | Non-GPE locations, mountain ranges, bodies of water | - +--------------+-----------------------------------------------------+ - | PRODUCT | Vehicles, weapons, foods, etc. (Not services) | - +--------------+-----------------------------------------------------+ - | EVENT | Named hurricanes, battles, wars, sports events, etc.| - +--------------+-----------------------------------------------------+ - | WORK OF ART | Titles of books, songs, etc. | - +--------------+-----------------------------------------------------+ - | LAW | Named documents made into laws | - +--------------+-----------------------------------------------------+ - | LANGUAGE | Any named language | - +--------------+-----------------------------------------------------+ - -The following values are also annotated in a style similar to names: - - +--------------+---------------------------------------------+ - | DATE | Absolute or relative dates or periods | - +--------------+---------------------------------------------+ - | TIME | Times smaller than a day | - +--------------+---------------------------------------------+ - | PERCENT | Percentage (including “%”) | - +--------------+---------------------------------------------+ - | MONEY | Monetary values, including unit | - +--------------+---------------------------------------------+ - | QUANTITY | Measurements, as of weight or distance | - +--------------+---------------------------------------------+ - | ORDINAL | "first", "second" | - +--------------+---------------------------------------------+ - | CARDINAL | Numerals that do not fall under another type| - 
+--------------+---------------------------------------------+ diff --git a/docs/source/reference/index.rst b/docs/source/reference/index.rst deleted file mode 100644 index 66050d2de..000000000 --- a/docs/source/reference/index.rst +++ /dev/null @@ -1,112 +0,0 @@ -============= -Documentation -============= - -The table below shows every class in spaCy: a link to its documentation, implementation, -and a small usage snippet. - - - +----------------+--------------------------+--------------------------------+ - | Class name | Usage | Implementation | - +================+==========================+================================+ - | `English`_ | nlp = English() | `spacy/en/__init__.py`_ | - +----------------+--------------------------+--------------------------------+ - | Data objects | - +----------------+--------------------------+--------------------------------+ - | `Doc`_ | doc = nlp(text) | `spacy/doc.pyx`_ | - +----------------+--------------------------+--------------------------------+ - | `Token`_ | token = doc[10] | `spacy/token.pyx`_ | - +----------------+--------------------------+--------------------------------+ - | `Span`_ | sent = doc.sents.next() | `spacy/span.pyx`_ | - +----------------+--------------------------+--------------------------------+ - | `Lexeme`_ | lex = nlp.vocab[u'word'] | `spacy/lexeme.pyx`_ | - +----------------+--------------------------+--------------------------------+ - | Lookup tables | - +----------------+--------------------------+--------------------------------+ - | `Vocab`_ | nlp.vocab | `spacy/vocab.pyx`_ | - +----------------+--------------------------+--------------------------------+ - | `StringStore`_ | nlp.vocab.strings | `spacy/strings.pyx`_ | - +----------------+--------------------------+--------------------------------+ - | Processing modules | - +----------------+--------------------------+--------------------------------+ - | `Tokenizer`_ | nlp.tokenizer | `spacy/tokenizer.pyx`_ | -
+----------------+--------------------------+--------------------------------+ - | `EnPosTagger`_ | nlp.tagger | `spacy/en/pos.pyx`_ | - +----------------+--------------------------+--------------------------------+ - | `Parser`_ | nlp.parser | `spacy/syntax/parser.pyx`_ | - +----------------+--------------------------+--------------------------------+ - | Parser internals | - +----------------+--------------------------+--------------------------------+ - | ArcEager | | spacy/syntax/arc_eager.pyx | - +----------------+--------------------------+--------------------------------+ - | BiluoPushDown | | spacy/syntax/ner.pyx | - +----------------+--------------------------+--------------------------------+ - | StateClass | | spacy/syntax/stateclass.pyx | - +----------------+--------------------------+--------------------------------+ - | Research Utilities | - +----------------+--------------------------+--------------------------------+ - | `GoldParse`_ | | `spacy/gold.pyx`_ | - +----------------+--------------------------+--------------------------------+ - | `Scorer`_ | | `spacy/scorer.py`_ | - +----------------+--------------------------+--------------------------------+ - - -.. toctree:: - :maxdepth: 4 - - loading.rst - processing.rst - using/document.rst - using/span.rst - using/token.rst - using/lexeme.rst - - -.. _English: processing.html - -.. _Doc: using/doc.html - -.. _Token: using/token.html - -.. _Span: using/span.html - -.. _Lexeme: using/lexeme.html - -.. _Vocab: lookup.html - -.. _StringStore: lookup.html - -.. _Tokenizer: processing.html - -.. _EnPosTagger: processing.html - -.. _Parser: processing.html - -.. _Scorer: misc.html - -.. _GoldParse: misc.html - - -.. _spacy/en/__init__.py: https://github.com/honnibal/spaCy/tree/master/spacy/en/__init__.py - -.. _spacy/doc.pyx: https://github.com/honnibal/spaCy/tree/master/spacy/tokens.pyx - -.. _spacy/token.pyx: https://github.com/honnibal/spaCy/tree/master/spacy/tokens.pyx - -.. 
_spacy/span.pyx: https://github.com/honnibal/spaCy/tree/master/spacy/spans.pyx - -.. _spacy/vocab.pyx: https://github.com/honnibal/spaCy/tree/master/spacy/vocab.pyx - -.. _spacy/strings.pyx: https://github.com/honnibal/spaCy/tree/master/spacy/strings.pyx - -.. _spacy/tokenizer.pyx: https://github.com/honnibal/spaCy/tree/master/spacy/tokenizer.pyx - -.. _spacy/en/pos.pyx: https://github.com/honnibal/spaCy/tree/master/spacy/en/pos.pyx - -.. _spacy/syntax/parser.pyx: https://github.com/honnibal/spaCy/tree/master/spacy/syntax/parser.pyx - -.. _spacy/lexeme.pyx: https://github.com/honnibal/spaCy/tree/master/spacy/lexeme.pyx - -.. _spacy/gold.pyx: https://github.com/honnibal/spaCy/tree/master/spacy/gold.pyx - -.. _spacy/scorer.py: https://github.com/honnibal/spaCy/tree/master/spacy/scorer.py diff --git a/docs/source/reference/loading.rst b/docs/source/reference/loading.rst deleted file mode 100644 index 83f3aaf5f..000000000 --- a/docs/source/reference/loading.rst +++ /dev/null @@ -1,62 +0,0 @@ -================= -Loading Resources -================= - -99\% of the time, you will load spaCy's resources using a language pipeline class, -e.g. `spacy.en.English`. The pipeline class reads the data from disk, from a -specified directory. By default, spaCy installs data into each language's -package directory, and loads it from there. - -Usually, this is all you will need: - - >>> from spacy.en import English - >>> nlp = English() - -If you need to replace some of the components, you may want to just make your -own pipeline class --- the English class itself does almost no work; it just -applies the modules in order. You can also provide a function or class that -produces a tokenizer, tagger, parser or entity recognizer to :code:`English.__init__`, -to customize the pipeline: - - >>> from spacy.en import English - >>> from my_module import MyTagger - >>> nlp = English(Tagger=MyTagger) - -In more detail: - -.. 
code:: - - class English(object): - def __init__(self, - data_dir=path.join(path.dirname(__file__), 'data'), - Tokenizer=Tokenizer.from_dir, - Tagger=EnPosTagger, - Parser=CreateParser(ArcEager), - Entity=CreateParser(BiluoNER), - load_vectors=True - ): - -:code:`data_dir` - :code:`unicode path` - - The data directory. May be None, to disable any data loading (including - the vocabulary). - -:code:`Tokenizer` - :code:`(Vocab vocab, unicode data_dir)(unicode) --> Doc` - - A class/function that creates the tokenizer. - -:code:`Tagger` / :code:`Parser` / :code:`Entity` - :code:`(Vocab vocab, unicode data_dir)(Doc) --> None` - - A class/function that creates the part-of-speech tagger / - syntactic dependency parser / named entity recogniser. - May be None or False, to disable the corresponding component. - -:code:`load_vectors` - :code:`bool` - A boolean value to control whether the word vectors are loaded. - - - diff --git a/docs/source/reference/lookup.rst b/docs/source/reference/lookup.rst deleted file mode 100644 index 0b6b9bb89..000000000 --- a/docs/source/reference/lookup.rst +++ /dev/null @@ -1,49 +0,0 @@ -Lexical Lookup --------------- - -Where possible, spaCy computes information over lexical *types*, rather than -*tokens*. If you process a large batch of text, the number of unique types -you will see will grow exponentially slower than the number of tokens --- so -it's much more efficient to compute over types. And, in small samples, we generally -want to know about the distribution of a word in the language at large --- -which again, is type-based information. - -You can access the lexical features via the Token object, but you can also look them -up in the vocabulary directly: - - >>> from spacy.en import English - >>> nlp = English() - >>> lexeme = nlp.vocab[u'Amazon'] - -.. py:class:: vocab.Vocab(self, data_dir=None, lex_props_getter=None) - - .. py:method:: __len__(self) --> int - - .. py:method:: __getitem__(self, id: int) --> unicode - - ..
py:method:: __getitem__(self, string: unicode) --> int - - .. py:method:: __setitem__(self, py_str: unicode, props: Dict[str, Union[int, float]]) --> None - - .. py:method:: dump(self, loc: unicode) --> None - - .. py:method:: load_lexemes(self, loc: unicode) --> None - - .. py:method:: load_vectors(self, loc: unicode) --> None - - -.. py:class:: strings.StringStore(self) - - .. py:method:: __len__(self) --> int - - .. py:method:: __getitem__(self, id: int) --> unicode - - .. py:method:: __getitem__(self, string: bytes) --> id - - .. py:method:: __getitem__(self, string: unicode) --> id - - .. py:method:: dump(self, loc: unicode) --> None - - .. py:method:: load(self, loc: unicode) --> None - - diff --git a/docs/source/reference/processing.rst b/docs/source/reference/processing.rst deleted file mode 100644 index 5b53c26d6..000000000 --- a/docs/source/reference/processing.rst +++ /dev/null @@ -1,67 +0,0 @@ -=============== -Processing Text -=============== - -The text processing API is very small and simple. Everything is a callable object, -and you will almost always apply the pipeline all at once. - -Applying a pipeline -------------------- - - -.. py:method:: English.__call__(text, tag=True, parse=True, entity=True) --> Doc - - -text (unicode) - The text to be processed. No pre-processing needs to be applied, and any - length of text can be submitted. Usually you will submit a whole document. - Text may be zero-length. An exception is raised if byte strings are supplied. - -tag (bool) - Whether to apply the part-of-speech tagger. Required for parsing and entity recognition. - -parse (bool) - Whether to apply the syntactic dependency parser. - -entity (bool) - Whether to apply the named entity recognizer. - - -**Examples** - - >>> from spacy.en import English - >>> nlp = English() - >>> doc = nlp(u'Some text.')
# Applies tagger, parser, entity - >>> doc = nlp(u'Some text.', parse=False) # Applies tagger and entity, not parser - >>> doc = nlp(u'Some text.', entity=False) # Applies tagger and parser, not entity - >>> doc = nlp(u'Some text.', tag=False) # Does not apply tagger, entity or parser - >>> doc = nlp(u'') # Zero-length tokens, not an error - >>> doc = nlp(b'Some text') # Error: need unicode - Traceback (most recent call last): - File "<stdin>", line 1, in <module> - File "spacy/en/__init__.py", line 128, in __call__ - tokens = self.tokenizer(text) - TypeError: Argument 'string' has incorrect type (expected unicode, got str) - >>> doc = nlp(b'Some text'.decode('utf8')) # Decode to unicode first. - >>> - - -Tokenizer ---------- - - -.. autoclass:: spacy.tokenizer.Tokenizer - :members: - - -Tagger ------- - -.. autoclass:: spacy.en.pos.EnPosTagger - :members: - -Parser and Entity Recognizer ----------------------------- - -.. autoclass:: spacy.syntax.parser.Parser - :members: diff --git a/docs/source/reference/using/document.rst b/docs/source/reference/using/document.rst deleted file mode 100644 index d4852c98c..000000000 --- a/docs/source/reference/using/document.rst +++ /dev/null @@ -1,100 +0,0 @@ -============== -The Doc Object -============== - -.. autoclass:: spacy.tokens.Tokens - -:code:`__getitem__`, :code:`__iter__`, :code:`__len__` - The Tokens class behaves as a Python sequence, supporting the usual operators, - len(), etc. Negative indexing is supported. Slices are supported as of v0.89. - - .. code:: - - >>> tokens = nlp(u'Zero one two three four five six') - >>> tokens[0].orth_ - u'Zero' - >>> tokens[-1].orth_ - u'six' - >>> span = tokens[0:4] - >>> [w.orth_ for w in span] - [u'Zero', u'one', u'two', u'three'] - >>> span.string - u'Zero one two three' - -:code:`sents` - Iterate over sentences in the document. Each sentence is a Span object. - -:code:`ents` - Iterate over entities in the document. Each entity is a Span object.
- -:code:`to_array` - Given a list of M attribute IDs, export the tokens to a numpy ndarray - of shape N*M, where N is the length of the sentence. - - Arguments: - attr_ids (list[int]): A list of attribute ID ints. - - Returns: - feat_array (numpy.ndarray[long, ndim=2]): - A feature matrix, with one row per word, and one column per attribute - indicated in the input attr_ids. - -:code:`count_by` - Produce a dict of {attribute (int): count (ints)} frequencies, keyed - by the values of the given attribute ID. - - >>> from spacy.en import English, attrs - >>> nlp = English() - >>> tokens = nlp(u'apple apple orange banana') - >>> tokens.count_by(attrs.ORTH) - {12800L: 1, 11880L: 2, 7561L: 1} - >>> tokens.to_array([attrs.ORTH]) - array([[11880], - [11880], - [ 7561], - [12800]]) - -:code:`merge` - Merge a multi-word expression into a single token. Currently - experimental; API is likely to change. - -:code:`to_bytes()` - Get a byte-string representation of the document, i.e. serialize. - -:code:`from_bytes(self, byte_string)` - Load data from a byte-string, i.e. deserialize - -:code:`Doc.read_bytes` - A staticmethod, used to read bytes from a file. - - -Example of serialization: - -:: - - doc1 = EN(u'This is a simple test. With a couple of sentences.') - doc2 = EN(u'This is another test document.') - - with open('/tmp/spacy_docs.bin', 'wb') as file_: - file_.write(doc1.to_bytes()) - file_.write(doc2.to_bytes()) - - with open('/tmp/spacy_docs.bin', 'rb') as file_: - bytes1, bytes2 = Doc.read_bytes(file_) - r1 = Doc(EN.vocab).from_bytes(bytes1) - r2 = Doc(EN.vocab).from_bytes(bytes2) - - assert r1.string == doc1.string - assert r2.string == doc2.string - - -Internals - A Tokens instance stores the annotations in a C-array of `TokenC` structs. - Each TokenC struct holds a const pointer to a LexemeC struct, which describes - a vocabulary item. - - The Token objects are built lazily, from this underlying C-data. 
- - For faster access, the underlying C data can be accessed from Cython. You - can also export the data to a numpy array, via `Tokens.to_array`, if pure Python - access is required, and you need slightly better performance. diff --git a/docs/source/reference/using/index.rst b/docs/source/reference/using/index.rst deleted file mode 100644 index cf8b0cde2..000000000 --- a/docs/source/reference/using/index.rst +++ /dev/null @@ -1,11 +0,0 @@ -================== -Annotation Objects -================== - - -.. toctree:: - :maxdepth: 3 - - document.rst - token.rst - span.rst diff --git a/docs/source/reference/using/span.rst b/docs/source/reference/using/span.rst deleted file mode 100644 index 3aa19c469..000000000 --- a/docs/source/reference/using/span.rst +++ /dev/null @@ -1,32 +0,0 @@ -=============== -The Span Object -=============== - -.. autoclass:: spacy.spans.Span - -:code:`__getitem__`, :code:`__iter__`, :code:`__len__` - Sequence API - -:code:`head` - Syntactic head, or None - -:code:`lefts` - Tokens to the left of the span - -:code:`rights` - Tokens to the right of the span - -:code:`orth` / :code:`orth_` - Orth string - -:code:`lemma` / :code:`lemma_` - Lemma string - -:code:`string` - String - -:code:`label` / :code:`label_` - Label - -:code:`subtree` - Lefts + [self] + Rights diff --git a/docs/source/reference/using/token.rst b/docs/source/reference/using/token.rst deleted file mode 100644 index 869c54369..000000000 --- a/docs/source/reference/using/token.rst +++ /dev/null @@ -1,166 +0,0 @@ -================ -The Token Object -================ - -A Token represents a single word, punctuation or significant whitespace symbol. - -Integer IDs are provided for all string features. The (unicode) string is -provided by an attribute of the same name followed by an underscore, e.g. -token.orth is an integer ID, token.orth\_ is the unicode value. - -The only exception is the Token.string attribute, which is (unicode) -string-typed.
- -**String Features** - -:code:`orth` / :code:`orth_` - The form of the word with no string normalization or processing, as it - appears in the string, without trailing whitespace. - -:code:`lemma` / :code:`lemma_` - The "base" of the word, with no inflectional suffixes, e.g. the lemma of - "developing" is "develop", the lemma of "geese" is "goose", etc. Note that - *derivational* suffixes are not stripped, e.g. the lemma of "institutions" - is "institution", not "institute". Lemmatization is performed using the - WordNet data, but extended to also cover closed-class words such as - pronouns. By default, the WN lemmatizer returns "hi" as the lemma of "his". - We assign pronouns the lemma -PRON-. - -:code:`lower` / :code:`lower_` - The form of the word, but forced to lower-case, i.e. lower = word.orth\_.lower() - -:code:`norm` / :code:`norm_` - The form of the word, after language-specific normalizations have been - applied. - -:code:`shape` / :code:`shape_` - A transform of the word's string, to show orthographic features. The - characters a-z are mapped to x, A-Z are mapped to X, 0-9 are mapped to d. - After these mappings, sequences of 4 or more of the same character are - truncated to length 4. Examples: C3Po --> XdXx, favorite --> xxxx, - :) --> :) - -:code:`prefix` / :code:`prefix_` - A length-N substring from the start of the word. Length may vary by - language; currently for English n=1, i.e. prefix = word.orth\_[:1] - -:code:`suffix` / :code:`suffix_` - A length-N substring from the end of the word. Length may vary by - language; currently for English n=3, i.e. suffix = word.orth\_[-3:] - -:code:`string` - The form of the word as it appears in the string, **including trailing - whitespace**. This is useful when you need to use linguistic features to - add inline mark-up to the string. - -**Boolean Features** - -:code:`is_oov` - Is the word out-of-vocabulary?
- -:code:`is_alpha` - Equivalent to `word.orth_.isalpha()` - -:code:`is_ascii` - Equivalent to `all(ord(c) < 128 for c in word.orth_)` - -:code:`is_digit` - Equivalent to `word.orth_.isdigit()` - -:code:`is_lower` - Equivalent to `word.orth_.islower()` - -:code:`is_title` - Equivalent to `word.orth_.istitle()` - -:code:`is_punct` - Equivalent to `all(unicodedata.category(c).startswith('P') for c in word.orth_)` - -:code:`is_space` - Equivalent to `word.orth_.isspace()` - -:code:`like_url` - Does the word resemble a URL? - -:code:`like_num` - Does the word represent a number? e.g. "10.9", "10", "ten", etc - -:code:`like_email` - Does the word resemble an email address? - - -**Distributional Features** - -:code:`prob` - The unigram log-probability of the word, estimated from counts from a - large corpus, smoothed using Simple Good Turing estimation. - -:code:`cluster` - The Brown cluster ID of the word. These are often useful features for - linear models. If you're using a non-linear model, particularly - a neural net or random forest, consider using the real-valued word - representation vector, in Token.repvec, instead. - -:code:`repvec` - A "word embedding" representation: a dense real-valued vector that supports - similarity queries between words. By default, spaCy currently loads - vectors produced by the Levy and Goldberg (2014) dependency-based word2vec - model. - -**Syntactic Features** - -:code:`tag` - A morphosyntactic tag, e.g. NN, VBZ, DT, etc. These tags are - language/corpus specific, and typically describe part-of-speech and some - amount of morphological information. For instance, in the Penn Treebank - tag set, VBZ is assigned to a present-tense singular verb. - -:code:`pos` - A part-of-speech tag, from the Google Universal Tag Set, e.g. NOUN, VERB, - ADV. Constants for the 17 tag values are provided in spacy.parts\_of\_speech. - -:code:`dep` - The type of syntactic dependency relation between the word and its - syntactic head.
- -:code:`n_lefts` - The number of immediate syntactic children preceding the word in the - string. - -:code:`n_rights` - The number of immediate syntactic children following the word in the - string. - -**Navigating the Dependency Tree** - -:code:`head` - The Token that is the immediate syntactic head of the word. If the word is - the root of the dependency tree, the same word is returned. - -:code:`lefts` - An iterator for the immediate leftward syntactic children of the word. - -:code:`rights` - An iterator for the immediate rightward syntactic children of the word. - -:code:`children` - An iterator that yields from lefts, and then yields from rights. - -:code:`subtree` - An iterator for the part of the sentence syntactically governed by the - word, including the word itself. - -:code:`left_edge` - The leftmost descendent of the word's subtree. Equivalent to `list(word.subtree)[0]` - -:code:`right_edge` - The rightmost descendent of the word's subtree. Equivalent to `list(word.subtree)[-1]` - - -**Named Entities** - -:code:`ent_type` - If the token is part of an entity, its entity type - -:code:`ent_iob` - The IOB (inside, outside, begin) entity recognition tag for the token diff --git a/docs/source/tutorials/lexrank_tutorial.rst b/docs/source/tutorials/lexrank_tutorial.rst deleted file mode 100644 index f5c5ae8fd..000000000 --- a/docs/source/tutorials/lexrank_tutorial.rst +++ /dev/null @@ -1,280 +0,0 @@ -=================================== -Tutorial: Extractive Summarization -=================================== - -This tutorial will go through the implementation of several extractive -summarization models with spaCy. - -An *extractive* summarization system is a filter over the original document/s: -most of the text is removed, and the remaining text is formatted as a summary. -In contrast, an *abstractive* summarization system generates new text. - -Application Context -------------------- - -Extractive summarization systems need an application context. 
We can't ask how -to design the system without some concept of what sort of summary will be -useful for a given application. (Contrast with speech recognition, where -a notion of "correct" is much less application-sensitive.) - -For this, I've adopted the application context that `Flipboard`_ discuss in a -recent blog post: they want to display lead-text to readers on mobile devices, -so that readers can easily choose interesting links. - -I've chosen this application context for two reasons. First, `Flipboard`_ say -they're putting something like this into production. Second, there's a ready -source of evaluation data. We can look at the lead-text that human editors -have chosen, and evaluate whether our automatic system chooses similar text. - -Experimental Setup ------------------- - -Instead of scraping data, I'm using articles from the New York Times Annotated -Corpus, which is a handy dump of XML-annotated articles distributed by the LDC. -The annotations come with a field named "online lead paragraph". Our -summarization systems will be evaluated on their Rouge-1 overlap with this -field. - -Further details of the experimental setup can be found in the appendices. - -.. _newyorktimes.com: http://newyorktimes.com - -.. _Flipboard: http://engineering.flipboard.com/2014/10/summarization/ - -.. _vector-space model: https://en.wikipedia.org/wiki/Vector_space_model - -.. _LexRank algorithm: https://www.cs.cmu.edu/afs/cs/project/jair/pub/volume22/erkan04a-html/erkan04a.html - -.. _PageRank: https://en.wikipedia.org/wiki/PageRank - -Summarizer API --------------- - -Each summarization model will have the following API: - -.. py:func:`summarize(nlp: spacy.en.English, headline: unicode, paragraphs: List[unicode], - target_length: int) --> summary: unicode - -We receive the headline and a list of paragraphs, and a target length. We have -to produce a block of text where len(text) < target_length. We want summaries -that users will click-on, and not bounce back out of. 
Long-term, we want -summaries that would keep people using the app. - -Baselines: Truncate -------------------- - -.. code:: python - - def truncate_chars(nlp, headline, paragraphs, target_length): - text = ' '.join(paragraphs) - return text[:target_length - 3] + '...' - - def truncate_words(nlp, headline, paragraphs, target_length): - text = ' '.join(paragraphs) - tokens = text.split() - summary = [] - n_words = 0 - n_chars = 0 - while n_chars < target_length - 3: - n_chars += len(tokens[n_words]) - n_chars += 1 # Space - n_words += 1 - return ' '.join(tokens[:n_words]) + '...' - - def truncate_sentences(nlp, headline, paragraphs, target_length): - sentences = [] - summary = '' - for para in paragraphs: - tokens = nlp(para) - for sentence in tokens.sentences(): - if len(summary) + len(sentence) >= target_length: - return summary - summary += str(sentence) - return summary - -I'd be surprised if Flipboard never had something like this in production. Details -like lead-text take a while to float up the priority list. This strategy also has -the advantage of transparency: it's obvious to users how the decision is being -made, so nobody is likely to complain about the feature if it works this way. - -Instead of cutting off the text mid-word, we can tokenize the text, and - -+----------------+-----------+ -| System | Rouge-1 R | -+----------------+-----------+ -| Truncate chars | 69.3 | -+----------------+-----------+ -| Truncate words | 69.8 | -+----------------+-----------+ -| Truncate sents | 48.5 | -+----------------+-----------+ - -Sentence Vectors ----------------- - -A simple bag-of-words model can be created using the `count_by` method, which -produces a dictionary of frequencies, keyed by string IDs: - -.. code:: python - - >>> from spacy.en import English - >>> from spacy.en.attrs import SIC - >>> nlp = English() - >>> tokens = nlp(u'a a a. 
b b b b.') - >>> tokens.count_by(SIC) - {41L: 4, 11L: 3, 5L: 2} - >>> [s.count_by(SIC) for s in tokens.sentences()] - [{11L: 3, 5L: 1}, {41L: 4, 5L: 1}] - - -Similar functionality is provided by `scikit-learn`_, but with a different -style of API design. With spaCy, functions generally have more limited -responsibility. The advantage of this is that spaCy's APIs are much simpler, -and it's often easier to compose functions in a more flexible way. - -One particularly powerful feature of spaCy is its support for -`word embeddings`_ --- the dense vectors introduced by deep learning models, and -now commonly produced by `word2vec`_ and related systems. - -Once a set of word embeddings has been installed, the vectors are available -from any token: - - >>> from spacy.en import English - >>> from spacy.en.attrs import SIC - >>> from scipy.spatial.distance import cosine - >>> nlp = English() - >>> tokens = nlp(u'Apple banana Batman hero') - >>> cosine(tokens[0].vec, tokens[1].vec) - - - - - -.. _word embeddings: https://colah.github.io/posts/2014-07-NLP-RNNs-Representations/ - -.. _word2vec: https://code.google.com/p/word2vec/ - -.. code:: python - - def main(db_loc, output_dir, feat_type="tfidf"): - nlp = spacy.en.English() - - # Read stop list and make TF-IDF weights --- data needed for the - # feature extraction. - with open(stops_loc) as file_: - stop_words = set(nlp.vocab.strings[word.strip()] for word in file_) - idf_weights = get_idf_weights(nlp, iter_docs(db_loc)) - if feat_type == 'tfidf': - feature_extractor = tfidf_extractor(stop_words, idf_weights) - elif feat_type == 'vec': - feature_extractor = vec_extractor(stop_words, idf_weights) - - for i, text in enumerate(iter_docs(db_loc)): - tokens = nlp(body) - sentences = tokens.sentences() - summary = summarize(sentences, feature_extractor) - write_output(summary, output_dir, i) - - - - -.. 
_scikit-learn: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.feature_extraction.text - - - - - -The LexRank Algorithm ----------------------- - -LexRank is described as a graph-based algorithm, derived from `Google's PageRank`_. -The nodes are sentences, and the edges are the similarities between one -sentence and another. The "graph" is fully-connected, and its edges are -undirected --- so, it's natural to represent this as a matrix: - -.. code:: python - - from scipy.spatial.distance import cosine - import numpy - - - def lexrank(sent_vectors): - n = len(sent_vectors) - # Build the cosine similarity matrix - matrix = numpy.ndarray(shape=(n, n)) - for i in range(n): - for j in range(n): - matrix[i, j] = cosine(sent_vectors[i], sent_vectors[j]) - # Normalize - for i in range(n): - matrix[i] /= sum(matrix[i]) - return _pagerank(matrix) - -The rows are normalized (i.e. rows sum to 1), allowing the PageRank algorithm -to be applied. Unfortunately the PageRank implementation is rather opaque --- -it's easier to just read the Wikipedia page: - -.. code:: python - - def _pagerank(matrix, d=0.85): - # This is admittedly opaque --- just read the Wikipedia page. - n = len(matrix) - rank = numpy.ones(shape=(n,)) / n - new_rank = numpy.zeros(shape=(n,)) - while not _has_converged(rank, new_rank): - rank, new_rank = new_rank, rank - for i in range(n): - new_rank[i] = ((1.0 - d) / n) + (d * sum(rank * matrix[i])) - return rank - - def _has_converged(x, y, epsilon=0.0001): - return all(abs(x[i] - y[i]) < epsilon for i in range(n)) - - -Initial Processing ------------------- - - - - -Feature Extraction ------------------- - - .. 
code:: python - def sentence_vectors(sentence, idf_weights): - tf_idf = {} - for term, freq in sent.count_by(LEMMA).items(): - tf_idf[term] = freq * idf_weights[term] - vectors.append(tf_idf) - return vectors - -The LexRank paper models each sentence as a bag-of-words - -This is simple and fairly standard, but often gives -underwhelming results. My idea is to instead calculate vectors from -`word-embeddings`_, which have been one of the exciting outcomes of the recent -work on deep-learning. I had a quick look at the literature, and found -a `recent workshop paper`_ that suggested the idea was plausible. - - - - -Taking the feature representation and similarity function as parameters, the -LexRank function looks like this: - - -Given a list of N sentences, a function that maps a sentence to a feature -vector, and a function that computes a similarity measure of two feature -vectors, this produces a vector of N floats, which indicate how well each -sentence represents the document as a whole. - -.. _Rouge: https://en.wikipedia.org/wiki/ROUGE_%28metric%29 - - -.. _word embeddings: https://colah.github.io/posts/2014-07-NLP-RNNs-Representations/ - -.. _recent workshop paper: https://www.aclweb.org/anthology/W/W14/W14-1504.pdf - - -Document Model --------------- diff --git a/docs/source/updates.rst b/docs/source/updates.rst deleted file mode 100644 index 12b8ac543..000000000 --- a/docs/source/updates.rst +++ /dev/null @@ -1,298 +0,0 @@ -Updates -======= - -To update your installation: - -.. code:: bash - - $ pip install --upgrade spacy - $ python -m spacy.en.download all - -Most updates ship a new model, so you will usually have to redownload the data. - -2015-07-28 v0.89 ----------------- - -Major update! - -* Support efficient binary serialization. The dependency tree, - part-of-speech tags, named entities, tokenization and text can be dumped to a - byte string smaller than the original text representation. 
Serialization is - lossless, so there's no need to separately store the original text. - - Serialize: - - .. code-block:: python - - byte_string = doc.to_bytes() - - Deserialize by first creating a Doc object, and then loading the bytes: - - .. code-block:: python - - doc = Doc(nlp.vocab) - doc.from_bytes(byte_string) - - If you have a binary file with several parses saved, you can iterate over - them using the staticmethod `Doc.read_bytes`. Putting it all together: - - .. code-block:: python - - import codecs - - from spacy.en import English - - def serialize(nlp, texts, out_loc): - with open(out_loc, 'wb') as out_file: - for text in texts: - doc = nlp(text) - out_file.write(doc.to_bytes()) - - def deserialize(nlp, file_loc): - docs = [] - with open(file_loc, 'rb') as read_file: - for byte_string in Doc.read_bytes(read_file, 'rb')): - doc = Doc(nlp.vocab).from_bytes(byte_string) - docs.append(doc) - return docs - - - Full tutorial coming soon. - - -* Fix probability estimates, and base them off counts from the 2015 Reddit Comments - dump. The probability estimates are now very reliable, and out-of-vocabulary - words now receive an accurate smoothed probability estimate. - -* Fix regression in parse times on very long texts. Recent versions were - calculating parse features in a way that was polynomial in input length. - -* Allow slicing into the Doc object, so that you can do e.g. doc[2:4]. Returns - a Span object. - -* Add tag SP (coarse tag SPACE) for whitespace tokens. Fix bug where - whitespace was sometimes marked as an entity. - -* Reduce memory usage. Memory usage now under 2GB per process. - -* Rename :code:`Span.head` to :code:`Span.root`, fix its documentation, and make - it more efficient. I considered adding Span.head, Span.dep and Span.dep\_ as - well, but for now I leave these as accessible via :code:`Span.root.head`, - :code:`Span.head.dep`, and :code:`Span.head.dep\_`, to keep the API smaller. - -* Add boolean features to Token and Lexeme objects. 
- -* Main parse function now marked **nogil**. This - means I'll be able to add a Worker class that allows multi-threaded - processing. This will be available in the next version. In the meantime, - you should continue to use multiprocessing for parallelization. - - -2015-07-08 v0.88 ----------------- - -Refactoring release. - -If you have the data for v0.87, you don't need to redownload the data for this -release. - -* You can now set tag=False, parse=False or entity=False when creating the pipleine, - to disable some of the models. See the documentation for details. -* Models no longer lazy-loaded. -* Warning emitted when parse=True or entity=True but model not loaded. -* Rename the tokens.Tokens class to tokens.Doc. An alias has been made to assist - backwards compatibility, but you should update your code to refer to the new - class name. -* Various bits of internal refactoring - - -2015-07-01 v0.87 ----------------- - -* Changed weights data structure. Memory use should be reduced 30-40%. -* Fixed speed regressions introduced in the last few versions. -* Models should now be slightly more robust to noise in the input text, as I'm - now training on data with a small amount of noise added, e.g. I randomly corrupt - capitalization, swap spaces for newlines, etc. This is bringing a small - benefit on out-of-domain data. I think this strategy could yield better - results with a better noise-generation function. If you think you have a good - way to make clean text resemble the kind of noisy input you're seeing in your - domain, get in touch. - -2015-06-24 v0.86 ----------------- - -* Parser now more accurate, using novel non-monotonic transition system that's - currently under review. - - -2015-05-12 v0.85 ----------------- - -* Parser produces richer dependency labels following the `ClearNLP scheme`_ -* Training data now includes text from a variety of genres. -* Parser now uses more memory and the data is slightly larger, due to the additional - labels. 
Impact on efficiency is minimal: entire process still takes - <10ms per document. - -Most users should see a substantial increase in accuracy from the new model. -Long post on accuracy evaluation and model details coming soon. - -.. _ClearNLP scheme: https://github.com/clir/clearnlp-guidelines/blob/master/md/dependency/dependency_guidelines.md - - -2015-05-12 v0.84 ----------------- - -* Bug fixes for parsing -* Bug fixes for named entity recognition - -2015-04-13 v0.80 ----------------- - -* Preliminary support for named-entity recognition. Its accuracy is substantially behind the state-of-the-art. I'm working on improvements. - -* Better sentence boundary detection, drawn from the syntactic structure. - -* Lots of bug fixes. - -2015-03-05 v0.70 ----------------- - -* Improved parse navigation API -* Bug fixes to labelled parsing - - -2015-01-30 spaCy v0.4: Still alpha, improving quickly ------------------------------------------------------ - -Five days ago I presented the alpha release of spaCy, a natural language -processing library that brings state-of-the-art technology to small companies. - -spaCy has been well received, and there are now a lot of eyes on the project. -Naturally, lots of issues have surfaced. I'm grateful to those who've reported -them. I've worked hard to address them as quickly as I could. - -Bug Fixes ----------- - -* Lexemes.bin data file had a platform-specific encoding. - This was a silly error: instead of the string, or an index into the - list of strings, I was storing the 64-bit hash of the string. On - wide-unicode builds, a unicode string hashes differently. This meant that - all look-ups into the vocabulary failed on wide unicode builds, which - further meant that the part-of-speech tagger and parser features were not - computed correctly. - - The fix is simple: we already have to read in a list of all the strings, so - just store an index into that list, instead of a hash. - -* Parse tree navigation API was rough, and buggy. 
- The parse-tree navigation API was the last thing I added before v0.3. I've - now replaced it with something better. The previous API design was flawed, - and the implementation was buggy --- Token.child() and Token.head were - sometimes inconsistent. - - I've addressed the most immediate problems, but this part of the design is - still a work in progress. It's a difficult problem. The parse is a tree, - and we want to freely navigate up and down it without creating reference - cycles that inhibit garbage collection, and without doing a lot of copying, - creating and deleting. - - I think I've got a promising solution to this, but I suspect there's - currently a memory leak. Please get in touch no the tracker if you want to - know more, especially if you think you can help. - -Known Issues ------------- - -Some systems are still experiencing memory errors, which I'm having trouble -pinning down or reproducing. Please send details of your system to the -`Issue Tracker`_ if this is happening to you. - -.. _Issue Tracker: https://github.com/honnibal/spaCy/issues - -Enhancements: Train and evaluate on whole paragraphs ----------------------------------------------------- - -.. note:: tl;dr: I shipped the wrong parsing model with 0.3. That model expected input to be segmented into sentences. 0.4 ships the correct model, which uses some algorithmic tricks to minimize the impact of tokenization and sentence segmentation errors on the parser. - - -Most English parsing research is performed on text with perfect pre-processing: -one newline between every sentence, one space between every token. -It's always been done this way, and it's good. It's a useful idealisation, -because the pre-processing has few algorithmic implications. - -But, for practical performance, this stuff can matter a lot. -Dridan and Oepen (2013) did a simple but rare thing: they actually ran a few -parsers on raw text. 
Even on the standard Wall Street Journal corpus, -where pre-processing tools are quite good, the quality of pre-processing -made a big difference: - - +-------------+-------+----------+ - | Preprocess | BLLIP | Berkeley | - +-------------+-------+----------+ - | Gold | 90.9 | 89.8 | - +-------------+-------+----------+ - | Default | 86.4 | 88.4 | - +-------------+-------+----------+ - | Corrected | 89.9 | 88.8 | - +-------------+-------+----------+ - -.. note:: spaCy is evaluated on unlabelled dependencies, where the above accuracy figures refer to phrase-structure trees. Accuracies are non-comparable. - - - -In the standard experimental condition --- gold pre-processing --- the -BLLIP parser is better. But, it turns out it ships with lousy pre-processing -tools: when you evaluate the parsers on raw text, the BLLIP parser falls way -behind. To verify that this was due to the quality of the pre-processing -tools, and not some particular algorithmic sensitivity, Dridan and Oepen ran -both parsers with their high-quality tokenizer and sentence segmenter. This -confirmed that with equal pre-processing, the BLLIP parser is better. - -The Dridan and Oepen paper really convinced me to take pre-processing seriously -in spaCy. In fact, spaCy started life as just a tokenizer --- hence the name. - -The spaCy parser has a special trick up its sleeve. Because both the tagger -and parser run in linear time, it doesn't require that the input be divided -into sentences. This is nice because it avoids error-cascades: if you segment -first, then the parser just has to live with whatever decision the segmenter -made. - -But, even though I designed the system with this consideration in mind, -I decided to present the initial results using the standard methodology, using -gold-standard inputs. But...then I made a mistake. - -Unfortunately, with all the other things I was doing before launch, I forgot -all about this problem. 
spaCy launched with a parsing model that expected the -input to be segmented into sentences, but with no sentence segmenter. This -caused a drop in parse accuracy of 4%! - -Over the last five days, I've worked hard to correct this. I implemented the -modifications to the parsing algorithm I had planned, from Dongdong Zhang et al. -(2013), and trained and evaluated the parser on raw text, using the version of -the WSJ distributed by Read et al. (2012), and used in Dridan and Oepen's -experiments. - -I'm pleased to say that on the WSJ at least, spaCy 0.4 performs almost exactly -as well on raw text as text with gold-standard tokenization and sentence -boundary detection. - -I still need to evaluate this on web text, and I need to compare against the -Stanford CoreNLP and other parsers. I suspect that most other parsers will -decline in accuracy by 1% --- we'll see. - - -+-------------+---------+ -| Preprocess | spaCy | -+-------------+---------+ -| Gold | 92.4% | -+-------------+---------+ -| Default | 92.2% | -+-------------+---------+ - -2015-01-25 ----------- - -spaCy v0.33 launched --- first alpha build. 
diff --git a/examples/twitter_filter.py b/examples/twitter_filter.py new file mode 100644 index 000000000..b6e4e4e83 --- /dev/null +++ b/examples/twitter_filter.py @@ -0,0 +1,36 @@ +# encoding: utf8 +from __future__ import unicode_literals, print_function +import plac +import codecs +import pathlib +import random + +import twython +import spacy.en + +import _handler + + +class Connection(twython.TwythonStreamer): + def __init__(self, keys_dir, nlp, query): + keys_dir = pathlib.Path(keys_dir) + read = lambda fn: (keys_dir / (fn + '.txt')).open().read().strip() + api_key = map(read, ['key', 'secret', 'token', 'token_secret']) + twython.TwythonStreamer.__init__(self, *api_key) + self.nlp = nlp + self.query = query + + def on_success(self, data): + _handler.handle_tweet(self.nlp, data, self.query) + if random.random() >= 0.1: + reload(_handler) + + +def main(keys_dir, term): + nlp = spacy.en.English() + twitter = Connection(keys_dir, nlp, term) + twitter.statuses.filter(track=term, language='en') + + +if __name__ == '__main__': + plac.call(main) diff --git a/lang_data/de/infix.txt b/lang_data/de/infix.txt new file mode 100644 index 000000000..37eca7350 --- /dev/null +++ b/lang_data/de/infix.txt @@ -0,0 +1,3 @@ +\.\.\. +(?<=[a-z])\.(?=[A-Z]) +(?<=[a-zA-Z])-(?=[a-zA-Z]) diff --git a/lang_data/de/lemma_rules.json b/lang_data/de/lemma_rules.json new file mode 100644 index 000000000..e69de29bb diff --git a/lang_data/de/morphs.json b/lang_data/de/morphs.json new file mode 100644 index 000000000..e69de29bb diff --git a/lang_data/de/prefix.txt b/lang_data/de/prefix.txt new file mode 100644 index 000000000..48c4fc549 --- /dev/null +++ b/lang_data/de/prefix.txt @@ -0,0 +1,21 @@ +, +" +( +[ +{ +* +< +$ +£ +“ +' +`` +` +# +US$ +C$ +A$ +a- +‘ +.... +...
diff --git a/lang_data/de/sample.txt b/lang_data/de/sample.txt new file mode 100644 index 000000000..12c0bb787 --- /dev/null +++ b/lang_data/de/sample.txt @@ -0,0 +1,3 @@ +Biografie: Ein Spiel ist ein Theaterstück des Schweizer Schriftstellers Max Frisch, das 1967 entstand und am 1. Februar 1968 im Schauspielhaus Zürich uraufgeführt wurde. 1984 legte Frisch eine überarbeitete Neufassung vor. Das von Frisch als Komödie bezeichnete Stück greift eines seiner zentralen Themen auf: die Möglichkeit oder Unmöglichkeit des Menschen, seine Identität zu verändern. + +Mit Biografie: Ein Spiel wandte sich Frisch von der Parabelform seiner Erfolgsstücke Biedermann und die Brandstifter und Andorra ab und postulierte eine „Dramaturgie der Permutation“. Darin sollte nicht, wie im klassischen Theater, Sinn und Schicksal im Mittelpunkt stehen, sondern die Zufälligkeit von Ereignissen und die Möglichkeit ihrer Variation. Dennoch handelt Biografie: Ein Spiel gerade von der Unmöglichkeit seines Protagonisten, seinen Lebenslauf grundlegend zu verändern. Frisch empfand die Wirkung des Stücks im Nachhinein als zu fatalistisch und die Umsetzung seiner theoretischen Absichten als nicht geglückt. Obwohl das Stück 1968 als unpolitisch und nicht zeitgemäß kritisiert wurde und auch später eine geteilte Rezeption erfuhr, gehört es an deutschsprachigen Bühnen zu den häufiger aufgeführten Stücken Frischs. 
diff --git a/lang_data/de/specials.json b/lang_data/de/specials.json new file mode 100644 index 000000000..0e0986339 --- /dev/null +++ b/lang_data/de/specials.json @@ -0,0 +1,149 @@ +{ +"a.m.": [{"F": "a.m."}], +"p.m.": [{"F": "p.m."}], + +"1a.m.": [{"F": "1"}, {"F": "a.m."}], +"2a.m.": [{"F": "2"}, {"F": "a.m."}], +"3a.m.": [{"F": "3"}, {"F": "a.m."}], +"4a.m.": [{"F": "4"}, {"F": "a.m."}], +"5a.m.": [{"F": "5"}, {"F": "a.m."}], +"6a.m.": [{"F": "6"}, {"F": "a.m."}], +"7a.m.": [{"F": "7"}, {"F": "a.m."}], +"8a.m.": [{"F": "8"}, {"F": "a.m."}], +"9a.m.": [{"F": "9"}, {"F": "a.m."}], +"10a.m.": [{"F": "10"}, {"F": "a.m."}], +"11a.m.": [{"F": "11"}, {"F": "a.m."}], +"12a.m.": [{"F": "12"}, {"F": "a.m."}], +"1am": [{"F": "1"}, {"F": "am", "L": "a.m."}], +"2am": [{"F": "2"}, {"F": "am", "L": "a.m."}], +"3am": [{"F": "3"}, {"F": "am", "L": "a.m."}], +"4am": [{"F": "4"}, {"F": "am", "L": "a.m."}], +"5am": [{"F": "5"}, {"F": "am", "L": "a.m."}], +"6am": [{"F": "6"}, {"F": "am", "L": "a.m."}], +"7am": [{"F": "7"}, {"F": "am", "L": "a.m."}], +"8am": [{"F": "8"}, {"F": "am", "L": "a.m."}], +"9am": [{"F": "9"}, {"F": "am", "L": "a.m."}], +"10am": [{"F": "10"}, {"F": "am", "L": "a.m."}], +"11am": [{"F": "11"}, {"F": "am", "L": "a.m."}], +"12am": [{"F": "12"}, {"F": "am", "L": "a.m."}], + + +"1p.m.": [{"F": "1"}, {"F": "p.m."}], +"2p.m.": [{"F": "2"}, {"F": "p.m."}], +"3p.m.": [{"F": "3"}, {"F": "p.m."}], +"4p.m.": [{"F": "4"}, {"F": "p.m."}], +"5p.m.": [{"F": "5"}, {"F": "p.m."}], +"6p.m.": [{"F": "6"}, {"F": "p.m."}], +"7p.m.": [{"F": "7"}, {"F": "p.m."}], +"8p.m.": [{"F": "8"}, {"F": "p.m."}], +"9p.m.": [{"F": "9"}, {"F": "p.m."}], +"10p.m.": [{"F": "10"}, {"F": "p.m."}], +"11p.m.": [{"F": "11"}, {"F": "p.m."}], +"12p.m.": [{"F": "12"}, {"F": "p.m."}], +"1pm": [{"F": "1"}, {"F": "pm", "L": "p.m."}], +"2pm": [{"F": "2"}, {"F": "pm", "L": "p.m."}], +"3pm": [{"F": "3"}, {"F": "pm", "L": "p.m."}], +"4pm": [{"F": "4"}, {"F": "pm", "L": "p.m."}], +"5pm": [{"F": "5"}, {"F": "pm",
"L": "p.m."}], +"6pm": [{"F": "6"}, {"F": "pm", "L": "p.m."}], +"7pm": [{"F": "7"}, {"F": "pm", "L": "p.m."}], +"8pm": [{"F": "8"}, {"F": "pm", "L": "p.m."}], +"9pm": [{"F": "9"}, {"F": "pm", "L": "p.m."}], +"10pm": [{"F": "10"}, {"F": "pm", "L": "p.m."}], +"11pm": [{"F": "11"}, {"F": "pm", "L": "p.m."}], +"12pm": [{"F": "12"}, {"F": "pm", "L": "p.m."}], + +"Jan.": [{"F": "Jan.", "L": "Januar"}], +"Feb.": [{"F": "Feb.", "L": "Februar"}], +"Mär.": [{"F": "Mär.", "L": "März"}], +"Apr.": [{"F": "Apr.", "L": "April"}], +"Mai.": [{"F": "Mai.", "L": "Mai"}], +"Jun.": [{"F": "Jun.", "L": "Juni"}], +"Jul.": [{"F": "Jul.", "L": "Juli"}], +"Aug.": [{"F": "Aug.", "L": "August"}], +"Sep.": [{"F": "Sep.", "L": "September"}], +"Sept.": [{"F": "Sept.", "L": "September"}], +"Okt.": [{"F": "Okt.", "L": "Oktober"}], +"Nov.": [{"F": "Nov.", "L": "November"}], +"Dez.": [{"F": "Dez.", "L": "Dezember"}], + +":)": [{"F": ":)"}], +"<3": [{"F": "<3"}], +";)": [{"F": ";)"}], +"(:": [{"F": "(:"}], +":(": [{"F": ":("}], +"-_-": [{"F": "-_-"}], +"=)": [{"F": "=)"}], +":/": [{"F": ":/"}], +":>": [{"F": ":>"}], +";-)": [{"F": ";-)"}], +":Y": [{"F": ":Y"}], +":P": [{"F": ":P"}], +":-P": [{"F": ":-P"}], +":3": [{"F": ":3"}], +"=3": [{"F": "=3"}], +"xD": [{"F": "xD"}], +"^_^": [{"F": "^_^"}], +"=]": [{"F": "=]"}], +"=D": [{"F": "=D"}], +"<333": [{"F": "<333"}], +":))": [{"F": ":))"}], +":0": [{"F": ":0"}], +"-__-": [{"F": "-__-"}], +"xDD": [{"F": "xDD"}], +"o_o": [{"F": "o_o"}], +"o_O": [{"F": "o_O"}], +"V_V": [{"F": "V_V"}], +"=[[": [{"F": "=[["}], +"<33": [{"F": "<33"}], +";p": [{"F": ";p"}], +";D": [{"F": ";D"}], +";-p": [{"F": ";-p"}], +";(": [{"F": ";("}], +":p": [{"F": ":p"}], +":]": [{"F": ":]"}], +":O": [{"F": ":O"}], +":-/": [{"F": ":-/"}], +":-)": [{"F": ":-)"}], +":(((": [{"F": ":((("}], +":((": [{"F": ":(("}], +":')": [{"F": ":')"}], +"(^_^)": [{"F": "(^_^)"}], +"(=": [{"F": "(="}], +"o.O": [{"F": "o.O"}], +"\")": [{"F": "\")"}], +"a.": [{"F": "a."}], +"b.": [{"F": "b."}], +"c.": [{"F":
"c."}], +"d.": [{"F": "d."}], +"e.": [{"F": "e."}], +"f.": [{"F": "f."}], +"g.": [{"F": "g."}], +"h.": [{"F": "h."}], +"i.": [{"F": "i."}], +"j.": [{"F": "j."}], +"k.": [{"F": "k."}], +"l.": [{"F": "l."}], +"m.": [{"F": "m."}], +"n.": [{"F": "n."}], +"o.": [{"F": "o."}], +"p.": [{"F": "p."}], +"q.": [{"F": "q."}], +"s.": [{"F": "s."}], +"t.": [{"F": "t."}], +"u.": [{"F": "u."}], +"v.": [{"F": "v."}], +"w.": [{"F": "w."}], +"x.": [{"F": "x."}], +"y.": [{"F": "y."}], +"z.": [{"F": "z."}], + +"z.b.": [{"F": "z.b."}], +"e.h.": [{"F": "e.h."}], +"o.ä.": [{"F": "o.ä."}], +"bzw.": [{"F": "bzw."}], +"usw.": [{"F": "usw."}], +"\n": [{"F": "\n", "pos": "SP"}], +"\t": [{"F": "\t", "pos": "SP"}], +" ": [{"F": " ", "pos": "SP"}] +} diff --git a/lang_data/de/suffix.txt b/lang_data/de/suffix.txt new file mode 100644 index 000000000..d8c6bc2c2 --- /dev/null +++ b/lang_data/de/suffix.txt @@ -0,0 +1,26 @@ +, +\" +\) +\] +\} +\* +\! +\? +% +\$ +> +: +; +' +” +'' +'s +'S +’s +’S +’ +\.\. +\.\.\. +\.\.\.\. +(?<=[a-z0-9)\]"'%\)])\.
+(?<=[0-9])km diff --git a/lang_data/de/tag_map.json b/lang_data/de/tag_map.json new file mode 100644 index 000000000..ee1bb1b81 --- /dev/null +++ b/lang_data/de/tag_map.json @@ -0,0 +1,56 @@ +{ +"$(": {"pos": "PUNCT", "PunctType": "Brck"}, +"$,": {"pos": "PUNCT", "PunctType": "Comm"}, +"$.": {"pos": "PUNCT", "PunctType": "Peri"}, +"ADJA": {"pos": "ADJ"}, +"ADJD": {"pos": "ADJ", "Variant": "Short"}, +"ADV": {"pos": "ADV"}, +"APPO": {"pos": "ADP", "AdpType": "Post"}, +"APPR": {"pos": "ADP", "AdpType": "Prep"}, +"APPRART": {"pos": "ADP", "AdpType": "Prep", "PronType": "Art"}, +"APZR": {"pos": "ADP", "AdpType": "Circ"}, +"ART": {"pos": "DET", "PronType": "Art"}, +"CARD": {"pos": "NUM", "NumType": "Card"}, +"FM": {"pos": "X", "Foreign": "Yes"}, +"ITJ": {"pos": "INTJ"}, +"KOKOM": {"pos": "CONJ", "ConjType": "Comp"}, +"KON": {"pos": "CONJ"}, +"KOUI": {"pos": "SCONJ"}, +"KOUS": {"pos": "SCONJ"}, +"NE": {"pos": "PROPN"}, +"NN": {"pos": "NOUN"}, +"PAV": {"pos": "ADV", "PronType": "Dem"}, +"PDAT": {"pos": "DET", "PronType": "Dem"}, +"PDS": {"pos": "PRON", "PronType": "Dem"}, +"PIAT": {"pos": "DET", "PronType": "Ind,Neg,Tot"}, +"PIDAT": {"pos": "DET", "AdjType": "Pdt", "PronType": "Ind,Neg,Tot"}, +"PIS": {"pos": "PRON", "PronType": "Ind,Neg,Tot"}, +"PPER": {"pos": "PRON", "PronType": "Prs"}, +"PPOSAT": {"pos": "DET", "Poss": "Yes", "PronType": "Prs"}, +"PPOSS": {"pos": "PRON", "Poss": "Yes", "PronType": "Prs"}, +"PRELAT": {"pos": "DET", "PronType": "Rel"}, +"PRELS": {"pos": "PRON", "PronType": "Rel"}, +"PRF": {"pos": "PRON", "PronType": "Prs", "Reflex": "Yes"}, +"PTKA": {"pos": "PART"}, +"PTKANT": {"pos": "PART", "PartType": "Res"}, +"PTKNEG": {"pos": "PART", "Negative": "Neg"}, +"PTKVZ": {"pos": "PART", "PartType": "Vbp"}, +"PTKZU": {"pos": "PART", "PartType": "Inf"}, +"PWAT": {"pos": "DET", "PronType": "Int"}, +"PWAV": {"pos": "ADV", "PronType": "Int"}, +"PWS": {"pos": "PRON", "PronType": "Int"}, +"TRUNC": {"pos": "X", "Hyph": "Yes"}, +"VAFIN": {"pos": "AUX", "Mood": "Ind",
"VerbForm": "Fin"}, +"VAIMP": {"pos": "AUX", "Mood": "Imp", "VerbForm": "Fin"}, +"VAINF": {"pos": "AUX", "VerbForm": "Inf"}, +"VAPP": {"pos": "AUX", "Aspect": "Perf", "VerbForm": "Part"}, +"VMFIN": {"pos": "VERB", "Mood": "Ind", "VerbForm": "Fin", "VerbType": "Mod"}, +"VMINF": {"pos": "VERB", "VerbForm": "Inf", "VerbType": "Mod"}, +"VMPP": {"pos": "VERB", "Aspect": "Perf", "VerbForm": "Part", "VerbType": "Mod"}, +"VVFIN": {"pos": "VERB", "Mood": "Ind", "VerbForm": "Fin"}, +"VVIMP": {"pos": "VERB", "Mood": "Imp", "VerbForm": "Fin"}, +"VVINF": {"pos": "VERB", "VerbForm": "Inf"}, +"VVIZU": {"pos": "VERB", "VerbForm": "Inf"}, +"VVPP": {"pos": "VERB", "Aspect": "Perf", "VerbForm": "Part"}, +"XY": {"pos": "X"} +} diff --git a/lang_data/en/gazetteer.json b/lang_data/en/gazetteer.json new file mode 100644 index 000000000..dce2e1f2a --- /dev/null +++ b/lang_data/en/gazetteer.json @@ -0,0 +1,198 @@ +{ + "Reddit": [ + "PRODUCT", + {}, + [ + [{"lower": "reddit"}] + ] + ], + "SeptemberElevenAttacks": [ + "EVENT", + {}, + [ + [ + {"orth": "9/11"} + ], + [ + {"lower": "september"}, + {"lower": "eleven"} + ], + [ + {"lower": "september"}, + {"orth": "11"} + ] + ] + ], + "Linux": [ + "PRODUCT", + {}, + [ + [{"lower": "linux"}] + ] + ], + "Haskell": [ + "PRODUCT", + {}, + [ + [{"lower": "haskell"}] + ] + ], + "HaskellCurry": [ + "PERSON", + {}, + [ + [ + {"lower": "haskell"}, + {"lower": "curry"} + ] + ] + ], + "Javascript": [ + "PRODUCT", + {}, + [ + [{"lower": "javascript"}] + ] + ], + "CSS": [ + "PRODUCT", + {}, + [ + [{"lower": "css"}], + [{"lower": "css3"}] + ] + ], + "displaCy": [ + "PRODUCT", + {}, + [ + [{"lower": "displacy"}] + ] + ], + "spaCy": [ + "PRODUCT", + {}, + [ + [{"orth": "spaCy"}] + ] + ], + + "HTML": [ + "PRODUCT", + {}, + [ + [{"lower": "html"}], + [{"lower": "html5"}] + ] + ], + "Python": [ + "PRODUCT", + {}, + [ + [{"orth": "Python"}] + ] + ], + "Ruby": [ + "PRODUCT", + {}, + [ + [{"orth": "Ruby"}] + ] + ], + "Digg": [ + "PRODUCT", + {}, + [ + [{"lower":
"digg"}] + ] + ], + "FoxNews": [ + "ORG", + {}, + [ + [{"orth": "Fox"}, + {"orth": "News"}] + ] + ], + "Google": [ + "ORG", + {}, + [ + [{"lower": "google"}] + ] + ], + "Mac": [ + "PRODUCT", + {}, + [ + [{"lower": "mac"}] + ] + ], + "Wikipedia": [ + "PRODUCT", + {}, + [ + [{"lower": "wikipedia"}] + ] + ], + "Windows": [ + "PRODUCT", + {}, + [ + [{"orth": "Windows"}] + ] + ], + "Dell": [ + "ORG", + {}, + [ + [{"lower": "dell"}] + ] + ], + "Facebook": [ + "ORG", + {}, + [ + [{"lower": "facebook"}] + ] + ], + "Blizzard": [ + "ORG", + {}, + [ + [{"orth": "Blizzard"}] + ] + ], + "Ubuntu": [ + "ORG", + {}, + [ + [{"orth": "Ubuntu"}] + ] + ], + "Youtube": [ + "PRODUCT", + {}, + [ + [{"lower": "youtube"}] + ] + ], + "false_positives": [ + null, + {}, + [ + [{"orth": "Shit"}], + [{"orth": "Weed"}], + [{"orth": "Cool"}], + [{"orth": "Btw"}], + [{"orth": "Bah"}], + [{"orth": "Bullshit"}], + [{"orth": "Lol"}], + [{"orth": "Yo"}, {"lower": "dawg"}], + [{"orth": "Yay"}], + [{"orth": "Ahh"}], + [{"orth": "Yea"}], + [{"orth": "Bah"}] + ] + ] +} diff --git a/lang_data/en/lemma_rules.json b/lang_data/en/lemma_rules.json new file mode 100644 index 000000000..c45eb1df6 --- /dev/null +++ b/lang_data/en/lemma_rules.json @@ -0,0 +1,31 @@ +{ + "noun": [ + ["s", ""], + ["ses", "s"], + ["ves", "f"], + ["xes", "x"], + ["zes", "z"], + ["ches", "ch"], + ["shes", "sh"], + ["men", "man"], + ["ies", "y"] + ], + + "verb": [ + ["s", ""], + ["ies", "y"], + ["es", "e"], + ["es", ""], + ["ed", "e"], + ["ed", ""], + ["ing", "e"], + ["ing", ""] + ], + + "adj": [ + ["er", ""], + ["est", ""], + ["er", "e"], + ["est", "e"] + ] +} diff --git a/lang_data/en/suffix.txt b/lang_data/en/suffix.txt index 5ac21dbc9..d8c6bc2c2 100644 --- a/lang_data/en/suffix.txt +++ b/lang_data/en/suffix.txt @@ -16,6 +16,8 @@ '' 's 'S +’s +’S ’ \.\. \.\.\.
diff --git a/lang_data/en/tag_map.json b/lang_data/en/tag_map.json new file mode 100644 index 000000000..8678e5afe --- /dev/null +++ b/lang_data/en/tag_map.json @@ -0,0 +1,60 @@ +{ +".": {"pos": "punct", "puncttype": "peri"}, +",": {"pos": "punct", "puncttype": "comm"}, +"-LRB-": {"pos": "punct", "puncttype": "brck", "punctside": "ini"}, +"-RRB-": {"pos": "punct", "puncttype": "brck", "punctside": "fin"}, +"``": {"pos": "punct", "puncttype": "quot", "punctside": "ini"}, +"\"\"": {"pos": "punct", "puncttype": "quot", "punctside": "fin"}, +"''": {"pos": "punct", "puncttype": "quot", "punctside": "fin"}, +":": {"pos": "punct"}, +"$": {"pos": "sym", "other": {"symtype": "currency"}}, +"#": {"pos": "sym", "other": {"symtype": "numbersign"}}, +"AFX": {"pos": "adj", "hyph": "hyph"}, +"CC": {"pos": "conj", "conjtype": "coor"}, +"CD": {"pos": "num", "numtype": "card"}, +"DT": {"pos": "adj", "prontype": "prn"}, +"EX": {"pos": "adv", "advtype": "ex"}, +"FW": {"pos": "x", "foreign": "foreign"}, +"HYPH": {"pos": "punct", "puncttype": "dash"}, +"IN": {"pos": "adp"}, +"JJ": {"pos": "adj", "degree": "pos"}, +"JJR": {"pos": "adj", "degree": "comp"}, +"JJS": {"pos": "adj", "degree": "sup"}, +"LS": {"pos": "punct", "numtype": "ord"}, +"MD": {"pos": "verb", "verbtype": "mod"}, +"NIL": {"pos": "no_tag"}, +"NN": {"pos": "noun", "number": "sing"}, +"NNP": {"pos": "noun", "nountype": "prop", "number": "sing"}, +"NNPS": {"pos": "noun", "nountype": "prop", "number": "plur"}, +"NNS": {"pos": "noun", "number": "plur"}, +"PDT": {"pos": "adj", "adjtype": "pdt", "prontype": "prn"}, +"POS": {"pos": "part", "poss": "poss"}, +"PRP": {"pos": "noun", "prontype": "prs"}, +"PRP$": {"pos": "adj", "prontype": "prs", "poss": "poss"}, +"RB": {"pos": "adv", "degree": "pos"}, +"RBR": {"pos": "adv", "degree": "comp"}, +"RBS": {"pos": "adv", "degree": "sup"}, +"RP": {"pos": "part"}, +"SYM": {"pos": "sym"}, +"TO": {"pos": "part", "parttype": "inf", "verbform": "inf"}, +"UH": {"pos": "intJ"}, +"VB": {"pos": 
"verb", "verbform": "inf"}, +"VBD": {"pos": "verb", "verbform": "fin", "tense": "past"}, +"VBG": {"pos": "verb", "verbform": "part", "tense": "pres", "aspect": "prog"}, +"VBN": {"pos": "verb", "verbform": "part", "tense": "past", "aspect": "perf"}, +"VBP": {"pos": "verb", "verbform": "fin", "tense": "pres"}, +"VBZ": {"pos": "verb", "verbform": "fin", "tense": "pres", "number": "sing", "person": 3}, +"WDT": {"pos": "adj", "prontype": "int|rel"}, +"WP": {"pos": "noun", "prontype": "int|rel"}, +"WP$": {"pos": "adj", "poss": "poss", "prontype": "int|rel"}, +"WRB": {"pos": "adv", "prontype": "int|rel"}, +"SP": {"pos": "space"}, +"ADD": {"pos": "x"}, +"NFP": {"pos": "punct"}, +"GW": {"pos": "x"}, +"AFX": {"pos": "x"}, +"HYPH": {"pos": "punct"}, +"XX": {"pos": "x"}, +"BES": {"pos": "verb"}, +"HVS": {"pos": "verb"} +} diff --git a/lang_data/fi/infix.txt b/lang_data/fi/infix.txt new file mode 100644 index 000000000..37eca7350 --- /dev/null +++ b/lang_data/fi/infix.txt @@ -0,0 +1,3 @@ +\.\.\. +(?<=[a-z])\.(?=[A-Z]) +(?<=[a-zA-Z])-(?=[a-zA-Z]) diff --git a/lang_data/fi/lemma_rules.json b/lang_data/fi/lemma_rules.json new file mode 100644 index 000000000..0967ef424 --- /dev/null +++ b/lang_data/fi/lemma_rules.json @@ -0,0 +1 @@ +{} diff --git a/lang_data/fi/morphs.json b/lang_data/fi/morphs.json new file mode 100644 index 000000000..e69de29bb diff --git a/lang_data/fi/prefix.txt b/lang_data/fi/prefix.txt new file mode 100644 index 000000000..48c4fc549 --- /dev/null +++ b/lang_data/fi/prefix.txt @@ -0,0 +1,21 @@ +, +" +( +[ +{ +* +< +$ +£ +“ +' +`` +` +# +US$ +C$ +A$ +a- +‘ +.... +... diff --git a/lang_data/fi/sample.txt b/lang_data/fi/sample.txt new file mode 100644 index 000000000..12c0bb787 --- /dev/null +++ b/lang_data/fi/sample.txt @@ -0,0 +1,3 @@ +Biografie: Ein Spiel ist ein Theaterstück des Schweizer Schriftstellers Max Frisch, das 1967 entstand und am 1. Februar 1968 im Schauspielhaus Zürich uraufgeführt wurde. 1984 legte Frisch eine überarbeitete Neufassung vor. 
Das von Frisch als Komödie bezeichnete Stück greift eines seiner zentralen Themen auf: die Möglichkeit oder Unmöglichkeit des Menschen, seine Identität zu verändern. + +Mit Biografie: Ein Spiel wandte sich Frisch von der Parabelform seiner Erfolgsstücke Biedermann und die Brandstifter und Andorra ab und postulierte eine „Dramaturgie der Permutation“. Darin sollte nicht, wie im klassischen Theater, Sinn und Schicksal im Mittelpunkt stehen, sondern die Zufälligkeit von Ereignissen und die Möglichkeit ihrer Variation. Dennoch handelt Biografie: Ein Spiel gerade von der Unmöglichkeit seines Protagonisten, seinen Lebenslauf grundlegend zu verändern. Frisch empfand die Wirkung des Stücks im Nachhinein als zu fatalistisch und die Umsetzung seiner theoretischen Absichten als nicht geglückt. Obwohl das Stück 1968 als unpolitisch und nicht zeitgemäß kritisiert wurde und auch später eine geteilte Rezeption erfuhr, gehört es an deutschsprachigen Bühnen zu den häufiger aufgeführten Stücken Frischs. diff --git a/lang_data/fi/specials.json b/lang_data/fi/specials.json new file mode 100644 index 000000000..0e0986339 --- /dev/null +++ b/lang_data/fi/specials.json @@ -0,0 +1,149 @@ +{ +"a.m.": [{"F": "a.m."}], +"p.m.": [{"F": "p.m."}], + +"1a.m.": [{"F": "1"}, {"F": "a.m."}], +"2a.m.": [{"F": "2"}, {"F": "a.m."}], +"3a.m.": [{"F": "3"}, {"F": "a.m."}], +"4a.m.": [{"F": "4"}, {"F": "a.m."}], +"5a.m.": [{"F": "5"}, {"F": "a.m."}], +"6a.m.": [{"F": "6"}, {"F": "a.m."}], +"7a.m.": [{"F": "7"}, {"F": "a.m."}], +"8a.m.": [{"F": "8"}, {"F": "a.m."}], +"9a.m.": [{"F": "9"}, {"F": "a.m."}], +"10a.m.": [{"F": "10"}, {"F": "a.m."}], +"11a.m.": [{"F": "11"}, {"F": "a.m."}], +"12a.m.": [{"F": "12"}, {"F": "a.m."}], +"1am": [{"F": "1"}, {"F": "am", "L": "a.m."}], +"2am": [{"F": "2"}, {"F": "am", "L": "a.m."}], +"3am": [{"F": "3"}, {"F": "am", "L": "a.m."}], +"4am": [{"F": "4"}, {"F": "am", "L": "a.m."}], +"5am": [{"F": "5"}, {"F": "am", "L": "a.m."}], +"6am": [{"F": "6"}, {"F": "am", "L": 
"a.m."}], +"7am": [{"F": "7"}, {"F": "am", "L": "a.m."}], +"8am": [{"F": "8"}, {"F": "am", "L": "a.m."}], +"9am": [{"F": "9"}, {"F": "am", "L": "a.m."}], +"10am": [{"F": "10"}, {"F": "am", "L": "a.m."}], +"11am": [{"F": "11"}, {"F": "am", "L": "a.m."}], +"12am": [{"F": "12"}, {"F": "am", "L": "a.m."}], + + +"1p.m.": [{"F": "1"}, {"F": "p.m."}], +"2p.m.": [{"F": "2"}, {"F": "p.m."}], +"3p.m.": [{"F": "3"}, {"F": "p.m."}], +"4p.m.": [{"F": "4"}, {"F": "p.m."}], +"5p.m.": [{"F": "5"}, {"F": "p.m."}], +"6p.m.": [{"F": "6"}, {"F": "p.m."}], +"7p.m.": [{"F": "7"}, {"F": "p.m."}], +"8p.m.": [{"F": "8"}, {"F": "p.m."}], +"9p.m.": [{"F": "9"}, {"F": "p.m."}], +"10p.m.": [{"F": "10"}, {"F": "p.m."}], +"11p.m.": [{"F": "11"}, {"F": "p.m."}], +"12p.m.": [{"F": "12"}, {"F": "p.m."}], +"1pm": [{"F": "1"}, {"F": "pm", "L": "p.m."}], +"2pm": [{"F": "2"}, {"F": "pm", "L": "p.m."}], +"3pm": [{"F": "3"}, {"F": "pm", "L": "p.m."}], +"4pm": [{"F": "4"}, {"F": "pm", "L": "p.m."}], +"5pm": [{"F": "5"}, {"F": "pm", "L": "p.m."}], +"6pm": [{"F": "6"}, {"F": "pm", "L": "p.m."}], +"7pm": [{"F": "7"}, {"F": "pm", "L": "p.m."}], +"8pm": [{"F": "8"}, {"F": "pm", "L": "p.m."}], +"9pm": [{"F": "9"}, {"F": "pm", "L": "p.m."}], +"10pm": [{"F": "10"}, {"F": "pm", "L": "p.m."}], +"11pm": [{"F": "11"}, {"F": "pm", "L": "p.m."}], +"12pm": [{"F": "12"}, {"F": "pm", "L": "p.m."}], + +"Jan.": [{"F": "Jan.", "L": "Januar"}], +"Feb.": [{"F": "Feb.", "L": "Februar"}], +"Mär.": [{"F": "Mär.", "L": "März"}], +"Apr.": [{"F": "Apr.", "L": "April"}], +"Mai.": [{"F": "Mai.", "L": "Mai"}], +"Jun.": [{"F": "Jun.", "L": "Juni"}], +"Jul.": [{"F": "Jul.", "L": "Juli"}], +"Aug.": [{"F": "Aug.", "L": "August"}], +"Sep.": [{"F": "Sep.", "L": "September"}], +"Sept.": [{"F": "Sept.", "L": "September"}], +"Okt.": [{"F": "Okt.", "L": "Oktober"}], +"Nov.": [{"F": "Nov.", "L": "November"}], +"Dez.": [{"F": "Dez.", "L": "Dezember"}], + +":)": [{"F": ":)"}], +"<3": [{"F": "<3"}], +";)": [{"F": ";)"}], +"(:": [{"F": "(:"}], +":(": 
[{"F": ":("}], +"-_-": [{"F": "-_-"}], +"=)": [{"F": "=)"}], +":/": [{"F": ":/"}], +":>": [{"F": ":>"}], +";-)": [{"F": ";-)"}], +":Y": [{"F": ":Y"}], +":P": [{"F": ":P"}], +":-P": [{"F": ":-P"}], +":3": [{"F": ":3"}], +"=3": [{"F": "=3"}], +"xD": [{"F": "xD"}], +"^_^": [{"F": "^_^"}], +"=]": [{"F": "=]"}], +"=D": [{"F": "=D"}], +"<333": [{"F": "<333"}], +":))": [{"F": ":))"}], +":0": [{"F": ":0"}], +"-__-": [{"F": "-__-"}], +"xDD": [{"F": "xDD"}], +"o_o": [{"F": "o_o"}], +"o_O": [{"F": "o_O"}], +"V_V": [{"F": "V_V"}], +"=[[": [{"F": "=[["}], +"<33": [{"F": "<33"}], +";p": [{"F": ";p"}], +";D": [{"F": ";D"}], +";-p": [{"F": ";-p"}], +";(": [{"F": ";("}], +":p": [{"F": ":p"}], +":]": [{"F": ":]"}], +":O": [{"F": ":O"}], +":-/": [{"F": ":-/"}], +":-)": [{"F": ":-)"}], +":(((": [{"F": ":((("}], +":((": [{"F": ":(("}], +":')": [{"F": ":')"}], +"(^_^)": [{"F": "(^_^)"}], +"(=": [{"F": "(="}], +"o.O": [{"F": "o.O"}], +"\")": [{"F": "\")"}], +"a.": [{"F": "a."}], +"b.": [{"F": "b."}], +"c.": [{"F": "c."}], +"d.": [{"F": "d."}], +"e.": [{"F": "e."}], +"f.": [{"F": "f."}], +"g.": [{"F": "g."}], +"h.": [{"F": "h."}], +"i.": [{"F": "i."}], +"j.": [{"F": "j."}], +"k.": [{"F": "k."}], +"l.": [{"F": "l."}], +"m.": [{"F": "m."}], +"n.": [{"F": "n."}], +"o.": [{"F": "o."}], +"p.": [{"F": "p."}], +"q.": [{"F": "q."}], +"s.": [{"F": "s."}], +"t.": [{"F": "t."}], +"u.": [{"F": "u."}], +"v.": [{"F": "v."}], +"w.": [{"F": "w."}], +"x.": [{"F": "x."}], +"y.": [{"F": "y."}], +"z.": [{"F": "z."}], + +"z.b.": [{"F": "z.b."}], +"e.h.": [{"F": "e.h."}], +"o.ä.": [{"F": "o.ä."}], +"bzw.": [{"F": "bzw."}], +"usw.": [{"F": "usw."}], +"\n": [{"F": "\n", "pos": "SP"}], +"\t": [{"F": "\t", "pos": "SP"}], +" ": [{"F": " ", "pos": "SP"}] +} diff --git a/lang_data/fi/suffix.txt b/lang_data/fi/suffix.txt new file mode 100644 index 000000000..d8c6bc2c2 --- /dev/null +++ b/lang_data/fi/suffix.txt @@ -0,0 +1,26 @@ +, +\" +\) +\] +\} +\* +\! +\? +% +\$ +> +: +; +' +” +'' +'s +'S +’s +’S +’ +\.\. +\.\.\. 
+\.\.\.\. +(?<=[a-z0-9)\]"'%\)])\. +(?<=[0-9])km diff --git a/lang_data/fi/tag_map.json b/lang_data/fi/tag_map.json new file mode 100644 index 000000000..6b21a1e29 --- /dev/null +++ b/lang_data/fi/tag_map.json @@ -0,0 +1,17 @@ +{ + "NOUN": {"pos": "NOUN"}, + "VERB": {"pos": "VERB"}, + "PUNCT": {"pos": "PUNCT"}, + "ADV": {"pos": "ADV"}, + "ADJ": {"pos": "ADJ"}, + "PRON": {"pos": "PRON"}, + "PROPN": {"pos": "PROPN"}, + "CONJ": {"pos": "CONJ"}, + "NUM": {"pos": "NUM"}, + "AUX": {"pos": "AUX"}, + "SCONJ": {"pos": "SCONJ"}, + "ADP": {"pos": "ADP"}, + "SYM": {"pos": "SYM"}, + "X": {"pos": "X"}, + "INTJ": {"pos": "INTJ"} +} diff --git a/lang_data/it/infix.txt b/lang_data/it/infix.txt new file mode 100644 index 000000000..37eca7350 --- /dev/null +++ b/lang_data/it/infix.txt @@ -0,0 +1,3 @@ +\.\.\. +(?<=[a-z])\.(?=[A-Z]) +(?<=[a-zA-Z])-(?=[a-zA-Z]) diff --git a/lang_data/it/morphs.json b/lang_data/it/morphs.json new file mode 100644 index 000000000..e69de29bb diff --git a/lang_data/it/prefix.txt b/lang_data/it/prefix.txt new file mode 100644 index 000000000..48c4fc549 --- /dev/null +++ b/lang_data/it/prefix.txt @@ -0,0 +1,21 @@ +, +" +( +[ +{ +* +< +$ +£ +“ +' +`` +` +# +US$ +C$ +A$ +a- +‘ +.... +... 
diff --git a/lang_data/it/specials.json b/lang_data/it/specials.json new file mode 100644 index 000000000..0e0986339 --- /dev/null +++ b/lang_data/it/specials.json @@ -0,0 +1,149 @@ +{ +"a.m.": [{"F": "a.m."}], +"p.m.": [{"F": "p.m."}], + +"1a.m.": [{"F": "1"}, {"F": "a.m."}], +"2a.m.": [{"F": "2"}, {"F": "a.m."}], +"3a.m.": [{"F": "3"}, {"F": "a.m."}], +"4a.m.": [{"F": "4"}, {"F": "a.m."}], +"5a.m.": [{"F": "5"}, {"F": "a.m."}], +"6a.m.": [{"F": "6"}, {"F": "a.m."}], +"7a.m.": [{"F": "7"}, {"F": "a.m."}], +"8a.m.": [{"F": "8"}, {"F": "a.m."}], +"9a.m.": [{"F": "9"}, {"F": "a.m."}], +"10a.m.": [{"F": "10"}, {"F": "a.m."}], +"11a.m.": [{"F": "11"}, {"F": "a.m."}], +"12a.m.": [{"F": "12"}, {"F": "a.m."}], +"1am": [{"F": "1"}, {"F": "am", "L": "a.m."}], +"2am": [{"F": "2"}, {"F": "am", "L": "a.m."}], +"3am": [{"F": "3"}, {"F": "am", "L": "a.m."}], +"4am": [{"F": "4"}, {"F": "am", "L": "a.m."}], +"5am": [{"F": "5"}, {"F": "am", "L": "a.m."}], +"6am": [{"F": "6"}, {"F": "am", "L": "a.m."}], +"7am": [{"F": "7"}, {"F": "am", "L": "a.m."}], +"8am": [{"F": "8"}, {"F": "am", "L": "a.m."}], +"9am": [{"F": "9"}, {"F": "am", "L": "a.m."}], +"10am": [{"F": "10"}, {"F": "am", "L": "a.m."}], +"11am": [{"F": "11"}, {"F": "am", "L": "a.m."}], +"12am": [{"F": "12"}, {"F": "am", "L": "a.m."}], + + +"1p.m.": [{"F": "1"}, {"F": "p.m."}], +"2p.m.": [{"F": "2"}, {"F": "p.m."}], +"3p.m.": [{"F": "3"}, {"F": "p.m."}], +"4p.m.": [{"F": "4"}, {"F": "p.m."}], +"5p.m.": [{"F": "5"}, {"F": "p.m."}], +"6p.m.": [{"F": "6"}, {"F": "p.m."}], +"7p.m.": [{"F": "7"}, {"F": "p.m."}], +"8p.m.": [{"F": "8"}, {"F": "p.m."}], +"9p.m.": [{"F": "9"}, {"F": "p.m."}], +"10p.m.": [{"F": "10"}, {"F": "p.m."}], +"11p.m.": [{"F": "11"}, {"F": "p.m."}], +"12p.m.": [{"F": "12"}, {"F": "p.m."}], +"1pm": [{"F": "1"}, {"F": "pm", "L": "p.m."}], +"2pm": [{"F": "2"}, {"F": "pm", "L": "p.m."}], +"3pm": [{"F": "3"}, {"F": "pm", "L": "p.m."}], +"4pm": [{"F": "4"}, {"F": "pm", "L": "p.m."}], +"5pm": [{"F": "5"}, {"F": "pm", 
"L": "p.m."}], +"6pm": [{"F": "6"}, {"F": "pm", "L": "p.m."}], +"7pm": [{"F": "7"}, {"F": "pm", "L": "p.m."}], +"8pm": [{"F": "8"}, {"F": "pm", "L": "p.m."}], +"9pm": [{"F": "9"}, {"F": "pm", "L": "p.m."}], +"10pm": [{"F": "10"}, {"F": "pm", "L": "p.m."}], +"11pm": [{"F": "11"}, {"F": "pm", "L": "p.m."}], +"12pm": [{"F": "12"}, {"F": "pm", "L": "p.m."}], + +"Jan.": [{"F": "Jan.", "L": "Januar"}], +"Feb.": [{"F": "Feb.", "L": "Februar"}], +"Mär.": [{"F": "Mär.", "L": "März"}], +"Apr.": [{"F": "Apr.", "L": "April"}], +"Mai.": [{"F": "Mai.", "L": "Mai"}], +"Jun.": [{"F": "Jun.", "L": "Juni"}], +"Jul.": [{"F": "Jul.", "L": "Juli"}], +"Aug.": [{"F": "Aug.", "L": "August"}], +"Sep.": [{"F": "Sep.", "L": "September"}], +"Sept.": [{"F": "Sept.", "L": "September"}], +"Okt.": [{"F": "Okt.", "L": "Oktober"}], +"Nov.": [{"F": "Nov.", "L": "November"}], +"Dez.": [{"F": "Dez.", "L": "Dezember"}], + +":)": [{"F": ":)"}], +"<3": [{"F": "<3"}], +";)": [{"F": ";)"}], +"(:": [{"F": "(:"}], +":(": [{"F": ":("}], +"-_-": [{"F": "-_-"}], +"=)": [{"F": "=)"}], +":/": [{"F": ":/"}], +":>": [{"F": ":>"}], +";-)": [{"F": ";-)"}], +":Y": [{"F": ":Y"}], +":P": [{"F": ":P"}], +":-P": [{"F": ":-P"}], +":3": [{"F": ":3"}], +"=3": [{"F": "=3"}], +"xD": [{"F": "xD"}], +"^_^": [{"F": "^_^"}], +"=]": [{"F": "=]"}], +"=D": [{"F": "=D"}], +"<333": [{"F": "<333"}], +":))": [{"F": ":))"}], +":0": [{"F": ":0"}], +"-__-": [{"F": "-__-"}], +"xDD": [{"F": "xDD"}], +"o_o": [{"F": "o_o"}], +"o_O": [{"F": "o_O"}], +"V_V": [{"F": "V_V"}], +"=[[": [{"F": "=[["}], +"<33": [{"F": "<33"}], +";p": [{"F": ";p"}], +";D": [{"F": ";D"}], +";-p": [{"F": ";-p"}], +";(": [{"F": ";("}], +":p": [{"F": ":p"}], +":]": [{"F": ":]"}], +":O": [{"F": ":O"}], +":-/": [{"F": ":-/"}], +":-)": [{"F": ":-)"}], +":(((": [{"F": ":((("}], +":((": [{"F": ":(("}], +":')": [{"F": ":')"}], +"(^_^)": [{"F": "(^_^)"}], +"(=": [{"F": "(="}], +"o.O": [{"F": "o.O"}], +"\")": [{"F": "\")"}], +"a.": [{"F": "a."}], +"b.": [{"F": "b."}], +"c.": [{"F": 
"c."}], +"d.": [{"F": "d."}], +"e.": [{"F": "e."}], +"f.": [{"F": "f."}], +"g.": [{"F": "g."}], +"h.": [{"F": "h."}], +"i.": [{"F": "i."}], +"j.": [{"F": "j."}], +"k.": [{"F": "k."}], +"l.": [{"F": "l."}], +"m.": [{"F": "m."}], +"n.": [{"F": "n."}], +"o.": [{"F": "o."}], +"p.": [{"F": "p."}], +"q.": [{"F": "q."}], +"s.": [{"F": "s."}], +"t.": [{"F": "t."}], +"u.": [{"F": "u."}], +"v.": [{"F": "v."}], +"w.": [{"F": "w."}], +"x.": [{"F": "x."}], +"y.": [{"F": "y."}], +"z.": [{"F": "z."}], + +"z.b.": [{"F": "z.b."}], +"e.h.": [{"F": "e.h."}], +"o.ä.": [{"F": "o.ä."}], +"bzw.": [{"F": "bzw."}], +"usw.": [{"F": "usw."}], +"\n": [{"F": "\n", "pos": "SP"}], +"\t": [{"F": "\t", "pos": "SP"}], +" ": [{"F": " ", "pos": "SP"}] +} diff --git a/lang_data/it/suffix.txt b/lang_data/it/suffix.txt new file mode 100644 index 000000000..d8c6bc2c2 --- /dev/null +++ b/lang_data/it/suffix.txt @@ -0,0 +1,26 @@ +, +\" +\) +\] +\} +\* +\! +\? +% +\$ +> +: +; +' +” +'' +'s +'S +’s +’S +’ +\.\. +\.\.\. +\.\.\.\. +(?<=[a-z0-9)\]"'%\)])\. 
+(?<=[0-9])km diff --git a/lang_data/it/tag_map.json b/lang_data/it/tag_map.json new file mode 100644 index 000000000..514e978a6 --- /dev/null +++ b/lang_data/it/tag_map.json @@ -0,0 +1,44 @@ +{ +"S": {"pos": "NOUN"}, +"E": {"pos": "ADP"}, +"RD": {"pos": "DET"}, +"V": {"pos": "VER"}, +"_": {"pos": "_"}, +"A": {"pos": "ADJ"}, +"SP": {"pos": "PROP"}, +"FF": {"pos": "PUNC"}, +"FS": {"pos": "PUNC"}, +"B": {"pos": "ADV"}, +"CC": {"pos": "CON"}, +"FB": {"pos": "PUNC"}, +"VA": {"pos": "AUX"}, +"PC": {"pos": "PRO"}, +"N": {"pos": "NUM"}, +"RI": {"pos": "DET"}, +"PR": {"pos": "PRO"}, +"CS": {"pos": "SCON"}, +"BN": {"pos": "ADV"}, +"AP": {"pos": "DET"}, +"VM": {"pos": "AUX"}, +"DI": {"pos": "DET"}, +"FC": {"pos": "PUNC"}, +"PI": {"pos": "PRO"}, +"DD": {"pos": "DET"}, +"DQ": {"pos": "DET"}, +"PQ": {"pos": "PRO"}, +"PD": {"pos": "PRO"}, +"NO": {"pos": "ADJ"}, +"PE": {"pos": "PRO"}, +"T": {"pos": "DET"}, +"X": {"pos": "SYM"}, +"SW": {"pos": "X"}, +"NO": {"pos": "PRO"}, +"I": {"pos": "INT"}, +"X": {"pos": "X"}, +"DR": {"pos": "DET"}, +"EA": {"pos": "ADP"}, +"PP": {"pos": "PRO"}, +"X": {"pos": "NUM"}, +"DE": {"pos": "DET"}, +"X": {"pos": "PAR"} +} diff --git a/setup.py b/setup.py index 3617b66dd..fe55d0d5a 100644 --- a/setup.py +++ b/setup.py @@ -153,7 +153,7 @@ def main(modules, is_pypy): MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings', 'spacy.lexeme', 'spacy.vocab', 'spacy.attrs', - 'spacy.morphology', + 'spacy.morphology', 'spacy.tagger', 'spacy.syntax.stateclass', 'spacy._ml', 'spacy._theano', 'spacy.tokenizer', 'spacy.en.attrs', @@ -164,7 +164,7 @@ MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings', 'spacy.gold', 'spacy.orth', 'spacy.tokens.doc', 'spacy.tokens.spans', 'spacy.tokens.token', 'spacy.serialize.packer', 'spacy.serialize.huffman', 'spacy.serialize.bits', - 'spacy.cfile', + 'spacy.cfile', 'spacy.matcher', 'spacy.syntax.ner'] diff --git a/spacy/_ml.pyx b/spacy/_ml.pyx index 18908e89e..56c080fa6 100644 --- a/spacy/_ml.pyx +++ b/spacy/_ml.pyx @@ -91,6 +91,8 
@@ cdef class Model: count_feats(counts[guess], feats, n_feats, -cost) self._model.update(counts) - def end_training(self): + def end_training(self, model_loc=None): + if model_loc is None: + model_loc = self.model_loc self._model.end_training() - self._model.dump(self.model_loc, freq_thresh=0) + self._model.dump(model_loc, freq_thresh=0) diff --git a/spacy/attrs.pxd b/spacy/attrs.pxd index 2c3e2849d..c810762ef 100644 --- a/spacy/attrs.pxd +++ b/spacy/attrs.pxd @@ -14,7 +14,7 @@ cpdef enum attr_id_t: IS_STOP IS_OOV - FLAG13 + FLAG13 = 13 FLAG14 FLAG15 FLAG16 @@ -84,3 +84,4 @@ cpdef enum attr_id_t: ENT_TYPE HEAD SPACY + PROB diff --git a/spacy/en/__init__.py b/spacy/en/__init__.py index 5bf83a253..f68ff196e 100644 --- a/spacy/en/__init__.py +++ b/spacy/en/__init__.py @@ -1,174 +1,12 @@ -from __future__ import unicode_literals +from __future__ import unicode_literals, print_function + from os import path -import re -import struct -import json -from .. import orth -from ..vocab import Vocab -from ..tokenizer import Tokenizer -from ..syntax.arc_eager import ArcEager -from ..syntax.ner import BiluoPushDown -from ..syntax.parser import ParserFactory -from ..serialize.bits import BitArray +from ..language import Language -from ..tokens import Doc -from ..multi_words import RegexMerger - -from .pos import EnPosTagger -from .pos import POS_TAGS -from .attrs import get_flags -from . import regexes - -from ..util import read_lang_data - -from ..attrs import TAG, HEAD, DEP, ENT_TYPE, ENT_IOB - - -def get_lex_props(string, oov_prob=-30, is_oov=False): - return { - 'flags': get_flags(string, is_oov=is_oov), - 'length': len(string), - 'orth': string, - 'lower': string.lower(), - 'norm': string, - 'shape': orth.word_shape(string), - 'prefix': string[0], - 'suffix': string[-3:], - 'cluster': 0, - 'prob': oov_prob, - 'sentiment': 0 - } - -if_model_present = -1 LOCAL_DATA_DIR = path.join(path.dirname(__file__), 'data') - -class English(object): - """The English NLP pipeline. 
- - Example: - - Load data from default directory: - - >>> nlp = English() - >>> nlp = English(data_dir=u'') - - Load data from specified directory: - - >>> nlp = English(data_dir=u'path/to/data_directory') - - Disable (and avoid loading) parts of the processing pipeline: - - >>> nlp = English(vectors=False, parser=False, tagger=False, entity=False) - - Start with nothing loaded: - - >>> nlp = English(data_dir=None) - """ - ParserTransitionSystem = ArcEager - EntityTransitionSystem = BiluoPushDown - - def __init__(self, - data_dir=LOCAL_DATA_DIR, - Tokenizer=Tokenizer.from_dir, - Tagger=EnPosTagger, - Parser=ParserFactory(ParserTransitionSystem), - Entity=ParserFactory(EntityTransitionSystem), - Packer=None, - load_vectors=True - ): - - self.data_dir = data_dir - - if path.exists(path.join(data_dir, 'vocab', 'oov_prob')): - oov_prob = float(open(path.join(data_dir, 'vocab', 'oov_prob')).read()) - else: - oov_prob = None - - self.vocab = Vocab(data_dir=path.join(data_dir, 'vocab') if data_dir else None, - get_lex_props=get_lex_props, load_vectors=load_vectors, - pos_tags=POS_TAGS, - oov_prob=oov_prob) - if Tagger is True: - Tagger = EnPosTagger - if Parser is True: - transition_system = self.ParserTransitionSystem - Parser = lambda s, d: parser.Parser(s, d, transition_system) - if Entity is True: - transition_system = self.EntityTransitionSystem - Entity = lambda s, d: parser.Parser(s, d, transition_system) - - self.tokenizer = Tokenizer(self.vocab, path.join(data_dir, 'tokenizer')) - - if Tagger and path.exists(path.join(data_dir, 'pos')): - self.tagger = Tagger(self.vocab.strings, data_dir) - else: - self.tagger = None - if Parser and path.exists(path.join(data_dir, 'deps')): - self.parser = Parser(self.vocab.strings, path.join(data_dir, 'deps')) - else: - self.parser = None - if Entity and path.exists(path.join(data_dir, 'ner')): - self.entity = Entity(self.vocab.strings, path.join(data_dir, 'ner')) - else: - self.entity = None - if Packer: - self.packer = 
Packer(self.vocab, data_dir) - else: - self.packer = None - self.mwe_merger = RegexMerger([ - ('IN', 'O', regexes.MW_PREPOSITIONS_RE), - ('CD', 'TIME', regexes.TIME_RE), - ('NNP', 'DATE', regexes.DAYS_RE), - ('CD', 'MONEY', regexes.MONEY_RE)]) - - def __call__(self, text, tag=True, parse=True, entity=True, merge_mwes=False): - """Apply the pipeline to some text. The text can span multiple sentences, - and can contain arbtrary whitespace. Alignment into the original string - is preserved. - - Args: - text (unicode): The text to be processed. - - Returns: - tokens (spacy.tokens.Doc): - - >>> from spacy.en import English - >>> nlp = English() - >>> tokens = nlp('An example sentence. Another example sentence.') - >>> tokens[0].orth_, tokens[0].head.tag_ - ('An', 'NN') - """ - tokens = self.tokenizer(text) - if self.tagger and tag: - self.tagger(tokens) - if self.parser and parse: - self.parser(tokens) - if self.entity and entity: - self.entity(tokens) - if merge_mwes and self.mwe_merger is not None: - self.mwe_merger(tokens) - return tokens - - def end_training(self, data_dir=None): - if data_dir is None: - data_dir = self.data_dir - self.parser.model.end_training() - self.entity.model.end_training() - self.tagger.model.end_training() - self.vocab.strings.dump(path.join(data_dir, 'vocab', 'strings.txt')) - - with open(path.join(data_dir, 'vocab', 'serializer.json'), 'w') as file_: - file_.write( - json.dumps([ - (TAG, list(self.tagger.freqs[TAG].items())), - (DEP, list(self.parser.moves.freqs[DEP].items())), - (ENT_IOB, list(self.entity.moves.freqs[ENT_IOB].items())), - (ENT_TYPE, list(self.entity.moves.freqs[ENT_TYPE].items())), - (HEAD, list(self.parser.moves.freqs[HEAD].items()))])) - - @property - def tags(self): - """Deprecated. 
List of part-of-speech tag names.""" - return self.tagger.tag_names +class English(Language): + @classmethod + def default_data_dir(cls): + return LOCAL_DATA_DIR diff --git a/spacy/en/lemmatizer.py b/spacy/en/lemmatizer.py deleted file mode 100644 index 5883e12c8..000000000 --- a/spacy/en/lemmatizer.py +++ /dev/null @@ -1,105 +0,0 @@ -from __future__ import unicode_literals -from os import path -import codecs - - -NOUN_RULES = ( - ('s', ''), - ('ses', 's'), - ('ves', 'f'), - ('xes', 'x'), - ('zes', 'z'), - ('ches', 'ch'), - ('shes', 'sh'), - ('men', 'man'), - ('ies', 'y') -) - - -VERB_RULES = ( - ("s", ""), - ("ies", "y"), - ("es", "e"), - ("es", ""), - ("ed", "e"), - ("ed", ""), - ("ing", "e"), - ("ing", "") -) - - -ADJ_RULES = ( - ("er", ""), - ("est", ""), - ("er", "e"), - ("est", "e") -) - - -class Lemmatizer(object): - def __init__(self, wn_dict_dir, noun_id, verb_id, adj_id): - self.noun_id = noun_id - self.verb_id = verb_id - self.adj_id = adj_id - self.index = {} - self.exc = {} - for pos in ['adj', 'adv', 'noun', 'verb']: - self.index[pos] = read_index(path.join(wn_dict_dir, 'index.%s' % pos)) - self.exc[pos] = read_exc(path.join(wn_dict_dir, '%s.exc' % pos)) - - def __call__(self, string, pos): - if pos == self.noun_id: - return self.noun(string) - elif pos == self.verb_id: - return self.verb(string) - elif pos == self.adj_id: - return self.adj(string) - else: - raise Exception("Cannot lemmatize with unknown pos: %s" % pos) - - def noun(self, string): - return lemmatize(string, self.index['noun'], self.exc['noun'], NOUN_RULES) - - def verb(self, string): - return lemmatize(string, self.index['verb'], self.exc['verb'], VERB_RULES) - - def adj(self, string): - return lemmatize(string, self.index['adj'], self.exc['adj'], ADJ_RULES) - - -def lemmatize(string, index, exceptions, rules): - string = string.lower() - forms = [] - if string in index: - forms.append(string) - forms.extend(exceptions.get(string, [])) - for old, new in rules: - if 
string.endswith(old): - form = string[:len(string) - len(old)] + new - if form in index: - forms.append(form) - if not forms: - forms.append(string) - return set(forms) - - -def read_index(loc): - index = set() - for line in codecs.open(loc, 'r', 'utf8'): - if line.startswith(' '): - continue - pieces = line.split() - word = pieces[0] - if word.count('_') == 0: - index.add(word) - return index - - -def read_exc(loc): - exceptions = {} - for line in codecs.open(loc, 'r', 'utf8'): - if line.startswith(' '): - continue - pieces = line.split() - exceptions[pieces[0]] = tuple(pieces[1:]) - return exceptions diff --git a/spacy/en/pos.pxd b/spacy/en/pos.pxd index 2fc7b4ac7..213752cf5 100644 --- a/spacy/en/pos.pxd +++ b/spacy/en/pos.pxd @@ -1,26 +1,5 @@ -from preshed.maps cimport PreshMapArray -from preshed.counter cimport PreshCounter -from cymem.cymem cimport Pool - -from .._ml cimport Model -from ..strings cimport StringStore -from ..structs cimport TokenC, LexemeC, Morphology, PosTag -from ..parts_of_speech cimport univ_pos_t -from .lemmatizer import Lemmatizer +from ..tagger cimport Tagger -cdef class EnPosTagger: - cdef readonly Pool mem - cdef readonly StringStore strings - cdef readonly Model model - cdef public object lemmatizer - cdef PreshMapArray _morph_cache - cdef public dict freqs - - cdef PosTag* tags - cdef readonly object tag_names - cdef readonly object tag_map - cdef readonly int n_tags - - cdef int set_morph(self, const int i, const PosTag* tag, TokenC* tokens) except -1 - cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1 +cdef class EnPosTagger(Tagger): + pass diff --git a/spacy/en/pos.pyx b/spacy/en/pos.pyx index 569b209fc..8e034eadf 100644 --- a/spacy/en/pos.pyx +++ b/spacy/en/pos.pyx @@ -1,389 +1,11 @@ from os import path -import json -import os -import shutil -from libc.string cimport memset +from ..parts_of_speech cimport NOUN, VERB, ADJ -from cymem.cymem cimport Address -from thinc.typedefs cimport atom_t, weight_t 
-from collections import defaultdict - -from ..parts_of_speech cimport univ_pos_t -from ..parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON - -from ..parts_of_speech cimport PRT, VERB, X, PUNCT, EOL, SPACE -from ..structs cimport TokenC, Morphology, LexemeC -from ..tokens.doc cimport Doc -from ..morphology cimport set_morph_from_dict -from .._ml cimport arg_max - -from .attrs cimport TAG, IS_ALPHA, IS_PUNCT, LIKE_NUM, LIKE_URL -from ..typedefs cimport attr_t - -from .lemmatizer import Lemmatizer +from ..lemmatizer import Lemmatizer -cpdef enum en_person_t: - NO_PERSON - FIRST - SECOND - THIRD - NON_THIRD - - -cpdef enum en_number_t: - NO_NUMBER - SINGULAR - PLURAL - MASS - - -cpdef enum en_gender_t: - NO_GENDER - MASCULINE - FEMININE - NEUTER - - -cpdef enum en_case_t: - NO_CASE - NOMINATIVE - GENITIVE - ACCUSATIVE - REFLEXIVE - DEMONYM - - -cpdef enum en_tenspect_t: - NO_TENSE - BASE_VERB - PRESENT - PAST - PASSIVE - ING - MODAL - - -cpdef enum misc_t: - NO_MISC - COMPARATIVE - SUPERLATIVE - RELATIVE - NAME - - -cpdef enum: - P2_orth - P2_cluster - P2_shape - P2_prefix - P2_suffix - P2_pos - P2_lemma - P2_flags - - P1_orth - P1_cluster - P1_shape - P1_prefix - P1_suffix - P1_pos - P1_lemma - P1_flags - - W_orth - W_cluster - W_shape - W_prefix - W_suffix - W_pos - W_lemma - W_flags - - N1_orth - N1_cluster - N1_shape - N1_prefix - N1_suffix - N1_pos - N1_lemma - N1_flags - - N2_orth - N2_cluster - N2_shape - N2_prefix - N2_suffix - N2_pos - N2_lemma - N2_flags - - N_CONTEXT_FIELDS - - -POS_TAGS = { - 'NULL': (NO_TAG, {}), - 'EOL': (EOL, {}), - 'CC': (CONJ, {}), - 'CD': (NUM, {}), - 'DT': (DET, {}), - 'EX': (DET, {}), - 'FW': (X, {}), - 'IN': (ADP, {}), - 'JJ': (ADJ, {}), - 'JJR': (ADJ, {'misc': COMPARATIVE}), - 'JJS': (ADJ, {'misc': SUPERLATIVE}), - 'LS': (X, {}), - 'MD': (VERB, {'tenspect': MODAL}), - 'NN': (NOUN, {}), - 'NNS': (NOUN, {'number': PLURAL}), - 'NNP': (NOUN, {'misc': NAME}), - 'NNPS': (NOUN, {'misc': NAME, 'number': PLURAL}), 
- 'PDT': (DET, {}), - 'POS': (PRT, {'case': GENITIVE}), - 'PRP': (PRON, {}), - 'PRP$': (PRON, {'case': GENITIVE}), - 'RB': (ADV, {}), - 'RBR': (ADV, {'misc': COMPARATIVE}), - 'RBS': (ADV, {'misc': SUPERLATIVE}), - 'RP': (PRT, {}), - 'SYM': (X, {}), - 'TO': (PRT, {}), - 'UH': (X, {}), - 'VB': (VERB, {}), - 'VBD': (VERB, {'tenspect': PAST}), - 'VBG': (VERB, {'tenspect': ING}), - 'VBN': (VERB, {'tenspect': PASSIVE}), - 'VBP': (VERB, {'tenspect': PRESENT}), - 'VBZ': (VERB, {'tenspect': PRESENT, 'person': THIRD}), - 'WDT': (DET, {'misc': RELATIVE}), - 'WP': (PRON, {'misc': RELATIVE}), - 'WP$': (PRON, {'misc': RELATIVE, 'case': GENITIVE}), - 'WRB': (ADV, {'misc': RELATIVE}), - '!': (PUNCT, {}), - '#': (PUNCT, {}), - '$': (PUNCT, {}), - "''": (PUNCT, {}), - "(": (PUNCT, {}), - ")": (PUNCT, {}), - "-LRB-": (PUNCT, {}), - "-RRB-": (PUNCT, {}), - ".": (PUNCT, {}), - ",": (PUNCT, {}), - "``": (PUNCT, {}), - ":": (PUNCT, {}), - "?": (PUNCT, {}), - "ADD": (X, {}), - "NFP": (PUNCT, {}), - "GW": (X, {}), - "AFX": (X, {}), - "HYPH": (PUNCT, {}), - "XX": (X, {}), - "BES": (VERB, {'tenspect': PRESENT, 'person': THIRD}), - "HVS": (VERB, {'tenspect': PRESENT, 'person': THIRD}), - "SP": (SPACE, {}) -} - - -POS_TEMPLATES = ( - (W_orth,), - (P1_lemma, P1_pos), - (P2_lemma, P2_pos), - (N1_orth,), - (N2_orth,), - - (W_suffix,), - (W_prefix,), - - (P1_pos,), - (P2_pos,), - (P1_pos, P2_pos), - (P1_pos, W_orth), - (P1_suffix,), - (N1_suffix,), - - (W_shape,), - (W_cluster,), - (N1_cluster,), - (N2_cluster,), - (P1_cluster,), - (P2_cluster,), - - (W_flags,), - (N1_flags,), - (N2_flags,), - (P1_flags,), - (P2_flags,), -) - - -cdef struct _CachedMorph: - Morphology morph - int lemma - - -def setup_model_dir(tag_names, tag_map, templates, model_dir): - if path.exists(model_dir): - shutil.rmtree(model_dir) - os.mkdir(model_dir) - config = { - 'templates': templates, - 'tag_names': tag_names, - 'tag_map': tag_map - } - with open(path.join(model_dir, 'config.json'), 'w') as file_: - 
json.dump(config, file_) - - -cdef class EnPosTagger: +cdef class EnPosTagger(Tagger): """A part-of-speech tagger for English""" - def __init__(self, StringStore strings, data_dir): - self.mem = Pool() - model_dir = path.join(data_dir, 'pos') - self.strings = strings - cfg = json.load(open(path.join(data_dir, 'pos', 'config.json'))) - self.tag_names = sorted(cfg['tag_names']) - assert self.tag_names - self.n_tags = len(self.tag_names) - self.tag_map = cfg['tag_map'] - cdef int n_tags = len(self.tag_names) + 1 - - self.model = Model(n_tags, cfg['templates'], model_dir) - self._morph_cache = PreshMapArray(n_tags) - self.tags = self.mem.alloc(n_tags, sizeof(PosTag)) - for i, tag in enumerate(sorted(self.tag_names)): - pos, props = self.tag_map[tag] - self.tags[i].id = i - self.tags[i].pos = pos - set_morph_from_dict(&self.tags[i].morph, props) - if path.exists(path.join(data_dir, 'tokenizer', 'morphs.json')): - self.load_morph_exceptions(json.load(open(path.join(data_dir, 'tokenizer', - 'morphs.json')))) - self.lemmatizer = Lemmatizer(path.join(data_dir, 'wordnet'), NOUN, VERB, ADJ) - self.freqs = {TAG: defaultdict(int)} - for tag in self.tag_names: - self.freqs[TAG][self.strings[tag]] = 1 - self.freqs[TAG][0] = 1 - - def __call__(self, Doc tokens): - """Apply the tagger, setting the POS tags onto the Doc object. - - Args: - tokens (Doc): The tokens to be tagged. 
- """ - if tokens.length == 0: - return 0 - cdef int i - cdef atom_t[N_CONTEXT_FIELDS] context - cdef const weight_t* scores - for i in range(tokens.length): - if tokens.data[i].pos == 0: - fill_context(context, i, tokens.data) - scores = self.model.score(context) - guess = arg_max(scores, self.model.n_classes) - tokens.data[i].tag = self.strings[self.tag_names[guess]] - self.set_morph(i, &self.tags[guess], tokens.data) - - tokens.is_tagged = True - tokens._py_tokens = [None] * tokens.length - - def tag_from_strings(self, Doc tokens, object tag_strs): - cdef int i - for i in range(tokens.length): - tokens.data[i].tag = self.strings[tag_strs[i]] - self.set_morph(i, &self.tags[self.tag_names.index(tag_strs[i])], - tokens.data) - tokens.is_tagged = True - tokens._py_tokens = [None] * tokens.length - - def train(self, Doc tokens, object gold_tag_strs): - cdef int i - cdef int loss - cdef atom_t[N_CONTEXT_FIELDS] context - cdef const weight_t* scores - golds = [self.tag_names.index(g) if g is not None else -1 - for g in gold_tag_strs] - correct = 0 - for i in range(tokens.length): - fill_context(context, i, tokens.data) - scores = self.model.score(context) - guess = arg_max(scores, self.model.n_classes) - loss = guess != golds[i] if golds[i] != -1 else 0 - self.model.update(context, guess, golds[i], loss) - tokens.data[i].tag = self.strings[self.tag_names[guess]] - self.set_morph(i, &self.tags[guess], tokens.data) - correct += loss == 0 - self.freqs[TAG][tokens.data[i].tag] += 1 - return correct - - cdef int set_morph(self, const int i, const PosTag* tag, TokenC* tokens) except -1: - tokens[i].pos = tag.pos - cached = <_CachedMorph*>self._morph_cache.get(tag.id, tokens[i].lex.orth) - if cached is NULL: - cached = <_CachedMorph*>self.mem.alloc(1, sizeof(_CachedMorph)) - cached.lemma = self.lemmatize(tag.pos, tokens[i].lex) - cached.morph = tag.morph - self._morph_cache.set(tag.id, tokens[i].lex.orth, cached) - tokens[i].lemma = cached.lemma - tokens[i].morph = 
cached.morph - - cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1: - if self.lemmatizer is None: - return lex.orth - cdef unicode py_string = self.strings[lex.orth] - if pos != NOUN and pos != VERB and pos != ADJ: - return lex.orth - cdef set lemma_strings - cdef unicode lemma_string - lemma_strings = self.lemmatizer(py_string, pos) - lemma_string = sorted(lemma_strings)[0] - lemma = self.strings[lemma_string] - return lemma - - def load_morph_exceptions(self, dict exc): - cdef unicode pos_str - cdef unicode form_str - cdef unicode lemma_str - cdef dict entries - cdef dict props - cdef int lemma - cdef attr_t orth - cdef int pos - for pos_str, entries in exc.items(): - pos = self.tag_names.index(pos_str) - for form_str, props in entries.items(): - lemma_str = props.get('L', form_str) - orth = self.strings[form_str] - cached = <_CachedMorph*>self.mem.alloc(1, sizeof(_CachedMorph)) - cached.lemma = self.strings[lemma_str] - set_morph_from_dict(&cached.morph, props) - self._morph_cache.set(pos, orth, cached) - - -cdef int fill_context(atom_t* context, const int i, const TokenC* tokens) except -1: - _fill_from_token(&context[P2_orth], &tokens[i-2]) - _fill_from_token(&context[P1_orth], &tokens[i-1]) - _fill_from_token(&context[W_orth], &tokens[i]) - _fill_from_token(&context[N1_orth], &tokens[i+1]) - _fill_from_token(&context[N2_orth], &tokens[i+2]) - - -cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil: - context[0] = t.lex.lower - context[1] = t.lex.cluster - context[2] = t.lex.shape - context[3] = t.lex.prefix - context[4] = t.lex.suffix - context[5] = t.tag - context[6] = t.lemma - if t.lex.flags & (1 << IS_ALPHA): - context[7] = 1 - elif t.lex.flags & (1 << IS_PUNCT): - context[7] = 2 - elif t.lex.flags & (1 << LIKE_URL): - context[7] = 3 - elif t.lex.flags & (1 << LIKE_NUM): - context[7] = 4 - else: - context[7] = 0 + def make_lemmatizer(self, data_dir): + return Lemmatizer(path.join(data_dir, 'wordnet'), NOUN, 
VERB, ADJ) diff --git a/spacy/fi/__init__.py b/spacy/fi/__init__.py new file mode 100644 index 000000000..8e7173767 --- /dev/null +++ b/spacy/fi/__init__.py @@ -0,0 +1,11 @@ +from __future__ import unicode_literals, print_function + +from os import path + +from ..language import Language + + +class Finnish(Language): + @classmethod + def default_data_dir(cls): + return path.join(path.dirname(__file__), 'data') diff --git a/spacy/language.py b/spacy/language.py new file mode 100644 index 000000000..881df7d1a --- /dev/null +++ b/spacy/language.py @@ -0,0 +1,252 @@ +from os import path + +try: + import ujson as json +except ImportError: + import json + +from .tokenizer import Tokenizer +from .vocab import Vocab +from .syntax.parser import Parser +from .tagger import Tagger +from .matcher import Matcher +from .serialize.packer import Packer +from ._ml import Model +from . import attrs +from . import orth +from .syntax.ner import BiluoPushDown +from .syntax.arc_eager import ArcEager + +from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD + + +class Language(object): + @staticmethod + def lower(string): + return string.lower() + + @staticmethod + def norm(string): + return string + + @staticmethod + def shape(string): + return orth.word_shape(string) + + @staticmethod + def prefix(string): + return string[0] + + @staticmethod + def suffix(string): + return string[-3:] + + @staticmethod + def prob(string): + return -30 + + @staticmethod + def cluster(string): + return 0 + + @staticmethod + def is_alpha(string): + return orth.is_alpha(string) + + @staticmethod + def is_ascii(string): + return orth.is_ascii(string) + + @staticmethod + def is_digit(string): + return string.isdigit() + + @staticmethod + def is_lower(string): + return orth.is_lower(string) + + @staticmethod + def is_punct(string): + return orth.is_punct(string) + + @staticmethod + def is_space(string): + return string.isspace() + + @staticmethod + def is_title(string): + return orth.is_title(string) + + 
@staticmethod + def is_upper(string): + return orth.is_upper(string) + + @staticmethod + def like_url(string): + return orth.like_url(string) + + @staticmethod + def like_number(string): + return orth.like_number(string) + + @staticmethod + def like_email(string): + return orth.like_email(string) + + @classmethod + def default_lex_attrs(cls, data_dir=None): + return { + attrs.LOWER: cls.lower, + attrs.NORM: cls.norm, + attrs.SHAPE: cls.shape, + attrs.PREFIX: cls.prefix, + attrs.SUFFIX: cls.suffix, + attrs.CLUSTER: cls.cluster, + attrs.PROB: lambda string: -10.0, + + attrs.IS_ALPHA: cls.is_alpha, + attrs.IS_ASCII: cls.is_ascii, + attrs.IS_DIGIT: cls.is_digit, + attrs.IS_LOWER: cls.is_lower, + attrs.IS_PUNCT: cls.is_punct, + attrs.IS_SPACE: cls.is_space, + attrs.IS_TITLE: cls.is_title, + attrs.IS_UPPER: cls.is_upper, + attrs.LIKE_URL: cls.like_url, + attrs.LIKE_NUM: cls.like_number, + attrs.LIKE_EMAIL: cls.like_email, + attrs.IS_STOP: lambda string: False, + attrs.IS_OOV: lambda string: True + } + + @classmethod + def default_dep_labels(cls): + return {0: {'ROOT': True}} + + @classmethod + def default_ner_labels(cls): + return {0: {'PER': True, 'LOC': True, 'ORG': True, 'MISC': True}} + + @classmethod + def default_data_dir(cls): + return path.join(path.dirname(__file__), 'data') + + @classmethod + def default_vectors(cls, data_dir): + return None + + @classmethod + def default_vocab(cls, data_dir=None, get_lex_attr=None, vectors=None): + if data_dir is None: + data_dir = cls.default_data_dir() + if vectors is None: + vectors = cls.default_vectors(data_dir) + if get_lex_attr is None: + get_lex_attr = cls.default_lex_attrs(data_dir) + return Vocab.from_dir( + path.join(data_dir, 'vocab'), + get_lex_attr=get_lex_attr, + vectors=vectors) + + @classmethod + def default_tokenizer(cls, vocab, data_dir): + if path.exists(data_dir): + return Tokenizer.from_dir(vocab, data_dir) + else: + return Tokenizer(vocab, {}, None, None, None) + + @classmethod + def default_tagger(cls, 
vocab, data_dir): + if path.exists(data_dir): + return Tagger.from_dir(data_dir, vocab) + else: + return None + + @classmethod + def default_parser(cls, vocab, data_dir): + if path.exists(data_dir): + return Parser.from_dir(data_dir, vocab.strings, ArcEager) + else: + return None + + @classmethod + def default_entity(cls, vocab, data_dir): + if path.exists(data_dir): + return Parser.from_dir(data_dir, vocab.strings, BiluoPushDown) + else: + return None + + @classmethod + def default_matcher(cls, vocab, data_dir): + if path.exists(data_dir): + return Matcher.from_dir(data_dir, vocab) + else: + return None + + def __init__(self, data_dir=None, vocab=None, tokenizer=None, tagger=None, + parser=None, entity=None, matcher=None, serializer=None): + if data_dir is None: + data_dir = self.default_data_dir() + if vocab is None: + vocab = self.default_vocab(data_dir) + if tokenizer is None: + tokenizer = self.default_tokenizer(vocab, data_dir=path.join(data_dir, 'tokenizer')) + if tagger is None: + tagger = self.default_tagger(vocab, data_dir=path.join(data_dir, 'pos')) + if entity is None: + entity = self.default_entity(vocab, data_dir=path.join(data_dir, 'ner')) + if parser is None: + parser = self.default_parser(vocab, data_dir=path.join(data_dir, 'deps')) + if matcher is None: + matcher = self.default_matcher(vocab, data_dir=data_dir) + self.vocab = vocab + self.tokenizer = tokenizer + self.tagger = tagger + self.parser = parser + self.entity = entity + self.matcher = matcher + + def __call__(self, text, tag=True, parse=True, entity=True, merge_mwes=False): + """Apply the pipeline to some text. The text can span multiple sentences, + and can contain arbtrary whitespace. Alignment into the original string + is preserved. + + Args: + text (unicode): The text to be processed. + + Returns: + tokens (spacy.tokens.Doc): + + >>> from spacy.en import English + >>> nlp = English() + >>> tokens = nlp('An example sentence. 
Another example sentence.') + >>> tokens[0].orth_, tokens[0].head.tag_ + ('An', 'NN') + """ + tokens = self.tokenizer(text) + if self.tagger and tag: + self.tagger(tokens) + if self.matcher and entity: + self.matcher(tokens) + if self.parser and parse: + self.parser(tokens) + if self.entity and entity: + self.entity(tokens) + return tokens + + def end_training(self, data_dir=None): + if data_dir is None: + data_dir = self.data_dir + self.parser.model.end_training(path.join(data_dir, 'deps', 'model')) + self.entity.model.end_training(path.join(data_dir, 'ner', 'model')) + self.tagger.model.end_training(path.join(data_dir, 'pos', 'model')) + self.vocab.strings.dump(path.join(data_dir, 'vocab', 'strings.txt')) + + with open(path.join(data_dir, 'vocab', 'serializer.json'), 'w') as file_: + file_.write( + json.dumps([ + (TAG, list(self.tagger.freqs[TAG].items())), + (DEP, list(self.parser.moves.freqs[DEP].items())), + (ENT_IOB, list(self.entity.moves.freqs[ENT_IOB].items())), + (ENT_TYPE, list(self.entity.moves.freqs[ENT_TYPE].items())), + (HEAD, list(self.parser.moves.freqs[HEAD].items()))])) diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py new file mode 100644 index 000000000..5e08e80a4 --- /dev/null +++ b/spacy/lemmatizer.py @@ -0,0 +1,86 @@ +from __future__ import unicode_literals +from os import path +import codecs + +try: + import ujson as json +except ImportError: + import json + +from .parts_of_speech import NOUN, VERB, ADJ + + +class Lemmatizer(object): + @classmethod + def from_dir(cls, data_dir): + index = {} + exc = {} + for pos in ['adj', 'adv', 'noun', 'verb']: + index[pos] = read_index(path.join(data_dir, 'index.%s' % pos)) + exc[pos] = read_exc(path.join(data_dir, '%s.exc' % pos)) + rules = json.load(open(path.join(data_dir, 'lemma_rules.json'))) + return cls(index, exc, rules) + + def __init__(self, index, exceptions, rules): + self.index = index + self.exc = exceptions + self.rules = rules + + def __call__(self, string, pos): + if pos == NOUN: + 
pos = 'noun' + elif pos == VERB: + pos = 'verb' + elif pos == ADJ: + pos = 'adj' + else: + return string + lemmas = lemmatize(string, self.index[pos], self.exc[pos], self.rules.get(pos, [])) + return min(lemmas) + + def noun(self, string): + return self(string, 'noun') + + def verb(self, string): + return self(string, 'verb') + + def adj(self, string): + return self(string, 'adj') + + +def lemmatize(string, index, exceptions, rules): + string = string.lower() + forms = [] + if string in index: + forms.append(string) + forms.extend(exceptions.get(string, [])) + for old, new in rules: + if string.endswith(old): + form = string[:len(string) - len(old)] + new + if form in index: + forms.append(form) + if not forms: + forms.append(string) + return set(forms) + + +def read_index(loc): + index = set() + for line in codecs.open(loc, 'r', 'utf8'): + if line.startswith(' '): + continue + pieces = line.split() + word = pieces[0] + if word.count('_') == 0: + index.add(word) + return index + + +def read_exc(loc): + exceptions = {} + for line in codecs.open(loc, 'r', 'utf8'): + if line.startswith(' '): + continue + pieces = line.split() + exceptions[pieces[0]] = tuple(pieces[1:]) + return exceptions diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd index e0c99b3e6..63280155c 100644 --- a/spacy/lexeme.pxd +++ b/spacy/lexeme.pxd @@ -4,101 +4,80 @@ from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTE from .structs cimport LexemeC from .strings cimport StringStore +from .vocab cimport Vocab from numpy cimport ndarray - cdef LexemeC EMPTY_LEXEME - -cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore strings, - const float* empty_vec) except -1 - cdef class Lexeme: - cdef readonly ndarray repvec - - cdef readonly flags_t flags - cdef readonly attr_t id - cdef readonly attr_t length - + cdef LexemeC* c + cdef readonly Vocab vocab cdef readonly attr_t orth - cdef readonly attr_t lower - cdef readonly attr_t norm - cdef readonly attr_t shape - 
cdef readonly attr_t prefix - cdef readonly attr_t suffix - cdef readonly unicode orth_ - cdef readonly unicode lower_ - cdef readonly unicode norm_ - cdef readonly unicode shape_ - cdef readonly unicode prefix_ - cdef readonly unicode suffix_ - - cdef readonly attr_t cluster - cdef readonly float prob - cdef readonly float sentiment - cdef readonly float l2_norm - - # Workaround for an apparent bug in the way the decorator is handled --- - # TODO: post bug report / patch to Cython. @staticmethod - cdef inline Lexeme from_ptr(const LexemeC* ptr, StringStore strings, int repvec_length): - cdef Lexeme py = Lexeme.__new__(Lexeme, repvec_length) - for i in range(repvec_length): - py.repvec[i] = ptr.repvec[i] - py.l2_norm = ptr.l2_norm - py.flags = ptr.flags - py.id = ptr.id - py.length = ptr.length - - py.orth = ptr.orth - py.lower = ptr.lower - py.norm = ptr.norm - py.shape = ptr.shape - py.prefix = ptr.prefix - py.suffix = ptr.suffix - - py.orth_ = strings[ptr.orth] - py.lower_ = strings[ptr.lower] - py.norm_ = strings[ptr.norm] - py.shape_ = strings[ptr.shape] - py.prefix_ = strings[ptr.prefix] - py.suffix_ = strings[ptr.suffix] - - py.cluster = ptr.cluster - py.prob = ptr.prob - py.sentiment = ptr.sentiment - return py - - cpdef bint check_flag(self, attr_id_t flag_id) except -1 + cdef inline Lexeme from_ptr(LexemeC* lex, Vocab vocab, int vector_length): + cdef Lexeme self = Lexeme.__new__(Lexeme, vocab, lex.orth) + self.c = lex + self.vocab = vocab + self.orth = lex.orth + @staticmethod + cdef inline void set_struct_attr(LexemeC* lex, attr_id_t name, attr_t value) nogil: + if name < (sizeof(flags_t) * 8): + Lexeme.set_flag(lex, name, value) + elif name == ID: + lex.id = value + elif name == LOWER: + lex.lower = value + elif name == NORM: + lex.norm = value + elif name == SHAPE: + lex.shape = value + elif name == PREFIX: + lex.prefix = value + elif name == SUFFIX: + lex.suffix = value + elif name == CLUSTER: + lex.cluster = value -cdef inline bint check_flag(const 
LexemeC* lexeme, attr_id_t flag_id) nogil: - return lexeme.flags & (1 << flag_id) + @staticmethod + cdef inline attr_t get_struct_attr(const LexemeC* lex, attr_id_t feat_name) nogil: + if feat_name < (sizeof(flags_t) * 8): + if Lexeme.check_flag(lex, feat_name): + return 1 + else: + return 0 + elif feat_name == ID: + return lex.id + elif feat_name == ORTH: + return lex.orth + elif feat_name == LOWER: + return lex.lower + elif feat_name == NORM: + return lex.norm + elif feat_name == SHAPE: + return lex.shape + elif feat_name == PREFIX: + return lex.prefix + elif feat_name == SUFFIX: + return lex.suffix + elif feat_name == LENGTH: + return lex.length + elif feat_name == CLUSTER: + return lex.cluster + else: + return 0 + + @staticmethod + cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil: + return lexeme.flags & (1 << flag_id) - -cdef inline attr_t get_attr(const LexemeC* lex, attr_id_t feat_name) nogil: - if feat_name < (sizeof(flags_t) * 8): - return check_flag(lex, feat_name) - elif feat_name == ID: - return lex.id - elif feat_name == ORTH: - return lex.orth - elif feat_name == LOWER: - return lex.norm - elif feat_name == NORM: - return lex.norm - elif feat_name == SHAPE: - return lex.shape - elif feat_name == PREFIX: - return lex.prefix - elif feat_name == SUFFIX: - return lex.suffix - elif feat_name == LENGTH: - return lex.length - elif feat_name == CLUSTER: - return lex.cluster - else: - return 0 + @staticmethod + cdef inline bint set_flag(LexemeC* lex, attr_id_t flag_id, int value) nogil: + cdef flags_t one = 1 + if value: + lex.flags |= one << flag_id + else: + lex.flags &= ~(one << flag_id) diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 07f151114..8ec238e32 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -17,70 +17,120 @@ from .attrs cimport IS_OOV memset(&EMPTY_LEXEME, 0, sizeof(LexemeC)) -cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore string_store, - const float* empty_vec) except -1: - 
lex.length = props['length'] - lex.orth = string_store[props['orth']] - lex.lower = string_store[props['lower']] - lex.norm = string_store[props['norm']] - lex.shape = string_store[props['shape']] - lex.prefix = string_store[props['prefix']] - lex.suffix = string_store[props['suffix']] - - lex.cluster = props['cluster'] - lex.prob = props['prob'] - lex.sentiment = props['sentiment'] - - lex.flags = props['flags'] - lex.repvec = empty_vec - - cdef class Lexeme: """An entry in the vocabulary. A Lexeme has no string context --- it's a word-type, as opposed to a word token. It therefore has no part-of-speech tag, dependency parse, or lemma (lemmatization depends on the part-of-speech tag). """ - def __cinit__(self, int vec_size): - self.repvec = numpy.ndarray(shape=(vec_size,), dtype=numpy.float32) + def __init__(self, Vocab vocab, int orth): + self.vocab = vocab + self.orth = orth + self.c = vocab.get_by_orth(vocab.mem, orth) + assert self.c.orth == orth - @property - def has_repvec(self): - return self.l2_norm != 0 + def py_set_flag(self, attr_id_t flag_id): + Lexeme.set_flag(self.c, flag_id, True) + + def py_check_flag(self, attr_id_t flag_id): + return True if Lexeme.check_flag(self.c, flag_id) else False - cpdef bint check_flag(self, attr_id_t flag_id) except -1: - cdef flags_t one = 1 - return self.flags & (one << flag_id) + property orth_: + def __get__(self): + return self.vocab.strings[self.c.orth] + + property lower: + def __get__(self): return self.c.lower + def __set__(self, int x): self.c.lower = x + + property norm: + def __get__(self): return self.c.norm + def __set__(self, int x): self.c.norm = x + + property shape: + def __get__(self): return self.c.shape + def __set__(self, int x): self.c.shape = x + + property prefix: + def __get__(self): return self.c.prefix + def __set__(self, int x): self.c.prefix = x + + property suffix: + def __get__(self): return self.c.suffix + def __set__(self, int x): self.c.suffix = x + + property cluster: + def 
__get__(self): return self.c.suffix + def __set__(self, int x): self.c.suffix = x + + property prob: + def __get__(self): return self.c.suffix + def __set__(self, int x): self.c.suffix = x + + property lower_: + def __get__(self): return self.vocab.strings[self.c.lower] + def __set__(self, unicode x): self.c.lower = self.vocab.strings[x] + + property norm_: + def __get__(self): return self.c.norm + def __set__(self, unicode x): self.c.norm = self.vocab.strings[x] + + property shape_: + def __get__(self): return self.vocab.strings[self.c.shape] + def __set__(self, unicode x): self.c.shape = self.vocab.strings[x] + + property prefix_: + def __get__(self): return self.c.prefix + def __set__(self, unicode x): self.c.prefix = self.vocab.strings[x] + + property suffix_: + def __get__(self): return self.c.suffix + def __set__(self, unicode x): self.c.suffix = self.vocab.strings[x] + + property flags: + def __get__(self): return self.c.flags + def __set__(self, flags_t x): self.c.flags = x property is_oov: - def __get__(self): return self.check_flag(IS_OOV) + def __get__(self): return Lexeme.check_flag(self.c, IS_OOV) + def __set__(self, bint x): Lexeme.set_flag(self.c, IS_OOV, x) property is_alpha: - def __get__(self): return self.check_flag(IS_ALPHA) + def __get__(self): return Lexeme.check_flag(self.c, IS_ALPHA) + def __set__(self, bint x): Lexeme.set_flag(self.c, IS_ALPHA, x) property is_ascii: - def __get__(self): return self.check_flag(IS_ASCII) + def __get__(self): return Lexeme.check_flag(self.c, IS_ASCII) + def __set__(self, bint x): Lexeme.set_flag(self.c, IS_ASCII, x) property is_digit: - def __get__(self): return self.check_flag(IS_DIGIT) + def __get__(self): return Lexeme.check_flag(self.c, IS_DIGIT) + def __set__(self, bint x): Lexeme.set_flag(self.c, IS_DIGIT, x) property is_lower: - def __get__(self): return self.check_flag(IS_LOWER) + def __get__(self): return Lexeme.check_flag(self.c, IS_LOWER) + def __set__(self, bint x): Lexeme.set_flag(self.c, 
IS_LOWER, x) property is_title: - def __get__(self): return self.check_flag(IS_TITLE) + def __get__(self): return Lexeme.check_flag(self.c, IS_TITLE) + def __set__(self, bint x): Lexeme.set_flag(self.c, IS_TITLE, x) property is_punct: - def __get__(self): return self.check_flag(IS_PUNCT) + def __get__(self): return Lexeme.check_flag(self.c, IS_PUNCT) + def __set__(self, bint x): Lexeme.set_flag(self.c, IS_PUNCT, x) property is_space: - def __get__(self): return self.check_flag(IS_SPACE) + def __get__(self): return Lexeme.check_flag(self.c, IS_SPACE) + def __set__(self, bint x): Lexeme.set_flag(self.c, IS_SPACE, x) property like_url: - def __get__(self): return self.check_flag(LIKE_URL) + def __get__(self): return Lexeme.check_flag(self.c, LIKE_URL) + def __set__(self, bint x): Lexeme.set_flag(self.c, LIKE_URL, x) property like_num: - def __get__(self): return self.check_flag(LIKE_NUM) + def __get__(self): return Lexeme.check_flag(self.c, LIKE_NUM) + def __set__(self, bint x): Lexeme.set_flag(self.c, LIKE_NUM, x) property like_email: - def __get__(self): return self.check_flag(LIKE_EMAIL) + def __get__(self): return Lexeme.check_flag(self.c, LIKE_EMAIL) + def __set__(self, bint x): Lexeme.set_flag(self.c, LIKE_EMAIL, x) diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx new file mode 100644 index 000000000..b8a45d469 --- /dev/null +++ b/spacy/matcher.pyx @@ -0,0 +1,200 @@ +from os import path + +from .typedefs cimport attr_t +from .attrs cimport attr_id_t +from .structs cimport TokenC + +from cymem.cymem cimport Pool +from libcpp.vector cimport vector + +from .attrs cimport LENGTH, ENT_TYPE, ORTH, NORM, LEMMA, LOWER, SHAPE +from .attrs cimport FLAG13, FLAG14, FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21, FLAG22, FLAG23, FLAG24, FLAG25 +from .tokens.doc cimport get_token_attr +from .tokens.doc cimport Doc +from .vocab cimport Vocab + +from libcpp.vector cimport vector + +try: + import ujson as json +except ImportError: + import json + + +cdef struct 
AttrValue: + attr_id_t attr + attr_t value + + +cdef struct Pattern: + AttrValue* spec + int length + + +cdef Pattern* init_pattern(Pool mem, object token_specs, attr_t entity_type) except NULL: + pattern = mem.alloc(len(token_specs) + 1, sizeof(Pattern)) + cdef int i + for i, spec in enumerate(token_specs): + pattern[i].spec = mem.alloc(len(spec), sizeof(AttrValue)) + pattern[i].length = len(spec) + for j, (attr, value) in enumerate(spec): + pattern[i].spec[j].attr = attr + pattern[i].spec[j].value = value + i = len(token_specs) + pattern[i].spec = mem.alloc(1, sizeof(AttrValue)) + pattern[i].spec[0].attr = ENT_TYPE + pattern[i].spec[0].value = entity_type + pattern[i].spec[1].attr = LENGTH + pattern[i].spec[1].value = len(token_specs) + pattern[i].length = 0 + return pattern + + +cdef int match(const Pattern* pattern, const TokenC* token) except -1: + cdef int i + for i in range(pattern.length): + if get_token_attr(token, pattern.spec[i].attr) != pattern.spec[i].value: + print "Pattern fail", pattern.spec[i].attr, pattern.spec[i].value + print get_token_attr(token, pattern.spec[i].attr) + return False + return True + + +cdef int is_final(const Pattern* pattern) except -1: + return (pattern + 1).length == 0 + + +cdef object get_entity(const Pattern* pattern, const TokenC* tokens, int i): + pattern += 1 + i += 1 + return (pattern.spec[0].value, i - pattern.spec[1].value, i) + + +def _convert_strings(token_specs, string_store): + converted = [] + for spec in token_specs: + converted.append([]) + for attr, value in spec.items(): + if isinstance(attr, basestring): + attr = map_attr_name(attr) + if isinstance(value, basestring): + value = string_store[value] + if isinstance(value, bool): + value = int(value) + converted[-1].append((attr, value)) + print "Converted", converted[-1] + return converted + + +def map_attr_name(attr): + attr = attr.upper() + if attr == 'ORTH': + return ORTH + elif attr == 'LEMMA': + return LEMMA + elif attr == 'LOWER': + return LOWER + elif 
attr == 'SHAPE': + return SHAPE + elif attr == 'NORM': + return NORM + elif attr == 'FLAG13': + return FLAG13 + elif attr == 'FLAG14': + return FLAG14 + elif attr == 'FLAG15': + return FLAG15 + elif attr == 'FLAG16': + return FLAG16 + elif attr == 'FLAG17': + return FLAG17 + elif attr == 'FLAG18': + return FLAG18 + elif attr == 'FLAG19': + return FLAG19 + elif attr == 'FLAG20': + return FLAG20 + elif attr == 'FLAG21': + return FLAG21 + elif attr == 'FLAG22': + return FLAG22 + elif attr == 'FLAG23': + return FLAG23 + elif attr == 'FLAG24': + return FLAG24 + elif attr == 'FLAG25': + return FLAG25 + else: + raise Exception("TODO: Finish supporting attr mapping %s" % attr) + + +cdef class Matcher: + cdef Pool mem + cdef vector[Pattern*] patterns + cdef readonly Vocab vocab + + def __init__(self, vocab, patterns): + self.vocab = vocab + self.mem = Pool() + self.vocab = vocab + for entity_key, (etype, attrs, specs) in sorted(patterns.items()): + self.add(entity_key, etype, attrs, specs) + + @classmethod + def from_dir(cls, data_dir, Vocab vocab): + patterns_loc = path.join(data_dir, 'vocab', 'gazetteer.json') + if path.exists(patterns_loc): + patterns_data = open(patterns_loc).read() + patterns = json.loads(patterns_data) + return cls(vocab, patterns) + else: + return cls(vocab, {}) + + property n_patterns: + def __get__(self): return self.patterns.size() + + def add(self, entity_key, etype, attrs, specs): + if isinstance(entity_key, basestring): + entity_key = self.vocab.strings[entity_key] + if isinstance(etype, basestring): + etype = self.vocab.strings[etype] + elif etype is None: + etype = -1 + # TODO: Do something more clever about multiple patterns for single + # entity + for spec in specs: + spec = _convert_strings(spec, self.vocab.strings) + self.patterns.push_back(init_pattern(self.mem, spec, etype)) + + def __call__(self, Doc doc): + cdef vector[Pattern*] partials + cdef int n_partials = 0 + cdef int q = 0 + cdef int i, token_i + cdef const TokenC* token + cdef 
Pattern* state + matches = [] + for token_i in range(doc.length): + print 'check', doc[token_i].orth_ + token = &doc.data[token_i] + q = 0 + for i in range(partials.size()): + state = partials.at(i) + if match(state, token): + print 'match!' + if is_final(state): + matches.append(get_entity(state, token, token_i)) + else: + partials[q] = state + 1 + q += 1 + partials.resize(q) + for i in range(self.n_patterns): + state = self.patterns[i] + if match(state, token): + print 'match!' + if is_final(state): + matches.append(get_entity(state, token, token_i)) + else: + partials.push_back(state + 1) + doc.ents = [(e.label, e.start, e.end) for e in doc.ents] + matches + return matches diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd index 5dfee4250..2229da0ad 100644 --- a/spacy/morphology.pxd +++ b/spacy/morphology.pxd @@ -1,4 +1,755 @@ -from .structs cimport TokenC, Morphology, PosTag +from cymem.cymem cimport Pool +from preshed.maps cimport PreshMapArray +from libc.stdint cimport uint64_t + +from .structs cimport TokenC +from .strings cimport StringStore +from .typedefs cimport attr_t +from .parts_of_speech cimport univ_pos_t -cdef int set_morph_from_dict(Morphology* morph, dict props) except -1 +cdef struct RichTagC: + uint64_t morph + int id + univ_pos_t pos + attr_t name + + +cdef struct MorphAnalysisC: + RichTagC tag + attr_t lemma + + +cdef class Morphology: + cdef readonly Pool mem + cdef readonly StringStore strings + cdef public object lemmatizer + cdef public object n_tags + cdef public object reverse_index + cdef public object tag_names + + cdef RichTagC* rich_tags + cdef PreshMapArray _cache + + cdef int assign_tag(self, TokenC* token, tag) except -1 + + cdef int assign_feature(self, uint64_t* morph, feature, value) except -1 + + + +# +#cpdef enum Feature_t: +# Abbr +# AdpType +# AdvType +# ConjType +# Connegative +# Derivation +# Echo +# Foreign +# Gender_dat +# Gender_erg +# Gender_psor +# Hyph +# InfForm +# NameType +# NounType +# NumberAbs +# 
NumberDat +# NumberErg +# NumberPsee +# NumberPsor +# NumForm +# NumValue +# PartForm +# PartType +# Person_abs +# Person_dat +# Person_psor +# Polite +# Polite_abs +# Polite_dat +# Prefix +# PrepCase +# PunctSide +# PunctType +# Style +# Typo +# Variant +# VerbType +# +# +#cpdef enum Animacy: +# Anim +# Inam +# +# +#cpdef enum Aspect: +# Freq +# Imp +# Mod +# None_ +# Perf +# +# +#cpdef enum Case1: +# Nom +# Gen +# Acc +# Dat +# Voc +# Abl +# +#cdef enum Case2: +# Abe +# Abs +# Ade +# All +# Cau +# Com +# Del +# Dis +# +#cdef enum Case3: +# Ela +# Ess +# Ill +# Ine +# Ins +# Loc +# Lat +# Par +# +#cdef enum Case4: +# Sub +# Sup +# Tem +# Ter +# Tra +# +# +#cpdef enum Definite: +# Two +# Def +# Red +# Ind +# +# +#cpdef enum Degree: +# Cmp +# Comp +# None_ +# Pos +# Sup +# Abs +# Com +# Degree # du +# +# +#cpdef enum Gender: +# Com +# Fem +# Masc +# Neut +# +# +#cpdef enum Mood: +# Cnd +# Imp +# Ind +# N +# Pot +# Sub +# Opt +# +# +#cpdef enum Negative: +# Neg +# Pos +# Yes +# +# +#cpdef enum Number: +# Com +# Dual +# None_ +# Plur +# Sing +# Ptan # bg +# Count # bg +# +# +#cpdef enum NumType: +# Card +# Dist +# Frac +# Gen +# Mult +# None_ +# Ord +# Sets +# +# +#cpdef enum Person: +# One +# Two +# Three +# None_ +# +# +#cpdef enum Poss: +# Yes +# +# +#cpdef enum PronType1: +# AdvPart +# Art +# Default +# Dem +# Ind +# Int +# Neg +# +#cpdef enum PronType2: +# Prs +# Rcp +# Rel +# Tot +# Clit +# Exc # es, ca, it, fa +# Clit # it +# +# +#cpdef enum Reflex: +# Yes +# +# +#cpdef enum Tense: +# Fut +# Imp +# Past +# Pres +# +#cpdef enum VerbForm1: +# Fin +# Ger +# Inf +# None_ +# Part +# PartFut +# PartPast +# +#cpdef enum VerbForm2: +# PartPres +# Sup +# Trans +# Gdv # la +# +# +#cpdef enum Voice: +# Act +# Cau +# Pass +# Mid # gkc +# Int # hb +# +# +#cpdef enum Abbr: +# Yes # cz, fi, sl, U +# +#cpdef enum AdpType: +# Prep # cz, U +# Post # U +# Voc # cz +# Comprep # cz +# Circ # U +# Voc # U +# +# +#cpdef enum AdvType1: +# # U +# Man +# Loc +# Tim +# Deg +# Cau +# Mod 
+# Sta +# Ex +# +#cpdef enum AdvType2: +# Adadj +# +#cpdef enum ConjType: +# Oper # cz, U +# Comp # cz, U +# +#cpdef enum Connegative: +# Yes # fi +# +# +#cpdef enum Derivation1: +# Minen # fi +# Sti # fi +# Inen # fi +# Lainen # fi +# Ja # fi +# Ton # fi +# Vs # fi +# Ttain # fi +# +#cpdef enum Derivation2: +# Ttaa +# +# +#cpdef enum Echo: +# Rdp # U +# Ech # U +# +# +#cpdef enum Foreign: +# Foreign # cz, fi, U +# Fscript # cz, fi, U +# Tscript # cz, U +# Yes # sl +# +# +#cpdef enum Gender_dat: +# Masc # bq, U +# Fem # bq, U +# +# +#cpdef enum Gender_erg: +# Masc # bq +# Fem # bq +# +# +#cpdef enum Gender_psor: +# Masc # cz, sl, U +# Fem # cz, sl, U +# Neut # sl +# +# +#cpdef enum Hyph: +# Yes # cz, U +# +# +#cpdef enum InfForm: +# One # fi +# Two # fi +# Three # fi +# +# +#cpdef enum NameType: +# Geo # U, cz +# Prs # U, cz +# Giv # U, cz +# Sur # U, cz +# Nat # U, cz +# Com # U, cz +# Pro # U, cz +# Oth # U, cz +# +# +#cpdef enum NounType: +# Com # U +# Prop # U +# Class # U +# +#cpdef enum Number_abs: +# Sing # bq, U +# Plur # bq, U +# +#cpdef enum Number_dat: +# Sing # bq, U +# Plur # bq, U +# +#cpdef enum Number_erg: +# Sing # bq, U +# Plur # bq, U +# +#cpdef enum Number_psee: +# Sing # U +# Plur # U +# +# +#cpdef enum Number_psor: +# Sing # cz, fi, sl, U +# Plur # cz, fi, sl, U +# +# +#cpdef enum NumForm: +# Digit # cz, sl, U +# Roman # cz, sl, U +# Word # cz, sl, U +# +# +#cpdef enum NumValue: +# One # cz, U +# Two # cz, U +# Three # cz, U +# +# +#cpdef enum PartForm: +# Pres # fi +# Past # fi +# Agt # fi +# Neg # fi +# +# +#cpdef enum PartType: +# Mod # U +# Emp # U +# Res # U +# Inf # U +# Vbp # U +# +#cpdef enum Person_abs: +# One # bq, U +# Two # bq, U +# Three # bq, U +# +# +#cpdef enum Person_dat: +# One # bq, U +# Two # bq, U +# Three # bq, U +# +# +#cpdef enum Person_erg: +# One # bq, U +# Two # bq, U +# Three # bq, U +# +# +#cpdef enum Person_psor: +# One # fi, U +# Two # fi, U +# Three # fi, U +# +# +#cpdef enum Polite: +# Inf # bq, U +# Pol # bq, 
U +# +# +#cpdef enum Polite_abs: +# Inf # bq, U +# Pol # bq, U +# +# +#cpdef enum Polite_erg: +# Inf # bq, U +# Pol # bq, U +# +# +#cpdef enum Polite_dat: +# Inf # bq, U +# Pol # bq, U +# +# +#cpdef enum Prefix: +# Yes # U +# +# +#cpdef enum PrepCase: +# Npr # cz +# Pre # U +# +# +#cpdef enum PunctSide: +# Ini # U +# Fin # U +# +#cpdef enum PunctType1: +# Peri # U +# Qest # U +# Excl # U +# Quot # U +# Brck # U +# Comm # U +# Colo # U +# Semi # U +# +#cpdef enum PunctType2: +# Dash # U +# +# +#cpdef enum Style1: +# Arch # cz, fi, U +# Rare # cz, fi, U +# Poet # cz, U +# Norm # cz, U +# Coll # cz, U +# Vrnc # cz, U +# Sing # cz, U +# Expr # cz, U +# +# +#cpdef enum Style2: +# Derg # cz, U +# Vulg # cz, U +# +# +#cpdef enum Typo: +# Yes # fi, U +# +# +#cpdef enum Variant: +# Short # cz +# Bound # cz, sl +# +# +#cpdef enum VerbType: +# Aux # U +# Cop # U +# Mod # U +# Light # U +# + +cpdef enum Value_t: + Animacy_Anim + Animacy_Inam + Aspect_Freq + Aspect_Imp + Aspect_Mod + Aspect_None_ + Aspect_Perf + Case_Abe + Case_Abl + Case_Abs + Case_Acc + Case_Ade + Case_All + Case_Cau + Case_Com + Case_Dat + Case_Del + Case_Dis + Case_Ela + Case_Ess + Case_Gen + Case_Ill + Case_Ine + Case_Ins + Case_Loc + Case_Lat + Case_Nom + Case_Par + Case_Sub + Case_Sup + Case_Tem + Case_Ter + Case_Tra + Case_Voc + Definite_Two + Definite_Def + Definite_Red + Definite_Ind + Degree_Cmp + Degree_Comp + Degree_None + Degree_Pos + Degree_Sup + Degree_Abs + Degree_Com + Degree_Dim # du + Gender_Com + Gender_Fem + Gender_Masc + Gender_Neut + Mood_Cnd + Mood_Imp + Mood_Ind + Mood_N + Mood_Pot + Mood_Sub + Mood_Opt + Negative_Neg + Negative_Pos + Negative_Yes + Number_Com + Number_Dual + Number_None + Number_Plur + Number_Sing + Number_Ptan # bg + Number_Count # bg + NumType_Card + NumType_Dist + NumType_Frac + NumType_Gen + NumType_Mult + NumType_None + NumType_Ord + NumType_Sets + Person_One + Person_Two + Person_Three + Person_None + Poss_Yes + PronType_AdvPart + PronType_Art + PronType_Default 
+ PronType_Dem + PronType_Ind + PronType_Int + PronType_Neg + PronType_Prs + PronType_Rcp + PronType_Rel + PronType_Tot + PronType_Clit + PronType_Exc # es, ca, it, fa + Reflex_Yes + Tense_Fut + Tense_Imp + Tense_Past + Tense_Pres + VerbForm_Fin + VerbForm_Ger + VerbForm_Inf + VerbForm_None + VerbForm_Part + VerbForm_PartFut + VerbForm_PartPast + VerbForm_PartPres + VerbForm_Sup + VerbForm_Trans + VerbForm_Gdv # la + Voice_Act + Voice_Cau + Voice_Pass + Voice_Mid # gkc + Voice_Int # hb + Abbr_Yes # cz, fi, sl, U + AdpType_Prep # cz, U + AdpType_Post # U + AdpType_Voc # cz + AdpType_Comprep # cz + AdpType_Circ # U + AdvType_Man + AdvType_Loc + AdvType_Tim + AdvType_Deg + AdvType_Cau + AdvType_Mod + AdvType_Sta + AdvType_Ex + AdvType_Adadj + ConjType_Oper # cz, U + ConjType_Comp # cz, U + Connegative_Yes # fi + Derivation_Minen # fi + Derivation_Sti # fi + Derivation_Inen # fi + Derivation_Lainen # fi + Derivation_Ja # fi + Derivation_Ton # fi + Derivation_Vs # fi + Derivation_Ttain # fi + Derivation_Ttaa # fi + Echo_Rdp # U + Echo_Ech # U + Foreign_Foreign # cz, fi, U + Foreign_Fscript # cz, fi, U + Foreign_Tscript # cz, U + Foreign_Yes # sl + Gender_dat_Masc # bq, U + Gender_dat_Fem # bq, U + Gender_erg_Masc # bq + Gender_erg_Fem # bq + Gender_psor_Masc # cz, sl, U + Gender_psor_Fem # cz, sl, U + Gender_psor_Neut # sl + Hyph_Yes # cz, U + InfForm_One # fi + InfForm_Two # fi + InfForm_Three # fi + NameType_Geo # U, cz + NameType_Prs # U, cz + NameType_Giv # U, cz + NameType_Sur # U, cz + NameType_Nat # U, cz + NameType_Com # U, cz + NameType_Pro # U, cz + NameType_Oth # U, cz + NounType_Com # U + NounType_Prop # U + NounType_Class # U + Number_abs_Sing # bq, U + Number_abs_Plur # bq, U + Number_dat_Sing # bq, U + Number_dat_Plur # bq, U + Number_erg_Sing # bq, U + Number_erg_Plur # bq, U + Number_psee_Sing # U + Number_psee_Plur # U + Number_psor_Sing # cz, fi, sl, U + Number_psor_Plur # cz, fi, sl, U + NumForm_Digit # cz, sl, U + NumForm_Roman # cz, sl, U + 
NumForm_Word # cz, sl, U + NumValue_One # cz, U + NumValue_Two # cz, U + NumValue_Three # cz, U + PartForm_Pres # fi + PartForm_Past # fi + PartForm_Agt # fi + PartForm_Neg # fi + PartType_Mod # U + PartType_Emp # U + PartType_Res # U + PartType_Inf # U + PartType_Vbp # U + Person_abs_One # bq, U + Person_abs_Two # bq, U + Person_abs_Three # bq, U + Person_dat_One # bq, U + Person_dat_Two # bq, U + Person_dat_Three # bq, U + Person_erg_One # bq, U + Person_erg_Two # bq, U + Person_erg_Three # bq, U + Person_psor_One # fi, U + Person_psor_Two # fi, U + Person_psor_Three # fi, U + Polite_Inf # bq, U + Polite_Pol # bq, U + Polite_abs_Inf # bq, U + Polite_abs_Pol # bq, U + Polite_erg_Inf # bq, U + Polite_erg_Pol # bq, U + Polite_dat_Inf # bq, U + Polite_dat_Pol # bq, U + Prefix_Yes # U + PrepCase_Npr # cz + PrepCase_Pre # U + PunctSide_Ini # U + PunctSide_Fin # U + PunctType_Peri # U + PunctType_Qest # U + PunctType_Excl # U + PunctType_Quot # U + PunctType_Brck # U + PunctType_Comm # U + PunctType_Colo # U + PunctType_Semi # U + PunctType_Dash # U + Style_Arch # cz, fi, U + Style_Rare # cz, fi, U + Style_Poet # cz, U + Style_Norm # cz, U + Style_Coll # cz, U + Style_Vrnc # cz, U + Style_Sing # cz, U + Style_Expr # cz, U + Style_Derg # cz, U + Style_Vulg # cz, U + Style_Yes # fi, U + StyleVariant_StyleShort # cz + StyleVariant_StyleBound # cz, sl + VerbType_Aux # U + VerbType_Cop # U + VerbType_Mod # U + VerbType_Light # U diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 96a4ba884..fc6a4936b 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -1,11 +1,89 @@ -# cython: embedsignature=True +from os import path +from .lemmatizer import Lemmatizer + +try: + import ujson as json +except ImportError: + import json + +from .parts_of_speech import UNIV_POS_NAMES +from .parts_of_speech cimport ADJ, VERB, NOUN -cdef int set_morph_from_dict(Morphology* morph, dict props) except -1: - morph.number = props.get('number', 0) - morph.tenspect = 
props.get('tenspect', 0) - morph.mood = props.get('mood', 0) - morph.gender = props.get('gender', 0) - morph.person = props.get('person', 0) - morph.case = props.get('case', 0) - morph.misc = props.get('misc', 0) +cdef class Morphology: + def __init__(self, StringStore string_store, tag_map, lemmatizer): + self.mem = Pool() + self.strings = string_store + self.lemmatizer = lemmatizer + self.n_tags = len(tag_map) + 1 + self.tag_names = tuple(sorted(tag_map.keys())) + self.reverse_index = {} + + self.rich_tags = self.mem.alloc(self.n_tags, sizeof(RichTagC)) + for i, (tag_str, props) in enumerate(sorted(tag_map.items())): + self.rich_tags[i].id = i + self.rich_tags[i].name = self.strings[tag_str] + self.rich_tags[i].morph = 0 + self.reverse_index[self.rich_tags[i].name] = i + self._cache = PreshMapArray(self.n_tags) + + cdef int assign_tag(self, TokenC* token, tag) except -1: + cdef int tag_id + if isinstance(tag, basestring): + try: + tag_id = self.reverse_index[self.strings[tag]] + except KeyError: + print tag + raise + else: + tag_id = tag + analysis = self._cache.get(tag_id, token.lex.orth) + if analysis is NULL: + analysis = self.mem.alloc(1, sizeof(MorphAnalysisC)) + analysis.tag = self.rich_tags[tag_id] + analysis.lemma = self.lemmatize(analysis.tag.pos, token.lex.orth) + token.lemma = analysis.lemma + token.pos = analysis.tag.pos + token.tag = analysis.tag.name + token.morph = analysis.tag.morph + + cdef int assign_feature(self, uint64_t* morph, feature, value) except -1: + pass + + def load_morph_exceptions(self, dict exc): + # Map (form, pos) to (lemma, rich tag) + cdef unicode pos_str + cdef unicode form_str + cdef unicode lemma_str + cdef dict entries + cdef dict props + cdef int lemma + cdef attr_t orth + cdef int pos + for tag_str, entries in exc.items(): + tag = self.strings[tag_str] + rich_tag = self.rich_tags[self.reverse_index[tag]] + for form_str, props in entries.items(): + cached = self.mem.alloc(1, sizeof(MorphAnalysisC)) + orth = 
self.strings[form_str] + for name_str, value_str in props.items(): + if name_str == 'L': + cached.lemma = self.strings[value_str] + else: + self.assign_feature(&cached.tag.morph, name_str, value_str) + if cached.lemma == 0: + cached.lemma = self.lemmatize(rich_tag.pos, orth) + self._cache.set(rich_tag.pos, orth, cached) + + def lemmatize(self, const univ_pos_t pos, attr_t orth): + if self.lemmatizer is None: + return orth + cdef unicode py_string = self.strings[orth] + if pos != NOUN and pos != VERB and pos != ADJ: + return orth + cdef set lemma_strings + cdef unicode lemma_string + lemma_strings = self.lemmatizer(py_string, pos) + lemma_string = sorted(lemma_strings)[0] + lemma = self.strings[lemma_string] + return lemma diff --git a/spacy/orth.pyx b/spacy/orth.pyx index 6ffac839b..df4e2dc32 100644 --- a/spacy/orth.pyx +++ b/spacy/orth.pyx @@ -69,7 +69,7 @@ TLDs = set("com|org|edu|gov|net|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|mu cpdef bint like_url(unicode string): # We're looking for things that function in text like URLs. So, valid URL # or not, anything they say http:// is going to be good. 
- if string.startswith('http://'): + if string.startswith('http://') or string.startswith('https://'): return True elif string.startswith('www.') and len(string) >= 5: return True @@ -92,6 +92,7 @@ cpdef bint like_url(unicode string): return False +# TODO: This should live in the language.orth NUM_WORDS = set('zero one two three four five six seven eight nine ten' 'eleven twelve thirteen fourteen fifteen sixteen seventeen' 'eighteen nineteen twenty thirty forty fifty sixty seventy' diff --git a/spacy/parts_of_speech.pxd b/spacy/parts_of_speech.pxd index b915b9dde..e410c6971 100644 --- a/spacy/parts_of_speech.pxd +++ b/spacy/parts_of_speech.pxd @@ -2,17 +2,22 @@ cpdef enum univ_pos_t: NO_TAG ADJ - ADV ADP + ADV + AUX CONJ DET + INTJ NOUN NUM + PART PRON - PRT + PROPN + PUNCT + SCONJ + SYM VERB X - PUNCT EOL SPACE N_UNIV_TAGS diff --git a/spacy/parts_of_speech.pyx b/spacy/parts_of_speech.pyx index 994a48eba..8c2348a47 100644 --- a/spacy/parts_of_speech.pyx +++ b/spacy/parts_of_speech.pyx @@ -4,17 +4,22 @@ from __future__ import unicode_literals UNIV_POS_NAMES = { "NO_TAG": NO_TAG, "ADJ": ADJ, - "ADV": ADV, "ADP": ADP, + "ADV": ADV, + "AUX": AUX, "CONJ": CONJ, "DET": DET, + "INTJ": INTJ, "NOUN": NOUN, "NUM": NUM, + "PART": PART, "PRON": PRON, - "PRT": PRT, + "PROPN": PROPN, + "PUNCT": PUNCT, + "SCONJ": SCONJ, + "SYM": SYM, "VERB": VERB, "X": X, - "PUNCT": PUNCT, - "SPACE": SPACE, - "EOL": EOL + "EOL": EOL, + "SPACE": SPACE } diff --git a/spacy/strings.pyx b/spacy/strings.pyx index b35ed2ccb..a4a470158 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -108,6 +108,11 @@ cdef class StringStore: else: raise TypeError(type(string_or_id)) + def __iter__(self): + cdef int i + for i in range(self.size): + yield self[i] + cdef const Utf8Str* intern(self, unsigned char* chars, int length) except NULL: # 0 means missing, but we don't bother offsetting the index. 
key = hash64(chars, length * sizeof(char), 0) @@ -137,6 +142,8 @@ cdef class StringStore: def load(self, loc): with codecs.open(loc, 'r', 'utf8') as file_: strings = file_.read().split(SEPARATOR) + if strings == ['']: + return None cdef unicode string cdef bytes byte_string for string in strings: diff --git a/spacy/structs.pxd b/spacy/structs.pxd index f3095df51..a0a3d65a3 100644 --- a/spacy/structs.pxd +++ b/spacy/structs.pxd @@ -1,4 +1,4 @@ -from libc.stdint cimport uint8_t, uint32_t, int32_t +from libc.stdint cimport uint8_t, uint32_t, int32_t, uint64_t from .typedefs cimport flags_t, attr_t, hash_t from .parts_of_speech cimport univ_pos_t @@ -26,22 +26,6 @@ cdef struct LexemeC: float l2_norm -cdef struct Morphology: - uint8_t number - uint8_t tenspect # Tense/aspect/voice - uint8_t mood - uint8_t gender - uint8_t person - uint8_t case - uint8_t misc - - -cdef struct PosTag: - Morphology morph - int id - univ_pos_t pos - - cdef struct Entity: int start int end @@ -59,8 +43,8 @@ cdef struct Constituent: cdef struct TokenC: const LexemeC* lex - Morphology morph const Constituent* ctnt + uint64_t morph univ_pos_t pos bint spacy int tag diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index b297140ba..265018920 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -109,7 +109,7 @@ cdef bint label_is_gold(const GoldParseC* gold, int head, int child, int label) cdef bint _is_gold_root(const GoldParseC* gold, int word) nogil: return gold.labels[word] == -1 or gold.heads[word] == word - + cdef class Shift: @staticmethod @@ -267,7 +267,7 @@ cdef class Break: return cost else: return cost + 1 - + @staticmethod cdef inline int label_cost(StateClass s, const GoldParseC* gold, int label) nogil: return 0 @@ -279,7 +279,7 @@ cdef int _get_root(int word, const GoldParseC* gold) nogil: return -1 else: return word - + cdef class ArcEager(TransitionSystem): @classmethod @@ -322,8 +322,9 @@ cdef class ArcEager(TransitionSystem): cdef 
Transition lookup_transition(self, object name) except *: if '-' in name: move_str, label_str = name.split('-', 1) - label = self.label_ids[label_str] + label = self.strings[label_str] else: + move_str = name label = 0 move = MOVE_NAMES.index(move_str) for i in range(self.n_moves): diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx index fbd580b29..8414456b6 100644 --- a/spacy/syntax/ner.pyx +++ b/spacy/syntax/ner.pyx @@ -47,6 +47,7 @@ cdef bint _entity_is_sunk(StateClass st, Transition* golds) nogil: else: return False + cdef class BiluoPushDown(TransitionSystem): @classmethod def get_labels(cls, gold_tuples): @@ -160,7 +161,17 @@ cdef class Missing: cdef class Begin: @staticmethod cdef bint is_valid(StateClass st, int label) nogil: - return label != 0 and not st.entity_is_open() + # Ensure we don't clobber preset entities. If no entity preset, + # ent_iob is 0 + cdef int preset_ent_iob = st.B_(0).ent_iob + if preset_ent_iob == 1: + return False + elif preset_ent_iob == 2: + return False + elif preset_ent_iob == 3 and st.B_(0).ent_type != label: + return False + else: + return label != 0 and not st.entity_is_open() @staticmethod cdef int transition(StateClass st, int label) nogil: @@ -190,6 +201,14 @@ cdef class Begin: cdef class In: @staticmethod cdef bint is_valid(StateClass st, int label) nogil: + cdef int preset_ent_iob = st.B_(0).ent_iob + if preset_ent_iob == 2: + return False + elif preset_ent_iob == 3: + return False + # TODO: Is this quite right? 
+ elif st.B_(1).ent_iob != preset_ent_iob: + return False return st.entity_is_open() and label != 0 and st.E_(0).ent_type == label @staticmethod @@ -230,6 +249,14 @@ cdef class In: cdef class Last: @staticmethod cdef bint is_valid(StateClass st, int label) nogil: + cdef int preset_ent_iob = st.B_(0).ent_iob + if preset_ent_iob == 2: + return False + elif preset_ent_iob == 3: + return False + elif st.B_(1).ent_iob == 1: + return False + return st.entity_is_open() and label != 0 and st.E_(0).ent_type == label @staticmethod @@ -269,6 +296,15 @@ cdef class Last: cdef class Unit: @staticmethod cdef bint is_valid(StateClass st, int label) nogil: + cdef int preset_ent_iob = st.B_(0).ent_iob + if preset_ent_iob == 2: + return False + elif preset_ent_iob == 1: + return False + elif preset_ent_iob == 3 and st.B_(0).ent_type != label: + return False + elif st.B_(1).ent_iob == 1: + return False return label != 0 and not st.entity_is_open() @staticmethod @@ -300,6 +336,11 @@ cdef class Unit: cdef class Out: @staticmethod cdef bint is_valid(StateClass st, int label) nogil: + cdef int preset_ent_iob = st.B_(0).ent_iob + if preset_ent_iob == 3: + return False + elif preset_ent_iob == 1: + return False return not st.entity_is_open() @staticmethod diff --git a/spacy/syntax/parser.pxd b/spacy/syntax/parser.pxd index 411172eda..70a0229c2 100644 --- a/spacy/syntax/parser.pxd +++ b/spacy/syntax/parser.pxd @@ -11,8 +11,8 @@ from .stateclass cimport StateClass cdef class Parser: - cdef readonly object cfg cdef readonly Model model cdef readonly TransitionSystem moves cdef void parse(self, StateClass stcls, ExampleC eg) nogil + cdef void predict(self, StateClass stcls, ExampleC* eg) nogil diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index 69d70ad03..cf61647b9 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -67,16 +67,22 @@ def ParserFactory(transition_system): cdef class Parser: - def __init__(self, StringStore strings, model_dir, transition_system): 
+ def __init__(self, StringStore strings, transition_system, model): + self.moves = transition_system + self.model = model + + @classmethod + def from_dir(cls, model_dir, strings, transition_system): if not os.path.exists(model_dir): print >> sys.stderr, "Warning: No model found at", model_dir elif not os.path.isdir(model_dir): print >> sys.stderr, "Warning: model path:", model_dir, "is not a directory" - else: - self.cfg = Config.read(model_dir, 'config') - self.moves = transition_system(strings, self.cfg.labels) - templates = get_templates(self.cfg.features) - self.model = Model(self.moves.n_moves, templates, model_dir) + cfg = Config.read(model_dir, 'config') + moves = transition_system(strings, cfg.labels) + templates = get_templates(cfg.features) + model = Model(moves.n_moves, templates, model_dir) + return cls(strings, moves, model) + def __call__(self, Doc tokens): cdef StateClass stcls = StateClass.init(tokens.data, tokens.length) @@ -84,17 +90,21 @@ cdef class Parser: cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE, self.model.n_feats, self.model.n_feats) - with nogil: - self.parse(stcls, eg.c) + self.parse(stcls, eg.c) tokens.set_parse(stcls._sent) + cdef void predict(self, StateClass stcls, ExampleC* eg) nogil: + memset(eg.scores, 0, eg.nr_class * sizeof(weight_t)) + self.moves.set_valid(eg.is_valid, stcls) + fill_context(eg.atoms, stcls) + self.model.set_scores(eg.scores, eg.atoms) + eg.guess = arg_max_if_true(eg.scores, eg.is_valid, self.model.n_classes) + cdef void parse(self, StateClass stcls, ExampleC eg) nogil: while not stcls.is_final(): - memset(eg.scores, 0, eg.nr_class * sizeof(weight_t)) - self.moves.set_valid(eg.is_valid, stcls) - fill_context(eg.atoms, stcls) - self.model.set_scores(eg.scores, eg.atoms) - eg.guess = arg_max_if_true(eg.scores, eg.is_valid, self.model.n_classes) + self.predict(stcls, &eg) + if not eg.is_valid[eg.guess]: + break self.moves.c[eg.guess].do(stcls, self.moves.c[eg.guess].label) 
self.moves.finalize_state(stcls) @@ -109,15 +119,93 @@ cdef class Parser: cdef Transition G while not stcls.is_final(): memset(eg.c.scores, 0, eg.c.nr_class * sizeof(weight_t)) - self.moves.set_costs(eg.c.is_valid, eg.c.costs, stcls, gold) - fill_context(eg.c.atoms, stcls) - self.model.train(eg) - G = self.moves.c[eg.c.guess] self.moves.c[eg.c.guess].do(stcls, self.moves.c[eg.c.guess].label) loss += eg.c.loss return loss + + def step_through(self, Doc doc): + return StepwiseState(self, doc) + + +cdef class StepwiseState: + cdef readonly StateClass stcls + cdef readonly Example eg + cdef readonly Doc doc + cdef readonly Parser parser + + def __init__(self, Parser parser, Doc doc): + self.parser = parser + self.doc = doc + self.stcls = StateClass.init(doc.data, doc.length) + self.parser.moves.initialize_state(self.stcls) + self.eg = Example(self.parser.model.n_classes, CONTEXT_SIZE, + self.parser.model.n_feats, self.parser.model.n_feats) + + def __enter__(self): + return self + + def __exit__(self, type, value, traceback): + self.finish() + + @property + def is_final(self): + return self.stcls.is_final() + + @property + def stack(self): + return self.stcls.stack + + @property + def queue(self): + return self.stcls.queue + + @property + def heads(self): + return [self.stcls.H(i) for i in range(self.stcls.length)] + + @property + def deps(self): + return [self.doc.vocab.strings[self.stcls._sent[i].dep] + for i in range(self.stcls.length)] + + def predict(self): + self.parser.predict(self.stcls, &self.eg.c) + action = self.parser.moves.c[self.eg.c.guess] + return self.parser.moves.move_name(action.move, action.label) + + def transition(self, action_name): + moves = {'S': 0, 'D': 1, 'L': 2, 'R': 3} + if action_name == '_': + action_name = self.predict() + action = self.parser.moves.lookup_transition(action_name) + elif action_name == 'L' or action_name == 'R': + self.predict() + move = moves[action_name] + clas = _arg_max_clas(self.eg.c.scores, move, self.parser.moves.c, 
+ self.eg.c.nr_class) + action = self.parser.moves.c[clas] + else: + action = self.parser.moves.lookup_transition(action_name) + action.do(self.stcls, action.label) + + def finish(self): + if self.stcls.is_final(): + self.parser.moves.finalize_state(self.stcls) + self.doc.set_parse(self.stcls._sent) + + +cdef int _arg_max_clas(const weight_t* scores, int move, const Transition* actions, + int nr_class) except -1: + cdef weight_t score = 0 + cdef int mode = -1 + cdef int i + for i in range(nr_class): + if actions[i].move == move and (mode == -1 or scores[i] >= score): + mode = i + score = scores[i] + return mode diff --git a/spacy/syntax/stateclass.pxd b/spacy/syntax/stateclass.pxd index 905d8cdde..8a10f5a39 100644 --- a/spacy/syntax/stateclass.pxd +++ b/spacy/syntax/stateclass.pxd @@ -71,7 +71,6 @@ cdef class StateClass: return -1 return self._sent[i].head + i - cdef int E(self, int i) nogil cdef int R(self, int i, int idx) nogil @@ -125,7 +124,7 @@ cdef class StateClass: cdef void add_arc(self, int head, int child, int label) nogil cdef void del_arc(self, int head, int child) nogil - + cdef void open_ent(self, int label) nogil cdef void close_ent(self) nogil diff --git a/spacy/syntax/stateclass.pyx b/spacy/syntax/stateclass.pyx index 2a7bcfd7a..6f7951987 100644 --- a/spacy/syntax/stateclass.pyx +++ b/spacy/syntax/stateclass.pyx @@ -36,6 +36,14 @@ cdef class StateClass: self._buffer[i] = i self._empty_token.lex = &EMPTY_LEXEME + @property + def stack(self): + return {self.S(i) for i in range(self._s_i)} + + @property + def queue(self): + return {self.B(i) for i in range(self._b_i)} + cdef int E(self, int i) nogil: if self._e_i <= 0 or self._e_i >= self.length: return 0 diff --git a/spacy/syntax/transition_system.pxd b/spacy/syntax/transition_system.pxd index 387cd0fc9..4cf9aae7e 100644 --- a/spacy/syntax/transition_system.pxd +++ b/spacy/syntax/transition_system.pxd @@ -47,6 +47,6 @@ cdef class TransitionSystem: cdef Transition init_transition(self, int clas, int 
move, int label) except * cdef int set_valid(self, int* output, StateClass state) nogil - + cdef int set_costs(self, int* is_valid, int* costs, StateClass state, GoldParse gold) except -1 diff --git a/spacy/syntax/transition_system.pyx b/spacy/syntax/transition_system.pyx index 00395333f..86aef1fbc 100644 --- a/spacy/syntax/transition_system.pyx +++ b/spacy/syntax/transition_system.pyx @@ -54,6 +54,10 @@ cdef class TransitionSystem: cdef Transition init_transition(self, int clas, int move, int label) except *: raise NotImplementedError + def is_valid(self, StateClass stcls, move_name): + action = self.lookup_transition(move_name) + return action.is_valid(stcls, action.label) + cdef int set_valid(self, int* is_valid, StateClass stcls) nogil: cdef int i for i in range(self.n_moves): diff --git a/spacy/tagger.pxd b/spacy/tagger.pxd new file mode 100644 index 000000000..28d7fc711 --- /dev/null +++ b/spacy/tagger.pxd @@ -0,0 +1,12 @@ +from ._ml cimport Model +from .structs cimport TokenC +from .vocab cimport Vocab + + +cdef class Tagger: + cdef readonly Vocab vocab + cdef readonly Model model + cdef public dict freqs + + cdef int predict(self, int i, const TokenC* tokens) except -1 + cdef int update(self, int i, const TokenC* tokens, int gold) except -1 diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx new file mode 100644 index 000000000..756bb7ea4 --- /dev/null +++ b/spacy/tagger.pyx @@ -0,0 +1,220 @@ +import json +from os import path +from collections import defaultdict + +from thinc.typedefs cimport atom_t, weight_t + +from .typedefs cimport attr_t +from .tokens.doc cimport Doc +from .attrs cimport TAG +from .parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON +from .parts_of_speech cimport VERB, X, PUNCT, EOL, SPACE + +from .attrs cimport * +from ._ml cimport arg_max + + +cpdef enum: + P2_orth + P2_cluster + P2_shape + P2_prefix + P2_suffix + P2_pos + P2_lemma + P2_flags + + P1_orth + P1_cluster + P1_shape + P1_prefix + P1_suffix + P1_pos + 
P1_lemma + P1_flags + + W_orth + W_cluster + W_shape + W_prefix + W_suffix + W_pos + W_lemma + W_flags + + N1_orth + N1_cluster + N1_shape + N1_prefix + N1_suffix + N1_pos + N1_lemma + N1_flags + + N2_orth + N2_cluster + N2_shape + N2_prefix + N2_suffix + N2_pos + N2_lemma + N2_flags + + N_CONTEXT_FIELDS + + +cdef class Tagger: + """A part-of-speech tagger for English""" + @classmethod + def read_config(cls, data_dir): + return json.load(open(path.join(data_dir, 'pos', 'config.json'))) + + @classmethod + def default_templates(cls): + return ( + (W_orth,), + (P1_lemma, P1_pos), + (P2_lemma, P2_pos), + (N1_orth,), + (N2_orth,), + + (W_suffix,), + (W_prefix,), + + (P1_pos,), + (P2_pos,), + (P1_pos, P2_pos), + (P1_pos, W_orth), + (P1_suffix,), + (N1_suffix,), + + (W_shape,), + (W_cluster,), + (N1_cluster,), + (N2_cluster,), + (P1_cluster,), + (P2_cluster,), + + (W_flags,), + (N1_flags,), + (N2_flags,), + (P1_flags,), + (P2_flags,), + ) + + @classmethod + def blank(cls, vocab, templates): + model = Model(vocab.morphology.n_tags, templates, model_loc=None) + return cls(vocab, model) + + @classmethod + def from_dir(cls, data_dir, vocab): + if path.exists(path.join(data_dir, 'templates.json')): + templates = json.loads(open(path.join(data_dir, 'templates.json'))) + else: + templates = cls.default_templates() + model = Model(vocab.morphology.n_tags, templates, data_dir) + return cls(vocab, model) + + def __init__(self, Vocab vocab, model): + self.vocab = vocab + self.model = model + + # TODO: Move this to tag map + self.freqs = {TAG: defaultdict(int)} + for tag in self.tag_names: + self.freqs[TAG][self.vocab.strings[tag]] = 1 + self.freqs[TAG][0] = 1 + + @property + def tag_names(self): + return self.vocab.morphology.tag_names + + def __call__(self, Doc tokens): + """Apply the tagger, setting the POS tags onto the Doc object. + + Args: + tokens (Doc): The tokens to be tagged. 
+ """ + if tokens.length == 0: + return 0 + cdef int i + cdef const weight_t* scores + for i in range(tokens.length): + if tokens.data[i].pos == 0: + guess = self.predict(i, tokens.data) + self.vocab.morphology.assign_tag(&tokens.data[i], guess) + + tokens.is_tagged = True + tokens._py_tokens = [None] * tokens.length + + def tag_from_strings(self, Doc tokens, object tag_strs): + cdef int i + for i in range(tokens.length): + self.vocab.morphology.assign_tag(&tokens.data[i], tag_strs[i]) + tokens.is_tagged = True + tokens._py_tokens = [None] * tokens.length + + def train(self, Doc tokens, object gold_tag_strs): + assert len(tokens) == len(gold_tag_strs) + cdef int i + cdef int loss + cdef const weight_t* scores + try: + golds = [self.tag_names.index(g) if g is not None else -1 for g in gold_tag_strs] + except ValueError: + raise ValueError( + [g for g in gold_tag_strs if g is not None and g not in self.tag_names]) + correct = 0 + for i in range(tokens.length): + guess = self.update(i, tokens.data, golds[i]) + loss = golds[i] != -1 and guess != golds[i] + + self.vocab.morphology.assign_tag(&tokens.data[i], guess) + + correct += loss == 0 + self.freqs[TAG][tokens.data[i].tag] += 1 + return correct + + cdef int predict(self, int i, const TokenC* tokens) except -1: + cdef atom_t[N_CONTEXT_FIELDS] context + _fill_from_token(&context[P2_orth], &tokens[i-2]) + _fill_from_token(&context[P1_orth], &tokens[i-1]) + _fill_from_token(&context[W_orth], &tokens[i]) + _fill_from_token(&context[N1_orth], &tokens[i+1]) + _fill_from_token(&context[N2_orth], &tokens[i+2]) + scores = self.model.score(context) + return arg_max(scores, self.model.n_classes) + + cdef int update(self, int i, const TokenC* tokens, int gold) except -1: + cdef atom_t[N_CONTEXT_FIELDS] context + _fill_from_token(&context[P2_orth], &tokens[i-2]) + _fill_from_token(&context[P1_orth], &tokens[i-1]) + _fill_from_token(&context[W_orth], &tokens[i]) + _fill_from_token(&context[N1_orth], &tokens[i+1]) + 
_fill_from_token(&context[N2_orth], &tokens[i+2]) + scores = self.model.score(context) + guess = arg_max(scores, self.model.n_classes) + loss = guess != gold if gold != -1 else 0 + self.model.update(context, guess, gold, loss) + return guess + + +cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil: + context[0] = t.lex.lower + context[1] = t.lex.cluster + context[2] = t.lex.shape + context[3] = t.lex.prefix + context[4] = t.lex.suffix + context[5] = t.tag + context[6] = t.lemma + if t.lex.flags & (1 << IS_ALPHA): + context[7] = 1 + elif t.lex.flags & (1 << IS_PUNCT): + context[7] = 2 + elif t.lex.flags & (1 << LIKE_URL): + context[7] = 3 + elif t.lex.flags & (1 << LIKE_NUM): + context[7] = 4 + else: + context[7] = 0 diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd index a7f69c5aa..9d60d2a6e 100644 --- a/spacy/tokenizer.pxd +++ b/spacy/tokenizer.pxd @@ -4,15 +4,10 @@ from preshed.maps cimport PreshMap from cymem.cymem cimport Pool from .typedefs cimport hash_t -from .structs cimport LexemeC, TokenC, Morphology +from .structs cimport LexemeC, TokenC from .strings cimport StringStore from .tokens.doc cimport Doc -from .vocab cimport Vocab, _Cached - - -cdef union LexemesOrTokens: - const LexemeC* const* lexemes - TokenC* tokens +from .vocab cimport Vocab, LexemesOrTokens, _Cached cdef class Tokenizer: diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 1e857aefc..d54770d2b 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -11,7 +11,6 @@ from cpython cimport Py_UNICODE_ISSPACE from cymem.cymem cimport Pool from preshed.maps cimport PreshMap -from .morphology cimport set_morph_from_dict from .strings cimport hash_string cimport cython @@ -29,7 +28,7 @@ cdef class Tokenizer: self._suffix_re = suffix_re self._infix_re = infix_re self.vocab = vocab - self._load_special_tokenization(rules, self.vocab.pos_tags) + self._load_special_tokenization(rules) @classmethod def from_dir(cls, Vocab vocab, data_dir): @@ -193,9 +192,7 @@ 
cdef class Tokenizer: tokens.push_back(prefixes[0][i], False) if string: cache_hit = self._try_cache(hash_string(string), tokens) - if cache_hit: - pass - else: + if not cache_hit: match = self.find_infix(string) if match is None: tokens.push_back(self.vocab.get(tokens.mem, string), False) @@ -242,7 +239,7 @@ cdef class Tokenizer: match = self._suffix_re.search(string) return (match.end() - match.start()) if match is not None else 0 - def _load_special_tokenization(self, object rules, object tag_map): + def _load_special_tokenization(self, special_cases): '''Add a special-case tokenization rule. ''' cdef int i @@ -253,29 +250,11 @@ cdef class Tokenizer: cdef dict props cdef LexemeC** lexemes cdef hash_t hashed - for chunk, substrings in sorted(rules.items()): - tokens = self.mem.alloc(len(substrings) + 1, sizeof(TokenC)) - for i, props in enumerate(substrings): - form = props['F'] - lemma = props.get("L", None) - tokens[i].lex = self.vocab.get(self.vocab.mem, form) - if lemma is not None: - tokens[i].lemma = self.vocab.strings[lemma] - else: - tokens[i].lemma = 0 - if 'pos' in props: - tokens[i].tag = self.vocab.strings[props['pos']] - tokens[i].pos = tag_map[props['pos']][0] - # These are defaults, which can be over-ridden by the - # token-specific props. 
- set_morph_from_dict(&tokens[i].morph, tag_map[props['pos']][1]) - if tokens[i].lemma == 0: - tokens[i].lemma = tokens[i].lex.orth - set_morph_from_dict(&tokens[i].morph, props) + for chunk, substrings in sorted(special_cases.items()): cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached)) cached.length = len(substrings) cached.is_lex = False - cached.data.tokens = tokens - hashed = hash_string(chunk) - self._specials.set(hashed, cached) - self._cache.set(hashed, cached) + cached.data.tokens = self.vocab.make_fused_token(substrings) + key = hash_string(chunk) + self._specials.set(key, cached) + self._cache.set(key, cached) diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd index 7de5e0bea..a13858175 100644 --- a/spacy/tokens/doc.pxd +++ b/spacy/tokens/doc.pxd @@ -4,14 +4,19 @@ from preshed.counter cimport PreshCounter from ..vocab cimport Vocab from ..structs cimport TokenC, LexemeC +from ..typedefs cimport attr_t +from ..attrs cimport attr_id_t + + +cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil ctypedef const LexemeC* const_Lexeme_ptr -ctypedef TokenC* TokenC_ptr +ctypedef const TokenC* const_TokenC_ptr ctypedef fused LexemeOrToken: const_Lexeme_ptr - TokenC_ptr + const_TokenC_ptr cdef class Doc: diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index feb11bd87..41d24d8ac 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -5,16 +5,17 @@ from libc.stdint cimport uint32_t import numpy import struct +from ..lexeme cimport Lexeme from ..lexeme cimport EMPTY_LEXEME from ..typedefs cimport attr_t, flags_t from ..attrs cimport attr_id_t from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE from ..parts_of_speech import UNIV_POS_NAMES -from ..parts_of_speech cimport CONJ, PUNCT -from ..lexeme cimport check_flag -from ..lexeme cimport get_attr as get_lex_attr -from .spans import Span +from ..parts_of_speech cimport 
CONJ, PUNCT, NOUN +from ..parts_of_speech cimport univ_pos_t +from ..lexeme cimport Lexeme +from .spans cimport Span from .token cimport Token from ..serialize.bits cimport BitArray @@ -47,7 +48,7 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil: elif feat_name == ENT_TYPE: return token.ent_type else: - return get_lex_attr(token.lex, feat_name) + return Lexeme.get_struct_attr(token.lex, feat_name) cdef class Doc: @@ -119,40 +120,79 @@ cdef class Doc: def string(self): return u''.join([t.string for t in self]) - @property - def ents(self): - """Yields named-entity Span objects. + property ents: + def __get__(self): + """Yields named-entity Span objects. - Iterate over the span to get individual Token objects, or access the label: + Iterate over the span to get individual Token objects, or access the label: - >>> from spacy.en import English - >>> nlp = English() - >>> tokens = nlp(u'Mr. Best flew to New York on Saturday morning.') - >>> ents = list(tokens.ents) - >>> ents[0].label, ents[0].label_, ''.join(t.orth_ for t in ents[0]) - (112504, u'PERSON', u'Best ') - """ - cdef int i - cdef const TokenC* token - cdef int start = -1 - cdef int label = 0 + >>> from spacy.en import English + >>> nlp = English() + >>> tokens = nlp(u'Mr. 
Best flew to New York on Saturday morning.') + >>> ents = list(tokens.ents) + >>> ents[0].label, ents[0].label_, ''.join(t.orth_ for t in ents[0]) + (112504, u'PERSON', u'Best ') + """ + cdef int i + cdef const TokenC* token + cdef int start = -1 + cdef int label = 0 + output = [] + for i in range(self.length): + token = &self.data[i] + if token.ent_iob == 1: + assert start != -1 + elif token.ent_iob == 2 or token.ent_iob == 0: + if start != -1: + output.append(Span(self, start, i, label=label)) + start = -1 + label = 0 + elif token.ent_iob == 3: + if start != -1: + output.append(Span(self, start, i, label=label)) + start = i + label = token.ent_type + if start != -1: + output.append(Span(self, start, self.length, label=label)) + return tuple(output) + + def __set__(self, ents): + # TODO: + # 1. Allow negative matches + # 2. Ensure pre-set NERs are not over-written during statistical prediction + # 3. Test basic data-driven ORTH gazetteer + # 4. Test more nuanced date and currency regex + cdef int i + for i in range(self.length): + self.data[i].ent_type = 0 + self.data[i].ent_iob = 0 + cdef attr_t ent_type + cdef int start, end + for ent_type, start, end in ents: + if ent_type is None or ent_type < 0: + # Mark as O + for i in range(start, end): + self.data[i].ent_type = 0 + self.data[i].ent_iob = 2 + else: + # Mark (inside) as I + for i in range(start, end): + self.data[i].ent_type = ent_type + self.data[i].ent_iob = 1 + # Set start as B + self.data[start].ent_iob = 3 + + @property + def noun_chunks(self): + """Yield spans for base noun phrases.""" + cdef const TokenC* word + labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', 'attr'] + np_deps = [self.vocab.strings[label] for label in labels] + np_label = self.vocab.strings['NP'] for i in range(self.length): - token = &self.data[i] - if token.ent_iob == 1: - assert start != -1 - pass - elif token.ent_iob == 2: - if start != -1: - yield Span(self, start, i, label=label) - start = -1 - label = 0 - elif 
token.ent_iob == 3: - if start != -1: - yield Span(self, start, i, label=label) - start = i - label = token.ent_type - if start != -1: - yield Span(self, start, self.length, label=label) + word = &self.data[i] + if word.pos == NOUN and word.dep in np_deps: + yield Span(self, word.l_edge, i+1, label=np_label) @property def sents(self): @@ -171,7 +211,7 @@ cdef class Doc: if self.length == self.max_length: self._realloc(self.length * 2) cdef TokenC* t = &self.data[self.length] - if LexemeOrToken is TokenC_ptr: + if LexemeOrToken is const_TokenC_ptr: t[0] = lex_or_tok[0] else: t.lex = lex_or_tok @@ -179,6 +219,7 @@ cdef class Doc: t.idx = 0 else: t.idx = (t-1).idx + (t-1).lex.length + (t-1).spacy + assert t.lex.orth != 0 t.spacy = has_space self.length += 1 self._py_tokens.append(None) @@ -288,6 +329,9 @@ cdef class Doc: elif attr_id == TAG: for i in range(length): tokens[i].tag = values[i] + elif attr_id == POS: + for i in range(length): + tokens[i].pos = values[i] elif attr_id == DEP: for i in range(length): tokens[i].dep = values[i] @@ -297,20 +341,7 @@ cdef class Doc: elif attr_id == ENT_TYPE: for i in range(length): tokens[i].ent_type = values[i] - cdef TokenC* head - cdef TokenC* child - # Set left edges - for i in range(length): - child = &tokens[i] - head = &tokens[i + child.head] - if child < head and child.l_edge < head.l_edge: - head.l_edge = child.l_edge - # Set right edges --- same as above, but iterate in reverse - for i in range(length-1, -1, -1): - child = &tokens[i] - head = &tokens[i + child.head] - if child > head and child.r_edge > head.r_edge: - head.r_edge = child.r_edge + set_children_from_heads(self.data, self.length) return self def to_bytes(self): @@ -354,14 +385,18 @@ cdef class Doc: break else: return None - cdef unicode string = self.string + + cdef Span span = self[start:end] # Get LexemeC for newly merged token - new_orth = string[start_idx:end_idx] + new_orth = ''.join([t.string for t in span]) + if span[-1].whitespace_: + new_orth = 
new_orth[:-1] cdef const LexemeC* lex = self.vocab.get(self.mem, new_orth) # House the new merged token where it starts cdef TokenC* token = &self.data[start] # Update fields token.lex = lex + token.spacy = self.data[end].spacy # What to do about morphology?? # TODO: token.morph = ??? token.tag = self.vocab.strings[tag] @@ -372,30 +407,16 @@ cdef class Doc: else: token.ent_iob = 3 token.ent_type = self.vocab.strings[ent_type] - # Fix dependencies # Begin by setting all the head indices to absolute token positions # This is easier to work with for now than the offsets + # Before thinking of something simpler, beware the case where a dependency + # bridges over the entity. Here the alignment of the tokens changes. + span_root = span.root.i + token.dep = span.root.dep for i in range(self.length): self.data[i].head += i - # Find the head of the merged token, and its dep relation - outer_heads = {} - for i in range(start, end): - head_idx = self.data[i].head - if head_idx == i or head_idx < start or head_idx >= end: - # Don't consider "heads" which are actually dominated by a word - # in the region we're merging - gp = head_idx - while self.data[gp].head != gp: - if start <= gp < end: - break - gp = self.data[gp].head - else: - # If we have multiple words attaching to the same head, - # but with different dep labels, we're preferring the last - # occurring dep label. Shrug. What else could we do, I guess? 
- outer_heads[head_idx] = self.data[i].dep - - token.head, token.dep = max(outer_heads.items()) + # Set the head of the merged token, and its dep relation, from the Span + token.head = self.data[span_root].head # Adjust deps before shrinking tokens # Tokens which point into the merged token should now point to it # Subtract the offset from all tokens which point to >= end @@ -406,7 +427,6 @@ cdef class Doc: self.data[i].head = start elif head_idx >= end: self.data[i].head -= offset - # TODO: Fix left and right deps # Now compress the token array for i in range(end, self.length): self.data[i - offset] = self.data[i] @@ -417,6 +437,28 @@ cdef class Doc: for i in range(self.length): # ...And, set heads back to a relative position self.data[i].head -= i - + # Set the left/right children, left/right edges + set_children_from_heads(self.data, self.length) + # Clear the cached Python objects + self._py_tokens = [None] * self.length # Return the merged Python object return self[start] + + +cdef int set_children_from_heads(TokenC* tokens, int length) except -1: + cdef TokenC* head + cdef TokenC* child + cdef int i + # Set left edges + for i in range(length): + child = &tokens[i] + head = &tokens[i + child.head] + if child < head and child.l_edge < head.l_edge: + head.l_edge = child.l_edge + # Set right edges --- same as above, but iterate in reverse + for i in range(length-1, -1, -1): + child = &tokens[i] + head = &tokens[i + child.head] + if child > head and child.r_edge > head.r_edge: + head.r_edge = child.r_edge + diff --git a/spacy/tokens/spans.pyx b/spacy/tokens/spans.pyx index 2c37e9b85..e2aa1a7f9 100644 --- a/spacy/tokens/spans.pyx +++ b/spacy/tokens/spans.pyx @@ -1,7 +1,7 @@ from __future__ import unicode_literals from collections import defaultdict -from ..structs cimport Morphology, TokenC, LexemeC +from ..structs cimport TokenC, LexemeC from ..typedefs cimport flags_t, attr_t from ..attrs cimport attr_id_t from ..parts_of_speech cimport univ_pos_t @@ -40,12 
+40,18 @@ cdef class Span: return self.end - self.start def __getitem__(self, int i): - return self._seq[self.start + i] + if i < 0: + return self._seq[self.end + i] + else: + return self._seq[self.start + i] def __iter__(self): for i in range(self.start, self.end): yield self._seq[i] + def merge(self, unicode tag, unicode lemma, unicode ent_type): + self._seq.merge(self[0].idx, self[-1].idx + len(self[-1]), tag, lemma, ent_type) + property root: """The first ancestor of the first word of the span that has its head outside the span. diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 6aa000f05..f3b9aa056 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -1,6 +1,5 @@ from libc.string cimport memcpy from cpython.mem cimport PyMem_Malloc, PyMem_Free -from ..lexeme cimport check_flag # Compiler crashes on memory view coercion without this. Should report bug. from cython.view cimport array as cvarray cimport numpy as np @@ -9,6 +8,7 @@ np.import_array() import numpy +from ..lexeme cimport Lexeme from ..parts_of_speech import UNIV_POS_NAMES from ..attrs cimport LEMMA @@ -20,6 +20,8 @@ from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP from ..attrs cimport IS_OOV +from ..lexeme cimport Lexeme + cdef class Token: """An individual token --- i.e. a word, a punctuation symbol, etc. Created @@ -42,7 +44,7 @@ cdef class Token: return self.string cpdef bint check_flag(self, attr_id_t flag_id) except -1: - return check_flag(self.c.lex, flag_id) + return Lexeme.check_flag(self.c.lex, flag_id) def nbor(self, int i=1): return self.doc[self.i+i] @@ -142,7 +144,7 @@ cdef class Token: """The leftward immediate children of the word, in the syntactic dependency parse. 
""" - cdef const TokenC* ptr = self.c - self.i + cdef const TokenC* ptr = self.c - (self.i - self.c.l_edge) while ptr < self.c: # If this head is still to the right of us, we can skip to it # No token that's between this token and this head could be our @@ -160,7 +162,7 @@ cdef class Token: def __get__(self): """The rightward immediate children of the word, in the syntactic dependency parse.""" - cdef const TokenC* ptr = (self.c - self.i) + (self.array_len - 1) + cdef const TokenC* ptr = self.c + (self.c.r_edge - self.i) tokens = [] while ptr > self.c: # If this head is still to the right of us, we can skip to it @@ -193,7 +195,7 @@ cdef class Token: property left_edge: def __get__(self): return self.doc[self.c.l_edge] - + property right_edge: def __get__(self): return self.doc[self.c.r_edge] @@ -202,7 +204,7 @@ cdef class Token: def __get__(self): """The token predicted by the parser to be the head of the current token.""" return self.doc[self.i + self.c.head] - + property conjuncts: def __get__(self): """Get a list of conjoined words""" @@ -286,37 +288,37 @@ cdef class Token: return self.vocab.strings[self.c.dep] property is_oov: - def __get__(self): return check_flag(self.c.lex, IS_OOV) + def __get__(self): return Lexeme.check_flag(self.c.lex, IS_OOV) property is_alpha: - def __get__(self): return check_flag(self.c.lex, IS_ALPHA) - + def __get__(self): return Lexeme.check_flag(self.c.lex, IS_ALPHA) + property is_ascii: - def __get__(self): return check_flag(self.c.lex, IS_ASCII) + def __get__(self): return Lexeme.check_flag(self.c.lex, IS_ASCII) property is_digit: - def __get__(self): return check_flag(self.c.lex, IS_DIGIT) + def __get__(self): return Lexeme.check_flag(self.c.lex, IS_DIGIT) property is_lower: - def __get__(self): return check_flag(self.c.lex, IS_LOWER) + def __get__(self): return Lexeme.check_flag(self.c.lex, IS_LOWER) property is_title: - def __get__(self): return check_flag(self.c.lex, IS_TITLE) + def __get__(self): return 
Lexeme.check_flag(self.c.lex, IS_TITLE) property is_punct: - def __get__(self): return check_flag(self.c.lex, IS_PUNCT) + def __get__(self): return Lexeme.check_flag(self.c.lex, IS_PUNCT) property is_space: - def __get__(self): return check_flag(self.c.lex, IS_SPACE) + def __get__(self): return Lexeme.check_flag(self.c.lex, IS_SPACE) property like_url: - def __get__(self): return check_flag(self.c.lex, LIKE_URL) - + def __get__(self): return Lexeme.check_flag(self.c.lex, LIKE_URL) + property like_num: - def __get__(self): return check_flag(self.c.lex, LIKE_NUM) + def __get__(self): return Lexeme.check_flag(self.c.lex, LIKE_NUM) property like_email: - def __get__(self): return check_flag(self.c.lex, LIKE_EMAIL) + def __get__(self): return Lexeme.check_flag(self.c.lex, LIKE_EMAIL) _pos_id_to_string = {id_: string for string, id_ in UNIV_POS_NAMES.items()} diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd index 2503cdcee..e491a48e3 100644 --- a/spacy/vocab.pxd +++ b/spacy/vocab.pxd @@ -7,6 +7,7 @@ from murmurhash.mrmr cimport hash64 from .structs cimport LexemeC, TokenC from .typedefs cimport utf8_t, attr_t, hash_t from .strings cimport StringStore +from .morphology cimport Morphology cdef LexemeC EMPTY_LEXEME @@ -14,7 +15,7 @@ cdef LexemeC EMPTY_LEXEME cdef union LexemesOrTokens: const LexemeC* const* lexemes - TokenC* tokens + const TokenC* tokens cdef struct _Cached: @@ -27,16 +28,20 @@ cdef class Vocab: cpdef public lexeme_props_getter cdef Pool mem cpdef readonly StringStore strings - cdef readonly object pos_tags + cpdef readonly Morphology morphology cdef readonly int length cdef public object _serializer cdef public object data_dir - cdef public float oov_prob + cdef public object get_lex_attr + cdef public object pos_tags cdef const LexemeC* get(self, Pool mem, unicode string) except NULL cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL + cdef const TokenC* make_fused_token(self, substrings) except NULL + cdef const LexemeC* 
_new_lexeme(self, Pool mem, unicode string) except NULL cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1 + cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL cdef PreshMap _by_hash cdef PreshMap _by_orth diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index ac2e11e11..012909755 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -12,16 +12,17 @@ import math import json from .lexeme cimport EMPTY_LEXEME -from .lexeme cimport set_lex_struct_props from .lexeme cimport Lexeme from .strings cimport hash_string from .orth cimport word_shape from .typedefs cimport attr_t from .cfile cimport CFile +from .lemmatizer import Lemmatizer from cymem.cymem cimport Address from . import util from .serialize.packer cimport Packer +from .attrs cimport PROB DEF MAX_VEC_SIZE = 100000 @@ -36,34 +37,33 @@ EMPTY_LEXEME.repvec = EMPTY_VEC cdef class Vocab: '''A map container for a language's LexemeC structs. ''' - def __init__(self, data_dir=None, get_lex_props=None, load_vectors=True, - pos_tags=None, oov_prob=-30): - if oov_prob is None: - oov_prob = -30 + def __init__(self, get_lex_attr=None, tag_map=None, vectors=None): + if tag_map is None: + tag_map = {} self.mem = Pool() self._by_hash = PreshMap() self._by_orth = PreshMap() self.strings = StringStore() - self.pos_tags = pos_tags if pos_tags is not None else {} - - self.lexeme_props_getter = get_lex_props - self.repvec_length = 0 - self.length = 0 - self._add_lex_to_vocab(0, &EMPTY_LEXEME) - if data_dir is not None: - if not path.exists(data_dir): - raise IOError("Directory %s not found -- cannot load Vocab." % data_dir) - if data_dir is not None: - if not path.isdir(data_dir): - raise IOError("Path %s is a file, not a dir -- cannot load Vocab." 
% data_dir) - self.load_lexemes(path.join(data_dir, 'strings.txt'), - path.join(data_dir, 'lexemes.bin')) - if load_vectors and path.exists(path.join(data_dir, 'vec.bin')): - self.repvec_length = self.load_rep_vectors(path.join(data_dir, 'vec.bin')) - + self.get_lex_attr = get_lex_attr + self.morphology = Morphology(self.strings, tag_map, Lemmatizer({}, {}, {})) + + self.length = 1 self._serializer = None - self.data_dir = data_dir - self.oov_prob = oov_prob + + @classmethod + def from_dir(cls, data_dir, get_lex_attr=None, vectors=None): + if not path.exists(data_dir): + raise IOError("Directory %s not found -- cannot load Vocab." % data_dir) + if not path.isdir(data_dir): + raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir) + + tag_map = json.load(open(path.join(data_dir, 'tag_map.json'))) + cdef Vocab self = cls(get_lex_attr=get_lex_attr, vectors=vectors, tag_map=tag_map) + + self.load_lexemes(path.join(data_dir, 'strings.txt'), path.join(data_dir, 'lexemes.bin')) + if vectors is None and path.exists(path.join(data_dir, 'vec.bin')): + self.repvec_length = self.load_rep_vectors(path.join(data_dir, 'vec.bin')) + return self property serializer: def __get__(self): @@ -89,20 +89,12 @@ cdef class Vocab: cdef LexemeC* lex cdef hash_t key = hash_string(string) lex = self._by_hash.get(key) + cdef size_t addr if lex != NULL: + assert lex.orth == self.strings[string] return lex - cdef bint is_oov = mem is not self.mem - if len(string) < 3: - mem = self.mem - lex = mem.alloc(sizeof(LexemeC), 1) - props = self.lexeme_props_getter(string, self.oov_prob, is_oov=is_oov) - set_lex_struct_props(lex, props, self.strings, EMPTY_VEC) - if is_oov: - lex.id = 0 else: - self._add_lex_to_vocab(key, lex) - assert lex != NULL, string - return lex + return self._new_lexeme(mem, string) cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL: '''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme @@ -114,18 +106,34 @@ cdef class 
Vocab: lex = self._by_orth.get(orth) if lex != NULL: return lex - cdef unicode string = self.strings[orth] + else: + return self._new_lexeme(mem, self.strings[orth]) + + cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL: + cdef hash_t key cdef bint is_oov = mem is not self.mem + mem = self.mem if len(string) < 3: mem = self.mem lex = mem.alloc(sizeof(LexemeC), 1) - props = self.lexeme_props_getter(string, self.oov_prob, is_oov=is_oov) - set_lex_struct_props(lex, props, self.strings, EMPTY_VEC) + lex.orth = self.strings[string] + lex.length = len(string) + lex.id = self.length + if self.get_lex_attr is not None: + for attr, func in self.get_lex_attr.items(): + value = func(string) + if isinstance(value, unicode): + value = self.strings[value] + if attr == PROB: + lex.prob = value + else: + Lexeme.set_struct_attr(lex, attr, value) if is_oov: lex.id = 0 else: - self._add_lex_to_vocab(hash_string(string), lex) - assert lex != NULL, orth + key = hash_string(string) + self._add_lex_to_vocab(key, lex) + assert lex != NULL, string return lex cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1: @@ -137,7 +145,7 @@ cdef class Vocab: cdef attr_t orth cdef size_t addr for orth, addr in self._by_orth.items(): - yield Lexeme.from_ptr(addr, self.strings, self.repvec_length) + yield Lexeme(self, orth) def __getitem__(self, id_or_string): '''Retrieve a lexeme, given an int ID or a unicode string. If a previously @@ -154,32 +162,29 @@ cdef class Vocab: An instance of the Lexeme Python class, with data copied on instantiation. 
''' - cdef const LexemeC* lexeme cdef attr_t orth - if type(id_or_string) == int: - orth = id_or_string - lexeme = self._by_orth.get(orth) - if lexeme == NULL: - raise KeyError(id_or_string) - assert lexeme.orth == orth, ('%d vs %d' % (lexeme.orth, orth)) - elif type(id_or_string) == unicode: - lexeme = self.get(self.mem, id_or_string) - assert lexeme.orth == self.strings[id_or_string] + if type(id_or_string) == unicode: + orth = self.strings[id_or_string] else: - raise ValueError("Vocab unable to map type: " - "%s. Maps unicode --> Lexeme or " - "int --> Lexeme" % str(type(id_or_string))) - return Lexeme.from_ptr(lexeme, self.strings, self.repvec_length) - - def __setitem__(self, unicode string, dict props): - cdef hash_t key = hash_string(string) - cdef LexemeC* lex - lex = self._by_hash.get(key) - if lex == NULL: - lex = self.mem.alloc(sizeof(LexemeC), 1) - set_lex_struct_props(lex, props, self.strings, EMPTY_VEC) - self._add_lex_to_vocab(key, lex) + orth = id_or_string + return Lexeme(self, orth) + cdef const TokenC* make_fused_token(self, substrings) except NULL: + cdef int i + tokens = self.mem.alloc(len(substrings) + 1, sizeof(TokenC)) + for i, props in enumerate(substrings): + token = &tokens[i] + # Set the special tokens up to have morphology and lemmas if + # specified, otherwise use the part-of-speech tag (if specified) + token.lex = self.get(self.mem, props['F']) + if 'pos' in props: + self.morphology.assign_tag(token, props['pos']) + if 'L' in props: + tokens[i].lemma = self.strings[props['L']] + for feature, value in props.get('morph', {}).items(): + self.morphology.assign_feature(&token.morph, feature, value) + return tokens + def dump(self, loc): if path.exists(loc): assert not path.isdir(loc) diff --git a/tests/parser/test_base_nps.py b/tests/parser/test_base_nps.py new file mode 100644 index 000000000..f37c80f07 --- /dev/null +++ b/tests/parser/test_base_nps.py @@ -0,0 +1,40 @@ +import pytest + + +@pytest.mark.models +def test_nsubj(EN): + sent = 
EN(u'A base phrase should be recognized.') + base_nps = list(sent.noun_chunks) + assert len(base_nps) == 1 + assert base_nps[0].string == 'A base phrase ' + + +@pytest.mark.models +def test_coord(EN): + sent = EN(u'A base phrase and a good phrase are often the same.') + base_nps = list(sent.noun_chunks) + assert len(base_nps) == 2 + assert base_nps[0].string == 'A base phrase ' + assert base_nps[1].string == 'a good phrase ' + + +@pytest.mark.models +def test_pp(EN): + sent = EN(u'A phrase with another phrase occurs') + base_nps = list(sent.noun_chunks) + assert len(base_nps) == 2 + assert base_nps[0].string == 'A phrase ' + assert base_nps[1].string == 'another phrase ' + + +@pytest.mark.models +def test_merge_pp(EN): + sent = EN(u'A phrase with another phrase occurs') + nps = [(np[0].idx, np[-1].idx + len(np[-1]), np[0].ent_type_) for np in sent.noun_chunks] + + for start, end, ent_type in nps: + sent.merge(start, end, u'NP', np.lemma_, ent_type) + assert sent[0].string == 'A phrase ' + assert sent[1].string == 'with ' + assert sent[2].string == 'another phrase ' + assert sent[3].string == 'occurs' diff --git a/tests/parser/test_initial_actions_parse.py b/tests/parser/test_initial_actions_parse.py new file mode 100644 index 000000000..a4a57e5b2 --- /dev/null +++ b/tests/parser/test_initial_actions_parse.py @@ -0,0 +1,16 @@ +import pytest + + +@pytest.mark.models +def test_initial(EN): + doc = EN.tokenizer(u'I ate the pizza with anchovies.') + EN.tagger(doc) + with EN.parser.step_through(doc) as stepwise: + stepwise.transition('L-nsubj') + stepwise.transition('S') + stepwise.transition('L-det') + assert doc[0].head.i == 1 + assert doc[1].head.i == 1 + assert doc[2].head.i == 3 + assert doc[3].head.i == 3 + assert doc diff --git a/tests/serialize/test_codecs.py b/tests/serialize/test_codecs.py index ad9012068..00177f21a 100644 --- a/tests/serialize/test_codecs.py +++ b/tests/serialize/test_codecs.py @@ -41,25 +41,10 @@ def test_attribute(): def test_vocab_codec(): 
- def get_lex_props(string, prob): - return { - 'flags': 0, - 'length': len(string), - 'orth': string, - 'lower': string, - 'norm': string, - 'shape': string, - 'prefix': string[0], - 'suffix': string[-3:], - 'cluster': 0, - 'prob': prob, - 'sentiment': 0 - } - vocab = Vocab() - vocab['dog'] = get_lex_props('dog', 0.001) - vocab['the'] = get_lex_props('the', 0.05) - vocab['jumped'] = get_lex_props('jumped', 0.005) + lex = vocab['dog'] + lex = vocab['the'] + lex = vocab['jumped'] codec = HuffmanCodec([(lex.orth, lex.prob) for lex in vocab]) diff --git a/tests/serialize/test_packer.py b/tests/serialize/test_packer.py index 5770a8938..6ec583d08 100644 --- a/tests/serialize/test_packer.py +++ b/tests/serialize/test_packer.py @@ -5,6 +5,7 @@ import re import pytest import numpy +from spacy.language import Language from spacy.vocab import Vocab from spacy.tokens.doc import Doc from spacy.tokenizer import Tokenizer @@ -17,30 +18,14 @@ from spacy.serialize.packer import Packer from spacy.serialize.bits import BitArray -def get_lex_props(string, prob=-22, is_oov=False): - return { - 'flags': 0, - 'length': len(string), - 'orth': string, - 'lower': string, - 'norm': string, - 'shape': string, - 'prefix': string[0], - 'suffix': string[-3:], - 'cluster': 0, - 'prob': prob, - 'sentiment': 0 - } - - @pytest.fixture def vocab(): - vocab = Vocab(get_lex_props=get_lex_props) - vocab['dog'] = get_lex_props('dog', 0.001) + vocab = Vocab(Language.default_lex_attrs()) + lex = vocab['dog'] assert vocab[vocab.strings['dog']].orth_ == 'dog' - vocab['the'] = get_lex_props('the', 0.01) - vocab['quick'] = get_lex_props('quick', 0.005) - vocab['jumped'] = get_lex_props('jumped', 0.007) + lex = vocab['the'] + lex = vocab['quick'] + lex = vocab['jumped'] return vocab diff --git a/tests/spans/test_merge.py b/tests/spans/test_merge.py index 3bba13064..a47e4e53a 100644 --- a/tests/spans/test_merge.py +++ b/tests/spans/test_merge.py @@ -1,7 +1,6 @@ from __future__ import unicode_literals import 
pytest - @pytest.mark.models def test_merge_tokens(EN): tokens = EN(u'Los Angeles start.') @@ -32,3 +31,19 @@ def test_merge_heads(EN): def test_issue_54(EN): text = u'Talks given by women had a slightly higher number of questions asked (3.2$\pm$0.2) than talks given by men (2.6$\pm$0.1).' tokens = EN(text, merge_mwes=True) + +@pytest.mark.models +def test_np_merges(EN): + text = u'displaCy is a parse tool built with Javascript' + tokens = EN(text) + assert tokens[4].head.i == 1 + tokens.merge(tokens[2].idx, tokens[4].idx + len(tokens[4]), u'NP', u'tool', u'O') + assert tokens[2].head.i == 1 + tokens = EN('displaCy is a lightweight and modern dependency parse tree visualization tool built with CSS3 and JavaScript.') + + ents = [(e[0].idx, e[-1].idx + len(e[-1]), e.label_, e.lemma_) + for e in tokens.ents] + for start, end, label, lemma in ents: + merged = tokens.merge(start, end, label, lemma, label) + assert merged != None, (start, end, label, lemma) + diff --git a/tests/tagger/test_spaces.py b/tests/tagger/test_spaces.py index c3052160e..0ef05637b 100644 --- a/tests/tagger/test_spaces.py +++ b/tests/tagger/test_spaces.py @@ -14,6 +14,7 @@ def tagged(EN): tokens = EN(string, tag=True, parse=False) return tokens +@pytest.mark.models def test_spaces(tagged): assert tagged[0].pos != SPACE assert tagged[0].pos_ != 'SPACE' diff --git a/tests/test_docs.py b/tests/test_docs.py index c5307b5a0..4b0831dfd 100644 --- a/tests/test_docs.py +++ b/tests/test_docs.py @@ -1,82 +1,81 @@ # -*- coding: utf-8 -*- """Sphinx doctest is just too hard. Manually paste doctest examples here""" -from spacy.en.attrs import IS_LOWER import pytest -@pytest.mark.models -def test_1(): - import spacy.en - from spacy.parts_of_speech import ADV - # Load the pipeline, and call it with some text. 
- nlp = spacy.en.English() - tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’", - tag=True, parse=False) - o = u''.join(tok.string.upper() if tok.pos == ADV else tok.string for tok in tokens) - assert u"‘Give it BACK,’ he pleaded ABJECTLY, ‘it’s mine.’" - - o = nlp.vocab[u'back'].prob - assert o == -7.033305644989014 - o = nlp.vocab[u'not'].prob - assert o == -5.332601070404053 - o = nlp.vocab[u'quietly'].prob - assert o == -11.994928359985352 - - -@pytest.mark.models -def test2(): - import spacy.en - from spacy.parts_of_speech import ADV - nlp = spacy.en.English() - # Find log probability of Nth most frequent word - probs = [lex.prob for lex in nlp.vocab] - probs.sort() - is_adverb = lambda tok: tok.pos == ADV and tok.prob < probs[-1000] - tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’") - o = u''.join(tok.string.upper() if is_adverb(tok) else tok.string for tok in tokens) - o == u'‘Give it back,’ he pleaded ABJECTLY, ‘it’s mine.’' - -@pytest.mark.models -def test3(): - import spacy.en - from spacy.parts_of_speech import ADV - nlp = spacy.en.English() - # Find log probability of Nth most frequent word - probs = [lex.prob for lex in nlp.vocab] - probs.sort() - is_adverb = lambda tok: tok.pos == ADV and tok.prob < probs[-1000] - tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’") - o = u''.join(tok.string.upper() if is_adverb(tok) else tok.string for tok in tokens) - assert o == u'‘Give it back,’ he pleaded ABJECTLY, ‘it’s mine.’' - - pleaded = tokens[7] - assert pleaded.repvec.shape == (300,) - o = pleaded.repvec[:5] - assert sum(o) != 0 - from numpy import dot - from numpy.linalg import norm - - cosine = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2)) - words = [w for w in nlp.vocab if w.is_lower and w.has_repvec] - words.sort(key=lambda w: cosine(w.repvec, pleaded.repvec)) - words.reverse() - o = [w.orth_ for w in words[0:20]] - assert o == [u'pleaded', u'pled', u'plead', u'confessed', u'interceded', - 
u'pleads', u'testified', u'conspired', u'motioned', u'demurred', - u'countersued', u'remonstrated', u'begged', u'apologised', - u'consented', u'acquiesced', u'petitioned', u'quarreled', - u'appealed', u'pleading'] - o = [w.orth_ for w in words[50:60]] - assert o == [u'martialed', u'counselled', u'bragged', - u'backtracked', u'caucused', u'refiled', u'dueled', u'mused', - u'dissented', u'yearned'] - o = [w.orth_ for w in words[100:110]] - assert o == [u'acquits', u'cabled', u'ducked', u'sentenced', - u'gaoled', u'perjured', u'absconded', u'bargained', u'overstayed', - u'clerked'] - - #o = [w.orth_ for w in words[1000:1010]] - #assert o == [u'scorned', u'baled', u'righted', u'requested', u'swindled', - # u'posited', u'firebombed', u'slimed', u'deferred', u'sagged'] - #o = [w.orth_ for w in words[50000:50010]] - #assert o == [u'fb', u'ford', u'systems', u'puck', u'anglers', u'ik', u'tabloid', - # u'dirty', u'rims', u'artists'] +#@pytest.mark.models +#def test_1(): +# import spacy.en +# from spacy.parts_of_speech import ADV +# # Load the pipeline, and call it with some text. 
+# nlp = spacy.en.English() +# tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’", +# tag=True, parse=False) +# o = u''.join(tok.string.upper() if tok.pos == ADV else tok.string for tok in tokens) +# assert u"‘Give it BACK,’ he pleaded ABJECTLY, ‘it’s mine.’" +# +# o = nlp.vocab[u'back'].prob +# assert o == -7.033305644989014 +# o = nlp.vocab[u'not'].prob +# assert o == -5.332601070404053 +# o = nlp.vocab[u'quietly'].prob +# assert o == -11.994928359985352 +# +# +#@pytest.mark.m +#def test2(): +# import spacy.en +# from spacy.parts_of_speech import ADV +# nlp = spacy.en.English() +# # Find log probability of Nth most frequent word +# probs = [lex.prob for lex in nlp.vocab] +# probs.sort() +# is_adverb = lambda tok: tok.pos == ADV and tok.prob < probs[-1000] +# tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’") +# o = u''.join(tok.string.upper() if is_adverb(tok) else tok.string for tok in tokens) +# o == u'‘Give it back,’ he pleaded ABJECTLY, ‘it’s mine.’' +# +#@pytest.mark.models +#def test3(): +# import spacy.en +# from spacy.parts_of_speech import ADV +# nlp = spacy.en.English() +# # Find log probability of Nth most frequent word +# probs = [lex.prob for lex in nlp.vocab] +# probs.sort() +# is_adverb = lambda tok: tok.pos == ADV and tok.prob < probs[-1000] +# tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’") +# o = u''.join(tok.string.upper() if is_adverb(tok) else tok.string for tok in tokens) +# assert o == u'‘Give it back,’ he pleaded ABJECTLY, ‘it’s mine.’' +# +# pleaded = tokens[7] +# assert pleaded.repvec.shape == (300,) +# o = pleaded.repvec[:5] +# assert sum(o) != 0 +# from numpy import dot +# from numpy.linalg import norm +# +# cosine = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2)) +# words = [w for w in nlp.vocab if w.is_lower and w.has_repvec] +# words.sort(key=lambda w: cosine(w.repvec, pleaded.repvec)) +# words.reverse() +# o = [w.orth_ for w in words[0:20]] +# assert o == [u'pleaded', u'pled', 
u'plead', u'confessed', u'interceded', +# u'pleads', u'testified', u'conspired', u'motioned', u'demurred', +# u'countersued', u'remonstrated', u'begged', u'apologised', +# u'consented', u'acquiesced', u'petitioned', u'quarreled', +# u'appealed', u'pleading'] +# o = [w.orth_ for w in words[50:60]] +# assert o == [u'martialed', u'counselled', u'bragged', +# u'backtracked', u'caucused', u'refiled', u'dueled', u'mused', +# u'dissented', u'yearned'] +# o = [w.orth_ for w in words[100:110]] +# assert o == [u'acquits', u'cabled', u'ducked', u'sentenced', +# u'gaoled', u'perjured', u'absconded', u'bargained', u'overstayed', +# u'clerked'] +# +# #o = [w.orth_ for w in words[1000:1010]] +# #assert o == [u'scorned', u'baled', u'righted', u'requested', u'swindled', +# # u'posited', u'firebombed', u'slimed', u'deferred', u'sagged'] +# #o = [w.orth_ for w in words[50000:50010]] +# #assert o == [u'fb', u'ford', u'systems', u'puck', u'anglers', u'ik', u'tabloid', +# # u'dirty', u'rims', u'artists'] diff --git a/tests/test_matcher.py b/tests/test_matcher.py new file mode 100644 index 000000000..986d8a8bd --- /dev/null +++ b/tests/test_matcher.py @@ -0,0 +1,61 @@ +from __future__ import unicode_literals +import pytest + +from spacy.strings import StringStore +from spacy.matcher import * +from spacy.attrs import LOWER +from spacy.tokens.doc import Doc +from spacy.vocab import Vocab + + +@pytest.fixture +def matcher(EN): + patterns = { + 'Javascript': ['PRODUCT', {}, [[{'ORTH': 'JavaScript'}]]], + 'GoogleNow': ['PRODUCT', {}, [[{'ORTH': 'Google'}, {'ORTH': 'Now'}]]], + 'Java': ['PRODUCT', {}, [[{'LOWER': 'java'}]]], + } + return Matcher(EN.vocab, patterns) + + +def test_compile(matcher): + assert matcher.n_patterns == 3 + + +def test_no_match(matcher, EN): + tokens = EN('I like cheese') + assert matcher(tokens) == [] + + +def test_match_start(matcher, EN): + tokens = EN('JavaScript is good') + assert matcher(tokens) == [(EN.vocab.strings['PRODUCT'], 0, 1)] + + +def 
test_match_end(matcher, EN): + tokens = EN('I like java') + assert matcher(tokens) == [(EN.vocab.strings['PRODUCT'], 2, 3)] + + +def test_match_middle(matcher, EN): + tokens = EN('I like Google Now best') + assert matcher(tokens) == [(EN.vocab.strings['PRODUCT'], 2, 4)] + + +def test_match_multi(matcher, EN): + tokens = EN('I like Google Now and java best') + assert matcher(tokens) == [(EN.vocab.strings['PRODUCT'], 2, 4), + (EN.vocab.strings['PRODUCT'], 5, 6)] + + +@pytest.mark.models +def test_match_preserved(matcher, EN): + doc = EN.tokenizer('I like java') + EN.tagger(doc) + assert len(doc.ents) == 0 + doc = EN.tokenizer('I like java') + matcher(doc) + assert len(doc.ents) == 1 + EN.tagger(doc) + EN.entity(doc) + assert len(doc.ents) == 1 diff --git a/tests/tokens/test_tokens_api.py b/tests/tokens/test_tokens_api.py index b935bbce7..e1238373f 100644 --- a/tests/tokens/test_tokens_api.py +++ b/tests/tokens/test_tokens_api.py @@ -4,7 +4,6 @@ from spacy.tokens import Doc import pytest - @pytest.mark.models def test_getitem(EN): tokens = EN(u'Give it back! He pleaded.') @@ -32,3 +31,15 @@ def test_serialize_whitespace(EN): assert tokens.string == new_tokens.string assert [t.orth_ for t in tokens] == [t.orth_ for t in new_tokens] assert [t.orth for t in tokens] == [t.orth for t in new_tokens] + + +def test_set_ents(EN): + tokens = EN.tokenizer(u'I use goggle chrone to surf the web') + assert len(tokens.ents) == 0 + tokens.ents = [(EN.vocab.strings['PRODUCT'], 2, 4)] + assert len(list(tokens.ents)) == 1 + assert [t.ent_iob for t in tokens] == [0, 0, 3, 1, 0, 0, 0, 0] + ent = tokens.ents[0] + assert ent.label_ == 'PRODUCT' + assert ent.start == 2 + assert ent.end == 4