diff --git a/.gitignore b/.gitignore index 4d9f6efab..74d63241a 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,7 @@ tmp/ .eggs *.tgz .sass-cache +.python-version MANIFEST @@ -36,6 +37,7 @@ data/en/strings _build/ .env/ +tmp/ # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/.travis.yml b/.travis.yml index 136c6edee..856da5cd6 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,34 +1,30 @@ language: python -sudo: required +sudo: false dist: trusty group: edge python: - "2.7" - - "3.4" + - "3.5" os: - linux +env: + - VIA="compile" + - VIA="sdist" + install: - - "pip install -r requirements.txt" - - "pip install -e ." - - "mkdir -p corpora/en" - - "cd corpora/en" - - "wget --no-check-certificate http://wordnetcode.princeton.edu/3.0/WordNet-3.0.tar.gz" - - "tar -xzf WordNet-3.0.tar.gz" - - "mv WordNet-3.0 wordnet" - - "cd ../../" - - "python bin/init_model.py en lang_data/ corpora/ data" - - "cp package.json data" - - "sputnik build data en_default.sputnik" - - "sputnik --name spacy install en_default.sputnik" + - "./travis.sh" script: - "pip install pytest" - - "python -m pytest spacy" - + - if [[ "${VIA}" == "compile" ]]; then SPACY_DATA=models/en python -m pytest spacy; fi + - if [[ "${VIA}" == "pypi" ]]; then python -m pytest `python -c "import pathlib; import spacy; print(pathlib.Path(spacy.__file__).parent.resolve())"`; fi + - if [[ "${VIA}" == "sdist" ]]; then python -m pytest `python -c "import pathlib; import spacy; print(pathlib.Path(spacy.__file__).parent.resolve())"`; fi + notifications: slack: secure: F8GvqnweSdzImuLL64TpfG0i5rYl89liyr9tmFVsHl4c0DNiDuGhZivUz0M1broS8svE3OPOllLfQbACG/4KxD890qfF9MoHzvRDlp7U+RtwMV/YAkYn8MGWjPIbRbX0HpGdY7O2Rc9Qy4Kk0T8ZgiqXYIqAz2Eva9/9BlSmsJQ= + email: false diff --git a/README.rst b/README.rst index 3236766c5..2149187b9 100644 --- a/README.rst +++ b/README.rst @@ -192,6 +192,7 @@ OS X ships with Python and git preinstalled. Windows ------- +<<<<<<< HEAD Install a version of Visual Studio Express or higher that matches the version that was used to compile your Python interpreter. For official distributions @@ -211,6 +212,27 @@ Python install. Run: Run tests ========= +======= + +Install a version of Visual Studio Express or higher that matches the version +that was used to compile your Python interpreter. For official distributions +these are VS 2008 (Python 2.7), VS 2010 (Python 3.4) and VS 2015 (Python 3.5). + +Workaround for obsolete system Python +===================================== + +If you're stuck using a system with an old version of Python, and you don't +have root access, we've prepared a bootstrap script to help you compile a local +Python install. Run: + +.. code:: bash + + curl https://raw.githubusercontent.com/spacy-io/gist/master/bootstrap_python_env.sh | bash && source .env/bin/activate + +Run tests +========= + +>>>>>>> v1.0.0-rc1 spaCy comes with an extensive test suite. 
First, find out where spaCy is installed: diff --git a/bin/parser/train.py b/bin/parser/train.py index 372c7932e..5d588a317 100755 --- a/bin/parser/train.py +++ b/bin/parser/train.py @@ -17,6 +17,7 @@ import spacy.util from spacy.syntax.util import Config from spacy.gold import read_json_file from spacy.gold import GoldParse +from spacy.gold import merge_sents from spacy.scorer import Scorer @@ -63,96 +64,24 @@ def score_model(scorer, nlp, raw_text, annot_tuples, verbose=False): scorer.score(tokens, gold, verbose=verbose) -def _merge_sents(sents): - m_deps = [[], [], [], [], [], []] - m_brackets = [] - i = 0 - for (ids, words, tags, heads, labels, ner), brackets in sents: - m_deps[0].extend(id_ + i for id_ in ids) - m_deps[1].extend(words) - m_deps[2].extend(tags) - m_deps[3].extend(head + i for head in heads) - m_deps[4].extend(labels) - m_deps[5].extend(ner) - m_brackets.extend((b['first'] + i, b['last'] + i, b['label']) for b in brackets) - i += len(ids) - return [(m_deps, m_brackets)] - - -def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', - seed=0, gold_preproc=False, n_sents=0, corruption_level=0, - beam_width=1, verbose=False, - use_orig_arc_eager=False, pseudoprojective=False): - dep_model_dir = path.join(model_dir, 'deps') - ner_model_dir = path.join(model_dir, 'ner') - pos_model_dir = path.join(model_dir, 'pos') - if path.exists(dep_model_dir): - shutil.rmtree(dep_model_dir) - if path.exists(ner_model_dir): - shutil.rmtree(ner_model_dir) - if path.exists(pos_model_dir): - shutil.rmtree(pos_model_dir) - os.mkdir(dep_model_dir) - os.mkdir(ner_model_dir) - os.mkdir(pos_model_dir) - - if pseudoprojective: - # preprocess training data here before ArcEager.get_labels() is called - gold_tuples = PseudoProjectivity.preprocess_training_data(gold_tuples) - - Config.write(dep_model_dir, 'config', features=feat_set, seed=seed, - labels=ArcEager.get_labels(gold_tuples), - beam_width=beam_width,projectivize=pseudoprojective) - Config.write(ner_model_dir, 'config', features='ner', seed=seed, - labels=BiluoPushDown.get_labels(gold_tuples), - beam_width=0) - - if n_sents > 0: - gold_tuples = gold_tuples[:n_sents] - - nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False) - nlp.tagger = Tagger.blank(nlp.vocab, Tagger.default_templates()) - nlp.parser = Parser.from_dir(dep_model_dir, nlp.vocab.strings, ArcEager) - nlp.entity = Parser.from_dir(ner_model_dir, nlp.vocab.strings, BiluoPushDown) +def train(Language, train_data, dev_data, model_dir, tagger_cfg, parser_cfg, entity_cfg, + n_iter=15, seed=0, gold_preproc=False, n_sents=0, corruption_level=0): print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %") - for itn in range(n_iter): - scorer = Scorer() + format_str = '{:d}\t{:d}\t{uas:.3f}\t{ents_f:.3f}\t{tags_acc:.3f}\t{token_acc:.3f}' + with Language.train(model_dir, train_data, + tagger_cfg, parser_cfg, entity_cfg) as trainer: loss = 0 - for raw_text, sents in gold_tuples: - if gold_preproc: - raw_text = None - else: - sents = _merge_sents(sents) - for annot_tuples, ctnt in sents: - if len(annot_tuples[1]) == 1: - continue - score_model(scorer, nlp, raw_text, annot_tuples, - verbose=verbose if itn >= 2 else False) - if raw_text is None: - words = add_noise(annot_tuples[1], corruption_level) - tokens = nlp.tokenizer.tokens_from_list(words) - else: - raw_text = add_noise(raw_text, corruption_level) - tokens = nlp.tokenizer(raw_text) - nlp.tagger(tokens) - gold = GoldParse(tokens, annot_tuples) - if not gold.is_projective: - raise Exception("Non-projective 
sentence in training: %s" % annot_tuples[1])
-                loss += nlp.parser.train(tokens, gold)
-                nlp.entity.train(tokens, gold)
-                nlp.tagger.train(tokens, gold.tags)
-        random.shuffle(gold_tuples)
-        print('%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f,
-                                                   scorer.tags_acc,
-                                                   scorer.token_acc))
-    print('end training')
-    nlp.end_training(model_dir)
-    print('done')
+        for itn, epoch in enumerate(trainer.epochs(n_iter, gold_preproc=gold_preproc,
+                                                   augment_data=None)):
+            for doc, gold in epoch:
+                trainer.update(doc, gold)
+            dev_scores = trainer.evaluate(dev_data, gold_preproc=gold_preproc)
+            print(format_str.format(itn, loss, **dev_scores.scores))


 def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False,
              beam_width=None, cand_preproc=None):
-    nlp = Language(data_dir=model_dir)
+    nlp = Language(path=model_dir)
     if nlp.lang == 'de':
         nlp.vocab.morphology.lemmatizer = lambda string,pos: set([string])
     if beam_width is not None:
@@ -162,7 +91,7 @@ def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False
         if gold_preproc:
             raw_text = None
         else:
-            sents = _merge_sents(sents)
+            sents = merge_sents(sents)
         for annot_tuples, brackets in sents:
             if raw_text is None:
                 tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
@@ -219,15 +148,21 @@ def write_parses(Language, dev_loc, model_dir, out_loc):
 )
 def main(language, train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False,
          debug=False, corruption_level=0.0, gold_preproc=False, eval_only=False, pseudoprojective=False):
+    parser_cfg = dict(locals())
+    tagger_cfg = dict(locals())
+    entity_cfg = dict(locals())
+    lang = spacy.util.get_lang_class(language)
+
+    parser_cfg['features'] = lang.Defaults.parser_features
+    entity_cfg['features'] = lang.Defaults.entity_features
     if not eval_only:
         gold_train = list(read_json_file(train_loc))
-        train(lang, gold_train, model_dir,
-              feat_set='basic' if not debug else 'debug',
-              gold_preproc=gold_preproc, n_sents=n_sents,
-              corruption_level=corruption_level, n_iter=n_iter,
-              verbose=verbose,pseudoprojective=pseudoprojective)
+        gold_dev = list(read_json_file(dev_loc))
+        train(lang, gold_train, gold_dev, model_dir, tagger_cfg, parser_cfg, entity_cfg,
+              n_sents=n_sents, gold_preproc=gold_preproc, corruption_level=corruption_level,
+              n_iter=n_iter)
     if out_loc:
         write_parses(lang, dev_loc, model_dir, out_loc)
     scorer = evaluate(lang, list(read_json_file(dev_loc)),
diff --git a/examples/training/train_ner.py b/examples/training/train_ner.py
new file mode 100644
index 000000000..9fe2ab490
--- /dev/null
+++ b/examples/training/train_ner.py
@@ -0,0 +1,63 @@
+from __future__ import unicode_literals, print_function
+import json
+import pathlib
+import random
+
+import spacy
+from spacy.pipeline import EntityRecognizer
+from spacy.gold import GoldParse
+
+
+def train_ner(nlp, train_data, entity_types):
+    ner = EntityRecognizer(nlp.vocab, entity_types=entity_types)
+    for itn in range(5):
+        random.shuffle(train_data)
+        for raw_text, entity_offsets in train_data:
+            doc = nlp.make_doc(raw_text)
+            gold = GoldParse(doc, entities=entity_offsets)
+            ner.update(doc, gold)
+    ner.model.end_training()
+    return ner
+
+
+def main(model_dir=None):
+    if model_dir is not None:
+        model_dir = pathlib.Path(model_dir)
+        if not model_dir.exists():
+            model_dir.mkdir()
+        assert model_dir.is_dir()
+
+    nlp = spacy.load('en', parser=False, entity=False, vectors=False)
+
+    train_data = [
+        (
+            'Who is Shaka Khan?',
+            [(len('Who is '), len('Who is Shaka Khan'), 'PERSON')]
+        ),
+        (
+            'I like London and Berlin.',
+            [(len('I like '), len('I like London'), 'LOC'),
+             (len('I like London and '), len('I like London and Berlin'), 'LOC')]
+        )
+    ]
+    ner = train_ner(nlp, train_data, ['PERSON', 'LOC'])
+
+    doc = nlp.make_doc('Who is Shaka Khan?')
+    nlp.tagger(doc)
+    ner(doc)
+    for word in doc:
+        print(word.text, word.tag_, word.ent_type_, word.ent_iob)
+
+    if model_dir is not None:
+        with (model_dir / 'config.json').open('wb') as file_:
+            json.dump(ner.cfg, file_)
+        ner.model.dump(str(model_dir / 'model'))
+
+
+if __name__ == '__main__':
+    main()
+    # Who "" 2
+    # is "" 2
+    # Shaka "" PERSON 3
+    # Khan "" PERSON 1
+    # ? "" 2
diff --git a/examples/training/train_parser.py b/examples/training/train_parser.py
new file mode 100644
index 000000000..7e4ed90d0
--- /dev/null
+++ b/examples/training/train_parser.py
@@ -0,0 +1,75 @@
+from __future__ import unicode_literals, print_function
+import json
+import pathlib
+import random
+
+import spacy
+from spacy.pipeline import DependencyParser
+from spacy.gold import GoldParse
+from spacy.tokens import Doc
+
+
+def train_parser(nlp, train_data, left_labels, right_labels):
+    parser = DependencyParser(
+        nlp.vocab,
+        left_labels=left_labels,
+        right_labels=right_labels)
+    for itn in range(1000):
+        random.shuffle(train_data)
+        loss = 0
+        for words, heads, deps in train_data:
+            doc = Doc(nlp.vocab, words=words)
+            gold = GoldParse(doc, heads=heads, deps=deps)
+            loss += parser.update(doc, gold)
+    parser.model.end_training()
+    return parser
+
+
+def main(model_dir=None):
+    if model_dir is not None:
+        model_dir = pathlib.Path(model_dir)
+        if not model_dir.exists():
+            model_dir.mkdir()
+        assert model_dir.is_dir()
+
+    nlp = spacy.load('en', tagger=False, parser=False, entity=False, vectors=False)
+
+    train_data = [
+        (
+            ['They', 'trade', 'mortgage', '-', 'backed', 'securities', '.'],
+            [1, 1, 4, 4, 5, 1, 1],
+            ['nsubj', 'ROOT', 'compound', 'punct', 'nmod', 'dobj', 'punct']
+        ),
+        (
+            ['I', 'like', 'London', 'and', 'Berlin', '.'],
+            [1, 1, 1, 2, 2, 1],
+            ['nsubj', 'ROOT', 'dobj', 'cc', 'conj', 'punct']
+        )
+    ]
+    left_labels = set()
+    right_labels = set()
+    for _, heads, deps in train_data:
+        for i, (head, dep) in enumerate(zip(heads, deps)):
+            if i < head:
+                left_labels.add(dep)
+            elif i > head:
+                right_labels.add(dep)
+    parser = train_parser(nlp, train_data, sorted(left_labels), sorted(right_labels))
+
+    doc = Doc(nlp.vocab, words=['I', 'like', 'securities', '.'])
+    parser(doc)
+    for word in doc:
+        print(word.text, word.dep_, word.head.text)
+
+    if model_dir is not None:
+        with (model_dir / 'config.json').open('wb') as file_:
+            json.dump(parser.cfg, file_)
+        parser.model.dump(str(model_dir / 'model'))
+
+
+if __name__ == '__main__':
+    main()
+    # I nsubj like
+    # like ROOT like
+    # securities dobj like
+    # .
cc securities diff --git a/examples/train_pos_tagger.py b/examples/training/train_tagger.py similarity index 60% rename from examples/train_pos_tagger.py rename to examples/training/train_tagger.py index 43bd607c7..9fac234dc 100644 --- a/examples/train_pos_tagger.py +++ b/examples/training/train_tagger.py @@ -5,12 +5,11 @@ from __future__ import unicode_literals from __future__ import print_function import plac -from os import path -import os +from pathlib import Path from spacy.vocab import Vocab -from spacy.tokenizer import Tokenizer from spacy.tagger import Tagger +from spacy.tokens import Doc import random @@ -39,34 +38,41 @@ DATA = [ ) ] -def ensure_dir(*parts): - path_ = path.join(*parts) - if not path.exists(path_): - os.mkdir(path_) - return path_ +def ensure_dir(path): + if not path.exists(): + path.mkdir() -def main(output_dir): - ensure_dir(output_dir) - ensure_dir(output_dir, "pos") - ensure_dir(output_dir, "vocab") +def main(output_dir=None): + if output_dir is not None: + output_dir = Path(output_dir) + ensure_dir(output_dir) + ensure_dir(output_dir / "pos") + ensure_dir(output_dir / "vocab") vocab = Vocab(tag_map=TAG_MAP) - tokenizer = Tokenizer(vocab, {}, None, None, None) # The default_templates argument is where features are specified. See # spacy/tagger.pyx for the defaults. - tagger = Tagger.blank(vocab, Tagger.default_templates()) - + tagger = Tagger(vocab) for i in range(5): for words, tags in DATA: - tokens = tokenizer.tokens_from_list(words) - tagger.train(tokens, tags) + doc = Doc(vocab, words=words) + tagger.update(doc, tags) random.shuffle(DATA) tagger.model.end_training() - tagger.model.dump(path.join(output_dir, 'pos', 'model')) - with io.open(output_dir, 'vocab', 'strings.json') as file_: - tagger.vocab.strings.dump(file_) + doc = Doc(vocab, orths_and_spaces=zip(["I", "like", "blue", "eggs"], [True]*4)) + tagger(doc) + for word in doc: + print(word.text, word.tag_, word.pos_) + if output_dir is not None: + tagger.model.dump(str(output_dir / 'pos' / 'model')) + with (output_dir / 'vocab' / 'strings.json').open('wb') as file_: + tagger.vocab.strings.dump(file_) if __name__ == '__main__': plac.call(main) + # I V VERB + # like V VERB + # blue N NOUN + # eggs N NOUN diff --git a/fabfile.py b/fabfile.py index 7a53fb88f..d35ef8253 100644 --- a/fabfile.py +++ b/fabfile.py @@ -182,7 +182,7 @@ def train(json_dir=None, dev_loc=None, model_dir=None): with virtualenv(VENV_DIR): with lcd(path.dirname(__file__)): local('python bin/init_model.py en lang_data/ corpora/ ' + model_dir) - local('python bin/parser/train.py %s %s' % (json_dir, model_dir)) + local('python bin/parser/train.py -p en %s/train/ %s/development %s' % (json_dir, json_dir, model_dir)) def travis(): diff --git a/requirements.txt b/requirements.txt index 11dff81f0..9ce19a4e0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,7 @@ cython<0.24 pathlib numpy>=1.7 cymem>=1.30,<1.32 -preshed>=0.46.1,<0.47.0 +preshed>=0.46.4,<0.47.0 thinc>=5.0.0,<5.1.0 murmurhash>=0.26,<0.27 plac<0.9.3 diff --git a/setup.py b/setup.py index 2098fb377..7491b4324 100644 --- a/setup.py +++ b/setup.py @@ -1,5 +1,6 @@ #!/usr/bin/env python from __future__ import print_function +from __future__ import unicode_literals import os import subprocess import sys @@ -47,6 +48,7 @@ MOD_NAMES = [ 'spacy.attrs', 'spacy.morphology', 'spacy.tagger', + 'spacy.pipeline', 'spacy.syntax.stateclass', 'spacy.syntax._state', 'spacy.tokenizer', @@ -152,11 +154,11 @@ def setup_package(): return clean(root) with chdir(root): - with 
open(os.path.join(root, 'spacy', 'about.py')) as f: + with io.open(os.path.join(root, 'spacy', 'about.py'), encoding='utf8') as f: about = {} exec(f.read(), about) - with open(os.path.join(root, 'README.rst')) as f: + with io.open(os.path.join(root, 'README.rst'), encoding='utf8') as f: readme = f.read() include_dirs = [ @@ -181,7 +183,7 @@ def setup_package(): name=about['__title__'], zip_safe=False, packages=PACKAGES, - package_data={'': ['*.pyx', '*.pxd', '*.txt', '*.tokens']}, + package_data={'': ['*.pyx', '*.pxd', '*.txt', '*.tokens', 'data']}, description=about['__summary__'], long_description=readme, author=about['__author__'], @@ -194,11 +196,12 @@ def setup_package(): 'numpy>=1.7', 'murmurhash>=0.26,<0.27', 'cymem>=1.30,<1.32', - 'preshed>=0.46.1,<0.47', + 'preshed>=0.46.0,<0.47.0', 'thinc>=5.0.0,<5.1.0', 'plac', 'six', 'cloudpickle', + 'pathlib', 'sputnik>=0.9.2,<0.10.0'], classifiers=[ 'Development Status :: 5 - Production/Stable', diff --git a/spacy/__init__.py b/spacy/__init__.py index 478fd80c8..030616ca7 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -19,43 +19,13 @@ set_lang_class(de.German.lang, de.German) set_lang_class(zh.Chinese.lang, zh.Chinese) -def blank(name, vocab=None, tokenizer=None, parser=None, tagger=None, entity=None, - matcher=None, serializer=None, vectors=None, pipeline=None): +def load(name, **overrides): target_name, target_version = util.split_data_name(name) - cls = get_lang_class(target_name) - return cls( - path, - vectors=vectors, - vocab=vocab, - tokenizer=tokenizer, - tagger=tagger, - parser=parser, - entity=entity, - matcher=matcher, - pipeline=pipeline, - serializer=serializer) + path = overrides.get('path', util.get_data_path()) + path = util.match_best_version(target_name, target_version, path) - -def load(name, vocab=True, tokenizer=True, parser=True, tagger=True, entity=True, - matcher=True, serializer=True, vectors=True, pipeline=True, via=None): - if via is None: - via = util.get_data_path() - - target_name, target_version = util.split_data_name(name) - path = util.match_best_version(target_name, target_version, via) - - if isinstance(vectors, basestring): - vectors = util.match_best_version(vectors, None, via) + if isinstance(overrides.get('vectors'), basestring): + vectors = util.match_best_version(overrides.get('vectors'), None, path) cls = get_lang_class(target_name) - return cls( - path, - vectors=vectors, - vocab=vocab, - tokenizer=tokenizer, - tagger=tagger, - parser=parser, - entity=entity, - matcher=matcher, - pipeline=pipeline, - serializer=serializer) + return cls(path=path, **overrides) diff --git a/spacy/about.py b/spacy/about.py index d374153ad..ad65a7acf 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -4,7 +4,7 @@ # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py __title__ = 'spacy' -__version__ = '0.101.0' +__version__ = '1.0.0' __summary__ = 'Industrial-strength NLP' __uri__ = 'https://spacy.io' __author__ = 'Matthew Honnibal' diff --git a/spacy/data/.gitignore b/spacy/data/.gitignore deleted file mode 100644 index 5e7d2734c..000000000 --- a/spacy/data/.gitignore +++ /dev/null @@ -1,4 +0,0 @@ -# Ignore everything in this directory -* -# Except this file -!.gitignore diff --git a/spacy/de/__init__.py b/spacy/de/__init__.py index be5b3b0f0..7a8e5727c 100644 --- a/spacy/de/__init__.py +++ b/spacy/de/__init__.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals, print_function from os import path from ..language import Language +from ..attrs import LANG from . 
import language_data @@ -11,6 +12,8 @@ class German(Language): class Defaults(Language.Defaults): tokenizer_exceptions = dict(language_data.TOKENIZER_EXCEPTIONS) + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: 'de' prefixes = tuple(language_data.TOKENIZER_PREFIXES) diff --git a/spacy/en/__init__.py b/spacy/en/__init__.py index e05787f12..ade3e8e7a 100644 --- a/spacy/en/__init__.py +++ b/spacy/en/__init__.py @@ -8,6 +8,7 @@ from .. import util from ..lemmatizer import Lemmatizer from ..vocab import Vocab from ..tokenizer import Tokenizer +from ..attrs import LANG class English(Language): @@ -15,16 +16,16 @@ class English(Language): class Defaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: 'en' tokenizer_exceptions = dict(language_data.TOKENIZER_EXCEPTIONS) - + prefixes = tuple(language_data.TOKENIZER_PREFIXES) - + suffixes = tuple(language_data.TOKENIZER_SUFFIXES) - + infixes = tuple(language_data.TOKENIZER_INFIXES) tag_map = dict(language_data.TAG_MAP) stop_words = set(language_data.STOP_WORDS) - diff --git a/spacy/gold.pyx b/spacy/gold.pyx index c3badc60d..aea055ead 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -1,3 +1,5 @@ +from __future__ import unicode_literals, print_function + import numpy import io import json @@ -42,6 +44,21 @@ def tags_to_entities(tags): return entities +def merge_sents(sents): + m_deps = [[], [], [], [], [], []] + m_brackets = [] + i = 0 + for (ids, words, tags, heads, labels, ner), brackets in sents: + m_deps[0].extend(id_ + i for id_ in ids) + m_deps[1].extend(words) + m_deps[2].extend(tags) + m_deps[3].extend(head + i for head in heads) + m_deps[4].extend(labels) + m_deps[5].extend(ner) + m_brackets.extend((b['first'] + i, b['last'] + i, b['label']) for b in brackets) + i += len(ids) + return [(m_deps, m_brackets)] + def align(cand_words, gold_words): cost, edit_path = _min_edit_path(cand_words, gold_words) @@ -128,7 +145,6 @@ def _min_edit_path(cand_words, gold_words): def read_json_file(loc, docs_filter=None): - print loc if path.isdir(loc): for filename in os.listdir(loc): yield from read_json_file(path.join(loc, filename)) @@ -199,33 +215,53 @@ def _consume_ent(tags): cdef class GoldParse: - def __init__(self, tokens, annot_tuples, brackets=tuple(), make_projective=False): + @classmethod + def from_annot_tuples(cls, doc, annot_tuples, make_projective=False): + _, words, tags, heads, deps, entities = annot_tuples + return cls(doc, words=words, tags=tags, heads=heads, deps=deps, entities=entities, + make_projective=make_projective) + + def __init__(self, doc, annot_tuples=None, words=None, tags=None, heads=None, + deps=None, entities=None, make_projective=False): + if words is None: + words = [token.text for token in doc] + if tags is None: + tags = [None for _ in doc] + if heads is None: + heads = [token.i for token in doc] + if deps is None: + deps = [None for _ in doc] + if entities is None: + entities = [None for _ in doc] + elif len(entities) == 0: + entities = ['O' for _ in doc] + elif not isinstance(entities[0], basestring): + # Assume we have entities specified by character offset. 
+ entities = biluo_tags_from_offsets(doc, entities) + self.mem = Pool() self.loss = 0 - self.length = len(tokens) + self.length = len(doc) # These are filled by the tagger/parser/entity recogniser - self.c.tags = self.mem.alloc(len(tokens), sizeof(int)) - self.c.heads = self.mem.alloc(len(tokens), sizeof(int)) - self.c.labels = self.mem.alloc(len(tokens), sizeof(int)) - self.c.ner = self.mem.alloc(len(tokens), sizeof(Transition)) - self.c.brackets = self.mem.alloc(len(tokens), sizeof(int*)) - for i in range(len(tokens)): - self.c.brackets[i] = self.mem.alloc(len(tokens), sizeof(int)) + self.c.tags = self.mem.alloc(len(doc), sizeof(int)) + self.c.heads = self.mem.alloc(len(doc), sizeof(int)) + self.c.labels = self.mem.alloc(len(doc), sizeof(int)) + self.c.ner = self.mem.alloc(len(doc), sizeof(Transition)) - self.tags = [None] * len(tokens) - self.heads = [None] * len(tokens) - self.labels = [''] * len(tokens) - self.ner = ['-'] * len(tokens) + self.tags = [None] * len(doc) + self.heads = [None] * len(doc) + self.labels = [''] * len(doc) + self.ner = ['-'] * len(doc) - self.cand_to_gold = align([t.orth_ for t in tokens], annot_tuples[1]) - self.gold_to_cand = align(annot_tuples[1], [t.orth_ for t in tokens]) + self.cand_to_gold = align([t.orth_ for t in doc], words) + self.gold_to_cand = align(words, [t.orth_ for t in doc]) + annot_tuples = (range(len(words)), words, tags, heads, deps, entities) self.orig_annot = list(zip(*annot_tuples)) - words = [w.orth_ for w in tokens] for i, gold_i in enumerate(self.cand_to_gold): - if words[i].isspace(): + if doc[i].text.isspace(): self.tags[i] = 'SP' self.heads[i] = None self.labels[i] = None @@ -233,27 +269,19 @@ cdef class GoldParse: if gold_i is None: pass else: - self.tags[i] = annot_tuples[2][gold_i] - self.heads[i] = self.gold_to_cand[annot_tuples[3][gold_i]] - self.labels[i] = annot_tuples[4][gold_i] - self.ner[i] = annot_tuples[5][gold_i] + self.tags[i] = tags[gold_i] + self.heads[i] = self.gold_to_cand[heads[gold_i]] + self.labels[i] = deps[gold_i] + self.ner[i] = entities[gold_i] cycle = nonproj.contains_cycle(self.heads) if cycle != None: raise Exception("Cycle found: %s" % cycle) if make_projective: - proj_heads,_ = nonproj.PseudoProjectivity.projectivize(self.heads,self.labels) + proj_heads,_ = nonproj.PseudoProjectivity.projectivize(self.heads, self.labels) self.heads = proj_heads - self.brackets = {} - for (gold_start, gold_end, label_str) in brackets: - start = self.gold_to_cand[gold_start] - end = self.gold_to_cand[gold_end] - if start is not None and end is not None: - self.brackets.setdefault(start, {}).setdefault(end, set()) - self.brackets[end][start].add(label_str) - def __len__(self): return self.length @@ -262,15 +290,68 @@ cdef class GoldParse: return not nonproj.is_nonproj_tree(self.heads) +def biluo_tags_from_offsets(doc, entities): + '''Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out + scheme (biluo). + + Arguments: + doc (Doc): + The document that the entity offsets refer to. The output tags will + refer to the token boundaries within the document. + + entities (sequence): + A sequence of (start, end, label) triples. start and end should be + character-offset integers denoting the slice into the original string. + + Returns: + tags (list): + A list of unicode strings, describing the tags. Each tag string will + be of the form either "", "O" or "{action}-{label}", where action is one + of "B", "I", "L", "U". 
The string "-" is used where the entity + offsets don't align with the tokenization in the Doc object. The + training algorithm will view these as missing values. "O" denotes + a non-entity token. "B" denotes the beginning of a multi-token entity, + "I" the inside of an entity of three or more tokens, and "L" the end + of an entity of two or more tokens. "U" denotes a single-token entity. + + Example: + text = 'I like London.' + entities = [(len('I like '), len('I like London'), 'LOC')] + doc = nlp.tokenizer(text) + + tags = biluo_tags_from_offsets(doc, entities) + + assert tags == ['O', 'O', 'U-LOC', 'O'] + ''' + starts = {token.idx: token.i for token in doc} + ends = {token.idx+len(token): token.i for token in doc} + biluo = ['-' for _ in doc] + # Handle entity cases + for start_char, end_char, label in entities: + start_token = starts.get(start_char) + end_token = ends.get(end_char) + # Only interested if the tokenization is correct + if start_token is not None and end_token is not None: + if start_token == end_token: + biluo[start_token] = 'U-%s' % label + else: + biluo[start_token] = 'B-%s' % label + for i in range(start_token+1, end_token): + biluo[i] = 'I-%s' % label + biluo[end_token] = 'L-%s' % label + # Now distinguish the O cases from ones where we miss the tokenization + entity_chars = set() + for start_char, end_char, label in entities: + for i in range(start_char, end_char): + entity_chars.add(i) + for token in doc: + for i in range(token.idx, token.idx+len(token)): + if i in entity_chars: + break + else: + biluo[token.i] = 'O' + return biluo + + def is_punct_label(label): return label == 'P' or label.lower() == 'punct' - - - - - - - - - - diff --git a/spacy/language.py b/spacy/language.py index df7728d08..040dc1ed4 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -2,6 +2,8 @@ from __future__ import absolute_import from __future__ import unicode_literals from warnings import warn import pathlib +from contextlib import contextmanager +import shutil try: import ujson as json @@ -15,126 +17,120 @@ except NameError: basestring = str - from .tokenizer import Tokenizer from .vocab import Vocab -from .syntax.parser import Parser from .tagger import Tagger from .matcher import Matcher from . import attrs from . import orth -from .syntax.ner import BiluoPushDown -from .syntax.arc_eager import ArcEager from . 
import util from .lemmatizer import Lemmatizer +from .train import Trainer from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD, PROB, LANG, IS_STOP from .syntax.parser import get_templates +from .syntax.nonproj import PseudoProjectivity +from .pipeline import DependencyParser, EntityRecognizer class BaseDefaults(object): - def __init__(self, lang, path): - self.path = path - self.lang = lang - self.lex_attr_getters = dict(self.__class__.lex_attr_getters) - if self.path and (self.path / 'vocab' / 'oov_prob').exists(): - with (self.path / 'vocab' / 'oov_prob').open() as file_: - oov_prob = file_.read().strip() - self.lex_attr_getters[PROB] = lambda string: oov_prob - self.lex_attr_getters[LANG] = lambda string: lang - self.lex_attr_getters[IS_STOP] = lambda string: string in self.stop_words + @classmethod + def create_lemmatizer(cls, nlp=None): + if nlp is None or nlp.path is None: + return Lemmatizer({}, {}, {}) + else: + return Lemmatizer.load(nlp.path) - def Lemmatizer(self): - return Lemmatizer.load(self.path) if self.path else Lemmatizer({}, {}, {}) - - def Vectors(self): + @classmethod + def create_vocab(cls, nlp=None): + lemmatizer = cls.create_lemmatizer(nlp) + if nlp is None or nlp.path is None: + return Vocab(lex_attr_getters=cls.lex_attr_getters, tag_map=cls.tag_map, + lemmatizer=lemmatizer) + else: + return Vocab.load(nlp.path, lex_attr_getters=cls.lex_attr_getters, + tag_map=cls.tag_map, lemmatizer=lemmatizer) + + @classmethod + def add_vectors(cls, nlp=None): return True - def Vocab(self, lex_attr_getters=True, tag_map=True, - lemmatizer=True, serializer_freqs=True, vectors=True): - if lex_attr_getters is True: - lex_attr_getters = self.lex_attr_getters - if tag_map is True: - tag_map = self.tag_map - if lemmatizer is True: - lemmatizer = self.Lemmatizer() - if vectors is True: - vectors = self.Vectors() - if self.path: - return Vocab.load(self.path, lex_attr_getters=lex_attr_getters, - tag_map=tag_map, lemmatizer=lemmatizer, - serializer_freqs=serializer_freqs) + @classmethod + def create_tokenizer(cls, nlp=None): + rules = cls.tokenizer_exceptions + prefix_search = util.compile_prefix_regex(cls.prefixes).search + suffix_search = util.compile_suffix_regex(cls.suffixes).search + infix_finditer = util.compile_infix_regex(cls.infixes).finditer + vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp) + return Tokenizer(nlp.vocab, rules=rules, + prefix_search=prefix_search, suffix_search=suffix_search, + infix_finditer=infix_finditer) + + @classmethod + def create_tagger(cls, nlp=None): + if nlp is None: + return Tagger(cls.create_vocab(), features=cls.tagger_features) + elif nlp.path is None or not (nlp.path / 'pos').exists(): + return Tagger(nlp.vocab, features=cls.tagger_features) else: - return Vocab(lex_attr_getters=lex_attr_getters, tag_map=tag_map, - lemmatizer=lemmatizer, serializer_freqs=serializer_freqs) + return Tagger.load(nlp.path / 'pos', nlp.vocab) - def Tokenizer(self, vocab, rules=None, prefix_search=None, suffix_search=None, - infix_finditer=None): - if rules is None: - rules = self.tokenizer_exceptions - if prefix_search is None: - prefix_search = util.compile_prefix_regex(self.prefixes).search - if suffix_search is None: - suffix_search = util.compile_suffix_regex(self.suffixes).search - if infix_finditer is None: - infix_finditer = util.compile_infix_regex(self.infixes).finditer - if self.path: - return Tokenizer.load(self.path, vocab, rules=rules, - prefix_search=prefix_search, - suffix_search=suffix_search, - infix_finditer=infix_finditer) + 
@classmethod + def create_parser(cls, nlp=None): + if nlp is None: + return DependencyParser(cls.create_vocab(), features=cls.parser_features) + elif nlp.path is None or not (nlp.path / 'deps').exists(): + return DependencyParser(nlp.vocab, features=cls.parser_features) else: - return Tokenizer(vocab, rules=rules, - prefix_search=prefix_search, suffix_search=suffix_search, - infix_finditer=infix_finditer) + return DependencyParser.load(nlp.path / 'deps', nlp.vocab) - def Tagger(self, vocab): - if self.path: - return Tagger.load(self.path / 'pos', vocab) + @classmethod + def create_entity(cls, nlp=None): + if nlp is None: + return EntityRecognizer(cls.create_vocab(), features=cls.entity_features) + elif nlp.path is None or not (nlp.path / 'ner').exists(): + return EntityRecognizer(nlp.vocab, features=cls.entity_features) else: - return Tagger.blank(vocab, Tagger.default_templates()) + return EntityRecognizer.load(nlp.path / 'ner', nlp.vocab) - def Parser(self, vocab): - if self.path: - if (self.path / 'deps').exists(): - return Parser.load(self.path / 'deps', vocab, ArcEager) - else: - return None + @classmethod + def create_matcher(cls, nlp=None): + if nlp is None: + return Matcher(cls.create_vocab()) + elif nlp.path is None or not (nlp.path / 'vocab').exists(): + return Matcher(nlp.vocab) else: - return Parser.blank(vocab, ArcEager, - features=self.parser_features, labels=self.parser_labels) + return Matcher.load(nlp.path / 'vocab', nlp.vocab) - def Entity(self, vocab): - if self.path: - if (self.path / 'ner').exists(): - return Parser.load(self.path / 'ner', vocab, BiluoPushDown) - else: - return None - else: - return Parser.blank(vocab, BiluoPushDown, - features=self.entity_features, labels=self.entity_labels) + @classmethod + def create_pipeline(self, nlp=None): + pipeline = [] + if nlp is None: + return [] + if nlp.tagger: + pipeline.append(nlp.tagger) + if nlp.parser: + pipeline.append(nlp.parser) + if nlp.entity: + pipeline.append(nlp.entity) + return pipeline + + prefixes = tuple() - def Matcher(self, vocab): - if self.path: - return Matcher.load(self.path, vocab) - else: - return Matcher(vocab) + suffixes = tuple() - def Pipeline(self, nlp): - return [ - nlp.tokenizer, - nlp.tagger, - nlp.parser, - nlp.entity] - - parser_labels = {0: {'ROOT': True}} - - entity_labels = {0: {'PER': True, 'LOC': True, 'ORG': True, 'MISC': True}} + infixes = tuple() + + tag_map = {} + tokenizer_exceptions = {} + parser_features = get_templates('parser') entity_features = get_templates('ner') + tagger_features = Tagger.feature_templates # TODO -- fix this + stop_words = set() lex_attr_getters = { @@ -171,66 +167,105 @@ class Language(object): Defaults = BaseDefaults lang = None - def __init__(self, - path=None, - vocab=True, - tokenizer=True, - tagger=True, - parser=True, - entity=True, - matcher=True, - serializer=True, - vectors=True, - pipeline=True, - defaults=True, - data_dir=None): - """ - A model can be specified: - - 1) by calling a Language subclass - - spacy.en.English() - - 2) by calling a Language subclass with data_dir - - spacy.en.English('my/model/root') - - spacy.en.English(data_dir='my/model/root') - - 3) by package name - - spacy.load('en_default') - - spacy.load('en_default==1.0.0') - - 4) by package name with a relocated package base - - spacy.load('en_default', via='/my/package/root') - - spacy.load('en_default==1.0.0', via='/my/package/root') - """ - if data_dir is not None and path is None: - warn("'data_dir' argument now named 'path'. 
Doing what you mean.") - path = data_dir + @classmethod + @contextmanager + def train(cls, path, gold_tuples, *configs): if isinstance(path, basestring): path = pathlib.Path(path) - if path is None: - path = util.match_best_version(self.lang, '', util.get_data_path()) - self.path = path - defaults = defaults if defaults is not True else self.get_defaults(self.path) - - self.vocab = vocab if vocab is not True else defaults.Vocab(vectors=vectors) - self.tokenizer = tokenizer if tokenizer is not True else defaults.Tokenizer(self.vocab) - self.tagger = tagger if tagger is not True else defaults.Tagger(self.vocab) - self.entity = entity if entity is not True else defaults.Entity(self.vocab) - self.parser = parser if parser is not True else defaults.Parser(self.vocab) - self.matcher = matcher if matcher is not True else defaults.Matcher(self.vocab) - self.pipeline = pipeline(self) if pipeline is not True else defaults.Pipeline(self) + tagger_cfg, parser_cfg, entity_cfg = configs + dep_model_dir = path / 'deps' + ner_model_dir = path / 'ner' + pos_model_dir = path / 'pos' + if dep_model_dir.exists(): + shutil.rmtree(str(dep_model_dir)) + if ner_model_dir.exists(): + shutil.rmtree(str(ner_model_dir)) + if pos_model_dir.exists(): + shutil.rmtree(str(pos_model_dir)) + dep_model_dir.mkdir() + ner_model_dir.mkdir() + pos_model_dir.mkdir() - def __reduce__(self): - args = ( - self.path, - self.vocab, - self.tokenizer, - self.tagger, - self.parser, - self.entity, - self.matcher - ) - return (self.__class__, args, None, None) + if parser_cfg['pseudoprojective']: + # preprocess training data here before ArcEager.get_labels() is called + gold_tuples = PseudoProjectivity.preprocess_training_data(gold_tuples) + + parser_cfg['labels'] = ArcEager.get_labels(gold_tuples) + entity_cfg['labels'] = BiluoPushDown.get_labels(gold_tuples) + + with (dep_model_dir / 'config.json').open('wb') as file_: + json.dump(parser_cfg, file_) + with (ner_model_dir / 'config.json').open('wb') as file_: + json.dump(entity_cfg, file_) + with (pos_model_dir / 'config.json').open('wb') as file_: + json.dump(tagger_cfg, file_) + + self = cls( + path=path, + vocab=False, + tokenizer=False, + tagger=False, + parser=False, + entity=False, + matcher=False, + serializer=False, + vectors=False, + pipeline=False) + + self.defaults.parser_labels = parser_cfg['labels'] + self.defaults.entity_labels = entity_cfg['labels'] + + self.vocab = self.defaults.Vocab() + self.tokenizer = self.defaults.Tokenizer(self.vocab) + self.tagger = self.defaults.Tagger(self.vocab, **tagger_cfg) + self.parser = self.defaults.Parser(self.vocab, **parser_cfg) + self.entity = self.defaults.Entity(self.vocab, **entity_cfg) + self.pipeline = self.defaults.Pipeline(self) + yield Trainer(self, gold_tuples) + self.end_training() + + def __init__(self, path=True, **overrides): + if 'data_dir' in overrides and 'path' not in overrides: + raise ValueError("The argument 'data_dir' has been renamed to 'path'") + path = overrides.get('path', True) + if isinstance(path, basestring): + path = pathlib.Path(path) + if path is True: + path = util.match_best_version(self.lang, '', util.get_data_path()) + + self.path = path + + self.vocab = self.Defaults.create_vocab(self) \ + if 'vocab' not in overrides \ + else overrides['vocab'] + self.tokenizer = self.Defaults.create_tokenizer(self) \ + if 'tokenizer' not in overrides \ + else overrides['tokenizer'] + self.tagger = self.Defaults.create_tagger(self) \ + if 'tagger' not in overrides \ + else overrides['tagger'] + self.parser = 
self.Defaults.create_parser(self) \ + if 'parser' not in overrides \ + else overrides['parser'] + self.entity = self.Defaults.create_entity(self) \ + if 'entity' not in overrides \ + else overrides['entity'] + self.matcher = self.Defaults.create_matcher(self) \ + if 'matcher' not in overrides \ + else overrides['matcher'] + + if 'make_doc' in overrides: + self.make_doc = overrides['make_doc'] + elif 'create_make_doc' in overrides: + self.make_doc = overrides['create_make_doc'] + else: + self.make_doc = lambda text: self.tokenizer(text) + if 'pipeline' in overrides: + self.pipeline = overrides['pipeline'] + elif 'create_pipeline' in overrides: + self.pipeline = overrides['create_pipeline'] + else: + self.pipeline = [self.tagger, self.parser, self.matcher, self.entity] def __call__(self, text, tag=True, parse=True, entity=True): """Apply the pipeline to some text. The text can span multiple sentences, @@ -249,24 +284,22 @@ class Language(object): >>> tokens[0].orth_, tokens[0].head.tag_ ('An', 'NN') """ - doc = self.pipeline[0](text) + doc = self.make_doc(text) if self.entity and entity: # Add any of the entity labels already set, in case we don't have them. for token in doc: if token.ent_type != 0: self.entity.add_label(token.ent_type) skip = {self.tagger: not tag, self.parser: not parse, self.entity: not entity} - for proc in self.pipeline[1:]: + for proc in self.pipeline: if proc and not skip.get(proc): proc(doc) return doc - def pipe(self, texts, tag=True, parse=True, entity=True, n_threads=2, - batch_size=1000): + def pipe(self, texts, tag=True, parse=True, entity=True, n_threads=2, batch_size=1000): skip = {self.tagger: not tag, self.parser: not parse, self.entity: not entity} - stream = self.pipeline[0].pipe(texts, - n_threads=n_threads, batch_size=batch_size) - for proc in self.pipeline[1:]: + stream = (self.make_doc(text) for text in texts) + for proc in self.pipeline: if proc and not skip.get(proc): if hasattr(proc, 'pipe'): stream = proc.pipe(stream, n_threads=n_threads, batch_size=batch_size) @@ -278,16 +311,19 @@ class Language(object): def end_training(self, path=None): if path is None: path = self.path - if self.parser: - self.parser.model.end_training() - self.parser.model.dump(path / 'deps' / 'model') - if self.entity: - self.entity.model.end_training() - self.entity.model.dump(path / 'ner' / 'model') + elif isinstance(path, basestring): + path = pathlib.Path(path) + if self.tagger: self.tagger.model.end_training() - self.tagger.model.dump(path / 'pos' / 'model') - + self.tagger.model.dump(str(path / 'pos' / 'model')) + if self.parser: + self.parser.model.end_training() + self.parser.model.dump(str(path / 'deps' / 'model')) + if self.entity: + self.entity.model.end_training() + self.entity.model.dump(str(path / 'ner' / 'model')) + strings_loc = path / 'vocab' / 'strings.json' with strings_loc.open('w', encoding='utf8') as file_: self.vocab.strings.dump(file_) @@ -309,7 +345,7 @@ class Language(object): else: entity_iob_freqs = [] entity_type_freqs = [] - with (path / 'vocab' / 'serializer.json').open('w') as file_: + with (path / 'vocab' / 'serializer.json').open('wb') as file_: file_.write( json.dumps([ (TAG, tagger_freqs), diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx index 3307eb864..c3e40c1a0 100644 --- a/spacy/matcher.pyx +++ b/spacy/matcher.pyx @@ -92,8 +92,8 @@ ctypedef TokenPatternC* TokenPatternC_ptr ctypedef pair[int, TokenPatternC_ptr] StateC -cdef TokenPatternC* init_pattern(Pool mem, object token_specs, attr_t entity_id, - attr_t entity_type) except NULL: 
+cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id, attr_t label, + object token_specs) except NULL: pattern = mem.alloc(len(token_specs) + 1, sizeof(TokenPatternC)) cdef int i for i, (quantifier, spec) in enumerate(token_specs): @@ -108,7 +108,7 @@ cdef TokenPatternC* init_pattern(Pool mem, object token_specs, attr_t entity_id, pattern[i].attrs[0].attr = ID pattern[i].attrs[0].value = entity_id pattern[i].attrs[1].attr = ENT_TYPE - pattern[i].attrs[1].value = entity_type + pattern[i].attrs[1].value = label pattern[i].nr_attr = 0 return pattern @@ -163,37 +163,14 @@ def _convert_strings(token_specs, string_store): return tokens -def get_bilou(length): - if length == 1: - return [U_ENT] - elif length == 2: - return [B2_ENT, L2_ENT] - elif length == 3: - return [B3_ENT, I3_ENT, L3_ENT] - elif length == 4: - return [B4_ENT, I4_ENT, I4_ENT, L4_ENT] - elif length == 5: - return [B5_ENT, I5_ENT, I5_ENT, I5_ENT, L5_ENT] - elif length == 6: - return [B6_ENT, I6_ENT, I6_ENT, I6_ENT, I6_ENT, L6_ENT] - elif length == 7: - return [B7_ENT, I7_ENT, I7_ENT, I7_ENT, I7_ENT, I7_ENT, L7_ENT] - elif length == 8: - return [B8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, L8_ENT] - elif length == 9: - return [B9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, L9_ENT] - elif length == 10: - return [B10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, - I10_ENT, I10_ENT, L10_ENT] - else: - raise ValueError("Max length currently 10 for phrase matching") - - cdef class Matcher: cdef Pool mem cdef vector[TokenPatternC*] patterns cdef readonly Vocab vocab cdef public object _patterns + cdef public object _entities + cdef public object _callbacks + cdef public object _acceptors @classmethod def load(cls, path, vocab): @@ -205,12 +182,17 @@ cdef class Matcher: return cls(vocab, patterns) def __init__(self, vocab, patterns={}): - self._patterns = dict(patterns) # Make sure we own the object + self._patterns = {} + self._entities = {} + self._acceptors = {} + self._callbacks = {} self.vocab = vocab self.mem = Pool() self.vocab = vocab - for entity_key, (etype, attrs, specs) in sorted(self._patterns.items()): - self.add(entity_key, etype, attrs, specs) + for entity_key, (etype, attrs, specs) in sorted(patterns.items()): + self.add_entity(entity_key, attrs) + for spec in specs: + self.add_pattern(entity_key, spec, label=etype) def __reduce__(self): return (self.__class__, (self.vocab, self._patterns), None, None) @@ -218,21 +200,69 @@ cdef class Matcher: property n_patterns: def __get__(self): return self.patterns.size() - def add(self, entity_key, etype, attrs, specs): - self._patterns[entity_key] = (etype, dict(attrs), list(specs)) - if isinstance(entity_key, basestring): - entity_key = self.vocab.strings[entity_key] - if isinstance(etype, basestring): - etype = self.vocab.strings[etype] - elif etype is None: - etype = -1 - # TODO: Do something more clever about multiple patterns for single - # entity + def add_entity(self, entity_key, attrs=None, if_exists='raise', + acceptor=None, on_match=None): + if if_exists not in ('raise', 'ignore', 'update'): + raise ValueError( + "Unexpected value for if_exists: %s.\n" + "Expected one of: ['raise', 'ignore', 'update']" % if_exists) + if attrs is None: + attrs = {} + entity_key = self.normalize_entity_key(entity_key) + if self.has_entity(entity_key): + if if_exists == 'raise': + raise KeyError( + "Tried to add entity %s. 
Entity exists, and if_exists='raise'.\n" + "Set if_exists='ignore' or if_exists='update', or check with " + "matcher.has_entity()") + elif if_exists == 'ignore': + return + self._entities[entity_key] = dict(attrs) + self._patterns.setdefault(entity_key, []) + self._acceptors[entity_key] = acceptor + self._callbacks[entity_key] = on_match + + def add_pattern(self, entity_key, token_specs, label=""): + entity_key = self.normalize_entity_key(entity_key) + if not self.has_entity(entity_key): + self.add_entity(entity_key) + if isinstance(label, basestring): + label = self.vocab.strings[label] + elif label is None: + label = 0 + spec = _convert_strings(token_specs, self.vocab.strings) + + self.patterns.push_back(init_pattern(self.mem, entity_key, label, spec)) + self._patterns[entity_key].append((label, token_specs)) + + def add(self, entity_key, label, attrs, specs, acceptor=None, on_match=None): + self.add_entity(entity_key, attrs=attrs, if_exists='update', + acceptor=acceptor, on_match=on_match) for spec in specs: - spec = _convert_strings(spec, self.vocab.strings) - self.patterns.push_back(init_pattern(self.mem, spec, entity_key, etype)) + self.add_pattern(entity_key, spec, label=label) + + def normalize_entity_key(self, entity_key): + if isinstance(entity_key, basestring): + return self.vocab.strings[entity_key] + else: + return entity_key + + def has_entity(self, entity_key): + entity_key = self.normalize_entity_key(entity_key) + return entity_key in self._entities + + def get_entity(self, entity_key): + entity_key = self.normalize_entity_key(entity_key) + if entity_key in self._entities: + return self._entities[entity_key] + else: + return None def __call__(self, Doc doc, acceptor=None): + if acceptor is not None: + raise ValueError( + "acceptor keyword argument to Matcher deprecated. 
Specify acceptor " + "functions when you add patterns instead.") cdef vector[StateC] partials cdef int n_partials = 0 cdef int q = 0 @@ -267,8 +297,12 @@ cdef class Matcher: end = token_i+1 ent_id = state.second[1].attrs[0].value label = state.second[1].attrs[1].value - if acceptor is None or acceptor(doc, ent_id, label, start, end): - matches.append((ent_id, label, start, end)) + acceptor = self._acceptors.get(ent_id) + if acceptor is not None: + match = acceptor(doc, ent_id, label, start, end) + if match: + ent_id, label, start, end = match + matches.append((ent_id, label, start, end)) partials.resize(q) # Check whether we open any new patterns on this token for pattern in self.patterns: @@ -293,6 +327,10 @@ cdef class Matcher: label = pattern[1].attrs[1].value if acceptor is None or acceptor(doc, ent_id, label, start, end): matches.append((ent_id, label, start, end)) + for i, (ent_id, label, start, end) in enumerate(matches): + on_match = self._callbacks.get(ent_id) + if on_match is not None: + on_match(self, doc, i, matches) return matches def pipe(self, docs, batch_size=1000, n_threads=2): @@ -301,6 +339,32 @@ cdef class Matcher: yield doc +def get_bilou(length): + if length == 1: + return [U_ENT] + elif length == 2: + return [B2_ENT, L2_ENT] + elif length == 3: + return [B3_ENT, I3_ENT, L3_ENT] + elif length == 4: + return [B4_ENT, I4_ENT, I4_ENT, L4_ENT] + elif length == 5: + return [B5_ENT, I5_ENT, I5_ENT, I5_ENT, L5_ENT] + elif length == 6: + return [B6_ENT, I6_ENT, I6_ENT, I6_ENT, I6_ENT, L6_ENT] + elif length == 7: + return [B7_ENT, I7_ENT, I7_ENT, I7_ENT, I7_ENT, I7_ENT, L7_ENT] + elif length == 8: + return [B8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, L8_ENT] + elif length == 9: + return [B9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, L9_ENT] + elif length == 10: + return [B10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, + I10_ENT, I10_ENT, L10_ENT] + else: + raise ValueError("Max length currently 10 for phrase matching") + + cdef class PhraseMatcher: cdef Pool mem cdef Vocab vocab diff --git a/spacy/pipeline.pxd b/spacy/pipeline.pxd new file mode 100644 index 000000000..84e47af5c --- /dev/null +++ b/spacy/pipeline.pxd @@ -0,0 +1,12 @@ +from .syntax.parser cimport Parser +from .syntax.ner cimport BiluoPushDown +from .syntax.arc_eager cimport ArcEager +from .tagger cimport Tagger + + +cdef class EntityRecognizer(Parser): + pass + + +cdef class DependencyParser(Parser): + pass diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx new file mode 100644 index 000000000..ac1f0d75c --- /dev/null +++ b/spacy/pipeline.pyx @@ -0,0 +1,24 @@ +from .syntax.parser cimport Parser +from .syntax.ner cimport BiluoPushDown +from .syntax.arc_eager cimport ArcEager +from .vocab cimport Vocab +from .tagger import Tagger + +# TODO: The disorganization here is pretty embarrassing. At least it's only +# internals. 
+from .syntax.parser import get_templates as get_feature_templates + + +cdef class EntityRecognizer(Parser): + TransitionSystem = BiluoPushDown + + feature_templates = get_feature_templates('ner') + + +cdef class DependencyParser(Parser): + TransitionSystem = ArcEager + + feature_templates = get_feature_templates('basic') + + +__all__ = [Tagger, DependencyParser, EntityRecognizer] diff --git a/spacy/scorer.py b/spacy/scorer.py index 043cf5b2c..c668845e5 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -70,6 +70,15 @@ class Scorer(object): def ents_f(self): return self.ner.fscore * 100 + @property + def scores(self): + return { + 'uas': self.uas, 'las': self.las, + 'ents_p': self.ents_p, 'ents_r': self.ents_r, 'ents_f': self.ents_f, + 'tags_acc': self.tags_acc, + 'token_acc': self.token_acc + } + def score(self, tokens, gold, verbose=False, punct_labels=('p', 'punct')): assert len(tokens) == len(gold) diff --git a/spacy/strings.pxd b/spacy/strings.pxd index df61712c8..35b7d3d76 100644 --- a/spacy/strings.pxd +++ b/spacy/strings.pxd @@ -1,13 +1,11 @@ +from libc.stdint cimport int64_t + from cymem.cymem cimport Pool from preshed.maps cimport PreshMap from murmurhash.mrmr cimport hash64 -from .typedefs cimport attr_t -from libc.stdint cimport int64_t +from .typedefs cimport attr_t, hash_t -from .typedefs cimport hash_t - -DEF UINT64_MAX = 18446744073709551615 cpdef hash_t hash_string(unicode string) except 0 @@ -24,10 +22,6 @@ cdef class StringStore: cdef public PreshMap _map cdef int64_t _resize_at - cdef PreshMap oov_maps - cpdef int remove_oov_map(self, Pool mem) except -1 - - cdef hash_t intern(self, unicode py_string, Pool mem=*) except UINT64_MAX - cdef const Utf8Str* _intern_utf8(self, const unsigned char* utf8_string, - int length) except NULL + cdef const Utf8Str* intern(self, unicode py_string) except NULL + cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length) except NULL diff --git a/spacy/strings.pyx b/spacy/strings.pyx index 509b475e9..2e81bd87b 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -1,4 +1,3 @@ -# cython: infer_types=True from __future__ import unicode_literals, absolute_import cimport cython @@ -7,8 +6,7 @@ from libc.stdint cimport uint64_t from murmurhash.mrmr cimport hash64 -from preshed.maps cimport map_init, map_set, map_get, map_iter -from preshed.maps cimport MapStruct +from preshed.maps cimport map_iter, key_t from .typedefs cimport hash_t @@ -18,17 +16,13 @@ except ImportError: import json -DEF UINT64_MAX = 18446744073709551615 - - cpdef hash_t hash_string(unicode string) except 0: - byte_string = string.encode('utf8') - cdef unsigned char* chars = byte_string - return _hash_utf8(chars, len(byte_string)) + chars = string.encode('utf8') + return _hash_utf8(chars, len(chars)) -cdef hash_t _hash_utf8(const unsigned char* utf8_string, int length) nogil: - return hash64(utf8_string, length, 1) +cdef hash_t _hash_utf8(char* utf8_string, int length): + return hash64(utf8_string, length, 1) cdef unicode _decode(const Utf8Str* string): @@ -80,7 +74,6 @@ cdef class StringStore: def __init__(self, strings=None): self.mem = Pool() self._map = PreshMap() - self.oov_maps = PreshMap() self._resize_at = 10000 self.c = self.mem.alloc(self._resize_at, sizeof(Utf8Str)) self.size = 1 @@ -115,21 +108,14 @@ cdef class StringStore: byte_string = string_or_id if len(byte_string) == 0: return 0 - key = _hash_utf8(byte_string, len(byte_string)) - utf8str = self._map.get(key) - if utf8str is NULL: - raise KeyError(byte_string) - else: - return utf8str - 
self.c + utf8str = self._intern_utf8(byte_string, len(byte_string)) + return utf8str - self.c elif isinstance(string_or_id, unicode): if len(string_or_id) == 0: return 0 - key = hash_string(string_or_id) - utf8str = self._map.get(key) - if utf8str is NULL: - raise KeyError(string_or_id) - else: - return utf8str - self.c + byte_string = (string_or_id).encode('utf8') + utf8str = self._intern_utf8(byte_string, len(byte_string)) + return utf8str - self.c else: raise TypeError(type(string_or_id)) @@ -145,8 +131,6 @@ cdef class StringStore: yield _decode(&self.c[i]) if i > 0 else u'' def __reduce__(self): - # TODO: Is it problematic that we don't save the OOV strings? - # Probably yes? We're not restoring all the state... strings = [""] for i in range(1, self.size): string = &self.c[i] @@ -154,77 +138,27 @@ cdef class StringStore: strings.append(py_string) return (StringStore, (strings,), None, None, None) - cdef hash_t intern(self, unicode py_string, Pool mem=None) except UINT64_MAX: - if mem is None: - mem = self.mem - cdef hash_t map_key = id(mem) + cdef const Utf8Str* intern(self, unicode py_string) except NULL: + # 0 means missing, but we don't bother offsetting the index. cdef bytes byte_string = py_string.encode('utf8') - cdef hash_t key = _hash_utf8(byte_string, len(byte_string)) - cdef const Utf8Str* utf8str = self._map.get(key) - cdef hash_t map_id = id(mem) - cdef MapStruct* oov_map - if utf8str is not NULL: - return utf8str - self.c - elif mem is None or mem is self.mem: - utf8str = self._intern_utf8(byte_string, len(byte_string)) - return utf8str - self.c - else: - new_utf8str = mem.alloc(sizeof(Utf8Str), 1) - oov_map = self.oov_maps.get(map_key) - if oov_map is NULL: - oov_map = mem.alloc(sizeof(MapStruct), 1) - map_init(mem, oov_map, 16) - self.oov_maps.set(id(mem), oov_map) - new_utf8str[0] = _allocate(mem, byte_string, len(byte_string)) - map_set(mem, oov_map, key, new_utf8str) - return key - - def decode_int(self, hash_t int_, Pool mem=None): - cdef hash_t map_key - if int_ == 0: - return u'' - elif int_ < self.size: - return _decode(&self.c[int_]) - elif mem is None or mem is self.mem: - raise IndexError(int_) - else: - map_key = id(mem) - oov_map = self.oov_maps.get(map_key) - if oov_map is NULL: - raise IndexError( - "Trying to decode integer into string, but it's not in " + - "the main store, and the memory pool hasn't been seen before.\n" + - ("int_ == %d\n" % int_) + - "id(mem) == %d" % map_key) - else: - utf8str = map_get(oov_map, int_) - if utf8str is NULL: - raise IndexError( - "Trying to decode integer into string, but it's not in " + - "the main store. The integer was also not found in the " + - "indicated auxiliary pool " + - "(which is usually specific to a document)." + - ("int_ == %d\n" % int_) + - "id(mem) == %d" % map_key) - return _decode(utf8str) + return self._intern_utf8(byte_string, len(byte_string)) @cython.final - cdef const Utf8Str* _intern_utf8(self, const unsigned char* utf8_string, - int length) except NULL: + cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length) except NULL: + # 0 means missing, but we don't bother offsetting the index. 
+ cdef hash_t key = _hash_utf8(utf8_string, length) + value = self._map.get(key) + if value is not NULL: + return value + if self.size == self._resize_at: self._realloc() - key = _hash_utf8(utf8_string, length) - self.c[self.size] = _allocate(self.mem, utf8_string, length) + self.c[self.size] = _allocate(self.mem, utf8_string, length) self._map.set(key, &self.c[self.size]) self.size += 1 return &self.c[self.size-1] - cpdef int remove_oov_map(self, Pool mem) except -1: - cdef hash_t key = id(mem) - self._maps.pop(key) - def dump(self, file_): - # TODO: Is it problematic that we don't save the OOV strings? No, right? string_data = json.dumps(list(self)) if not isinstance(string_data, unicode): string_data = string_data.decode('utf8') @@ -246,8 +180,8 @@ cdef class StringStore: # we resize our array. So, first we remap to indices, then we resize, # then we can acquire the new pointers. cdef Pool tmp_mem = Pool() - keys = tmp_mem.alloc(self.size, sizeof(hash_t)) - cdef hash_t key + keys = tmp_mem.alloc(self.size, sizeof(key_t)) + cdef key_t key cdef void* value cdef const Utf8Str ptr cdef int i = 0 diff --git a/spacy/symbols.pyx b/spacy/symbols.pyx index 712bef9a3..7254297d4 100644 --- a/spacy/symbols.pyx +++ b/spacy/symbols.pyx @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + IDS = { "": NIL, "IS_ALPHA": IS_ALPHA, diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index 4e2590734..2bd4da6cc 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -279,20 +279,32 @@ cdef int _get_root(int word, const GoldParseC* gold) nogil: cdef class ArcEager(TransitionSystem): @classmethod - def get_labels(cls, gold_parses): - move_labels = {SHIFT: {'': True}, REDUCE: {'': True}, RIGHT: {}, - LEFT: {}, BREAK: {'ROOT': True}} - for raw_text, sents in gold_parses: + def get_actions(cls, **kwargs): + actions = kwargs.get('actions', + { + SHIFT: {'': True}, + REDUCE: {'': True}, + RIGHT: {}, + LEFT: {}, + BREAK: {'ROOT': True}}) + for label in kwargs.get('left_labels', []): + if label.upper() != 'ROOT': + actions[LEFT][label] = True + for label in kwargs.get('right_labels', []): + if label.upper() != 'ROOT': + actions[RIGHT][label] = True + + for raw_text, sents in kwargs.get('gold_parses', []): for (ids, words, tags, heads, labels, iob), ctnts in sents: for child, head, label in zip(ids, heads, labels): if label.upper() == 'ROOT': label = 'ROOT' if label != 'ROOT': if head < child: - move_labels[RIGHT][label] = True + actions[RIGHT][label] = True elif head > child: - move_labels[LEFT][label] = True - return move_labels + actions[LEFT][label] = True + return actions property action_types: def __get__(self): @@ -312,12 +324,6 @@ cdef class ArcEager(TransitionSystem): # Count frequencies, for use in encoder self.freqs[HEAD][gold.c.heads[i] - i] += 1 self.freqs[DEP][gold.c.labels[i]] += 1 - for end, brackets in gold.brackets.items(): - for start, label_strs in brackets.items(): - gold.c.brackets[start][end] = 1 - for label_str in label_strs: - # Add the encoded label to the set - gold.brackets[end][start].add(self.strings[label_str]) cdef Transition lookup_transition(self, object name) except *: if '-' in name: diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx index 62d4dc0e2..53eb1496d 100644 --- a/spacy/syntax/ner.pyx +++ b/spacy/syntax/ner.pyx @@ -51,11 +51,21 @@ cdef bint _entity_is_sunk(StateClass st, Transition* golds) nogil: cdef class BiluoPushDown(TransitionSystem): @classmethod - def get_labels(cls, gold_tuples): - move_labels = {MISSING: {'': True}, 
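# Minimal sketch of the StringStore behaviour after the change above: looking up
# an unseen unicode or byte string now interns it and returns its integer ID,
# instead of raising KeyError; the per-Pool OOV maps and decode_int() are gone.
# Assumes a freshly constructed store.
from spacy.strings import StringStore

store = StringStore()
i = store[u'apple']        # unseen string is interned, not a KeyError
assert store[i] == u'apple'
assert store[u''] == 0     # the empty string always maps to 0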
BEGIN: {}, IN: {}, LAST: {}, UNIT: {}, - OUT: {'': True}} + def get_actions(cls, **kwargs): + actions = kwargs.get('actions', + { + MISSING: {'': True}, + BEGIN: {}, + IN: {}, + LAST: {}, + UNIT: {}, + OUT: {'': True} + }) + for entity_type in kwargs.get('entity_types', []): + for action in (BEGIN, IN, LAST, UNIT): + actions[action][entity_type] = True moves = ('M', 'B', 'I', 'L', 'U') - for raw_text, sents in gold_tuples: + for raw_text, sents in kwargs.get('gold_tuples', []): for (ids, words, tags, heads, labels, biluo), _ in sents: for i, ner_tag in enumerate(biluo): if ner_tag != 'O' and ner_tag != '-': @@ -63,8 +73,8 @@ cdef class BiluoPushDown(TransitionSystem): raise ValueError(ner_tag) _, label = ner_tag.split('-') for move_str in ('B', 'I', 'L', 'U'): - move_labels[moves.index(move_str)][label] = True - return move_labels + actions[moves.index(move_str)][label] = True + return actions property action_types: def __get__(self): diff --git a/spacy/syntax/parser.pxd b/spacy/syntax/parser.pxd index 2c3a106b3..1ad0ce729 100644 --- a/spacy/syntax/parser.pxd +++ b/spacy/syntax/parser.pxd @@ -13,6 +13,7 @@ from ._state cimport StateC cdef class ParserModel(AveragedPerceptron): cdef void set_featuresC(self, ExampleC* eg, const StateC* state) nogil + cdef class Parser: cdef readonly Vocab vocab cdef readonly ParserModel model diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index a8f1973af..85407b942 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -67,10 +67,6 @@ def get_templates(name): pf.tree_shape + pf.trigrams) -def ParserFactory(transition_system): - return lambda strings, dir_: Parser(strings, dir_, transition_system) - - cdef class ParserModel(AveragedPerceptron): cdef void set_featuresC(self, ExampleC* eg, const StateC* state) nogil: fill_context(eg.atoms, state) @@ -79,27 +75,31 @@ cdef class ParserModel(AveragedPerceptron): cdef class Parser: @classmethod - def load(cls, path, Vocab vocab, moves_class): + def load(cls, path, Vocab vocab, TransitionSystem=None, require=False): with (path / 'config.json').open() as file_: cfg = json.load(file_) - moves = moves_class(vocab.strings, cfg['labels']) - templates = get_templates(cfg['features']) - model = ParserModel(templates) + # TODO: remove this shim when we don't have to support older data + if 'labels' in cfg: + cfg['actions'] = cfg.pop('labels') + self = cls(vocab, TransitionSystem=TransitionSystem, model=None, **cfg) if (path / 'model').exists(): - model.load(str(path / 'model')) - return cls(vocab, moves, model, **cfg) + self.model.load(str(path / 'model')) + elif require: + raise IOError( + "Required file %s/model not found when loading" % str(path)) + return self - @classmethod - def blank(cls, Vocab vocab, moves_class, **cfg): - moves = moves_class(vocab.strings, cfg.get('labels', {})) - templates = cfg.get('features', tuple()) - model = ParserModel(templates) - return cls(vocab, moves, model, **cfg) - - - def __init__(self, Vocab vocab, transition_system, ParserModel model, **cfg): - self.moves = transition_system - self.model = model + def __init__(self, Vocab vocab, TransitionSystem=None, ParserModel model=None, **cfg): + if TransitionSystem is None: + TransitionSystem = self.TransitionSystem + actions = TransitionSystem.get_actions(**cfg) + self.moves = TransitionSystem(vocab.strings, actions) + # TODO: Remove this when we no longer need to support old-style models + if isinstance(cfg.get('features'), basestring): + cfg['features'] = get_templates(cfg['features']) + elif 'features' not 
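# Sketch of the new get_actions() classmethods above: instead of get_labels(),
# the transition systems build their action tables from keyword arguments, so a
# blank model can be configured from label lists alone, without gold data. The
# label and entity-type names below are illustrative.
from spacy.syntax.arc_eager import ArcEager
from spacy.syntax.ner import BiluoPushDown

parse_actions = ArcEager.get_actions(left_labels=['nsubj', 'dobj'],
                                     right_labels=['prep', 'pobj'])
ner_actions = BiluoPushDown.get_actions(entity_types=['PERSON', 'ORG'])
# These dicts are what Parser.__init__ passes to TransitionSystem(strings, actions).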
in cfg: + cfg['features'] = self.feature_templates + self.model = ParserModel(cfg['features']) self.cfg = cfg def __reduce__(self): @@ -191,7 +191,7 @@ cdef class Parser: free(eg.is_valid) return 0 - def train(self, Doc tokens, GoldParse gold): + def update(self, Doc tokens, GoldParse gold): self.moves.preprocess_gold(gold) cdef StateClass stcls = StateClass.init(tokens.c, tokens.length) self.moves.initialize_state(stcls.c) @@ -283,7 +283,9 @@ cdef class StepwiseState: cdef Transition action = self.parser.moves.c[self.eg.guess] return self.parser.moves.move_name(action.move, action.label) - def transition(self, action_name): + def transition(self, action_name=None): + if action_name is None: + action_name = self.predict() moves = {'S': 0, 'D': 1, 'L': 2, 'R': 3} if action_name == '_': action_name = self.predict() @@ -306,14 +308,14 @@ cdef class StepwiseState: class ParserStateError(ValueError): - def __repr__(self): - raise ValueError( + def __init__(self, doc): + ValueError.__init__(self, "Error analysing doc -- no valid actions available. This should " "never happen, so please report the error on the issue tracker. " "Here's the thread to do so --- reopen it if it's closed:\n" "https://github.com/spacy-io/spaCy/issues/429\n" "Please include the text that the parser failed on, which is:\n" - "%s" % repr(self.args[0].text)) + "%s" % repr(doc.text)) cdef int _arg_max_clas(const weight_t* scores, int move, const Transition* actions, diff --git a/spacy/syntax/stateclass.pyx b/spacy/syntax/stateclass.pyx index a18cc284a..df485933d 100644 --- a/spacy/syntax/stateclass.pyx +++ b/spacy/syntax/stateclass.pyx @@ -21,7 +21,7 @@ cdef class StateClass: @property def queue(self): - return {self.B(i) for i in range(self.c._b_i)} + return {self.B(i) for i in range(self.c.buffer_length())} def print_state(self, words): words = list(words) + ['_'] diff --git a/spacy/tagger.pxd b/spacy/tagger.pxd index 51e465188..6d2cef1f4 100644 --- a/spacy/tagger.pxd +++ b/spacy/tagger.pxd @@ -14,3 +14,4 @@ cdef class Tagger: cdef readonly Vocab vocab cdef readonly TaggerModel model cdef public dict freqs + cdef public object cfg diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx index 64b3e9cc2..b2bc344bb 100644 --- a/spacy/tagger.pyx +++ b/spacy/tagger.pyx @@ -15,6 +15,7 @@ from .tokens.doc cimport Doc from .attrs cimport TAG from .parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON from .parts_of_speech cimport VERB, X, PUNCT, EOL, SPACE +from .gold cimport GoldParse from .attrs cimport * @@ -103,58 +104,30 @@ cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil: cdef class Tagger: """A part-of-speech tagger for English""" @classmethod - def default_templates(cls): - return ( - (W_orth,), - (P1_lemma, P1_pos), - (P2_lemma, P2_pos), - (N1_orth,), - (N2_orth,), - - (W_suffix,), - (W_prefix,), - - (P1_pos,), - (P2_pos,), - (P1_pos, P2_pos), - (P1_pos, W_orth), - (P1_suffix,), - (N1_suffix,), - - (W_shape,), - (W_cluster,), - (N1_cluster,), - (N2_cluster,), - (P1_cluster,), - (P2_cluster,), - - (W_flags,), - (N1_flags,), - (N2_flags,), - (P1_flags,), - (P2_flags,), - ) - - @classmethod - def blank(cls, vocab, templates): - model = TaggerModel(templates) - return cls(vocab, model) - - @classmethod - def load(cls, path, vocab): + def load(cls, path, vocab, require=False): + # TODO: Change this to expect config.json when we don't have to + # support old data. 
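# Sketch of the reworked Parser loading/training API above: load() now takes a
# path and a Vocab (plus an optional TransitionSystem and a require flag), and
# the online-learning entry point train() has been renamed to update(). The
# model directory name and the vocab/doc/gold objects are assumed here; they are
# not given by the diff.
import pathlib
from spacy.syntax.parser import Parser
from spacy.syntax.arc_eager import ArcEager

parser = Parser.load(pathlib.Path('models/en/deps'), vocab,
                     TransitionSystem=ArcEager)
parser.update(doc, gold)   # formerly parser.train(doc, gold)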
path = path if not isinstance(path, basestring) else pathlib.Path(path) if (path / 'templates.json').exists(): with (path / 'templates.json').open() as file_: templates = json.load(file_) + elif require: + raise IOError( + "Required file %s/templates.json not found when loading Tagger" % str(path)) else: - templates = cls.default_templates() + templates = cls.feature_templates + self = cls(vocab, model=None, feature_templates=templates) - model = TaggerModel(templates) if (path / 'model').exists(): - model.load(str(path / 'model')) - return cls(vocab, model) + self.model.load(str(path / 'model')) + elif require: + raise IOError( + "Required file %s/model not found when loading Tagger" % str(path)) + return self - def __init__(self, Vocab vocab, TaggerModel model): + def __init__(self, Vocab vocab, TaggerModel model=None, **cfg): + if model is None: + model = TaggerModel(cfg.get('features', self.feature_templates)) self.vocab = vocab self.model = model # TODO: Move this to tag map @@ -162,6 +135,7 @@ cdef class Tagger: for tag in self.tag_names: self.freqs[TAG][self.vocab.strings[tag]] = 1 self.freqs[TAG][0] = 1 + self.cfg = cfg @property def tag_names(self): @@ -208,11 +182,12 @@ cdef class Tagger: self(doc) yield doc - def train(self, Doc tokens, object gold_tag_strs): + def update(self, Doc tokens, GoldParse gold): + gold_tag_strs = gold.tags assert len(tokens) == len(gold_tag_strs) for tag in gold_tag_strs: if tag != None and tag not in self.tag_names: - msg = ("Unrecognized gold tag: %s. tag_map.json must contain all" + msg = ("Unrecognized gold tag: %s. tag_map.json must contain all " "gold tags, to maintain coarse-grained mapping.") raise ValueError(msg % tag) golds = [self.tag_names.index(g) if g is not None else -1 for g in gold_tag_strs] @@ -238,3 +213,35 @@ cdef class Tagger: tokens.is_tagged = True tokens._py_tokens = [None] * tokens.length return correct + + + feature_templates = ( + (W_orth,), + (P1_lemma, P1_pos), + (P2_lemma, P2_pos), + (N1_orth,), + (N2_orth,), + + (W_suffix,), + (W_prefix,), + + (P1_pos,), + (P2_pos,), + (P1_pos, P2_pos), + (P1_pos, W_orth), + (P1_suffix,), + (N1_suffix,), + + (W_shape,), + (W_cluster,), + (N1_cluster,), + (N2_cluster,), + (P1_cluster,), + (P2_cluster,), + + (W_flags,), + (N1_flags,), + (N2_flags,), + (P1_flags,), + (P2_flags,), + ) diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 901f459dd..c339f66ef 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -1,16 +1,17 @@ import pytest import os -import spacy +from ..en import English +from ..de import German @pytest.fixture(scope="session") def EN(): - return spacy.load("en") + return English() @pytest.fixture(scope="session") def DE(): - return spacy.load("de") + return German() def pytest_addoption(parser): diff --git a/spacy/tests/gold/__init__.py b/spacy/tests/gold/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/gold/test_biluo.py b/spacy/tests/gold/test_biluo.py new file mode 100644 index 000000000..804739ae6 --- /dev/null +++ b/spacy/tests/gold/test_biluo.py @@ -0,0 +1,48 @@ +from __future__ import unicode_literals + +from ...gold import biluo_tags_from_offsets +from ...vocab import Vocab +from ...tokens.doc import Doc + +import pytest + + +@pytest.fixture +def vocab(): + return Vocab() + + +def test_U(vocab): + orths_and_spaces = [('I', True), ('flew', True), ('to', True), ('London', False), + ('.', True)] + doc = Doc(vocab, orths_and_spaces=orths_and_spaces) + entities = [(len("I flew to "), len("I flew to 
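# Sketch of the matching Tagger changes above: the default feature templates are
# now a class attribute, load() grows a require flag and falls back to those
# templates when templates.json is missing, and train() has likewise become
# update(doc, gold), which reads the gold tags from gold.tags. The model path
# and the vocab/doc/gold objects are again assumed.
import pathlib
from spacy.tagger import Tagger

tagger = Tagger.load(pathlib.Path('models/en/pos'), vocab, require=False)
tagger.update(doc, gold)   # formerly tagger.train(doc, gold_tag_strs)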
London"), 'LOC')] + tags = biluo_tags_from_offsets(doc, entities) + assert tags == ['O', 'O', 'O', 'U-LOC', 'O'] + + +def test_BL(vocab): + orths_and_spaces = [('I', True), ('flew', True), ('to', True), ('San', True), + ('Francisco', False), ('.', True)] + doc = Doc(vocab, orths_and_spaces=orths_and_spaces) + entities = [(len("I flew to "), len("I flew to San Francisco"), 'LOC')] + tags = biluo_tags_from_offsets(doc, entities) + assert tags == ['O', 'O', 'O', 'B-LOC', 'L-LOC', 'O'] + + +def test_BIL(vocab): + orths_and_spaces = [('I', True), ('flew', True), ('to', True), ('San', True), + ('Francisco', True), ('Valley', False), ('.', True)] + doc = Doc(vocab, orths_and_spaces=orths_and_spaces) + entities = [(len("I flew to "), len("I flew to San Francisco Valley"), 'LOC')] + tags = biluo_tags_from_offsets(doc, entities) + assert tags == ['O', 'O', 'O', 'B-LOC', 'I-LOC', 'L-LOC', 'O'] + + +def test_misalign(vocab): + orths_and_spaces = [('I', True), ('flew', True), ('to', True), ('San', True), + ('Francisco', True), ('Valley.', False)] + doc = Doc(vocab, orths_and_spaces=orths_and_spaces) + entities = [(len("I flew to "), len("I flew to San Francisco Valley"), 'LOC')] + tags = biluo_tags_from_offsets(doc, entities) + assert tags == ['O', 'O', 'O', '-', '-', '-'] diff --git a/spacy/tests/matcher/test_matcher_bugfixes.py b/spacy/tests/matcher/test_matcher_bugfixes.py index 6816cb680..9b9fcc421 100644 --- a/spacy/tests/matcher/test_matcher_bugfixes.py +++ b/spacy/tests/matcher/test_matcher_bugfixes.py @@ -46,9 +46,9 @@ def test_overlap_issue242(): if os.environ.get('SPACY_DATA'): data_dir = os.environ.get('SPACY_DATA') else: - data_dir = None + data_dir = False - nlp = spacy.en.English(data_dir=data_dir, tagger=False, parser=False, entity=False) + nlp = spacy.en.English(path=data_dir, tagger=False, parser=False, entity=False) nlp.matcher.add('FOOD', 'FOOD', {}, patterns) diff --git a/spacy/tests/parser/test_parser_pickle.py b/spacy/tests/parser/test_parser_pickle.py index 547bf4a24..72e001eba 100644 --- a/spacy/tests/parser/test_parser_pickle.py +++ b/spacy/tests/parser/test_parser_pickle.py @@ -6,12 +6,12 @@ import cloudpickle import io -@pytest.mark.models -def test_pickle(EN): - file_ = io.BytesIO() - cloudpickle.dump(EN.parser, file_) - - file_.seek(0) - - loaded = pickle.load(file_) - +#@pytest.mark.models +#def test_pickle(EN): +# file_ = io.BytesIO() +# cloudpickle.dump(EN.parser, file_) +# +# file_.seek(0) +# +# loaded = pickle.load(file_) +# diff --git a/spacy/tests/serialize/test_packer.py b/spacy/tests/serialize/test_packer.py index 0eb24a08d..a89962be8 100644 --- a/spacy/tests/serialize/test_packer.py +++ b/spacy/tests/serialize/test_packer.py @@ -28,7 +28,7 @@ def vocab(): else: path = util.match_best_version('en', None, path) - vocab = English.Defaults('en', path).Vocab() + vocab = English.Defaults.create_vocab() lex = vocab['dog'] assert vocab[vocab.strings['dog']].orth_ == 'dog' lex = vocab['the'] diff --git a/spacy/tests/spans/test_span.py b/spacy/tests/spans/test_span.py index ad877df4c..27a88a61b 100644 --- a/spacy/tests/spans/test_span.py +++ b/spacy/tests/spans/test_span.py @@ -29,9 +29,8 @@ def test_root(doc): assert np.root.head.orth_ == 'is' -def test_root2(): +def test_root2(EN): text = 'through North and South Carolina' - EN = English(parser=False) doc = EN(text) heads = np.asarray([[0, 3, -1, -2, -4]], dtype='int32') doc.from_array([HEAD], heads.T) diff --git a/spacy/tests/tagger/test_lemmatizer.py b/spacy/tests/tagger/test_lemmatizer.py index e81bfe8c9..e90d62c84 
100644 --- a/spacy/tests/tagger/test_lemmatizer.py +++ b/spacy/tests/tagger/test_lemmatizer.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals import os import io import pickle +import pathlib from spacy.lemmatizer import Lemmatizer, read_index, read_exc from spacy import util @@ -12,30 +13,39 @@ import pytest @pytest.fixture def path(): - return util.match_best_version('en', None, - os.environ.get('SPACY_DATA', util.get_data_path())) + if 'SPACY_DATA' in os.environ: + return pathlib.Path(os.environ['SPACY_DATA']) + else: + return util.match_best_version('en', None, util.get_data_path()) @pytest.fixture def lemmatizer(path): - return Lemmatizer.load(path) + if path is not None: + return Lemmatizer.load(path) + else: + return None def test_read_index(path): - with (path / 'wordnet' / 'index.noun').open() as file_: - index = read_index(file_) - assert 'man' in index - assert 'plantes' not in index - assert 'plant' in index + if path is not None: + with (path / 'wordnet' / 'index.noun').open() as file_: + index = read_index(file_) + assert 'man' in index + assert 'plantes' not in index + assert 'plant' in index def test_read_exc(path): - with (path / 'wordnet' / 'verb.exc').open() as file_: - exc = read_exc(file_) - assert exc['was'] == ('be',) + if path is not None: + with (path / 'wordnet' / 'verb.exc').open() as file_: + exc = read_exc(file_) + assert exc['was'] == ('be',) def test_noun_lemmas(lemmatizer): + if lemmatizer is None: + return None do = lemmatizer.noun assert do('aardwolves') == set(['aardwolf']) @@ -46,23 +56,35 @@ def test_noun_lemmas(lemmatizer): def test_base_form_dive(lemmatizer): + if lemmatizer is None: + return None + do = lemmatizer.noun assert do('dive', number='sing') == set(['dive']) assert do('dive', number='plur') == set(['diva']) def test_base_form_saw(lemmatizer): + if lemmatizer is None: + return None + do = lemmatizer.verb assert do('saw', verbform='past') == set(['see']) def test_smart_quotes(lemmatizer): + if lemmatizer is None: + return None + do = lemmatizer.punct assert do('“') == set(['"']) assert do('“') == set(['"']) def test_pickle_lemmatizer(lemmatizer): + if lemmatizer is None: + return None + file_ = io.BytesIO() pickle.dump(lemmatizer, file_) diff --git a/spacy/tests/test_matcher.py b/spacy/tests/test_matcher.py index 7a3049f0b..9f04a7a0d 100644 --- a/spacy/tests/test_matcher.py +++ b/spacy/tests/test_matcher.py @@ -24,30 +24,30 @@ def test_compile(matcher): def test_no_match(matcher): - doc = Doc(matcher.vocab, ['I', 'like', 'cheese', '.']) + doc = Doc(matcher.vocab, words=['I', 'like', 'cheese', '.']) assert matcher(doc) == [] def test_match_start(matcher): - doc = Doc(matcher.vocab, ['JavaScript', 'is', 'good']) + doc = Doc(matcher.vocab, words=['JavaScript', 'is', 'good']) assert matcher(doc) == [(matcher.vocab.strings['JS'], matcher.vocab.strings['PRODUCT'], 0, 1)] def test_match_end(matcher): - doc = Doc(matcher.vocab, ['I', 'like', 'java']) + doc = Doc(matcher.vocab, words=['I', 'like', 'java']) assert matcher(doc) == [(doc.vocab.strings['Java'], doc.vocab.strings['PRODUCT'], 2, 3)] def test_match_middle(matcher): - doc = Doc(matcher.vocab, ['I', 'like', 'Google', 'Now', 'best']) + doc = Doc(matcher.vocab, words=['I', 'like', 'Google', 'Now', 'best']) assert matcher(doc) == [(doc.vocab.strings['GoogleNow'], doc.vocab.strings['PRODUCT'], 2, 4)] def test_match_multi(matcher): - doc = Doc(matcher.vocab, 'I like Google Now and java best'.split()) + doc = Doc(matcher.vocab, words='I like Google Now and java best'.split()) assert 
matcher(doc) == [(doc.vocab.strings['GoogleNow'], doc.vocab.strings['PRODUCT'], 2, 4), (doc.vocab.strings['Java'], @@ -61,9 +61,9 @@ def test_match_zero(matcher): {'OP': '!', 'IS_PUNCT': True}, {'ORTH': '"'} ]]) - doc = Doc(matcher.vocab, 'He said , " some words " ...'.split()) + doc = Doc(matcher.vocab, words='He said , " some words " ...'.split()) assert len(matcher(doc)) == 1 - doc = Doc(matcher.vocab, 'He said , " some three words " ...'.split()) + doc = Doc(matcher.vocab, words='He said , " some three words " ...'.split()) assert len(matcher(doc)) == 0 matcher.add('Quote', '', {}, [ [ @@ -83,24 +83,24 @@ def test_match_zero_plus(matcher): {'OP': '*', 'IS_PUNCT': False}, {'ORTH': '"'} ]]) - doc = Doc(matcher.vocab, 'He said , " some words " ...'.split()) + doc = Doc(matcher.vocab, words='He said , " some words " ...'.split()) assert len(matcher(doc)) == 1 -@pytest.mark.models -def test_match_preserved(EN): - patterns = { - 'JS': ['PRODUCT', {}, [[{'ORTH': 'JavaScript'}]]], - 'GoogleNow': ['PRODUCT', {}, [[{'ORTH': 'Google'}, {'ORTH': 'Now'}]]], - 'Java': ['PRODUCT', {}, [[{'LOWER': 'java'}]]], - } - matcher = Matcher(EN.vocab, patterns) - doc = EN.tokenizer('I like java.') - EN.tagger(doc) - assert len(doc.ents) == 0 - doc = EN.tokenizer('I like java.') - doc.ents += tuple(matcher(doc)) - assert len(doc.ents) == 1 - EN.tagger(doc) - EN.entity(doc) - assert len(doc.ents) == 1 +#@pytest.mark.models +#def test_match_preserved(EN): +# patterns = { +# 'JS': ['PRODUCT', {}, [[{'ORTH': 'JavaScript'}]]], +# 'GoogleNow': ['PRODUCT', {}, [[{'ORTH': 'Google'}, {'ORTH': 'Now'}]]], +# 'Java': ['PRODUCT', {}, [[{'LOWER': 'java'}]]], +# } +# matcher = Matcher(EN.vocab, patterns) +# doc = EN.tokenizer('I like java.') +# EN.tagger(doc) +# assert len(doc.ents) == 0 +# doc = EN.tokenizer('I like java.') +# doc.ents += tuple(matcher(doc)) +# assert len(doc.ents) == 1 +# EN.tagger(doc) +# EN.entity(doc) +# assert len(doc.ents) == 1 diff --git a/spacy/tests/tokens/test_token_api.py b/spacy/tests/tokens/test_token_api.py index 0d9fca1db..832350938 100644 --- a/spacy/tests/tokens/test_token_api.py +++ b/spacy/tests/tokens/test_token_api.py @@ -44,6 +44,7 @@ def test_str_builtin(EN): assert str(tokens[1]) == u'two' +@pytest.mark.models def test_is_properties(EN): Hi, comma, my, email, is_, addr = EN(u'Hi, my email is test@me.com') assert Hi.is_title diff --git a/spacy/tests/vocab/test_lexeme_flags.py b/spacy/tests/vocab/test_lexeme_flags.py index 5cc7bd16f..064bd6a8d 100644 --- a/spacy/tests/vocab/test_lexeme_flags.py +++ b/spacy/tests/vocab/test_lexeme_flags.py @@ -21,3 +21,22 @@ def test_is_digit(en_vocab): assert year.flags & (1 << IS_DIGIT) mixed = en_vocab['hello1'] assert not mixed.flags & (1 << IS_DIGIT) + + +def test_add_flag_auto_id(en_vocab): + is_len4 = en_vocab.add_flag(lambda string: len(string) == 4) + assert en_vocab['1999'].check_flag(is_len4) == True + assert en_vocab['1999'].check_flag(IS_DIGIT) == True + assert en_vocab['199'].check_flag(is_len4) == False + assert en_vocab['199'].check_flag(IS_DIGIT) == True + assert en_vocab['the'].check_flag(is_len4) == False + assert en_vocab['dogs'].check_flag(is_len4) == True + + +def test_add_flag_provided_id(en_vocab): + is_len4 = en_vocab.add_flag(lambda string: len(string) == 4, flag_id=IS_DIGIT) + assert en_vocab['1999'].check_flag(is_len4) == True + assert en_vocab['199'].check_flag(is_len4) == False + assert en_vocab['199'].check_flag(IS_DIGIT) == False + assert en_vocab['the'].check_flag(is_len4) == False + assert 
en_vocab['dogs'].check_flag(is_len4) == True diff --git a/spacy/tests/website/conftest.py b/spacy/tests/website/conftest.py index e2c64cfd7..4f533ae76 100644 --- a/spacy/tests/website/conftest.py +++ b/spacy/tests/website/conftest.py @@ -9,10 +9,12 @@ def nlp(): if os.environ.get('SPACY_DATA'): data_dir = os.environ.get('SPACY_DATA') else: - data_dir = None - return English(data_dir=data_dir) + data_dir = True + return English(path=data_dir) @pytest.fixture() def doc(nlp): + for word in ['Hello', ',', 'world', '.', 'Here', 'are', 'two', 'sentences', '.']: + _ = nlp.vocab[word] return nlp('Hello, world. Here are two sentences.') diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd index fb068fee1..6b11476a9 100644 --- a/spacy/tokens/doc.pxd +++ b/spacy/tokens/doc.pxd @@ -32,11 +32,17 @@ cdef class Doc: cdef public object _vector cdef public object _vector_norm + cdef public np.ndarray tensor + cdef public object user_data + cdef TokenC* c cdef public bint is_tagged cdef public bint is_parsed + cdef public dict getters_for_tokens + cdef public dict getters_for_spans + cdef public list _py_tokens cdef int length diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 66654482e..4870efcb6 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -75,7 +75,7 @@ cdef class Doc: doc = Doc(nlp.vocab, orths_and_spaces=[(u'Some', True), (u'text', True)]) """ - def __init__(self, Vocab vocab, orths_and_spaces=None): + def __init__(self, Vocab vocab, words=None, spaces=None, orths_and_spaces=None): ''' Create a Doc object. @@ -89,11 +89,14 @@ cdef class Doc: A Vocabulary object, which must match any models you want to use (e.g. tokenizer, parser, entity recognizer). - orths_and_spaces: - A list of tokens in the document as a sequence of - `(orth_id, has_space)` tuples, where `orth_id` is an - integer and `has_space` is a boolean, indicating whether the - token has a trailing space. + words: + A list of unicode strings to add to the document as words. If None, + defaults to empty list. + + spaces: + A list of boolean values, of the same length as words. True + means that the word is followed by a space, False means it is not. + If None, defaults to [True]*len(words) ''' self.vocab = vocab size = 20 @@ -112,11 +115,25 @@ cdef class Doc: self.length = 0 self.is_tagged = False self.is_parsed = False + self.getters_for_tokens = {} + self.getters_for_spans = {} + self.tensor = numpy.zeros((0,), dtype='float32') + self.user_data = {} self._py_tokens = [] self._vector = None self.noun_chunks_iterator = CHUNKERS.get(self.vocab.lang) cdef unicode orth cdef bint has_space + if orths_and_spaces is None and words is not None: + if spaces is None: + spaces = [True] * len(words) + elif len(spaces) != len(words): + raise ValueError( + "Arguments 'words' and 'spaces' should be sequences of the " + "same length, or 'spaces' should be left default at None. 
" + "spaces should be a sequence of booleans, with True meaning " + "that the word owns a ' ' character following it.") + orths_and_spaces = zip(words, spaces) if orths_and_spaces is not None: for orth_space in orths_and_spaces: if isinstance(orth_space, unicode): @@ -576,9 +593,22 @@ cdef class Doc: keep_reading = False yield n_bytes_str + data - def merge(self, int start_idx, int end_idx, unicode tag, unicode lemma, - unicode ent_type): + def merge(self, int start_idx, int end_idx, *args, **attributes): """Merge a multi-word expression into a single token.""" + cdef unicode tag, lemma, ent_type + if len(args) == 3: + # TODO: Warn deprecation + tag, lemma, ent_type = args + attributes[TAG] = self.vocab.strings[tag] + attributes[LEMMA] = self.vocab.strings[lemma] + attributes[ENT_TYPE] = self.vocab.strings[ent_type] + elif args: + raise ValueError( + "Doc.merge received %d non-keyword arguments. " + "Expected either 3 arguments (deprecated), or 0 (use keyword arguments). " + "Arguments supplied:\n%s\n" + "Keyword arguments:%s\n" % (len(args), repr(args), repr(attributes))) + cdef int start = token_by_start(self.c, self.length, start_idx) if start == -1: return None @@ -587,8 +617,11 @@ cdef class Doc: return None # Currently we have the token index, we want the range-end index end += 1 - cdef Span span = self[start:end] + tag = self.vocab.strings[attributes.get(TAG, span.root.tag)] + lemma = self.vocab.strings[attributes.get(LEMMA, span.root.lemma)] + ent_type = self.vocab.strings[attributes.get(ENT_TYPE, span.root.ent_type)] + # Get LexemeC for newly merged token new_orth = ''.join([t.text_with_ws for t in span]) if span[-1].whitespace_: diff --git a/spacy/tokens/span.pxd b/spacy/tokens/span.pxd index eeb663f67..303933d42 100644 --- a/spacy/tokens/span.pxd +++ b/spacy/tokens/span.pxd @@ -1,3 +1,5 @@ +cimport numpy as np + from .doc cimport Doc diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 49403ccc3..dc23481f6 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -77,10 +77,12 @@ cdef class Span: for i in range(self.start, self.end): yield self.doc[i] - def merge(self, unicode tag, unicode lemma, unicode ent_type): - self.doc.merge(self.start_char, self.end_char, tag, lemma, ent_type) + def merge(self, *args, **attributes): + self.doc.merge(self.start_char, self.end_char, *args, **attributes) def similarity(self, other): + if 'similarity' in self.doc.getters_for_spans: + self.doc.getters_for_spans['similarity'](self, other) if self.vector_norm == 0.0 or other.vector_norm == 0.0: return 0.0 return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm) @@ -102,6 +104,8 @@ cdef class Span: property sent: '''Get the sentence span that this span is a part of.''' def __get__(self): + if 'sent' in self.doc.getters_for_spans: + return self.doc.getters_for_spans['sent'](self) # This should raise if we're not parsed. 
self.doc.sents cdef int n = 0 @@ -115,16 +119,22 @@ cdef class Span: property has_vector: def __get__(self): + if 'has_vector' in self.doc.getters_for_spans: + return self.doc.getters_for_spans['has_vector'](self) return any(token.has_vector for token in self) property vector: def __get__(self): + if 'vector' in self.doc.getters_for_spans: + return self.doc.getters_for_spans['vector'](self) if self._vector is None: self._vector = sum(t.vector for t in self) / len(self) return self._vector property vector_norm: def __get__(self): + if 'vector_norm' in self.doc.getters_for_spans: + return self.doc.getters_for_spans['vector'](self) cdef float value if self._vector_norm is None: self._vector_norm = 1e-20 @@ -187,6 +197,8 @@ cdef class Span: """ def __get__(self): self._recalculate_indices() + if 'root' in self.doc.getters_for_spans: + return self.doc.getters_for_spans['root'](self) # This should probably be called 'head', and the other one called # 'gov'. But we went with 'head' elsehwhere, and now we're stuck =/ cdef int i diff --git a/spacy/tokens/token.pxd b/spacy/tokens/token.pxd index aa2f09394..9dc456b08 100644 --- a/spacy/tokens/token.pxd +++ b/spacy/tokens/token.pxd @@ -1,3 +1,4 @@ +from numpy cimport ndarray from ..vocab cimport Vocab from ..structs cimport TokenC from ..attrs cimport attr_id_t diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 9320cb85a..52e393b9b 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -29,27 +29,6 @@ from ..attrs cimport IS_OOV from ..lexeme cimport Lexeme -_STR_TRAILING_WHITESPACE = False - -def use_deprecated_Token__str__semantics(value): - ''' - Preserve deprecated semantics for Token.__str__ and Token.__unicode__ methods. - - spaCy < 0.100.7 had a bug in the semantics of the Token.__str__ and Token.__unicode__ - built-ins: they included a trailing space. To ease the transition to the - new semantics, you can use this function to switch the old semantics back on. - - Example: - - from spacy.tokens.token import keep_deprecated_Token.__str__semantics - keep_deprecated_Token.__str__semantics(True) - - This function will not remain in future versions --- it's a temporary shim. - ''' - global _STR_TRAILING_WHITESPACE - _STR_TRAILING_WHITESPACE = value - - cdef class Token: """An individual token --- i.e. a word, a punctuation symbol, etc. Created via Doc.__getitem__ and Doc.__iter__. @@ -64,20 +43,10 @@ cdef class Token: return self.c.lex.length def __unicode__(self): - # Users can toggle this on to preserve former buggy semantics. - # Remove this in future versions. - if _STR_TRAILING_WHITESPACE: - return self.text_with_ws - else: - return self.text + return self.text def __bytes__(self): - # Users can toggle this on to preserve former buggy semantics. - # Remove this in future versions. 
- if _STR_TRAILING_WHITESPACE: - return self.text_with_ws.encode('utf8') - else: - return self.text.encode('utf8') + return self.text.encode('utf8') def __str__(self): if six.PY3: @@ -94,6 +63,8 @@ cdef class Token: return self.doc[self.i+i] def similarity(self, other): + if 'similarity' in self.doc.getters_for_tokens: + return self.doc.getters_for_tokens['similarity'](self, other) if self.vector_norm == 0 or other.vector_norm == 0: return 0.0 return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm) @@ -116,11 +87,11 @@ cdef class Token: property text_with_ws: def __get__(self): - orth_ = self.orth_ + cdef unicode orth = self.vocab.strings[self.c.lex.orth] if self.c.spacy: - return orth_ + u' ' + return orth + u' ' else: - return orth_ + return orth property prob: def __get__(self): @@ -182,6 +153,8 @@ cdef class Token: property has_vector: def __get__(self): + if 'has_vector' in self.doc.getters_for_tokens: + return self.doc.getters_for_tokens['has_vector'](self) cdef int i for i in range(self.vocab.vectors_length): if self.c.lex.vector[i] != 0: @@ -191,6 +164,8 @@ cdef class Token: property vector: def __get__(self): + if 'vector' in self.doc.getters_for_tokens: + return self.doc.getters_for_tokens['vector'](self) cdef int length = self.vocab.vectors_length if length == 0: raise ValueError( @@ -204,10 +179,15 @@ cdef class Token: property repvec: def __get__(self): - return self.vector + raise AttributeError("repvec was renamed to vector in v0.100") + property has_repvec: + def __get__(self): + raise AttributeError("has_repvec was renamed to has_vector in v0.100") property vector_norm: def __get__(self): + if 'vector_norm' in self.doc.getters_for_tokens: + return self.doc.getters_for_tokens['vector_norm'](self) return self.c.lex.l2_norm property n_lefts: @@ -387,11 +367,14 @@ cdef class Token: def __get__(self): """Get a list of conjoined words.""" cdef Token word - if self.dep_ != 'conj': - for word in self.rights: - if word.dep_ == 'conj': - yield word - yield from word.conjuncts + if 'conjuncts' in self.doc.getters_for_tokens: + yield from self.doc.getters_for_tokens['conjuncts'](self) + else: + if self.dep_ != 'conj': + for word in self.rights: + if word.dep_ == 'conj': + yield word + yield from word.conjuncts property ent_type: def __get__(self): @@ -403,7 +386,7 @@ cdef class Token: property ent_type_: def __get__(self): - return self.vocab.strings.decode_int(self.c.ent_type, mem=self.mem) + return self.vocab.strings[self.c.ent_type] property ent_iob_: def __get__(self): @@ -424,7 +407,7 @@ cdef class Token: property ent_id_: '''A (string) entity ID. 
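# Sketch of the new per-document hook dictionaries used above: Doc now carries
# getters_for_tokens / getters_for_spans, and properties such as similarity,
# vector and has_vector consult them before falling back to the built-in
# behaviour. The toy similarity function here is an illustration only.
from spacy.vocab import Vocab
from spacy.tokens.doc import Doc

vocab = Vocab()
doc = Doc(vocab, words=['dog', 'Dog'])
doc.getters_for_tokens['similarity'] = (
    lambda token, other: 1.0 if token.text.lower() == other.text.lower() else 0.0)
assert doc[0].similarity(doc[1]) == 1.0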
Usually assigned by patterns in the Matcher.''' def __get__(self): - return self.vocab.strings.decode_int(self.c.ent_id, mem=self.mem) + return self.vocab.strings[self.c.ent_id] def __set__(self, hash_t key): # TODO @@ -438,35 +421,35 @@ cdef class Token: property orth_: def __get__(self): - return self.vocab.strings.decode_int(self.c.lex.orth, mem=self.mem) + return self.vocab.strings[self.c.lex.orth] property lower_: def __get__(self): - return self.vocab.strings.decode_int(self.c.lex.lower, mem=self.mem) + return self.vocab.strings[self.c.lex.lower] property norm_: def __get__(self): - return self.vocab.strings.decode_int(self.c.lex.norm, mem=self.mem) + return self.vocab.strings[self.c.lex.norm] property shape_: def __get__(self): - return self.vocab.strings.decode_int(self.c.lex.shape, mem=self.mem) + return self.vocab.strings[self.c.lex.shape] property prefix_: def __get__(self): - return self.vocab.strings.decode_int(self.c.lex.prefix, mem=self.mem) + return self.vocab.strings[self.c.lex.prefix] property suffix_: def __get__(self): - return self.vocab.strings.decode_int(self.c.lex.suffix, mem=self.mem) + return self.vocab.strings[self.c.lex.suffix] property lang_: def __get__(self): - return self.vocab.strings.decode_int(self.c.lex.lang, mem=self.mem) + return self.vocab.strings[self.c.lex.lang] property lemma_: def __get__(self): - return self.vocab.strings.decode_int(self.c.lemma, mem=self.mem) + return self.vocab.strings[self.c.lemma] property pos_: def __get__(self): @@ -474,13 +457,13 @@ cdef class Token: property tag_: def __get__(self): - return self.vocab.strings.decode_int(self.c.tag, mem=self.mem) + return self.vocab.strings[self.c.tag] property dep_: def __get__(self): - return self.vocab.decode_int(self.c.dep, mem=self.mem) + return self.vocab.strings[self.c.dep] def __set__(self, unicode label): - self.c.dep = self.vocab.strings.intern(label, mem=self.mem) + self.c.dep = self.vocab.strings[label] property is_oov: def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_OOV) diff --git a/spacy/train.py b/spacy/train.py new file mode 100644 index 000000000..097218310 --- /dev/null +++ b/spacy/train.py @@ -0,0 +1,69 @@ +from __future__ import absolute_import +from __future__ import unicode_literals + +import random +from .gold import GoldParse +from .scorer import Scorer +from .gold import merge_sents + + +class Trainer(object): + '''Manage training of an NLP pipeline.''' + def __init__(self, nlp, gold_tuples): + self.nlp = nlp + self.gold_tuples = gold_tuples + + def epochs(self, nr_epoch, augment_data=None, gold_preproc=False): + def _epoch(): + for raw_text, paragraph_tuples in self.gold_tuples: + if gold_preproc: + raw_text = None + else: + paragraph_tuples = merge_sents(paragraph_tuples) + if augment_data is not None: + raw_text, paragraph_tuples = augment_data(raw_text, paragraph_tuples) + docs = self.make_docs(raw_text, paragraph_tuples) + golds = self.make_golds(docs, paragraph_tuples) + for doc, gold in zip(docs, golds): + yield doc, gold + + for itn in range(nr_epoch): + random.shuffle(self.gold_tuples) + yield _epoch() + + def update(self, doc, gold): + for process in self.nlp.pipeline: + if hasattr(process, 'update'): + process.update(doc, gold) + process(doc) + return doc + + def evaluate(self, dev_sents, gold_preproc=False): + scorer = Scorer() + for raw_text, paragraph_tuples in dev_sents: + if gold_preproc: + raw_text = None + else: + paragraph_tuples = merge_sents(paragraph_tuples) + docs = self.make_docs(raw_text, paragraph_tuples) + golds = 
self.make_golds(docs, paragraph_tuples) + for doc, gold in zip(docs, golds): + for process in self.nlp.pipeline[1:]: + process(doc) + scorer.score(doc, gold) + return scorer + + def make_docs(self, raw_text, paragraph_tuples): + if raw_text is not None: + return [self.nlp.tokenizer(raw_text)] + else: + return [self.nlp.tokenizer.tokens_from_list(sent_tuples[0][1]) + for sent_tuples in paragraph_tuples] + + def make_golds(self, docs, paragraph_tuples): + if len(docs) == 1: + return [GoldParse(docs[0], sent_tuples[0]) + for sent_tuples in paragraph_tuples] + else: + return [GoldParse(doc, sent_tuples[0]) + for doc, sent_tuples in zip(docs, paragraph_tuples)] diff --git a/spacy/typedefs.pxd b/spacy/typedefs.pxd index 65ab7fe62..bd863d247 100644 --- a/spacy/typedefs.pxd +++ b/spacy/typedefs.pxd @@ -1,10 +1,6 @@ from libc.stdint cimport uint16_t, uint32_t, uint64_t, uintptr_t, int32_t from libc.stdint cimport uint8_t -from libc.stdint cimport UINT64_MAX as err_hash_t -from libc.stdint cimport UINT64_MAX as err_flags_t -from libc.stdint cimport UINT64_MAX as err_len_t -from libc.stdint cimport UINT64_MAX as err_tag_t ctypedef uint64_t hash_t ctypedef char* utf8_t diff --git a/spacy/util.py b/spacy/util.py index dd5805333..ee285cf80 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -13,6 +13,7 @@ try: except NameError: basestring = str + LANGUAGES = {} _data_path = pathlib.Path(__file__).parent / 'data' @@ -52,6 +53,8 @@ def or_(val1, val2): def match_best_version(target_name, target_version, path): path = path if not isinstance(path, basestring) else pathlib.Path(path) + if not path.exists(): + return None matches = [] for data_name in path.iterdir(): name, version = split_data_name(data_name.parts[-1]) diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd index f136ef900..8e0e363bc 100644 --- a/spacy/vocab.pxd +++ b/spacy/vocab.pxd @@ -27,7 +27,6 @@ cdef struct _Cached: cdef class Vocab: cdef Pool mem cpdef readonly StringStore strings - cpdef readonly dict oov_stores cpdef readonly Morphology morphology cdef readonly int length cdef public object _serializer diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index d7dca9cf9..55b5366c8 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -50,7 +50,7 @@ cdef class Vocab: ''' @classmethod def load(cls, path, lex_attr_getters=None, vectors=True, lemmatizer=True, - tag_map=True, serializer_freqs=None, **deprecated_kwargs): + tag_map=True, serializer_freqs=True, **deprecated_kwargs): util.check_renamed_kwargs({'get_lex_attr': 'lex_attr_getters'}, deprecated_kwargs) if tag_map is True and (path / 'vocab' / 'tag_map.json').exists(): with (path / 'vocab' / 'tag_map.json').open() as file_: @@ -93,7 +93,6 @@ cdef class Vocab: self._by_hash = PreshMap() self._by_orth = PreshMap() self.strings = StringStore() - self.oov_stores = {} # Load strings in a special order, so that we have an onset number for # the vocabulary. This way, when words are added in order, the orth ID # is the frequency rank of the word, plus a certain offset. The structural @@ -130,6 +129,44 @@ cdef class Vocab: """The current number of lexemes stored.""" return self.length + def add_flag(self, flag_getter, int flag_id=-1): + '''Set a new boolean flag to words in the vocabulary. The flag_setter + function will be called over the words currently in the vocab, and then + applied to new words as they occur. You'll then be able to access the + flag value on each token, using token.check_flag(flag_id). See also: + Lexeme.set_flag, Lexeme.check_flag, Token.set_flag, Token.check_flag. 
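# Minimal sketch of driving the new spacy.train.Trainer above. nlp is assumed to
# be a pipeline object whose components expose update(doc, gold), and
# train_data/dev_data are gold tuples in the usual (raw_text, paragraphs)
# format; none of these objects are defined in the diff itself.
from spacy.train import Trainer

trainer = Trainer(nlp, train_data)
for epoch in trainer.epochs(10):
    for doc, gold in epoch:
        trainer.update(doc, gold)
scorer = trainer.evaluate(dev_data)
print(scorer.scores)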
+ + Arguments: + flag_getter: + A function f(unicode) -> bool, to get the flag value. + + flag_id (int): + An integer between 1 and 63 (inclusive), specifying the bit at which the + flag will be stored. If -1, the lowest available bit will be + chosen. + + Returns: + flag_id (int): The integer ID by which the flag value can be checked. + ''' + if flag_id == -1: + for bit in range(1, 64): + if bit not in self.lex_attr_getters: + flag_id = bit + break + else: + raise ValueError( + "Cannot find empty bit for new lexical flag. All bits between " + "0 and 63 are occupied. You can replace one by specifying the " + "flag_id explicitly, e.g. nlp.vocab.add_flag(your_func, flag_id=IS_ALPHA") + elif flag_id >= 64 or flag_id < 1: + raise ValueError( + "Invalid value for flag_id: %d. Flag IDs must be between " + "1 and 63 (inclusive)" % flag_id) + for lex in self: + lex.set_flag(flag_id, flag_getter(lex.orth_)) + self.lex_attr_getters[flag_id] = flag_getter + return flag_id + cdef const LexemeC* get(self, Pool mem, unicode string) except NULL: '''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme if necessary, using memory acquired from the given pool. If the pool @@ -141,7 +178,7 @@ cdef class Vocab: lex = self._by_hash.get(key) cdef size_t addr if lex != NULL: - if (string not in self.strings) or (lex.orth != self.strings[string]): + if lex.orth != self.strings[string]: raise LookupError.mismatched_strings( lex.orth, self.strings[string], self.strings[lex.orth], string) return lex @@ -164,10 +201,10 @@ cdef class Vocab: cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL: cdef hash_t key cdef bint is_oov = mem is not self.mem - if len(string) < 3 or not is_oov: + if len(string) < 3: mem = self.mem lex = mem.alloc(sizeof(LexemeC), 1) - lex.orth = self.strings.intern(string, mem=mem) + lex.orth = self.strings[string] lex.length = len(string) lex.id = self.length lex.vector = mem.alloc(self.vectors_length, sizeof(float)) @@ -175,10 +212,10 @@ cdef class Vocab: for attr, func in self.lex_attr_getters.items(): value = func(string) if isinstance(value, unicode): - value = self.strings.intern(value) + value = self.strings[value] if attr == PROB: lex.prob = value - else: + elif value is not None: Lexeme.set_struct_attr(lex, attr, value) if is_oov: lex.id = 0 @@ -206,8 +243,7 @@ cdef class Vocab: def __getitem__(self, id_or_string): '''Retrieve a lexeme, given an int ID or a unicode string. If a previously - unseen unicode string is given, a new lexeme is created and stored, and - the string is interned in the vocabulary. + unseen unicode string is given, a new lexeme is created and stored. 
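# Tiny usage sketch of Vocab.add_flag() as documented above: the predicate is
# arbitrary, and the flag bit is chosen automatically when flag_id is left at
# its default. A freshly constructed, empty Vocab is assumed.
from spacy.vocab import Vocab

vocab = Vocab()
IS_SHOUTING = vocab.add_flag(lambda text: text.isupper())
assert vocab[u'HELLO'].check_flag(IS_SHOUTING)
assert not vocab[u'hello'].check_flag(IS_SHOUTING)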
Args: id_or_string (int or unicode): @@ -222,7 +258,7 @@ cdef class Vocab: ''' cdef attr_t orth if type(id_or_string) == unicode: - orth = self.strings.intern(id_or_string) + orth = self.strings[id_or_string] else: orth = id_or_string return Lexeme(self, orth) @@ -238,7 +274,7 @@ cdef class Vocab: if 'pos' in props: self.morphology.assign_tag(token, props['pos']) if 'L' in props: - tokens[i].lemma = self.strings.intern(props['L']) + tokens[i].lemma = self.strings[props['L']] for feature, value in props.get('morph', {}).items(): self.morphology.assign_feature(&token.morph, feature, value) return tokens diff --git a/travis.sh b/travis.sh new file mode 100755 index 000000000..416b70211 --- /dev/null +++ b/travis.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +if [ "${VIA}" == "pypi" ]; then + rm -rf * + pip install spacy + python -m spacy.en.download + python -m spacy.de.download +fi + +if [ "${VIA}" == "sdist" ]; then + rm -rf * + pip uninstall spacy + wget https://api.explosion.ai/build/spacy/sdist/$TRAVIS_COMMIT + mv $TRAVIS_COMMIT sdist.tgz + pip install -U sdist.tgz +fi + + +if [ "${VIA}" == "compile" ]; then + pip install -r requirements.txt + pip install -e . + mkdir -p corpora/en + cd corpora/en + wget --no-check-certificate http://wordnetcode.princeton.edu/3.0/WordNet-3.0.tar.gz + tar -xzf WordNet-3.0.tar.gz + mv WordNet-3.0 wordnet + cd ../../ + mkdir models/ + python bin/init_model.py en lang_data/ corpora/ models/en +fi diff --git a/website/_harp.json b/website/_harp.json index 1ceb54023..0257b58d2 100644 --- a/website/_harp.json +++ b/website/_harp.json @@ -2,16 +2,13 @@ "globals": { "title": "spaCy.io", "description": "spaCy is a free open-source library featuring state-of-the-art speed and accuracy and a powerful Python API.", + "url": "https://spacy.io", + "email": "contact@spacy.io", + "company": "spaCy.io", - "SITENAME": "spaCy", - "SLOGAN": "Industrial-strength Natural Language Processing", - "SITE_URL": "https://spacy.io", - "EMAIL": "contact@explosion.ai", - - "COMPANY": "Explosion AI", - "COMPANY_URL": "https://explosion.ai", - "DEMOS_URL": "https://demos.explosion.ai", - + "navigation": { "Docs": "docs", "Demos": "demos", "Blog": "blog" }, + "profiles": { "twitter": "spacy_io", "github": "spacy-io", "reddit": "spacynlp", "medium": "spacy" }, + "google_analytics": "UA-58931649-1", "SOCIAL": { "twitter": "spacy_io", "github": "explosion", @@ -25,5 +22,38 @@ "SPACY_VERSION": "0.101.0", "SPACY_STARS": "2300", "GITHUB": { "user": "explosion", "repo": "spacy" } + "spacy_version": "0.101.0", + "spacy_stars": "2100", + "github_settings": { "user": "spacy-io", "repo": "spacy" }, + + "authors" : { + "matt" : { + "name" : "Matthew Honnibal", + "description" : "studied linguistics as an undergrad, and never thought he'd be a programmer. By 2009 he had a PhD in computer science, and in 2014 he left academia to write spaCy. He's from Sydney and lives in Berlin.", + "links": { + "twitter": [ "https://twitter.com/honnibal", "Twitter" ], + "website": [ "https://www.semanticscholar.org/search?q=Matthew%20Honnibal", "Semantic Scholar" ] + } + }, + + "ines": { + "name": "Ines Montani", + "description": "has developed, designed and implemented our interactive demos and the spacy.io website. 
She has a degree in media, linguistics and communications, and over ten years experience in web development.", + "links": { + "twitter": [ "https://twitter.com/_inesmontani", "Twitter" ], + "codepen": [ "https://codepen.io/ines", "Codepen"], + "github": [ "https://github.com/ines", "GitHub"], + "website": [ "http://ines.io", "Blog" ] + } + }, + + "wolfgang": { + "name": "Wolfgang Seeker", + "description": "is a computational linguist from Germany. He is fascinated with the complexity and variety of human language, and spent his PhD looking for ways to make NLP work well with any kind of language in the world. He joined spaCy to build effective and truly multilingual NLP software.", + "links": { + "website": [ "https://www.semanticscholar.org/search?q=Wolfgang%20Seeker", "Semantic Scholar" ] + } + } + } } } diff --git a/website/_includes/_article.jade b/website/_includes/_article.jade new file mode 100644 index 000000000..08d7c9e04 --- /dev/null +++ b/website/_includes/_article.jade @@ -0,0 +1,35 @@ +include ../_includes/_mixins + +//- Article +//- ============================================================================ + +article.article(id=current.source) + + header.article-header + +h2.article-title=title + .article-meta + if author + | by #[a.link(href=(authors[author].url || url) target='_blank')=authors[author].name] on   + | #[+date(date)] + + .article-body!=yield + + footer.article-footer + + +grid('padding', 'align-right', 'valign-center') + if hide_social != true + +tweet(title) + + if links + for link, index in links + div: +button('primary', 'small', index.toLowerCase())(href=link target='_blank') + +icon(index.toLowerCase(), 'medium', 'secondary') + | Discussion on #{index} + + if author + +divider + + !=partial('_profile', { label: 'About the Author', style: 'alt' }) + +!=partial('_newsletter', { divider: 'both' }) +!=partial('_latest-posts', { max: 2, _section: _section } ) diff --git a/website/_includes/_footer.jade b/website/_includes/_footer.jade index bd7688bfb..5a97d4d79 100644 --- a/website/_includes/_footer.jade +++ b/website/_includes/_footer.jade @@ -7,8 +7,13 @@ include _mixins footer.o-footer.o-inline-list.u-pattern.u-text-center.u-text-label.u-text-strong span © #{new Date().getFullYear()} #[+a(COMPANY_URL, true)=COMPANY] +<<<<<<< HEAD +a(COMPANY_URL + "/legal", true) Legal / Imprint a(href="mailto:#{EMAIL}") #[+icon("mail", 16)] +======= +footer.footer + span © #{new Date().getFullYear()} #{company} +>>>>>>> v1.0.0-rc1 +a("https://twitter.com/" + SOCIAL.twitter)(aria-label="Twitter") +icon("twitter", 20) diff --git a/website/_includes/_functions.jade b/website/_includes/_functions.jade index a191b330d..cef2a90b9 100644 --- a/website/_includes/_functions.jade +++ b/website/_includes/_functions.jade @@ -5,7 +5,64 @@ //- Add prefixes to items of an array (for modifier CSS classes) - function prefixArgs(array, prefix) { +<<<<<<< HEAD - return array.map(function(arg) { - return prefix + '--' + arg; - }).join(' '); +======= +- for(var i = 0; i < array.length; i++) { +- array[i] = prefix + array[i]; +- } +- return array.join(' '); +- } + + +//- Convert date to human readable and timestamp format + input - [string] date in the format YYYY-MM-DD + +- function convertDate(input) { +- var dates = []; +- var months = [ 'January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December' ]; +- var date = new Date(input); +- dates.full = months[date.getMonth()] + ' ' + date.getDate() + ', ' + date.getFullYear(); +- 
dates.timestamp = JSON.parse(JSON.stringify(date)); +- return dates; +- } + + +//- Convert date to valid RSS pubDate + input - [string] date in the format YYYY-MM-DD + +- function convertPubDate(input) { +- var date = new Date(input); +- var pieces = date.toString().split(' '); +- var offsetTime = pieces[5].match(/[-+]\d{4}/); +- var offset = (offsetTime) ? offsetTime : pieces[5]; +- var parts = [ pieces[0] + ',', pieces[2], pieces[1], pieces[3], pieces[4], offset ]; +- return parts.join(' '); +- } + + +//- Compile scrset attribute for hero images + image - [object] article image object from _data.json + path - [string] relative path to image folder + +- function getScrset(image, path) { +- var scrset = path + image.file + ' ' + image_sizes.medium + 'w'; +- if(image.file_small) scrset += ', ' + path + image.file_small + ' ' + image_sizes.small + 'w'; +- if(image.file_large) scrset += ', ' + path + image.file_large + ' ' + image_sizes.large + 'w'; +- return scrset; +- } + + +//- Get meta image + +- function getMetaImage() { +- if(current.path[0] == 'blog' && image && image.file) { +- return url + '/blog/img/' + (image.file_small || image.file); +- } +- else { +- return url + '/assets/img/social.png'; +- } +>>>>>>> v1.0.0-rc1 - } diff --git a/website/_includes/_head.jade b/website/_includes/_head.jade new file mode 100644 index 000000000..5d36dc2f7 --- /dev/null +++ b/website/_includes/_head.jade @@ -0,0 +1,27 @@ +include _mixins + +//- Head +//- ============================================================================ + +head + title=getPageTitle() + + meta(charset='utf-8') + meta(name="viewport" content="width=device-width, initial-scale=1.0") + meta(name='referrer' content='always') + meta(name='description' content=description) + + meta(property='og:type' content='website') + meta(property='og:site_name' content=sitename) + meta(property='og:url' content=getCurrentUrl()) + meta(property='og:title' content=title) + meta(property='og:description' content=description) + meta(property='og:image' content=getMetaImage()) + + meta(name='twitter:card' content='summary_large_image') + meta(name='twitter:site' content='@' + profiles.twitter) + meta(name='twitter:title' content=title) + meta(name='twitter:description' content=description) + meta(name='twitter:image' content=getMetaImage()) + + link(rel='icon' type='image/x-icon' href='/assets/img/favicon.ico') diff --git a/website/_includes/_mixins.jade b/website/_includes/_mixins.jade index 04faf8993..f02e5d9e0 100644 --- a/website/_includes/_mixins.jade +++ b/website/_includes/_mixins.jade @@ -1,8 +1,389 @@ +<<<<<<< HEAD //- ---------------------------------- //- 💫 INCLUDES > MIXINS //- ---------------------------------- include _functions +======= +include _functions + +//- Mixins +//- ============================================================================ + +//- Sections for content pages + id - [string] id, can be headline id as it's being prefixed (optional) + block - section content (block and inline elements) + +mixin section(id) + section.section(id=(id) ? 
'section-' + id : '')&attributes(attributes) + block + + +//- Flexbox grid to align children elements + ...style - [strings] flexbox CSS classes without prefix (optional) + block - container content (block and inline elements) + +mixin grid(...style) + .grid(class=prefixArgs(style, 'grid--'))&attributes(attributes) + block + +mixin grid-col(...style) + .grid-col(class=prefixArgs(style, 'grid-col--'))&attributes(attributes) + block + + +//- Aside + headline - [string] Headline of aside (optional) + block - aside content (inline elements) + +mixin aside(headline) + span.aside(data-label=headline)&attributes(attributes) + span.aside-body + block + + +//- Paragraphs + block - paragraph content (inline elements) + +mixin lead + p.text-lead&attributes(attributes) + block + + +//- Various text styles + block - text (inline elements) + +mixin example + p.text-example&attributes(attributes) + block + +mixin source + span.text-source&attributes(attributes) + block + +mixin label(...style) + span(class=(style != '') ? prefixArgs(style, 'label-') : 'label')&attributes(attributes) + block + + +//- Headings with optional permalinks + id - [string] unique id (optional, no permalink without id) + source - [string] link for source button (optional) + block - headline text (inline elements) + +mixin headline(level, id, source) + if level == 2 + +h2(id, source) + block + + else if level == 3 + +h3(id, source) + block + + else if level == 4 + +h4(id, source) + block + + else if level == 5 + +h5(id, source) + block + + else + +h6(id, source) + block + +mixin h1(id, source) + h1(id=id)&attributes(attributes) + +permalink(id, source) + block + +mixin h2(id, source) + h2(id=id)&attributes(attributes) + +permalink(id, source) + block + +mixin h3(id, source) + h3(id=id)&attributes(attributes) + +permalink(id, source) + block + +mixin h4(id, source) + h4(id=id)&attributes(attributes) + +permalink(id, source) + block + +mixin h5(id, source) + h5(id=id)&attributes(attributes) + +permalink(id, source) + block + +mixin h6(id, source) + h6(id=id)&attributes(attributes) + +permalink(id, source) + block + +mixin permalink(id, source) + if id + a.permalink(href='#' + id) + block + + else + block + + if source + +button('secondary', 'small', 'source')(href=source target='_blank') Source + + +//- Button + element - [string] specifies HTML element, 'button' or 'link' + ...style - [strings] button CSS classes without prefix (optional) + block - button text (inline elements) + +mixin button(type, ...style) + - var classname = 'button-' + type + ' ' + ((style) ? prefixArgs(style, 'button--') : '') + + a.button(class=classname)&attributes(attributes) + block + +mixin form-button(type, ...style) + - var classname = 'button-' + type + ' ' + ((style) ? 
prefixArgs(style, 'button--') : '') + button(class=classname)&attributes(attributes) + block + + +//- Input + placeholder - [string] placeholder for input field (optional) + value - [string] value of input field (optional) + +mixin input(placeholder, value) + input.input(placeholder=placeholder value=value)&attributes(attributes) + + +//- Icon + name - [string] icon name, refers to CSS classes + size - [string] 'medium' or 'large' (optional) + type - [string] 'button' (optional) + block - description, if as a text node to the icon element it prevents line + breaks between icon and text (inline elements) + +mixin icon(type, ...style) + span(class='icon-' + type + ' ' + prefixArgs(style, 'icon--') aria-hidden="true")&attributes(attributes) + block + + +//- Image for illustration purposes + file - [string] file name (in /img) + alt - [string] descriptive alt text (optional) + caption - [string] image caption (optional) + +mixin image(file, alt, caption, size) + figure.image-container&attributes(attributes) + img(src='img/' + file alt=alt class=(size) ? 'image--' + size : '') + + if caption + figcaption.text-caption=caption + + block + + +//- Illustrated code view + title - [string] title of window + +mixin code-demo(title) + .x-terminal&attributes(attributes) + .x-terminal-icons: span + .x-terminal-title=title + +code.x-terminal-code + block + + +//- Data table + head - [array] column headings (optional, without headings no table + head is displayed) + ...style - [strings] table CSS classes without prefix (optional) + block - only +row (tr) + +mixin table(head, ...style) + table.table(class=prefixArgs(style, 'table--'))&attributes(attributes) + + if head + tr.table-row + each column in head + th.table-head-cell=column + + block + + +//- Data table row + block - only +cell (td) + +mixin row(...style) + tr.table-row(class=prefixArgs(style, 'table-cell--'))&attributes(attributes) + block + + +//- Data table cell + block - table cell content (inline elements) + +mixin cell(...style) + td.table-cell(class=prefixArgs(style, 'table-cell--'))&attributes(attributes) + block + + +//- General list (ordered and unordered) + type - [string] 'numbers', 'letters', 'roman' (optional) + start - [integer] starting point of list (1 = list starts at 1 or A) + block - only +item (li) + +mixin list(type, start) + if type + ol.list(class='list--' + type style=(start === 0 || start) ? 'counter-reset: li ' + (start - 1) : '')&attributes(attributes) + block + + else + ul.list.list--bullets&attributes(attributes) + block + + +//- List item + block - item text (inline elements) + +mixin item + li.list-item&attributes(attributes) + block + + +//- Blockquote + source - [string] quote source / author (optional) + link - [string] link to quote source (only with source, optional) + block - quote text (inline elements) + +mixin quote(source, link) + blockquote.quote&attributes(attributes) + p.quote-text + block + + if source && link + | #[a.quote-source(href=link target='_blank')=source] + + else if source && !link + .quote-source !{source} + + +//- Pullquotes with optional 'tweet this' function + tweet - [string] text to be tweeted (optional) + block - pullquote text (inline elements, only shown if no tweet text) + +mixin pullquote(tweet) + blockquote.quote&attributes(attributes) + + p.quote-text-strong + if tweet + | !{tweet} #[a.quote-source(href=twitterShareUrl(current.path, tweet) target='_blank') Tweet this] + + else + block + + +//- Code block + use as +code(args). 
to preserve whitespace and prevent code interprettion + language - [string] language for syntax highlighting (optional, default: + 'python', see Prism for options: http://prismjs.com) + label - [string] code block headline (optional) + block - code text (inline elements) + + +mixin code(language, label) + pre.code-block(class='lang-' + (language || default_syntax) data-label=label)&attributes(attributes) + code.code-inline + block + + +//- Infobox for notes and alerts + label - [string] infobox headline (optional) + block - infobox text (inline and block elements) + +mixin infobox(label) + .box.box--info(data-label=label)&attributes(attributes) + p.box-body + block + + +//- Alerts for notes and updates + +mixin alert(button) + .alert&attributes(attributes) + span + block + + if button + +form-button('primary', 'small')(onclick='this.parentNode.parentNode.removeChild(this.parentNode);')=button + + else + button.alert-close(onclick='this.parentNode.parentNode.removeChild(this.parentNode);') + + + +//- Embeds + border - [boolean] add border to embed container + caption - [string] embed caption + block - embed content (inline and block elements) + +mixin embed(border, caption) + figure.embed(class=(border) ? 'embed--border' : '')&attributes(attributes) + block + + if caption + figcaption.embed-caption=caption + + +//- displaCy + filename - [string] name of file in displacy folder (no .html) + caption - [string] caption (optional) + height - [integer] iframe height in px (optional) + +mixin displacy(filename, caption, height) + +embed(true, caption).embed--displacy + iframe(src='/blog/displacy/' + filename height=height) + + +//- Logo, imports SVG + size - [string] 'tiny', 'small', 'regular' or 'large' + +mixin logo(size) + !=partial('/_includes/_logo', { logo_size: size }) + + +//-
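
//- The block below is a minimal usage sketch of the mixins introduced in
//- _mixins.jade above, assuming a hypothetical docs page that pulls the
//- mixin file in; the section id, table contents and code snippet are
//- illustrative placeholders only, not part of this diff.

include _includes/_mixins

+section('example')
    +h2('example')
        | Usage example

    +lead
        | A quick tour of the content mixins defined above.

    +table(['Mixin', 'Purpose'])
        +row
            +cell
                | +code
            +cell
                | Code block, highlighted with Prism (falls back to default_syntax)
        +row
            +cell
                | +infobox
            +cell
                | Note box with an optional label

    +code('python', 'Example').
        import spacy
        nlp = spacy.load('en')

    +infobox('Note')
        | The trailing dot after #[code +code(...)] keeps the block as plain
        | text, so Jade does not try to interpret the code.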
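
//- Similarly, a small sketch of how the helpers from _functions.jade above
//- might be called from a template that includes them (they are pulled in
//- via the include _functions line in _mixins.jade). The sample date, the
//- image object and the 800w/400w widths are assumptions for illustration;
//- the real widths come from the global image_sizes object.

- var pubDate = convertPubDate('2016-10-18')
//- e.g. "Tue, 18 Oct 2016 00:00:00 +0000" on a UTC build machine; the time
//- and offset vary with the machine's timezone, since the helper goes
//- through Date#toString().

- var heroImage = { file: 'hero.jpg', file_small: 'hero_small.jpg' }
img(srcset=getScrset(heroImage, '/blog/img/') sizes='100vw' alt='Hero image')
//- getScrset builds the srcset attribute, e.g.
//- "/blog/img/hero.jpg 800w, /blog/img/hero_small.jpg 400w"
//- if image_sizes is { small: 400, medium: 800 }.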